// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.

// +build gc,!purego

#include "textflag.h"
// General register allocation
#define oup DI
#define inp SI
#define inl BX
#define adp CX // free to reuse, after we hash the additional data
#define keyp R8 // free to reuse, when we copy the key to stack
#define itr2 R9 // general iterator
#define itr1 CX // general iterator
#define acc0 R10
#define acc1 R11
#define acc2 R12
#define t0 R13
#define t1 R14
#define t2 R15
#define t3 R8
// Register and stack allocation for the SSE code
#define rStore (0*16)(BP)
#define sStore (1*16)(BP)
#define state1Store (2*16)(BP)
#define state2Store (3*16)(BP)
#define tmpStore (4*16)(BP)
#define ctr0Store (5*16)(BP)
#define ctr1Store (6*16)(BP)
#define ctr2Store (7*16)(BP)
#define ctr3Store (8*16)(BP)
#define A0 X0
#define A1 X1
#define A2 X2
#define B0 X3
#define B1 X4
#define B2 X5
#define C0 X6
#define C1 X7
#define C2 X8
#define D0 X9
#define D1 X10
#define D2 X11
#define T0 X12
#define T1 X13
#define T2 X14
#define T3 X15
#define A3 T0
#define B3 T1
#define C3 T2
#define D3 T3
// Register and stack allocation for the AVX2 code
#define rsStoreAVX2 (0*32)(BP)
#define state1StoreAVX2 (1*32)(BP)
#define state2StoreAVX2 (2*32)(BP)
#define ctr0StoreAVX2 (3*32)(BP)
#define ctr1StoreAVX2 (4*32)(BP)
#define ctr2StoreAVX2 (5*32)(BP)
#define ctr3StoreAVX2 (6*32)(BP)
#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
#define AA0 Y0
#define AA1 Y5
#define AA2 Y6
#define AA3 Y7
#define BB0 Y14
#define BB1 Y9
#define BB2 Y10
#define BB3 Y11
#define CC0 Y12
#define CC1 Y13
#define CC2 Y8
#define CC3 Y15
#define DD0 Y4
#define DD1 Y1
#define DD2 Y2
#define DD3 Y3
#define TT0 DD3
#define TT1 AA3
#define TT2 BB3
#define TT3 CC3
// ChaCha20 constants
DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// <<< 16 with PSHUFB
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// <<< 8 with PSHUFB
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
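// The rol16/rol8 tables above are PSHUFB shuffle masks: within every 32-bit
// lane, source byte i is moved to position (i+2)%4 (rol16) or (i+1)%4 (rol8),
// which realizes a 32-bit left-rotation by 16 or 8 as a single byte shuffle.
// The 12- and 7-bit rotations of the quarter round have no such shortcut and
// are done with shift-shift-xor instead.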

DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
DATA ·avx2InitMask<>+0x18(SB)/8, $0x0

DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
// Poly1305 key clamp
DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
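// Only the low 16 bytes of polyClampMask actually clamp: they implement the
// RFC 8439 requirement r &= 0x0ffffffc0ffffffc0ffffffc0fffffff. The high 16
// bytes are all-ones so a single 32-byte PAND/VPAND can clamp r while leaving
// the "s" half of the key untouched.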

DATA ·sseIncMask<>+0x00(SB)/8, $0x1
DATA ·sseIncMask<>+0x08(SB)/8, $0x0
// To load/store the last < 16 bytes in a buffer
DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
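// andMask is a table of 15 16-byte masks: the entry at offset (n-1)*16 keeps
// the low n bytes of a vector and zeroes the rest. The tail code below
// indexes it as -16(t0)(itr2*1) with itr2 = 16*inl to mask a partial block.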
// No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
#define shiftC0Right shiftC0Left
#define shiftC1Right shiftC1Left
#define shiftC2Right shiftC2Left
#define shiftC3Right shiftC3Left
#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// Some macros
#define chachaQR(A, B, C, D, T) \
	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
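// chachaQR runs the standard ChaCha quarter round on whole XMM registers,
// i.e. four column-parallel quarter rounds at once. In plain Go a single
// quarter round would read (a rough sketch, not part of this build):
//
//	a += b; d ^= a; d = d<<16 | d>>16
//	c += d; b ^= c; b = b<<12 | b>>20
//	a += b; d ^= a; d = d<<8 | d>>24
//	c += d; b ^= c; b = b<<7 | b>>25
//
// The 16- and 8-bit rotations use the PSHUFB tables above; the 12- and 7-bit
// ones go through the scratch register T as shift-shift-xor.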

#define chachaQR_AVX2(A, B, C, D, T) \
	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B

#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2

#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3

#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
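// Together the polyMul* stages compute acc = (acc * r) mod 2^130-5, with the
// 130-bit accumulator in acc0|acc1|acc2 and the clamped r in rStore (the
// first 16 bytes at BP). Clamping keeps the schoolbook product within
// t0|t1|t2|t3, and polyMulReduceStage folds the bits above 2^130 back in as
// 5*h_hi = 4*h_hi + h_hi: ANDQ $-4 keeps 4*h_hi in place, the SHRQ $2 pair
// forms h_hi, and both are added to the low 130 bits kept by ANDQ $3.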
// ----------------------------------------------------------------------------
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
	// adp points to beginning of additional data
	// itr2 holds ad length
	XORQ acc0, acc0
	XORQ acc1, acc1
	XORQ acc2, acc2
	CMPQ itr2, $13
	JNE  hashADLoop

openFastTLSAD:
	// Special treatment for the TLS case of 13 bytes
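	// The loads below fetch AAD bytes 0-7 and 5-12; shifting the second
	// qword right by 24 bits leaves bytes 8-12, so acc0|acc1 holds exactly
	// the 13 bytes and acc2 the Poly1305 pad bit, with no byte loop needed.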
	MOVQ (adp), acc0
	MOVQ 5(adp), acc1
	SHRQ $24, acc1
	MOVQ $1, acc2
	polyMul
	RET

hashADLoop:
	// Hash in 16 byte chunks
	CMPQ itr2, $16
	JB   hashADTail
	polyAdd(0(adp))
	LEAQ (1*16)(adp), adp
	SUBQ $16, itr2
	polyMul
	JMP  hashADLoop

hashADTail:
	CMPQ itr2, $0
	JE   hashADDone

	// Hash last < 16 byte tail
	XORQ t0, t0
	XORQ t1, t1
	XORQ t2, t2
	ADDQ itr2, adp

hashADTailLoop:
	SHLQ $8, t0, t1
	SHLQ $8, t0
	MOVB -1(adp), t2
	XORQ t2, t0
	DECQ adp
	DECQ itr2
	JNE  hashADTailLoop

hashADTailFinish:
	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	// Finished AD
hashADDone:
	RET

// ----------------------------------------------------------------------------
// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
TEXT ·chacha20Poly1305Open(SB), 0, $288-97
	// For aligned stack access
	MOVQ SP, BP
	ADDQ $32, BP
	ANDQ $-32, BP
	MOVQ dst+0(FP), oup
	MOVQ key+24(FP), keyp
	MOVQ src+48(FP), inp
	MOVQ src_len+56(FP), inl
	MOVQ ad+72(FP), adp

	// Check for AVX2 support
	CMPB ·useAVX2(SB), $1
	JE   chacha20Poly1305Open_AVX2

	// Special optimization for very short buffers
	CMPQ inl, $128
	JBE  openSSE128 // About 16% faster

	// For long buffers, prepare the poly key first
	MOVOU ·chacha20Constants<>(SB), A0
	MOVOU (1*16)(keyp), B0
	MOVOU (2*16)(keyp), C0
	MOVOU (3*16)(keyp), D0
	MOVO  D0, T1

	// Store state on stack for future use
	MOVO B0, state1Store
	MOVO C0, state2Store
	MOVO D0, ctr3Store
	MOVQ $10, itr2

openSSEPreparePolyKey:
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	DECQ itr2
	JNE  openSSEPreparePolyKey

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0

	// Clamp and store the key
	PAND ·polyClampMask<>(SB), A0
	MOVO A0, rStore; MOVO B0, sStore
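	// As specified in RFC 8439, r (clamped, in A0) and s (in B0) are the
	// first 32 bytes of the keystream block generated with the initial
	// counter; the rest of that block is simply discarded.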

	// Hash AAD
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openSSEMainLoop:
	CMPQ inl, $256
	JB   openSSEMainLoopDone

	// Load state, increment counter blocks
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

	// Store counters
	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store

	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
	MOVQ $4, itr1
	MOVQ inp, itr2

openSSEInternalLoop:
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyAdd(0(itr2))
	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
	polyMulStage1
	polyMulStage2
	LEAQ (2*8)(itr2), itr2
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	polyMulStage3
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyMulReduceStage
	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
	DECQ itr1
	JGE  openSSEInternalLoop

	polyAdd(0(itr2))
	polyMul
	LEAQ (2*8)(itr2), itr2

	CMPQ itr1, $-6
	JG   openSSEInternalLoop

	// Add in the state
	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3

	// Load - xor - store
	MOVO  D3, tmpStore
	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
	LEAQ 256(inp), inp
	LEAQ 256(oup), oup
	SUBQ $256, inl
	JMP  openSSEMainLoop

openSSEMainLoopDone:
	// Handle the various tail sizes efficiently
	TESTQ inl, inl
	JE    openSSEFinalize
	CMPQ  inl, $64
	JBE   openSSETail64
	CMPQ  inl, $128
	JBE   openSSETail128
	CMPQ  inl, $192
	JBE   openSSETail192
	JMP   openSSETail256

openSSEFinalize:
	// Hash in the PT, AAD lengths
	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
	polyMul

	// Final reduce
	MOVQ    acc0, t0
	MOVQ    acc1, t1
	MOVQ    acc2, t2
	SUBQ    $-5, acc0
	SBBQ    $-1, acc1
	SBBQ    $3, acc2
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1
	CMOVQCS t2, acc2
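	// The constants above implement acc - (2^130-5) as a three-word
	// subtraction: the 64-bit words of 2^130-5 are 0xFF..FB (-5), 0xFF..FF
	// (-1) and 3. If the subtraction borrows (CF set), acc was already
	// fully reduced and the CMOVs restore the saved copy; otherwise the
	// difference is the canonical result.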

	// Add in the "s" part of the key
	ADDQ 0+sStore, acc0
	ADCQ 8+sStore, acc1

	// Finally, constant time compare to the tag at the end of the message
	XORQ    AX, AX
	MOVQ    $1, DX
	XORQ    (0*8)(inp), acc0
	XORQ    (1*8)(inp), acc1
	ORQ     acc1, acc0
	CMOVQEQ DX, AX

	// Return true iff tags are equal
	MOVB AX, ret+96(FP)
	RET

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 129 bytes
openSSE128:
	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
	MOVQ $10, itr2

openSSE128InnerCipherLoop:
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Left; shiftB1Left; shiftB2Left
	shiftC0Left; shiftC1Left; shiftC2Left
	shiftD0Left; shiftD1Left; shiftD2Left
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Right; shiftB1Right; shiftB2Right
	shiftC0Right; shiftC1Right; shiftC2Right
	shiftD0Right; shiftD1Right; shiftD2Right
	DECQ itr2
	JNE  openSSE128InnerCipherLoop

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
	PADDL T2, C1; PADDL T2, C2
	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2

	// Clamp and store the key
	PAND ·polyClampMask<>(SB), A0
	MOVOU A0, rStore; MOVOU B0, sStore

	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openSSE128Open:
	CMPQ inl, $16
	JB   openSSETail16
	SUBQ $16, inl

	// Load for hashing
	polyAdd(0(inp))

	// Load for decryption
	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
	LEAQ (1*16)(inp), inp
	LEAQ (1*16)(oup), oup
	polyMul

	// Shift the stream "left"
	MOVO B1, A1
	MOVO C1, B1
	MOVO D1, C1
	MOVO A2, D1
	MOVO B2, A2
	MOVO C2, B2
	MOVO D2, C2
	JMP  openSSE128Open

openSSETail16:
	TESTQ inl, inl
	JE    openSSEFinalize

	// We can safely load the CT from the end, because it is padded with the MAC
	MOVQ inl, itr2
	SHLQ $4, itr2
	LEAQ ·andMask<>(SB), t0
	MOVOU (inp), T0
	ADDQ inl, inp
	PAND -16(t0)(itr2*1), T0
	MOVO T0, 0+tmpStore
	MOVQ T0, t0
	MOVQ 8+tmpStore, t1
	PXOR A1, T0

	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
	MOVQ T0, t3
	MOVB t3, (oup)
	PSRLDQ $1, T0
	INCQ oup
	DECQ inl
	JNE  openSSETail16Store
	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul
	JMP  openSSEFinalize

// ----------------------------------------------------------------------------
// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
	// Need to decrypt up to 64 bytes - prepare single block
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
	XORQ itr2, itr2
	MOVQ inl, itr1
	CMPQ itr1, $16
	JB   openSSETail64LoopB

openSSETail64LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMul
	SUBQ $16, itr1

openSSETail64LoopB:
	ADDQ $16, itr2
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Right; shiftC0Right; shiftD0Right

	CMPQ itr1, $16
	JAE  openSSETail64LoopA

	CMPQ itr2, $160
	JNE  openSSETail64LoopB

	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0

openSSETail64DecLoop:
	CMPQ inl, $16
	JB   openSSETail64DecLoopDone
	SUBQ $16, inl
	MOVOU (inp), T0
	PXOR  T0, A0
	MOVOU A0, (oup)
	LEAQ 16(inp), inp
	LEAQ 16(oup), oup
	MOVO B0, A0
	MOVO C0, B0
	MOVO D0, C0
	JMP  openSSETail64DecLoop

openSSETail64DecLoopDone:
	MOVO A0, A1
	JMP  openSSETail16

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
	XORQ itr2, itr2
	MOVQ inl, itr1
	ANDQ $-16, itr1

openSSETail128LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMul

openSSETail128LoopB:
	ADDQ $16, itr2
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	shiftB1Left; shiftC1Left; shiftD1Left
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	shiftB1Right; shiftC1Right; shiftD1Right

	CMPQ itr2, itr1
	JB   openSSETail128LoopA

	CMPQ itr2, $160
	JNE  openSSETail128LoopB

	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
	PADDL state1Store, B0; PADDL state1Store, B1
	PADDL state2Store, C0; PADDL state2Store, C1
	PADDL ctr1Store, D0; PADDL ctr0Store, D1

	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
	PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)

	SUBQ $64, inl
	LEAQ 64(inp), inp
	LEAQ 64(oup), oup
	JMP  openSSETail64DecLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 192 bytes of ciphertext
openSSETail192:
	// Need to decrypt up to 192 bytes - prepare three blocks
	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store

	MOVQ inl, itr1
	MOVQ $160, itr2
	CMPQ itr1, $160
	CMOVQGT itr2, itr1
	ANDQ $-16, itr1
	XORQ itr2, itr2

openSSETail192LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMul

openSSETail192LoopB:
	ADDQ $16, itr2
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	shiftB1Left; shiftC1Left; shiftD1Left
	shiftB2Left; shiftC2Left; shiftD2Left

	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	shiftB1Right; shiftC1Right; shiftD1Right
	shiftB2Right; shiftC2Right; shiftD2Right

	CMPQ itr2, itr1
	JB   openSSETail192LoopA

	CMPQ itr2, $160
	JNE  openSSETail192LoopB

	CMPQ inl, $176
	JB   openSSETail192Store

	polyAdd(160(inp))
	polyMul

	CMPQ inl, $192
	JB   openSSETail192Store

	polyAdd(176(inp))
	polyMul

openSSETail192Store:
	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2

	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
	PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)

	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
	PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)

	SUBQ $128, inl
	LEAQ 128(inp), inp
	LEAQ 128(oup), oup
	JMP  openSSETail64DecLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of ciphertext
openSSETail256:
	// Need to decrypt up to 256 bytes - prepare four blocks
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

	// Store counters
	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
	XORQ itr2, itr2

openSSETail256Loop:
	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
	polyAdd(0(inp)(itr2*1))
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
	polyMulStage1
	polyMulStage2
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyMulStage3
	polyMulReduceStage
	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
	ADDQ $2*8, itr2
	CMPQ itr2, $160
	JB   openSSETail256Loop
	MOVQ inl, itr1
	ANDQ $-16, itr1

openSSETail256HashLoop:
	polyAdd(0(inp)(itr2*1))
	polyMul
	ADDQ $2*8, itr2
	CMPQ itr2, itr1
	JB   openSSETail256HashLoop

	// Add in the state
	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
	MOVO  D3, tmpStore

	// Load - xor - store
	MOVOU (0*16)(inp), D3; PXOR D3, A0
	MOVOU (1*16)(inp), D3; PXOR D3, B0
	MOVOU (2*16)(inp), D3; PXOR D3, C0
	MOVOU (3*16)(inp), D3; PXOR D3, D0
	MOVOU A0, (0*16)(oup)
	MOVOU B0, (1*16)(oup)
	MOVOU C0, (2*16)(oup)
	MOVOU D0, (3*16)(oup)
	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
	PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
	PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
	LEAQ 192(inp), inp
	LEAQ 192(oup), oup
	SUBQ $192, inl
	MOVO A3, A0
	MOVO B3, B0
	MOVO C3, C0
	MOVO tmpStore, D0

	JMP openSSETail64DecLoop

// ----------------------------------------------------------------------------
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Open_AVX2:
	VZEROUPPER
	VMOVDQU ·chacha20Constants<>(SB), AA0
	BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
	BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
	BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
	VPADDD ·avx2InitMask<>(SB), DD0, DD0
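	// From here on each YMM register holds one row of two consecutive
	// ChaCha20 blocks, one block per 128-bit lane. The broadcasts above
	// replicate the state into both lanes, and avx2InitMask bumps the
	// counter in the high lane only, so DD0 covers counters n and n+1.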

	// Special optimization for very short buffers
	CMPQ inl, $192
	JBE  openAVX2192
	CMPQ inl, $320
	JBE  openAVX2320

	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA BB0, state1StoreAVX2
	VMOVDQA CC0, state2StoreAVX2
	VMOVDQA DD0, ctr3StoreAVX2
	MOVQ $10, itr2

openAVX2PreparePolyKey:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
	DECQ itr2
	JNE  openAVX2PreparePolyKey

	VPADDD ·chacha20Constants<>(SB), AA0, AA0
	VPADDD state1StoreAVX2, BB0, BB0
	VPADDD state2StoreAVX2, CC0, CC0
	VPADDD ctr3StoreAVX2, DD0, DD0

	VPERM2I128 $0x02, AA0, BB0, TT0

	// Clamp and store poly key
	VPAND   ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for the first 64 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0

	// Hash AD + first 64 bytes
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)
	XORQ itr1, itr1

openAVX2InitialHash64:
	polyAdd(0(inp)(itr1*1))
	polyMulAVX2
	ADDQ $16, itr1
	CMPQ itr1, $64
	JNE  openAVX2InitialHash64

	// Decrypt the first 64 bytes
	VPXOR   (0*32)(inp), AA0, AA0
	VPXOR   (1*32)(inp), BB0, BB0
	VMOVDQU AA0, (0*32)(oup)
	VMOVDQU BB0, (1*32)(oup)
	LEAQ (2*32)(inp), inp
	LEAQ (2*32)(oup), oup
	SUBQ $64, inl

openAVX2MainLoop:
	CMPQ inl, $512
	JB   openAVX2MainLoopDone

	// Load state, increment counter blocks, store the incremented counters
	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
	XORQ itr1, itr1

openAVX2InternalLoop:
	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
	polyAdd(0*8(inp)(itr1*1))
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage1_AVX2
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulStage2_AVX2
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyMulStage3_AVX2
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulReduceStage
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	polyAdd(2*8(inp)(itr1*1))
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage1_AVX2
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulStage2_AVX2
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage3_AVX2
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulReduceStage
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(4*8(inp)(itr1*1))
	LEAQ (6*8)(itr1), itr1
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulStage1_AVX2
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	polyMulStage2_AVX2
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage3_AVX2
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulReduceStage
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
	CMPQ itr1, $480
	JNE  openAVX2InternalLoop

	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	polyAdd(480(inp))
	polyMulAVX2
	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
	VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
	VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	// and here
	polyAdd(496(inp))
	polyMulAVX2
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
	VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
	VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
	LEAQ (32*16)(inp), inp
	LEAQ (32*16)(oup), oup
	SUBQ $(32*16), inl
	JMP  openAVX2MainLoop

openAVX2MainLoopDone:
	// Handle the various tail sizes efficiently
	TESTQ inl, inl
	JE    openSSEFinalize
	CMPQ  inl, $128
	JBE   openAVX2Tail128
	CMPQ  inl, $256
	JBE   openAVX2Tail256
	CMPQ  inl, $384
	JBE   openAVX2Tail384
	JMP   openAVX2Tail512

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 193 bytes
openAVX2192:
	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
	VMOVDQA AA0, AA1
	VMOVDQA BB0, BB1
	VMOVDQA CC0, CC1
	VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA AA0, AA2
	VMOVDQA BB0, BB2
	VMOVDQA CC0, CC2
	VMOVDQA DD0, DD2
	VMOVDQA DD1, TT3
	MOVQ $10, itr2

openAVX2192InnerCipherLoop:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	DECQ itr2
	JNE  openAVX2192InnerCipherLoop
	VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
	VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
	VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
	VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
	VPERM2I128 $0x02, AA0, BB0, TT0

	// Clamp and store poly key
	VPAND   ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for up to 192 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1

openAVX2ShortOpen:
	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openAVX2ShortOpenLoop:
	CMPQ inl, $32
	JB   openAVX2ShortTail32
	SUBQ $32, inl

	// Load for hashing
	polyAdd(0*8(inp))
	polyMulAVX2
	polyAdd(2*8(inp))
	polyMulAVX2

	// Load for decryption
	VPXOR   (inp), AA0, AA0
	VMOVDQU AA0, (oup)
	LEAQ (1*32)(inp), inp
	LEAQ (1*32)(oup), oup

	// Shift stream left
	VMOVDQA BB0, AA0
	VMOVDQA CC0, BB0
	VMOVDQA DD0, CC0
	VMOVDQA AA1, DD0
	VMOVDQA BB1, AA1
	VMOVDQA CC1, BB1
	VMOVDQA DD1, CC1
	VMOVDQA AA2, DD1
	VMOVDQA BB2, AA2
	JMP  openAVX2ShortOpenLoop

openAVX2ShortTail32:
	CMPQ inl, $16
	VMOVDQA A0, A1
	JB   openAVX2ShortDone

	SUBQ $16, inl

	// Load for hashing
	polyAdd(0*8(inp))
	polyMulAVX2

	// Load for decryption
	VPXOR   (inp), A0, T0
	VMOVDQU T0, (oup)
	LEAQ (1*16)(inp), inp
	LEAQ (1*16)(oup), oup
	VPERM2I128 $0x11, AA0, AA0, AA0
	VMOVDQA A0, A1

openAVX2ShortDone:
	VZEROUPPER
	JMP  openSSETail16

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 321 bytes
openAVX2320:
	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
	MOVQ $10, itr2

openAVX2320InnerCipherLoop:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
	DECQ itr2
	JNE  openAVX2320InnerCipherLoop

	VMOVDQA ·chacha20Constants<>(SB), TT0
	VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
	VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
	VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
	VMOVDQA ·avx2IncMask<>(SB), TT0
	VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
	VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
	VPADDD TT3, DD2, DD2

	// Clamp and store poly key
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPAND   ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1
	VPERM2I128 $0x02, AA2, BB2, CC1
	VPERM2I128 $0x02, CC2, DD2, DD1
	VPERM2I128 $0x13, AA2, BB2, AA2
	VPERM2I128 $0x13, CC2, DD2, BB2
	JMP  openAVX2ShortOpen

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
	VMOVDQA ·chacha20Constants<>(SB), AA1
	VMOVDQA state1StoreAVX2, BB1
	VMOVDQA state2StoreAVX2, CC1
	VMOVDQA ctr3StoreAVX2, DD1
	VPADDD ·avx2IncMask<>(SB), DD1, DD1
	VMOVDQA DD1, DD0

	XORQ itr2, itr2
	MOVQ inl, itr1
	ANDQ $-16, itr1
	TESTQ itr1, itr1
	JE   openAVX2Tail128LoopB

openAVX2Tail128LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMulAVX2

openAVX2Tail128LoopB:
	ADDQ $16, itr2
	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD1, DD1, DD1
	CMPQ itr2, itr1
	JB   openAVX2Tail128LoopA
	CMPQ itr2, $160
	JNE  openAVX2Tail128LoopB

	VPADDD ·chacha20Constants<>(SB), AA1, AA1
	VPADDD state1StoreAVX2, BB1, BB1
	VPADDD state2StoreAVX2, CC1, CC1
	VPADDD DD0, DD1, DD1
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0

openAVX2TailLoop:
	CMPQ inl, $32
	JB   openAVX2Tail
	SUBQ $32, inl

	// Load for decryption
	VPXOR   (inp), AA0, AA0
	VMOVDQU AA0, (oup)
	LEAQ (1*32)(inp), inp
	LEAQ (1*32)(oup), oup
	VMOVDQA BB0, AA0
	VMOVDQA CC0, BB0
	VMOVDQA DD0, CC0
	JMP  openAVX2TailLoop

openAVX2Tail:
	CMPQ inl, $16
	VMOVDQA A0, A1
	JB   openAVX2TailDone
	SUBQ $16, inl

	// Load for decryption
	VPXOR   (inp), A0, T0
	VMOVDQU T0, (oup)
	LEAQ (1*16)(inp), inp
	LEAQ (1*16)(oup), oup
	VPERM2I128 $0x11, AA0, AA0, AA0
	VMOVDQA A0, A1

openAVX2TailDone:
	VZEROUPPER
	JMP  openSSETail16

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of ciphertext
openAVX2Tail256:
	// Need to decrypt up to 256 bytes - prepare four blocks
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD ·avx2IncMask<>(SB), DD0, DD0
	VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA DD0, TT1
	VMOVDQA DD1, TT2

	// Compute the number of iterations that will hash data
	MOVQ inl, tmpStoreAVX2
	MOVQ inl, itr1
	SUBQ $128, itr1
	SHRQ $4, itr1
	MOVQ $10, itr2
	CMPQ itr1, $10
	CMOVQGT itr2, itr1
	MOVQ inp, inl
	XORQ itr2, itr2
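	// itr1 = min((inl-128)/16, 10) is the number of 16-byte Poly1305 blocks
	// that can be hashed while the ChaCha rounds run, capped at the 10
	// double-round iterations; inl itself was saved to tmpStoreAVX2 above
	// and is reused as the hash input pointer until the rounds finish.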
1218
1219 openAVX2Tail256LoopA:
1220 polyAdd(0(inl))
1221 polyMulAVX2
1222 LEAQ 16(inl), inl
1223
1224 // Perform ChaCha rounds, while hashing the remaining input
1225 openAVX2Tail256LoopB:
1226 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1227 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1228 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1229 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1230 INCQ itr2
1231 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1232 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1233 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1234 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1235 CMPQ itr2, itr1
1236 JB openAVX2Tail256LoopA
1237
1238 CMPQ itr2, $10
1239 JNE openAVX2Tail256LoopB
1240
1241 MOVQ inl, itr2
1242 SUBQ inp, inl
1243 MOVQ inl, itr1
1244 MOVQ tmpStoreAVX2, inl
1245
1246 // Hash the remainder of data (if any)
1247 openAVX2Tail256Hash:
1248 ADDQ $16, itr1
1249 CMPQ itr1, inl
1250 JGT openAVX2Tail256HashEnd
1251 polyAdd (0(itr2))
1252 polyMulAVX2
1253 LEAQ 16(itr2), itr2
1254 JMP openAVX2Tail256Hash
1255
1256 // Store 128 bytes safely, then go to store loop
1257 openAVX2Tail256HashEnd:
1258 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1259 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1260 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1261 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1262 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1263 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1264
1265 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1266 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1267 LEAQ (4*32)(inp), inp
1268 LEAQ (4*32)(oup), oup
1269 SUBQ $4*32, inl
1270
1271 JMP openAVX2TailLoop
1272
1273 // ----------------------------------------------------------------------------
1274 // Special optimization for the last 384 bytes of ciphertext
1275 openAVX2Tail384:
1276 // Need to decrypt up to 384 bytes - prepare six blocks
1277 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1278 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1279 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1280 VMOVDQA ctr3StoreAVX2, DD0
1281 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1282 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1283 VPADDD ·avx2IncMask<>(SB), DD1, DD2
1284 VMOVDQA DD0, ctr0StoreAVX2
1285 VMOVDQA DD1, ctr1StoreAVX2
1286 VMOVDQA DD2, ctr2StoreAVX2
1287
1288 // Compute the number of iterations that will hash two blocks of data
1289 MOVQ inl, tmpStoreAVX2
1290 MOVQ inl, itr1
1291 SUBQ $256, itr1
1292 SHRQ $4, itr1
1293 ADDQ $6, itr1
1294 MOVQ $10, itr2
1295 CMPQ itr1, $10
1296 CMOVQGT itr2, itr1
1297 MOVQ inp, inl
1298 XORQ itr2, itr2
1299
1300 // Perform ChaCha rounds, while hashing the remaining input
1301 openAVX2Tail384LoopB:
1302 polyAdd(0(inl))
1303 polyMulAVX2
1304 LEAQ 16(inl), inl
1305
1306 openAVX2Tail384LoopA:
1307 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1308 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1309 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1310 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1311 polyAdd(0(inl))
1312 polyMulAVX2
1313 LEAQ 16(inl), inl
1314 INCQ itr2
1315 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1316 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1317 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1318 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1319
1320 CMPQ itr2, itr1
1321 JB openAVX2Tail384LoopB
1322
1323 CMPQ itr2, $10
1324 JNE openAVX2Tail384LoopA
1325
1326 MOVQ inl, itr2
1327 SUBQ inp, inl
1328 MOVQ inl, itr1
1329 MOVQ tmpStoreAVX2, inl
1330
1331 openAVX2Tail384Hash:
1332 ADDQ $16, itr1
1333 CMPQ itr1, inl
1334 JGT openAVX2Tail384HashEnd
1335 polyAdd(0(itr2))
1336 polyMulAVX2
1337 LEAQ 16(itr2), itr2
1338 JMP openAVX2Tail384Hash
1339
1340 // Store 256 bytes safely, then go to store loop
1341 openAVX2Tail384HashEnd:
1342 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1343 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1344 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1345 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1346 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1347 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1348 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1349 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1350 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1351 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1352 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1353 LEAQ (8*32)(inp), inp
1354 LEAQ (8*32)(oup), oup
1355 SUBQ $8*32, inl
1356 JMP openAVX2TailLoop
1357
1358 // ----------------------------------------------------------------------------
1359 // Special optimization for the last 512 bytes of ciphertext
1360 openAVX2Tail512:
1361 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1362 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1363 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1364 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1365 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1366 XORQ itr1, itr1
1367 MOVQ inp, itr2
1368
1369 openAVX2Tail512LoopB:
1370 polyAdd(0(itr2))
1371 polyMulAVX2
1372 LEAQ (2*8)(itr2), itr2
1373
1374 openAVX2Tail512LoopA:
1375 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1376 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1377 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1378 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1379 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1380 VMOVDQA CC3, tmpStoreAVX2
1381 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1382 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1383 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1384 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1385 VMOVDQA tmpStoreAVX2, CC3
1386 polyAdd(0*8(itr2))
1387 polyMulAVX2
1388 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1389 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1390 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1391 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1392 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1393 VMOVDQA CC3, tmpStoreAVX2
1394 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1395 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1396 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1397 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1398 VMOVDQA tmpStoreAVX2, CC3
1399 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1400 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1401 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(itr2))
polyMulAVX2
LEAQ (4*8)(itr2), itr2
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
INCQ itr1
CMPQ itr1, $4
JLT openAVX2Tail512LoopB

CMPQ itr1, $10
JNE openAVX2Tail512LoopA

MOVQ inl, itr1
SUBQ $384, itr1
ANDQ $-16, itr1

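// itr1 now holds the number of ciphertext bytes beyond the 384 already hashed
// above, rounded down to whole 16-byte Poly1305 blocks.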
openAVX2Tail512HashLoop:
TESTQ itr1, itr1
JE openAVX2Tail512HashEnd
polyAdd(0(itr2))
polyMulAVX2
LEAQ 16(itr2), itr2
SUBQ $16, itr1
JMP openAVX2Tail512HashLoop

openAVX2Tail512HashEnd:
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
VMOVDQA CC3, tmpStoreAVX2
VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
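// Each ymm register holds one row of two ChaCha20 blocks, one block per
// 128-bit lane; the VPERM2I128 shuffles with $0x02 and $0x13 gather the low
// and high lanes so the key stream is reassembled into contiguous 64-byte
// blocks before being XORed with the input.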

LEAQ (12*32)(inp), inp
LEAQ (12*32)(oup), oup
SUBQ $12*32, inl

JMP openAVX2TailLoop

// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// func chacha20Poly1305Seal(dst, key, src, ad []byte)
TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
// For aligned stack access
MOVQ SP, BP
ADDQ $32, BP
ANDQ $-32, BP
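// BP now points at the lowest 32-byte aligned address within the frame, so
// the aligned MOVO/VMOVDQA stores into the stack slots below are safe.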
MOVQ dst+0(FP), oup
MOVQ key+24(FP), keyp
MOVQ src+48(FP), inp
MOVQ src_len+56(FP), inl
MOVQ ad+72(FP), adp

CMPB ·useAVX2(SB), $1
JE chacha20Poly1305Seal_AVX2

// Special optimization for very short buffers
CMPQ inl, $128
JBE sealSSE128 // About 15% faster

// In the seal case - prepare the poly key and 3 blocks of stream in the first iteration
MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0

// Store state on stack for future use
MOVO B0, state1Store
MOVO C0, state2Store

// Load state, increment counter blocks
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
MOVQ $10, itr2

sealSSEIntroLoop:
MOVO C3, tmpStore
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
MOVO tmpStore, C3
MOVO C1, tmpStore
chachaQR(A3, B3, C3, D3, C1)
MOVO tmpStore, C1
shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left

MOVO C3, tmpStore
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
MOVO tmpStore, C3
MOVO C1, tmpStore
chachaQR(A3, B3, C3, D3, C1)
MOVO tmpStore, C1
shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
DECQ itr2
JNE sealSSEIntroLoop

// Add in the state
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3

// Clamp and store the key
PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore
MOVO B0, sStore
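// A0 now holds r, masked with the Poly1305 clamp so the multiplier has the
// bits the spec requires cleared; B0 holds s, the second half of the one-time
// key, which is only added back in when the final tag is computed.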

// Hash AAD
MOVQ ad_len+80(FP), itr2
CALL polyHashADInternal<>(SB)

MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)

MOVQ $128, itr1
SUBQ $128, inl
LEAQ 128(inp), inp

MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1

CMPQ inl, $64
JBE sealSSE128SealHash

MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)

ADDQ $64, itr1
SUBQ $64, inl
LEAQ 64(inp), inp

MOVQ $2, itr1
MOVQ $8, itr2

CMPQ inl, $64
JBE sealSSETail64
CMPQ inl, $128
JBE sealSSETail128
CMPQ inl, $192
JBE sealSSETail192

sealSSEMainLoop:
// Load state, increment counter blocks
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store

sealSSEInnerLoop:
MOVO C3, tmpStore
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
MOVO tmpStore, C3
MOVO C1, tmpStore
chachaQR(A3, B3, C3, D3, C1)
MOVO tmpStore, C1
polyAdd(0(oup))
shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
polyMulStage1
polyMulStage2
LEAQ (2*8)(oup), oup
MOVO C3, tmpStore
chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
MOVO tmpStore, C3
MOVO C1, tmpStore
polyMulStage3
chachaQR(A3, B3, C3, D3, C1)
MOVO tmpStore, C1
polyMulReduceStage
shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
DECQ itr2
JGE sealSSEInnerLoop
polyAdd(0(oup))
polyMul
LEAQ (2*8)(oup), oup
DECQ itr1
JG sealSSEInnerLoop
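// The nested counters interleave the work: each pass of the inner loop runs
// one ChaCha20 double round and hashes 16 bytes of previously written
// ciphertext, and the outer counter adds the extra passes needed to reach the
// full 20 rounds while hashing the rest (192 or 256 bytes in total).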

// Add in the state
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
MOVO D3, tmpStore

// Load - xor - store
MOVOU (0*16)(inp), D3; PXOR D3, A0
MOVOU (1*16)(inp), D3; PXOR D3, B0
MOVOU (2*16)(inp), D3; PXOR D3, C0
MOVOU (3*16)(inp), D3; PXOR D3, D0
MOVOU A0, (0*16)(oup)
MOVOU B0, (1*16)(oup)
MOVOU C0, (2*16)(oup)
MOVOU D0, (3*16)(oup)
MOVO tmpStore, D3

MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
ADDQ $192, inp
MOVQ $192, itr1
SUBQ $192, inl
MOVO A3, A1
MOVO B3, B1
MOVO C3, C1
MOVO D3, D1
CMPQ inl, $64
JBE sealSSE128SealHash
MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
LEAQ 64(inp), inp
SUBQ $64, inl
MOVQ $6, itr1
MOVQ $4, itr2
CMPQ inl, $192
JG sealSSEMainLoop

MOVQ inl, itr1
TESTQ inl, inl
JE sealSSE128SealHash
MOVQ $6, itr1
CMPQ inl, $64
JBE sealSSETail64
CMPQ inl, $128
JBE sealSSETail128
JMP sealSSETail192

// ----------------------------------------------------------------------------
// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
MOVO ·chacha20Constants<>(SB), A1
MOVO state1Store, B1
MOVO state2Store, C1
MOVO ctr3Store, D1
PADDL ·sseIncMask<>(SB), D1
MOVO D1, ctr0Store

sealSSETail64LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealSSETail64LoopB:
chachaQR(A1, B1, C1, D1, T1)
shiftB1Left; shiftC1Left; shiftD1Left
chachaQR(A1, B1, C1, D1, T1)
shiftB1Right; shiftC1Right; shiftD1Right
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

DECQ itr1
JG sealSSETail64LoopA

DECQ itr2
JGE sealSSETail64LoopB
PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B1
PADDL state2Store, C1
PADDL ctr0Store, D1

JMP sealSSE128Seal

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store

sealSSETail128LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealSSETail128LoopB:
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
shiftB0Left; shiftC0Left; shiftD0Left
shiftB1Left; shiftC1Left; shiftD1Left
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
shiftB0Right; shiftC0Right; shiftD0Right
shiftB1Right; shiftC1Right; shiftD1Right

DECQ itr1
JG sealSSETail128LoopA

DECQ itr2
JGE sealSSETail128LoopB

PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr0Store, D0; PADDL ctr1Store, D1

MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)

MOVQ $64, itr1
LEAQ 64(inp), inp
SUBQ $64, inl

JMP sealSSE128SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 192 bytes of plaintext
sealSSETail192:
// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store

sealSSETail192LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealSSETail192LoopB:
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
shiftB0Left; shiftC0Left; shiftD0Left
shiftB1Left; shiftC1Left; shiftD1Left
shiftB2Left; shiftC2Left; shiftD2Left

polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
shiftB0Right; shiftC0Right; shiftD0Right
shiftB1Right; shiftC1Right; shiftD1Right
shiftB2Right; shiftC2Right; shiftD2Right

DECQ itr1
JG sealSSETail192LoopA

DECQ itr2
JGE sealSSETail192LoopB

PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2

MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)

MOVO A2, A1
MOVO B2, B1
MOVO C2, C1
MOVO D2, D1
MOVQ $128, itr1
LEAQ 128(inp), inp
SUBQ $128, inl

JMP sealSSE128SealHash

// ----------------------------------------------------------------------------
// Special seal optimization for buffers smaller than 129 bytes
sealSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2

sealSSE128InnerCipherLoop:
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
shiftB0Left; shiftB1Left; shiftB2Left
shiftC0Left; shiftC1Left; shiftC2Left
shiftD0Left; shiftD1Left; shiftD2Left
chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
shiftB0Right; shiftB1Right; shiftB2Right
shiftC0Right; shiftC1Right; shiftC2Right
shiftD0Right; shiftD1Right; shiftD2Right
DECQ itr2
JNE sealSSE128InnerCipherLoop

// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2
PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore
MOVOU B0, sStore

// Hash
MOVQ ad_len+80(FP), itr2
CALL polyHashADInternal<>(SB)
XORQ itr1, itr1

sealSSE128SealHash:
// itr1 holds the number of bytes encrypted but not yet hashed
CMPQ itr1, $16
JB sealSSE128Seal
polyAdd(0(oup))
polyMul

SUBQ $16, itr1
ADDQ $16, oup

JMP sealSSE128SealHash

sealSSE128Seal:
CMPQ inl, $16
JB sealSSETail
SUBQ $16, inl

// Load for encryption
MOVOU (inp), T0
PXOR T0, A1
MOVOU A1, (oup)
LEAQ (1*16)(inp), inp
LEAQ (1*16)(oup), oup

// Extract for hashing
MOVQ A1, t0
PSRLDQ $8, A1
MOVQ A1, t1
ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
polyMul
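// The 16 ciphertext bytes just written were folded into the Poly1305
// accumulator (acc0:acc1:acc2): the two quadwords were added and ADCQ $1 set
// the padding bit above the 128-bit block, as required for full blocks.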

// Shift the stream "left"
MOVO B1, A1
MOVO C1, B1
MOVO D1, C1
MOVO A2, D1
MOVO B2, A2
MOVO C2, B2
MOVO D2, C2
JMP sealSSE128Seal

sealSSETail:
TESTQ inl, inl
JE sealSSEFinalize

// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
MOVQ inl, itr2
SHLQ $4, itr2
LEAQ ·andMask<>(SB), t0
MOVQ inl, itr1
LEAQ -1(inp)(inl*1), inp
XORQ t2, t2
XORQ t3, t3
XORQ AX, AX

sealSSETailLoadLoop:
SHLQ $8, t2, t3
SHLQ $8, t2
MOVB (inp), AX
XORQ AX, t2
LEAQ -1(inp), inp
DECQ itr1
JNE sealSSETailLoadLoop
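// t3:t2 now hold the trailing 1..15 plaintext bytes: they were read backwards
// from the last byte and shifted in from the low end, so no load ever touches
// memory past the end of the input buffer.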
MOVQ t2, 0+tmpStore
MOVQ t3, 8+tmpStore
PXOR 0+tmpStore, A1
MOVOU A1, (oup)
MOVOU -16(t0)(itr2*1), T0
PAND T0, A1
MOVQ A1, t0
PSRLDQ $8, A1
MOVQ A1, t1
ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
polyMul

ADDQ inl, oup

sealSSEFinalize:
// Hash in the buffer lengths
ADDQ ad_len+80(FP), acc0
ADCQ src_len+56(FP), acc1
ADCQ $1, acc2
polyMul

// Final reduce
MOVQ acc0, t0
MOVQ acc1, t1
MOVQ acc2, t2
SUBQ $-5, acc0
SBBQ $-1, acc1
SBBQ $3, acc2
CMOVQCS t0, acc0
CMOVQCS t1, acc1
CMOVQCS t2, acc2
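// The SUBQ/SBBQ chain computes acc - (2^130 - 5); CMOVQCS keeps the saved
// copy of the accumulator whenever the subtraction borrows, i.e. whenever acc
// was already fully reduced below the prime.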

// Add in the "s" part of the key
ADDQ 0+sStore, acc0
ADCQ 8+sStore, acc1

// Finally store the tag at the end of the message
MOVQ acc0, (0*8)(oup)
MOVQ acc1, (1*8)(oup)
RET

// ----------------------------------------------------------------------------
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
VZEROUPPER
VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
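// The BYTE sequences above are hand-assembled VBROADCASTI128 instructions,
// presumably emitted as raw bytes because the assembler lacked the mnemonic
// when this was written; each copies 16 bytes of key/nonce material into both
// 128-bit lanes of its destination register.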
VPADDD ·avx2InitMask<>(SB), DD0, DD0

// Special optimizations for very short buffers
CMPQ inl, $192
JBE seal192AVX2 // 33% faster
CMPQ inl, $320
JBE seal320AVX2 // 17% faster

// For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr2

sealAVX2IntroLoop:
VMOVDQA CC3, tmpStoreAVX2
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
VMOVDQA tmpStoreAVX2, CC3
VMOVDQA CC1, tmpStoreAVX2
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
VMOVDQA tmpStoreAVX2, CC1

VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3

VMOVDQA CC3, tmpStoreAVX2
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
VMOVDQA tmpStoreAVX2, CC3
VMOVDQA CC1, tmpStoreAVX2
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
VMOVDQA tmpStoreAVX2, CC1

VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
DECQ itr2
JNE sealAVX2IntroLoop

VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3

VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95

// Clamp and store poly key
VPAND ·polyClampMask<>(SB), DD0, DD0
VMOVDQA DD0, rsStoreAVX2

// Hash AD
MOVQ ad_len+80(FP), itr2
CALL polyHashADInternal<>(SB)

// Can store at least 320 bytes
VPXOR (0*32)(inp), AA0, AA0
VPXOR (1*32)(inp), CC0, CC0
VMOVDQU AA0, (0*32)(oup)
VMOVDQU CC0, (1*32)(oup)

VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)

MOVQ $320, itr1
SUBQ $320, inl
LEAQ 320(inp), inp

VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
CMPQ inl, $128
JBE sealAVX2SealHash

VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
SUBQ $128, inl
LEAQ 128(inp), inp

MOVQ $8, itr1
MOVQ $2, itr2

CMPQ inl, $128
JBE sealAVX2Tail128
CMPQ inl, $256
JBE sealAVX2Tail256
CMPQ inl, $384
JBE sealAVX2Tail384
CMPQ inl, $512
JBE sealAVX2Tail512

// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2

VMOVDQA CC3, tmpStoreAVX2
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
VMOVDQA tmpStoreAVX2, CC3
VMOVDQA CC1, tmpStoreAVX2
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
VMOVDQA tmpStoreAVX2, CC1

VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3

VMOVDQA CC3, tmpStoreAVX2
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
VMOVDQA tmpStoreAVX2, CC3
VMOVDQA CC1, tmpStoreAVX2
chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
VMOVDQA tmpStoreAVX2, CC1

VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3

SUBQ $16, oup // Adjust the pointer
MOVQ $9, itr1
JMP sealAVX2InternalLoopStart
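// The first iteration was partially unrolled above (one full double round
// plus the first half of the next quarter round), so the loop is entered
// midway with itr1 set to 9; oup is pulled back 16 bytes because only 448
// bytes of ciphertext are pending, one 16-byte hash step fewer than a full
// loop iteration expects.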

sealAVX2MainLoop:
// Load state, increment counter blocks, store the incremented counters
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr1

sealAVX2InternalLoop:
polyAdd(0*8(oup))
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyMulStage3_AVX2
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyMulReduceStage

sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(oup))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyMulStage2_AVX2
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(4*8(oup))
LEAQ (6*8)(oup), oup
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyMulStage1_AVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyMulReduceStage
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
DECQ itr1
JNE sealAVX2InternalLoop

VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
VMOVDQA CC3, tmpStoreAVX2

// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
polyAdd(0*8(oup))
polyMulAVX2
LEAQ (4*8)(oup), oup
VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

// and here
polyAdd(-2*8(oup))
polyMulAVX2
VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
LEAQ (32*16)(inp), inp
SUBQ $(32*16), inl
CMPQ inl, $512
JG sealAVX2MainLoop

// Tail can only hash 480 bytes
polyAdd(0*8(oup))
polyMulAVX2
polyAdd(2*8(oup))
polyMulAVX2
LEAQ 32(oup), oup

MOVQ $10, itr1
MOVQ $0, itr2
CMPQ inl, $128
JBE sealAVX2Tail128
CMPQ inl, $256
JBE sealAVX2Tail256
CMPQ inl, $384
JBE sealAVX2Tail384
JMP sealAVX2Tail512

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 193 bytes
seal192AVX2:
// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
VMOVDQA AA0, AA1
VMOVDQA BB0, BB1
VMOVDQA CC0, CC1
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2
VMOVDQA BB0, BB2
VMOVDQA CC0, CC2
VMOVDQA DD0, DD2
VMOVDQA DD1, TT3
MOVQ $10, itr2

sealAVX2192InnerCipherLoop:
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
DECQ itr2
JNE sealAVX2192InnerCipherLoop
VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
VPERM2I128 $0x02, AA0, BB0, TT0

// Clamp and store poly key
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2

// Stream for up to 192 bytes
VPERM2I128 $0x13, AA0, BB0, AA0
VPERM2I128 $0x13, CC0, DD0, BB0
VPERM2I128 $0x02, AA1, BB1, CC0
VPERM2I128 $0x02, CC1, DD1, DD0
VPERM2I128 $0x13, AA1, BB1, AA1
VPERM2I128 $0x13, CC1, DD1, BB1

sealAVX2ShortSeal:
// Hash aad
MOVQ ad_len+80(FP), itr2
CALL polyHashADInternal<>(SB)
XORQ itr1, itr1

sealAVX2SealHash:
// itr1 holds the number of bytes encrypted but not yet hashed
CMPQ itr1, $16
JB sealAVX2ShortSealLoop
polyAdd(0(oup))
polyMul
SUBQ $16, itr1
ADDQ $16, oup
JMP sealAVX2SealHash

sealAVX2ShortSealLoop:
CMPQ inl, $32
JB sealAVX2ShortTail32
SUBQ $32, inl

// Load for encryption
VPXOR (inp), AA0, AA0
VMOVDQU AA0, (oup)
LEAQ (1*32)(inp), inp

// Now we can hash
polyAdd(0*8(oup))
polyMulAVX2
polyAdd(2*8(oup))
polyMulAVX2
LEAQ (1*32)(oup), oup

// Shift stream left
VMOVDQA BB0, AA0
VMOVDQA CC0, BB0
VMOVDQA DD0, CC0
VMOVDQA AA1, DD0
VMOVDQA BB1, AA1
VMOVDQA CC1, BB1
VMOVDQA DD1, CC1
VMOVDQA AA2, DD1
VMOVDQA BB2, AA2
JMP sealAVX2ShortSealLoop

sealAVX2ShortTail32:
CMPQ inl, $16
VMOVDQA A0, A1
JB sealAVX2ShortDone

SUBQ $16, inl

// Load for encryption
VPXOR (inp), A0, T0
VMOVDQU T0, (oup)
LEAQ (1*16)(inp), inp

// Hash
polyAdd(0*8(oup))
polyMulAVX2
LEAQ (1*16)(oup), oup
VPERM2I128 $0x11, AA0, AA0, AA0
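// VPERM2I128 $0x11 duplicates the high 128-bit lane into both lanes, moving
// the next 16 bytes of key stream into the low lane for the following pass.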
VMOVDQA A0, A1

sealAVX2ShortDone:
VZEROUPPER
JMP sealSSETail

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 321 bytes
seal320AVX2:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2

sealAVX2320InnerCipherLoop:
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
DECQ itr2
JNE sealAVX2320InnerCipherLoop

VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2

// Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2

// Stream for up to 320 bytes
VPERM2I128 $0x13, AA0, BB0, AA0
VPERM2I128 $0x13, CC0, DD0, BB0
VPERM2I128 $0x02, AA1, BB1, CC0
VPERM2I128 $0x02, CC1, DD1, DD0
VPERM2I128 $0x13, AA1, BB1, AA1
VPERM2I128 $0x13, CC1, DD1, BB1
VPERM2I128 $0x02, AA2, BB2, CC1
VPERM2I128 $0x02, CC2, DD2, DD1
VPERM2I128 $0x13, AA2, BB2, AA2
VPERM2I128 $0x13, CC2, DD2, BB2
JMP sealAVX2ShortSeal

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealAVX2Tail128:
// Need to encrypt up to 128 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA ·chacha20Constants<>(SB), AA0
VMOVDQA state1StoreAVX2, BB0
VMOVDQA state2StoreAVX2, CC0
VMOVDQA ctr3StoreAVX2, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0
VMOVDQA DD0, DD1

sealAVX2Tail128LoopA:
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealAVX2Tail128LoopB:
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
polyAdd(0(oup))
polyMul
VPALIGNR $4, BB0, BB0, BB0
VPALIGNR $8, CC0, CC0, CC0
VPALIGNR $12, DD0, DD0, DD0
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
polyAdd(16(oup))
polyMul
LEAQ 32(oup), oup
VPALIGNR $12, BB0, BB0, BB0
VPALIGNR $8, CC0, CC0, CC0
VPALIGNR $4, DD0, DD0, DD0
DECQ itr1
JG sealAVX2Tail128LoopA
DECQ itr2
JGE sealAVX2Tail128LoopB

VPADDD ·chacha20Constants<>(SB), AA0, AA1
VPADDD state1StoreAVX2, BB0, BB1
VPADDD state2StoreAVX2, CC0, CC1
VPADDD DD1, DD0, DD1

VPERM2I128 $0x02, AA1, BB1, AA0
VPERM2I128 $0x02, CC1, DD1, BB0
VPERM2I128 $0x13, AA1, BB1, CC0
VPERM2I128 $0x13, CC1, DD1, DD0
JMP sealAVX2ShortSealLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of plaintext
sealAVX2Tail256:
// Need to encrypt up to 256 bytes - prepare four blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1
VMOVDQA DD1, TT2

sealAVX2Tail256LoopA:
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealAVX2Tail256LoopB:
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
polyAdd(0(oup))
polyMul
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
polyAdd(16(oup))
polyMul
LEAQ 32(oup), oup
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
DECQ itr1
JG sealAVX2Tail256LoopA
DECQ itr2
JGE sealAVX2Tail256LoopB

VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
VPERM2I128 $0x02, AA0, BB0, TT0
VPERM2I128 $0x02, CC0, DD0, TT1
VPERM2I128 $0x13, AA0, BB0, TT2
VPERM2I128 $0x13, CC0, DD0, TT3
VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
MOVQ $128, itr1
LEAQ 128(inp), inp
SUBQ $128, inl
VPERM2I128 $0x02, AA1, BB1, AA0
VPERM2I128 $0x02, CC1, DD1, BB0
VPERM2I128 $0x13, AA1, BB1, CC0
VPERM2I128 $0x13, CC1, DD1, DD0

JMP sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 384 bytes of plaintext
sealAVX2Tail384:
// Need to encrypt up to 384 bytes - prepare six blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3

sealAVX2Tail384LoopA:
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealAVX2Tail384LoopB:
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
polyAdd(0(oup))
polyMul
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
polyAdd(16(oup))
polyMul
LEAQ 32(oup), oup
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
DECQ itr1
JG sealAVX2Tail384LoopA
DECQ itr2
JGE sealAVX2Tail384LoopB

VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
VPERM2I128 $0x02, AA0, BB0, TT0
VPERM2I128 $0x02, CC0, DD0, TT1
VPERM2I128 $0x13, AA0, BB0, TT2
VPERM2I128 $0x13, CC0, DD0, TT3
VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
VPERM2I128 $0x02, AA1, BB1, TT0
VPERM2I128 $0x02, CC1, DD1, TT1
VPERM2I128 $0x13, AA1, BB1, TT2
VPERM2I128 $0x13, CC1, DD1, TT3
VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
MOVQ $256, itr1
LEAQ 256(inp), inp
SUBQ $256, inl
VPERM2I128 $0x02, AA2, BB2, AA0
VPERM2I128 $0x02, CC2, DD2, BB0
VPERM2I128 $0x13, AA2, BB2, CC0
VPERM2I128 $0x13, CC2, DD2, DD0

JMP sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of plaintext
sealAVX2Tail512:
// Need to encrypt up to 512 bytes - prepare eight blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2

sealAVX2Tail512LoopA:
polyAdd(0(oup))
polyMul
LEAQ 16(oup), oup

sealAVX2Tail512LoopB:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
polyAdd(0*8(oup))
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(oup))
polyMulAVX2
LEAQ (4*8)(oup), oup
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
VMOVDQA tmpStoreAVX2, CC3
VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3

DECQ itr1
JG sealAVX2Tail512LoopA
DECQ itr2
JGE sealAVX2Tail512LoopB

VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
VMOVDQA CC3, tmpStoreAVX2
VPERM2I128 $0x02, AA0, BB0, CC3
VPXOR (0*32)(inp), CC3, CC3
VMOVDQU CC3, (0*32)(oup)
VPERM2I128 $0x02, CC0, DD0, CC3
VPXOR (1*32)(inp), CC3, CC3
VMOVDQU CC3, (1*32)(oup)
VPERM2I128 $0x13, AA0, BB0, CC3
VPXOR (2*32)(inp), CC3, CC3
VMOVDQU CC3, (2*32)(oup)
VPERM2I128 $0x13, CC0, DD0, CC3
VPXOR (3*32)(inp), CC3, CC3
VMOVDQU CC3, (3*32)(oup)

VPERM2I128 $0x02, AA1, BB1, AA0
VPERM2I128 $0x02, CC1, DD1, BB0
VPERM2I128 $0x13, AA1, BB1, CC0
VPERM2I128 $0x13, CC1, DD1, DD0
VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

VPERM2I128 $0x02, AA2, BB2, AA0
VPERM2I128 $0x02, CC2, DD2, BB0
VPERM2I128 $0x13, AA2, BB2, CC0
VPERM2I128 $0x13, CC2, DD2, DD0
VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)

MOVQ $384, itr1
LEAQ 384(inp), inp
SUBQ $384, inl
VPERM2I128 $0x02, AA3, BB3, AA0
VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
VPERM2I128 $0x13, AA3, BB3, CC0
VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0

JMP sealAVX2SealHash
