/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

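# ROT8 and ROT16 are pshufb masks that rotate each 32-bit word of an XMM
# register left by 8 and 16 bits respectively, by permuting the bytes within
# each word. CTRINC holds the lane increments 0..3 that are added to the
# block counter when four blocks are processed in parallel.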
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state
	# matrix into four SSE registers. It performs matrix operations on
	# four words in parallel, but requires shuffling to rearrange the
	# words after each round. 8/16-bit word rotation is done with the
	# slightly better performing SSSE3 byte shuffling; 7/12-bit word
	# rotation uses traditional shift+OR.

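	# For reference, the ChaCha20 quarter-round (RFC7539) on words
	# (a, b, c, d) is:
	#   a += b; d = rotl32(d ^ a, 16);
	#   c += d; b = rotl32(b ^ c, 12);
	#   a += b; d = rotl32(d ^ a,  8);
	#   c += d; b = rotl32(b ^ c,  7);
	# Each xmm register below holds one row of the 4x4 state, so one pass
	# of these operations performs four quarter-rounds at once: first on
	# the columns, then (after rotating the rows with pshufd) on the
	# diagonals. Ten iterations of the double round give the 20 rounds
	# required by RFC7539.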
	# x0..3 = s0..3
	movdqa 0x00(%rdi),%xmm0
	movdqa 0x10(%rdi),%xmm1
	movdqa 0x20(%rdi),%xmm2
	movdqa 0x30(%rdi),%xmm3
	movdqa %xmm0,%xmm8
	movdqa %xmm1,%xmm9
	movdqa %xmm2,%xmm10
	movdqa %xmm3,%xmm11

	movdqa ROT8(%rip),%xmm4
	movdqa ROT16(%rip),%xmm5

	mov $10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm3,%xmm3

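	# x1..x3 are now rotated within their rows, so the same column
	# operations below act on the diagonals of the original matrix.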
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm6
	pslld $12,%xmm6
	psrld $20,%xmm1
	por %xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm7
	pslld $7,%xmm7
	psrld $25,%xmm1
	por %xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm3,%xmm3
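	# Inverse shuffles: x1..x3 are back in column order for the next
	# iteration of the double round.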

	dec %ecx
	jnz .Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu 0x00(%rdx),%xmm4
	paddd %xmm8,%xmm0
	pxor %xmm4,%xmm0
	movdqu %xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu 0x10(%rdx),%xmm5
	paddd %xmm9,%xmm1
	pxor %xmm5,%xmm1
	movdqu %xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu 0x20(%rdx),%xmm6
	paddd %xmm10,%xmm2
	pxor %xmm6,%xmm2
	movdqu %xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu 0x30(%rdx),%xmm7
	paddd %xmm11,%xmm3
	pxor %xmm7,%xmm3
	movdqu %xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
	# word rotation is done with the slightly better performing SSSE3
	# byte shuffling; 7/12-bit word rotation uses traditional shift+OR.

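	# Register layout during the rounds: xmm4..xmm15 hold state words
	# 4..15, one register per word, with the four blocks in the four
	# 32-bit lanes. Words 0..3 live in the 0x00..0x30 stack slots so
	# that xmm0 stays free as scratch; xmm1..xmm3 hold CTRINC, ROT8 and
	# ROT16 while the loop runs.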
	mov %rsp,%r11
	sub $0x80,%rsp
	and $~63,%rsp
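	# The original stack pointer is saved in %r11; 0x80 bytes are
	# reserved and %rsp is aligned down to 64 bytes so the movdqa
	# spills of x0..x3 below use aligned slots.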

	# x0..15[0-3] = s0..3[0..3]
	movq 0x00(%rdi),%xmm1
	pshufd $0x00,%xmm1,%xmm0
	pshufd $0x55,%xmm1,%xmm1
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	movq 0x10(%rdi),%xmm5
	pshufd $0x00,%xmm5,%xmm4
	pshufd $0x55,%xmm5,%xmm5
	movq 0x18(%rdi),%xmm7
	pshufd $0x00,%xmm7,%xmm6
	pshufd $0x55,%xmm7,%xmm7
	movq 0x20(%rdi),%xmm9
	pshufd $0x00,%xmm9,%xmm8
	pshufd $0x55,%xmm9,%xmm9
	movq 0x28(%rdi),%xmm11
	pshufd $0x00,%xmm11,%xmm10
	pshufd $0x55,%xmm11,%xmm11
	movq 0x30(%rdi),%xmm13
	pshufd $0x00,%xmm13,%xmm12
	pshufd $0x55,%xmm13,%xmm13
	movq 0x38(%rdi),%xmm15
	pshufd $0x00,%xmm15,%xmm14
	pshufd $0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa %xmm0,0x00(%rsp)
	movdqa %xmm1,0x10(%rsp)
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm3,0x30(%rsp)

	movdqa CTRINC(%rip),%xmm1
	movdqa ROT8(%rip),%xmm2
	movdqa ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12

	mov $10,%ecx

.Ldoubleround4:
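	# Because every register holds the same state word for all four
	# blocks, the quarter-rounds here need no pshufd row rotation; the
	# diagonal half of the double round simply pairs different registers
	# (e.g. x0 with x5, x10 and x15).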
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd %xmm12,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd %xmm13,%xmm9
	pxor %xmm9,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd %xmm14,%xmm10
	pxor %xmm10,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd %xmm15,%xmm11
	pxor %xmm11,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $12,%xmm0
	psrld $20,%xmm4
	por %xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,0x00(%rsp)
	pxor %xmm0,%xmm15
	pshufb %xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,0x10(%rsp)
	pxor %xmm0,%xmm12
	pshufb %xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	paddd %xmm7,%xmm0
	movdqa %xmm0,0x20(%rsp)
	pxor %xmm0,%xmm13
	pshufb %xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,0x30(%rsp)
	pxor %xmm0,%xmm14
	pshufb %xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd %xmm15,%xmm10
	pxor %xmm10,%xmm5
	movdqa %xmm5,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm5
	por %xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd %xmm12,%xmm11
	pxor %xmm11,%xmm6
	movdqa %xmm6,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm6
	por %xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd %xmm13,%xmm8
	pxor %xmm8,%xmm7
	movdqa %xmm7,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm7
	por %xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd %xmm14,%xmm9
	pxor %xmm9,%xmm4
	movdqa %xmm4,%xmm0
	pslld $7,%xmm0
	psrld $25,%xmm4
	por %xmm0,%xmm4

	dec %ecx
	jnz .Ldoubleround4

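	# Feed-forward: re-broadcast the original state words from %rdi and
	# add them to the working state (x0..x3 via their stack slots); the
	# per-block counter increments are re-added further below.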
	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq 0x00(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x00(%rsp),%xmm2
	movdqa %xmm2,0x00(%rsp)
	paddd 0x10(%rsp),%xmm3
	movdqa %xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x20(%rsp),%xmm2
	movdqa %xmm2,0x20(%rsp)
	paddd 0x30(%rsp),%xmm3
	movdqa %xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq 0x10(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq 0x18(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm6
	paddd %xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq 0x20(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm8
	paddd %xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq 0x28(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm10
	paddd %xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq 0x30(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm12
	paddd %xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq 0x38(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd %xmm2,%xmm14
	paddd %xmm3,%xmm15

	# x12 += counter values 0-3
	paddd %xmm1,%xmm12

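	# Transpose each group of four registers/slots with a two-step
	# unpack: interleaving 32-bit words of registers n and n+1, then
	# 64-bit halves of n and n+2, leaves every register holding four
	# consecutive state words of a single block, ready to be XORed
	# against 16 contiguous bytes of input.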
	# interleave 32-bit words in state n, n+1
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x10(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x10(%rsp)
	movdqa 0x20(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpckldq %xmm5,%xmm4
	punpckhdq %xmm5,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm6,%xmm0
	punpckldq %xmm7,%xmm6
	punpckhdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm9,%xmm0
	movdqa %xmm0,%xmm9
	movdqa %xmm10,%xmm0
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpckldq %xmm13,%xmm12
	punpckhdq %xmm13,%xmm0
	movdqa %xmm0,%xmm13
	movdqa %xmm14,%xmm0
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x20(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x20(%rsp)
	movdqa 0x10(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	movdqa %xmm0,%xmm2
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x10(%rsp)
	movdqa %xmm0,0x30(%rsp)
	movdqa %xmm4,%xmm0
	punpcklqdq %xmm6,%xmm4
	punpckhqdq %xmm6,%xmm0
	movdqa %xmm0,%xmm6
	movdqa %xmm5,%xmm0
	punpcklqdq %xmm7,%xmm5
	punpckhqdq %xmm7,%xmm0
	movdqa %xmm0,%xmm7
	movdqa %xmm8,%xmm0
	punpcklqdq %xmm10,%xmm8
	punpckhqdq %xmm10,%xmm0
	movdqa %xmm0,%xmm10
	movdqa %xmm9,%xmm0
	punpcklqdq %xmm11,%xmm9
	punpckhqdq %xmm11,%xmm0
	movdqa %xmm0,%xmm11
	movdqa %xmm12,%xmm0
	punpcklqdq %xmm14,%xmm12
	punpckhqdq %xmm14,%xmm0
	movdqa %xmm0,%xmm14
	movdqa %xmm13,%xmm0
	punpcklqdq %xmm15,%xmm13
	punpckhqdq %xmm15,%xmm0
	movdqa %xmm0,%xmm15

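	# Within each group of four, the transpose leaves the registers in
	# block order 0, 2, 1, 3, hence the interleaved 0x00/0x80/0x40/0xc0
	# input and output offsets below.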
	# xor with corresponding input, write to output
	movdqa 0x00(%rsp),%xmm0
	movdqu 0x00(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x00(%rsi)
	movdqa 0x10(%rsp),%xmm0
	movdqu 0x80(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x80(%rsi)
	movdqa 0x20(%rsp),%xmm0
	movdqu 0x40(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0x40(%rsi)
	movdqa 0x30(%rsp),%xmm0
	movdqu 0xc0(%rdx),%xmm1
	pxor %xmm1,%xmm0
	movdqu %xmm0,0xc0(%rsi)
	movdqu 0x10(%rdx),%xmm1
	pxor %xmm1,%xmm4
	movdqu %xmm4,0x10(%rsi)
	movdqu 0x90(%rdx),%xmm1
	pxor %xmm1,%xmm5
	movdqu %xmm5,0x90(%rsi)
	movdqu 0x50(%rdx),%xmm1
	pxor %xmm1,%xmm6
	movdqu %xmm6,0x50(%rsi)
	movdqu 0xd0(%rdx),%xmm1
	pxor %xmm1,%xmm7
	movdqu %xmm7,0xd0(%rsi)
	movdqu 0x20(%rdx),%xmm1
	pxor %xmm1,%xmm8
	movdqu %xmm8,0x20(%rsi)
	movdqu 0xa0(%rdx),%xmm1
	pxor %xmm1,%xmm9
	movdqu %xmm9,0xa0(%rsi)
	movdqu 0x60(%rdx),%xmm1
	pxor %xmm1,%xmm10
	movdqu %xmm10,0x60(%rsi)
	movdqu 0xe0(%rdx),%xmm1
	pxor %xmm1,%xmm11
	movdqu %xmm11,0xe0(%rsi)
	movdqu 0x30(%rdx),%xmm1
	pxor %xmm1,%xmm12
	movdqu %xmm12,0x30(%rsi)
	movdqu 0xb0(%rdx),%xmm1
	pxor %xmm1,%xmm13
	movdqu %xmm13,0xb0(%rsi)
	movdqu 0x70(%rdx),%xmm1
	pxor %xmm1,%xmm14
	movdqu %xmm14,0x70(%rsi)
	movdqu 0xf0(%rdx),%xmm1
	pxor %xmm1,%xmm15
	movdqu %xmm15,0xf0(%rsi)

	mov %r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)