/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

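# ROT8 and ROT16 are pshufb masks that rotate each 32-bit word left by
# 8 and 16 bits respectively; CTRINC holds {0,1,2,3} so that four
# parallel blocks can each get their own counter value.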
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state
	# matrix in four SSE registers. It performs matrix operations on
	# four words in parallel, but requires shuffling to rearrange the
	# words after each round. 8/16-bit word rotation is done with the
	# slightly better performing SSSE3 byte shuffling, 7/12-bit word
	# rotation uses traditional shift+OR.

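	# For reference, one ChaCha20 quarter round in C (RFC 7539,
	# section 2.1); this sketch is commentary only and is not
	# assembled:
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d,  8);
	#	c += d; b ^= c; b = rol32(b,  7);
	#
	# Each instruction group below applies one such line to all four
	# columns (or diagonals) at once, one SSE register per matrix row.
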
	# x0..3 = s0..3
	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
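	# keep a copy of the initial state in xmm8..11 for the
	# feed-forward addition after the rounds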
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

	mov		$10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3
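	# the shuffles above rotate rows 1-3 so that each matrix diagonal
	# now sits in a column; the identical column code below therefore
	# computes the diagonal round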

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3
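	# the inverse shuffles restore the row layout for the next
	# iteration of the loop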

	dec		%ecx
	jnz		.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu		0x00(%rdx),%xmm4
	paddd		%xmm8,%xmm0
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu		0x10(%rdx),%xmm5
	paddd		%xmm9,%xmm1
	pxor		%xmm5,%xmm1
	movdqu		%xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu		0x20(%rdx),%xmm6
	paddd		%xmm10,%xmm2
	pxor		%xmm6,%xmm2
	movdqu		%xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu		0x30(%rdx),%xmm7
	paddd		%xmm11,%xmm3
	pxor		%xmm7,%xmm3
	movdqu		%xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by
	# loading the state matrix in SSE registers four times. As we need
	# some scratch registers, we save the first four registers on the
	# stack. The algorithm performs each operation on the
	# corresponding word of each state matrix, hence requires no word
	# shuffling. For the final XOR step we transpose the matrix by
	# interleaving 32- and then 64-bit words, which allows us to do
	# XOR in SSE registers. 8/16-bit word rotation is done with the
	# slightly better performing SSSE3 byte shuffling, 7/12-bit word
	# rotation uses traditional shift+OR.

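	# During the rounds, stack slot n (x0..3) or register xmm"n"
	# (x4..15) holds word n of the state for all four blocks, one
	# block per 32-bit lane.

	# preserve the caller's stack pointer in %r11 and carve out a
	# 64-byte-aligned scratch area for x0..3; reserving 0x80 bytes
	# leaves room for the worst-case alignment fixup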
	mov		%rsp,%r11
	sub		$0x80,%rsp
	and		$~63,%rsp

	# x0..15[0-3] = s0..3[0..3]
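	# (each movq below loads two adjacent state words; pshufd $0x00
	# broadcasts the low word and pshufd $0x55 the high word to all
	# four lanes)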
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12
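	# (lane j of x12 now carries counter s12+j, so the four blocks
	# form a consecutive run of the keystream)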

	mov		$10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

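	# second half of the double round: the same quarter round applied
	# to the diagonals (x0,x5,x10,x15 etc.); with one register per
	# word, no shuffling is needed here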
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	dec		%ecx
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

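	# transpose the 4x4 word groups in two passes (32-bit, then
	# 64-bit interleave) so that each register or stack slot ends up
	# holding four consecutive words of a single block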
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
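	# (blocks are 0x40 bytes apart in the data streams; each group of
	# four ends the transpose in block order 0,2,1,3, hence the
	# interleaved 0x00/0x80/0x40/0xc0 offsets below)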
	movdqa		0x00(%rsp),%xmm0
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	movdqa		0x10(%rsp),%xmm0
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)
	movdqa		0x20(%rsp),%xmm0
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)
	movdqa		0x30(%rsp),%xmm0
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm4
	movdqu		%xmm4,0x10(%rsi)
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm5
	movdqu		%xmm5,0x90(%rsi)
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm6
	movdqu		%xmm6,0x50(%rsi)
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm7
	movdqu		%xmm7,0xd0(%rsi)
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm8
	movdqu		%xmm8,0x20(%rsi)
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm9
	movdqu		%xmm9,0xa0(%rsi)
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm10
	movdqu		%xmm10,0x60(%rsi)
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm11
	movdqu		%xmm11,0xe0(%rsi)
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm12
	movdqu		%xmm12,0x30(%rsi)
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm13
	movdqu		%xmm13,0xb0(%rsi)
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm14
	movdqu		%xmm14,0x70(%rsi)
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm15
	movdqu		%xmm15,0xf0(%rsi)

	mov		%r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)