]> git.proxmox.com Git - mirror_ubuntu-disco-kernel.git/blob - arch/x86/crypto/morus1280-sse2-asm.S
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[mirror_ubuntu-disco-kernel.git] / arch / x86 / crypto / morus1280-sse2-asm.S
1 /*
2 * SSE2 implementation of MORUS-1280
3 *
4 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
5 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 */
11
12 #include <linux/linkage.h>
13 #include <asm/frame.h>
14
/*
 * Build a pshufd immediate from four 2-bit dword selectors; i0 lands in
 * bits 1:0 of the immediate and i3 in bits 7:6.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))

/* Selectors (2, 3, 0, 1): used with pshufd to swap the two 64-bit halves. */
#define MASK2		SHUFFLE_MASK(2, 3, 0, 1)

/*
 * Register allocation.
 *
 * Each 256-bit MORUS-1280 state word occupies a LO/HI pair of XMM registers.
 */
#define STATE0_LO	%xmm0
#define STATE0_HI	%xmm1
#define STATE1_LO	%xmm2
#define STATE1_HI	%xmm3
#define STATE2_LO	%xmm4
#define STATE2_HI	%xmm5
#define STATE3_LO	%xmm6
#define STATE3_HI	%xmm7
#define STATE4_LO	%xmm8
#define STATE4_HI	%xmm9

/* KEY_* and MSG_* deliberately name the same register pair. */
#define KEY_LO		%xmm10
#define KEY_HI		%xmm11
#define MSG_LO		%xmm10
#define MSG_HI		%xmm11

/* Temporaries. */
#define T0_LO		%xmm12
#define T0_HI		%xmm13
#define T1_LO		%xmm14
#define T1_HI		%xmm15
38
/*
 * 32-byte constant loaded into STATE4 by crypto_morus1280_sse2_init.
 * The bytes are the Fibonacci sequence reduced modulo 256.
 *
 * NOTE(review): the section and label names carry a "morus640" prefix even
 * though this file implements MORUS-1280; the labels are file-local.
 */
.section .rodata.cst16.morus640_const, "aM", @progbits, 16
.p2align 4
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02
	.byte 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59
	.byte 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55
	.byte 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42
	.byte 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..31; crypto_morus1280_sse2_dec_tail compares them against
 * the byte count to build a mask covering the valid bytes of a partial block.
 */
.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
.p2align 4
.Lmorus640_counter_0:
	.byte 0x00, 0x01, 0x02, 0x03
	.byte 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b
	.byte 0x0c, 0x0d, 0x0e, 0x0f
.Lmorus640_counter_1:
	.byte 0x10, 0x11, 0x12, 0x13
	.byte 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b
	.byte 0x1c, 0x1d, 0x1e, 0x1f
56
57 .text
58
.macro rol1 upper, lower
	/*
	 * Rotate the four 64-bit words of upper:lower by one word position
	 * (every word moves one slot up, the top word wraps to the bottom):
	 *
	 *   HI_1 | HI_0 || LO_1 | LO_0
	 *            ==>
	 *   HI_0 | LO_1 || LO_0 | HI_1
	 *
	 * T0_LO is used as scratch, so neither operand may be T0_LO.
	 */
	pshufd $MASK2, \upper, T0_LO	/* T0_LO = HI_0 | HI_1 */
	movdqa T0_LO, \upper		/* upper = HI_0 | HI_1 */
	punpcklqdq \lower, T0_LO	/* T0_LO = LO_0 | HI_1 (new lower) */
	punpckhqdq \upper, \lower	/* lower = HI_0 | LO_1 (new upper) */
	movdqa \lower, \upper
	movdqa T0_LO, \lower
.endm
74
.macro rol2 upper, lower
	/*
	 * Rotate upper:lower by two 64-bit word positions, i.e. exchange the
	 * two 128-bit halves.  T0_LO is used as scratch for the swap.
	 */
	movdqa \lower, T0_LO
	movdqa \upper, \lower
	movdqa T0_LO, \upper
.endm
80
.macro rol3 upper, lower
	/*
	 * Rotate the four 64-bit words of upper:lower by three word positions:
	 *
	 *   HI_1 | HI_0 || LO_1 | LO_0
	 *            ==>
	 *   LO_0 | HI_1 || HI_0 | LO_1
	 *
	 * T0_LO is used as scratch.  The lower operand must not be T0_LO: the
	 * punpcklqdq below still needs the original lower half after T0_LO
	 * has been overwritten.
	 */
	movdqa \lower, T0_LO		/* T0_LO = LO_1 | LO_0 */
	pshufd $MASK2, \upper, \upper	/* upper = HI_0 | HI_1 */
	punpckhqdq \upper, T0_LO	/* T0_LO = HI_0 | LO_1 (new lower) */
	punpcklqdq \lower, \upper	/* upper = LO_0 | HI_1 (new upper) */
	movdqa T0_LO, \lower
.endm
95
.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, shift, rotw
	/*
	 * One MORUS-1280 round on five LO/HI register pairs:
	 *
	 *   s0 ^= s3 ^ (s1 & s2)
	 *   s0  = each 64-bit lane of s0 rotated left by "shift" bits
	 *   s3  = s3 rotated by whole words using the "rotw" macro (rol1/2/3)
	 *
	 * s4_l/s4_h are accepted for symmetry but not used here.
	 * Clobbers T0_LO.
	 */

	/* low half: mix, then rotate each lane */
	pxor \s3_l, \s0_l
	movdqa \s2_l, T0_LO
	pand \s1_l, T0_LO
	pxor T0_LO, \s0_l

	movdqa \s0_l, T0_LO
	psrlq $(64 - \shift), \s0_l
	psllq $\shift, T0_LO
	pxor T0_LO, \s0_l

	/* high half: mix, then rotate each lane */
	pxor \s3_h, \s0_h
	movdqa \s2_h, T0_LO
	pand \s1_h, T0_LO
	pxor T0_LO, \s0_h

	movdqa \s0_h, T0_LO
	psrlq $(64 - \shift), \s0_h
	psllq $\shift, T0_LO
	pxor T0_LO, \s0_h

	\rotw \s3_h, \s3_l
.endm
120
/*
 * __morus1280_update: internal ABI
 *
 * Runs the five rounds of one state update and XORs the message block into
 * STATE1..STATE4 between consecutive rounds.
 *
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block (read only)
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0_LO
 */
__morus1280_update:
	/* round 0: updates STATE0, word-rotates STATE3 */
	morus1280_round STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, 13, rol1
	pxor MSG_HI, STATE1_HI
	pxor MSG_LO, STATE1_LO

	/* round 1: updates STATE1, word-rotates STATE4 */
	morus1280_round STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, 46, rol2
	pxor MSG_HI, STATE2_HI
	pxor MSG_LO, STATE2_LO

	/* round 2: updates STATE2, word-rotates STATE0 */
	morus1280_round STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, 38, rol3
	pxor MSG_HI, STATE3_HI
	pxor MSG_LO, STATE3_LO

	/* round 3: updates STATE3, word-rotates STATE1 */
	morus1280_round STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, 7, rol2
	pxor MSG_HI, STATE4_HI
	pxor MSG_LO, STATE4_LO

	/* round 4: updates STATE4, word-rotates STATE2 */
	morus1280_round STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, 4, rol1
	ret
ENDPROC(__morus1280_update)
177
/*
 * __morus1280_update_zero: internal ABI
 *
 * Same five rounds as __morus1280_update, but with no message block mixed
 * in (equivalent to an all-zero message).  MSG/KEY registers are not touched.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0_LO
 */
__morus1280_update_zero:
	/* round 0: updates STATE0, word-rotates STATE3 */
	morus1280_round STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, 13, rol1

	/* round 1: updates STATE1, word-rotates STATE4 */
	morus1280_round STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, 46, rol2

	/* round 2: updates STATE2, word-rotates STATE0 */
	morus1280_round STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, 38, rol3

	/* round 3: updates STATE3, word-rotates STATE1 */
	morus1280_round STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, 7, rol2

	/* round 4: updates STATE4, word-rotates STATE2 */
	morus1280_round STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, 4, rol1
	ret
ENDPROC(__morus1280_update_zero)
225
/*
 * __load_partial: internal ABI
 *
 * Loads a partial (shorter than 32 bytes) block from src into MSG, leaving
 * every byte past the requested length zero.  The length is decomposed by
 * its bits: the 1-, 2-, 4- and 8-byte pieces are gathered from the end of
 * the buffer towards the start, and a 16-byte piece, if present, is loaded
 * last from the very beginning.
 *
 * input:
 *   %rsi - src
 *   %rcx - bytes (only bits 0-4 are examined)
 * output:
 *   MSG  - message block
 * changed:
 *   %r8
 *   %r9
 *   T0_LO
 */
__load_partial:
	xor %r9d, %r9d
	pxor MSG_LO, MSG_LO
	pxor MSG_HI, MSG_HI

	/* 1-byte piece: lives after all larger pieces */
	test $0x1, %cl
	jz .Lld_partial_1

	mov %ecx, %r8d
	and $0x1E, %r8d
	mov (%rsi, %r8), %r9b

.Lld_partial_1:
	/* 2-byte piece: shift what we have up and load below it */
	test $0x2, %cl
	jz .Lld_partial_2

	mov %ecx, %r8d
	and $0x1C, %r8d
	shl $16, %r9
	mov (%rsi, %r8), %r9w

.Lld_partial_2:
	/* 4-byte piece */
	test $0x4, %cl
	jz .Lld_partial_4

	mov %ecx, %r8d
	and $0x18, %r8d
	shl $32, %r9
	mov (%rsi, %r8), %r8d	/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_4:
	/* up to 7 bytes gathered so far move into the vector register */
	movq %r9, MSG_LO

	/* 8-byte piece */
	test $0x8, %cl
	jz .Lld_partial_8

	mov %ecx, %r8d
	and $0x10, %r8d
	pslldq $8, MSG_LO
	movq (%rsi, %r8), T0_LO
	pxor T0_LO, MSG_LO

.Lld_partial_8:
	/* 16-byte piece: the tail gathered so far becomes the high half */
	test $0x10, %cl
	jz .Lld_partial_16

	movdqa MSG_LO, MSG_HI
	movdqu (%rsi), MSG_LO

.Lld_partial_16:
	ret
ENDPROC(__load_partial)
299
/*
 * __store_partial: internal ABI
 *
 * Stores the first "bytes" bytes of the block held in T0 to dst, using
 * progressively smaller stores (16, 8, 4, 2, 1).  Intended for lengths
 * below 32.
 *
 * input:
 *   %rdx - dst
 *   %ecx - bytes (the upper half of %rcx is ignored)
 *   T0   - message block to store
 * changed:
 *   %r8
 *   %r9
 *   %r10
 *   T0_LO
 */
__store_partial:
	mov %ecx, %r8d		/* remaining byte count, zero-extended */
	mov %rdx, %r9		/* write cursor */

	cmp $16, %r8d
	jb .Lst_partial_16

	movdqu T0_LO, (%r9)
	movdqa T0_HI, T0_LO	/* continue with the high half */

	sub $16, %r8d
	add $16, %r9

.Lst_partial_16:
	movq T0_LO, %r10

	cmp $8, %r8d
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0_LO	/* expose the next 8 bytes */
	movq T0_LO, %r10

	sub $8, %r8d
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8d
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8d
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8d
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $16, %r10

	sub $2, %r8d
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8d
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
367
/*
 * void crypto_morus1280_sse2_init(void *state, const void *key,
 *                                 const void *iv);
 *
 * Builds the initial state from a 32-byte key (%rsi) and a 16-byte IV (%rdx)
 * and writes the resulting 160-byte state to %rdi.
 */
ENTRY(crypto_morus1280_sse2_init)
	FRAME_BEGIN

	/* STATE0 = IV in the low 128 bits, zeros in the high 128 bits: */
	movdqu (%rdx), STATE0_LO
	pxor STATE0_HI, STATE0_HI
	/*
	 * STATE1 = key.  KEY_LO/KEY_HI keep a copy for the final XOR below;
	 * __morus1280_update_zero does not touch them.
	 */
	movdqu 0(%rsi), KEY_LO
	movdqu 16(%rsi), KEY_HI
	movdqa KEY_LO, STATE1_LO
	movdqa KEY_HI, STATE1_HI
	/* STATE2 = all ones: */
	pcmpeqd STATE2_LO, STATE2_LO
	pcmpeqd STATE2_HI, STATE2_HI
	/* STATE3 = all zeros: */
	pxor STATE3_LO, STATE3_LO
	pxor STATE3_HI, STATE3_HI
	/* STATE4 = the constant (RIP-relative, no absolute address needed): */
	movdqa .Lmorus640_const_0(%rip), STATE4_LO
	movdqa .Lmorus640_const_1(%rip), STATE4_HI

	/* update 16 times with zero: */
	.rept 16
	call __morus1280_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor KEY_LO, STATE1_LO
	pxor KEY_HI, STATE1_HI

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_init)
430
/*
 * void crypto_morus1280_sse2_ad(void *state, const void *data,
 *                               unsigned int length);
 *
 * Absorbs every complete 32-byte block of associated data into the state.
 * A trailing length % 32 remainder is not processed here, and a call with
 * length < 32 returns without touching the state.
 *
 * length is a 32-bit argument, so only %edx is defined on entry; all length
 * arithmetic is done on %edx with unsigned comparisons.
 */
ENTRY(crypto_morus1280_sse2_ad)
	FRAME_BEGIN

	cmp $32, %edx
	jb .Lad_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* pick the aligned-load loop only if data is 16-byte aligned: */
	mov %rsi, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa 0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	call __morus1280_update
	sub $32, %edx
	add $32, %rsi
	cmp $32, %edx
	jae .Lad_a_loop

	jmp .Lad_cont
.align 4
.Lad_u_loop:
	movdqu 0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	call __morus1280_update
	sub $32, %edx
	add $32, %rsi
	cmp $32, %edx
	jae .Lad_u_loop

.Lad_cont:
	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_ad)
495
/*
 * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Encrypts every complete 32-byte block from src to dst and absorbs the
 * plaintext into the state.  For each block:
 *
 *   ciphertext = plaintext ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * A trailing length % 32 remainder is not processed here, and a call with
 * length < 32 returns without touching the state.
 *
 * length is a 32-bit argument, so only %ecx is defined on entry; all length
 * arithmetic is done on %ecx with unsigned comparisons.
 */
ENTRY(crypto_morus1280_sse2_enc)
	FRAME_BEGIN

	cmp $32, %ecx
	jb .Lenc_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* aligned loop only if both src and dst are 16-byte aligned: */
	mov %rsi, %r8
	or %rdx, %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa 0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	/* T1 = rol3(STATE1); staged in T1 because rol3 clobbers T0_LO */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	/* T0 = plaintext ^ rol3(STATE1) ^ STATE0 */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor STATE0_HI, T0_HI
	/* T0 ^= STATE2 & STATE3 */
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	movdqa T0_LO, 0(%rdx)
	movdqa T0_HI, 16(%rdx)

	/* absorb the plaintext still held in MSG: */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Lenc_a_loop

	jmp .Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu 0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	/* T1 = rol3(STATE1); staged in T1 because rol3 clobbers T0_LO */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	/* T0 = plaintext ^ rol3(STATE1) ^ STATE0 */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor STATE0_HI, T0_HI
	/* T0 ^= STATE2 & STATE3 */
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	movdqu T0_LO, 0(%rdx)
	movdqu T0_HI, 16(%rdx)

	/* absorb the plaintext still held in MSG: */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc)
599
/*
 * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Encrypts one final partial block: the input is loaded zero-padded with
 * __load_partial, only "length" ciphertext bytes are written with
 * __store_partial, and the zero-padded plaintext is absorbed into the state.
 */
ENTRY(crypto_morus1280_sse2_enc_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0(%rdi), STATE0_LO
	movdqu 16(%rdi), STATE0_HI
	movdqu 32(%rdi), STATE1_LO
	movdqu 48(%rdi), STATE1_HI
	movdqu 64(%rdi), STATE2_LO
	movdqu 80(%rdi), STATE2_HI
	movdqu 96(%rdi), STATE3_LO
	movdqu 112(%rdi), STATE3_HI
	movdqu 128(%rdi), STATE4_LO
	movdqu 144(%rdi), STATE4_HI

	/* MSG = zero-padded plaintext: */
	call __load_partial

	/* T1 = rol3(STATE1); staged in T1 because rol3 clobbers T0_LO */
	movdqa STATE1_HI, T1_HI
	movdqa STATE1_LO, T1_LO
	rol3 T1_HI, T1_LO
	/* T0 = plaintext ^ STATE0 ^ rol3(STATE1) */
	movdqa MSG_HI, T0_HI
	movdqa MSG_LO, T0_LO
	pxor STATE0_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor T1_LO, T0_LO
	/* T0 ^= STATE2 & STATE3 */
	movdqa STATE3_HI, T1_HI
	movdqa STATE3_LO, T1_LO
	pand STATE2_HI, T1_HI
	pand STATE2_LO, T1_LO
	pxor T1_HI, T0_HI
	pxor T1_LO, T0_LO

	/* write out only the requested number of ciphertext bytes: */
	call __store_partial

	/* absorb the plaintext still held in MSG: */
	call __morus1280_update

	/* store the state: */
	movdqu STATE0_LO, 0(%rdi)
	movdqu STATE0_HI, 16(%rdi)
	movdqu STATE1_LO, 32(%rdi)
	movdqu STATE1_HI, 48(%rdi)
	movdqu STATE2_LO, 64(%rdi)
	movdqu STATE2_HI, 80(%rdi)
	movdqu STATE3_LO, 96(%rdi)
	movdqu STATE3_HI, 112(%rdi)
	movdqu STATE4_LO, 128(%rdi)
	movdqu STATE4_HI, 144(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc_tail)
657
/*
 * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Decrypts every complete 32-byte block from src to dst and absorbs the
 * recovered plaintext into the state.  For each block:
 *
 *   plaintext = ciphertext ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * A trailing length % 32 remainder is not processed here, and a call with
 * length < 32 returns without touching the state.
 *
 * length is a 32-bit argument, so only %ecx is defined on entry; all length
 * arithmetic is done on %ecx with unsigned comparisons.
 */
ENTRY(crypto_morus1280_sse2_dec)
	FRAME_BEGIN

	cmp $32, %ecx
	jb .Ldec_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* aligned loop only if both src and dst are 16-byte aligned: */
	mov %rsi, %r8
	or %rdx, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa 0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	/* MSG = ciphertext ^ STATE0 */
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	/* MSG ^= rol3(STATE1); staged in T1 because rol3 clobbers T0_LO */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	/* MSG ^= STATE2 & STATE3 */
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa MSG_LO, 0(%rdx)
	movdqa MSG_HI, 16(%rdx)

	/* absorb the recovered plaintext now held in MSG: */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Ldec_a_loop

	jmp .Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu 0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	/* MSG = ciphertext ^ STATE0 */
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	/* MSG ^= rol3(STATE1); staged in T1 because rol3 clobbers T0_LO */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	/* MSG ^= STATE2 & STATE3 */
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqu MSG_LO, 0(%rdx)
	movdqu MSG_HI, 16(%rdx)

	/* absorb the recovered plaintext now held in MSG: */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec)
757
/*
 * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Decrypts one final partial block.  The ciphertext is loaded zero-padded,
 * the keystream is XORed over the whole 32-byte block, "length" plaintext
 * bytes are written out, and the block is then masked back down to "length"
 * bytes before it is absorbed into the state.
 */
ENTRY(crypto_morus1280_sse2_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* decrypt message: */
	call __load_partial

	/* MSG = ciphertext ^ STATE0 */
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	/* MSG ^= rol3(STATE1); staged in T1 because rol3 clobbers T0_LO */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	/* MSG ^= STATE2 & STATE3 */
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	/* __store_partial takes its input in T0: */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI

	call __store_partial

	/*
	 * mask with byte count: the padding bytes of MSG now hold keystream
	 * rather than zeros, so clear every byte whose index is >= length.
	 * Four punpcklbw steps replicate the low byte of the count into all
	 * 16 byte lanes; pcmpgtb then yields 0xff where count > index.
	 */
	movq %rcx, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	movdqa T0_LO, T0_HI
	movdqa .Lmorus640_counter_0(%rip), T1_LO
	movdqa .Lmorus640_counter_1(%rip), T1_HI
	pcmpgtb T1_LO, T0_LO
	pcmpgtb T1_HI, T0_HI
	pand T0_LO, MSG_LO
	pand T0_HI, MSG_HI

	call __morus1280_update

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec_tail)
829
/*
 * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Finalizes the state with the bit lengths of the associated data and the
 * message, then XORs the resulting 32-byte tag block into *tag_xor:
 *
 *   tag_xor ^= STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * The state in memory is only read, never written back.
 */
ENTRY(crypto_morus1280_sse2_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* xor state[0] into state[4]: */
	pxor STATE0_LO, STATE4_LO
	pxor STATE0_HI, STATE4_HI

	/* prepare length block: MSG_LO = cryptlen : assoclen, MSG_HI = 0 */
	movq %rdx, MSG_LO
	movq %rcx, T0_LO
	pslldq $8, T0_LO
	pxor T0_LO, MSG_LO
	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
	pxor MSG_HI, MSG_HI

	/* update state 10 times with the length block: */
	.rept 10
	call __morus1280_update
	.endr

	/* xor tag: */
	movdqu 0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI

	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	/*
	 * rol3 uses T0_LO as scratch, so its operands must live elsewhere:
	 * with T0_LO as the low operand the macro reads an already
	 * overwritten value and corrupts the top 64-bit word of the result.
	 * Stage STATE1 in T1 instead.
	 */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI

	movdqu MSG_LO, 0(%rsi)
	movdqu MSG_HI, 16(%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_final)