]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - arch/x86/crypto/morus1280-avx2-asm.S
treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 500
[mirror_ubuntu-jammy-kernel.git] / arch / x86 / crypto / morus1280-avx2-asm.S
CommitLineData
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AVX2 implementation of MORUS-1280
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 */
8
9#include <linux/linkage.h>
10#include <asm/frame.h>
11
/*
 * Pack four 2-bit element selectors into one 8-bit immediate:
 * i0 occupies bits 0-1, i1 bits 2-3, i2 bits 4-5 and i3 bits 6-7.
 * The result is used as the immediate of vpermq/pshufd in this file.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))

/* Selector immediates referenced by the round macro and the helpers. */
#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)

/* The five 256-bit state words are kept in %ymm0-%ymm4. */
#define STATE0		%ymm0
#define STATE0_LOW	%xmm0	/* low 128 bits of STATE0 */
#define STATE1		%ymm1
#define STATE2		%ymm2
#define STATE3		%ymm3
#define STATE4		%ymm4
/* KEY and MSG are two names for the same register (%ymm5). */
#define KEY		%ymm5
#define MSG		%ymm5
#define MSG_LOW		%xmm5	/* low 128 bits of MSG */
/* Scratch registers. */
#define T0		%ymm6
#define T0_LOW		%xmm6	/* low 128 bits of T0 */
#define T1		%ymm7
31
/*
 * 32-byte constant that crypto_morus1280_avx2_init loads into STATE4.
 * The bytes are the Fibonacci sequence 0, 1, 1, 2, 3, 5, ... reduced
 * modulo 256.
 */
.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
.align 32
.Lmorus1280_const:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte i of this table holds the value i (0..31).
 * crypto_morus1280_avx2_dec_tail compares it against the broadcast byte
 * count to build a per-byte mask.
 */
.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
.align 32
.Lmorus1280_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
47
.text

/*
 * morus1280_round: one round of the state update
 * arguments:
 *   s0     - state word that is updated in place
 *   s1, s2 - state words that are ANDed together and XORed into s0
 *   s3     - state word that is XORed into s0 and then permuted in place
 *   s4     - not referenced by the macro body
 *   b      - left-rotate amount applied to every 64-bit lane of s0
 *   w      - vpermq immediate applied to s3
 * changed:
 *   T0
 */
.macro morus1280_round s0, s1, s2, s3, s4, b, w
	vpand \s1, \s2, T0		/* T0 = s1 & s2 */
	vpxor T0, \s0, \s0		/* s0 ^= T0 */
	vpxor \s3, \s0, \s0		/* s0 ^= s3 */
	vpsllq $\b, \s0, T0		/* T0 = s0 << b, per 64-bit lane */
	vpsrlq $(64 - \b), \s0, \s0	/* s0 = s0 >> (64 - b) */
	vpxor T0, \s0, \s0		/* both halves combined: rotate left by b */
	vpermq $\w, \s3, \s3		/* reorder the four qwords of s3 */
.endm
59
/*
 * __morus1280_update: internal ABI
 *
 * Runs the five rounds of one state update. MSG is XORed into STATE1,
 * STATE2, STATE3 and STATE4, each just before the round that updates
 * that word; STATE0 is updated by the first round without MSG.
 *
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 */
__morus1280_update:
	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
	vpxor MSG, STATE1, STATE1
	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
	vpxor MSG, STATE2, STATE2
	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
	vpxor MSG, STATE3, STATE3
	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
	vpxor MSG, STATE4, STATE4
	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
	ret
ENDPROC(__morus1280_update)
82
/*
 * __morus1280_update_zero: internal ABI
 *
 * Same five rounds as __morus1280_update, but with no message block
 * XORed in between the rounds. MSG/KEY (%ymm5) is neither read nor
 * written here.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 */
__morus1280_update_zero:
	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
	ret
ENDPROC(__morus1280_update_zero)
100
/*
 * __load_partial: internal ABI
 *
 * Loads the first (%rcx & 0x1F) bytes at %rsi into MSG and leaves the
 * remaining bytes of MSG zero. Only bits 0-4 of %rcx are examined; each
 * set bit selects one naturally sized piece (1, 2, 4, 8 or 16 bytes),
 * and the pieces are assembled from the highest offset downwards.
 *
 * The legacy-SSE forms (movq, pshufd, pinsrq, movdqu) are used on
 * MSG_LOW on purpose: they leave the upper 128 bits of MSG untouched,
 * which the VEX-encoded forms would clear.
 *
 * input:
 *   %rsi - src
 *   %rcx - bytes
 * output:
 *   MSG  - message block
 * changed:
 *   %r8
 *   %r9
 */
__load_partial:
	xor %r9d, %r9d			/* accumulator for the sub-8-byte tail */
	vpxor MSG, MSG, MSG

	mov %rcx, %r8
	and $0x1, %r8
	jz .Lld_partial_1

	/* single byte at offset (bytes & 0x1E) */
	mov %rcx, %r8
	and $0x1E, %r8
	add %rsi, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	mov %rcx, %r8
	and $0x2, %r8
	jz .Lld_partial_2

	/* 2 bytes at offset (bytes & 0x1C); the earlier byte moves up */
	mov %rcx, %r8
	and $0x1C, %r8
	add %rsi, %r8
	shl $16, %r9
	mov (%r8), %r9w

.Lld_partial_2:
	mov %rcx, %r8
	and $0x4, %r8
	jz .Lld_partial_4

	/* 4 bytes at offset (bytes & 0x18); the 32-bit load zero-extends %r8 */
	mov %rcx, %r8
	and $0x18, %r8
	add %rsi, %r8
	shl $32, %r9
	mov (%r8), %r8d
	xor %r8, %r9

.Lld_partial_4:
	/* up to 7 tail bytes collected in %r9 become the low qword of MSG */
	movq %r9, MSG_LOW

	mov %rcx, %r8
	and $0x8, %r8
	jz .Lld_partial_8

	/* 8 bytes at offset (bytes & 0x10): swap qwords, then fill the low one */
	mov %rcx, %r8
	and $0x10, %r8
	add %rsi, %r8
	pshufd $MASK2, MSG_LOW, MSG_LOW
	pinsrq $0, (%r8), MSG_LOW

.Lld_partial_8:
	mov %rcx, %r8
	and $0x10, %r8
	jz .Lld_partial_16

	/* first 16 bytes: swap the 128-bit halves, then fill the low half */
	vpermq $MASK2, MSG, MSG
	movdqu (%rsi), MSG_LOW

.Lld_partial_16:
	ret
ENDPROC(__load_partial)
172
/*
 * __store_partial: internal ABI
 *
 * Stores the first %rcx bytes of T0 to %rdx, in pieces of 16, 8, 4, 2
 * and 1 bytes. %rcx and %rdx themselves are left unchanged (working
 * copies are kept in %r8/%r9).
 *
 * input:
 *   %rdx - dst
 *   %rcx - bytes
 *   T0   - message block
 * changed:
 *   %r8
 *   %r9
 *   %r10
 *   T0   (its 128-bit halves are swapped when bytes >= 16)
 *
 * NOTE(review): the byte count is compared with signed branches (jl);
 * this matches an unsigned compare for any count below 2^63.
 */
__store_partial:
	mov %rcx, %r8			/* r8 = bytes still to store */
	mov %rdx, %r9			/* r9 = current destination */

	cmp $16, %r8
	jl .Lst_partial_16

	/* store the low 16 bytes, then bring the upper half down */
	movdqu T0_LOW, (%r9)
	vpermq $MASK2, T0, T0

	sub $16, %r8
	add $16, %r9

.Lst_partial_16:
	movq T0_LOW, %r10		/* r10 = next 8 bytes to store */

	cmp $8, %r8
	jl .Lst_partial_8

	mov %r10, (%r9)
	pextrq $1, T0_LOW, %r10		/* continue with the following qword */

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jl .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jl .Lst_partial_2

	mov %r10w, (%r9)
	shr $16, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jl .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
239
/*
 * void crypto_morus1280_avx2_init(void *state, const void *key,
 *                                 const void *iv);
 *
 * Builds the initial state from the key and IV and writes the five
 * 32-byte state words to 'state'.
 *
 *   %rdi - state (5 * 32 bytes, written)
 *   %rsi - key   (32 bytes are read)
 *   %rdx - iv    (16 bytes are read)
 */
ENTRY(crypto_morus1280_avx2_init)
	FRAME_BEGIN

	/*
	 * load IV: 16 bytes into the low half of STATE0; the legacy-SSE
	 * movdqu leaves the just-zeroed upper half untouched.
	 */
	vpxor STATE0, STATE0, STATE0
	movdqu (%rdx), STATE0_LOW
	/* load key: */
	vmovdqu (%rsi), KEY
	vmovdqa KEY, STATE1
	/* load all ones: */
	vpcmpeqd STATE2, STATE2, STATE2
	/* load all zeros: */
	vpxor STATE3, STATE3, STATE3
	/* load the constant (RIP-relative, so no absolute relocation): */
	vmovdqa .Lmorus1280_const(%rip), STATE4

	/*
	 * update 16 times with zero; __morus1280_update_zero does not
	 * touch %ymm5, so KEY is still valid afterwards.
	 */
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero
	call __morus1280_update_zero

	/* xor-in the key again after updates: */
	vpxor KEY, STATE1, STATE1

	/* store the state: */
	vmovdqu STATE0, (0 * 32)(%rdi)
	vmovdqu STATE1, (1 * 32)(%rdi)
	vmovdqu STATE2, (2 * 32)(%rdi)
	vmovdqu STATE3, (3 * 32)(%rdi)
	vmovdqu STATE4, (4 * 32)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_init)
291
/*
 * void crypto_morus1280_avx2_ad(void *state, const void *data,
 *                               unsigned int length);
 *
 * Absorbs every complete 32-byte block of 'data' into the state.
 * Returns without touching the state when length < 32; a trailing
 * partial block is not processed here.
 *
 *   %rdi - state (5 * 32 bytes, read and written)
 *   %rsi - data
 *   %edx - length in bytes
 */
ENTRY(crypto_morus1280_avx2_ad)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rdx are not
	 * guaranteed to be zero; clear them before the 64-bit arithmetic.
	 */
	mov %edx, %edx

	cmp $32, %rdx
	jb .Lad_out

	/* load the state: */
	vmovdqu (0 * 32)(%rdi), STATE0
	vmovdqu (1 * 32)(%rdi), STATE1
	vmovdqu (2 * 32)(%rdi), STATE2
	vmovdqu (3 * 32)(%rdi), STATE3
	vmovdqu (4 * 32)(%rdi), STATE4

	/* take the unaligned loop unless data is 32-byte aligned: */
	mov %rsi, %r8
	and $0x1F, %r8
	jnz .Lad_u_loop

.align 4
.Lad_a_loop:
	vmovdqa (%rsi), MSG
	call __morus1280_update
	sub $32, %rdx
	add $32, %rsi
	cmp $32, %rdx
	jae .Lad_a_loop			/* unsigned, like the entry check */

	jmp .Lad_cont
.align 4
.Lad_u_loop:
	vmovdqu (%rsi), MSG
	call __morus1280_update
	sub $32, %rdx
	add $32, %rsi
	cmp $32, %rdx
	jae .Lad_u_loop			/* unsigned, like the entry check */

.Lad_cont:
	/* store the state: */
	vmovdqu STATE0, (0 * 32)(%rdi)
	vmovdqu STATE1, (1 * 32)(%rdi)
	vmovdqu STATE2, (2 * 32)(%rdi)
	vmovdqu STATE3, (3 * 32)(%rdi)
	vmovdqu STATE4, (4 * 32)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_ad)
344
/*
 * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Encrypts every complete 32-byte block from 'src' to 'dst' and updates
 * the state with each input block. Returns without touching the state
 * when length < 32; a trailing partial block is not processed here.
 *
 *   %rdi - state (5 * 32 bytes, read and written)
 *   %rsi - src
 *   %rdx - dst
 *   %ecx - length in bytes
 */
ENTRY(crypto_morus1280_avx2_enc)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero; clear them before the 64-bit arithmetic.
	 */
	mov %ecx, %ecx

	cmp $32, %rcx
	jb .Lenc_out

	/* load the state: */
	vmovdqu (0 * 32)(%rdi), STATE0
	vmovdqu (1 * 32)(%rdi), STATE1
	vmovdqu (2 * 32)(%rdi), STATE2
	vmovdqu (3 * 32)(%rdi), STATE3
	vmovdqu (4 * 32)(%rdi), STATE4

	/* take the unaligned loop unless src and dst are both 32-byte aligned: */
	mov %rsi, %r8
	or %rdx, %r8
	and $0x1F, %r8
	jnz .Lenc_u_loop

.align 4
.Lenc_a_loop:
	vmovdqa (%rsi), MSG
	/* T0 = MSG ^ STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vmovdqa MSG, T0
	vpxor STATE0, T0, T0
	vpermq $MASK3, STATE1, T1
	vpxor T1, T0, T0
	vpand STATE2, STATE3, T1
	vpxor T1, T0, T0
	vmovdqa T0, (%rdx)

	/* MSG still holds the input block: */
	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Lenc_a_loop		/* unsigned, like the entry check */

	jmp .Lenc_cont
.align 4
.Lenc_u_loop:
	vmovdqu (%rsi), MSG
	/* T0 = MSG ^ STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vmovdqa MSG, T0
	vpxor STATE0, T0, T0
	vpermq $MASK3, STATE1, T1
	vpxor T1, T0, T0
	vpand STATE2, STATE3, T1
	vpxor T1, T0, T0
	vmovdqu T0, (%rdx)

	/* MSG still holds the input block: */
	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Lenc_u_loop		/* unsigned, like the entry check */

.Lenc_cont:
	/* store the state: */
	vmovdqu STATE0, (0 * 32)(%rdi)
	vmovdqu STATE1, (1 * 32)(%rdi)
	vmovdqu STATE2, (2 * 32)(%rdi)
	vmovdqu STATE3, (3 * 32)(%rdi)
	vmovdqu STATE4, (4 * 32)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_enc)
416
/*
 * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Encrypts one partial block: __load_partial reads the input
 * zero-padded into MSG, 'length' bytes of the result are written to
 * 'dst', and the state is updated with the zero-padded input block.
 *
 *   %rdi - state (5 * 32 bytes, read and written)
 *   %rsi - src
 *   %rdx - dst
 *   %ecx - length in bytes (__load_partial uses only its low 5 bits)
 */
ENTRY(crypto_morus1280_avx2_enc_tail)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero; __store_partial compares the full 64-bit
	 * count, so clear them first.
	 */
	mov %ecx, %ecx

	/* load the state: */
	vmovdqu (0 * 32)(%rdi), STATE0
	vmovdqu (1 * 32)(%rdi), STATE1
	vmovdqu (2 * 32)(%rdi), STATE2
	vmovdqu (3 * 32)(%rdi), STATE3
	vmovdqu (4 * 32)(%rdi), STATE4

	/* encrypt message: */
	call __load_partial

	/* T0 = MSG ^ STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vmovdqa MSG, T0
	vpxor STATE0, T0, T0
	vpermq $MASK3, STATE1, T1
	vpxor T1, T0, T0
	vpand STATE2, STATE3, T1
	vpxor T1, T0, T0

	call __store_partial

	/* MSG still holds the zero-padded input block: */
	call __morus1280_update

	/* store the state: */
	vmovdqu STATE0, (0 * 32)(%rdi)
	vmovdqu STATE1, (1 * 32)(%rdi)
	vmovdqu STATE2, (2 * 32)(%rdi)
	vmovdqu STATE3, (3 * 32)(%rdi)
	vmovdqu STATE4, (4 * 32)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_enc_tail)
455
/*
 * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Decrypts every complete 32-byte block from 'src' to 'dst' and updates
 * the state with each output block. Returns without touching the state
 * when length < 32; a trailing partial block is not processed here.
 *
 *   %rdi - state (5 * 32 bytes, read and written)
 *   %rsi - src
 *   %rdx - dst
 *   %ecx - length in bytes
 */
ENTRY(crypto_morus1280_avx2_dec)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero; clear them before the 64-bit arithmetic.
	 */
	mov %ecx, %ecx

	cmp $32, %rcx
	jb .Ldec_out

	/* load the state: */
	vmovdqu (0 * 32)(%rdi), STATE0
	vmovdqu (1 * 32)(%rdi), STATE1
	vmovdqu (2 * 32)(%rdi), STATE2
	vmovdqu (3 * 32)(%rdi), STATE3
	vmovdqu (4 * 32)(%rdi), STATE4

	/* take the unaligned loop unless src and dst are both 32-byte aligned: */
	mov %rsi, %r8
	or %rdx, %r8
	and $0x1F, %r8
	jnz .Ldec_u_loop

.align 4
.Ldec_a_loop:
	vmovdqa (%rsi), MSG
	/* MSG ^= STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vpxor STATE0, MSG, MSG
	vpermq $MASK3, STATE1, T0
	vpxor T0, MSG, MSG
	vpand STATE2, STATE3, T0
	vpxor T0, MSG, MSG
	vmovdqa MSG, (%rdx)

	/* MSG now holds the output block: */
	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Ldec_a_loop		/* unsigned, like the entry check */

	jmp .Ldec_cont
.align 4
.Ldec_u_loop:
	vmovdqu (%rsi), MSG
	/* MSG ^= STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vpxor STATE0, MSG, MSG
	vpermq $MASK3, STATE1, T0
	vpxor T0, MSG, MSG
	vpand STATE2, STATE3, T0
	vpxor T0, MSG, MSG
	vmovdqu MSG, (%rdx)

	/* MSG now holds the output block: */
	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Ldec_u_loop		/* unsigned, like the entry check */

.Ldec_cont:
	/* store the state: */
	vmovdqu STATE0, (0 * 32)(%rdi)
	vmovdqu STATE1, (1 * 32)(%rdi)
	vmovdqu STATE2, (2 * 32)(%rdi)
	vmovdqu STATE3, (3 * 32)(%rdi)
	vmovdqu STATE4, (4 * 32)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_dec)
525
/*
 * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Decrypts one partial block: __load_partial reads the input
 * zero-padded into MSG, 'length' bytes of the result are written to
 * 'dst', the bytes at and beyond 'length' are cleared in MSG, and the
 * state is updated with that masked block.
 *
 *   %rdi - state (5 * 32 bytes, read and written)
 *   %rsi - src
 *   %rdx - dst
 *   %ecx - length in bytes (__load_partial uses only its low 5 bits)
 */
ENTRY(crypto_morus1280_avx2_dec_tail)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero; __store_partial compares the full 64-bit
	 * count, so clear them first.
	 */
	mov %ecx, %ecx

	/* load the state: */
	vmovdqu (0 * 32)(%rdi), STATE0
	vmovdqu (1 * 32)(%rdi), STATE1
	vmovdqu (2 * 32)(%rdi), STATE2
	vmovdqu (3 * 32)(%rdi), STATE3
	vmovdqu (4 * 32)(%rdi), STATE4

	/* decrypt message: */
	call __load_partial

	/* MSG ^= STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vpxor STATE0, MSG, MSG
	vpermq $MASK3, STATE1, T0
	vpxor T0, MSG, MSG
	vpand STATE2, STATE3, T0
	vpxor T0, MSG, MSG
	vmovdqa MSG, T0			/* __store_partial takes its data in T0 */

	call __store_partial

	/*
	 * mask with byte count: broadcast the low byte of the count, then
	 * byte i of T0 becomes 0xff where count > i and 0x00 elsewhere.
	 */
	movq %rcx, T0_LOW
	vpbroadcastb T0_LOW, T0
	vmovdqa .Lmorus1280_counter(%rip), T1
	vpcmpgtb T1, T0, T0
	vpand T0, MSG, MSG

	call __morus1280_update

	/* store the state: */
	vmovdqu STATE0, (0 * 32)(%rdi)
	vmovdqu STATE1, (1 * 32)(%rdi)
	vmovdqu STATE2, (2 * 32)(%rdi)
	vmovdqu STATE3, (3 * 32)(%rdi)
	vmovdqu STATE4, (4 * 32)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_dec_tail)
571
/*
 * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Runs ten state updates with a block built from the two lengths and
 * XORs the resulting 32 bytes into the buffer at 'tag_xor' in place.
 * The state at 'state' is read but not written back.
 *
 *   %rdi - state (5 * 32 bytes, read only)
 *   %rsi - tag_xor (32 bytes, read and written)
 *   %rdx - assoclen in bytes
 *   %rcx - cryptlen in bytes
 */
ENTRY(crypto_morus1280_avx2_final)
	FRAME_BEGIN

	/* load the state: */
	vmovdqu (0 * 32)(%rdi), STATE0
	vmovdqu (1 * 32)(%rdi), STATE1
	vmovdqu (2 * 32)(%rdi), STATE2
	vmovdqu (3 * 32)(%rdi), STATE3
	vmovdqu (4 * 32)(%rdi), STATE4

	/* xor state[0] into state[4]: */
	vpxor STATE0, STATE4, STATE4

	/* prepare length block: qword 0 = assoclen, qword 1 = cryptlen, rest 0 */
	vpxor MSG, MSG, MSG
	vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
	vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
	vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */

	/* update state: ten times with the same length block */
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update
	call __morus1280_update

	/* xor tag: */
	vmovdqu (%rsi), MSG

	/* MSG ^= STATE0 ^ permute(STATE1) ^ (STATE2 & STATE3) */
	vpxor STATE0, MSG, MSG
	vpermq $MASK3, STATE1, T0
	vpxor T0, MSG, MSG
	vpand STATE2, STATE3, T0
	vpxor T0, MSG, MSG
	vmovdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_avx2_final)