/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
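
/*
 * Key-schedule layout (see struct cast6_ctx): the 12 quad-rounds' 4 x
 * 32-bit masking keys (Km) come first; the 5-bit rotation amounts (Kr),
 * packed one byte each, follow at byte offset 12*4*4.
 */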

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15
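
/*
 * RKM holds the current 32-bit masking key broadcast to all lanes; RKR
 * holds the 16 packed rotation-key bytes for the next four quad-rounds.
 * RKRF is the left-rotate count (R1ST = 0x1f masks it to 5 bits) and
 * RKRR its complement 32 - RKRF (R32 = 32); see get_round_keys() below.
 */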

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


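/*
 * lookup_32bit() mixes one 32-bit word through the shared cast s-boxes:
 * the word in src is consumed four bytes at a time through the bl/bh
 * subregisters (RID1/RID2 hold the zero-extended indices), giving
 *	dst = s1[b] op1 s2[b] op2 s3[b] op3 s4[b]
 * with op1..op3 selected by the round-function type.  interleave_op lets
 * the caller overlap the 16-bit shift of the next input (shr_next) with
 * the table loads.
 */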
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);                   \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;

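/*
 * F_head() forms the round intermediate I = (Km op0 b) <<< Kr for all
 * eight blocks (vpslld/vpsrld/vpor implement the variable rotate, since
 * AVX has no vector rotate instruction) and exports the two 64-bit
 * halves to GPRs; F_tail() then runs the byte lookups on each 32-bit
 * word and reassembles the results into a vector register.
 */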
#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x;                 \
	vpslld	RKRF, x, RTMP;             \
	vpsrld	RKRR, x, x;                \
	vpor	RTMP, x, x;                \
	\
	vmovq		x, gi1;            \
	vpextrq $1,	x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor	a1, RX,   a1;                 \
	vpxor	a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
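
/*
 * CAST-256 round functions (RFC 2612), vectorized two four-block groups
 * at a time; in C-like form, with I the rotated intermediate:
 *
 *	f1: I = rol32(km + x, kr); f = ((s1[Ia] ^ s2[Ib]) - s3[Ic]) + s4[Id]
 *	f2: I = rol32(km ^ x, kr); f = ((s1[Ia] - s2[Ib]) + s3[Ic]) ^ s4[Id]
 *	f3: I = rol32(km - x, kr); f = ((s1[Ia] + s2[Ib]) ^ s3[Ic]) - s4[Id]
 *
 * where Ia..Id are the bytes of I from most to least significant.  The
 * result is xor'ed into a1/a2.
 */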

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;

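/*
 * get_round_keys(nn) broadcasts masking key Km[nn] and derives the rotate
 * counts from the lowest RKR byte (RKRF = kr & 0x1f, RKRR = 32 - RKRF),
 * then byte-shifts RKR so the next round's rotation key is in front.
 * Q() below is one forward quad-round: C ^= f1(D), B ^= f2(C),
 * A ^= f3(B), D ^= f1(A); QBAR() applies the steps in reverse order for
 * the second half of the cipher and for decryption.
 */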
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb		mask, RKR, RKR;

#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);

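/*
 * Why xor with 16: for 5-bit rotation amounts, x ^ 16 == (x + 16) % 32,
 * so every round rotates I by an extra 16 bits.  lookup_32bit() walks a
 * word's bytes in the order b1, b0, b3, b2 (low half first); the extra
 * half-word rotation makes that order index s1..s4 with exactly the
 * bytes Ia..Id that the CAST-256 spec prescribes.
 */
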
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

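/*
 * transpose_4x4() turns four 128-bit blocks held row-wise in x0..x3 into
 * column vectors: afterwards x0 carries word 0 of all four blocks, x1
 * word 1, and so on, so each vector instruction works on the same
 * CAST-256 state word of four blocks at once.
 */
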
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

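/*
 * inpack_blocks() byte-swaps each 32-bit word from the blocks' big-endian
 * format to host order and transposes into the lane-sliced layout;
 * outunpack_blocks() is the exact inverse.
 */
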
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

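/*
 * The .Lrkr_* tables reorder the 16 rotation-key bytes for preload_rkr():
 * each 4-byte group is reversed wherever the corresponding quad-round
 * runs as QBAR (which fetches a group's keys back to front), and the
 * _dec tables also reverse the group order because decryption walks the
 * key schedule backwards.
 */
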
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

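/*
 * __cast6_enc_blk8() runs the full CAST-256 schedule on eight blocks:
 * six forward quad-rounds (Q(0)..Q(5)) followed by six inverse
 * quad-rounds (QBAR(6)..QBAR(11)), reloading and reshuffling 16
 * rotation-key bytes every four quad-rounds.
 */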
.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_enc_blk8)

.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_dec_blk8)

ENTRY(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_ecb_enc_8way)

ENTRY(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_ecb_dec_8way)

ENTRY(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast6_cbc_dec_8way)

ENTRY(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast6_ctr_8way)

ENTRY(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_xts_enc_8way)

ENTRY(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_xts_dec_8way)