]>
Commit | Line | Data |
---|---|---|
1a59d1b8 | 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
4ea1277d JG |
2 | /* |
3 | * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64) | |
4 | * | |
5 | * Copyright (C) 2012 Johannes Goetzfried | |
6 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | |
7 | * | |
70177286 | 8 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
4ea1277d JG |
9 | */ |
10 | ||
1985fecf | 11 | #include <linux/linkage.h> |
8691ccd7 | 12 | #include <asm/frame.h> |
cba1cce0 JK |
13 | #include "glue_helper-asm-avx.S" |
14 | ||
.file "cast6-avx-x86_64-asm_64.S"

/* s-box tables; not defined in this file */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * Byte offsets into the crypto context addressed through CTX.
 *   km: read one 32-bit word at a time by get_round_keys
 *   kr: read one 16-byte chunk at a time by preload_rkr
 */
#define km	0
#define kr	(12*4*4)

/* short aliases for the s-box tables, used by lookup_32bit */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4
4ea1277d JG |
31 | |
/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/* pointer to the crypto context (km/kr offsets are relative to it) */
#define CTX	%r15

/* first group of four state registers */
#define RA1	%xmm0
#define RB1	%xmm1
#define RC1	%xmm2
#define RD1	%xmm3

/* second group of four state registers */
#define RA2	%xmm4
#define RB2	%xmm5
#define RC2	%xmm6
#define RD2	%xmm7

/* working value of the round function (see F_head/F_tail) */
#define RX	%xmm8

/* round key material, filled in by get_round_keys/preload_rkr */
#define RKM	%xmm9	/* masking key broadcast from km */
#define RKR	%xmm10	/* rotation bytes still to be consumed */
#define RKRF	%xmm11	/* RKR & R1ST: left-shift count */
#define RKRR	%xmm12	/* R32 - RKRF: right-shift count */
#define R32	%xmm13	/* loaded from .L32_mask */
#define R1ST	%xmm14	/* loaded from .Lfirst_mask */

/* scratch vector register */
#define RTMP	%xmm15

/* s-box index registers used by lookup_32bit */
#define RID1	%rdi
#define RID1d	%edi
#define RID2	%rsi
#define RID2d	%esi

/*
 * General purpose copies of the vector lanes.  lookup_32bit addresses
 * both the low byte and the second byte of these, which is why they are
 * the four registers that have an %?h alias.
 */
#define RGI1	%rdx
#define RGI1bl	%dl
#define RGI1bh	%dh
#define RGI2	%rcx
#define RGI2bl	%cl
#define RGI2bh	%ch

#define RGI3	%rax
#define RGI3bl	%al
#define RGI3bh	%ah
#define RGI4	%rbx
#define RGI4bl	%bl
#define RGI4bh	%bh

/* accumulators for the s-box lookup results */
#define RFS1	%r8
#define RFS1d	%r8d
#define RFS2	%r9
#define RFS2d	%r9d
#define RFS3	%r10
#define RFS3d	%r10d
83 | ||
84 | ||
/*
 * lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg)
 *
 * Consumes the low 32 bits of the general purpose register 'src' as four
 * s-box indexes and combines the four table words into the 32-bit
 * register 'dst ## d':
 *
 *	dst = s1[byte1];  dst = dst op1 s2[byte0];
 *	dst = dst op2 s3[byte3];  dst = dst op3 s4[byte2];
 *
 * 'src' must be a prefix that has 'bl'/'bh' aliases (RGI1..RGI4).
 * 'src' is shifted right by 16 in the middle of the sequence;
 * interleave_op(il_reg) is expanded after the second pair of indexes has
 * been extracted (shr_next to drop the remaining 16 bits, or dummy).
 * Clobbers RID1, RID2 and the flags.
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,	RID1d;		\
	movzbl		src ## bl,	RID2d;		\
	shrq $16,	src;				\
	movl		s1(, RID1, 4),	dst ## d;	\
	op1		s2(, RID2, 4),	dst ## d;	\
	movzbl		src ## bh,	RID1d;		\
	movzbl		src ## bl,	RID2d;		\
	interleave_op(il_reg);				\
	op2		s3(, RID1, 4),	dst ## d;	\
	op3		s4(, RID2, 4),	dst ## d;
96 | ||
/* interleave_op for lookup_32bit that expands to nothing */
#define dummy(d) /* do nothing */

/* interleave_op for lookup_32bit that advances 'reg' by another 16 bits */
#define shr_next(reg) \
	shrq $16,	reg;
101 | ||
/*
 * F_head(a, x, gi1, gi2, op0)
 *
 * First half of the round function:
 *	x = a op0 RKM			(op0 is a three-operand vector op)
 *	x = (x << RKRF) | (x >> RKRR)	(per 32-bit lane)
 * The low quadword of x is then copied to gi1 and the high quadword to
 * gi2 so that F_tail can index the s-boxes from general purpose
 * registers.  Clobbers RTMP.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0		a,	RKM,	x;	\
	vpslld		RKRF,	x,	RTMP;	\
	vpsrld		RKRR,	x,	x;	\
	vpor		RTMP,	x,	x;	\
	\
	vmovq			x,	gi1;	\
	vpextrq $1,		x,	gi2;
110 | ||
/*
 * F_tail(a, x, gi1, gi2, op1, op2, op3)
 *
 * Second half of the round function.  gi1/gi2 each hold two 32-bit
 * lanes (as left by F_head).  Every lane goes through lookup_32bit and
 * the four results are packed back into x:
 *
 *	RFS2 = lookup(gi1 high lane) << 32 | lookup(gi1 low lane)
 *	RFS3 = lookup(gi2 high lane) << 32 | lookup(gi2 low lane)
 *	x    = RFS3:RFS2
 *
 * The parameter 'a' is not referenced.  gi1 and gi2 are consumed
 * (shifted); RFS1..RFS3, RID1, RID2 and the flags are clobbered.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1);	\
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2);	\
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);		\
	shlq $32,	RFS2;						\
	orq		RFS1, RFS2;					\
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);		\
	shlq $32,	RFS1;						\
	orq		RFS1, RFS3;					\
	\
	vmovq		RFS2, x;					\
	vpinsrq $1,	RFS3, x, x;
124 | ||
/*
 * F_2(a1, b1, a2, b2, op0, op1, op2, op3)
 *
 * Applies the round function to both register groups:
 *	a1 ^= F(b1);  a2 ^= F(b2);
 * Both F_head expansions use RX as their vector temporary; that is fine
 * because each one has already copied its result to general purpose
 * registers (RGI1/RGI2, then RGI3/RGI4) before RX is reused.  The two
 * F_tail expansions rebuild the results in RX and RTMP respectively.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);		\
	F_head(b2, RX, RGI3, RGI4, op0);		\
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);	\
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);	\
	\
	vpxor		a1, RX,   a1;			\
	vpxor		a2, RTMP, a2;

/* the three round function variants differ only in the operators used */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
4ea1277d | 141 | |
/*
 * qop(in, out, f): out ^= F<f>(in) for both register groups.
 * 'in'/'out' are register name prefixes (RA, RB, RC, RD); the suffixes
 * 1 and 2 are pasted on here.
 */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
144 | ||
/*
 * get_round_keys(nn)
 *
 * Prepares the key registers for one round:
 *	RKM  = 32-bit word nn of km, broadcast to all lanes
 *	RKRF = RKR & R1ST
 *	RKRR = R32 - RKRF
 * and then drops the lowest byte of RKR so that the next expansion sees
 * the following rotation byte.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX),	RKM;	\
	vpand		R1ST,	RKR,	RKRF;		\
	vpsubq		RKRF,	R32,	RKRR;		\
	vpsrldq $1,	RKR,	RKR;
4ea1277d JG |
150 | |
/*
 * Q(n): quad-round n, using round keys 4n+0 .. 4n+3 in ascending order.
 *	C ^= F1(D);  B ^= F2(C);  A ^= F3(B);  D ^= F1(A);
 */
#define Q(n) \
	get_round_keys(4*n+0);	\
	qop(RD, RC, 1);		\
	\
	get_round_keys(4*n+1);	\
	qop(RC, RB, 2);		\
	\
	get_round_keys(4*n+2);	\
	qop(RB, RA, 3);		\
	\
	get_round_keys(4*n+3);	\
	qop(RA, RD, 1);
4ea1277d JG |
163 | |
/*
 * QBAR(n): the steps of Q(n) in the opposite order, using round keys
 * 4n+3 .. 4n+0 in descending order.
 *	D ^= F1(A);  A ^= F3(B);  B ^= F2(C);  C ^= F1(D);
 */
#define QBAR(n) \
	get_round_keys(4*n+3);	\
	qop(RA, RD, 1);		\
	\
	get_round_keys(4*n+2);	\
	qop(RB, RA, 3);		\
	\
	get_round_keys(4*n+1);	\
	qop(RC, RB, 2);		\
	\
	get_round_keys(4*n+0);	\
	qop(RD, RC, 1);
176 | ||
/* do_mask for preload_rkr: permute the bytes of RKR with 'mask' */
#define shuffle(mask) \
	vpshufb		mask,	RKR,	RKR;
4ea1277d | 179 | |
/*
 * preload_rkr(n, do_mask, mask)
 *
 * Loads 16-byte chunk n of kr into RKR, XORed with 16 in every byte, and
 * then expands do_mask(mask) (shuffle to reorder the bytes, or dummy).
 * get_round_keys consumes one byte of RKR per expansion, so one preload
 * serves sixteen rounds (four Q/QBAR expansions).
 *
 * .L16_mask is addressed RIP-relative so that the macro does not emit an
 * absolute 32-bit relocation.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip),	RKR;	\
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),	RKR,	RKR;	\
	do_mask(mask);
4ea1277d JG |
185 | |
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 *
 * Transposes the 4x4 matrix of 32-bit lanes held in x0..x3 in place.
 * t0..t2 are scratch registers and are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq	x1, x0, t0;	\
	vpunpckhdq	x1, x0, t2;	\
	vpunpckldq	x3, x2, t1;	\
	vpunpckhdq	x3, x2, x3;	\
	\
	vpunpcklqdq	t1, t0, x0;	\
	vpunpckhqdq	t1, t0, x1;	\
	vpunpcklqdq	x3, t2, x2;	\
	vpunpckhqdq	x3, t2, x3;
196 | ||
/*
 * inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask)
 *
 * Permutes the bytes of x0..x3 with the vpshufb control 'rmask' and then
 * transposes the four registers (see transpose_4x4).  t0..t2 are
 * clobbered.
 */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb		rmask, x0, x0;	\
	vpshufb		rmask, x1, x1;	\
	vpshufb		rmask, x2, x2;	\
	vpshufb		rmask, x3, x3;	\
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
204 | ||
/*
 * outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask)
 *
 * Inverse of inpack_blocks: transposes x0..x3 first and then permutes
 * the bytes of each register with 'rmask'.  t0..t2 are clobbered.
 */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb		rmask, x0, x0;	\
	vpshufb		rmask, x1, x1;	\
	vpshufb		rmask, x2, x2;	\
	vpshufb		rmask, x3, x3;
4ea1277d | 212 | |
/* 16-byte constants */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* passed to load_xts_8way by the XTS entry points */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/* vpshufb control: reverse the bytes inside every 32-bit lane */
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

/* vpshufb control: reverse all 16 bytes; passed to load_ctr_8way */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * vpshufb controls for preload_rkr.  Each one orders the 16 rotation
 * bytes of a kr chunk for the sequence of Q/QBAR expansions named in the
 * label.
 */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* 4-byte constants, each in its own mergeable section */

/* broadcast into RKR by preload_rkr */
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

/* loaded into R32 */
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

/* loaded into R1ST */
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0
246 | ||
.text

.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * preserved:
	 *	%r15 (CTX) and %rbx (RGI4) are saved and restored here
	 * clobbered:
	 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15, flags
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi doubles as RID1, so the context pointer moves to CTX */
	movq %rdi, CTX;

	/* constants are addressed RIP-relative: no absolute relocations */
	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	/* RKM temporarily holds the byte-swap mask, RKRF is a scratch reg */
	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* each preload_rkr supplies rotation bytes for four quad-rounds */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR(%rip));
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR(%rip));
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by get_round_keys; reload the swap mask */
	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_enc_blk8)
4ea1277d | 296 | |
.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * preserved:
	 *	%r15 (CTX) and %rbx (RGI4) are saved and restored here
	 * clobbered:
	 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm8-%xmm15, flags
	 */

	pushq %r15;
	pushq %rbx;

	/* %rdi doubles as RID1, so the context pointer moves to CTX */
	movq %rdi, CTX;

	/* constants are addressed RIP-relative: no absolute relocations */
	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	/* RKM temporarily holds the byte-swap mask, RKRF is a scratch reg */
	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* same quad-rounds as encryption, walked from round 11 down to 0 */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q(%rip));
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR(%rip));
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR(%rip));
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	/* RKM was overwritten by get_round_keys; reload the swap mask */
	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_dec_blk8)
cba1cce0 | 343 | |
ENTRY(cast6_ecb_enc_8way)
	/*
	 * ECB encryption of eight blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq	%r15

	/* dst is parked in %r11: %rsi is RID2 and gets clobbered below */
	movq	%rsi, %r11
	movq	%rdi, CTX

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	call	__cast6_enc_blk8

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	popq	%r15
	FRAME_END
	ret
ENDPROC(cast6_ecb_enc_8way)
cba1cce0 | 366 | |
ENTRY(cast6_ecb_dec_8way)
	/*
	 * ECB decryption of eight blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq	%r15

	/* dst is parked in %r11: %rsi is RID2 and gets clobbered below */
	movq	%rsi, %r11
	movq	%rdi, CTX

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	call	__cast6_dec_blk8

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	popq	%r15
	FRAME_END
	ret
ENDPROC(cast6_ecb_dec_8way)
cba1cce0 | 389 | |
ENTRY(cast6_cbc_dec_8way)
	/*
	 * CBC decryption of eight blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq	%r12
	pushq	%r15

	/*
	 * src and dst are needed again after the call, but %rdx (RGI1)
	 * and %rsi (RID2) are clobbered by it; keep copies in %r12/%r11.
	 */
	movq	%rdx, %r12
	movq	%rsi, %r11
	movq	%rdi, CTX

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	call	__cast6_dec_blk8

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	popq	%r15
	popq	%r12
	FRAME_END
	ret
ENDPROC(cast6_cbc_dec_8way)
cba1cce0 | 415 | |
ENTRY(cast6_ctr_8way)
	/*
	 * CTR mode over eight blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN
	pushq	%r12
	pushq	%r15

	/*
	 * src and dst are needed again after the call, but %rdx (RGI1)
	 * and %rsi (RID2) are clobbered by it; keep copies in %r12/%r11.
	 */
	movq	%rdx, %r12
	movq	%rsi, %r11
	movq	%rdi, CTX

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM)

	call	__cast6_enc_blk8

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	popq	%r15
	popq	%r12
	FRAME_END
	ret
ENDPROC(cast6_ctr_8way)
70177286 JK |
443 | |
ENTRY(cast6_xts_enc_8way)
	/*
	 * XTS encryption of eight blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq	%r15

	/* dst is parked in %r11: %rsi is RID2 and gets clobbered below */
	movq	%rsi, %r11
	movq	%rdi, CTX

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask)

	call	__cast6_enc_blk8

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	popq	%r15
	FRAME_END
	ret
ENDPROC(cast6_xts_enc_8way)
470 | ||
ENTRY(cast6_xts_dec_8way)
	/*
	 * XTS decryption of eight blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
	pushq	%r15

	/* dst is parked in %r11: %rsi is RID2 and gets clobbered below */
	movq	%rsi, %r11
	movq	%rdi, CTX

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask)

	call	__cast6_dec_blk8

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2)

	popq	%r15
	FRAME_END
	ret
ENDPROC(cast6_xts_dec_8way)