]>
Commit | Line | Data |
---|---|---|
d9b1d2e7 JK |
1 | /* |
2 | * x86_64/AVX/AES-NI assembler implementation of Camellia | |
3 | * | |
b5c5b072 | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
d9b1d2e7 JK |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | */ | |
12 | ||
13 | /* | |
14 | * Version licensed under 2-clause BSD License is available at: | |
15 | * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz | |
16 | */ | |
17 | ||
59990684 | 18 | #include <linux/linkage.h> |
8691ccd7 | 19 | #include <asm/frame.h> |
9697fa39 | 20 | #include <asm/nospec-branch.h> |
59990684 | 21 | |
d9b1d2e7 JK |
22 | #define CAMELLIA_TABLE_BYTE_LEN 272 |
23 | ||
24 | /* struct camellia_ctx: field offsets, used as displacements from CTX */ | |
25 | #define key_table 0 | |
26 | #define key_length CAMELLIA_TABLE_BYTE_LEN | |
27 | ||
28 | /* register macros: CTX is the ctx pointer argument (%rdi) */ | |
29 | #define CTX %rdi | |
30 | ||
31 | /********************************************************************** | |
32 | 16-way camellia | |
33 | **********************************************************************/ | |
34 | #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ | |
35 | vpand x, mask4bit, tmp0; /* tmp0 = x & mask4bit */ \ | |
36 | vpandn x, mask4bit, x; /* x = x & ~mask4bit */ \ | |
37 | vpsrld $4, x, x; /* shift each dword of x right by 4 bits */ \ | |
38 | \ | |
39 | vpshufb tmp0, lo_t, tmp0; /* tmp0 = bytes of lo_t selected by tmp0 */ \ | |
40 | vpshufb x, hi_t, x; /* x = bytes of hi_t selected by x */ \ | |
41 | vpxor tmp0, x, x; /* x = XOR of the two table lookups; tmp0 is clobbered */ | |
42 | ||
43 | /* | |
44 | * IN: | |
45 | * x0..x7: byte-sliced AB state | |
46 | * mem_cd: register pointer storing CD state | |
47 | * key: index for key material | |
48 | * OUT: | |
49 | * x0..x7: new byte-sliced CD state | |
50 | */ | |
51 | #define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ | |
52 | t7, mem_cd, key) \ | |
53 | /* \ | |
54 | * S-function with AES subbytes \ | |
55 | */ \ | |
56 | vmovdqa .Linv_shift_row, t4; \ | |
57 | vbroadcastss .L0f0f0f0f, t7; \ | |
58 | vmovdqa .Lpre_tf_lo_s1, t0; \ | |
59 | vmovdqa .Lpre_tf_hi_s1, t1; \ | |
60 | \ | |
61 | /* AES inverse shift rows */ \ | |
62 | vpshufb t4, x0, x0; \ | |
63 | vpshufb t4, x7, x7; \ | |
64 | vpshufb t4, x1, x1; \ | |
65 | vpshufb t4, x4, x4; \ | |
66 | vpshufb t4, x2, x2; \ | |
67 | vpshufb t4, x5, x5; \ | |
68 | vpshufb t4, x3, x3; \ | |
69 | vpshufb t4, x6, x6; \ | |
70 | \ | |
71 | /* prefilter sboxes 1, 2 and 3 */ \ | |
72 | vmovdqa .Lpre_tf_lo_s4, t2; \ | |
73 | vmovdqa .Lpre_tf_hi_s4, t3; \ | |
74 | filter_8bit(x0, t0, t1, t7, t6); \ | |
75 | filter_8bit(x7, t0, t1, t7, t6); \ | |
76 | filter_8bit(x1, t0, t1, t7, t6); \ | |
77 | filter_8bit(x4, t0, t1, t7, t6); \ | |
78 | filter_8bit(x2, t0, t1, t7, t6); \ | |
79 | filter_8bit(x5, t0, t1, t7, t6); \ | |
80 | \ | |
81 | /* prefilter sbox 4 */ \ | |
82 | vpxor t4, t4, t4; /* t4 = 0: all-zero round key for vaesenclast below */ \ | |
83 | filter_8bit(x3, t2, t3, t7, t6); \ | |
84 | filter_8bit(x6, t2, t3, t7, t6); \ | |
85 | \ | |
86 | /* AES subbytes + AES shift rows */ \ | |
87 | vmovdqa .Lpost_tf_lo_s1, t0; \ | |
88 | vmovdqa .Lpost_tf_hi_s1, t1; \ | |
89 | vaesenclast t4, x0, x0; \ | |
90 | vaesenclast t4, x7, x7; \ | |
91 | vaesenclast t4, x1, x1; \ | |
92 | vaesenclast t4, x4, x4; \ | |
93 | vaesenclast t4, x2, x2; \ | |
94 | vaesenclast t4, x5, x5; \ | |
95 | vaesenclast t4, x3, x3; \ | |
96 | vaesenclast t4, x6, x6; \ | |
97 | \ | |
98 | /* postfilter sboxes 1 and 4 */ \ | |
99 | vmovdqa .Lpost_tf_lo_s3, t2; \ | |
100 | vmovdqa .Lpost_tf_hi_s3, t3; \ | |
101 | filter_8bit(x0, t0, t1, t7, t6); \ | |
102 | filter_8bit(x7, t0, t1, t7, t6); \ | |
103 | filter_8bit(x3, t0, t1, t7, t6); \ | |
104 | filter_8bit(x6, t0, t1, t7, t6); \ | |
105 | \ | |
106 | /* postfilter sbox 3 */ \ | |
107 | vmovdqa .Lpost_tf_lo_s2, t4; \ | |
108 | vmovdqa .Lpost_tf_hi_s2, t5; \ | |
109 | filter_8bit(x2, t2, t3, t7, t6); \ | |
110 | filter_8bit(x5, t2, t3, t7, t6); \ | |
111 | \ | |
112 | vpxor t6, t6, t6; /* t6 = 0: vpshufb control that broadcasts byte 0 */ \ | |
113 | vmovq key, t0; /* load 64 bits of key material */ \ | |
114 | \ | |
115 | /* postfilter sbox 2 */ \ | |
116 | filter_8bit(x1, t4, t5, t7, t2); \ | |
117 | filter_8bit(x4, t4, t5, t7, t2); \ | |
118 | \ | |
119 | vpsrldq $5, t0, t5; \ | |
120 | vpsrldq $1, t0, t1; \ | |
121 | vpsrldq $2, t0, t2; \ | |
122 | vpsrldq $3, t0, t3; \ | |
123 | vpsrldq $4, t0, t4; \ | |
124 | vpshufb t6, t0, t0; \ | |
125 | vpshufb t6, t1, t1; \ | |
126 | vpshufb t6, t2, t2; \ | |
127 | vpshufb t6, t3, t3; \ | |
128 | vpshufb t6, t4, t4; \ | |
129 | vpsrldq $2, t5, t7; \ | |
130 | vpshufb t6, t7, t7; \ | |
131 | \ | |
132 | /* \ | |
133 | * P-function \ | |
134 | */ \ | |
135 | vpxor x5, x0, x0; \ | |
136 | vpxor x6, x1, x1; \ | |
137 | vpxor x7, x2, x2; \ | |
138 | vpxor x4, x3, x3; \ | |
139 | \ | |
140 | vpxor x2, x4, x4; \ | |
141 | vpxor x3, x5, x5; \ | |
142 | vpxor x0, x6, x6; \ | |
143 | vpxor x1, x7, x7; \ | |
144 | \ | |
145 | vpxor x7, x0, x0; \ | |
146 | vpxor x4, x1, x1; \ | |
147 | vpxor x5, x2, x2; \ | |
148 | vpxor x6, x3, x3; \ | |
149 | \ | |
150 | vpxor x3, x4, x4; \ | |
151 | vpxor x0, x5, x5; \ | |
152 | vpxor x1, x6, x6; \ | |
153 | vpxor x2, x7, x7; /* note: high and low parts swapped */ \ | |
154 | \ | |
155 | /* \ | |
156 | * Add key material and result to CD (x becomes new CD) \ | |
157 | */ \ | |
158 | \ | |
159 | vpxor t3, x4, x4; \ | |
160 | vpxor 0 * 16(mem_cd), x4, x4; \ | |
161 | \ | |
162 | vpxor t2, x5, x5; \ | |
163 | vpxor 1 * 16(mem_cd), x5, x5; \ | |
164 | \ | |
165 | vpsrldq $1, t5, t3; \ | |
166 | vpshufb t6, t5, t5; \ | |
167 | vpshufb t6, t3, t6; \ | |
168 | \ | |
169 | vpxor t1, x6, x6; \ | |
170 | vpxor 2 * 16(mem_cd), x6, x6; \ | |
171 | \ | |
172 | vpxor t0, x7, x7; \ | |
173 | vpxor 3 * 16(mem_cd), x7, x7; \ | |
174 | \ | |
175 | vpxor t7, x0, x0; \ | |
176 | vpxor 4 * 16(mem_cd), x0, x0; \ | |
177 | \ | |
178 | vpxor t6, x1, x1; \ | |
179 | vpxor 5 * 16(mem_cd), x1, x1; \ | |
180 | \ | |
181 | vpxor t5, x2, x2; \ | |
182 | vpxor 6 * 16(mem_cd), x2, x2; \ | |
183 | \ | |
184 | vpxor t4, x3, x3; \ | |
185 | vpxor 7 * 16(mem_cd), x3, x3; | |
186 | ||
187 | /* | |
188 | * Size optimization... with inlined roundsm16, binary would be over 5 times | |
189 | * larger and would only be 0.5% faster (on sandy-bridge). | |
190 | */ | |
191 | .align 8 | |
192 | roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: /* in: state in %xmm0..%xmm7, %rcx = memory state to XOR in, %r9 = key pointer; %xmm8..%xmm15 are temporaries */ | |
193 | roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
194 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, | |
195 | %rcx, (%r9)); | |
196 | ret; | |
59990684 | 197 | ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) |
d9b1d2e7 JK |
198 | |
199 | .align 8 | |
200 | roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: /* same round with the register halves swapped: %rax = memory state to XOR in, %r9 = key pointer */ | |
201 | roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, | |
202 | %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, | |
203 | %rax, (%r9)); | |
204 | ret; | |
59990684 | 205 | ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) |
d9b1d2e7 JK |
206 | |
207 | /* | |
208 | * IN/OUT (clobbers %r9; the called round helpers hard-code %xmm0..%xmm15, %rcx and %rax): | |
209 | * x0..x7: byte-sliced AB state preloaded | |
210 | * mem_ab: byte-sliced AB state in memory | |
211 | * mem_cd: byte-sliced CD state in memory | |
212 | */ | |
213 | #define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
214 | y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ | |
215 | leaq (key_table + (i) * 8)(CTX), %r9; /* %r9 = address of 8-byte key_table entry i */ \ | |
216 | call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ | |
217 | \ | |
218 | vmovdqu x4, 0 * 16(mem_cd); \ | |
219 | vmovdqu x5, 1 * 16(mem_cd); \ | |
220 | vmovdqu x6, 2 * 16(mem_cd); \ | |
221 | vmovdqu x7, 3 * 16(mem_cd); \ | |
222 | vmovdqu x0, 4 * 16(mem_cd); \ | |
223 | vmovdqu x1, 5 * 16(mem_cd); \ | |
224 | vmovdqu x2, 6 * 16(mem_cd); \ | |
225 | vmovdqu x3, 7 * 16(mem_cd); \ | |
226 | \ | |
227 | leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; /* %r9 = address of key_table entry i + dir */ \ | |
228 | call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ | |
229 | \ | |
230 | store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); | |
231 | ||
232 | #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* no-op store_ab hook: leaves x0..x7 in registers only */ | |
233 | ||
234 | #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ | |
235 | /* Store new AB state: x0..x7 go to eight consecutive 16-byte slots at mem_ab */ \ | |
236 | vmovdqu x0, 0 * 16(mem_ab); \ | |
237 | vmovdqu x1, 1 * 16(mem_ab); \ | |
238 | vmovdqu x2, 2 * 16(mem_ab); \ | |
239 | vmovdqu x3, 3 * 16(mem_ab); \ | |
240 | vmovdqu x4, 4 * 16(mem_ab); \ | |
241 | vmovdqu x5, 5 * 16(mem_ab); \ | |
242 | vmovdqu x6, 6 * 16(mem_ab); \ | |
243 | vmovdqu x7, 7 * 16(mem_ab); | |
244 | ||
245 | #define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
246 | y6, y7, mem_ab, mem_cd, i) /* six rounds using key_table entries (i)+2 .. (i)+7 in ascending order */ \ | |
247 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
248 | y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ | |
249 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
250 | y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ | |
251 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
252 | y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); /* last pair: AB is not written back to mem_ab */ | |
253 | ||
254 | #define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
255 | y6, y7, mem_ab, mem_cd, i) /* six rounds using key_table entries (i)+7 .. (i)+2 in descending order */ \ | |
256 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
257 | y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ | |
258 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
259 | y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ | |
260 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
261 | y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* last pair: AB is not written back to mem_ab */ | |
262 | ||
263 | /* | |
264 | * IN: | |
265 | * v0..3: byte-sliced 32-bit integers | |
266 | * OUT: | |
267 | * v0..3: (IN <<< 1) | |
268 | */ | |
269 | #define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ | |
270 | vpcmpgtb v0, zero, t0; /* t0 = 0xff in each byte where zero > v0 (signed), i.e. top bit of v0 set when zero is all-zero */ \ | |
271 | vpaddb v0, v0, v0; /* shift every byte of v0 left by one */ \ | |
272 | vpabsb t0, t0; /* 0xff -> 0x01: the bit carried out of v0 */ \ | |
273 | \ | |
274 | vpcmpgtb v1, zero, t1; \ | |
275 | vpaddb v1, v1, v1; \ | |
276 | vpabsb t1, t1; \ | |
277 | \ | |
278 | vpcmpgtb v2, zero, t2; \ | |
279 | vpaddb v2, v2, v2; \ | |
280 | vpabsb t2, t2; \ | |
281 | \ | |
282 | vpor t0, v1, v1; /* carry out of v0 becomes the low bit of v1 */ \ | |
283 | \ | |
284 | vpcmpgtb v3, zero, t0; \ | |
285 | vpaddb v3, v3, v3; \ | |
286 | vpabsb t0, t0; \ | |
287 | \ | |
288 | vpor t1, v2, v2; \ | |
289 | vpor t2, v3, v3; \ | |
290 | vpor t0, v0, v0; /* carry out of v3 wraps around into v0 */ | |
291 | ||
292 | /* | |
293 | * IN: | |
294 | * r: byte-sliced state half in memory, updated in place (callers in this file pass the CD buffer) | |
295 | * l: memory that receives the updated l0..l7 (callers in this file pass the AB buffer) | |
296 | * OUT: | |
297 | * l0..l7: updated in registers and stored to l; t0..t3 and tt0..tt3 are clobbered | |
298 | */ | |
299 | #define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ | |
300 | tt1, tt2, tt3, kll, klr, krl, krr) \ | |
301 | /* \ | |
302 | * t0 = kll; \ | |
303 | * t0 &= ll; \ | |
304 | * lr ^= rol32(t0, 1); \ | |
305 | */ \ | |
306 | vpxor tt0, tt0, tt0; \ | |
307 | vmovd kll, t0; \ | |
308 | vpshufb tt0, t0, t3; \ | |
309 | vpsrldq $1, t0, t0; \ | |
310 | vpshufb tt0, t0, t2; \ | |
311 | vpsrldq $1, t0, t0; \ | |
312 | vpshufb tt0, t0, t1; \ | |
313 | vpsrldq $1, t0, t0; \ | |
314 | vpshufb tt0, t0, t0; \ | |
315 | \ | |
316 | vpand l0, t0, t0; \ | |
317 | vpand l1, t1, t1; \ | |
318 | vpand l2, t2, t2; \ | |
319 | vpand l3, t3, t3; \ | |
320 | \ | |
321 | rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | |
322 | \ | |
323 | vpxor l4, t0, l4; \ | |
324 | vmovdqu l4, 4 * 16(l); \ | |
325 | vpxor l5, t1, l5; \ | |
326 | vmovdqu l5, 5 * 16(l); \ | |
327 | vpxor l6, t2, l6; \ | |
328 | vmovdqu l6, 6 * 16(l); \ | |
329 | vpxor l7, t3, l7; \ | |
330 | vmovdqu l7, 7 * 16(l); \ | |
331 | \ | |
332 | /* \ | |
333 | * t2 = krr; \ | |
334 | * t2 |= rr; \ | |
335 | * rl ^= t2; \ | |
336 | */ \ | |
337 | \ | |
338 | vmovd krr, t0; \ | |
339 | vpshufb tt0, t0, t3; \ | |
340 | vpsrldq $1, t0, t0; \ | |
341 | vpshufb tt0, t0, t2; \ | |
342 | vpsrldq $1, t0, t0; \ | |
343 | vpshufb tt0, t0, t1; \ | |
344 | vpsrldq $1, t0, t0; \ | |
345 | vpshufb tt0, t0, t0; \ | |
346 | \ | |
347 | vpor 4 * 16(r), t0, t0; \ | |
348 | vpor 5 * 16(r), t1, t1; \ | |
349 | vpor 6 * 16(r), t2, t2; \ | |
350 | vpor 7 * 16(r), t3, t3; \ | |
351 | \ | |
352 | vpxor 0 * 16(r), t0, t0; \ | |
353 | vpxor 1 * 16(r), t1, t1; \ | |
354 | vpxor 2 * 16(r), t2, t2; \ | |
355 | vpxor 3 * 16(r), t3, t3; \ | |
356 | vmovdqu t0, 0 * 16(r); \ | |
357 | vmovdqu t1, 1 * 16(r); \ | |
358 | vmovdqu t2, 2 * 16(r); \ | |
359 | vmovdqu t3, 3 * 16(r); \ | |
360 | \ | |
361 | /* \ | |
362 | * t2 = krl; \ | |
363 | * t2 &= rl; \ | |
364 | * rr ^= rol32(t2, 1); \ | |
365 | */ \ | |
366 | vmovd krl, t0; \ | |
367 | vpshufb tt0, t0, t3; \ | |
368 | vpsrldq $1, t0, t0; \ | |
369 | vpshufb tt0, t0, t2; \ | |
370 | vpsrldq $1, t0, t0; \ | |
371 | vpshufb tt0, t0, t1; \ | |
372 | vpsrldq $1, t0, t0; \ | |
373 | vpshufb tt0, t0, t0; \ | |
374 | \ | |
375 | vpand 0 * 16(r), t0, t0; \ | |
376 | vpand 1 * 16(r), t1, t1; \ | |
377 | vpand 2 * 16(r), t2, t2; \ | |
378 | vpand 3 * 16(r), t3, t3; \ | |
379 | \ | |
380 | rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | |
381 | \ | |
382 | vpxor 4 * 16(r), t0, t0; \ | |
383 | vpxor 5 * 16(r), t1, t1; \ | |
384 | vpxor 6 * 16(r), t2, t2; \ | |
385 | vpxor 7 * 16(r), t3, t3; \ | |
386 | vmovdqu t0, 4 * 16(r); \ | |
387 | vmovdqu t1, 5 * 16(r); \ | |
388 | vmovdqu t2, 6 * 16(r); \ | |
389 | vmovdqu t3, 7 * 16(r); \ | |
390 | \ | |
391 | /* \ | |
392 | * t0 = klr; \ | |
393 | * t0 |= lr; \ | |
394 | * ll ^= t0; \ | |
395 | */ \ | |
396 | \ | |
397 | vmovd klr, t0; \ | |
398 | vpshufb tt0, t0, t3; \ | |
399 | vpsrldq $1, t0, t0; \ | |
400 | vpshufb tt0, t0, t2; \ | |
401 | vpsrldq $1, t0, t0; \ | |
402 | vpshufb tt0, t0, t1; \ | |
403 | vpsrldq $1, t0, t0; \ | |
404 | vpshufb tt0, t0, t0; \ | |
405 | \ | |
406 | vpor l4, t0, t0; \ | |
407 | vpor l5, t1, t1; \ | |
408 | vpor l6, t2, t2; \ | |
409 | vpor l7, t3, t3; \ | |
410 | \ | |
411 | vpxor l0, t0, l0; \ | |
412 | vmovdqu l0, 0 * 16(l); \ | |
413 | vpxor l1, t1, l1; \ | |
414 | vmovdqu l1, 1 * 16(l); \ | |
415 | vpxor l2, t2, l2; \ | |
416 | vmovdqu l2, 2 * 16(l); \ | |
417 | vpxor l3, t3, l3; \ | |
418 | vmovdqu l3, 3 * 16(l); | |
419 | ||
420 | #define transpose_4x4(x0, x1, x2, x3, t1, t2) /* transposes the 4x4 matrix of 32-bit words held in x0..x3; clobbers t1, t2 */ \ | |
421 | vpunpckhdq x1, x0, t2; \ | |
422 | vpunpckldq x1, x0, x0; \ | |
423 | \ | |
424 | vpunpckldq x3, x2, t1; \ | |
425 | vpunpckhdq x3, x2, x2; \ | |
426 | \ | |
427 | vpunpckhqdq t1, x0, x1; \ | |
428 | vpunpcklqdq t1, x0, x0; \ | |
429 | \ | |
430 | vpunpckhqdq x2, t2, x3; \ | |
431 | vpunpcklqdq x2, t2, x2; | |
432 | ||
433 | #define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \ | |
434 | b3, c3, d3, st0, st1) /* st0, st1: two 16-byte memory operands used as spill slots */ \ | |
435 | vmovdqu d2, st0; \ | |
436 | vmovdqu d3, st1; \ | |
437 | transpose_4x4(a0, a1, a2, a3, d2, d3); \ | |
438 | transpose_4x4(b0, b1, b2, b3, d2, d3); \ | |
439 | vmovdqu st0, d2; \ | |
440 | vmovdqu st1, d3; \ | |
441 | \ | |
442 | vmovdqu a0, st0; \ | |
443 | vmovdqu a1, st1; \ | |
444 | transpose_4x4(c0, c1, c2, c3, a0, a1); \ | |
445 | transpose_4x4(d0, d1, d2, d3, a0, a1); \ | |
446 | \ | |
447 | vmovdqu .Lshufb_16x16b, a0; /* a0 now holds the shuffle control; the real a0 is still in st0 */ \ | |
448 | vmovdqu st1, a1; \ | |
449 | vpshufb a0, a2, a2; \ | |
450 | vpshufb a0, a3, a3; \ | |
451 | vpshufb a0, b0, b0; \ | |
452 | vpshufb a0, b1, b1; \ | |
453 | vpshufb a0, b2, b2; \ | |
454 | vpshufb a0, b3, b3; \ | |
455 | vpshufb a0, a1, a1; \ | |
456 | vpshufb a0, c0, c0; \ | |
457 | vpshufb a0, c1, c1; \ | |
458 | vpshufb a0, c2, c2; \ | |
459 | vpshufb a0, c3, c3; \ | |
460 | vpshufb a0, d0, d0; \ | |
461 | vpshufb a0, d1, d1; \ | |
462 | vpshufb a0, d2, d2; \ | |
463 | vpshufb a0, d3, d3; \ | |
464 | vmovdqu d3, st1; \ | |
465 | vmovdqu st0, d3; \ | |
466 | vpshufb a0, d3, a0; /* shuffle the saved a0 last, overwriting the control register */ \ | |
467 | vmovdqu d2, st0; \ | |
468 | \ | |
469 | transpose_4x4(a0, b0, c0, d0, d2, d3); \ | |
470 | transpose_4x4(a1, b1, c1, d1, d2, d3); \ | |
471 | vmovdqu st0, d2; \ | |
472 | vmovdqu st1, d3; \ | |
473 | \ | |
474 | vmovdqu b0, st0; \ | |
475 | vmovdqu b1, st1; \ | |
476 | transpose_4x4(a2, b2, c2, d2, b0, b1); \ | |
477 | transpose_4x4(a3, b3, c3, d3, b0, b1); \ | |
478 | vmovdqu st0, b0; \ | |
479 | vmovdqu st1, b1; \ | |
480 | /* does not adjust output bytes inside vectors */ | |
481 | ||
482 | /* load blocks to registers (block 0 -> y7 ... block 7 -> y0, block 8 -> x7 ... block 15 -> x0) and apply pre-whitening */ | |
483 | #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
484 | y6, y7, rio, key) \ | |
485 | vmovq key, x0; /* 64 bits of key material; upper half of x0 is zero */ \ | |
486 | vpshufb .Lpack_bswap, x0, x0; /* byte-swap each of the two low dwords, keep the upper 8 bytes zero */ \ | |
487 | \ | |
488 | vpxor 0 * 16(rio), x0, y7; \ | |
489 | vpxor 1 * 16(rio), x0, y6; \ | |
490 | vpxor 2 * 16(rio), x0, y5; \ | |
491 | vpxor 3 * 16(rio), x0, y4; \ | |
492 | vpxor 4 * 16(rio), x0, y3; \ | |
493 | vpxor 5 * 16(rio), x0, y2; \ | |
494 | vpxor 6 * 16(rio), x0, y1; \ | |
495 | vpxor 7 * 16(rio), x0, y0; \ | |
496 | vpxor 8 * 16(rio), x0, x7; \ | |
497 | vpxor 9 * 16(rio), x0, x6; \ | |
498 | vpxor 10 * 16(rio), x0, x5; \ | |
499 | vpxor 11 * 16(rio), x0, x4; \ | |
500 | vpxor 12 * 16(rio), x0, x3; \ | |
501 | vpxor 13 * 16(rio), x0, x2; \ | |
502 | vpxor 14 * 16(rio), x0, x1; \ | |
503 | vpxor 15 * 16(rio), x0, x0; | |
504 | ||
505 | /* byteslice pre-whitened blocks and store to temporary memory (the first 16 bytes of mem_ab and mem_cd double as byteslice spill slots) */ | |
506 | #define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
507 | y6, y7, mem_ab, mem_cd) \ | |
508 | byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | |
509 | y5, y6, y7, (mem_ab), (mem_cd)); \ | |
510 | \ | |
511 | vmovdqu x0, 0 * 16(mem_ab); \ | |
512 | vmovdqu x1, 1 * 16(mem_ab); \ | |
513 | vmovdqu x2, 2 * 16(mem_ab); \ | |
514 | vmovdqu x3, 3 * 16(mem_ab); \ | |
515 | vmovdqu x4, 4 * 16(mem_ab); \ | |
516 | vmovdqu x5, 5 * 16(mem_ab); \ | |
517 | vmovdqu x6, 6 * 16(mem_ab); \ | |
518 | vmovdqu x7, 7 * 16(mem_ab); \ | |
519 | vmovdqu y0, 0 * 16(mem_cd); \ | |
520 | vmovdqu y1, 1 * 16(mem_cd); \ | |
521 | vmovdqu y2, 2 * 16(mem_cd); \ | |
522 | vmovdqu y3, 3 * 16(mem_cd); \ | |
523 | vmovdqu y4, 4 * 16(mem_cd); \ | |
524 | vmovdqu y5, 5 * 16(mem_cd); \ | |
525 | vmovdqu y6, 6 * 16(mem_cd); \ | |
526 | vmovdqu y7, 7 * 16(mem_cd); | |
527 | ||
528 | /* de-byteslice and apply post-whitening; results stay in registers (this macro does not store the blocks) */ | |
529 | #define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | |
530 | y5, y6, y7, key, stack_tmp0, stack_tmp1) \ | |
531 | byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \ | |
532 | y7, x3, x7, stack_tmp0, stack_tmp1); \ | |
533 | \ | |
534 | vmovdqu x0, stack_tmp0; /* spill x0 so the register can hold the whitening key */ \ | |
535 | \ | |
536 | vmovq key, x0; \ | |
537 | vpshufb .Lpack_bswap, x0, x0; \ | |
538 | \ | |
539 | vpxor x0, y7, y7; \ | |
540 | vpxor x0, y6, y6; \ | |
541 | vpxor x0, y5, y5; \ | |
542 | vpxor x0, y4, y4; \ | |
543 | vpxor x0, y3, y3; \ | |
544 | vpxor x0, y2, y2; \ | |
545 | vpxor x0, y1, y1; \ | |
546 | vpxor x0, y0, y0; \ | |
547 | vpxor x0, x7, x7; \ | |
548 | vpxor x0, x6, x6; \ | |
549 | vpxor x0, x5, x5; \ | |
550 | vpxor x0, x4, x4; \ | |
551 | vpxor x0, x3, x3; \ | |
552 | vpxor x0, x2, x2; \ | |
553 | vpxor x0, x1, x1; \ | |
554 | vpxor stack_tmp0, x0, x0; /* whiten the spilled x0 last, reloading it from memory */ | |
555 | ||
556 | #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | |
557 | y6, y7, rio) /* stores x0..x7 then y0..y7 to 16 consecutive 16-byte slots at rio */ \ | |
558 | vmovdqu x0, 0 * 16(rio); \ | |
559 | vmovdqu x1, 1 * 16(rio); \ | |
560 | vmovdqu x2, 2 * 16(rio); \ | |
561 | vmovdqu x3, 3 * 16(rio); \ | |
562 | vmovdqu x4, 4 * 16(rio); \ | |
563 | vmovdqu x5, 5 * 16(rio); \ | |
564 | vmovdqu x6, 6 * 16(rio); \ | |
565 | vmovdqu x7, 7 * 16(rio); \ | |
566 | vmovdqu y0, 8 * 16(rio); \ | |
567 | vmovdqu y1, 9 * 16(rio); \ | |
568 | vmovdqu y2, 10 * 16(rio); \ | |
569 | vmovdqu y3, 11 * 16(rio); \ | |
570 | vmovdqu y4, 12 * 16(rio); \ | |
571 | vmovdqu y5, 13 * 16(rio); \ | |
572 | vmovdqu y6, 14 * 16(rio); \ | |
573 | vmovdqu y7, 15 * 16(rio); | |
574 | ||
e183914a DV |
575 | |
576 | /* NB: section is mergeable, all elements must be aligned 16-byte blocks */ | |
577 | .section .rodata.cst16, "aM", @progbits, 16 | |
d9b1d2e7 JK |
578 | .align 16 |
579 | ||
580 | #define SHUFB_BYTES(idx) \ | |
581 | 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) | |
582 | ||
583 | .Lshufb_16x16b: | |
584 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); | |
585 | ||
586 | .Lpack_bswap: | |
587 | .long 0x00010203 | |
588 | .long 0x04050607 | |
589 | .long 0x80808080 | |
590 | .long 0x80808080 | |
591 | ||
592 | /* For CTR-mode IV byteswap */ | |
593 | .Lbswap128_mask: | |
594 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | |
595 | ||
b5c5b072 JK |
596 | /* For XTS mode IV generation */ |
597 | .Lxts_gf128mul_and_shl1_mask: | |
598 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | |
599 | ||
d9b1d2e7 JK |
600 | /* |
601 | * pre-SubByte transform | |
602 | * | |
603 | * pre-lookup for sbox1, sbox2, sbox3: | |
604 | * swap_bitendianness( | |
605 | * isom_map_camellia_to_aes( | |
606 | * camellia_f( | |
607 | * swap_bitendianness(in) | |
608 | * ) | |
609 | * ) | |
610 | * ) | |
611 | * | |
612 | * (note: '⊕ 0xc5' inside camellia_f()) | |
613 | */ | |
614 | .Lpre_tf_lo_s1: | |
615 | .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 | |
616 | .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 | |
617 | .Lpre_tf_hi_s1: | |
618 | .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a | |
619 | .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 | |
620 | ||
621 | /* | |
622 | * pre-SubByte transform | |
623 | * | |
624 | * pre-lookup for sbox4: | |
625 | * swap_bitendianness( | |
626 | * isom_map_camellia_to_aes( | |
627 | * camellia_f( | |
628 | * swap_bitendianness(in <<< 1) | |
629 | * ) | |
630 | * ) | |
631 | * ) | |
632 | * | |
633 | * (note: '⊕ 0xc5' inside camellia_f()) | |
634 | */ | |
635 | .Lpre_tf_lo_s4: | |
636 | .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 | |
637 | .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 | |
638 | .Lpre_tf_hi_s4: | |
639 | .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 | |
640 | .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf | |
641 | ||
642 | /* | |
643 | * post-SubByte transform | |
644 | * | |
645 | * post-lookup for sbox1, sbox4: | |
646 | * swap_bitendianness( | |
647 | * camellia_h( | |
648 | * isom_map_aes_to_camellia( | |
649 | * swap_bitendianness( | |
650 | * aes_inverse_affine_transform(in) | |
651 | * ) | |
652 | * ) | |
653 | * ) | |
654 | * ) | |
655 | * | |
656 | * (note: '⊕ 0x6e' inside camellia_h()) | |
657 | */ | |
658 | .Lpost_tf_lo_s1: | |
659 | .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 | |
660 | .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 | |
661 | .Lpost_tf_hi_s1: | |
662 | .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 | |
663 | .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c | |
664 | ||
665 | /* | |
666 | * post-SubByte transform | |
667 | * | |
668 | * post-lookup for sbox2: | |
669 | * swap_bitendianness( | |
670 | * camellia_h( | |
671 | * isom_map_aes_to_camellia( | |
672 | * swap_bitendianness( | |
673 | * aes_inverse_affine_transform(in) | |
674 | * ) | |
675 | * ) | |
676 | * ) | |
677 | * ) <<< 1 | |
678 | * | |
679 | * (note: '⊕ 0x6e' inside camellia_h()) | |
680 | */ | |
681 | .Lpost_tf_lo_s2: | |
682 | .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 | |
683 | .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 | |
684 | .Lpost_tf_hi_s2: | |
685 | .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 | |
686 | .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 | |
687 | ||
688 | /* | |
689 | * post-SubByte transform | |
690 | * | |
691 | * post-lookup for sbox3: | |
692 | * swap_bitendianness( | |
693 | * camellia_h( | |
694 | * isom_map_aes_to_camellia( | |
695 | * swap_bitendianness( | |
696 | * aes_inverse_affine_transform(in) | |
697 | * ) | |
698 | * ) | |
699 | * ) | |
700 | * ) >>> 1 | |
701 | * | |
702 | * (note: '⊕ 0x6e' inside camellia_h()) | |
703 | */ | |
704 | .Lpost_tf_lo_s3: | |
705 | .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 | |
706 | .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 | |
707 | .Lpost_tf_hi_s3: | |
708 | .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 | |
709 | .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 | |
710 | ||
711 | /* For isolating SubBytes from AESENCLAST, inverse shift row */ | |
712 | .Linv_shift_row: | |
713 | .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b | |
714 | .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 | |
715 | ||
716 | /* 4-bit mask */ | |
e183914a | 717 | .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 |
d9b1d2e7 JK |
718 | .align 4 |
719 | .L0f0f0f0f: | |
720 | .long 0x0f0f0f0f | |
721 | ||
722 | .text | |
723 | ||
724 | .align 8 | |
d9b1d2e7 JK |
725 | __camellia_enc_blk16: |
726 | /* input: | |
727 | * %rdi: ctx, CTX | |
728 | * %rax: temporary storage, 256 bytes | |
729 | * %xmm0..%xmm15: 16 plaintext blocks | |
730 | * output (%rcx, %r8 and %r9 are clobbered): | |
731 | * %xmm0..%xmm15: 16 encrypted blocks, order swapped: | |
732 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | |
733 | */ | |
8691ccd7 | 734 | FRAME_BEGIN |
d9b1d2e7 JK |
735 | |
736 | leaq 8 * 16(%rax), %rcx; | |
737 | ||
738 | inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
739 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
740 | %xmm15, %rax, %rcx); | |
741 | ||
742 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
743 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
744 | %xmm15, %rax, %rcx, 0); | |
745 | ||
746 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
747 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
748 | %xmm15, | |
749 | ((key_table + (8) * 8) + 0)(CTX), | |
750 | ((key_table + (8) * 8) + 4)(CTX), | |
751 | ((key_table + (8) * 8) + 8)(CTX), | |
752 | ((key_table + (8) * 8) + 12)(CTX)); | |
753 | ||
754 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
755 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
756 | %xmm15, %rax, %rcx, 8); | |
757 | ||
758 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
759 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
760 | %xmm15, | |
761 | ((key_table + (16) * 8) + 0)(CTX), | |
762 | ((key_table + (16) * 8) + 4)(CTX), | |
763 | ((key_table + (16) * 8) + 8)(CTX), | |
764 | ((key_table + (16) * 8) + 12)(CTX)); | |
765 | ||
766 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
767 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
768 | %xmm15, %rax, %rcx, 16); | |
769 | ||
770 | movl $24, %r8d; | |
771 | cmpl $16, key_length(CTX); | |
772 | jne .Lenc_max32; | |
773 | ||
774 | .Lenc_done: | |
775 | /* load CD for output; %r8 = key_table index of the post-whitening key (24 or 32) */ | |
776 | vmovdqu 0 * 16(%rcx), %xmm8; | |
777 | vmovdqu 1 * 16(%rcx), %xmm9; | |
778 | vmovdqu 2 * 16(%rcx), %xmm10; | |
779 | vmovdqu 3 * 16(%rcx), %xmm11; | |
780 | vmovdqu 4 * 16(%rcx), %xmm12; | |
781 | vmovdqu 5 * 16(%rcx), %xmm13; | |
782 | vmovdqu 6 * 16(%rcx), %xmm14; | |
783 | vmovdqu 7 * 16(%rcx), %xmm15; | |
784 | ||
785 | outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
786 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
787 | %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); | |
788 | ||
8691ccd7 | 789 | FRAME_END |
d9b1d2e7 JK |
790 | ret; |
791 | ||
792 | .align 8 | |
793 | .Lenc_max32: | |
794 | movl $32, %r8d; | |
795 | ||
796 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
797 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
798 | %xmm15, | |
799 | ((key_table + (24) * 8) + 0)(CTX), | |
800 | ((key_table + (24) * 8) + 4)(CTX), | |
801 | ((key_table + (24) * 8) + 8)(CTX), | |
802 | ((key_table + (24) * 8) + 12)(CTX)); | |
803 | ||
804 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
805 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
806 | %xmm15, %rax, %rcx, 24); | |
807 | ||
808 | jmp .Lenc_done; | |
59990684 | 809 | ENDPROC(__camellia_enc_blk16) |
d9b1d2e7 JK |
810 | |
811 | .align 8 | |
d9b1d2e7 JK |
812 | __camellia_dec_blk16: |
813 | /* input: | |
814 | * %rdi: ctx, CTX | |
815 | * %rax: temporary storage, 256 bytes | |
816 | * %r8d: 24 for 16 byte key, 32 for larger | |
817 | * %xmm0..%xmm15: 16 encrypted blocks | |
818 | * output (%rcx and %r9 are clobbered): | |
819 | * %xmm0..%xmm15: 16 plaintext blocks, order swapped: | |
820 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | |
821 | */ | |
8691ccd7 | 822 | FRAME_BEGIN |
d9b1d2e7 JK |
823 | |
824 | leaq 8 * 16(%rax), %rcx; | |
825 | ||
826 | inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
827 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
828 | %xmm15, %rax, %rcx); | |
829 | ||
830 | cmpl $32, %r8d; | |
831 | je .Ldec_max32; | |
832 | ||
833 | .Ldec_max24: | |
834 | dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
835 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
836 | %xmm15, %rax, %rcx, 16); | |
837 | ||
838 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
839 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
840 | %xmm15, | |
841 | ((key_table + (16) * 8) + 8)(CTX), | |
842 | ((key_table + (16) * 8) + 12)(CTX), | |
843 | ((key_table + (16) * 8) + 0)(CTX), | |
844 | ((key_table + (16) * 8) + 4)(CTX)); | |
845 | ||
846 | dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
847 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
848 | %xmm15, %rax, %rcx, 8); | |
849 | ||
850 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
851 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
852 | %xmm15, | |
853 | ((key_table + (8) * 8) + 8)(CTX), | |
854 | ((key_table + (8) * 8) + 12)(CTX), | |
855 | ((key_table + (8) * 8) + 0)(CTX), | |
856 | ((key_table + (8) * 8) + 4)(CTX)); | |
857 | ||
858 | dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
859 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
860 | %xmm15, %rax, %rcx, 0); | |
861 | ||
862 | /* load CD for output; post-whitening below uses key_table entry 0 */ | |
863 | vmovdqu 0 * 16(%rcx), %xmm8; | |
864 | vmovdqu 1 * 16(%rcx), %xmm9; | |
865 | vmovdqu 2 * 16(%rcx), %xmm10; | |
866 | vmovdqu 3 * 16(%rcx), %xmm11; | |
867 | vmovdqu 4 * 16(%rcx), %xmm12; | |
868 | vmovdqu 5 * 16(%rcx), %xmm13; | |
869 | vmovdqu 6 * 16(%rcx), %xmm14; | |
870 | vmovdqu 7 * 16(%rcx), %xmm15; | |
871 | ||
872 | outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
873 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
874 | %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); | |
875 | ||
8691ccd7 | 876 | FRAME_END |
d9b1d2e7 JK |
877 | ret; |
878 | ||
879 | .align 8 | |
880 | .Ldec_max32: | |
881 | dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
882 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
883 | %xmm15, %rax, %rcx, 24); | |
884 | ||
885 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
886 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
887 | %xmm15, | |
888 | ((key_table + (24) * 8) + 8)(CTX), | |
889 | ((key_table + (24) * 8) + 12)(CTX), | |
890 | ((key_table + (24) * 8) + 0)(CTX), | |
891 | ((key_table + (24) * 8) + 4)(CTX)); | |
892 | ||
893 | jmp .Ldec_max24; | |
59990684 | 894 | ENDPROC(__camellia_dec_blk16) |
d9b1d2e7 | 895 | |
59990684 | 896 | ENTRY(camellia_ecb_enc_16way) |
d9b1d2e7 JK |
897 | /* input: |
898 | * %rdi: ctx, CTX | |
899 | * %rsi: dst (16 blocks); also handed to __camellia_enc_blk16 as its 256-byte scratch buffer | |
900 | * %rdx: src (16 blocks) | |
901 | */ | |
8691ccd7 | 902 | FRAME_BEGIN |
d9b1d2e7 JK |
903 | |
904 | inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
905 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
906 | %xmm15, %rdx, (key_table)(CTX)); | |
907 | ||
908 | /* all of src is in registers now, so dst can be used as temporary buffer (even in src == dst case) */ | |
909 | movq %rsi, %rax; | |
910 | ||
911 | call __camellia_enc_blk16; | |
912 | ||
913 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | |
914 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | |
915 | %xmm8, %rsi); /* registers listed in reverse to undo the swapped block order */ | |
916 | ||
8691ccd7 | 917 | FRAME_END |
d9b1d2e7 | 918 | ret; |
59990684 | 919 | ENDPROC(camellia_ecb_enc_16way) |
d9b1d2e7 | 920 | |
59990684 | 921 | ENTRY(camellia_ecb_dec_16way) |
d9b1d2e7 JK |
922 | /* 16-way ECB decryption entry point. input: |
923 | * %rdi: ctx, CTX | |
924 | * %rsi: dst (16 blocks) | |
925 | * %rdx: src (16 blocks) | |
926 | * %r8 receives the key_table index (24 or 32); %rax is overwritten. */ | |
8691ccd7 | 927 | FRAME_BEGIN |
d9b1d2e7 JK |
928 | |
929 | cmpl $16, key_length(CTX); /* flags stay live until the cmovel below; movl does not touch them */ | |
930 | movl $32, %r8d; | |
931 | movl $24, %eax; | |
932 | cmovel %eax, %r8d; /* max: 24 when key_length == 16, otherwise 32 */ | |
933 | ||
934 | inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
935 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
936 | %xmm15, %rdx, (key_table)(CTX, %r8, 8)); /* key_table entry at index max, 8 bytes per entry */ | |
937 | ||
938 | /* now dst can be used as temporary buffer (even in src == dst case) */ | |
939 | movq %rsi, %rax; | |
940 | ||
941 | call __camellia_dec_blk16; /* reads %r8d (max) set above - TODO confirm against its header */ | |
942 | ||
943 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | |
944 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | |
945 | %xmm8, %rsi); /* result registers are handed over in this order together with dst */ | |
946 | ||
8691ccd7 | 947 | FRAME_END |
d9b1d2e7 | 948 | ret; |
59990684 | 949 | ENDPROC(camellia_ecb_dec_16way) |
d9b1d2e7 | 950 | |
59990684 | 951 | ENTRY(camellia_cbc_dec_16way) |
d9b1d2e7 JK |
952 | /* 16-way CBC decryption: decrypts 16 blocks, then XORs outputs 1..15 with src blocks 0..14. input: |
953 | * %rdi: ctx, CTX | |
954 | * %rsi: dst (16 blocks) | |
955 | * %rdx: src (16 blocks) | |
956 | * The first output block (%xmm7) is stored without any XOR here. */ | |
8691ccd7 | 957 | FRAME_BEGIN |
d9b1d2e7 JK |
958 | |
959 | cmpl $16, key_length(CTX); /* flags stay live until the cmovel below */ | |
960 | movl $32, %r8d; | |
961 | movl $24, %eax; | |
962 | cmovel %eax, %r8d; /* max: 24 when key_length == 16, otherwise 32 */ | |
963 | ||
964 | inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | |
965 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | |
966 | %xmm15, %rdx, (key_table)(CTX, %r8, 8)); /* key_table entry at index max, 8 bytes per entry */ | |
967 | ||
968 | /* | |
969 | * dst might still be in-use (in case dst == src), so use stack for | |
970 | * temporary storage. | |
971 | */ | |
972 | subq $(16 * 16), %rsp; /* reserve 256 bytes (16 blocks of 16 bytes) */ | |
973 | movq %rsp, %rax; | |
974 | ||
975 | call __camellia_dec_blk16; | |
976 | ||
977 | addq $(16 * 16), %rsp; /* release the temporary storage */ | |
978 | ||
979 | vpxor (0 * 16)(%rdx), %xmm6, %xmm6; /* chaining: output block i+1 ^= src block i; src is read before dst is written */ | |
980 | vpxor (1 * 16)(%rdx), %xmm5, %xmm5; | |
981 | vpxor (2 * 16)(%rdx), %xmm4, %xmm4; | |
982 | vpxor (3 * 16)(%rdx), %xmm3, %xmm3; | |
983 | vpxor (4 * 16)(%rdx), %xmm2, %xmm2; | |
984 | vpxor (5 * 16)(%rdx), %xmm1, %xmm1; | |
985 | vpxor (6 * 16)(%rdx), %xmm0, %xmm0; | |
986 | vpxor (7 * 16)(%rdx), %xmm15, %xmm15; | |
987 | vpxor (8 * 16)(%rdx), %xmm14, %xmm14; | |
988 | vpxor (9 * 16)(%rdx), %xmm13, %xmm13; | |
989 | vpxor (10 * 16)(%rdx), %xmm12, %xmm12; | |
990 | vpxor (11 * 16)(%rdx), %xmm11, %xmm11; | |
991 | vpxor (12 * 16)(%rdx), %xmm10, %xmm10; | |
992 | vpxor (13 * 16)(%rdx), %xmm9, %xmm9; | |
993 | vpxor (14 * 16)(%rdx), %xmm8, %xmm8; | |
994 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | |
995 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | |
996 | %xmm8, %rsi); /* NOTE(review): %xmm7 carries no IV XOR - presumably applied by the caller; confirm */ | |
997 | ||
8691ccd7 | 998 | FRAME_END |
d9b1d2e7 | 999 | ret; |
59990684 | 1000 | ENDPROC(camellia_cbc_dec_16way) |
d9b1d2e7 JK |
1001 | |
1002 | #define inc_le128(x, minus_one, tmp) /* x += 1 as a 128-bit integer; minus_one must hold {low qword: -1, high qword: 0}; tmp is clobbered */ \ | |
1003 | vpcmpeqq minus_one, x, tmp; /* tmp.low = all-ones iff x.low == -1, i.e. the increment will carry */ \ | |
1004 | vpsubq minus_one, x, x; /* x.low -= -1 (adds one); x.high -= 0 */ \ | |
1005 | vpslldq $8, tmp, tmp; /* move the carry indicator into the high qword, zero the low one */ \ | |
1006 | vpsubq tmp, x, x; /* x.high -= -1 when the low qword wrapped */ | |
1007 | ||
59990684 | 1008 | ENTRY(camellia_ctr_16way) |
d9b1d2e7 JK |
1009 | /* 16-way CTR: encrypts 16 consecutive counter values and XORs them with src. input: |
1010 | * %rdi: ctx, CTX | |
1011 | * %rsi: dst (16 blocks) | |
1012 | * %rdx: src (16 blocks) | |
1013 | * %rcx: iv (little endian, 128bit); rewritten with iv + 16 before returning | |
1014 | * Constants are addressed RIP-relative so no absolute relocations are needed. */ | |
8691ccd7 | 1015 | FRAME_BEGIN |
d9b1d2e7 JK |
1016 | |
1017 | subq $(16 * 16), %rsp; /* 256-byte stack scratch area, passed on in %rax */ | |
1018 | movq %rsp, %rax; | |
1019 | ||
1020 | vmovdqa .Lbswap128_mask(%rip), %xmm14; | |
1021 | ||
1022 | /* load IV and byteswap */ | |
1023 | vmovdqu (%rcx), %xmm0; | |
1024 | vpshufb %xmm14, %xmm0, %xmm15; | |
1025 | vmovdqu %xmm15, 15 * 16(%rax); /* counter +0 is parked on the stack: %xmm15 is needed below */ | |
1026 | ||
1027 | vpcmpeqd %xmm15, %xmm15, %xmm15; | |
1028 | vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ | |
1029 | ||
1030 | /* construct IVs: %xmm0 is the running little-endian counter, each copy is byteswapped */ | |
1031 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1032 | vpshufb %xmm14, %xmm0, %xmm13; | |
1033 | vmovdqu %xmm13, 14 * 16(%rax); /* counter +1, parked: %xmm13 is the inc_le128 temporary */ | |
1034 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1035 | vpshufb %xmm14, %xmm0, %xmm13; | |
1036 | vmovdqu %xmm13, 13 * 16(%rax); /* counter +2, parked for the same reason */ | |
1037 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1038 | vpshufb %xmm14, %xmm0, %xmm12; | |
1039 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1040 | vpshufb %xmm14, %xmm0, %xmm11; | |
1041 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1042 | vpshufb %xmm14, %xmm0, %xmm10; | |
1043 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1044 | vpshufb %xmm14, %xmm0, %xmm9; | |
1045 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1046 | vpshufb %xmm14, %xmm0, %xmm8; | |
1047 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1048 | vpshufb %xmm14, %xmm0, %xmm7; | |
1049 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1050 | vpshufb %xmm14, %xmm0, %xmm6; | |
1051 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1052 | vpshufb %xmm14, %xmm0, %xmm5; | |
1053 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1054 | vpshufb %xmm14, %xmm0, %xmm4; | |
1055 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1056 | vpshufb %xmm14, %xmm0, %xmm3; | |
1057 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1058 | vpshufb %xmm14, %xmm0, %xmm2; | |
1059 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1060 | vpshufb %xmm14, %xmm0, %xmm1; | |
1061 | inc_le128(%xmm0, %xmm15, %xmm13); | |
1062 | vmovdqa %xmm0, %xmm13; /* keep the little-endian counter +15 before %xmm0 is byteswapped */ | |
1063 | vpshufb %xmm14, %xmm0, %xmm0; | |
1064 | inc_le128(%xmm13, %xmm15, %xmm14); /* the byteswap mask in %xmm14 is no longer needed, reuse it as tmp */ | |
1065 | vmovdqu %xmm13, (%rcx); /* write counter +16 back through the iv pointer */ | |
1066 | ||
1067 | /* inpack16_pre: */ | |
1068 | vmovq (key_table)(CTX), %xmm15; | |
1069 | vpshufb .Lpack_bswap(%rip), %xmm15, %xmm15; | |
1070 | vpxor %xmm0, %xmm15, %xmm0; | |
1071 | vpxor %xmm1, %xmm15, %xmm1; | |
1072 | vpxor %xmm2, %xmm15, %xmm2; | |
1073 | vpxor %xmm3, %xmm15, %xmm3; | |
1074 | vpxor %xmm4, %xmm15, %xmm4; | |
1075 | vpxor %xmm5, %xmm15, %xmm5; | |
1076 | vpxor %xmm6, %xmm15, %xmm6; | |
1077 | vpxor %xmm7, %xmm15, %xmm7; | |
1078 | vpxor %xmm8, %xmm15, %xmm8; | |
1079 | vpxor %xmm9, %xmm15, %xmm9; | |
1080 | vpxor %xmm10, %xmm15, %xmm10; | |
1081 | vpxor %xmm11, %xmm15, %xmm11; | |
1082 | vpxor %xmm12, %xmm15, %xmm12; | |
1083 | vpxor 13 * 16(%rax), %xmm15, %xmm13; /* reload the three parked counters */ | |
1084 | vpxor 14 * 16(%rax), %xmm15, %xmm14; | |
1085 | vpxor 15 * 16(%rax), %xmm15, %xmm15; /* must be last: overwrites the key register */ | |
1086 | ||
1087 | call __camellia_enc_blk16; | |
1088 | ||
1089 | addq $(16 * 16), %rsp; /* release the stack scratch area */ | |
1090 | ||
1091 | vpxor 0 * 16(%rdx), %xmm7, %xmm7; /* XOR each encrypted counter with its src block */ | |
1092 | vpxor 1 * 16(%rdx), %xmm6, %xmm6; | |
1093 | vpxor 2 * 16(%rdx), %xmm5, %xmm5; | |
1094 | vpxor 3 * 16(%rdx), %xmm4, %xmm4; | |
1095 | vpxor 4 * 16(%rdx), %xmm3, %xmm3; | |
1096 | vpxor 5 * 16(%rdx), %xmm2, %xmm2; | |
1097 | vpxor 6 * 16(%rdx), %xmm1, %xmm1; | |
1098 | vpxor 7 * 16(%rdx), %xmm0, %xmm0; | |
1099 | vpxor 8 * 16(%rdx), %xmm15, %xmm15; | |
1100 | vpxor 9 * 16(%rdx), %xmm14, %xmm14; | |
1101 | vpxor 10 * 16(%rdx), %xmm13, %xmm13; | |
1102 | vpxor 11 * 16(%rdx), %xmm12, %xmm12; | |
1103 | vpxor 12 * 16(%rdx), %xmm11, %xmm11; | |
1104 | vpxor 13 * 16(%rdx), %xmm10, %xmm10; | |
1105 | vpxor 14 * 16(%rdx), %xmm9, %xmm9; | |
1106 | vpxor 15 * 16(%rdx), %xmm8, %xmm8; | |
1107 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | |
1108 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | |
1109 | %xmm8, %rsi); | |
1110 | ||
8691ccd7 | 1111 | FRAME_END |
d9b1d2e7 | 1112 | ret; |
59990684 | 1113 | ENDPROC(camellia_ctr_16way) |
b5c5b072 JK |
1114 | |
1115 | #define gf128mul_x_ble(iv, mask, tmp) /* updates iv in place and clobbers tmp; per the name a multiply-by-x in GF(2^128) - the mask constant is defined elsewhere, TODO confirm */ \ | |
1116 | vpsrad $31, iv, tmp; /* every dword of tmp becomes all-ones when that dword's top bit is set */ \ | |
1117 | vpaddq iv, iv, iv; /* shift each 64-bit half of iv left by one bit */ \ | |
1118 | vpshufd $0x13, tmp, tmp; /* tmp dword0 <- old dword3 (bit 127), dword2 <- old dword1 (bit 63) */ \ | |
1119 | vpand mask, tmp, tmp; /* keep only the bits selected by mask */ \ | |
1120 | vpxor tmp, iv, iv; /* fold the bits shifted out of each half back into iv */ | |
1121 | ||
1122 | .align 8 | |
1123 | camellia_xts_crypt_16way: | |
1124 | /* Shared XTS body for the enc/dec wrappers. input: | |
1125 | * %rdi: ctx, CTX | |
1126 | * %rsi: dst (16 blocks); also holds the 16 tweaks until the final XOR | |
1127 | * %rdx: src (16 blocks) | |
1128 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)); rewritten with the tweak following the 16th | |
1129 | * %r8: index for input whitening key | |
1130 | * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 | |
1131 | * Constants are addressed RIP-relative so no absolute relocations are needed. */ | |
8691ccd7 | 1132 | FRAME_BEGIN |
b5c5b072 JK |
1133 | |
1134 | subq $(16 * 16), %rsp; /* 256-byte stack scratch area, passed on in %rax */ | |
1135 | movq %rsp, %rax; | |
1136 | ||
1137 | vmovdqa .Lxts_gf128mul_and_shl1_mask(%rip), %xmm14; | |
1138 | ||
1139 | /* load IV */ | |
1140 | vmovdqu (%rcx), %xmm0; | |
1141 | vpxor 0 * 16(%rdx), %xmm0, %xmm15; | |
1142 | vmovdqu %xmm15, 15 * 16(%rax); /* block 0 is parked on the stack: %xmm15 is the gf128mul temporary */ | |
1143 | vmovdqu %xmm0, 0 * 16(%rsi); /* src block i is always read before tweak i lands in dst (dst == src safe) */ | |
1144 | ||
1145 | /* construct IVs */ | |
1146 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1147 | vpxor 1 * 16(%rdx), %xmm0, %xmm15; | |
1148 | vmovdqu %xmm15, 14 * 16(%rax); /* block 1 is parked as well */ | |
1149 | vmovdqu %xmm0, 1 * 16(%rsi); | |
1150 | ||
1151 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1152 | vpxor 2 * 16(%rdx), %xmm0, %xmm13; | |
1153 | vmovdqu %xmm0, 2 * 16(%rsi); | |
1154 | ||
1155 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1156 | vpxor 3 * 16(%rdx), %xmm0, %xmm12; | |
1157 | vmovdqu %xmm0, 3 * 16(%rsi); | |
1158 | ||
1159 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1160 | vpxor 4 * 16(%rdx), %xmm0, %xmm11; | |
1161 | vmovdqu %xmm0, 4 * 16(%rsi); | |
1162 | ||
1163 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1164 | vpxor 5 * 16(%rdx), %xmm0, %xmm10; | |
1165 | vmovdqu %xmm0, 5 * 16(%rsi); | |
1166 | ||
1167 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1168 | vpxor 6 * 16(%rdx), %xmm0, %xmm9; | |
1169 | vmovdqu %xmm0, 6 * 16(%rsi); | |
1170 | ||
1171 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1172 | vpxor 7 * 16(%rdx), %xmm0, %xmm8; | |
1173 | vmovdqu %xmm0, 7 * 16(%rsi); | |
1174 | ||
1175 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1176 | vpxor 8 * 16(%rdx), %xmm0, %xmm7; | |
1177 | vmovdqu %xmm0, 8 * 16(%rsi); | |
1178 | ||
1179 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1180 | vpxor 9 * 16(%rdx), %xmm0, %xmm6; | |
1181 | vmovdqu %xmm0, 9 * 16(%rsi); | |
1182 | ||
1183 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1184 | vpxor 10 * 16(%rdx), %xmm0, %xmm5; | |
1185 | vmovdqu %xmm0, 10 * 16(%rsi); | |
1186 | ||
1187 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1188 | vpxor 11 * 16(%rdx), %xmm0, %xmm4; | |
1189 | vmovdqu %xmm0, 11 * 16(%rsi); | |
1190 | ||
1191 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1192 | vpxor 12 * 16(%rdx), %xmm0, %xmm3; | |
1193 | vmovdqu %xmm0, 12 * 16(%rsi); | |
1194 | ||
1195 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1196 | vpxor 13 * 16(%rdx), %xmm0, %xmm2; | |
1197 | vmovdqu %xmm0, 13 * 16(%rsi); | |
1198 | ||
1199 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1200 | vpxor 14 * 16(%rdx), %xmm0, %xmm1; | |
1201 | vmovdqu %xmm0, 14 * 16(%rsi); | |
1202 | ||
1203 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1204 | vpxor 15 * 16(%rdx), %xmm0, %xmm15; | |
1205 | vmovdqu %xmm15, 0 * 16(%rax); /* block 15 is parked: %xmm0 still holds the running tweak */ | |
1206 | vmovdqu %xmm0, 15 * 16(%rsi); | |
1207 | ||
1208 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | |
1209 | vmovdqu %xmm0, (%rcx); /* write the next tweak back through the iv pointer */ | |
1210 | ||
1211 | /* inpack16_pre: */ | |
1212 | vmovq (key_table)(CTX, %r8, 8), %xmm15; | |
1213 | vpshufb .Lpack_bswap(%rip), %xmm15, %xmm15; | |
1214 | vpxor 0 * 16(%rax), %xmm15, %xmm0; /* reload parked block 15 */ | |
1215 | vpxor %xmm1, %xmm15, %xmm1; | |
1216 | vpxor %xmm2, %xmm15, %xmm2; | |
1217 | vpxor %xmm3, %xmm15, %xmm3; | |
1218 | vpxor %xmm4, %xmm15, %xmm4; | |
1219 | vpxor %xmm5, %xmm15, %xmm5; | |
1220 | vpxor %xmm6, %xmm15, %xmm6; | |
1221 | vpxor %xmm7, %xmm15, %xmm7; | |
1222 | vpxor %xmm8, %xmm15, %xmm8; | |
1223 | vpxor %xmm9, %xmm15, %xmm9; | |
1224 | vpxor %xmm10, %xmm15, %xmm10; | |
1225 | vpxor %xmm11, %xmm15, %xmm11; | |
1226 | vpxor %xmm12, %xmm15, %xmm12; | |
1227 | vpxor %xmm13, %xmm15, %xmm13; | |
1228 | vpxor 14 * 16(%rax), %xmm15, %xmm14; /* reload parked block 1 */ | |
1229 | vpxor 15 * 16(%rax), %xmm15, %xmm15; /* reload parked block 0; must be last: overwrites the key register */ | |
1230 | ||
9697fa39 | 1231 | CALL_NOSPEC %r9; |
b5c5b072 JK |
1232 | |
1233 | addq $(16 * 16), %rsp; /* release the stack scratch area */ | |
1234 | ||
1235 | vpxor 0 * 16(%rsi), %xmm7, %xmm7; /* XOR each result with the tweak parked in dst */ | |
1236 | vpxor 1 * 16(%rsi), %xmm6, %xmm6; | |
1237 | vpxor 2 * 16(%rsi), %xmm5, %xmm5; | |
1238 | vpxor 3 * 16(%rsi), %xmm4, %xmm4; | |
1239 | vpxor 4 * 16(%rsi), %xmm3, %xmm3; | |
1240 | vpxor 5 * 16(%rsi), %xmm2, %xmm2; | |
1241 | vpxor 6 * 16(%rsi), %xmm1, %xmm1; | |
1242 | vpxor 7 * 16(%rsi), %xmm0, %xmm0; | |
1243 | vpxor 8 * 16(%rsi), %xmm15, %xmm15; | |
1244 | vpxor 9 * 16(%rsi), %xmm14, %xmm14; | |
1245 | vpxor 10 * 16(%rsi), %xmm13, %xmm13; | |
1246 | vpxor 11 * 16(%rsi), %xmm12, %xmm12; | |
1247 | vpxor 12 * 16(%rsi), %xmm11, %xmm11; | |
1248 | vpxor 13 * 16(%rsi), %xmm10, %xmm10; | |
1249 | vpxor 14 * 16(%rsi), %xmm9, %xmm9; | |
1250 | vpxor 15 * 16(%rsi), %xmm8, %xmm8; | |
1251 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | |
1252 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | |
1253 | %xmm8, %rsi); | |
1254 | ||
8691ccd7 | 1255 | FRAME_END |
b5c5b072 JK |
1256 | ret; |
1257 | ENDPROC(camellia_xts_crypt_16way) | |
1258 | ||
1259 | ENTRY(camellia_xts_enc_16way) | |
1260 | /* XTS encryption wrapper: sets %r8 and %r9, then tail-jumps to camellia_xts_crypt_16way. input: | |
1261 | * %rdi: ctx, CTX | |
1262 | * %rsi: dst (16 blocks) | |
1263 | * %rdx: src (16 blocks) | |
1264 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | |
1265 | */ | |
1266 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | |
1267 | ||
1268 | leaq __camellia_enc_blk16(%rip), %r9; /* RIP-relative: no absolute relocation for the routine address */ | |
1269 | ||
1270 | jmp camellia_xts_crypt_16way; /* tail call: the shared body returns to our caller */ | |
1271 | ENDPROC(camellia_xts_enc_16way) | |
1272 | ||
1273 | ENTRY(camellia_xts_dec_16way) | |
1274 | /* XTS decryption wrapper: sets %r8 and %r9, then tail-jumps to camellia_xts_crypt_16way. input: | |
1275 | * %rdi: ctx, CTX | |
1276 | * %rsi: dst (16 blocks) | |
1277 | * %rdx: src (16 blocks) | |
1278 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | |
1279 | */ | |
1280 | ||
1281 | cmpl $16, key_length(CTX); /* flags stay live until the cmovel below */ | |
1282 | movl $32, %r8d; | |
1283 | movl $24, %eax; | |
1284 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | |
1285 | ||
1286 | leaq __camellia_dec_blk16(%rip), %r9; /* RIP-relative: no absolute relocation for the routine address */ | |
1287 | ||
1288 | jmp camellia_xts_crypt_16way; /* tail call: the shared body returns to our caller */ | |
1289 | ENDPROC(camellia_xts_dec_16way) |