/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>

.file "serpent-sse2-x86_64-asm_64.S"
.text

/* Pointer to the key context; get_key() reads 32-bit key words through it. */
#define CTX %rdi

/**********************************************************************
  8-way SSE2 serpent
 **********************************************************************/

/*
 * Register map.
 *
 * Two independent register sets (suffix 1 and suffix 2) are processed side
 * by side.  The round macros take the un-suffixed names RA..RE and paste
 * the suffix on with "x ## 1" / "x ## 2".  Each set is filled with four
 * 16-byte blocks by read_blocks(), giving 8 blocks per call.  In each set
 * four registers carry state and the fifth is scratch; which name plays
 * which role rotates from round to round (see the call sequences in the
 * entry points).
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define RA2 %xmm5
#define RB2 %xmm6
#define RC2 %xmm7
#define RD2 %xmm8
#define RE2 %xmm9

/* All-ones mask (set with pcmpeqd in the entry points); pxor RNOT, r == ~r. */
#define RNOT %xmm10

/*
 * Round-key words, each broadcast to all four lanes by get_key().
 * RK0..RK2 are also handed to the block load/store macros as temporaries.
 */
#define RK0 %xmm11
#define RK1 %xmm12
#define RK2 %xmm13
#define RK3 %xmm14

/*
 * S0_1 / S0_2: the two halves of S-box S0 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x2, x1, x3, x0); x4 is free.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	movdqa  x3, x4; \
	por     x0, x3; \
	pxor    x4, x0; \
	pxor    x2, x4; \
	pxor    RNOT, x4; \
	pxor    x1, x3; \
	pand    x0, x1; \
	pxor    x4, x1; \
	pxor    x0, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	pxor    x3, x0; \
	por     x0, x4; \
	pxor    x2, x0; \
	pand    x1, x2; \
	pxor    x2, x3; \
	pxor    RNOT, x1; \
	pxor    x4, x2; \
	pxor    x2, x1;

/*
 * S1_1 / S1_2: the two halves of S-box S1 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x4, x2, x3, x0); x1 is free.
 */
#define S1_1(x0, x1, x2, x3, x4) \
	movdqa  x1, x4; \
	pxor    x0, x1; \
	pxor    x3, x0; \
	pxor    RNOT, x3; \
	pand    x1, x4; \
	por     x1, x0; \
	pxor    x2, x3; \
	pxor    x3, x0; \
	pxor    x3, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	pxor    x4, x3; \
	por     x4, x1; \
	pxor    x2, x4; \
	pand    x0, x2; \
	pxor    x1, x2; \
	por     x0, x1; \
	pxor    RNOT, x0; \
	pxor    x2, x0; \
	pxor    x1, x4;

/*
 * S2_1 / S2_2: the two halves of S-box S2 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x4, x1, x0, x3); x2 is free.
 */
#define S2_1(x0, x1, x2, x3, x4) \
	pxor    RNOT, x3; \
	pxor    x0, x1; \
	movdqa  x0, x4; \
	pand    x2, x0; \
	pxor    x3, x0; \
	por     x4, x3; \
	pxor    x1, x2; \
	pxor    x1, x3; \
	pand    x0, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	pxor    x2, x0; \
	pand    x3, x2; \
	por     x1, x3; \
	pxor    RNOT, x0; \
	pxor    x0, x3; \
	pxor    x0, x4; \
	pxor    x2, x0; \
	por     x2, x1;

/*
 * S3_1 / S3_2: the two halves of S-box S3 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  This S-box does not use RNOT.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x3, x4, x1, x0); x2 is free.
 */
#define S3_1(x0, x1, x2, x3, x4) \
	movdqa  x1, x4; \
	pxor    x3, x1; \
	por     x0, x3; \
	pand    x0, x4; \
	pxor    x2, x0; \
	pxor    x1, x2; \
	pand    x3, x1; \
	pxor    x3, x2; \
	por     x4, x0; \
	pxor    x3, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	pxor    x0, x1; \
	pand    x3, x0; \
	pand    x4, x3; \
	pxor    x2, x3; \
	por     x1, x4; \
	pand    x1, x2; \
	pxor    x3, x4; \
	pxor    x3, x0; \
	pxor    x2, x3;

/*
 * S4_1 / S4_2: the two halves of S-box S4 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x1, x2, x3, x4); x0 is free.
 */
#define S4_1(x0, x1, x2, x3, x4) \
	movdqa  x3, x4; \
	pand    x0, x3; \
	pxor    x4, x0; \
	pxor    x2, x3; \
	por     x4, x2; \
	pxor    x1, x0; \
	pxor    x3, x4; \
	por     x0, x2; \
	pxor    x1, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	pand    x0, x1; \
	pxor    x4, x1; \
	pand    x2, x4; \
	pxor    x3, x2; \
	pxor    x0, x4; \
	por     x1, x3; \
	pxor    RNOT, x1; \
	pxor    x0, x3;

/*
 * S5_1 / S5_2: the two halves of S-box S5 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x4, x0, x1, x3); x2 is free.
 */
#define S5_1(x0, x1, x2, x3, x4) \
	movdqa  x1, x4; \
	por     x0, x1; \
	pxor    x1, x2; \
	pxor    RNOT, x3; \
	pxor    x0, x4; \
	pxor    x2, x0; \
	pand    x4, x1; \
	por     x3, x4; \
	pxor    x0, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	pand    x3, x0; \
	pxor    x3, x1; \
	pxor    x2, x3; \
	pxor    x1, x0; \
	pand    x4, x2; \
	pxor    x2, x1; \
	pand    x0, x2; \
	pxor    x2, x3;

/*
 * S6_1 / S6_2: the two halves of S-box S6 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x2, x4, x1, x3); x0 is free.
 */
#define S6_1(x0, x1, x2, x3, x4) \
	movdqa  x1, x4; \
	pxor    x0, x3; \
	pxor    x2, x1; \
	pxor    x0, x2; \
	pand    x3, x0; \
	por     x3, x1; \
	pxor    RNOT, x4; \
	pxor    x1, x0; \
	pxor    x2, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	pxor    x4, x3; \
	pxor    x0, x4; \
	pand    x0, x2; \
	pxor    x1, x4; \
	pxor    x3, x2; \
	pand    x1, x3; \
	pxor    x0, x3; \
	pxor    x2, x1;

/*
 * S7_1 / S7_2: the two halves of S-box S7 (encryption direction; used only
 * by __serpent_enc_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x4, x2, x3, x0); x1 is free.
 */
#define S7_1(x0, x1, x2, x3, x4) \
	pxor    RNOT, x1; \
	movdqa  x1, x4; \
	pxor    RNOT, x0; \
	pand    x2, x1; \
	pxor    x3, x1; \
	por     x4, x3; \
	pxor    x2, x4; \
	pxor    x3, x2; \
	pxor    x0, x3; \
	por     x1, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	pand    x0, x2; \
	pxor    x4, x0; \
	pxor    x3, x4; \
	pand    x0, x3; \
	pxor    x1, x4; \
	pxor    x4, x2; \
	pxor    x1, x3; \
	por     x0, x4; \
	pxor    x1, x4;

/*
 * SI0_1 / SI0_2: the two halves of S-box SI0 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x2, x4, x1, x0); x3 is free.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	movdqa  x3, x4; \
	pxor    x0, x1; \
	por     x1, x3; \
	pxor    x1, x4; \
	pxor    RNOT, x0; \
	pxor    x3, x2; \
	pxor    x0, x3; \
	pand    x1, x0; \
	pxor    x2, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	pand    x3, x2; \
	pxor    x4, x3; \
	pxor    x3, x2; \
	pxor    x3, x1; \
	pand    x0, x3; \
	pxor    x0, x1; \
	pxor    x2, x0; \
	pxor    x3, x4;

/*
 * SI1_1 / SI1_2: the two halves of S-box SI1 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x4, x1, x2, x3); x0 is free.
 */
#define SI1_1(x0, x1, x2, x3, x4) \
	pxor    x3, x1; \
	movdqa  x0, x4; \
	pxor    x2, x0; \
	pxor    RNOT, x2; \
	por     x1, x4; \
	pxor    x3, x4; \
	pand    x1, x3; \
	pxor    x2, x1; \
	pand    x4, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	pxor    x1, x4; \
	por     x3, x1; \
	pxor    x0, x3; \
	pxor    x0, x2; \
	por     x4, x0; \
	pxor    x4, x2; \
	pxor    x0, x1; \
	pxor    x1, x4;

/*
 * SI2_1 / SI2_2: the two halves of S-box SI2 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x1, x4, x3, x2); x0 is free.
 */
#define SI2_1(x0, x1, x2, x3, x4) \
	pxor    x1, x2; \
	movdqa  x3, x4; \
	pxor    RNOT, x3; \
	por     x2, x3; \
	pxor    x4, x2; \
	pxor    x0, x4; \
	pxor    x1, x3; \
	por     x2, x1; \
	pxor    x0, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	pxor    x4, x1; \
	por     x3, x4; \
	pxor    x3, x2; \
	pxor    x2, x4; \
	pand    x1, x2; \
	pxor    x3, x2; \
	pxor    x4, x3; \
	pxor    x0, x4;

/*
 * SI3_1 / SI3_2: the two halves of S-box SI3 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  This S-box does not use RNOT.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x2, x0, x4, x3); x1 is free.
 */
#define SI3_1(x0, x1, x2, x3, x4) \
	pxor    x1, x2; \
	movdqa  x1, x4; \
	pand    x2, x1; \
	pxor    x0, x1; \
	por     x4, x0; \
	pxor    x3, x4; \
	pxor    x3, x0; \
	por     x1, x3; \
	pxor    x2, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	pxor    x3, x1; \
	pxor    x2, x0; \
	pxor    x3, x2; \
	pand    x1, x3; \
	pxor    x0, x1; \
	pand    x2, x0; \
	pxor    x3, x4; \
	pxor    x0, x3; \
	pxor    x1, x0;

/*
 * SI4_1 / SI4_2: the two halves of S-box SI4 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x0, x2, x4, x3); x1 is free.
 */
#define SI4_1(x0, x1, x2, x3, x4) \
	pxor    x3, x2; \
	movdqa  x0, x4; \
	pand    x1, x0; \
	pxor    x2, x0; \
	por     x3, x2; \
	pxor    RNOT, x4; \
	pxor    x0, x1; \
	pxor    x2, x0; \
	pand    x4, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	pxor    x0, x2; \
	por     x4, x0; \
	pxor    x3, x0; \
	pand    x2, x3; \
	pxor    x3, x4; \
	pxor    x1, x3; \
	pand    x0, x1; \
	pxor    x1, x4; \
	pxor    x3, x0;

/*
 * SI5_1 / SI5_2: the two halves of S-box SI5 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x1, x4, x0, x2); x3 is free.
 */
#define SI5_1(x0, x1, x2, x3, x4) \
	movdqa  x1, x4; \
	por     x2, x1; \
	pxor    x4, x2; \
	pxor    x3, x1; \
	pand    x4, x3; \
	pxor    x3, x2; \
	por     x0, x3; \
	pxor    RNOT, x0; \
	pxor    x2, x3; \
	por     x0, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	pxor    x1, x4; \
	pxor    x4, x2; \
	pand    x0, x4; \
	pxor    x1, x0; \
	pxor    x3, x1; \
	pand    x2, x0; \
	pxor    x3, x2; \
	pxor    x2, x0; \
	pxor    x4, x2; \
	pxor    x3, x4;

/*
 * SI6_1 / SI6_2: the two halves of S-box SI6 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x2, x4, x3, x0); x1 is free.
 */
#define SI6_1(x0, x1, x2, x3, x4) \
	pxor    x2, x0; \
	movdqa  x0, x4; \
	pand    x3, x0; \
	pxor    x3, x2; \
	pxor    x2, x0; \
	pxor    x1, x3; \
	por     x4, x2; \
	pxor    x3, x2; \
	pand    x0, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	pxor    RNOT, x0; \
	pxor    x1, x3; \
	pand    x2, x1; \
	pxor    x0, x4; \
	pxor    x4, x3; \
	pxor    x2, x4; \
	pxor    x1, x0; \
	pxor    x0, x2;

/*
 * SI7_1 / SI7_2: the two halves of S-box SI7 (decryption direction; used
 * only by serpent_dec_blk_8way in this file) applied to one register set.
 * x0..x3 carry the four state words, x4 is scratch (written before it is
 * read).  RNOT must hold all-ones so that "pxor RNOT, r" complements r.
 * _2 continues from the register state _1 leaves behind, so both halves
 * must run in order on the same registers.
 * Callers in this file read the result as (x1, x3, x0, x4); x2 is free.
 */
#define SI7_1(x0, x1, x2, x3, x4) \
	movdqa  x3, x4; \
	pand    x0, x3; \
	pxor    x2, x0; \
	por     x4, x2; \
	pxor    x1, x4; \
	pxor    RNOT, x0; \
	por     x3, x1; \
	pxor    x0, x4; \
	pand    x2, x0; \
	pxor    x1, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	pand    x2, x1; \
	pxor    x2, x3; \
	pxor    x3, x4; \
	pand    x3, x2; \
	por     x0, x3; \
	pxor    x4, x1; \
	pxor    x4, x3; \
	pand    x0, x4; \
	pxor    x2, x4;

/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from
 * CTX into xmm register t and replicate it into all four 32-bit lanes
 * (pshufd $0).  The offset expression treats the context as an array of
 * 32-bit words, four per index i, with j in 0..3 selecting the word.
 */
#define get_key(i, j, t) \
	movd    (4*(i)+(j))*4(CTX), t; \
	pshufd  $0, t, t;

/*
 * K2(x0, x1, x2, x3, x4, i): key mixing for both register sets.
 * Loads key words (i, 0..3) into RK0..RK3 and XORs them into x0..x3 of
 * set 1 and then of set 2.  x4 is accepted only to keep the argument list
 * uniform with the other round macros; it is not touched.
 * Clobbers RK0..RK3.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	pxor    RK0, x0 ## 1; \
	pxor    RK1, x1 ## 1; \
	pxor    RK2, x2 ## 1; \
	pxor    RK3, x3 ## 1; \
	pxor    RK0, x0 ## 2; \
	pxor    RK1, x1 ## 2; \
	pxor    RK2, x2 ## 2; \
	pxor    RK3, x3 ## 2;

/*
 * LK2(x0, x1, x2, x3, x4, i): linear mixing step followed by key mixing
 * with key words (i, 0..3), for both register sets (encryption direction).
 *
 * Per 32-bit lane, in this order:
 *	x0 = rol(x0, 13);  x2 = rol(x2, 3);
 *	x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol(x1, 1);   x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *	x0 ^= RK0; x1 ^= RK1; x2 ^= RK2; x3 ^= RK3;
 *
 * SSE2 has no packed rotate, so every rotate is a copy into x4 plus a
 * pslld/psrld pair whose counts sum to 32, merged with por.  The key loads
 * are interleaved with the arithmetic of the two sets.
 * Clobbers x4 of both sets and RK0..RK3.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	/* set 1: x0 = rol(x0, 13); x1 ^= x0 */ \
	movdqa  x0 ## 1, x4 ## 1; \
	pslld   $13, x0 ## 1; \
	psrld   $(32 - 13), x4 ## 1; \
	por     x4 ## 1, x0 ## 1; \
	pxor    x0 ## 1, x1 ## 1; \
	/* set 1: x2 = rol(x2, 3); x1 ^= x2 */ \
	movdqa  x2 ## 1, x4 ## 1; \
	pslld   $3, x2 ## 1; \
	psrld   $(32 - 3), x4 ## 1; \
	por     x4 ## 1, x2 ## 1; \
	pxor    x2 ## 1, x1 ## 1; \
	/* set 2: x0 = rol(x0, 13); x1 ^= x0 */ \
	movdqa  x0 ## 2, x4 ## 2; \
	pslld   $13, x0 ## 2; \
	psrld   $(32 - 13), x4 ## 2; \
	por     x4 ## 2, x0 ## 2; \
	pxor    x0 ## 2, x1 ## 2; \
	/* set 2: x2 = rol(x2, 3); x1 ^= x2 */ \
	movdqa  x2 ## 2, x4 ## 2; \
	pslld   $3, x2 ## 2; \
	psrld   $(32 - 3), x4 ## 2; \
	por     x4 ## 2, x2 ## 2; \
	pxor    x2 ## 2, x1 ## 2; \
	/* set 1: x1 = rol(x1, 1); x3 ^= x2 ^ (x0 << 3); x4 = copy of x3 */ \
	movdqa  x1 ## 1, x4 ## 1; \
	pslld   $1, x1 ## 1; \
	psrld   $(32 - 1), x4 ## 1; \
	por     x4 ## 1, x1 ## 1; \
	movdqa  x0 ## 1, x4 ## 1; \
	pslld   $3, x4 ## 1; \
	pxor    x2 ## 1, x3 ## 1; \
	pxor    x4 ## 1, x3 ## 1; \
	movdqa  x3 ## 1, x4 ## 1; \
	get_key(i, 1, RK1); \
	/* set 2: x1 = rol(x1, 1); x3 ^= x2 ^ (x0 << 3); x4 = copy of x3 */ \
	movdqa  x1 ## 2, x4 ## 2; \
	pslld   $1, x1 ## 2; \
	psrld   $(32 - 1), x4 ## 2; \
	por     x4 ## 2, x1 ## 2; \
	movdqa  x0 ## 2, x4 ## 2; \
	pslld   $3, x4 ## 2; \
	pxor    x2 ## 2, x3 ## 2; \
	pxor    x4 ## 2, x3 ## 2; \
	movdqa  x3 ## 2, x4 ## 2; \
	get_key(i, 3, RK3); \
	/* set 1: x3 = rol(x3, 7); x0 ^= x1 ^ x3; x2 ^= x3 ^ (x1 << 7) */ \
	pslld   $7, x3 ## 1; \
	psrld   $(32 - 7), x4 ## 1; \
	por     x4 ## 1, x3 ## 1; \
	movdqa  x1 ## 1, x4 ## 1; \
	pslld   $7, x4 ## 1; \
	pxor    x1 ## 1, x0 ## 1; \
	pxor    x3 ## 1, x0 ## 1; \
	pxor    x3 ## 1, x2 ## 1; \
	pxor    x4 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	/* set 2: x3 = rol(x3, 7); x0 ^= x1 ^ x3; x2 ^= x3 ^ (x1 << 7) */ \
	pslld   $7, x3 ## 2; \
	psrld   $(32 - 7), x4 ## 2; \
	por     x4 ## 2, x3 ## 2; \
	movdqa  x1 ## 2, x4 ## 2; \
	pslld   $7, x4 ## 2; \
	pxor    x1 ## 2, x0 ## 2; \
	pxor    x3 ## 2, x0 ## 2; \
	pxor    x3 ## 2, x2 ## 2; \
	pxor    x4 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	/* set 1: key into x1/x3; x0 = rol(x0, 5); x2 = rol(x2, 22); key into x0/x2 */ \
	pxor    RK1, x1 ## 1; \
	pxor    RK3, x3 ## 1; \
	movdqa  x0 ## 1, x4 ## 1; \
	pslld   $5, x0 ## 1; \
	psrld   $(32 - 5), x4 ## 1; \
	por     x4 ## 1, x0 ## 1; \
	movdqa  x2 ## 1, x4 ## 1; \
	pslld   $22, x2 ## 1; \
	psrld   $(32 - 22), x4 ## 1; \
	por     x4 ## 1, x2 ## 1; \
	pxor    RK0, x0 ## 1; \
	pxor    RK2, x2 ## 1; \
	/* set 2: key into x1/x3; x0 = rol(x0, 5); x2 = rol(x2, 22); key into x0/x2 */ \
	pxor    RK1, x1 ## 2; \
	pxor    RK3, x3 ## 2; \
	movdqa  x0 ## 2, x4 ## 2; \
	pslld   $5, x0 ## 2; \
	psrld   $(32 - 5), x4 ## 2; \
	por     x4 ## 2, x0 ## 2; \
	movdqa  x2 ## 2, x4 ## 2; \
	pslld   $22, x2 ## 2; \
	psrld   $(32 - 22), x4 ## 2; \
	por     x4 ## 2, x2 ## 2; \
	pxor    RK0, x0 ## 2; \
	pxor    RK2, x2 ## 2;

/*
 * KL2(x0, x1, x2, x3, x4, i): key mixing followed by the inverse of the
 * linear mixing step done by LK2, for both register sets (decryption
 * direction).
 *
 * RK0..RK3 must already hold the key words to remove; this macro does not
 * load them (in this file the preceding SP() does).  The argument i is
 * unused here and only mirrors the LK2 argument list.
 *
 * Per 32-bit lane, in this order:
 *	x0 ^= RK0; x1 ^= RK1; x2 ^= RK2; x3 ^= RK3;
 *	x0 = ror(x0, 5);   x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);   x0 ^= x1 ^ x3;
 *	x1 = ror(x1, 1);   x3 = ror(x3, 7);
 *	x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *	x0 = ror(x0, 13);  x2 = ror(x2, 3);
 *
 * Every rotate is a copy into x4 plus a psrld/pslld pair whose counts sum
 * to 32, merged with por.  Clobbers x4 of both sets.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	/* set 1: remove key; x0 = ror(x0, 5); x2 = ror(x2, 22); x2 ^= x3 */ \
	pxor    RK0, x0 ## 1; \
	pxor    RK2, x2 ## 1; \
	movdqa  x0 ## 1, x4 ## 1; \
	psrld   $5, x0 ## 1; \
	pslld   $(32 - 5), x4 ## 1; \
	por     x4 ## 1, x0 ## 1; \
	pxor    RK3, x3 ## 1; \
	pxor    RK1, x1 ## 1; \
	movdqa  x2 ## 1, x4 ## 1; \
	psrld   $22, x2 ## 1; \
	pslld   $(32 - 22), x4 ## 1; \
	por     x4 ## 1, x2 ## 1; \
	pxor    x3 ## 1, x2 ## 1; \
	/* set 2: remove key; x0 = ror(x0, 5); x2 = ror(x2, 22); x2 ^= x3 */ \
	pxor    RK0, x0 ## 2; \
	pxor    RK2, x2 ## 2; \
	movdqa  x0 ## 2, x4 ## 2; \
	psrld   $5, x0 ## 2; \
	pslld   $(32 - 5), x4 ## 2; \
	por     x4 ## 2, x0 ## 2; \
	pxor    RK3, x3 ## 2; \
	pxor    RK1, x1 ## 2; \
	movdqa  x2 ## 2, x4 ## 2; \
	psrld   $22, x2 ## 2; \
	pslld   $(32 - 22), x4 ## 2; \
	por     x4 ## 2, x2 ## 2; \
	pxor    x3 ## 2, x2 ## 2; \
	/* set 1: x0 ^= x3 ^ x1; x2 ^= (x1 << 7); x1 = ror(x1, 1) */ \
	pxor    x3 ## 1, x0 ## 1; \
	movdqa  x1 ## 1, x4 ## 1; \
	pslld   $7, x4 ## 1; \
	pxor    x1 ## 1, x0 ## 1; \
	pxor    x4 ## 1, x2 ## 1; \
	movdqa  x1 ## 1, x4 ## 1; \
	psrld   $1, x1 ## 1; \
	pslld   $(32 - 1), x4 ## 1; \
	por     x4 ## 1, x1 ## 1; \
	/* set 2: x0 ^= x3 ^ x1; x2 ^= (x1 << 7); x1 = ror(x1, 1) */ \
	pxor    x3 ## 2, x0 ## 2; \
	movdqa  x1 ## 2, x4 ## 2; \
	pslld   $7, x4 ## 2; \
	pxor    x1 ## 2, x0 ## 2; \
	pxor    x4 ## 2, x2 ## 2; \
	movdqa  x1 ## 2, x4 ## 2; \
	psrld   $1, x1 ## 2; \
	pslld   $(32 - 1), x4 ## 2; \
	por     x4 ## 2, x1 ## 2; \
	/* set 1: x3 = ror(x3, 7); x1 ^= x0; x3 ^= (x0 << 3); x4 = copy of x0 */ \
	movdqa  x3 ## 1, x4 ## 1; \
	psrld   $7, x3 ## 1; \
	pslld   $(32 - 7), x4 ## 1; \
	por     x4 ## 1, x3 ## 1; \
	pxor    x0 ## 1, x1 ## 1; \
	movdqa  x0 ## 1, x4 ## 1; \
	pslld   $3, x4 ## 1; \
	pxor    x4 ## 1, x3 ## 1; \
	movdqa  x0 ## 1, x4 ## 1; \
	/* set 2: x3 = ror(x3, 7); x1 ^= x0; x3 ^= (x0 << 3); x4 = copy of x0 */ \
	movdqa  x3 ## 2, x4 ## 2; \
	psrld   $7, x3 ## 2; \
	pslld   $(32 - 7), x4 ## 2; \
	por     x4 ## 2, x3 ## 2; \
	pxor    x0 ## 2, x1 ## 2; \
	movdqa  x0 ## 2, x4 ## 2; \
	pslld   $3, x4 ## 2; \
	pxor    x4 ## 2, x3 ## 2; \
	movdqa  x0 ## 2, x4 ## 2; \
	/* set 1: x0 = ror(x0, 13); x1 ^= x2; x3 ^= x2; x2 = ror(x2, 3) */ \
	psrld   $13, x0 ## 1; \
	pslld   $(32 - 13), x4 ## 1; \
	por     x4 ## 1, x0 ## 1; \
	pxor    x2 ## 1, x1 ## 1; \
	pxor    x2 ## 1, x3 ## 1; \
	movdqa  x2 ## 1, x4 ## 1; \
	psrld   $3, x2 ## 1; \
	pslld   $(32 - 3), x4 ## 1; \
	por     x4 ## 1, x2 ## 1; \
	/* set 2: x0 = ror(x0, 13); x1 ^= x2; x3 ^= x2; x2 = ror(x2, 3) */ \
	psrld   $13, x0 ## 2; \
	pslld   $(32 - 13), x4 ## 2; \
	por     x4 ## 2, x0 ## 2; \
	pxor    x2 ## 2, x1 ## 2; \
	pxor    x2 ## 2, x3 ## 2; \
	movdqa  x2 ## 2, x4 ## 2; \
	psrld   $3, x2 ## 2; \
	pslld   $(32 - 3), x4 ## 2; \
	por     x4 ## 2, x2 ## 2;

/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one S-box to both register sets.
 * SBOX is a macro-name prefix (S0..S7, SI0..SI7); its _1 and _2 halves are
 * run back to back on set 1 (names suffixed with 1), then on set 2.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): like S(), but also preloads key words
 * (i, 0..3) into RK0..RK3 for the KL2() that follows.
 *
 * The S-box halves of the two register sets are interleaved (set 1 half 1,
 * set 2 half 1, set 1 half 2, set 2 half 2) with one key load between
 * each.  The two sets use disjoint registers, and the S-box macros do not
 * touch RK0..RK3, so the result equals S() plus four get_key() calls.
 *
 * The last line deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows into this macro.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 3, RK3); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of the 4x4
 * matrix of 32-bit words held in x0..x3 (one row per register).
 *
 * With rows a, b, c, d on entry, the result is
 *	x0 = a0 b0 c0 d0,  x1 = a1 b1 c1 d1,
 *	x2 = a2 b2 c2 d2,  x3 = a3 b3 c3 d3.
 *
 * Clobbers t1 and t2.  t0 is not used by this macro; it is part of the
 * argument list shared with the block load/store macros.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa      x0, t2; \
	punpckldq   x1, x0;     /* x0 = a0 b0 a1 b1 */ \
	punpckhdq   x1, t2;     /* t2 = a2 b2 a3 b3 */ \
	movdqa      x2, t1; \
	punpckhdq   x3, x2;     /* x2 = c2 d2 c3 d3 */ \
	punpckldq   x3, t1;     /* t1 = c0 d0 c1 d1 */ \
	movdqa      x0, x1; \
	punpcklqdq  t1, x0;     /* x0 = a0 b0 c0 d0 */ \
	punpckhqdq  t1, x1;     /* x1 = a1 b1 c1 d1 */ \
	movdqa      t2, x3; \
	punpcklqdq  x2, t2;     /* t2 = a2 b2 c2 d2 */ \
	punpckhqdq  x2, x3;     /* x3 = a3 b3 c3 d3 */ \
	movdqa      t2, x2;

/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive
 * 16-byte blocks from the address in register `in` (unaligned loads) and
 * transpose them, so that x0..x3 each hold one 32-bit word position of all
 * four blocks.  Clobbers t1 and t2 (via transpose_4x4).
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu  (0*4*4)(in), x0; \
	movdqu  (1*4*4)(in), x1; \
	movdqu  (2*4*4)(in), x2; \
	movdqu  (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back to
 * one block per register and store the four 16-byte blocks to the address
 * in register `out` (unaligned stores).  Clobbers t1 and t2 (via
 * transpose_4x4).
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu  x0, (0*4*4)(out); \
	movdqu  x1, (1*4*4)(out); \
	movdqu  x2, (2*4*4)(out); \
	movdqu  x3, (3*4*4)(out);

/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back to
 * one block per register, XOR each block with the 16 bytes currently
 * stored at the matching position of `out`, and write the result back
 * (read-modify-write of the destination, unaligned accesses).
 * Clobbers t0, t1 and t2.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu  (0*4*4)(out), t0; \
	pxor    t0, x0; \
	movdqu  x0, (0*4*4)(out); \
	movdqu  (1*4*4)(out), t0; \
	pxor    t0, x1; \
	movdqu  x1, (1*4*4)(out); \
	movdqu  (2*4*4)(out), t0; \
	pxor    t0, x2; \
	movdqu  x2, (2*4*4)(out); \
	movdqu  (3*4*4)(out), t0; \
	pxor    t0, x3; \
	movdqu  x3, (3*4*4)(out);

/*
 * __serpent_enc_blk_8way: run the encryption round sequence over eight
 * 16-byte blocks (two groups of four, 128 bytes in total).
 *
 * The first four blocks at src go to register set 1 and the next four
 * (src + 64) to set 2.  After key mixing with index 0, each of the 32
 * rounds is an S-box followed by LK2 (linear mixing + key), except the
 * last, which ends with plain key mixing (index 32).  The register names
 * are permuted from round to round so each S-box's result lands in the
 * argument order the next macro expects without any register moves.
 *
 * If the low byte of %rcx is non-zero the result is XORed into dst,
 * otherwise it overwrites dst.
 *
 * Clobbers: %rax, %xmm0-%xmm14, flags.
 */
ENTRY(__serpent_enc_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	/* RNOT = all-ones, used by the S-boxes as a NOT mask */
	pcmpeqd RNOT, RNOT;

	/* %rax = second group of four blocks */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	/* %rax = second group of four blocks in dst */
	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz .L__enc_xor8;

	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

.L__enc_xor8:
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk_8way)

/*
 * serpent_dec_blk_8way: run the decryption round sequence over eight
 * 16-byte blocks (two groups of four, 128 bytes in total).
 *
 * The first four blocks at src go to register set 1 and the next four
 * (src + 64) to set 2.  After key mixing with index 32, each step is an
 * inverse S-box via SP() (which also preloads the key words of index i)
 * followed by KL2 (key removal + inverse linear mixing).  The final step
 * is a plain S(SI0) followed by key mixing with index 0.  The register
 * names are permuted from round to round; the result ends up in
 * (RC, RD, RB, RE) of each set, which is the order handed to
 * write_blocks().
 *
 * Clobbers: %rax, %xmm0-%xmm14.
 */
ENTRY(serpent_dec_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* RNOT = all-ones, used by the S-boxes as a NOT mask */
	pcmpeqd RNOT, RNOT;

	/* %rax = second group of four blocks */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	/* %rax = second group of four blocks in dst */
	leaq (4*4*4)(%rsi), %rax;
	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(serpent_dec_blk_8way)