]>
Commit | Line | Data |
---|---|---|
251496db JK |
1 | /* |
2 | * Serpent Cipher 4-way parallel algorithm (i586/SSE2) | |
3 | * | |
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | |
5 | * | |
6 | * Based on crypto/serpent.c by | |
7 | * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no> | |
8 | * 2003 Herbert Valerio Riedel <hvr@gnu.org> | |
9 | * | |
10 | * This program is free software; you can redistribute it and/or modify | |
11 | * it under the terms of the GNU General Public License as published by | |
12 | * the Free Software Foundation; either version 2 of the License, or | |
13 | * (at your option) any later version. | |
14 | * | |
15 | * This program is distributed in the hope that it will be useful, | |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | * GNU General Public License for more details. | |
19 | * | |
20 | * You should have received a copy of the GNU General Public License | |
21 | * along with this program; if not, write to the Free Software | |
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
23 | * USA | |
24 | * | |
25 | */ | |
26 | ||
2dcfd44d JK |
27 | #include <linux/linkage.h> |
28 | ||
251496db JK |
29 | .file "serpent-sse2-i586-asm_32.S" |
30 | .text | |
31 | ||
32 | #define arg_ctx 4 /* byte offset from %esp of the ctx argument; both entry points load it into CTX */ | |
33 | #define arg_dst 8 /* byte offset from %esp of the dst (output) pointer argument */ | |
34 | #define arg_src 12 /* byte offset from %esp of the src (input) pointer argument */ | |
35 | #define arg_xor 16 /* byte offset from %esp of the xor flag; only __serpent_enc_blk_4way reads it */ | |
36 | ||
37 | /********************************************************************** | |
38 | 4-way SSE2 serpent | |
39 | **********************************************************************/ | |
40 | #define CTX %edx /* base address used by get_key for every key-word load */ | |
41 | ||
42 | #define RA %xmm0 /* RA..RE: five working registers; macro argument order decides which four hold state and which one is the temporary */ | |
43 | #define RB %xmm1 | |
44 | #define RC %xmm2 | |
45 | #define RD %xmm3 | |
46 | #define RE %xmm4 | |
47 | ||
48 | #define RT0 %xmm5 /* scratch: key words in K/LK, passed as t0 to the block load/store macros */ | |
49 | #define RT1 %xmm6 /* scratch: key word in K, passed as t1 to the block load/store macros */ | |
50 | ||
51 | #define RNOT %xmm7 /* set to all ones (pcmpeqd) at function entry; pxor with it is a bitwise NOT */ | |
52 | ||
53 | #define get_key(i, j, t) /* t = 32-bit word number 4*(i)+(j) of the array at CTX, copied into all four lanes */ \ | |
54 | movd (4*(i)+(j))*4(CTX), t; /* load one dword into the low lane of t */ \ | |
55 | pshufd $0, t, t; /* replicate lane 0 across the whole register */ | |
56 | ||
57 | #define K(x0, x1, x2, x3, x4, i) /* XOR key words 4*(i)..4*(i)+3 into x0..x3; clobbers x4, RT0 and RT1 */ \ | |
58 | get_key(i, 0, x4); \ | |
59 | get_key(i, 1, RT0); \ | |
60 | get_key(i, 2, RT1); \ | |
61 | pxor x4, x0; \ | |
62 | pxor RT0, x1; \ | |
63 | pxor RT1, x2; \ | |
64 | get_key(i, 3, x4); /* x4 is free again once word 0 has been applied */ \ | |
65 | pxor x4, x3; | |
66 | ||
67 | #define LK(x0, x1, x2, x3, x4, i) /* linear mixing of x0..x3 (per 32-bit lane), then XOR with key words 4*(i)..4*(i)+3; clobbers x4 and RT0 */ \ | |
68 | movdqa x0, x4; \ | |
69 | pslld $13, x0; \ | |
70 | psrld $(32 - 13), x4; \ | |
71 | por x4, x0; /* x0 = rol32(x0, 13) */ \ | |
72 | pxor x0, x1; \ | |
73 | movdqa x2, x4; \ | |
74 | pslld $3, x2; \ | |
75 | psrld $(32 - 3), x4; \ | |
76 | por x4, x2; /* x2 = rol32(x2, 3) */ \ | |
77 | pxor x2, x1; /* with the pxor above: x1 ^= x0 ^ x2 */ \ | |
78 | movdqa x1, x4; \ | |
79 | pslld $1, x1; \ | |
80 | psrld $(32 - 1), x4; \ | |
81 | por x4, x1; /* x1 = rol32(x1, 1) */ \ | |
82 | movdqa x0, x4; \ | |
83 | pslld $3, x4; \ | |
84 | pxor x2, x3; \ | |
85 | pxor x4, x3; /* x3 ^= x2 ^ (x0 << 3) */ \ | |
86 | movdqa x3, x4; \ | |
87 | pslld $7, x3; \ | |
88 | psrld $(32 - 7), x4; \ | |
89 | por x4, x3; /* x3 = rol32(x3, 7) */ \ | |
90 | movdqa x1, x4; \ | |
91 | pslld $7, x4; \ | |
92 | pxor x1, x0; \ | |
93 | pxor x3, x0; /* x0 ^= x1 ^ x3 */ \ | |
94 | pxor x3, x2; \ | |
95 | pxor x4, x2; /* x2 ^= x3 ^ (x1 << 7) */ \ | |
96 | movdqa x0, x4; /* copy for the rotate of x0 that is finished further down */ \ | |
97 | get_key(i, 1, RT0); \ | |
98 | pxor RT0, x1; /* x1 is final, so its key word is applied here */ \ | |
99 | get_key(i, 3, RT0); \ | |
100 | pxor RT0, x3; /* likewise for x3 */ \ | |
101 | pslld $5, x0; \ | |
102 | psrld $(32 - 5), x4; \ | |
103 | por x4, x0; /* x0 = rol32(x0, 5) */ \ | |
104 | movdqa x2, x4; \ | |
105 | pslld $22, x2; \ | |
106 | psrld $(32 - 22), x4; \ | |
107 | por x4, x2; /* x2 = rol32(x2, 22) */ \ | |
108 | get_key(i, 0, RT0); \ | |
109 | pxor RT0, x0; \ | |
110 | get_key(i, 2, RT0); \ | |
111 | pxor RT0, x2; | |
112 | ||
113 | #define KL(x0, x1, x2, x3, x4, i) /* XOR with key words 4*(i)..4*(i)+3, then undo the rotate/xor mixing done by LK; clobbers x4, RT0, RT1 */ \ | |
114 | K(x0, x1, x2, x3, x4, i); \ | |
115 | movdqa x0, x4; \ | |
116 | psrld $5, x0; \ | |
117 | pslld $(32 - 5), x4; \ | |
118 | por x4, x0; /* x0 = ror32(x0, 5) */ \ | |
119 | movdqa x2, x4; \ | |
120 | psrld $22, x2; \ | |
121 | pslld $(32 - 22), x4; \ | |
122 | por x4, x2; /* x2 = ror32(x2, 22) */ \ | |
123 | pxor x3, x2; \ | |
124 | pxor x3, x0; \ | |
125 | movdqa x1, x4; \ | |
126 | pslld $7, x4; \ | |
127 | pxor x1, x0; /* with the pxor above: x0 ^= x3 ^ x1 */ \ | |
128 | pxor x4, x2; /* with the pxor above: x2 ^= x3 ^ (x1 << 7) */ \ | |
129 | movdqa x1, x4; \ | |
130 | psrld $1, x1; \ | |
131 | pslld $(32 - 1), x4; \ | |
132 | por x4, x1; /* x1 = ror32(x1, 1) */ \ | |
133 | movdqa x3, x4; \ | |
134 | psrld $7, x3; \ | |
135 | pslld $(32 - 7), x4; \ | |
136 | por x4, x3; /* x3 = ror32(x3, 7) */ \ | |
137 | pxor x0, x1; \ | |
138 | movdqa x0, x4; \ | |
139 | pslld $3, x4; \ | |
140 | pxor x4, x3; /* x3 ^= (x0 << 3) */ \ | |
141 | movdqa x0, x4; \ | |
142 | psrld $13, x0; \ | |
143 | pslld $(32 - 13), x4; \ | |
144 | por x4, x0; /* x0 = ror32(x0, 13) */ \ | |
145 | pxor x2, x1; \ | |
146 | pxor x2, x3; \ | |
147 | movdqa x2, x4; \ | |
148 | psrld $3, x2; \ | |
149 | pslld $(32 - 3), x4; \ | |
150 | por x4, x2; /* x2 = ror32(x2, 3) */ | |
151 | ||
152 | #define S0(x0, x1, x2, x3, x4) /* presumably Serpent S-box 0 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
153 | movdqa x3, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
154 | por x0, x3; \ | |
155 | pxor x4, x0; \ | |
156 | pxor x2, x4; \ | |
157 | pxor RNOT, x4; \ | |
158 | pxor x1, x3; \ | |
159 | pand x0, x1; \ | |
160 | pxor x4, x1; \ | |
161 | pxor x0, x2; \ | |
162 | pxor x3, x0; \ | |
163 | por x0, x4; \ | |
164 | pxor x2, x0; \ | |
165 | pand x1, x2; \ | |
166 | pxor x2, x3; \ | |
167 | pxor RNOT, x1; \ | |
168 | pxor x4, x2; \ | |
169 | pxor x2, x1; /* call sites take (x2, x1, x3, x0) as the next state and x4 as the next temporary */ | |
170 | ||
171 | #define S1(x0, x1, x2, x3, x4) /* presumably Serpent S-box 1 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
172 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
173 | pxor x0, x1; \ | |
174 | pxor x3, x0; \ | |
175 | pxor RNOT, x3; \ | |
176 | pand x1, x4; \ | |
177 | por x1, x0; \ | |
178 | pxor x2, x3; \ | |
179 | pxor x3, x0; \ | |
180 | pxor x3, x1; \ | |
181 | pxor x4, x3; \ | |
182 | por x4, x1; \ | |
183 | pxor x2, x4; \ | |
184 | pand x0, x2; \ | |
185 | pxor x1, x2; \ | |
186 | por x0, x1; \ | |
187 | pxor RNOT, x0; \ | |
188 | pxor x2, x0; \ | |
189 | pxor x1, x4; /* call sites take (x4, x2, x3, x0) as the next state and x1 as the next temporary */ | |
190 | ||
191 | #define S2(x0, x1, x2, x3, x4) /* presumably Serpent S-box 2 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
192 | pxor RNOT, x3; \ | |
193 | pxor x0, x1; \ | |
194 | movdqa x0, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
195 | pand x2, x0; \ | |
196 | pxor x3, x0; \ | |
197 | por x4, x3; \ | |
198 | pxor x1, x2; \ | |
199 | pxor x1, x3; \ | |
200 | pand x0, x1; \ | |
201 | pxor x2, x0; \ | |
202 | pand x3, x2; \ | |
203 | por x1, x3; \ | |
204 | pxor RNOT, x0; \ | |
205 | pxor x0, x3; \ | |
206 | pxor x0, x4; \ | |
207 | pxor x2, x0; \ | |
208 | por x2, x1; /* call sites take (x4, x1, x0, x3) as the next state and x2 as the next temporary */ | |
209 | ||
210 | #define S3(x0, x1, x2, x3, x4) /* presumably Serpent S-box 3 (per the name), as straight-line pxor/pand/por logic on x0..x3; no NOT is needed here */ \ | |
211 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
212 | pxor x3, x1; \ | |
213 | por x0, x3; \ | |
214 | pand x0, x4; \ | |
215 | pxor x2, x0; \ | |
216 | pxor x1, x2; \ | |
217 | pand x3, x1; \ | |
218 | pxor x3, x2; \ | |
219 | por x4, x0; \ | |
220 | pxor x3, x4; \ | |
221 | pxor x0, x1; \ | |
222 | pand x3, x0; \ | |
223 | pand x4, x3; \ | |
224 | pxor x2, x3; \ | |
225 | por x1, x4; \ | |
226 | pand x1, x2; \ | |
227 | pxor x3, x4; \ | |
228 | pxor x3, x0; \ | |
229 | pxor x2, x3; /* call sites take (x3, x4, x1, x0) as the next state and x2 as the next temporary */ | |
230 | ||
231 | #define S4(x0, x1, x2, x3, x4) /* presumably Serpent S-box 4 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
232 | movdqa x3, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
233 | pand x0, x3; \ | |
234 | pxor x4, x0; \ | |
235 | pxor x2, x3; \ | |
236 | por x4, x2; \ | |
237 | pxor x1, x0; \ | |
238 | pxor x3, x4; \ | |
239 | por x0, x2; \ | |
240 | pxor x1, x2; \ | |
241 | pand x0, x1; \ | |
242 | pxor x4, x1; \ | |
243 | pand x2, x4; \ | |
244 | pxor x3, x2; \ | |
245 | pxor x0, x4; \ | |
246 | por x1, x3; \ | |
247 | pxor RNOT, x1; \ | |
248 | pxor x0, x3; /* call sites take (x1, x2, x3, x4) as the next state and x0 as the next temporary */ | |
249 | ||
250 | #define S5(x0, x1, x2, x3, x4) /* presumably Serpent S-box 5 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
251 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
252 | por x0, x1; \ | |
253 | pxor x1, x2; \ | |
254 | pxor RNOT, x3; \ | |
255 | pxor x0, x4; \ | |
256 | pxor x2, x0; \ | |
257 | pand x4, x1; \ | |
258 | por x3, x4; \ | |
259 | pxor x0, x4; \ | |
260 | pand x3, x0; \ | |
261 | pxor x3, x1; \ | |
262 | pxor x2, x3; \ | |
263 | pxor x1, x0; \ | |
264 | pand x4, x2; \ | |
265 | pxor x2, x1; \ | |
266 | pand x0, x2; \ | |
267 | pxor x2, x3; /* call sites take (x4, x0, x1, x3) as the next state and x2 as the next temporary */ | |
268 | ||
269 | #define S6(x0, x1, x2, x3, x4) /* presumably Serpent S-box 6 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
270 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
271 | pxor x0, x3; \ | |
272 | pxor x2, x1; \ | |
273 | pxor x0, x2; \ | |
274 | pand x3, x0; \ | |
275 | por x3, x1; \ | |
276 | pxor RNOT, x4; \ | |
277 | pxor x1, x0; \ | |
278 | pxor x2, x1; \ | |
279 | pxor x4, x3; \ | |
280 | pxor x0, x4; \ | |
281 | pand x0, x2; \ | |
282 | pxor x1, x4; \ | |
283 | pxor x3, x2; \ | |
284 | pand x1, x3; \ | |
285 | pxor x0, x3; \ | |
286 | pxor x2, x1; /* call sites take (x2, x4, x1, x3) as the next state and x0 as the next temporary */ | |
287 | ||
288 | #define S7(x0, x1, x2, x3, x4) /* presumably Serpent S-box 7 (per the name), as straight-line pxor/pand/por logic on x0..x3; RNOT gives bitwise NOT */ \ | |
289 | pxor RNOT, x1; \ | |
290 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
291 | pxor RNOT, x0; \ | |
292 | pand x2, x1; \ | |
293 | pxor x3, x1; \ | |
294 | por x4, x3; \ | |
295 | pxor x2, x4; \ | |
296 | pxor x3, x2; \ | |
297 | pxor x0, x3; \ | |
298 | por x1, x0; \ | |
299 | pand x0, x2; \ | |
300 | pxor x4, x0; \ | |
301 | pxor x3, x4; \ | |
302 | pand x0, x3; \ | |
303 | pxor x1, x4; \ | |
304 | pxor x4, x2; \ | |
305 | pxor x1, x3; \ | |
306 | por x0, x4; \ | |
307 | pxor x1, x4; /* call sites take (x4, x2, x3, x0) as the next state and x1 as the next temporary */ | |
308 | ||
309 | #define SI0(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 0 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
310 | movdqa x3, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
311 | pxor x0, x1; \ | |
312 | por x1, x3; \ | |
313 | pxor x1, x4; \ | |
314 | pxor RNOT, x0; \ | |
315 | pxor x3, x2; \ | |
316 | pxor x0, x3; \ | |
317 | pand x1, x0; \ | |
318 | pxor x2, x0; \ | |
319 | pand x3, x2; \ | |
320 | pxor x4, x3; \ | |
321 | pxor x3, x2; \ | |
322 | pxor x3, x1; \ | |
323 | pand x0, x3; \ | |
324 | pxor x0, x1; \ | |
325 | pxor x2, x0; \ | |
326 | pxor x3, x4; /* call sites take (x2, x4, x1, x0) as the next state and x3 as the next temporary */ | |
327 | ||
328 | #define SI1(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 1 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
329 | pxor x3, x1; \ | |
330 | movdqa x0, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
331 | pxor x2, x0; \ | |
332 | pxor RNOT, x2; \ | |
333 | por x1, x4; \ | |
334 | pxor x3, x4; \ | |
335 | pand x1, x3; \ | |
336 | pxor x2, x1; \ | |
337 | pand x4, x2; \ | |
338 | pxor x1, x4; \ | |
339 | por x3, x1; \ | |
340 | pxor x0, x3; \ | |
341 | pxor x0, x2; \ | |
342 | por x4, x0; \ | |
343 | pxor x4, x2; \ | |
344 | pxor x0, x1; \ | |
345 | pxor x1, x4; /* call sites take (x4, x1, x2, x3) as the next state and x0 as the next temporary */ | |
346 | ||
347 | #define SI2(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 2 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
348 | pxor x1, x2; \ | |
349 | movdqa x3, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
350 | pxor RNOT, x3; \ | |
351 | por x2, x3; \ | |
352 | pxor x4, x2; \ | |
353 | pxor x0, x4; \ | |
354 | pxor x1, x3; \ | |
355 | por x2, x1; \ | |
356 | pxor x0, x2; \ | |
357 | pxor x4, x1; \ | |
358 | por x3, x4; \ | |
359 | pxor x3, x2; \ | |
360 | pxor x2, x4; \ | |
361 | pand x1, x2; \ | |
362 | pxor x3, x2; \ | |
363 | pxor x4, x3; \ | |
364 | pxor x0, x4; /* call sites take (x1, x4, x3, x2) as the next state and x0 as the next temporary */ | |
365 | ||
366 | #define SI3(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 3 (per the name), as straight-line pxor/pand/por logic; no NOT is needed here */ \ | |
367 | pxor x1, x2; \ | |
368 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
369 | pand x2, x1; \ | |
370 | pxor x0, x1; \ | |
371 | por x4, x0; \ | |
372 | pxor x3, x4; \ | |
373 | pxor x3, x0; \ | |
374 | por x1, x3; \ | |
375 | pxor x2, x1; \ | |
376 | pxor x3, x1; \ | |
377 | pxor x2, x0; \ | |
378 | pxor x3, x2; \ | |
379 | pand x1, x3; \ | |
380 | pxor x0, x1; \ | |
381 | pand x2, x0; \ | |
382 | pxor x3, x4; \ | |
383 | pxor x0, x3; \ | |
384 | pxor x1, x0; /* call sites take (x2, x0, x4, x3) as the next state and x1 as the next temporary */ | |
385 | ||
386 | #define SI4(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 4 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
387 | pxor x3, x2; \ | |
388 | movdqa x0, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
389 | pand x1, x0; \ | |
390 | pxor x2, x0; \ | |
391 | por x3, x2; \ | |
392 | pxor RNOT, x4; \ | |
393 | pxor x0, x1; \ | |
394 | pxor x2, x0; \ | |
395 | pand x4, x2; \ | |
396 | pxor x0, x2; \ | |
397 | por x4, x0; \ | |
398 | pxor x3, x0; \ | |
399 | pand x2, x3; \ | |
400 | pxor x3, x4; \ | |
401 | pxor x1, x3; \ | |
402 | pand x0, x1; \ | |
403 | pxor x1, x4; \ | |
404 | pxor x3, x0; /* call sites take (x0, x2, x4, x3) as the next state and x1 as the next temporary */ | |
405 | ||
406 | #define SI5(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 5 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
407 | movdqa x1, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
408 | por x2, x1; \ | |
409 | pxor x4, x2; \ | |
410 | pxor x3, x1; \ | |
411 | pand x4, x3; \ | |
412 | pxor x3, x2; \ | |
413 | por x0, x3; \ | |
414 | pxor RNOT, x0; \ | |
415 | pxor x2, x3; \ | |
416 | por x0, x2; \ | |
417 | pxor x1, x4; \ | |
418 | pxor x4, x2; \ | |
419 | pand x0, x4; \ | |
420 | pxor x1, x0; \ | |
421 | pxor x3, x1; \ | |
422 | pand x2, x0; \ | |
423 | pxor x3, x2; \ | |
424 | pxor x2, x0; \ | |
425 | pxor x4, x2; \ | |
426 | pxor x3, x4; /* call sites take (x1, x4, x0, x2) as the next state and x3 as the next temporary */ | |
427 | ||
428 | #define SI6(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 6 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
429 | pxor x2, x0; \ | |
430 | movdqa x0, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
431 | pand x3, x0; \ | |
432 | pxor x3, x2; \ | |
433 | pxor x2, x0; \ | |
434 | pxor x1, x3; \ | |
435 | por x4, x2; \ | |
436 | pxor x3, x2; \ | |
437 | pand x0, x3; \ | |
438 | pxor RNOT, x0; \ | |
439 | pxor x1, x3; \ | |
440 | pand x2, x1; \ | |
441 | pxor x0, x4; \ | |
442 | pxor x4, x3; \ | |
443 | pxor x2, x4; \ | |
444 | pxor x1, x0; \ | |
445 | pxor x0, x2; /* call sites take (x2, x4, x3, x0) as the next state and x1 as the next temporary */ | |
446 | ||
447 | #define SI7(x0, x1, x2, x3, x4) /* presumably the inverse of Serpent S-box 7 (per the name), as straight-line pxor/pand/por logic; RNOT gives bitwise NOT */ \ | |
448 | movdqa x3, x4; /* x4 is written before it is read: a pure temporary on entry */ \ | |
449 | pand x0, x3; \ | |
450 | pxor x2, x0; \ | |
451 | por x4, x2; \ | |
452 | pxor x1, x4; \ | |
453 | pxor RNOT, x0; \ | |
454 | por x3, x1; \ | |
455 | pxor x0, x4; \ | |
456 | pand x2, x0; \ | |
457 | pxor x1, x0; \ | |
458 | pand x2, x1; \ | |
459 | pxor x2, x3; \ | |
460 | pxor x3, x4; \ | |
461 | pand x3, x2; \ | |
462 | por x0, x3; \ | |
463 | pxor x4, x1; \ | |
464 | pxor x4, x3; \ | |
465 | pand x0, x4; \ | |
466 | pxor x2, x4; /* call sites take (x1, x3, x0, x4) as the next state and x2 as the next temporary */ | |
467 | ||
847cb7ef | 468 | #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* in-place transpose of the 4x4 dword matrix whose rows are x0..x3; clobbers t1 and t2, t0 is not used */ \ |
251496db | 469 | movdqa x0, t2; \ |
847cb7ef JK |
470 | punpckldq x1, x0; /* x0 = a0 b0 a1 b1 (a = old x0, b = old x1) */ \ |
471 | punpckhdq x1, t2; /* t2 = a2 b2 a3 b3 */ \ | |
472 | movdqa x2, t1; \ | |
473 | punpckhdq x3, x2; /* x2 = c2 d2 c3 d3 (c = old x2, d = old x3) */ \ | |
474 | punpckldq x3, t1; /* t1 = c0 d0 c1 d1 */ \ | |
475 | movdqa x0, x1; \ | |
476 | punpcklqdq t1, x0; /* x0 = a0 b0 c0 d0 */ \ | |
477 | punpckhqdq t1, x1; /* x1 = a1 b1 c1 d1 */ \ | |
478 | movdqa t2, x3; \ | |
479 | punpcklqdq x2, t2; /* t2 = a2 b2 c2 d2 */ \ | |
480 | punpckhqdq x2, x3; /* x3 = a3 b3 c3 d3 */ \ | |
481 | movdqa t2, x2; /* x2 = a2 b2 c2 d2 */ | |
251496db JK |
482 | |
483 | #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) /* load 4 x 16 bytes from in (no alignment required) and transpose so that xN holds dword N of every 16-byte chunk */ \ | |
484 | movdqu (0*4*4)(in), x0; \ | |
485 | movdqu (1*4*4)(in), x1; \ | |
486 | movdqu (2*4*4)(in), x2; \ | |
487 | movdqu (3*4*4)(in), x3; \ | |
488 | \ | |
489 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* clobbers t1 and t2 */ | |
490 | ||
491 | #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) /* transpose x0..x3 back to one 16-byte chunk per register and store 4 x 16 bytes to out (no alignment required) */ \ | |
492 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* clobbers t1 and t2 */ \ | |
493 | \ | |
494 | movdqu x0, (0*4*4)(out); \ | |
495 | movdqu x1, (1*4*4)(out); \ | |
496 | movdqu x2, (2*4*4)(out); \ | |
497 | movdqu x3, (3*4*4)(out); | |
498 | ||
499 | #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) /* like write_blocks, but each 16-byte chunk is XORed with the bytes already at out before being stored */ \ | |
500 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* clobbers t1 and t2; t0 is still free for the loads below */ \ | |
501 | \ | |
502 | movdqu (0*4*4)(out), t0; /* read-modify-write of chunk 0 */ \ | |
503 | pxor t0, x0; \ | |
504 | movdqu x0, (0*4*4)(out); \ | |
505 | movdqu (1*4*4)(out), t0; /* chunk 1 */ \ | |
506 | pxor t0, x1; \ | |
507 | movdqu x1, (1*4*4)(out); \ | |
508 | movdqu (2*4*4)(out), t0; /* chunk 2 */ \ | |
509 | pxor t0, x2; \ | |
510 | movdqu x2, (2*4*4)(out); \ | |
511 | movdqu (3*4*4)(out), t0; /* chunk 3 */ \ | |
512 | pxor t0, x3; \ | |
513 | movdqu x3, (3*4*4)(out); | |
514 | ||
2dcfd44d | 515 | ENTRY(__serpent_enc_blk_4way) /* processes 64 bytes (4 x 16) from src in parallel; uses %eax, %edx, %xmm0-%xmm7 and the flags, never moves %esp */ |
251496db JK |
516 | /* input: |
517 | * arg_ctx(%esp): ctx, CTX | |
518 | * arg_dst(%esp): dst | |
519 | * arg_src(%esp): src | |
520 | * arg_xor(%esp): bool, if true: xor output into the existing dst contents instead of overwriting them | |
521 | */ | |
522 | ||
523 | pcmpeqd RNOT, RNOT; /* RNOT = all ones, used by the S-box macros as a NOT mask */ | |
524 | ||
525 | movl arg_ctx(%esp), CTX; /* base address for every get_key load */ | |
526 | ||
527 | movl arg_src(%esp), %eax; | |
528 | read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* RA..RD = dwords 0..3 of the four input chunks, RE is the temporary */ | |
529 | ||
530 | K(RA, RB, RC, RD, RE, 0); /* initial key mix with key index 0 */ | |
531 | S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1); /* each line: S-box, then LK with the next key index; argument order follows the register renaming of the S-box */ | |
532 | S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2); | |
533 | S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3); | |
534 | S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4); | |
535 | S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5); | |
536 | S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6); | |
537 | S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7); | |
538 | S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8); | |
539 | S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9); | |
540 | S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10); | |
541 | S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11); | |
542 | S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12); | |
543 | S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13); | |
544 | S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14); | |
545 | S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15); | |
546 | S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16); | |
547 | S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17); | |
548 | S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18); | |
549 | S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19); | |
550 | S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20); | |
551 | S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21); | |
552 | S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22); | |
553 | S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23); | |
554 | S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24); | |
555 | S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25); | |
556 | S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26); | |
557 | S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27); | |
558 | S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28); | |
559 | S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29); | |
560 | S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30); | |
561 | S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31); | |
562 | S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32); /* last step uses plain K (key index 32) instead of LK: no linear mixing */ | |
563 | ||
564 | movl arg_dst(%esp), %eax; | |
565 | ||
566 | cmpb $0, arg_xor(%esp); /* only the low byte of the xor argument is tested */ | |
2dcfd44d | 567 | jnz .L__enc_xor4; |
251496db JK |
568 | 
569 | write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* xor == 0: overwrite dst with the result */ | |
570 | ||
571 | ret; | |
572 | ||
2dcfd44d | 573 | .L__enc_xor4: |
251496db JK |
574 | xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* xor != 0: dst ^= result */ |
575 | ||
576 | ret; | |
2dcfd44d | 577 | ENDPROC(__serpent_enc_blk_4way) |
251496db | 578 | |
2dcfd44d | 579 | ENTRY(serpent_dec_blk_4way) /* processes 64 bytes (4 x 16) from src in parallel; uses %eax, %edx and %xmm0-%xmm7, never moves %esp */ |
251496db JK |
580 | /* input: |
581 | * arg_ctx(%esp): ctx, CTX | |
582 | * arg_dst(%esp): dst | |
583 | * arg_src(%esp): src | |
584 | */ | |
585 | ||
586 | pcmpeqd RNOT, RNOT; /* RNOT = all ones, used by the SI macros as a NOT mask */ | |
587 | ||
588 | movl arg_ctx(%esp), CTX; /* base address for every get_key load */ | |
589 | ||
590 | movl arg_src(%esp), %eax; | |
591 | read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); /* RA..RD = dwords 0..3 of the four input chunks, RE is the temporary */ | |
592 | ||
593 | K(RA, RB, RC, RD, RE, 32); /* key indices run in the opposite order to the encrypt path: 32 first, 0 last */ | |
594 | SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31); /* each line: inverse S-box, then KL with the next lower key index */ | |
595 | SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30); | |
596 | SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29); | |
597 | SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28); | |
598 | SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27); | |
599 | SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26); | |
600 | SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25); | |
601 | SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24); | |
602 | SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23); | |
603 | SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22); | |
604 | SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21); | |
605 | SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20); | |
606 | SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19); | |
607 | SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18); | |
608 | SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17); | |
609 | SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16); | |
610 | SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15); | |
611 | SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14); | |
612 | SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13); | |
613 | SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12); | |
614 | SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11); | |
615 | SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10); | |
616 | SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9); | |
617 | SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8); | |
618 | SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7); | |
619 | SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6); | |
620 | SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5); | |
621 | SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4); | |
622 | SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3); | |
623 | SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2); | |
624 | SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1); | |
625 | SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0); /* final step is a plain key mix with index 0; result is in RC, RD, RB, RE */ | |
626 | ||
627 | movl arg_dst(%esp), %eax; | |
628 | write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); /* always overwrites dst; RA is the temporary here */ | |
629 | ||
630 | ret; | |
2dcfd44d | 631 | ENDPROC(serpent_dec_blk_4way) |