2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ; Routine to do AES-256 CBC decryption on 16*n bytes, processing up to 4 blocks at a time
30 ; XMM registers are clobbered. Saving/restoring must be done at a higher level
32 ; void aes_cbc_dec_256_sse(void *in,
38 ; arg 1: rcx: pointer to input (cipher text)
39 ; arg 2: rdx: pointer to IV
40 ; arg 3: r8: pointer to keys
41 ; arg 4: r9: pointer to output (plain text)
42 ; arg 5: sp: length in bytes (multiple of 16)
; Entry-point name: only defaulted here, so a file that defines
; AES_CBC_DEC_256 before this point keeps its own symbol name.
; NOTE(review): the matching %endif is not visible in this excerpt -- confirm
; it is present in the full file.
47 %ifndef AES_CBC_DEC_256
48 %define AES_CBC_DEC_256 aes_cbc_dec_256_sse
; IV_TMP is a second name for XSAVED3: both refer to the same register.
86 %define IV_TMP XSAVED3
; MKGLOBAL is a macro defined outside this excerpt; presumably it exports the
; symbol as a function with internal visibility -- TODO confirm.
90 MKGLOBAL(AES_CBC_DEC_256,function,internal)
; ---------------------------------------------------------------------------
; Two-block path: run the 14-round AES-256 inverse cipher on the two 16-byte
; blocks at IN and store the results to OUT.
;   XDATA0/XDATA1 : working state for block 0 / block 1
;   XSAVED0       : copy of ciphertext block 0 taken before it is decrypted
;   KEYS          : 15 round keys (indices 0..14), 16 bytes each
; IN and OUT are accessed with movdqu, so these accesses impose no alignment
; requirement. Round keys are read with movdqa and as aesdec memory operands,
; both of which require a 16-byte-aligned address, so KEYS must be aligned.
; NOTE(review): in this excerpt XDATA1 has no round-0 pxor and no aesdec for
; rounds 2, 4 and 6, and no CBC xor (IV / XSAVED0) is visible before the
; stores. The embedded line numbers skip at those points, so the instructions
; are presumably outside this view -- confirm against the full file.
; ---------------------------------------------------------------------------
105 movdqu XDATA0, [IN + 0*16]
106 movdqu XDATA1, [IN + 1*16]
; Round key 0 (initial AddRoundKey).
108 movdqa XKEY0, [KEYS + 0*16]
; Keep the ciphertext of block 0; presumably it is the CBC chaining value for
; block 1 -- its use is not visible here.
111 movdqa XSAVED0, XDATA0
114 pxor XDATA0, XKEY0 ; 0. ARK
; Rounds 2/4/6/8/10 take their key from a register loaded just ahead of use;
; rounds 1/3/5/7/9 and 11..14 read the key straight from memory.
117 movdqa XKEY2, [KEYS + 2*16]
119 aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
120 aesdec XDATA1, [KEYS + 1*16]
124 aesdec XDATA0, XKEY2 ; 2. DEC
127 movdqa XKEY4, [KEYS + 4*16]
129 aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
130 aesdec XDATA1, [KEYS + 3*16]
134 aesdec XDATA0, XKEY4 ; 4. DEC
137 movdqa XKEY6, [KEYS + 6*16]
139 aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
140 aesdec XDATA1, [KEYS + 5*16]
142 aesdec XDATA0, XKEY6 ; 6. DEC
145 movdqa XKEY_B, [KEYS + 8*16]
147 aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
148 aesdec XDATA1, [KEYS + 7*16]
150 aesdec XDATA0, XKEY_B ; 8. DEC
151 aesdec XDATA1, XKEY_B
153 movdqa XKEY10, [KEYS + 10*16]
155 aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
156 aesdec XDATA1, [KEYS + 9*16]
158 aesdec XDATA0, XKEY10 ; 10. DEC
159 aesdec XDATA1, XKEY10
161 aesdec XDATA0, [KEYS + 11*16] ; 11. DEC
162 aesdec XDATA1, [KEYS + 11*16]
164 aesdec XDATA0, [KEYS + 12*16] ; 12. DEC
165 aesdec XDATA1, [KEYS + 12*16]
167 aesdec XDATA0, [KEYS + 13*16] ; 13. DEC
168 aesdec XDATA1, [KEYS + 13*16]
; Final round uses aesdeclast with round key 14.
170 aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC
171 aesdeclast XDATA1, [KEYS + 14*16]
; Write both result blocks to the output buffer.
176 movdqu [OUT + 0*16], XDATA0
177 movdqu [OUT + 1*16], XDATA1
; ---------------------------------------------------------------------------
; One-block path: run the 14-round AES-256 inverse cipher on the single
; 16-byte block at IN and store the result to OUT.
; XDATA0 is the working state. Rounds 2/4/6/8/10 use a key register loaded
; just ahead of use; the remaining round keys are read directly from memory.
; KEYS must be 16-byte aligned (movdqa and aesdec memory operands).
; NOTE(review): no CBC xor with the IV is visible between aesdeclast and the
; store; the embedded line numbers skip there, so it is presumably outside this
; excerpt -- confirm against the full file.
; ---------------------------------------------------------------------------
187 movdqu XDATA0, [IN + 0*16]
189 movdqa XKEY0, [KEYS + 0*16]
194 pxor XDATA0, XKEY0 ; 0. ARK
196 movdqa XKEY2, [KEYS + 2*16]
198 aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
202 aesdec XDATA0, XKEY2 ; 2. DEC
204 movdqa XKEY4, [KEYS + 4*16]
206 aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
210 aesdec XDATA0, XKEY4 ; 4. DEC
212 movdqa XKEY6, [KEYS + 6*16]
214 aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
216 aesdec XDATA0, XKEY6 ; 6. DEC
218 movdqa XKEY_B, [KEYS + 8*16]
220 aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
222 aesdec XDATA0, XKEY_B ; 8. DEC
224 movdqa XKEY10, [KEYS + 10*16]
226 aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
228 aesdec XDATA0, XKEY10 ; 10. DEC
230 aesdec XDATA0, [KEYS + 11*16] ; 11. DEC
232 aesdec XDATA0, [KEYS + 12*16] ; 12. DEC
234 aesdec XDATA0, [KEYS + 13*16] ; 13. DEC
; Final round uses aesdeclast with round key 14.
236 aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC
240 movdqu [OUT + 0*16], XDATA0
; ---------------------------------------------------------------------------
; Three-block path: run the 14-round AES-256 inverse cipher on the three
; 16-byte blocks at IN and store the results to OUT.
;   XDATA0..XDATA2   : working state for blocks 0..2
;   XSAVED0, XSAVED1 : copies of ciphertext blocks 0 and 1 taken before they
;                      are decrypted
;   XKEY_A / XKEY_B  : scratch key registers, reloaded with the next round key
;                      while the current round is being issued
; Unlike the one- and two-block paths, every round key is loaded into a
; register first and then shared by all three aesdec instructions.
; KEYS must be 16-byte aligned (movdqa loads).
; NOTE(review): in this excerpt XDATA1/XDATA2 have no round-0 pxor and no
; aesdec for rounds 2, 4 and 6, and no CBC xor (IV / XSAVED0 / XSAVED1) is
; visible before the stores. The embedded line numbers skip at those points,
; so the instructions are presumably outside this view -- confirm.
; ---------------------------------------------------------------------------
249 movdqu XDATA0, [IN + 0*16]
250 movdqu XDATA1, [IN + 1*16]
251 movdqu XDATA2, [IN + 2*16]
253 movdqa XKEY0, [KEYS + 0*16]
; Keep ciphertext blocks 0 and 1; presumably the CBC chaining values for
; blocks 1 and 2 -- their use is not visible here.
256 movdqa XSAVED0, XDATA0
257 movdqa XSAVED1, XDATA1
260 movdqa XKEY_A, [KEYS + 1*16]
262 pxor XDATA0, XKEY0 ; 0. ARK
266 movdqa XKEY2, [KEYS + 2*16]
268 aesdec XDATA0, XKEY_A ; 1. DEC
269 aesdec XDATA1, XKEY_A
270 aesdec XDATA2, XKEY_A
; XKEY_A is free again after round 1: preload round key 3.
272 movdqa XKEY_A, [KEYS + 3*16]
275 aesdec XDATA0, XKEY2 ; 2. DEC
279 movdqa XKEY4, [KEYS + 4*16]
281 aesdec XDATA0, XKEY_A ; 3. DEC
282 aesdec XDATA1, XKEY_A
283 aesdec XDATA2, XKEY_A
285 movdqa XKEY_A, [KEYS + 5*16]
288 aesdec XDATA0, XKEY4 ; 4. DEC
292 movdqa XKEY6, [KEYS + 6*16]
294 aesdec XDATA0, XKEY_A ; 5. DEC
295 aesdec XDATA1, XKEY_A
296 aesdec XDATA2, XKEY_A
298 movdqa XKEY_A, [KEYS + 7*16]
300 aesdec XDATA0, XKEY6 ; 6. DEC
304 movdqa XKEY_B, [KEYS + 8*16]
306 aesdec XDATA0, XKEY_A ; 7. DEC
307 aesdec XDATA1, XKEY_A
308 aesdec XDATA2, XKEY_A
310 movdqa XKEY_A, [KEYS + 9*16]
312 aesdec XDATA0, XKEY_B ; 8. DEC
313 aesdec XDATA1, XKEY_B
314 aesdec XDATA2, XKEY_B
316 movdqa XKEY10, [KEYS + 10*16]
318 aesdec XDATA0, XKEY_A ; 9. DEC
319 aesdec XDATA1, XKEY_A
320 aesdec XDATA2, XKEY_A
322 movdqa XKEY_A, [KEYS + 11*16]
324 aesdec XDATA0, XKEY10 ; 10. DEC
325 aesdec XDATA1, XKEY10
326 aesdec XDATA2, XKEY10
; From here XKEY_A and XKEY_B alternate: one is used while the other is loaded.
328 movdqa XKEY_B, [KEYS + 12*16]
330 aesdec XDATA0, XKEY_A ; 11. DEC
331 aesdec XDATA1, XKEY_A
332 aesdec XDATA2, XKEY_A
334 movdqa XKEY_A, [KEYS + 13*16]
336 aesdec XDATA0, XKEY_B ; 12. DEC
337 aesdec XDATA1, XKEY_B
338 aesdec XDATA2, XKEY_B
340 movdqa XKEY_B, [KEYS + 14*16]
342 aesdec XDATA0, XKEY_A ; 13. DEC
343 aesdec XDATA1, XKEY_A
344 aesdec XDATA2, XKEY_A
; Final round uses aesdeclast with round key 14.
346 aesdeclast XDATA0, XKEY_B ; 14. DEC
347 aesdeclast XDATA1, XKEY_B
348 aesdeclast XDATA2, XKEY_B
; Write the three result blocks to the output buffer.
354 movdqu [OUT + 0*16], XDATA0
355 movdqu [OUT + 1*16], XDATA1
356 movdqu [OUT + 2*16], XDATA2
; ---------------------------------------------------------------------------
; Four-block initial path: run the 14-round AES-256 inverse cipher on the
; four 16-byte blocks at IN and store the results to OUT. No IDX offset is
; used here, unlike the later block that addresses [IN + IDX + ...].
;   XDATA0..XDATA3   : working state for blocks 0..3
;   XSAVED0..XSAVED2 : copies of ciphertext blocks 0..2 taken before they are
;                      decrypted
;   XKEY0/2/4/6/10   : round keys 0, 2, 4, 6 and 10 loaded into registers; the
;                      later IDX-based block uses these names without
;                      reloading them
;   XKEY_A / XKEY_B  : scratch key registers for the remaining rounds
; KEYS must be 16-byte aligned (movdqa loads).
; NOTE(review): in this excerpt XDATA1..XDATA3 have no round-0 pxor and no
; aesdec for rounds 2, 4 and 6; no save of ciphertext block 3 and no CBC xor
; are visible either. The embedded line numbers skip at those points, so the
; instructions are presumably outside this view -- confirm.
; ---------------------------------------------------------------------------
366 movdqu XDATA0, [IN + 0*16]
367 movdqu XDATA1, [IN + 1*16]
368 movdqu XDATA2, [IN + 2*16]
369 movdqu XDATA3, [IN + 3*16]
371 movdqa XKEY0, [KEYS + 0*16]
; Keep ciphertext blocks 0..2; presumably the CBC chaining values for blocks
; 1..3 -- their use is not visible here.
374 movdqa XSAVED0, XDATA0
375 movdqa XSAVED1, XDATA1
376 movdqa XSAVED2, XDATA2
379 movdqa XKEY_A, [KEYS + 1*16]
381 pxor XDATA0, XKEY0 ; 0. ARK
386 movdqa XKEY2, [KEYS + 2*16]
388 aesdec XDATA0, XKEY_A ; 1. DEC
389 aesdec XDATA1, XKEY_A
390 aesdec XDATA2, XKEY_A
391 aesdec XDATA3, XKEY_A
; XKEY_A is free again after round 1: preload round key 3.
393 movdqa XKEY_A, [KEYS + 3*16]
397 aesdec XDATA0, XKEY2 ; 2. DEC
402 movdqa XKEY4, [KEYS + 4*16]
404 aesdec XDATA0, XKEY_A ; 3. DEC
405 aesdec XDATA1, XKEY_A
406 aesdec XDATA2, XKEY_A
407 aesdec XDATA3, XKEY_A
409 movdqa XKEY_A, [KEYS + 5*16]
413 aesdec XDATA0, XKEY4 ; 4. DEC
418 movdqa XKEY6, [KEYS + 6*16]
420 aesdec XDATA0, XKEY_A ; 5. DEC
421 aesdec XDATA1, XKEY_A
422 aesdec XDATA2, XKEY_A
423 aesdec XDATA3, XKEY_A
425 movdqa XKEY_A, [KEYS + 7*16]
427 aesdec XDATA0, XKEY6 ; 6. DEC
432 movdqa XKEY_B, [KEYS + 8*16]
434 aesdec XDATA0, XKEY_A ; 7. DEC
435 aesdec XDATA1, XKEY_A
436 aesdec XDATA2, XKEY_A
437 aesdec XDATA3, XKEY_A
439 movdqa XKEY_A, [KEYS + 9*16]
441 aesdec XDATA0, XKEY_B ; 8. DEC
442 aesdec XDATA1, XKEY_B
443 aesdec XDATA2, XKEY_B
444 aesdec XDATA3, XKEY_B
446 movdqa XKEY10, [KEYS + 10*16]
448 aesdec XDATA0, XKEY_A ; 9. DEC
449 aesdec XDATA1, XKEY_A
450 aesdec XDATA2, XKEY_A
451 aesdec XDATA3, XKEY_A
453 movdqa XKEY_A, [KEYS + 11*16]
455 aesdec XDATA0, XKEY10 ; 10. DEC
456 aesdec XDATA1, XKEY10
457 aesdec XDATA2, XKEY10
458 aesdec XDATA3, XKEY10
; From here XKEY_A and XKEY_B alternate: one is used while the other is loaded.
460 movdqa XKEY_B, [KEYS + 12*16]
462 aesdec XDATA0, XKEY_A ; 11. DEC
463 aesdec XDATA1, XKEY_A
464 aesdec XDATA2, XKEY_A
465 aesdec XDATA3, XKEY_A
467 movdqa XKEY_A, [KEYS + 13*16]
469 aesdec XDATA0, XKEY_B ; 12. DEC
470 aesdec XDATA1, XKEY_B
471 aesdec XDATA2, XKEY_B
472 aesdec XDATA3, XKEY_B
474 movdqa XKEY_B, [KEYS + 14*16]
476 aesdec XDATA0, XKEY_A ; 13. DEC
477 aesdec XDATA1, XKEY_A
478 aesdec XDATA2, XKEY_A
479 aesdec XDATA3, XKEY_A
; Final round uses aesdeclast with round key 14.
481 aesdeclast XDATA0, XKEY_B ; 14. DEC
482 aesdeclast XDATA1, XKEY_B
483 aesdeclast XDATA2, XKEY_B
484 aesdeclast XDATA3, XKEY_B
; Write the four result blocks to the output buffer.
491 movdqu [OUT + 0*16], XDATA0
492 movdqu [OUT + 1*16], XDATA1
493 movdqu [OUT + 2*16], XDATA2
494 movdqu [OUT + 3*16], XDATA3
; ---------------------------------------------------------------------------
; Four-block body addressed through IDX: run the 14-round AES-256 inverse
; cipher on the four 16-byte blocks at IN + IDX and store the results to OUT.
;   XDATA0..XDATA3   : working state for blocks 0..3
;   XSAVED0..XSAVED3 : copies of all four ciphertext blocks taken before they
;                      are decrypted (XSAVED3 is the register also named
;                      IV_TMP)
;   XKEY0/2/4/6/10   : used here without being loaded in this block, so they
;                      must still hold round keys 0, 2, 4, 6 and 10 from an
;                      earlier load
;   XKEY_A / XKEY_B  : scratch key registers, reloaded on every pass
; The stores address OUT + IDX - 4*16 while the loads address IN + IDX, which
; implies IDX is advanced by 64 between them; that update, the loop label and
; the loop branch are not visible in this excerpt -- TODO confirm.
; NOTE(review): in this excerpt XDATA1..XDATA3 have no round-0 pxor and no
; aesdec for rounds 2, 4 and 6, and no CBC xor with the saved ciphertext is
; visible before the stores. The embedded line numbers skip at those points,
; so the instructions are presumably outside this view -- confirm.
; ---------------------------------------------------------------------------
503 movdqu XDATA0, [IN + IDX + 0*16]
504 movdqu XDATA1, [IN + IDX + 1*16]
505 movdqu XDATA2, [IN + IDX + 2*16]
506 movdqu XDATA3, [IN + IDX + 3*16]
; Keep all four ciphertext blocks; presumably the CBC chaining values for this
; and the next group of blocks -- their use is not visible here.
509 movdqa XSAVED0, XDATA0
510 movdqa XSAVED1, XDATA1
511 movdqa XSAVED2, XDATA2
512 movdqa XSAVED3, XDATA3
514 movdqa XKEY_A, [KEYS + 1*16]
516 pxor XDATA0, XKEY0 ; 0. ARK
523 aesdec XDATA0, XKEY_A ; 1. DEC
524 aesdec XDATA1, XKEY_A
525 aesdec XDATA2, XKEY_A
526 aesdec XDATA3, XKEY_A
; XKEY_A is free again after round 1: preload round key 3.
528 movdqa XKEY_A, [KEYS + 3*16]
530 aesdec XDATA0, XKEY2 ; 2. DEC
535 aesdec XDATA0, XKEY_A ; 3. DEC
536 aesdec XDATA1, XKEY_A
537 aesdec XDATA2, XKEY_A
538 aesdec XDATA3, XKEY_A
540 movdqa XKEY_A, [KEYS + 5*16]
542 aesdec XDATA0, XKEY4 ; 4. DEC
547 aesdec XDATA0, XKEY_A ; 5. DEC
548 aesdec XDATA1, XKEY_A
549 aesdec XDATA2, XKEY_A
550 aesdec XDATA3, XKEY_A
552 movdqa XKEY_A, [KEYS + 7*16]
554 aesdec XDATA0, XKEY6 ; 6. DEC
; Round key 8 has no dedicated register, so it is reloaded into XKEY_B here.
559 movdqa XKEY_B, [KEYS + 8*16]
561 aesdec XDATA0, XKEY_A ; 7. DEC
562 aesdec XDATA1, XKEY_A
563 aesdec XDATA2, XKEY_A
564 aesdec XDATA3, XKEY_A
566 movdqa XKEY_A, [KEYS + 9*16]
568 aesdec XDATA0, XKEY_B ; 8. DEC
569 aesdec XDATA1, XKEY_B
570 aesdec XDATA2, XKEY_B
571 aesdec XDATA3, XKEY_B
573 aesdec XDATA0, XKEY_A ; 9. DEC
574 aesdec XDATA1, XKEY_A
575 aesdec XDATA2, XKEY_A
576 aesdec XDATA3, XKEY_A
578 movdqa XKEY_A, [KEYS + 11*16]
580 aesdec XDATA0, XKEY10 ; 10. DEC
581 aesdec XDATA1, XKEY10
582 aesdec XDATA2, XKEY10
583 aesdec XDATA3, XKEY10
; From here XKEY_A and XKEY_B alternate: one is used while the other is loaded.
585 movdqa XKEY_B, [KEYS + 12*16]
587 aesdec XDATA0, XKEY_A ; 11. DEC
588 aesdec XDATA1, XKEY_A
589 aesdec XDATA2, XKEY_A
590 aesdec XDATA3, XKEY_A
592 movdqa XKEY_A, [KEYS + 13*16]
594 aesdec XDATA0, XKEY_B ; 12. DEC
595 aesdec XDATA1, XKEY_B
596 aesdec XDATA2, XKEY_B
597 aesdec XDATA3, XKEY_B
599 movdqa XKEY_B, [KEYS + 14*16]
601 aesdec XDATA0, XKEY_A ; 13. DEC
602 aesdec XDATA1, XKEY_A
603 aesdec XDATA2, XKEY_A
604 aesdec XDATA3, XKEY_A
; Final round uses aesdeclast with round key 14.
606 aesdeclast XDATA0, XKEY_B ; 14. DEC
607 aesdeclast XDATA1, XKEY_B
608 aesdeclast XDATA2, XKEY_B
609 aesdeclast XDATA3, XKEY_B
; Stores are biased by -4*16 relative to the load addresses (see header note).
616 movdqu [OUT + IDX + 0*16 - 4*16], XDATA0
617 movdqu [OUT + IDX + 1*16 - 4*16], XDATA1
618 movdqu [OUT + IDX + 2*16 - 4*16], XDATA2
619 movdqu [OUT + IDX + 3*16 - 4*16], XDATA3
627 ; Don't write back IV
; Emit an empty .note.GNU-stack section flagged noexec; on ELF targets the
; GNU toolchain reads this as "this object does not need an executable stack".
; NOTE(review): any platform guard around this directive is not visible in
; this excerpt -- confirm it is only assembled for ELF output.
633 section .note.GNU-stack noalloc noexec nowrite progbits