]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / aes256_cbc_dec_by4_sse.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28; routine to do AES cbc decrypt on 16n bytes doing AES by 4
29
30; XMM registers are clobbered. Saving/restoring must be done at a higher level
31
32; void aes_cbc_dec_256_sse(void *in,
33; UINT128 *IV,
34; UINT128 keys[15],
35; void *out,
36; UINT64 len_bytes);
37;
38; arg 1: rcx (rdi on Linux): pointer to input (cipher text)
39; arg 2: rdx (rsi on Linux): pointer to IV
40; arg 3: r8 (rdx on Linux): pointer to keys
41; arg 4: r9 (rcx on Linux): pointer to output (plain text)
42; arg 5: stack, [rsp + 8*5] (r8 on Linux): length in bytes (multiple of 16)
43;
44
%include "include/os.asm"

; Exported symbol name. A file that %includes this one may pre-define
; AES_CBC_DEC_256 to assemble the same body under a different name.
%ifndef AES_CBC_DEC_256
%define AES_CBC_DEC_256 aes_cbc_dec_256_sse
%endif

%define MOVDQ	movdqu

; ---- argument registers --------------------------------------------------
%ifndef LINUX
; Windows x64: the fifth argument is passed on the stack; the routine
; copies it into r10 on entry.
%define IN	rcx
%define IV	rdx
%define KEYS	r8
%define OUT	r9
%define LEN	r10
%else
; Linux: all five arguments arrive in registers.
%define IN	rdi
%define IV	rsi
%define KEYS	rdx
%define OUT	rcx
%define LEN	r8
%endif

; ---- general purpose scratch ---------------------------------------------
; IDX is the running byte offset into IN/OUT. TMP aliases the same
; register and is only used for the initial dispatch, before IDX is set.
%define IDX	rax
%define TMP	IDX

; ---- data lanes: up to four blocks are decrypted in parallel -------------
%define XDATA0	xmm0
%define XDATA1	xmm1
%define XDATA2	xmm2
%define XDATA3	xmm3

; ---- round keys 0, 2, 4, 6 and 10: loaded by the initial_* code and
;      reused (not reloaded) by main_loop ----------------------------------
%define XKEY0	xmm4
%define XKEY2	xmm5
%define XKEY4	xmm6
%define XKEY6	xmm7
%define XKEY10	xmm8

; ---- CBC chaining state --------------------------------------------------
; XIV holds the cipher text block that must be XORed into the next block
; decrypted; XSAVEDn hold copies of the cipher text of the current group.
%define XIV	xmm9
%define XSAVED0	xmm10
%define XSAVED1	xmm11
%define XSAVED2	xmm12
%define XSAVED3	xmm13

; ---- temporaries for the round keys that are (re)loaded on every pass ----
%define XKEY_A	xmm14
%define XKEY_B	xmm15

; The caller's IV is staged in XSAVED3, which the initial_* paths do not
; otherwise use.
%define IV_TMP	XSAVED3
87
88section .text
89
9f95a23c
TL
;-----------------------------------------------------------------------
; Lane helpers. Each expands to one instruction per active data lane
; (XDATA0 .. XDATA<n-1>), in ascending lane order.
;-----------------------------------------------------------------------

; CBC256_ARK n
;   Round 0 (AddRoundKey): XOR round key 0, cached in XKEY0, into the
;   first n data lanes.
%macro CBC256_ARK 1
	pxor	XDATA0, XKEY0
%if %1 > 1
	pxor	XDATA1, XKEY0
%endif
%if %1 > 2
	pxor	XDATA2, XKEY0
%endif
%if %1 > 3
	pxor	XDATA3, XKEY0
%endif
%endmacro

; CBC256_DEC n, key
;   One middle decryption round on the first n data lanes.
;   key is an xmm register or a 16-byte memory operand.
%macro CBC256_DEC 2
	aesdec	XDATA0, %2
%if %1 > 1
	aesdec	XDATA1, %2
%endif
%if %1 > 2
	aesdec	XDATA2, %2
%endif
%if %1 > 3
	aesdec	XDATA3, %2
%endif
%endmacro

; CBC256_DEC_LAST n, key
;   Final decryption round on the first n data lanes.
%macro CBC256_DEC_LAST 2
	aesdeclast	XDATA0, %2
%if %1 > 1
	aesdeclast	XDATA1, %2
%endif
%if %1 > 2
	aesdeclast	XDATA2, %2
%endif
%if %1 > 3
	aesdeclast	XDATA3, %2
%endif
%endmacro

;-----------------------------------------------------------------------
; void aes_cbc_dec_256_sse(void *in, UINT128 *IV, UINT128 keys[15],
;                          void *out, UINT64 len_bytes);
;
; AES-256 CBC decryption of len_bytes bytes, four blocks at a time.
;
; In:    IN   = cipher text (unaligned loads)
;        IV   = initialisation vector (unaligned load, never written)
;        KEYS = 15 round keys, read with movdqa (must be 16-byte aligned)
;        OUT  = plain text (unaligned stores)
;        LEN  = length in bytes; must be a multiple of 16.
;               LEN == 0 returns immediately without touching memory.
;               A non-zero LEN that is not a multiple of 16 is NOT
;               supported: main_loop only exits when IDX == LEN.
; Clobb: rax, flags, xmm0-xmm15 (and r10 on non-Linux builds).
;        XMM registers are not saved here; saving/restoring must be done
;        at a higher level.
;
; Structure: the first (LEN / 16) mod 4 blocks (or four blocks when that
; remainder is zero) are handled by one of initial_1..initial_4, which
; also caches round keys 0, 2, 4, 6 and 10 in XKEY0/2/4/6/10 and leaves
; the last cipher text block in XIV and the bytes consumed in IDX.
; main_loop then processes the remainder in groups of four blocks.
;-----------------------------------------------------------------------
MKGLOBAL(AES_CBC_DEC_256,function,internal)
AES_CBC_DEC_256:
%ifndef LINUX
	mov	LEN, [rsp + 8*5]	; fifth argument is passed on the stack
%endif

	; Nothing to do for an empty buffer. Without this check a zero
	; length would be dispatched to initial_4 (LEN & 0x30 == 0).
	test	LEN, LEN
	jz	done

	; Dispatch on the number of blocks modulo 4.
	mov	TMP, LEN
	and	TMP, 3*16
	jz	initial_4
	cmp	TMP, 2*16
	jb	initial_1
	ja	initial_3

initial_2:
	; load cipher text
	movdqu	XDATA0, [IN + 0*16]
	movdqu	XDATA1, [IN + 1*16]

	movdqa	XKEY0, [KEYS + 0*16]

	; save cipher text (block 1 becomes the chaining value)
	movdqa	XSAVED0, XDATA0
	movdqa	XIV, XDATA1

	CBC256_ARK	2			; 0. ARK

	movdqa	XKEY2, [KEYS + 2*16]

	CBC256_DEC	2, [KEYS + 1*16]	; 1. DEC

	mov	IDX, 2*16

	CBC256_DEC	2, XKEY2		; 2. DEC

	movdqa	XKEY4, [KEYS + 4*16]

	CBC256_DEC	2, [KEYS + 3*16]	; 3. DEC

	movdqu	IV_TMP, [IV]

	CBC256_DEC	2, XKEY4		; 4. DEC

	movdqa	XKEY6, [KEYS + 6*16]

	CBC256_DEC	2, [KEYS + 5*16]	; 5. DEC

	CBC256_DEC	2, XKEY6		; 6. DEC

	movdqa	XKEY_B, [KEYS + 8*16]

	CBC256_DEC	2, [KEYS + 7*16]	; 7. DEC

	CBC256_DEC	2, XKEY_B		; 8. DEC

	movdqa	XKEY10, [KEYS + 10*16]

	CBC256_DEC	2, [KEYS + 9*16]	; 9. DEC

	CBC256_DEC	2, XKEY10		; 10. DEC

	CBC256_DEC	2, [KEYS + 11*16]	; 11. DEC

	CBC256_DEC	2, [KEYS + 12*16]	; 12. DEC

	CBC256_DEC	2, [KEYS + 13*16]	; 13. DEC

	CBC256_DEC_LAST	2, [KEYS + 14*16]	; 14. DEC

	; undo CBC chaining
	pxor	XDATA0, IV_TMP
	pxor	XDATA1, XSAVED0

	movdqu	[OUT + 0*16], XDATA0
	movdqu	[OUT + 1*16], XDATA1

	cmp	LEN, 2*16
	je	done
	jmp	main_loop


	align 16
initial_1:
	; load cipher text
	movdqu	XDATA0, [IN + 0*16]

	movdqa	XKEY0, [KEYS + 0*16]

	; save cipher text (it is the chaining value for the next block)
	movdqa	XIV, XDATA0

	CBC256_ARK	1			; 0. ARK

	movdqa	XKEY2, [KEYS + 2*16]

	CBC256_DEC	1, [KEYS + 1*16]	; 1. DEC

	mov	IDX, 1*16

	CBC256_DEC	1, XKEY2		; 2. DEC

	movdqa	XKEY4, [KEYS + 4*16]

	CBC256_DEC	1, [KEYS + 3*16]	; 3. DEC

	movdqu	IV_TMP, [IV]

	CBC256_DEC	1, XKEY4		; 4. DEC

	movdqa	XKEY6, [KEYS + 6*16]

	CBC256_DEC	1, [KEYS + 5*16]	; 5. DEC

	CBC256_DEC	1, XKEY6		; 6. DEC

	movdqa	XKEY_B, [KEYS + 8*16]

	CBC256_DEC	1, [KEYS + 7*16]	; 7. DEC

	CBC256_DEC	1, XKEY_B		; 8. DEC

	movdqa	XKEY10, [KEYS + 10*16]

	CBC256_DEC	1, [KEYS + 9*16]	; 9. DEC

	CBC256_DEC	1, XKEY10		; 10. DEC

	CBC256_DEC	1, [KEYS + 11*16]	; 11. DEC

	CBC256_DEC	1, [KEYS + 12*16]	; 12. DEC

	CBC256_DEC	1, [KEYS + 13*16]	; 13. DEC

	CBC256_DEC_LAST	1, [KEYS + 14*16]	; 14. DEC

	; undo CBC chaining
	pxor	XDATA0, IV_TMP

	movdqu	[OUT + 0*16], XDATA0

	cmp	LEN, 1*16
	je	done
	jmp	main_loop


initial_3:
	; load cipher text
	movdqu	XDATA0, [IN + 0*16]
	movdqu	XDATA1, [IN + 1*16]
	movdqu	XDATA2, [IN + 2*16]

	movdqa	XKEY0, [KEYS + 0*16]

	; save cipher text (block 2 becomes the chaining value)
	movdqa	XSAVED0, XDATA0
	movdqa	XSAVED1, XDATA1
	movdqa	XIV, XDATA2

	movdqa	XKEY_A, [KEYS + 1*16]

	CBC256_ARK	3			; 0. ARK

	movdqa	XKEY2, [KEYS + 2*16]

	CBC256_DEC	3, XKEY_A		; 1. DEC

	movdqa	XKEY_A, [KEYS + 3*16]
	mov	IDX, 3*16

	CBC256_DEC	3, XKEY2		; 2. DEC

	movdqa	XKEY4, [KEYS + 4*16]

	CBC256_DEC	3, XKEY_A		; 3. DEC

	movdqa	XKEY_A, [KEYS + 5*16]
	movdqu	IV_TMP, [IV]

	CBC256_DEC	3, XKEY4		; 4. DEC

	movdqa	XKEY6, [KEYS + 6*16]

	CBC256_DEC	3, XKEY_A		; 5. DEC

	movdqa	XKEY_A, [KEYS + 7*16]

	CBC256_DEC	3, XKEY6		; 6. DEC

	movdqa	XKEY_B, [KEYS + 8*16]

	CBC256_DEC	3, XKEY_A		; 7. DEC

	movdqa	XKEY_A, [KEYS + 9*16]

	CBC256_DEC	3, XKEY_B		; 8. DEC

	movdqa	XKEY10, [KEYS + 10*16]

	CBC256_DEC	3, XKEY_A		; 9. DEC

	movdqa	XKEY_A, [KEYS + 11*16]

	CBC256_DEC	3, XKEY10		; 10. DEC

	movdqa	XKEY_B, [KEYS + 12*16]

	CBC256_DEC	3, XKEY_A		; 11. DEC

	movdqa	XKEY_A, [KEYS + 13*16]

	CBC256_DEC	3, XKEY_B		; 12. DEC

	movdqa	XKEY_B, [KEYS + 14*16]

	CBC256_DEC	3, XKEY_A		; 13. DEC

	CBC256_DEC_LAST	3, XKEY_B		; 14. DEC

	; undo CBC chaining
	pxor	XDATA0, IV_TMP
	pxor	XDATA1, XSAVED0
	pxor	XDATA2, XSAVED1

	movdqu	[OUT + 0*16], XDATA0
	movdqu	[OUT + 1*16], XDATA1
	movdqu	[OUT + 2*16], XDATA2

	cmp	LEN, 3*16
	je	done
	jmp	main_loop


	align 16
initial_4:
	; load cipher text
	movdqu	XDATA0, [IN + 0*16]
	movdqu	XDATA1, [IN + 1*16]
	movdqu	XDATA2, [IN + 2*16]
	movdqu	XDATA3, [IN + 3*16]

	movdqa	XKEY0, [KEYS + 0*16]

	; save cipher text (block 3 becomes the chaining value)
	movdqa	XSAVED0, XDATA0
	movdqa	XSAVED1, XDATA1
	movdqa	XSAVED2, XDATA2
	movdqa	XIV, XDATA3

	movdqa	XKEY_A, [KEYS + 1*16]

	CBC256_ARK	4			; 0. ARK

	movdqa	XKEY2, [KEYS + 2*16]

	CBC256_DEC	4, XKEY_A		; 1. DEC

	movdqa	XKEY_A, [KEYS + 3*16]

	mov	IDX, 4*16

	CBC256_DEC	4, XKEY2		; 2. DEC

	movdqa	XKEY4, [KEYS + 4*16]

	CBC256_DEC	4, XKEY_A		; 3. DEC

	movdqa	XKEY_A, [KEYS + 5*16]

	movdqu	IV_TMP, [IV]

	CBC256_DEC	4, XKEY4		; 4. DEC

	movdqa	XKEY6, [KEYS + 6*16]

	CBC256_DEC	4, XKEY_A		; 5. DEC

	movdqa	XKEY_A, [KEYS + 7*16]

	CBC256_DEC	4, XKEY6		; 6. DEC

	movdqa	XKEY_B, [KEYS + 8*16]

	CBC256_DEC	4, XKEY_A		; 7. DEC

	movdqa	XKEY_A, [KEYS + 9*16]

	CBC256_DEC	4, XKEY_B		; 8. DEC

	movdqa	XKEY10, [KEYS + 10*16]

	CBC256_DEC	4, XKEY_A		; 9. DEC

	movdqa	XKEY_A, [KEYS + 11*16]

	CBC256_DEC	4, XKEY10		; 10. DEC

	movdqa	XKEY_B, [KEYS + 12*16]

	CBC256_DEC	4, XKEY_A		; 11. DEC

	movdqa	XKEY_A, [KEYS + 13*16]

	CBC256_DEC	4, XKEY_B		; 12. DEC

	movdqa	XKEY_B, [KEYS + 14*16]

	CBC256_DEC	4, XKEY_A		; 13. DEC

	CBC256_DEC_LAST	4, XKEY_B		; 14. DEC

	; undo CBC chaining
	pxor	XDATA0, IV_TMP
	pxor	XDATA1, XSAVED0
	pxor	XDATA2, XSAVED1
	pxor	XDATA3, XSAVED2

	movdqu	[OUT + 0*16], XDATA0
	movdqu	[OUT + 1*16], XDATA1
	movdqu	[OUT + 2*16], XDATA2
	movdqu	[OUT + 3*16], XDATA3

	cmp	LEN, 4*16
	jz	done
	jmp	main_loop

	align 16
main_loop:
	; On entry: IDX  = bytes already processed,
	;           XIV  = last cipher text block already processed,
	;           XKEY0/2/4/6/10 = round keys 0, 2, 4, 6, 10.

	; load cipher text
	movdqu	XDATA0, [IN + IDX + 0*16]
	movdqu	XDATA1, [IN + IDX + 1*16]
	movdqu	XDATA2, [IN + IDX + 2*16]
	movdqu	XDATA3, [IN + IDX + 3*16]

	; save cipher text
	movdqa	XSAVED0, XDATA0
	movdqa	XSAVED1, XDATA1
	movdqa	XSAVED2, XDATA2
	movdqa	XSAVED3, XDATA3

	movdqa	XKEY_A, [KEYS + 1*16]

	CBC256_ARK	4			; 0. ARK

	add	IDX, 4*16

	CBC256_DEC	4, XKEY_A		; 1. DEC

	movdqa	XKEY_A, [KEYS + 3*16]

	CBC256_DEC	4, XKEY2		; 2. DEC

	CBC256_DEC	4, XKEY_A		; 3. DEC

	movdqa	XKEY_A, [KEYS + 5*16]

	CBC256_DEC	4, XKEY4		; 4. DEC

	CBC256_DEC	4, XKEY_A		; 5. DEC

	movdqa	XKEY_A, [KEYS + 7*16]

	CBC256_DEC	4, XKEY6		; 6. DEC

	movdqa	XKEY_B, [KEYS + 8*16]

	CBC256_DEC	4, XKEY_A		; 7. DEC

	movdqa	XKEY_A, [KEYS + 9*16]

	CBC256_DEC	4, XKEY_B		; 8. DEC

	CBC256_DEC	4, XKEY_A		; 9. DEC

	movdqa	XKEY_A, [KEYS + 11*16]

	CBC256_DEC	4, XKEY10		; 10. DEC

	movdqa	XKEY_B, [KEYS + 12*16]

	CBC256_DEC	4, XKEY_A		; 11. DEC

	movdqa	XKEY_A, [KEYS + 13*16]

	CBC256_DEC	4, XKEY_B		; 12. DEC

	movdqa	XKEY_B, [KEYS + 14*16]

	CBC256_DEC	4, XKEY_A		; 13. DEC

	CBC256_DEC_LAST	4, XKEY_B		; 14. DEC

	; undo CBC chaining: lane 0 uses the block carried over from the
	; previous group, the others use their left neighbour's cipher text
	pxor	XDATA0, XIV
	pxor	XDATA1, XSAVED0
	pxor	XDATA2, XSAVED1
	pxor	XDATA3, XSAVED2

	; IDX was already advanced past this group, hence the - 4*16
	movdqu	[OUT + IDX + 0*16 - 4*16], XDATA0
	movdqu	[OUT + IDX + 1*16 - 4*16], XDATA1
	movdqu	[OUT + IDX + 2*16 - 4*16], XDATA2
	movdqu	[OUT + IDX + 3*16 - 4*16], XDATA3

	; last cipher text block of this group chains into the next one
	movdqa	XIV, XSAVED3

	cmp	IDX, LEN
	jne	main_loop

done:
; Don't write back IV
;	movdqu	[IV], XIV

	ret
631
; Linux builds only: emit an empty .note.GNU-stack section with the
; noalloc/noexec/nowrite attributes. By ELF toolchain convention this
; marks the object as not needing an executable stack.
632%ifdef LINUX
633section .note.GNU-stack noalloc noexec nowrite progbits
634%endif