;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; XTS decrypt function with 128-bit AES
; expanded keys are not aligned
; expanded decryption keys are copied to the stack in parallel with the tweak encryption
; plaintext and ciphertext are not aligned
; second key is stored in the stack as aligned to 16 Bytes
; first key is required only once, no need for storage of this key

%include "reg_sizes.asm"

%if (AS_FEATURE_LEVEL) >= 10

default rel
%define TW rsp ; store 8 tweak values
%define keys rsp + 16*8 ; store expanded keys (15 slots reserved, 11 used for AES-128)

%ifidn __OUTPUT_FORMAT__, win64
%define _xmm rsp + 16*23 ; store xmm6:xmm15
%endif

%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr rsp + 16*23 ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
%else
%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
%endif

%define GHASH_POLY 0x87
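; GHASH_POLY is the low byte of the XTS reduction polynomial
; x^128 + x^7 + x^2 + x + 1: when a left shift of the 128-bit tweak
; carries out of the top bit, the carry is folded back in by xoring
; 0x87 into the low byte (multiplication by alpha in GF(2^128)).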

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_128_dec_expanded_key_vaes(
; UINT8 *k2, // key used for tweaking, 16*11 bytes
; UINT8 *k1, // expanded key used for "ECB" decryption, 16*11 bytes
; UINT8 *TW_initial, // initial tweak value, 16 bytes
; UINT64 N, // sector size, in bytes
; const UINT8 *ct, // ciphertext sector input data
; UINT8 *pt); // plaintext sector output data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; arguments for input parameters
%ifidn __OUTPUT_FORMAT__, elf64
%xdefine ptr_key2 rdi
%xdefine ptr_key1 rsi
%xdefine T_val rdx
%xdefine N_val rcx
%xdefine ptr_plaintext r8
%xdefine ptr_ciphertext r9
%else
%xdefine ptr_key2 rcx
%xdefine ptr_key1 rdx
%xdefine T_val r8
%xdefine N_val r9
%xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
%xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
%endif

; arguments for temp parameters
%ifidn __OUTPUT_FORMAT__, elf64
%define tmp1 rdi
%define ghash_poly_8b r10
%define ghash_poly_8b_temp r11
%else
%define tmp1 rcx
%define ghash_poly_8b rdi
%define ghash_poly_8b_temp rsi
%endif

%define twtempl rax ; global temp registers used for tweak computation
%define twtemph rbx
%define zpoly zmm25
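; zpoly holds GHASH_POLY broadcast to every 64-bit lane (loaded with
; vpbroadcastq below); vpclmulqdq against it reduces four tweaks at once.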

; macro to encrypt the tweak value, interleaved with copying the
; pre-expanded decryption round keys to the aligned stack area

%macro encrypt_T 8
%define %%xkey2 %1
%define %%xstate_tweak %2
%define %%xkey1 %3
%define %%xraw_key %4
%define %%xtmp %5
%define %%ptr_key2 %6
%define %%ptr_key1 %7
%define %%ptr_expanded_keys %8

vmovdqu %%xkey2, [%%ptr_key2]
vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack

vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption

vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack

vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
%endmacro


; Original way to generate initial tweak values and load plaintext values
; only used for small blocks
%macro initialize 16

%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%ST3 %3 ; state 3
%define %%ST4 %4 ; state 4
%define %%ST5 %5 ; state 5
%define %%ST6 %6 ; state 6
%define %%ST7 %7 ; state 7
%define %%ST8 %8 ; state 8

%define %%TW1 %9 ; tweak 1
%define %%TW2 %10 ; tweak 2
%define %%TW3 %11 ; tweak 3
%define %%TW4 %12 ; tweak 4
%define %%TW5 %13 ; tweak 5
%define %%TW6 %14 ; tweak 6
%define %%TW7 %15 ; tweak 7

%define %%num_initial_blocks %16


; generate next Tweak values
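; Each next tweak is the previous tweak times alpha: shl/adc shift the
; 128-bit value left one bit through twtempl:twtemph, and cmovc/xor fold
; a carry out of the top bit back into the low qword as GHASH_POLY.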
vmovdqa %%TW1, [TW+16*0]
mov twtempl, [TW+8*0]
mov twtemph, [TW+8*1]
vmovdqu %%ST1, [ptr_plaintext+16*0]
%if (%%num_initial_blocks>=2)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa %%TW2, [TW+16*1]
vmovdqu %%ST2, [ptr_plaintext+16*1]
%endif
%if (%%num_initial_blocks>=3)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*4], twtempl
mov [TW+8*5], twtemph
vmovdqa %%TW3, [TW+16*2]
vmovdqu %%ST3, [ptr_plaintext+16*2]
%endif
%if (%%num_initial_blocks>=4)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*6], twtempl
mov [TW+8*7], twtemph
vmovdqa %%TW4, [TW+16*3]
vmovdqu %%ST4, [ptr_plaintext+16*3]
%endif
%if (%%num_initial_blocks>=5)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*8], twtempl
mov [TW+8*9], twtemph
vmovdqa %%TW5, [TW+16*4]
vmovdqu %%ST5, [ptr_plaintext+16*4]
%endif
%if (%%num_initial_blocks>=6)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*10], twtempl
mov [TW+8*11], twtemph
vmovdqa %%TW6, [TW+16*5]
vmovdqu %%ST6, [ptr_plaintext+16*5]
%endif
%if (%%num_initial_blocks>=7)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*12], twtempl
mov [TW+8*13], twtemph
vmovdqa %%TW7, [TW+16*6]
vmovdqu %%ST7, [ptr_plaintext+16*6]
%endif

%endmacro


; Original decrypt initial blocks of AES
; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
; next 8 Tweak values can be generated
%macro decrypt_initial 18
%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%ST3 %3 ; state 3
%define %%ST4 %4 ; state 4
%define %%ST5 %5 ; state 5
%define %%ST6 %6 ; state 6
%define %%ST7 %7 ; state 7
%define %%ST8 %8 ; state 8

%define %%TW1 %9 ; tweak 1
%define %%TW2 %10 ; tweak 2
%define %%TW3 %11 ; tweak 3
%define %%TW4 %12 ; tweak 4
%define %%TW5 %13 ; tweak 5
%define %%TW6 %14 ; tweak 6
%define %%TW7 %15 ; tweak 7
%define %%T0 %16 ; Temp register
%define %%num_blocks %17
; %%num_blocks blocks decrypted
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7

%define %%lt128 %18 ; less than 128 bytes

; xor Tweak value
vpxor %%ST1, %%TW1
%if (%%num_blocks>=2)
vpxor %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
vpxor %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
vpxor %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
vpxor %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
vpxor %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
vpxor %%ST7, %%TW7
%endif

; ARK
vmovdqa %%T0, [keys]
vpxor %%ST1, %%T0
%if (%%num_blocks>=2)
vpxor %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vpxor %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vpxor %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vpxor %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vpxor %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vpxor %%ST7, %%T0
%endif

%if (0 == %%lt128)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
%endif

; round 1
vmovdqa %%T0, [keys + 16*1]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*0], twtempl ; next Tweak1 generated
mov [TW + 8*1], twtemph
xor ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

; round 2
vmovdqa %%T0, [keys + 16*2]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*2], twtempl ; next Tweak2 generated
%endif

; round 3
vmovdqa %%T0, [keys + 16*3]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
mov [TW + 8*3], twtemph
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
%endif

; round 4
vmovdqa %%T0, [keys + 16*4]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*4], twtempl ; next Tweak3 generated
mov [TW + 8*5], twtemph
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
%endif

; round 5
vmovdqa %%T0, [keys + 16*5]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*6], twtempl ; next Tweak4 generated
mov [TW + 8*7], twtemph
%endif

; round 6
vmovdqa %%T0, [keys + 16*6]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*8], twtempl ; next Tweak5 generated
mov [TW + 8*9], twtemph
%endif

; round 7
vmovdqa %%T0, [keys + 16*7]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*10], twtempl ; next Tweak6 generated
mov [TW + 8*11], twtemph
%endif
; round 8
vmovdqa %%T0, [keys + 16*8]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*12], twtempl ; next Tweak7 generated
mov [TW + 8*13], twtemph
%endif
; round 9
vmovdqa %%T0, [keys + 16*9]
vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW + 8*14], twtempl ; next Tweak8 generated
mov [TW + 8*15], twtemph
%endif

; round 10
vmovdqa %%T0, [keys + 16*10]
vaesdeclast %%ST1, %%T0
%if (%%num_blocks>=2)
vaesdeclast %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
vaesdeclast %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
vaesdeclast %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
vaesdeclast %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
vaesdeclast %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
vaesdeclast %%ST7, %%T0
%endif

; xor Tweak values
vpxor %%ST1, %%TW1
%if (%%num_blocks>=2)
vpxor %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
vpxor %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
vpxor %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
vpxor %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
vpxor %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
vpxor %%ST7, %%TW7
%endif

%if (0 == %%lt128)
; load next Tweak values
vmovdqa %%TW1, [TW + 16*0]
vmovdqa %%TW2, [TW + 16*1]
vmovdqa %%TW3, [TW + 16*2]
vmovdqa %%TW4, [TW + 16*3]
vmovdqa %%TW5, [TW + 16*4]
vmovdqa %%TW6, [TW + 16*5]
vmovdqa %%TW7, [TW + 16*6]
%endif

%endmacro


; Decrypt 8 blocks in parallel
; generate next 8 tweak values
%macro decrypt_by_eight_zmm 6
%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%TW1 %3 ; tweak 1
%define %%TW2 %4 ; tweak 2
%define %%T0 %5 ; Temp register
%define %%last_eight %6

; xor Tweak values
vpxorq %%ST1, %%TW1
vpxorq %%ST2, %%TW2

; ARK
vbroadcasti32x4 %%T0, [keys]
vpxorq %%ST1, %%T0
vpxorq %%ST2, %%T0

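; When more blocks follow, advance the tweaks by alpha^8 in vector code:
; vpslldq shifts each 128-bit lane left by one byte (times 2^8), and the
; byte shifted out (vpsrldq ..., 15) is reduced back in with a carry-less
; multiply by zpoly.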
%if (0 == %%last_eight)
vpsrldq zmm13, %%TW1, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm15, %%TW1, 1
vpxord zmm15, zmm15, zmm14
%endif
; round 1
vbroadcasti32x4 %%T0, [keys + 16*1]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 2
vbroadcasti32x4 %%T0, [keys + 16*2]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 3
vbroadcasti32x4 %%T0, [keys + 16*3]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
%if (0 == %%last_eight)
vpsrldq zmm13, %%TW2, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm16, %%TW2, 1
vpxord zmm16, zmm16, zmm14
%endif
; round 4
vbroadcasti32x4 %%T0, [keys + 16*4]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 5
vbroadcasti32x4 %%T0, [keys + 16*5]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 6
vbroadcasti32x4 %%T0, [keys + 16*6]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 7
vbroadcasti32x4 %%T0, [keys + 16*7]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 8
vbroadcasti32x4 %%T0, [keys + 16*8]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 9
vbroadcasti32x4 %%T0, [keys + 16*9]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0

; round 10
vbroadcasti32x4 %%T0, [keys + 16*10]
vaesdeclast %%ST1, %%T0
vaesdeclast %%ST2, %%T0

; xor Tweak values
vpxorq %%ST1, %%TW1
vpxorq %%ST2, %%TW2

; load next Tweak values
vmovdqa32 %%TW1, zmm15
vmovdqa32 %%TW2, zmm16
%endmacro


; Decrypt 16 blocks in parallel
; generate next 16 tweak values
%macro decrypt_by_16_zmm 10
%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%ST3 %3 ; state 3
%define %%ST4 %4 ; state 4

%define %%TW1 %5 ; tweak 1
%define %%TW2 %6 ; tweak 2
%define %%TW3 %7 ; tweak 3
%define %%TW4 %8 ; tweak 4

%define %%T0 %9 ; Temp register
%define %%last_eight %10

; xor Tweak values
vpxorq %%ST1, %%TW1
vpxorq %%ST2, %%TW2
vpxorq %%ST3, %%TW3
vpxorq %%ST4, %%TW4

; ARK
vbroadcasti32x4 %%T0, [keys]
vpxorq %%ST1, %%T0
vpxorq %%ST2, %%T0
vpxorq %%ST3, %%T0
vpxorq %%ST4, %%T0

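; Same alpha^8 tweak advance as in decrypt_by_eight_zmm, spread across
; the AES rounds to overlap with them; zmm15-zmm18 collect the next 16
; tweaks.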
%if (0 == %%last_eight)
vpsrldq zmm13, %%TW3, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm15, %%TW3, 1
vpxord zmm15, zmm15, zmm14
%endif
; round 1
vbroadcasti32x4 %%T0, [keys + 16*1]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0

; round 2
vbroadcasti32x4 %%T0, [keys + 16*2]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0

; round 3
vbroadcasti32x4 %%T0, [keys + 16*3]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0
%if (0 == %%last_eight)
vpsrldq zmm13, %%TW4, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm16, %%TW4, 1
vpxord zmm16, zmm16, zmm14
%endif
; round 4
vbroadcasti32x4 %%T0, [keys + 16*4]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0

; round 5
vbroadcasti32x4 %%T0, [keys + 16*5]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0

; round 6
vbroadcasti32x4 %%T0, [keys + 16*6]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0
%if (0 == %%last_eight)
vpsrldq zmm13, zmm15, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm17, zmm15, 1
vpxord zmm17, zmm17, zmm14
%endif
; round 7
vbroadcasti32x4 %%T0, [keys + 16*7]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0

; round 8
vbroadcasti32x4 %%T0, [keys + 16*8]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0

; round 9
vbroadcasti32x4 %%T0, [keys + 16*9]
vaesdec %%ST1, %%T0
vaesdec %%ST2, %%T0
vaesdec %%ST3, %%T0
vaesdec %%ST4, %%T0
%if (0 == %%last_eight)
vpsrldq zmm13, zmm16, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm18, zmm16, 1
vpxord zmm18, zmm18, zmm14
%endif
; round 10
vbroadcasti32x4 %%T0, [keys + 16*10]
vaesdeclast %%ST1, %%T0
vaesdeclast %%ST2, %%T0
vaesdeclast %%ST3, %%T0
vaesdeclast %%ST4, %%T0

; xor Tweak values
vpxorq %%ST1, %%TW1
vpxorq %%ST2, %%TW2
vpxorq %%ST3, %%TW3
vpxorq %%ST4, %%TW4

; load next Tweak values
vmovdqa32 %%TW1, zmm15
vmovdqa32 %%TW2, zmm16
vmovdqa32 %%TW3, zmm17
vmovdqa32 %%TW4, zmm18
%endmacro


section .text

mk_global XTS_AES_128_dec_expanded_key_vaes, function
XTS_AES_128_dec_expanded_key_vaes:
endbranch

%define ALIGN_STACK
%ifdef ALIGN_STACK
push rbp
mov rbp, rsp
sub rsp, VARIABLE_OFFSET
and rsp, ~63
%else
sub rsp, VARIABLE_OFFSET
%endif

mov [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
mov [_gpr + 8*1], rdi
mov [_gpr + 8*2], rsi

vmovdqa [_xmm + 16*0], xmm6
vmovdqa [_xmm + 16*1], xmm7
vmovdqa [_xmm + 16*2], xmm8
vmovdqa [_xmm + 16*3], xmm9
vmovdqa [_xmm + 16*4], xmm10
vmovdqa [_xmm + 16*5], xmm11
vmovdqa [_xmm + 16*6], xmm12
vmovdqa [_xmm + 16*7], xmm13
vmovdqa [_xmm + 16*8], xmm14
vmovdqa [_xmm + 16*9], xmm15
%endif

mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b

vmovdqu xmm1, [T_val] ; read initial Tweak value
vpxor xmm4, xmm4 ; scratch register for encrypt_T (keys are pre-expanded)
encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys

%ifidn __OUTPUT_FORMAT__, win64
mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
%endif

cmp N_val, 128
jl _less_than_128_bytes

vpbroadcastq zpoly, ghash_poly_8b

cmp N_val, 256
jge _start_by16

cmp N_val, 128
jge _start_by8

_do_n_blocks:
cmp N_val, 0
je _ret_

cmp N_val, (7*16)
jge _remaining_num_blocks_is_7

cmp N_val, (6*16)
jge _remaining_num_blocks_is_6

cmp N_val, (5*16)
jge _remaining_num_blocks_is_5

cmp N_val, (4*16)
jge _remaining_num_blocks_is_4

cmp N_val, (3*16)
jge _remaining_num_blocks_is_3

cmp N_val, (2*16)
jge _remaining_num_blocks_is_2

cmp N_val, (1*16)
jge _remaining_num_blocks_is_1

;; _remaining_num_blocks_is_0:
vmovdqu xmm1, [ptr_plaintext - 16] ; Re-do the last block with the next tweak
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
vmovdqu [ptr_ciphertext - 16], xmm1
vmovdqa xmm8, xmm1

; Calc previous tweak
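; Multiply the tweak by alpha^-1: if bit 0 is set, first xor GHASH_POLY
; out of the low qword, then funnel-shift the 128-bit value right by one
; bit (vpshrdq) and set the top bit from the bit shifted out.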
mov tmp1, 1
kmovq k1, tmp1
vpsllq xmm13, xmm9, 63
vpsraq xmm14, xmm13, 63
vpandq xmm5, xmm14, XWORD(zpoly)
vpxorq xmm9 {k1}, xmm9, xmm5
vpsrldq xmm10, xmm9, 8
vpshrdq xmm0, xmm9, xmm10, 1
vpslldq xmm13, xmm13, 8
vpxorq xmm0, xmm0, xmm13
jmp _steal_cipher

_remaining_num_blocks_is_7:
mov tmp1, -1
shr tmp1, 16
kmovq k1, tmp1
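; k1 = 0x0000ffffffffffff: a byte mask selecting the low 48 bytes
; (3 blocks) of the second zmm load/store, for 4 + 3 = 7 blocks total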
vmovdqu8 zmm1, [ptr_plaintext+16*0]
vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
add ptr_plaintext, 16*7
and N_val, 15
je _done_7_remain
vextracti32x4 xmm12, zmm10, 2
vextracti32x4 xmm13, zmm10, 3
vinserti32x4 zmm10, xmm13, 2
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
add ptr_ciphertext, 16*7
vextracti32x4 xmm8, zmm2, 0x2
vmovdqa xmm0, xmm12
jmp _steal_cipher
_done_7_remain:
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
jmp _ret_

_remaining_num_blocks_is_6:
vmovdqu8 zmm1, [ptr_plaintext+16*0]
vmovdqu8 ymm2, [ptr_plaintext+16*4]
add ptr_plaintext, 16*6
and N_val, 15
je _done_6_remain
vextracti32x4 xmm12, zmm10, 1
vextracti32x4 xmm13, zmm10, 2
vinserti32x4 zmm10, xmm13, 1
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu8 [ptr_ciphertext+16*4], ymm2
add ptr_ciphertext, 16*6
vextracti32x4 xmm8, zmm2, 0x1
vmovdqa xmm0, xmm12
jmp _steal_cipher
_done_6_remain:
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu8 [ptr_ciphertext+16*4], ymm2
jmp _ret_

_remaining_num_blocks_is_5:
vmovdqu8 zmm1, [ptr_plaintext+16*0]
vmovdqu xmm2, [ptr_plaintext+16*4]
add ptr_plaintext, 16*5
and N_val, 15
je _done_5_remain
vmovdqa xmm12, xmm10
vextracti32x4 xmm10, zmm10, 1
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu [ptr_ciphertext+16*4], xmm2
add ptr_ciphertext, 16*5
vmovdqa xmm8, xmm2
vmovdqa xmm0, xmm12
jmp _steal_cipher
_done_5_remain:
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu [ptr_ciphertext+16*4], xmm2
jmp _ret_

_remaining_num_blocks_is_4:
vmovdqu8 zmm1, [ptr_plaintext+16*0]
add ptr_plaintext, 16*4
and N_val, 15
je _done_4_remain
vextracti32x4 xmm12, zmm9, 3
vinserti32x4 zmm9, xmm10, 3
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
add ptr_ciphertext, 16*4
vextracti32x4 xmm8, zmm1, 0x3
vmovdqa xmm0, xmm12
jmp _steal_cipher
_done_4_remain:
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
vmovdqu8 [ptr_ciphertext+16*0], zmm1
jmp _ret_

_remaining_num_blocks_is_3:
vmovdqu xmm1, [ptr_plaintext+16*0]
vmovdqu xmm2, [ptr_plaintext+16*1]
vmovdqu xmm3, [ptr_plaintext+16*2]
add ptr_plaintext, 16*3
and N_val, 15
je _done_3_remain
vextracti32x4 xmm13, zmm9, 2
vextracti32x4 xmm10, zmm9, 1
vextracti32x4 xmm11, zmm9, 3
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
add ptr_ciphertext, 16*3
vmovdqa xmm8, xmm3
vmovdqa xmm0, xmm13
jmp _steal_cipher
_done_3_remain:
vextracti32x4 xmm10, zmm9, 1
vextracti32x4 xmm11, zmm9, 2
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
jmp _ret_

_remaining_num_blocks_is_2:
vmovdqu xmm1, [ptr_plaintext+16*0]
vmovdqu xmm2, [ptr_plaintext+16*1]
add ptr_plaintext, 16*2
and N_val, 15
je _done_2_remain
vextracti32x4 xmm10, zmm9, 2
vextracti32x4 xmm12, zmm9, 1
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
add ptr_ciphertext, 16*2
vmovdqa xmm8, xmm2
vmovdqa xmm0, xmm12
jmp _steal_cipher
_done_2_remain:
vextracti32x4 xmm10, zmm9, 1
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
jmp _ret_

_remaining_num_blocks_is_1:
vmovdqu xmm1, [ptr_plaintext]
add ptr_plaintext, 16
and N_val, 15
je _done_1_remain
vextracti32x4 xmm11, zmm9, 1
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
vmovdqu [ptr_ciphertext], xmm1
add ptr_ciphertext, 16
vmovdqa xmm8, xmm1
vmovdqa xmm0, xmm9
jmp _steal_cipher
_done_1_remain:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
vmovdqu [ptr_ciphertext], xmm1
jmp _ret_

_start_by16:
; Make first 7 tweak values
vbroadcasti32x4 zmm0, [TW]
vbroadcasti32x4 zmm8, [shufb_15_7]
mov tmp1, 0xaa
kmovq k2, tmp1

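; The initial tweak is broadcast to all four 128-bit lanes of zmm0. For
; lane i, T*2^i is formed from a variable left shift of each qword
; (vpsllvq) merged with the bits that cross qword boundaries (vpsrlvq on
; the shufb_15_7-shuffled copy); bits shifted out of bit 127 are reduced
; with vpclmulqdq by zpoly. Mask k2 = 0xaa applies the cross-qword carry
; only to the odd (upper) qword of each lane.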
; Mult tweak by 2^{3, 2, 1, 0}
vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
vpclmulqdq zmm3, zmm2, zpoly, 0x00
vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
vpxord zmm9, zmm3, zmm4

; Mult tweak by 2^{7, 6, 5, 4}
vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
vpclmulqdq zmm7, zmm6, zpoly, 0x00
vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
vpxord zmm10, zmm7, zmm5

; Make next 8 tweak values by multiplying all by 2^8
vpsrldq zmm13, zmm9, 15
vpclmulqdq zmm14, zmm13, zpoly, 0
vpslldq zmm11, zmm9, 1
vpxord zmm11, zmm11, zmm14

vpsrldq zmm15, zmm10, 15
vpclmulqdq zmm16, zmm15, zpoly, 0
vpslldq zmm12, zmm10, 1
vpxord zmm12, zmm12, zmm16

_main_loop_run_16:
vmovdqu8 zmm1, [ptr_plaintext+16*0]
vmovdqu8 zmm2, [ptr_plaintext+16*4]
vmovdqu8 zmm3, [ptr_plaintext+16*8]
vmovdqu8 zmm4, [ptr_plaintext+16*12]
add ptr_plaintext, 256

decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0

vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu8 [ptr_ciphertext+16*4], zmm2
vmovdqu8 [ptr_ciphertext+16*8], zmm3
vmovdqu8 [ptr_ciphertext+16*12], zmm4
add ptr_ciphertext, 256
sub N_val, 256
cmp N_val, 256
jge _main_loop_run_16

cmp N_val, 128
jge _main_loop_run_8

jmp _do_n_blocks

_start_by8:
; Make first 7 tweak values
vbroadcasti32x4 zmm0, [TW]
vbroadcasti32x4 zmm8, [shufb_15_7]
mov tmp1, 0xaa
kmovq k2, tmp1

; Mult tweak by 2^{3, 2, 1, 0}
vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
vpclmulqdq zmm3, zmm2, zpoly, 0x00
vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
vpxord zmm9, zmm3, zmm4

; Mult tweak by 2^{7, 6, 5, 4}
vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
vpclmulqdq zmm7, zmm6, zpoly, 0x00
vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
vpxord zmm10, zmm7, zmm5

_main_loop_run_8:
vmovdqu8 zmm1, [ptr_plaintext+16*0]
vmovdqu8 zmm2, [ptr_plaintext+16*4]
add ptr_plaintext, 128

decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0

vmovdqu8 [ptr_ciphertext+16*0], zmm1
vmovdqu8 [ptr_ciphertext+16*4], zmm2
add ptr_ciphertext, 128
sub N_val, 128
cmp N_val, 128
jge _main_loop_run_8

jmp _do_n_blocks

_steal_cipher:
; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
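; Ciphertext stealing: the caller decrypted the last full block with the
; tweak that follows it; the first N_val bytes of that result form the
; final partial plaintext, and the trailing partial-block input bytes
; are blended back in to rebuild one full block, which is decrypted
; below with the tweak in xmm0.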
vmovdqa xmm2, xmm8

; shift xmm8 to the left by 16-N_val bytes
lea twtempl, [vpshufb_shf_table]
vmovdqu xmm10, [twtempl+N_val]
vpshufb xmm8, xmm10

vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
vmovdqu [ptr_ciphertext - 16 + N_val], xmm8

; shift xmm3 to the right by 16-N_val bytes
lea twtempl, [vpshufb_shf_table +16]
sub twtempl, N_val
vmovdqu xmm10, [twtempl]
vpxor xmm10, [mask1]
vpshufb xmm3, xmm10

vpblendvb xmm3, xmm3, xmm2, xmm10

; xor Tweak value
vpxor xmm8, xmm3, xmm0

;decrypt last block with cipher stealing
vpxor xmm8, [keys] ; ARK
vaesdec xmm8, [keys + 16*1] ; round 1
vaesdec xmm8, [keys + 16*2] ; round 2
vaesdec xmm8, [keys + 16*3] ; round 3
vaesdec xmm8, [keys + 16*4] ; round 4
vaesdec xmm8, [keys + 16*5] ; round 5
vaesdec xmm8, [keys + 16*6] ; round 6
vaesdec xmm8, [keys + 16*7] ; round 7
vaesdec xmm8, [keys + 16*8] ; round 8
vaesdec xmm8, [keys + 16*9] ; round 9
vaesdeclast xmm8, [keys + 16*10] ; round 10

; xor Tweak value
vpxor xmm8, xmm8, xmm0

_done:
; store last ciphertext value
vmovdqu [ptr_ciphertext - 16], xmm8

_ret_:
mov rbx, [_gpr + 8*0]

%ifidn __OUTPUT_FORMAT__, win64
mov rdi, [_gpr + 8*1]
mov rsi, [_gpr + 8*2]

vmovdqa xmm6, [_xmm + 16*0]
vmovdqa xmm7, [_xmm + 16*1]
vmovdqa xmm8, [_xmm + 16*2]
vmovdqa xmm9, [_xmm + 16*3]
vmovdqa xmm10, [_xmm + 16*4]
vmovdqa xmm11, [_xmm + 16*5]
vmovdqa xmm12, [_xmm + 16*6]
vmovdqa xmm13, [_xmm + 16*7]
vmovdqa xmm14, [_xmm + 16*8]
vmovdqa xmm15, [_xmm + 16*9]
%endif

%ifndef ALIGN_STACK
add rsp, VARIABLE_OFFSET
%else
mov rsp, rbp
pop rbp
%endif
ret

_less_than_128_bytes:
cmp N_val, 16
jb _ret_

mov tmp1, N_val
and tmp1, (7 << 4)
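; bits 6:4 of N_val give the number of whole 16-byte blocks (1 to 7)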
cmp tmp1, (6 << 4)
je _num_blocks_is_6
cmp tmp1, (5 << 4)
je _num_blocks_is_5
cmp tmp1, (4 << 4)
je _num_blocks_is_4
cmp tmp1, (3 << 4)
je _num_blocks_is_3
cmp tmp1, (2 << 4)
je _num_blocks_is_2
cmp tmp1, (1 << 4)
je _num_blocks_is_1

_num_blocks_is_7:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
add ptr_plaintext, 16*7
and N_val, 15
je _done_7

_steal_cipher_7:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa64 xmm16, xmm15
vmovdqa xmm15, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
vmovdqu [ptr_ciphertext+16*3], xmm4
vmovdqu [ptr_ciphertext+16*4], xmm5
vmovdqu [ptr_ciphertext+16*5], xmm6
add ptr_ciphertext, 16*7
vmovdqa64 xmm0, xmm16
vmovdqa xmm8, xmm7
jmp _steal_cipher

_done_7:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
vmovdqu [ptr_ciphertext+16*3], xmm4
vmovdqu [ptr_ciphertext+16*4], xmm5
vmovdqu [ptr_ciphertext+16*5], xmm6
add ptr_ciphertext, 16*7
vmovdqa xmm8, xmm7
jmp _done

_num_blocks_is_6:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
add ptr_plaintext, 16*6
and N_val, 15
je _done_6

_steal_cipher_6:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa xmm15, xmm14
vmovdqa xmm14, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
vmovdqu [ptr_ciphertext+16*3], xmm4
vmovdqu [ptr_ciphertext+16*4], xmm5
add ptr_ciphertext, 16*6
vmovdqa xmm0, xmm15
vmovdqa xmm8, xmm6
jmp _steal_cipher

_done_6:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
vmovdqu [ptr_ciphertext+16*3], xmm4
vmovdqu [ptr_ciphertext+16*4], xmm5
add ptr_ciphertext, 16*6
vmovdqa xmm8, xmm6
jmp _done

_num_blocks_is_5:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
add ptr_plaintext, 16*5
and N_val, 15
je _done_5

_steal_cipher_5:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa xmm14, xmm13
vmovdqa xmm13, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
vmovdqu [ptr_ciphertext+16*3], xmm4
add ptr_ciphertext, 16*5
vmovdqa xmm0, xmm14
vmovdqa xmm8, xmm5
jmp _steal_cipher

_done_5:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
vmovdqu [ptr_ciphertext+16*3], xmm4
add ptr_ciphertext, 16*5
vmovdqa xmm8, xmm5
jmp _done

_num_blocks_is_4:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
add ptr_plaintext, 16*4
and N_val, 15
je _done_4

_steal_cipher_4:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa xmm13, xmm12
vmovdqa xmm12, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
add ptr_ciphertext, 16*4
vmovdqa xmm0, xmm13
vmovdqa xmm8, xmm4
jmp _steal_cipher

_done_4:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
vmovdqu [ptr_ciphertext+16*2], xmm3
add ptr_ciphertext, 16*4
vmovdqa xmm8, xmm4
jmp _done

_num_blocks_is_3:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
add ptr_plaintext, 16*3
and N_val, 15
je _done_3

_steal_cipher_3:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa xmm12, xmm11
vmovdqa xmm11, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
add ptr_ciphertext, 16*3
vmovdqa xmm0, xmm12
vmovdqa xmm8, xmm3
jmp _steal_cipher

_done_3:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
vmovdqu [ptr_ciphertext+16*0], xmm1
vmovdqu [ptr_ciphertext+16*1], xmm2
add ptr_ciphertext, 16*3
vmovdqa xmm8, xmm3
jmp _done

_num_blocks_is_2:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
add ptr_plaintext, 16*2
and N_val, 15
je _done_2

_steal_cipher_2:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa xmm11, xmm10
vmovdqa xmm10, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
vmovdqu [ptr_ciphertext], xmm1
add ptr_ciphertext, 16*2
vmovdqa xmm0, xmm11
vmovdqa xmm8, xmm2
jmp _steal_cipher

_done_2:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
vmovdqu [ptr_ciphertext], xmm1
add ptr_ciphertext, 16*2
vmovdqa xmm8, xmm2
jmp _done

_num_blocks_is_1:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
add ptr_plaintext, 16*1
and N_val, 15
je _done_1

_steal_cipher_1:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
vmovdqa xmm10, xmm9
vmovdqa xmm9, [TW+16*1]

decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
add ptr_ciphertext, 16*1
vmovdqa xmm0, xmm10
vmovdqa xmm8, xmm1
jmp _steal_cipher

_done_1:
decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
add ptr_ciphertext, 16*1
vmovdqa xmm8, xmm1
jmp _done

section .data
align 16

vpshufb_shf_table:
; use these values for shift constants for the vpshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask1:
dq 0x8080808080808080, 0x8080808080808080

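; Per-qword shift counts for the vpsllvq/vpsrlvq tweak initialization:
; const_dq3210/const_dq5678 produce T*2^i for lanes i = 0..3, and
; const_dq7654/const_dq1234 for lanes i = 4..7. shufb_15_7 moves byte 15
; of each 128-bit lane to byte 0 and byte 7 to byte 8, so the variable
; right shifts extract the bits that cross qword and lane boundaries.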
const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1

shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_XTS_AES_128_dec_expanded_key_vaes
no_XTS_AES_128_dec_expanded_key_vaes:
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10