ceph/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
5; modification, are permitted provided that the following conditions
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29; XTS decrypt function with 128-bit AES
30; input keys are not aligned
31; keys are expanded in parallel with the tweak encryption
32; plaintext and ciphertext are not aligned
33; the second key is stored on the stack, aligned to 16 bytes
34; the first key is required only once, so it is not stored
35
36%include "reg_sizes.asm"
37
38%if (AS_FEATURE_LEVEL) >= 10
39
40default rel
41%define TW rsp ; store 8 tweak values
42%define keys rsp + 16*8 ; store expanded keys (11 used for AES-128; space reserved for 15)
43
44%ifidn __OUTPUT_FORMAT__, win64
45 %define _xmm rsp + 16*23 ; store xmm6:xmm15
46%endif
47
48%ifidn __OUTPUT_FORMAT__, elf64
49%define _gpr rsp + 16*23 ; store rbx
50%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
51%else
52%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
53%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
54%endif
55
56%define GHASH_POLY 0x87
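
; 0x87 is the low byte of the XTS/GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1.
; Each tweak update multiplies the 128-bit tweak by x ("doubling") modulo that polynomial.
; A minimal C sketch of one doubling step, assuming the tweak is held as two
; little-endian 64-bit halves (illustrative only, not part of the build):
;
;     #include <stdint.h>
;     void xts_mul_x(uint64_t t[2]) {
;         uint64_t carry = t[1] >> 63;          /* bit 127, shifted out below */
;         t[1] = (t[1] << 1) | (t[0] >> 63);    /* 128-bit shift left by one  */
;         t[0] = (t[0] << 1) ^ (carry * 0x87);  /* fold the carry back in     */
;     }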
57
58;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
59;void XTS_AES_128_dec_vaes(
60; UINT8 *k2, // key used for tweaking, 16*1 bytes
61; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
62; UINT8 *TW_initial, // initial tweak value, 16 bytes
63; UINT64 N, // sector size, in bytes
64; const UINT8 *ct, // ciphertext sector input data
65; UINT8 *pt); // plaintext sector output data
66;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
67
68; arguments for input parameters
69%ifidn __OUTPUT_FORMAT__, elf64
70 %xdefine ptr_key2 rdi
71 %xdefine ptr_key1 rsi
72 %xdefine T_val rdx
73 %xdefine N_val rcx
74 %xdefine ptr_plaintext r8
75 %xdefine ptr_ciphertext r9
76%else
77 %xdefine ptr_key2 rcx
78 %xdefine ptr_key1 rdx
79 %xdefine T_val r8
80 %xdefine N_val r9
81 %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
82 %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
83%endif
84
85; arguments for temp parameters
86%ifidn __OUTPUT_FORMAT__, elf64
87 %define tmp1 rdi
88 %define ghash_poly_8b r10
89 %define ghash_poly_8b_temp r11
90%else
91 %define tmp1 rcx
92 %define ghash_poly_8b rdi
93 %define ghash_poly_8b_temp rsi
94%endif
95
96%define twtempl rax ; global temp registers used for tweak computation
97%define twtemph rbx
98%define zpoly zmm25
99
100; produce the round key for the next round
101; raw_key is the output of the vaeskeygenassist instruction
102; on entry, round_key holds the current round key
103; on exit, round_key holds the next round key
104%macro key_expansion_128 3
105%define %%xraw_key %1
106%define %%xtmp %2
107%define %%xround_key %3
108 vpshufd %%xraw_key, %%xraw_key, 11111111b
109 vshufps %%xtmp, %%xround_key, 00010000b
110 vpxor %%xround_key, %%xtmp
111 vshufps %%xtmp, %%xround_key, 10001100b
112 vpxor %%xround_key, %%xtmp
113 vpxor %%xround_key, %%xraw_key
114%endmacro
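
; For reference, key_expansion_128 above performs one step of the standard AES-128
; key schedule. An equivalent C sketch (illustrative only, not part of the build),
; where t stands for the broadcast vaeskeygenassist result SubWord(RotWord(w3)) ^ Rcon:
;
;     #include <stdint.h>
;     void expand_128_step(uint32_t rk[4], uint32_t t) {
;         rk[0] ^= t;        /* w4 = w0 ^ t  */
;         rk[1] ^= rk[0];    /* w5 = w1 ^ w4 */
;         rk[2] ^= rk[1];    /* w6 = w2 ^ w5 */
;         rk[3] ^= rk[2];    /* w7 = w3 ^ w6 */
;     }
;
; The vshufps/vpxor pairs compute the same running XOR across the four words without
; leaving the SIMD domain.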
115
116
117
118; macro to encrypt the tweak value in parallel with the key expansion of both keys
119
120%macro encrypt_T 9
121%define %%xkey2 %1
122%define %%xstate_tweak %2
123%define %%xkey1 %3
124%define %%xraw_key %4
125%define %%xtmp %5
126%define %%xtmp2 %6
127%define %%ptr_key2 %7
128%define %%ptr_key1 %8
129%define %%ptr_expanded_keys %9
130
131
132 vmovdqu %%xkey2, [%%ptr_key2]
133 vmovdqu %%xkey1, [%%ptr_key1]
134 vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
135
136 vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
137
138 vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
139 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
140 vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
141 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
142 vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
143 vaesimc %%xtmp2, %%xkey1
144 vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
145
146 vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
147 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
148 vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
149 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
150 vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
151 vaesimc %%xtmp2, %%xkey1
152 vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
153
154 vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
155 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
156 vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
157 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
158 vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
159 vaesimc %%xtmp2, %%xkey1
160 vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
161
162 vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
163 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
164 vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
165 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
166 vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
167 vaesimc %%xtmp2, %%xkey1
168 vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
169
170 vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
171 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
172 vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
173 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
174 vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
175 vaesimc %%xtmp2, %%xkey1
176 vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
177
178 vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
179 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
180 vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
181 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
182 vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
183 vaesimc %%xtmp2, %%xkey1
184 vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
185
186 vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
187 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
188 vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
189 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
190 vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
191 vaesimc %%xtmp2, %%xkey1
192 vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
193
194 vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
195 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
196 vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
197 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
198 vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
199 vaesimc %%xtmp2, %%xkey1
200 vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
201
202 vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
203 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
204 vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
205 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
206 vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
207 vaesimc %%xtmp2, %%xkey1
208 vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
209
210 vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
211 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
212 vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
213 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
214 vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
215 vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1
216
217 vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
218%endmacro
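
; Note on the key layout produced above: vaesdec implements the AES Equivalent
; Inverse Cipher, so the middle round keys are run through vaesimc (InvMixColumns)
; and the schedule is stored in reverse order for the decrypt flow:
;   [keys + 16*0]        = round-10 key (the decrypt ARK)
;   [keys + 16*1..16*9]  = InvMixColumns of round keys 9 down to 1
;   [keys + 16*10]       = the original round-0 key, consumed by vaesdeclast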
219
220
221; Original method to generate the initial tweak values and load the input blocks
222; used only for inputs of fewer than 8 blocks
223%macro initialize 16
224
225%define %%ST1 %1 ; state 1
226%define %%ST2 %2 ; state 2
227%define %%ST3 %3 ; state 3
228%define %%ST4 %4 ; state 4
229%define %%ST5 %5 ; state 5
230%define %%ST6 %6 ; state 6
231%define %%ST7 %7 ; state 7
232%define %%ST8 %8 ; state 8
233
234%define %%TW1 %9 ; tweak 1
235%define %%TW2 %10 ; tweak 2
236%define %%TW3 %11 ; tweak 3
237%define %%TW4 %12 ; tweak 4
238%define %%TW5 %13 ; tweak 5
239%define %%TW6 %14 ; tweak 6
240%define %%TW7 %15 ; tweak 7
241
242%define %%num_initial_blocks %16
243
244
245 ; generate next Tweak values
246 vmovdqa %%TW1, [TW+16*0]
247 mov twtempl, [TW+8*0]
248 mov twtemph, [TW+8*1]
249 vmovdqu %%ST1, [ptr_plaintext+16*0]
250%if (%%num_initial_blocks>=2)
251 xor ghash_poly_8b_temp, ghash_poly_8b_temp
252 shl twtempl, 1
253 adc twtemph, twtemph
254 cmovc ghash_poly_8b_temp, ghash_poly_8b
255 xor twtempl, ghash_poly_8b_temp
256 mov [TW+8*2], twtempl
257 mov [TW+8*3], twtemph;
258 vmovdqa %%TW2, [TW+16*1]
259 vmovdqu %%ST2, [ptr_plaintext+16*1]
260%endif
261%if (%%num_initial_blocks>=3)
262 xor ghash_poly_8b_temp, ghash_poly_8b_temp
263 shl twtempl, 1
264 adc twtemph, twtemph
265 cmovc ghash_poly_8b_temp, ghash_poly_8b
266 xor twtempl, ghash_poly_8b_temp
267 mov [TW+8*4], twtempl
268 mov [TW+8*5], twtemph;
269 vmovdqa %%TW3, [TW+16*2]
270 vmovdqu %%ST3, [ptr_plaintext+16*2]
271%endif
272%if (%%num_initial_blocks>=4)
273 xor ghash_poly_8b_temp, ghash_poly_8b_temp
274 shl twtempl, 1
275 adc twtemph, twtemph
276 cmovc ghash_poly_8b_temp, ghash_poly_8b
277 xor twtempl, ghash_poly_8b_temp
278 mov [TW+8*6], twtempl
279 mov [TW+8*7], twtemph;
280 vmovdqa %%TW4, [TW+16*3]
281 vmovdqu %%ST4, [ptr_plaintext+16*3]
282%endif
283%if (%%num_initial_blocks>=5)
284 xor ghash_poly_8b_temp, ghash_poly_8b_temp
285 shl twtempl, 1
286 adc twtemph, twtemph
287 cmovc ghash_poly_8b_temp, ghash_poly_8b
288 xor twtempl, ghash_poly_8b_temp
289 mov [TW+8*8], twtempl
290 mov [TW+8*9], twtemph;
291 vmovdqa %%TW5, [TW+16*4]
292 vmovdqu %%ST5, [ptr_plaintext+16*4]
293%endif
294%if (%%num_initial_blocks>=6)
295 xor ghash_poly_8b_temp, ghash_poly_8b_temp
296 shl twtempl, 1
297 adc twtemph, twtemph
298 cmovc ghash_poly_8b_temp, ghash_poly_8b
299 xor twtempl, ghash_poly_8b_temp
300 mov [TW+8*10], twtempl
301 mov [TW+8*11], twtemph;
302 vmovdqa %%TW6, [TW+16*5]
303 vmovdqu %%ST6, [ptr_plaintext+16*5]
304%endif
305%if (%%num_initial_blocks>=7)
306 xor ghash_poly_8b_temp, ghash_poly_8b_temp
307 shl twtempl, 1
308 adc twtemph, twtemph
309 cmovc ghash_poly_8b_temp, ghash_poly_8b
310 xor twtempl, ghash_poly_8b_temp
311 mov [TW+8*12], twtempl
312 mov [TW+8*13], twtemph;
313 vmovdqa %%TW7, [TW+16*6]
314 vmovdqu %%ST7, [ptr_plaintext+16*6]
315%endif
316
317%endmacro
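
; The shl/adc/cmovc/xor pattern above is the scalar form of the GF(2^128) doubling
; sketched next to GHASH_POLY: shl twtempl,1 shifts the low qword and leaves its top
; bit in CF, adc twtemph,twtemph shifts that carry into the high qword and leaves
; bit 127 in CF, and cmovc/xor fold 0x87 back into the low qword when bit 127 was set.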
318
319
320; Original routine to decrypt the initial blocks of AES
321; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
322; the next 8 tweak values can be generated in parallel
323%macro decrypt_initial 18
324%define %%ST1 %1 ; state 1
325%define %%ST2 %2 ; state 2
326%define %%ST3 %3 ; state 3
327%define %%ST4 %4 ; state 4
328%define %%ST5 %5 ; state 5
329%define %%ST6 %6 ; state 6
330%define %%ST7 %7 ; state 7
331%define %%ST8 %8 ; state 8
332
333%define %%TW1 %9 ; tweak 1
334%define %%TW2 %10 ; tweak 2
335%define %%TW3 %11 ; tweak 3
336%define %%TW4 %12 ; tweak 4
337%define %%TW5 %13 ; tweak 5
338%define %%TW6 %14 ; tweak 6
339%define %%TW7 %15 ; tweak 7
340%define %%T0 %16 ; Temp register
341%define %%num_blocks %17
342; %%num_blocks blocks decrypted
343; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
344
345%define %%lt128 %18 ; less than 128 bytes
346
347 ; xor Tweak value
348 vpxor %%ST1, %%TW1
349%if (%%num_blocks>=2)
350 vpxor %%ST2, %%TW2
351%endif
352%if (%%num_blocks>=3)
353 vpxor %%ST3, %%TW3
354%endif
355%if (%%num_blocks>=4)
356 vpxor %%ST4, %%TW4
357%endif
358%if (%%num_blocks>=5)
359 vpxor %%ST5, %%TW5
360%endif
361%if (%%num_blocks>=6)
362 vpxor %%ST6, %%TW6
363%endif
364%if (%%num_blocks>=7)
365 vpxor %%ST7, %%TW7
366%endif
367
368
369 ; ARK
370 vmovdqa %%T0, [keys]
371 vpxor %%ST1, %%T0
372%if (%%num_blocks>=2)
373 vpxor %%ST2, %%T0
374%endif
375%if (%%num_blocks>=3)
376 vpxor %%ST3, %%T0
377%endif
378%if (%%num_blocks>=4)
379 vpxor %%ST4, %%T0
380%endif
381%if (%%num_blocks>=5)
382 vpxor %%ST5, %%T0
383%endif
384%if (%%num_blocks>=6)
385 vpxor %%ST6, %%T0
386%endif
387%if (%%num_blocks>=7)
388 vpxor %%ST7, %%T0
389%endif
390
391
392 %if (0 == %%lt128)
393 xor ghash_poly_8b_temp, ghash_poly_8b_temp
394 shl twtempl, 1
395 adc twtemph, twtemph
396 %endif
397
398 ; round 1
399 vmovdqa %%T0, [keys + 16*1]
400 vaesdec %%ST1, %%T0
401%if (%%num_blocks>=2)
402 vaesdec %%ST2, %%T0
403%endif
404%if (%%num_blocks>=3)
405 vaesdec %%ST3, %%T0
406%endif
407%if (%%num_blocks>=4)
408 vaesdec %%ST4, %%T0
409%endif
410%if (%%num_blocks>=5)
411 vaesdec %%ST5, %%T0
412%endif
413%if (%%num_blocks>=6)
414 vaesdec %%ST6, %%T0
415%endif
416%if (%%num_blocks>=7)
417 vaesdec %%ST7, %%T0
418%endif
419 %if (0 == %%lt128)
420 cmovc ghash_poly_8b_temp, ghash_poly_8b
421 xor twtempl, ghash_poly_8b_temp
422 mov [TW + 8*0], twtempl ; next Tweak1 generated
423 mov [TW + 8*1], twtemph
424 xor ghash_poly_8b_temp, ghash_poly_8b_temp
425 %endif
426
427 ; round 2
428 vmovdqa %%T0, [keys + 16*2]
429 vaesdec %%ST1, %%T0
430%if (%%num_blocks>=2)
431 vaesdec %%ST2, %%T0
432%endif
433%if (%%num_blocks>=3)
434 vaesdec %%ST3, %%T0
435%endif
436%if (%%num_blocks>=4)
437 vaesdec %%ST4, %%T0
438%endif
439%if (%%num_blocks>=5)
440 vaesdec %%ST5, %%T0
441%endif
442%if (%%num_blocks>=6)
443 vaesdec %%ST6, %%T0
444%endif
445%if (%%num_blocks>=7)
446 vaesdec %%ST7, %%T0
447%endif
448
449 %if (0 == %%lt128)
450 shl twtempl, 1
451 adc twtemph, twtemph
452 cmovc ghash_poly_8b_temp, ghash_poly_8b
453 xor twtempl, ghash_poly_8b_temp
454 mov [TW + 8*2], twtempl ; next Tweak2 generated
455 %endif
456
457 ; round 3
458 vmovdqa %%T0, [keys + 16*3]
459 vaesdec %%ST1, %%T0
460%if (%%num_blocks>=2)
461 vaesdec %%ST2, %%T0
462%endif
463%if (%%num_blocks>=3)
464 vaesdec %%ST3, %%T0
465%endif
466%if (%%num_blocks>=4)
467 vaesdec %%ST4, %%T0
468%endif
469%if (%%num_blocks>=5)
470 vaesdec %%ST5, %%T0
471%endif
472%if (%%num_blocks>=6)
473 vaesdec %%ST6, %%T0
474%endif
475%if (%%num_blocks>=7)
476 vaesdec %%ST7, %%T0
477%endif
478 %if (0 == %%lt128)
479 mov [TW + 8*3], twtemph
480 xor ghash_poly_8b_temp, ghash_poly_8b_temp
481 shl twtempl, 1
482 adc twtemph, twtemph
483 cmovc ghash_poly_8b_temp, ghash_poly_8b
484 %endif
485
486 ; round 4
487 vmovdqa %%T0, [keys + 16*4]
488 vaesdec %%ST1, %%T0
489%if (%%num_blocks>=2)
490 vaesdec %%ST2, %%T0
491%endif
492%if (%%num_blocks>=3)
493 vaesdec %%ST3, %%T0
494%endif
495%if (%%num_blocks>=4)
496 vaesdec %%ST4, %%T0
497%endif
498%if (%%num_blocks>=5)
499 vaesdec %%ST5, %%T0
500%endif
501%if (%%num_blocks>=6)
502 vaesdec %%ST6, %%T0
503%endif
504%if (%%num_blocks>=7)
505 vaesdec %%ST7, %%T0
506%endif
507
508 %if (0 == %%lt128)
509 xor twtempl, ghash_poly_8b_temp
510 mov [TW + 8*4], twtempl ; next Tweak3 generated
511 mov [TW + 8*5], twtemph
512 xor ghash_poly_8b_temp, ghash_poly_8b_temp
513 shl twtempl, 1
514 %endif
515
516 ; round 5
517 vmovdqa %%T0, [keys + 16*5]
518 vaesdec %%ST1, %%T0
519%if (%%num_blocks>=2)
520 vaesdec %%ST2, %%T0
521%endif
522%if (%%num_blocks>=3)
523 vaesdec %%ST3, %%T0
524%endif
525%if (%%num_blocks>=4)
526 vaesdec %%ST4, %%T0
527%endif
528%if (%%num_blocks>=5)
529 vaesdec %%ST5, %%T0
530%endif
531%if (%%num_blocks>=6)
532 vaesdec %%ST6, %%T0
533%endif
534%if (%%num_blocks>=7)
535 vaesdec %%ST7, %%T0
536%endif
537
538 %if (0 == %%lt128)
539 adc twtemph, twtemph
540 cmovc ghash_poly_8b_temp, ghash_poly_8b
541 xor twtempl, ghash_poly_8b_temp
542 mov [TW + 8*6], twtempl ; next Tweak4 generated
543 mov [TW + 8*7], twtemph
544 %endif
545
546 ; round 6
547 vmovdqa %%T0, [keys + 16*6]
548 vaesdec %%ST1, %%T0
549%if (%%num_blocks>=2)
550 vaesdec %%ST2, %%T0
551%endif
552%if (%%num_blocks>=3)
553 vaesdec %%ST3, %%T0
554%endif
555%if (%%num_blocks>=4)
556 vaesdec %%ST4, %%T0
557%endif
558%if (%%num_blocks>=5)
559 vaesdec %%ST5, %%T0
560%endif
561%if (%%num_blocks>=6)
562 vaesdec %%ST6, %%T0
563%endif
564%if (%%num_blocks>=7)
565 vaesdec %%ST7, %%T0
566%endif
567
568 %if (0 == %%lt128)
569 xor ghash_poly_8b_temp, ghash_poly_8b_temp
570 shl twtempl, 1
571 adc twtemph, twtemph
572 cmovc ghash_poly_8b_temp, ghash_poly_8b
573 xor twtempl, ghash_poly_8b_temp
574 mov [TW + 8*8], twtempl ; next Tweak5 generated
575 mov [TW + 8*9], twtemph
576 %endif
577
578 ; round 7
579 vmovdqa %%T0, [keys + 16*7]
580 vaesdec %%ST1, %%T0
581%if (%%num_blocks>=2)
582 vaesdec %%ST2, %%T0
583%endif
584%if (%%num_blocks>=3)
585 vaesdec %%ST3, %%T0
586%endif
587%if (%%num_blocks>=4)
588 vaesdec %%ST4, %%T0
589%endif
590%if (%%num_blocks>=5)
591 vaesdec %%ST5, %%T0
592%endif
593%if (%%num_blocks>=6)
594 vaesdec %%ST6, %%T0
595%endif
596%if (%%num_blocks>=7)
597 vaesdec %%ST7, %%T0
598%endif
599
600 %if (0 == %%lt128)
601 xor ghash_poly_8b_temp, ghash_poly_8b_temp
602 shl twtempl, 1
603 adc twtemph, twtemph
604 cmovc ghash_poly_8b_temp, ghash_poly_8b
605 xor twtempl, ghash_poly_8b_temp
606 mov [TW + 8*10], twtempl ; next Tweak6 generated
607 mov [TW + 8*11], twtemph
608 %endif
609 ; round 8
610 vmovdqa %%T0, [keys + 16*8]
611 vaesdec %%ST1, %%T0
612%if (%%num_blocks>=2)
613 vaesdec %%ST2, %%T0
614%endif
615%if (%%num_blocks>=3)
616 vaesdec %%ST3, %%T0
617%endif
618%if (%%num_blocks>=4)
619 vaesdec %%ST4, %%T0
620%endif
621%if (%%num_blocks>=5)
622 vaesdec %%ST5, %%T0
623%endif
624%if (%%num_blocks>=6)
625 vaesdec %%ST6, %%T0
626%endif
627%if (%%num_blocks>=7)
628 vaesdec %%ST7, %%T0
629%endif
630
631 %if (0 == %%lt128)
632 xor ghash_poly_8b_temp, ghash_poly_8b_temp
633 shl twtempl, 1
634 adc twtemph, twtemph
635 cmovc ghash_poly_8b_temp, ghash_poly_8b
636 xor twtempl, ghash_poly_8b_temp
637 mov [TW + 8*12], twtempl ; next Tweak7 generated
638 mov [TW + 8*13], twtemph
639 %endif
640 ; round 9
641 vmovdqa %%T0, [keys + 16*9]
642 vaesdec %%ST1, %%T0
643%if (%%num_blocks>=2)
644 vaesdec %%ST2, %%T0
645%endif
646%if (%%num_blocks>=3)
647 vaesdec %%ST3, %%T0
648%endif
649%if (%%num_blocks>=4)
650 vaesdec %%ST4, %%T0
651%endif
652%if (%%num_blocks>=5)
653 vaesdec %%ST5, %%T0
654%endif
655%if (%%num_blocks>=6)
656 vaesdec %%ST6, %%T0
657%endif
658%if (%%num_blocks>=7)
659 vaesdec %%ST7, %%T0
660%endif
661
662 %if (0 == %%lt128)
663 xor ghash_poly_8b_temp, ghash_poly_8b_temp
664 shl twtempl, 1
665 adc twtemph, twtemph
666 cmovc ghash_poly_8b_temp, ghash_poly_8b
667 xor twtempl, ghash_poly_8b_temp
668 mov [TW + 8*14], twtempl ; next Tweak8 generated
669 mov [TW + 8*15], twtemph
670 %endif
671
672 ; round 10
673 vmovdqa %%T0, [keys + 16*10]
674 vaesdeclast %%ST1, %%T0
675%if (%%num_blocks>=2)
676 vaesdeclast %%ST2, %%T0
677%endif
678%if (%%num_blocks>=3)
679 vaesdeclast %%ST3, %%T0
680%endif
681%if (%%num_blocks>=4)
682 vaesdeclast %%ST4, %%T0
683%endif
684%if (%%num_blocks>=5)
685 vaesdeclast %%ST5, %%T0
686%endif
687%if (%%num_blocks>=6)
688 vaesdeclast %%ST6, %%T0
689%endif
690%if (%%num_blocks>=7)
691 vaesdeclast %%ST7, %%T0
692%endif
693
694
695 ; xor Tweak values
696 vpxor %%ST1, %%TW1
697%if (%%num_blocks>=2)
698 vpxor %%ST2, %%TW2
699%endif
700%if (%%num_blocks>=3)
701 vpxor %%ST3, %%TW3
702%endif
703%if (%%num_blocks>=4)
704 vpxor %%ST4, %%TW4
705%endif
706%if (%%num_blocks>=5)
707 vpxor %%ST5, %%TW5
708%endif
709%if (%%num_blocks>=6)
710 vpxor %%ST6, %%TW6
711%endif
712%if (%%num_blocks>=7)
713 vpxor %%ST7, %%TW7
714%endif
715
716
717%if (0 == %%lt128)
718 ; load next Tweak values
719 vmovdqa %%TW1, [TW + 16*0]
720 vmovdqa %%TW2, [TW + 16*1]
721 vmovdqa %%TW3, [TW + 16*2]
722 vmovdqa %%TW4, [TW + 16*3]
723 vmovdqa %%TW5, [TW + 16*4]
724 vmovdqa %%TW6, [TW + 16*5]
725 vmovdqa %%TW7, [TW + 16*6]
726
727%endif
728
729%endmacro
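
; The tweak updates are interleaved between the AES rounds above so the scalar
; shl/adc/cmovc chains can execute in the shadow of the vaesdec latency instead of
; serializing after the decryption.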
730
731
732
733; Decrypt 8 blocks in parallel
734; generate next 8 tweak values
735%macro decrypt_by_eight_zmm 6
736%define %%ST1 %1 ; state 1
737%define %%ST2 %2 ; state 2
738%define %%TW1 %3 ; tweak 1
739%define %%TW2 %4 ; tweak 2
740%define %%T0 %5 ; Temp register
741%define %%last_eight %6
742
743 ; xor Tweak values
744 vpxorq %%ST1, %%TW1
745 vpxorq %%ST2, %%TW2
746
747 ; ARK
748 vbroadcasti32x4 %%T0, [keys]
749 vpxorq %%ST1, %%T0
750 vpxorq %%ST2, %%T0
751
752%if (0 == %%last_eight)
753 vpsrldq zmm13, %%TW1, 15
754 vpclmulqdq zmm14, zmm13, zpoly, 0
755 vpslldq zmm15, %%TW1, 1
756 vpxord zmm15, zmm15, zmm14
757%endif
758 ; round 1
759 vbroadcasti32x4 %%T0, [keys + 16*1]
760 vaesdec %%ST1, %%T0
761 vaesdec %%ST2, %%T0
762
763 ; round 2
764 vbroadcasti32x4 %%T0, [keys + 16*2]
765 vaesdec %%ST1, %%T0
766 vaesdec %%ST2, %%T0
767
768 ; round 3
769 vbroadcasti32x4 %%T0, [keys + 16*3]
770 vaesdec %%ST1, %%T0
771 vaesdec %%ST2, %%T0
772%if (0 == %%last_eight)
773 vpsrldq zmm13, %%TW2, 15
774 vpclmulqdq zmm14, zmm13, zpoly, 0
775 vpslldq zmm16, %%TW2, 1
776 vpxord zmm16, zmm16, zmm14
777%endif
778 ; round 4
779 vbroadcasti32x4 %%T0, [keys + 16*4]
780 vaesdec %%ST1, %%T0
781 vaesdec %%ST2, %%T0
782
783 ; round 5
784 vbroadcasti32x4 %%T0, [keys + 16*5]
785 vaesdec %%ST1, %%T0
786 vaesdec %%ST2, %%T0
787
788 ; round 6
789 vbroadcasti32x4 %%T0, [keys + 16*6]
790 vaesdec %%ST1, %%T0
791 vaesdec %%ST2, %%T0
792
793 ; round 7
794 vbroadcasti32x4 %%T0, [keys + 16*7]
795 vaesdec %%ST1, %%T0
796 vaesdec %%ST2, %%T0
797
798 ; round 8
799 vbroadcasti32x4 %%T0, [keys + 16*8]
800 vaesdec %%ST1, %%T0
801 vaesdec %%ST2, %%T0
802
803 ; round 9
804 vbroadcasti32x4 %%T0, [keys + 16*9]
805 vaesdec %%ST1, %%T0
806 vaesdec %%ST2, %%T0
807
808 ; round 10
809 vbroadcasti32x4 %%T0, [keys + 16*10]
810 vaesdeclast %%ST1, %%T0
811 vaesdeclast %%ST2, %%T0
812
813 ; xor Tweak values
814 vpxorq %%ST1, %%TW1
815 vpxorq %%ST2, %%TW2
816
817 ; load next Tweak values
818 vmovdqa32 %%TW1, zmm15
819 vmovdqa32 %%TW2, zmm16
820%endmacro
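
; Tweak math used above (and in decrypt_by_16_zmm below): with zpoly holding 0x87 in
; every 128-bit lane, the vpsrldq/vpclmulqdq/vpslldq/vpxord sequence multiplies each
; tweak lane by x^8 in one step: vpsrldq isolates the top byte (bits 120..127),
; vpclmulqdq carry-lessly multiplies it by 0x87 to form the reduction term, and
; vpslldq shifts the lane left by one byte (8 bits). Each register of 4 tweaks
; therefore advances 8 blocks per iteration.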
821
822
823; Decrypt 16 blocks in parallel
824; generate next 16 tweak values
825%macro decrypt_by_16_zmm 10
826%define %%ST1 %1 ; state 1
827%define %%ST2 %2 ; state 2
828%define %%ST3 %3 ; state 3
829%define %%ST4 %4 ; state 4
830
831%define %%TW1 %5 ; tweak 1
832%define %%TW2 %6 ; tweak 2
833%define %%TW3 %7 ; tweak 3
834%define %%TW4 %8 ; tweak 4
835
836%define %%T0 %9 ; Temp register
837%define %%last_eight %10
838
839 ; xor Tweak values
840 vpxorq %%ST1, %%TW1
841 vpxorq %%ST2, %%TW2
842 vpxorq %%ST3, %%TW3
843 vpxorq %%ST4, %%TW4
844
845 ; ARK
846 vbroadcasti32x4 %%T0, [keys]
847 vpxorq %%ST1, %%T0
848 vpxorq %%ST2, %%T0
849 vpxorq %%ST3, %%T0
850 vpxorq %%ST4, %%T0
851
852%if (0 == %%last_eight)
853 vpsrldq zmm13, %%TW3, 15
854 vpclmulqdq zmm14, zmm13, zpoly, 0
855 vpslldq zmm15, %%TW3, 1
856 vpxord zmm15, zmm15, zmm14
857%endif
858 ; round 1
859 vbroadcasti32x4 %%T0, [keys + 16*1]
860 vaesdec %%ST1, %%T0
861 vaesdec %%ST2, %%T0
862 vaesdec %%ST3, %%T0
863 vaesdec %%ST4, %%T0
864
865 ; round 2
866 vbroadcasti32x4 %%T0, [keys + 16*2]
867 vaesdec %%ST1, %%T0
868 vaesdec %%ST2, %%T0
869 vaesdec %%ST3, %%T0
870 vaesdec %%ST4, %%T0
871
872 ; round 3
873 vbroadcasti32x4 %%T0, [keys + 16*3]
874 vaesdec %%ST1, %%T0
875 vaesdec %%ST2, %%T0
876 vaesdec %%ST3, %%T0
877 vaesdec %%ST4, %%T0
878%if (0 == %%last_eight)
879 vpsrldq zmm13, %%TW4, 15
880 vpclmulqdq zmm14, zmm13, zpoly, 0
881 vpslldq zmm16, %%TW4, 1
882 vpxord zmm16, zmm16, zmm14
883%endif
884 ; round 4
885 vbroadcasti32x4 %%T0, [keys + 16*4]
886 vaesdec %%ST1, %%T0
887 vaesdec %%ST2, %%T0
888 vaesdec %%ST3, %%T0
889 vaesdec %%ST4, %%T0
890
891 ; round 5
892 vbroadcasti32x4 %%T0, [keys + 16*5]
893 vaesdec %%ST1, %%T0
894 vaesdec %%ST2, %%T0
895 vaesdec %%ST3, %%T0
896 vaesdec %%ST4, %%T0
897
898 ; round 6
899 vbroadcasti32x4 %%T0, [keys + 16*6]
900 vaesdec %%ST1, %%T0
901 vaesdec %%ST2, %%T0
902 vaesdec %%ST3, %%T0
903 vaesdec %%ST4, %%T0
904%if (0 == %%last_eight)
905 vpsrldq zmm13, zmm15, 15
906 vpclmulqdq zmm14, zmm13, zpoly, 0
907 vpslldq zmm17, zmm15, 1
908 vpxord zmm17, zmm17, zmm14
909%endif
910 ; round 7
911 vbroadcasti32x4 %%T0, [keys + 16*7]
912 vaesdec %%ST1, %%T0
913 vaesdec %%ST2, %%T0
914 vaesdec %%ST3, %%T0
915 vaesdec %%ST4, %%T0
916
917 ; round 8
918 vbroadcasti32x4 %%T0, [keys + 16*8]
919 vaesdec %%ST1, %%T0
920 vaesdec %%ST2, %%T0
921 vaesdec %%ST3, %%T0
922 vaesdec %%ST4, %%T0
923
924 ; round 9
925 vbroadcasti32x4 %%T0, [keys + 16*9]
926 vaesdec %%ST1, %%T0
927 vaesdec %%ST2, %%T0
928 vaesdec %%ST3, %%T0
929 vaesdec %%ST4, %%T0
930%if (0 == %%last_eight)
931 vpsrldq zmm13, zmm16, 15
932 vpclmulqdq zmm14, zmm13, zpoly, 0
933 vpslldq zmm18, zmm16, 1
934 vpxord zmm18, zmm18, zmm14
935%endif
936 ; round 10
937 vbroadcasti32x4 %%T0, [keys + 16*10]
938 vaesdeclast %%ST1, %%T0
939 vaesdeclast %%ST2, %%T0
940 vaesdeclast %%ST3, %%T0
941 vaesdeclast %%ST4, %%T0
942
943 ; xor Tweak values
944 vpxorq %%ST1, %%TW1
945 vpxorq %%ST2, %%TW2
946 vpxorq %%ST3, %%TW3
947 vpxorq %%ST4, %%TW4
948
949 ; load next Tweak values
950 vmovdqa32 %%TW1, zmm15
951 vmovdqa32 %%TW2, zmm16
952 vmovdqa32 %%TW3, zmm17
953 vmovdqa32 %%TW4, zmm18
954%endmacro
955
956
957section .text
958
959mk_global XTS_AES_128_dec_vaes, function
960XTS_AES_128_dec_vaes:
961 endbranch
962
963%define ALIGN_STACK
964%ifdef ALIGN_STACK
965 push rbp
966 mov rbp, rsp
967 sub rsp, VARIABLE_OFFSET
968 and rsp, ~63
969%else
970 sub rsp, VARIABLE_OFFSET
971%endif
972
973 mov [_gpr + 8*0], rbx
974%ifidn __OUTPUT_FORMAT__, win64
975 mov [_gpr + 8*1], rdi
976 mov [_gpr + 8*2], rsi
977
978 vmovdqa [_xmm + 16*0], xmm6
979 vmovdqa [_xmm + 16*1], xmm7
980 vmovdqa [_xmm + 16*2], xmm8
981 vmovdqa [_xmm + 16*3], xmm9
982 vmovdqa [_xmm + 16*4], xmm10
983 vmovdqa [_xmm + 16*5], xmm11
984 vmovdqa [_xmm + 16*6], xmm12
985 vmovdqa [_xmm + 16*7], xmm13
986 vmovdqa [_xmm + 16*8], xmm14
987 vmovdqa [_xmm + 16*9], xmm15
988%endif
989
990 mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
991
992
993 vmovdqu xmm1, [T_val] ; read initial Tweak value
994 vpxor xmm4, xmm4 ; for key expansion
995 encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
996
997
998%ifidn __OUTPUT_FORMAT__, win64
999 mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
1000 mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
1001%endif
1002
1003 cmp N_val, 128
1004 jl _less_than_128_bytes
1005
1006 vpbroadcastq zpoly, ghash_poly_8b
1007
1008 cmp N_val, 256
1009 jge _start_by16
1010
1011 cmp N_val, 128
1012 jge _start_by8
1013
1014_do_n_blocks:
1015 cmp N_val, 0
1016 je _ret_
1017
1018 cmp N_val, (7*16)
1019 jge _remaining_num_blocks_is_7
1020
1021 cmp N_val, (6*16)
1022 jge _remaining_num_blocks_is_6
1023
1024 cmp N_val, (5*16)
1025 jge _remaining_num_blocks_is_5
1026
1027 cmp N_val, (4*16)
1028 jge _remaining_num_blocks_is_4
1029
1030 cmp N_val, (3*16)
1031 jge _remaining_num_blocks_is_3
1032
1033 cmp N_val, (2*16)
1034 jge _remaining_num_blocks_is_2
1035
1036 cmp N_val, (1*16)
1037 jge _remaining_num_blocks_is_1
1038
1039;; _remaining_num_blocks_is_0:
1040 vmovdqu xmm1, [ptr_plaintext - 16] ; re-do the last block with the next tweak
1041 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
1042 vmovdqu [ptr_ciphertext - 16], xmm1
1043 vmovdqa xmm8, xmm1
1044
1045 ; Calc previous tweak
1046 mov tmp1, 1
1047 kmovq k1, tmp1
1048 vpsllq xmm13, xmm9, 63
1049 vpsraq xmm14, xmm13, 63
1050 vpandq xmm5, xmm14, XWORD(zpoly)
1051 vpxorq xmm9 {k1}, xmm9, xmm5
1052 vpsrldq xmm10, xmm9, 8
1053 vpshrdq xmm0, xmm9, xmm10, 1
1054 vpslldq xmm13, xmm13, 8
1055 vpxorq xmm0, xmm0, xmm13
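 ; The sequence above divides the tweak by x (the inverse of the doubling): if
 ; bit 0 is set, 0x87 is folded into the low qword, vpshrdq funnels the high
 ; qword into the low one to shift the 128-bit value right by one, and the
 ; shifted-out bit is restored as bit 127.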
1056 jmp _steal_cipher
1057
1058_remaining_num_blocks_is_7:
1059 mov tmp1, -1
1060 shr tmp1, 16
1061 kmovq k1, tmp1
1062 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1063 vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
1064 add ptr_plaintext, 16*7
1065 and N_val, 15
1066 je _done_7_remain
1067 vextracti32x4 xmm12, zmm10, 2
1068 vextracti32x4 xmm13, zmm10, 3
1069 vinserti32x4 zmm10, xmm13, 2
1070 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1071 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1072 vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
1073 add ptr_ciphertext, 16*7
1074 vextracti32x4 xmm8, zmm2, 0x2
1075 vmovdqa xmm0, xmm12
1076 jmp _steal_cipher
1077_done_7_remain:
1078 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1079 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1080 vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
1081 jmp _ret_
1082
1083_remaining_num_blocks_is_6:
1084 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1085 vmovdqu8 ymm2, [ptr_plaintext+16*4]
1086 add ptr_plaintext, 16*6
1087 and N_val, 15
1088 je _done_6_remain
1089 vextracti32x4 xmm12, zmm10, 1
1090 vextracti32x4 xmm13, zmm10, 2
1091 vinserti32x4 zmm10, xmm13, 1
1092 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1093 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1094 vmovdqu8 [ptr_ciphertext+16*4], ymm2
1095 add ptr_ciphertext, 16*6
1096 vextracti32x4 xmm8, zmm2, 0x1
1097 vmovdqa xmm0, xmm12
1098 jmp _steal_cipher
1099_done_6_remain:
1100 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1101 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1102 vmovdqu8 [ptr_ciphertext+16*4], ymm2
1103 jmp _ret_
1104
1105_remaining_num_blocks_is_5:
1106 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1107 vmovdqu xmm2, [ptr_plaintext+16*4]
1108 add ptr_plaintext, 16*5
1109 and N_val, 15
1110 je _done_5_remain
1111 vmovdqa xmm12, xmm10
1112 vextracti32x4 xmm10, zmm10, 1
1113 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1114 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1115 vmovdqu [ptr_ciphertext+16*4], xmm2
1116 add ptr_ciphertext, 16*5
1117 vmovdqa xmm8, xmm2
1118 vmovdqa xmm0, xmm12
1119 jmp _steal_cipher
1120_done_5_remain:
1121 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1122 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1123 vmovdqu [ptr_ciphertext+16*4], xmm2
1124 jmp _ret_
1125
1126_remaining_num_blocks_is_4:
1127 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1128 add ptr_plaintext, 16*4
1129 and N_val, 15
1130 je _done_4_remain
1131 vextracti32x4 xmm12, zmm9, 3
1132 vinserti32x4 zmm9, xmm10, 3
1133 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1134 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1135 add ptr_ciphertext, 16*4
1136 vextracti32x4 xmm8, zmm1, 0x3
1137 vmovdqa xmm0, xmm12
1138 jmp _steal_cipher
1139_done_4_remain:
1140 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1141 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1142 jmp _ret_
1143
1144_remaining_num_blocks_is_3:
1145 vmovdqu xmm1, [ptr_plaintext+16*0]
1146 vmovdqu xmm2, [ptr_plaintext+16*1]
1147 vmovdqu xmm3, [ptr_plaintext+16*2]
1148 add ptr_plaintext, 16*3
1149 and N_val, 15
1150 je _done_3_remain
1151 vextracti32x4 xmm13, zmm9, 2
1152 vextracti32x4 xmm10, zmm9, 1
1153 vextracti32x4 xmm11, zmm9, 3
1154 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
1155 vmovdqu [ptr_ciphertext+16*0], xmm1
1156 vmovdqu [ptr_ciphertext+16*1], xmm2
1157 vmovdqu [ptr_ciphertext+16*2], xmm3
1158 add ptr_ciphertext, 16*3
1159 vmovdqa xmm8, xmm3
1160 vmovdqa xmm0, xmm13
1161 jmp _steal_cipher
1162_done_3_remain:
1163 vextracti32x4 xmm10, zmm9, 1
1164 vextracti32x4 xmm11, zmm9, 2
1165 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
1166 vmovdqu [ptr_ciphertext+16*0], xmm1
1167 vmovdqu [ptr_ciphertext+16*1], xmm2
1168 vmovdqu [ptr_ciphertext+16*2], xmm3
1169 jmp _ret_
1170
1171_remaining_num_blocks_is_2:
1172 vmovdqu xmm1, [ptr_plaintext+16*0]
1173 vmovdqu xmm2, [ptr_plaintext+16*1]
1174 add ptr_plaintext, 16*2
1175 and N_val, 15
1176 je _done_2_remain
1177 vextracti32x4 xmm10, zmm9, 2
1178 vextracti32x4 xmm12, zmm9, 1
1179 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
1180 vmovdqu [ptr_ciphertext+16*0], xmm1
1181 vmovdqu [ptr_ciphertext+16*1], xmm2
1182 add ptr_ciphertext, 16*2
1183 vmovdqa xmm8, xmm2
1184 vmovdqa xmm0, xmm12
1185 jmp _steal_cipher
1186_done_2_remain:
1187 vextracti32x4 xmm10, zmm9, 1
1188 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
1189 vmovdqu [ptr_ciphertext+16*0], xmm1
1190 vmovdqu [ptr_ciphertext+16*1], xmm2
1191 jmp _ret_
1192
1193_remaining_num_blocks_is_1:
1194 vmovdqu xmm1, [ptr_plaintext]
1195 add ptr_plaintext, 16
1196 and N_val, 15
1197 je _done_1_remain
1198 vextracti32x4 xmm11, zmm9, 1
1199 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
1200 vmovdqu [ptr_ciphertext], xmm1
1201 add ptr_ciphertext, 16
1202 vmovdqa xmm8, xmm1
1203 vmovdqa xmm0, xmm9
1204 jmp _steal_cipher
1205_done_1_remain:
1206 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
1207 vmovdqu [ptr_ciphertext], xmm1
1208 jmp _ret_
1209
1210
1211
1212_start_by16:
1213 ; Make the first 8 tweak values (T*2^0 .. T*2^7)
1214 vbroadcasti32x4 zmm0, [TW]
1215 vbroadcasti32x4 zmm8, [shufb_15_7]
1216 mov tmp1, 0xaa
1217 kmovq k2, tmp1
1218
1219 ; Mult tweak by 2^{3, 2, 1, 0}
1220 vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
1221 vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
1222 vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
1223 vpclmulqdq zmm3, zmm2, zpoly, 0x00
1224 vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
1225 vpxord zmm9, zmm3, zmm4
1226
1227 ; Mult tweak by 2^{7, 6, 5, 4}
1228 vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
1229 vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
1230 vpclmulqdq zmm7, zmm6, zpoly, 0x00
1231 vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
1232 vpxord zmm10, zmm7, zmm5
1233
1234 ; Make the next 8 tweak values by multiplying all by 2^8
1235 vpsrldq zmm13, zmm9, 15
1236 vpclmulqdq zmm14, zmm13, zpoly, 0
1237 vpslldq zmm11, zmm9, 1
1238 vpxord zmm11, zmm11, zmm14
1239
1240 vpsrldq zmm15, zmm10, 15
1241 vpclmulqdq zmm16, zmm15, zpoly, 0
1242 vpslldq zmm12, zmm10, 1
1243 vpxord zmm12, zmm12, zmm16
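
; How the setup above builds T*x^i for i = 0..7 without a serial doubling chain:
; shufb_15_7 parks byte 15 of each tweak lane (the bits that overflow bit 127) in
; byte 0 and byte 7 (the bits that cross from the low into the high qword) in
; byte 8; vpsllvq shifts every qword left by its per-lane count, vpsrlvq recovers
; the bits shifted out, the {k2} merge (mask 0xaa = odd qwords) stitches the
; low-to-high qword carries back in, and vpclmulqdq by 0x87 reduces the bits that
; left bit 127.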
1244
1245_main_loop_run_16:
1246 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1247 vmovdqu8 zmm2, [ptr_plaintext+16*4]
1248 vmovdqu8 zmm3, [ptr_plaintext+16*8]
1249 vmovdqu8 zmm4, [ptr_plaintext+16*12]
1250 add ptr_plaintext, 256
1251
1252 decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
1253
1254 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1255 vmovdqu8 [ptr_ciphertext+16*4], zmm2
1256 vmovdqu8 [ptr_ciphertext+16*8], zmm3
1257 vmovdqu8 [ptr_ciphertext+16*12], zmm4
1258 add ptr_ciphertext, 256
1259 sub N_val, 256
1260 cmp N_val, 256
1261 jge _main_loop_run_16
1262
1263 cmp N_val, 128
1264 jge _main_loop_run_8
1265
1266 jmp _do_n_blocks
1267
1268_start_by8:
1269 ; Make the first 8 tweak values (T*2^0 .. T*2^7)
1270 vbroadcasti32x4 zmm0, [TW]
1271 vbroadcasti32x4 zmm8, [shufb_15_7]
1272 mov tmp1, 0xaa
1273 kmovq k2, tmp1
1274
1275 ; Mult tweak by 2^{3, 2, 1, 0}
1276 vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
1277 vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
1278 vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
1279 vpclmulqdq zmm3, zmm2, zpoly, 0x00
1280 vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
1281 vpxord zmm9, zmm3, zmm4
1282
1283 ; Mult tweak by 2^{7, 6, 5, 4}
1284 vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
1285 vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
1286 vpclmulqdq zmm7, zmm6, zpoly, 0x00
1287 vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
1288 vpxord zmm10, zmm7, zmm5
1289
1290_main_loop_run_8:
1291 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1292 vmovdqu8 zmm2, [ptr_plaintext+16*4]
1293 add ptr_plaintext, 128
1294
1295 decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
1296
1297 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1298 vmovdqu8 [ptr_ciphertext+16*4], zmm2
1299 add ptr_ciphertext, 128
1300 sub N_val, 128
1301 cmp N_val, 128
1302 jge _main_loop_run_8
1303
1304 jmp _do_n_blocks
1305
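; Ciphertext-stealing order for XTS decrypt: the paths that jump here have already
; decrypted the last full block with the *later* tweak (result in xmm8) and left
; the *earlier* tweak in xmm0. Below, the head of that block becomes the final
; partial output, its tail is grafted onto the partial input block, and the
; reassembled block is decrypted with xmm0.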
1306_steal_cipher:
1307 ; ciphertext stealing, simplified: xmm8 - last decrypted block, xmm0 - tweak for the reassembled final block
1308 vmovdqa xmm2, xmm8
1309
1310 ; shift xmm8 to the left by 16-N_val bytes
1311 lea twtempl, [vpshufb_shf_table]
1312 vmovdqu xmm10, [twtempl+N_val]
1313 vpshufb xmm8, xmm10
1314
1315 vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
1316 vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
1317
1318 ; shift xmm3 to the right by 16-N_val bytes
1319 lea twtempl, [vpshufb_shf_table +16]
1320 sub twtempl, N_val
1321 vmovdqu xmm10, [twtempl]
1322 vpxor xmm10, [mask1]
1323 vpshufb xmm3, xmm10
1324
1325 vpblendvb xmm3, xmm3, xmm2, xmm10
1326
1327 ; xor Tweak value
1328 vpxor xmm8, xmm3, xmm0
1329
1330 ;decrypt last block with cipher stealing
1331 vpxor xmm8, [keys] ; ARK
1332 vaesdec xmm8, [keys + 16*1] ; round 1
1333 vaesdec xmm8, [keys + 16*2] ; round 2
1334 vaesdec xmm8, [keys + 16*3] ; round 3
1335 vaesdec xmm8, [keys + 16*4] ; round 4
1336 vaesdec xmm8, [keys + 16*5] ; round 5
1337 vaesdec xmm8, [keys + 16*6] ; round 6
1338 vaesdec xmm8, [keys + 16*7] ; round 7
1339 vaesdec xmm8, [keys + 16*8] ; round 8
1340 vaesdec xmm8, [keys + 16*9] ; round 9
1341 vaesdeclast xmm8, [keys + 16*10] ; round 10
1342
1343 ; xor Tweak value
1344 vpxor xmm8, xmm8, xmm0
1345
1346_done:
1347 ; store last ciphertext value
1348 vmovdqu [ptr_ciphertext - 16], xmm8
1349
1350_ret_:
1351 mov rbx, [_gpr + 8*0]
1352
1353%ifidn __OUTPUT_FORMAT__, win64
1354 mov rdi, [_gpr + 8*1]
1355 mov rsi, [_gpr + 8*2]
1356
1357 vmovdqa xmm6, [_xmm + 16*0]
1358 vmovdqa xmm7, [_xmm + 16*1]
1359 vmovdqa xmm8, [_xmm + 16*2]
1360 vmovdqa xmm9, [_xmm + 16*3]
1361 vmovdqa xmm10, [_xmm + 16*4]
1362 vmovdqa xmm11, [_xmm + 16*5]
1363 vmovdqa xmm12, [_xmm + 16*6]
1364 vmovdqa xmm13, [_xmm + 16*7]
1365 vmovdqa xmm14, [_xmm + 16*8]
1366 vmovdqa xmm15, [_xmm + 16*9]
1367%endif
1368
1369%ifndef ALIGN_STACK
1370 add rsp, VARIABLE_OFFSET
1371%else
1372 mov rsp, rbp
1373 pop rbp
1374%endif
1375 ret
1376
1377
1378_less_than_128_bytes:
1379 cmp N_val, 16
1380 jb _ret_
1381
1382 mov tmp1, N_val
1383 and tmp1, (7 << 4)
1384 cmp tmp1, (6 << 4)
1385 je _num_blocks_is_6
1386 cmp tmp1, (5 << 4)
1387 je _num_blocks_is_5
1388 cmp tmp1, (4 << 4)
1389 je _num_blocks_is_4
1390 cmp tmp1, (3 << 4)
1391 je _num_blocks_is_3
1392 cmp tmp1, (2 << 4)
1393 je _num_blocks_is_2
1394 cmp tmp1, (1 << 4)
1395 je _num_blocks_is_1
1396
1397_num_blocks_is_7:
1398 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
1399 add ptr_plaintext, 16*7
1400 and N_val, 15
1401 je _done_7
1402
1403_steal_cipher_7:
1404 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1405 shl twtempl, 1
1406 adc twtemph, twtemph
1407 cmovc ghash_poly_8b_temp, ghash_poly_8b
1408 xor twtempl, ghash_poly_8b_temp
1409 mov [TW+8*2], twtempl
1410 mov [TW+8*3], twtemph
1411 vmovdqa64 xmm16, xmm15
1412 vmovdqa xmm15, [TW+16*1]
1413
1414 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
1415 vmovdqu [ptr_ciphertext+16*0], xmm1
1416 vmovdqu [ptr_ciphertext+16*1], xmm2
1417 vmovdqu [ptr_ciphertext+16*2], xmm3
1418 vmovdqu [ptr_ciphertext+16*3], xmm4
1419 vmovdqu [ptr_ciphertext+16*4], xmm5
1420 vmovdqu [ptr_ciphertext+16*5], xmm6
1421 add ptr_ciphertext, 16*7
1422 vmovdqa64 xmm0, xmm16
1423 vmovdqa xmm8, xmm7
1424 jmp _steal_cipher
1425
1426_done_7:
1427 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
1428 vmovdqu [ptr_ciphertext+16*0], xmm1
1429 vmovdqu [ptr_ciphertext+16*1], xmm2
1430 vmovdqu [ptr_ciphertext+16*2], xmm3
1431 vmovdqu [ptr_ciphertext+16*3], xmm4
1432 vmovdqu [ptr_ciphertext+16*4], xmm5
1433 vmovdqu [ptr_ciphertext+16*5], xmm6
1434 add ptr_ciphertext, 16*7
1435 vmovdqa xmm8, xmm7
1436 jmp _done
1437
1438_num_blocks_is_6:
1439 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
1440 add ptr_plaintext, 16*6
1441 and N_val, 15
1442 je _done_6
1443
1444_steal_cipher_6:
1445 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1446 shl twtempl, 1
1447 adc twtemph, twtemph
1448 cmovc ghash_poly_8b_temp, ghash_poly_8b
1449 xor twtempl, ghash_poly_8b_temp
1450 mov [TW+8*2], twtempl
1451 mov [TW+8*3], twtemph
1452 vmovdqa xmm15, xmm14
1453 vmovdqa xmm14, [TW+16*1]
1454
1455 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
1456 vmovdqu [ptr_ciphertext+16*0], xmm1
1457 vmovdqu [ptr_ciphertext+16*1], xmm2
1458 vmovdqu [ptr_ciphertext+16*2], xmm3
1459 vmovdqu [ptr_ciphertext+16*3], xmm4
1460 vmovdqu [ptr_ciphertext+16*4], xmm5
1461 add ptr_ciphertext, 16*6
1462 vmovdqa xmm0, xmm15
1463 vmovdqa xmm8, xmm6
1464 jmp _steal_cipher
1465
1466_done_6:
1467 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
1468 vmovdqu [ptr_ciphertext+16*0], xmm1
1469 vmovdqu [ptr_ciphertext+16*1], xmm2
1470 vmovdqu [ptr_ciphertext+16*2], xmm3
1471 vmovdqu [ptr_ciphertext+16*3], xmm4
1472 vmovdqu [ptr_ciphertext+16*4], xmm5
1473 add ptr_ciphertext, 16*6
1474 vmovdqa xmm8, xmm6
1475 jmp _done
1476
1477_num_blocks_is_5:
1478 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
1479 add ptr_plaintext, 16*5
1480 and N_val, 15
1481 je _done_5
1482
1483_steal_cipher_5:
1484 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1485 shl twtempl, 1
1486 adc twtemph, twtemph
1487 cmovc ghash_poly_8b_temp, ghash_poly_8b
1488 xor twtempl, ghash_poly_8b_temp
1489 mov [TW+8*2], twtempl
1490 mov [TW+8*3], twtemph
1491 vmovdqa xmm14, xmm13
1492 vmovdqa xmm13, [TW+16*1]
1493
1494 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
1495 vmovdqu [ptr_ciphertext+16*0], xmm1
1496 vmovdqu [ptr_ciphertext+16*1], xmm2
1497 vmovdqu [ptr_ciphertext+16*2], xmm3
1498 vmovdqu [ptr_ciphertext+16*3], xmm4
1499 add ptr_ciphertext, 16*5
1500 vmovdqa xmm0, xmm14
1501 vmovdqa xmm8, xmm5
1502 jmp _steal_cipher
1503
1504_done_5:
1505 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
1506 vmovdqu [ptr_ciphertext+16*0], xmm1
1507 vmovdqu [ptr_ciphertext+16*1], xmm2
1508 vmovdqu [ptr_ciphertext+16*2], xmm3
1509 vmovdqu [ptr_ciphertext+16*3], xmm4
1510 add ptr_ciphertext, 16*5
1511 vmovdqa xmm8, xmm5
1512 jmp _done
1513
1514_num_blocks_is_4:
1515 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
1516 add ptr_plaintext, 16*4
1517 and N_val, 15
1518 je _done_4
1519
1520_steal_cipher_4:
1521 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1522 shl twtempl, 1
1523 adc twtemph, twtemph
1524 cmovc ghash_poly_8b_temp, ghash_poly_8b
1525 xor twtempl, ghash_poly_8b_temp
1526 mov [TW+8*2], twtempl
1527 mov [TW+8*3], twtemph
1528 vmovdqa xmm13, xmm12
1529 vmovdqa xmm12, [TW+16*1]
1530
1531 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
1532 vmovdqu [ptr_ciphertext+16*0], xmm1
1533 vmovdqu [ptr_ciphertext+16*1], xmm2
1534 vmovdqu [ptr_ciphertext+16*2], xmm3
1535 add ptr_ciphertext, 16*4
1536 vmovdqa xmm0, xmm13
1537 vmovdqa xmm8, xmm4
1538 jmp _steal_cipher
1539
1540_done_4:
1541 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
1542 vmovdqu [ptr_ciphertext+16*0], xmm1
1543 vmovdqu [ptr_ciphertext+16*1], xmm2
1544 vmovdqu [ptr_ciphertext+16*2], xmm3
1545 add ptr_ciphertext, 16*4
1546 vmovdqa xmm8, xmm4
1547 jmp _done
1548
1549_num_blocks_is_3:
1550 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
1551 add ptr_plaintext, 16*3
1552 and N_val, 15
1553 je _done_3
1554
1555_steal_cipher_3:
1556 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1557 shl twtempl, 1
1558 adc twtemph, twtemph
1559 cmovc ghash_poly_8b_temp, ghash_poly_8b
1560 xor twtempl, ghash_poly_8b_temp
1561 mov [TW+8*2], twtempl
1562 mov [TW+8*3], twtemph
1563 vmovdqa xmm12, xmm11
1564 vmovdqa xmm11, [TW+16*1]
1565
1566 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1567 vmovdqu [ptr_ciphertext+16*0], xmm1
1568 vmovdqu [ptr_ciphertext+16*1], xmm2
1569 add ptr_ciphertext, 16*3
1570 vmovdqa xmm0, xmm12
1571 vmovdqa xmm8, xmm3
1572 jmp _steal_cipher
1573
1574_done_3:
1575 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1576 vmovdqu [ptr_ciphertext+16*0], xmm1
1577 vmovdqu [ptr_ciphertext+16*1], xmm2
1578 add ptr_ciphertext, 16*3
1579 vmovdqa xmm8, xmm3
1580 jmp _done
1581
1582_num_blocks_is_2:
1583 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
1584 add ptr_plaintext, 16*2
1585 and N_val, 15
1586 je _done_2
1587
1588_steal_cipher_2:
1589 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1590 shl twtempl, 1
1591 adc twtemph, twtemph
1592 cmovc ghash_poly_8b_temp, ghash_poly_8b
1593 xor twtempl, ghash_poly_8b_temp
1594 mov [TW+8*2], twtempl
1595 mov [TW+8*3], twtemph
1596 vmovdqa xmm11, xmm10
1597 vmovdqa xmm10, [TW+16*1]
1598
1599 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1600 vmovdqu [ptr_ciphertext], xmm1
1601 add ptr_ciphertext, 16*2
1602 vmovdqa xmm0, xmm11
1603 vmovdqa xmm8, xmm2
1604 jmp _steal_cipher
1605
1606_done_2:
1607 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1608 vmovdqu [ptr_ciphertext], xmm1
1609 add ptr_ciphertext, 16*2
1610 vmovdqa xmm8, xmm2
1611 jmp _done
1612
1613_num_blocks_is_1:
1614 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
1615 add ptr_plaintext, 16*1
1616 and N_val, 15
1617 je _done_1
1618
1619_steal_cipher_1:
1620 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1621 shl twtempl, 1
1622 adc twtemph, twtemph
1623 cmovc ghash_poly_8b_temp, ghash_poly_8b
1624 xor twtempl, ghash_poly_8b_temp
1625 mov [TW+8*2], twtempl
1626 mov [TW+8*3], twtemph
1627 vmovdqa xmm10, xmm9
1628 vmovdqa xmm9, [TW+16*1]
1629
1630 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1631 add ptr_ciphertext, 16*1
1632 vmovdqa xmm0, xmm10
1633 vmovdqa xmm8, xmm1
1634 jmp _steal_cipher
1635
1636_done_1:
1637 decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1638 add ptr_ciphertext, 16*1
1639 vmovdqa xmm8, xmm1
1640 jmp _done
1641
1642section .data
1643align 16
1644
1645vpshufb_shf_table:
1646; use these values for shift constants for the vpshufb instruction
1647; different alignments result in values as shown:
1648; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
649; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
650; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
1651; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
1652; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
1653; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
1654; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
1655; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
1656; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
1657; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
1658; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
1659; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
1660; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
1661; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
1662; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
1663dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
1664dq 0x0706050403020100, 0x000e0d0c0b0a0908
1665
1666mask1:
1667dq 0x8080808080808080, 0x8080808080808080
1668
1669const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
1670const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
1671const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
1672const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
1673
1674shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
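
; shufb_15_7: within each 128-bit lane, moves byte 15 to byte 0 and byte 7 to
; byte 8 (all other bytes zeroed), positioning the bits carried out of each qword
; for the variable shifts driven by the const_dq* tables above.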
1675
676%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows builds.
1677%ifidn __OUTPUT_FORMAT__, win64
1678global no_XTS_AES_128_dec_vaes
1679no_XTS_AES_128_dec_vaes:
1680%endif
1681%endif ; (AS_FEATURE_LEVEL) >= 10