;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; XTS decrypt function with 256-bit AES
; input keys are not aligned
; keys are expanded in parallel with the tweak encryption
; plaintext and ciphertext are not aligned
; the second key is stored on the stack, aligned to 16 bytes
; the first key is required only once, so it is not stored

%include "reg_sizes.asm"

%if (AS_FEATURE_LEVEL) >= 10

default rel
%define TW rsp                  ; store 8 tweak values
%define keys rsp + 16*8         ; store 15 expanded keys

%ifidn __OUTPUT_FORMAT__, win64
        %define _xmm rsp + 16*23        ; store xmm6:xmm15
%endif

%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr rsp + 16*23        ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1              ; VARIABLE_OFFSET has to be an odd multiple of 8
%else
%define _gpr rsp + 16*33        ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3      ; VARIABLE_OFFSET has to be an odd multiple of 8
%endif

%define GHASH_POLY 0x87
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_256_dec_vaes(
;               UINT8 *k2,              // key used for tweaking, 16*2 bytes
;               UINT8 *k1,              // key used for "ECB" decryption, 16*2 bytes
;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
;               UINT64 N,               // sector size, in bytes
;               const UINT8 *ct,        // ciphertext sector input data
;               UINT8 *pt);             // plaintext sector output data
; (internally the input pointer keeps the name ptr_plaintext and the
;  output pointer the name ptr_ciphertext)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; arguments for input parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %xdefine ptr_key2 rdi
        %xdefine ptr_key1 rsi
        %xdefine T_val rdx
        %xdefine N_val rcx
        %xdefine ptr_plaintext r8
        %xdefine ptr_ciphertext r9
%else
        %xdefine ptr_key2 rcx
        %xdefine ptr_key1 rdx
        %xdefine T_val r8
        %xdefine N_val r9
        %xdefine ptr_plaintext r10      ; [rsp + VARIABLE_OFFSET + 8*5]
        %xdefine ptr_ciphertext r11     ; [rsp + VARIABLE_OFFSET + 8*6]
%endif

; arguments for temp parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %define tmp1 rdi
        %define ghash_poly_8b r10
        %define ghash_poly_8b_temp r11
%else
        %define tmp1 rcx
        %define ghash_poly_8b rdi
        %define ghash_poly_8b_temp rsi
%endif

%define twtempl rax     ; global temp registers used for tweak computation
%define twtemph rbx
%define zpoly zmm25

; produce the key for the next round
; raw_key is the output of the vaeskeygenassist instruction
; round_key value before this macro is the current round key
; round_key value after this macro is the next round key
; the two macros below are used for key generation in a flip-flopped fashion
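; (AES-256 schedule note: "flip" broadcasts dword 3 of the vaeskeygenassist
;  result - RotWord(SubWord(w)) xor rcon - for one half of each round pair,
;  while "flop" broadcasts dword 2 - SubWord(w) only - for the half of the
;  AES-256 schedule that takes no rotation or rcon)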
%macro key_expansion_256_flip 3
%define %%xraw_key %1
%define %%xtmp %2
%define %%xround_key %3
        vpshufd %%xraw_key, %%xraw_key, 11111111b
        vshufps %%xtmp, %%xround_key, 00010000b
        vpxor   %%xround_key, %%xtmp
        vshufps %%xtmp, %%xround_key, 10001100b
        vpxor   %%xround_key, %%xtmp
        vpxor   %%xround_key, %%xraw_key
%endmacro

%macro key_expansion_256_flop 3
%define %%xraw_key %1
%define %%xtmp %2
%define %%xround_key %3
        vpshufd %%xraw_key, %%xraw_key, 10101010b
        vshufps %%xtmp, %%xround_key, 00010000b
        vpxor   %%xround_key, %%xtmp
        vshufps %%xtmp, %%xround_key, 10001100b
        vpxor   %%xround_key, %%xtmp
        vpxor   %%xround_key, %%xraw_key
%endmacro

; macro to encrypt the tweak value in parallel with key generation of both keys
%macro encrypt_T 11
%define %%xkey2 %1
%define %%xkey2_2 %2
%define %%xstate_tweak %3
%define %%xkey1 %4
%define %%xkey1_2 %5
%define %%xraw_key %6
%define %%xtmp %7
%define %%xtmp2 %8
%define %%ptr_key2 %9
%define %%ptr_key1 %10
%define %%ptr_expanded_keys %11

        vmovdqu %%xkey2, [%%ptr_key2]
        vpxor   %%xstate_tweak, %%xkey2         ; ARK for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1]
        vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1

        vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
        vaesenc %%xstate_tweak, %%xkey2_2       ; round 1 for tweak encryption

        vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
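        ; Note: vaesimc applies InvMixColumns to the encryption round keys so
        ; they can be used with vaesdec (the Equivalent Inverse Cipher). The
        ; schedule is stored in reverse order - round 14 at [keys+16*0] down
        ; to round 0 at [keys+16*14] - and the first and last round keys are
        ; stored unchanged.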

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1     ; Generating round key 2 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1     ; Generating round key 2 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenc %%xstate_tweak, %%xkey2                 ; round 2 for tweak encryption
        vaesimc %%xtmp2, %%xkey1
        vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2, 0x1       ; Generating round key 3 for key2
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
        vaeskeygenassist %%xraw_key, %%xkey1, 0x1       ; Generating round key 3 for key1
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
        vaesenc %%xstate_tweak, %%xkey2_2               ; round 3 for tweak encryption
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2     ; Generating round key 4 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2     ; Generating round key 4 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenc %%xstate_tweak, %%xkey2                 ; round 4 for tweak encryption
        vaesimc %%xtmp2, %%xkey1
        vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2, 0x2       ; Generating round key 5 for key2
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
        vaeskeygenassist %%xraw_key, %%xkey1, 0x2       ; Generating round key 5 for key1
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
        vaesenc %%xstate_tweak, %%xkey2_2               ; round 5 for tweak encryption
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4     ; Generating round key 6 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4     ; Generating round key 6 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenc %%xstate_tweak, %%xkey2                 ; round 6 for tweak encryption
        vaesimc %%xtmp2, %%xkey1
        vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2, 0x4       ; Generating round key 7 for key2
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
        vaeskeygenassist %%xraw_key, %%xkey1, 0x4       ; Generating round key 7 for key1
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
        vaesenc %%xstate_tweak, %%xkey2_2               ; round 7 for tweak encryption
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8     ; Generating round key 8 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8     ; Generating round key 8 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenc %%xstate_tweak, %%xkey2                 ; round 8 for tweak encryption
        vaesimc %%xtmp2, %%xkey1
        vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2, 0x8       ; Generating round key 9 for key2
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
        vaeskeygenassist %%xraw_key, %%xkey1, 0x8       ; Generating round key 9 for key1
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
        vaesenc %%xstate_tweak, %%xkey2_2               ; round 9 for tweak encryption
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10    ; Generating round key 10 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10    ; Generating round key 10 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenc %%xstate_tweak, %%xkey2                 ; round 10 for tweak encryption
        vaesimc %%xtmp2, %%xkey1
        vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2, 0x10      ; Generating round key 11 for key2
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
        vaeskeygenassist %%xraw_key, %%xkey1, 0x10      ; Generating round key 11 for key1
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
        vaesenc %%xstate_tweak, %%xkey2_2               ; round 11 for tweak encryption
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20    ; Generating round key 12 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20    ; Generating round key 12 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenc %%xstate_tweak, %%xkey2                 ; round 12 for tweak encryption
        vaesimc %%xtmp2, %%xkey1
        vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2, 0x20      ; Generating round key 13 for key2
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
        vaeskeygenassist %%xraw_key, %%xkey1, 0x20      ; Generating round key 13 for key1
        key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
        vaesenc %%xstate_tweak, %%xkey2_2               ; round 13 for tweak encryption
        vaesimc %%xtmp2, %%xkey1_2
        vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2

        vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40    ; Generating round key 14 for key2
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
        vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40    ; Generating round key 14 for key1
        key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
        vaesenclast %%xstate_tweak, %%xkey2             ; round 14 for tweak encryption
        vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1

        vmovdqa [TW], %%xstate_tweak    ; store the encrypted Tweak value
%endmacro

; Original way to generate initial tweak values and load plaintext values
; only used for small blocks
%macro initialize 16

%define %%ST1 %1        ; state 1
%define %%ST2 %2        ; state 2
%define %%ST3 %3        ; state 3
%define %%ST4 %4        ; state 4
%define %%ST5 %5        ; state 5
%define %%ST6 %6        ; state 6
%define %%ST7 %7        ; state 7
%define %%ST8 %8        ; state 8

%define %%TW1 %9        ; tweak 1
%define %%TW2 %10       ; tweak 2
%define %%TW3 %11       ; tweak 3
%define %%TW4 %12       ; tweak 4
%define %%TW5 %13       ; tweak 5
%define %%TW6 %14       ; tweak 6
%define %%TW7 %15       ; tweak 7

%define %%num_initial_blocks %16

        ; generate next Tweak values
        vmovdqa %%TW1, [TW+16*0]
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        vmovdqu %%ST1, [ptr_plaintext+16*0]
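        ; Each conditional block below multiplies the previous tweak by x in
        ; GF(2^128): shift the 128-bit value left by one bit (shl low qword,
        ; adc high qword) and, if a bit carried out of bit 127, xor the
        ; reduction constant 0x87 into the low qword (selected branchlessly
        ; with cmovc).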
%if (%%num_initial_blocks>=2)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa %%TW2, [TW+16*1]
        vmovdqu %%ST2, [ptr_plaintext+16*1]
%endif
%if (%%num_initial_blocks>=3)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        vmovdqa %%TW3, [TW+16*2]
        vmovdqu %%ST3, [ptr_plaintext+16*2]
%endif
%if (%%num_initial_blocks>=4)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        vmovdqa %%TW4, [TW+16*3]
        vmovdqu %%ST4, [ptr_plaintext+16*3]
%endif
%if (%%num_initial_blocks>=5)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        vmovdqa %%TW5, [TW+16*4]
        vmovdqu %%ST5, [ptr_plaintext+16*4]
%endif
%if (%%num_initial_blocks>=6)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        vmovdqa %%TW6, [TW+16*5]
        vmovdqu %%ST6, [ptr_plaintext+16*5]
%endif
%if (%%num_initial_blocks>=7)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        vmovdqa %%TW7, [TW+16*6]
        vmovdqu %%ST7, [ptr_plaintext+16*6]
%endif

%endmacro

; Original routine to decrypt the initial blocks of AES
; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
; next 8 Tweak values can be generated
%macro decrypt_initial 18
%define %%ST1 %1        ; state 1
%define %%ST2 %2        ; state 2
%define %%ST3 %3        ; state 3
%define %%ST4 %4        ; state 4
%define %%ST5 %5        ; state 5
%define %%ST6 %6        ; state 6
%define %%ST7 %7        ; state 7
%define %%ST8 %8        ; state 8

%define %%TW1 %9        ; tweak 1
%define %%TW2 %10       ; tweak 2
%define %%TW3 %11       ; tweak 3
%define %%TW4 %12       ; tweak 4
%define %%TW5 %13       ; tweak 5
%define %%TW6 %14       ; tweak 6
%define %%TW7 %15       ; tweak 7
%define %%T0 %16        ; temp register
%define %%num_blocks %17
; %%num_blocks blocks decrypted
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7

%define %%lt128 %18     ; less than 128 bytes
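; When %%lt128 == 0, the scalar GF(2^128) doubling sequence is interleaved
; with the AES rounds below, so the next 8 tweak values are written to the
; stack while the vaesdec dependency chains are in flight.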

        ; xor Tweak value
        vpxor   %%ST1, %%TW1
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%TW7
%endif

        ; ARK
        vmovdqa %%T0, [keys]
        vpxor   %%ST1, %%T0
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

        ; round 1
        vmovdqa %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl     ; next Tweak1 generated
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 2
        vmovdqa %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*2], twtempl     ; next Tweak2 generated
%endif

        ; round 3
        vmovdqa %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 4
        vmovdqa %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl     ; next Tweak3 generated
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif

        ; round 5
        vmovdqa %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl     ; next Tweak4 generated
        mov     [TW + 8*7], twtemph
%endif

        ; round 6
        vmovdqa %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl     ; next Tweak5 generated
        mov     [TW + 8*9], twtemph
%endif

        ; round 7
        vmovdqa %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*10], twtempl    ; next Tweak6 generated
        mov     [TW + 8*11], twtemph
%endif
        ; round 8
        vmovdqa %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl    ; next Tweak7 generated
        mov     [TW + 8*13], twtemph
%endif
        ; round 9
        vmovdqa %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*14], twtempl    ; next Tweak8 generated
        mov     [TW + 8*15], twtemph
%endif
        ; round 10
        vmovdqa %%T0, [keys + 16*10]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
        ; round 11
        vmovdqa %%T0, [keys + 16*11]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

        ; round 12
        vmovdqa %%T0, [keys + 16*12]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

        ; round 13
        vmovdqa %%T0, [keys + 16*13]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

        ; round 14
        vmovdqa %%T0, [keys + 16*14]
        vaesdeclast %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdeclast %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdeclast %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdeclast %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdeclast %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdeclast %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdeclast %%ST7, %%T0
%endif

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%TW7
%endif

%if (0 == %%lt128)
        ; load next Tweak values
        vmovdqa %%TW1, [TW + 16*0]
        vmovdqa %%TW2, [TW + 16*1]
        vmovdqa %%TW3, [TW + 16*2]
        vmovdqa %%TW4, [TW + 16*3]
        vmovdqa %%TW5, [TW + 16*4]
        vmovdqa %%TW6, [TW + 16*5]
        vmovdqa %%TW7, [TW + 16*6]
%endif

%endmacro


; Decrypt 8 blocks in parallel
; generate next 8 tweak values
%macro decrypt_by_eight_zmm 6
%define %%ST1 %1        ; state 1
%define %%ST2 %2        ; state 2
%define %%TW1 %3        ; tweak 1
%define %%TW2 %4        ; tweak 2
%define %%T0 %5         ; temp register
%define %%last_eight %6

        ; xor Tweak values
        vpxorq  %%ST1, %%TW1
        vpxorq  %%ST2, %%TW2

        ; ARK
        vbroadcasti32x4 %%T0, [keys]
        vpxorq  %%ST1, %%T0
        vpxorq  %%ST2, %%T0

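; Advance the tweaks for the next pass: each 128-bit lane of a zmm holds one
; tweak, and one pass of 8 blocks multiplies every tweak by x^8. vpsrldq
; isolates the top byte of each lane, vpclmulqdq reduces it by the
; polynomial, and vpslldq shifts each lane left by one byte.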
%if (0 == %%last_eight)
        vpsrldq zmm13, %%TW1, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm15, %%TW1, 1
        vpxord  zmm15, zmm15, zmm14
%endif
        ; round 1
        vbroadcasti32x4 %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 2
        vbroadcasti32x4 %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 3
        vbroadcasti32x4 %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
%if (0 == %%last_eight)
        vpsrldq zmm13, %%TW2, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm16, %%TW2, 1
        vpxord  zmm16, zmm16, zmm14
%endif
        ; round 4
        vbroadcasti32x4 %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 5
        vbroadcasti32x4 %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 6
        vbroadcasti32x4 %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 7
        vbroadcasti32x4 %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 8
        vbroadcasti32x4 %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 9
        vbroadcasti32x4 %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 10
        vbroadcasti32x4 %%T0, [keys + 16*10]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 11
        vbroadcasti32x4 %%T0, [keys + 16*11]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 12
        vbroadcasti32x4 %%T0, [keys + 16*12]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 13
        vbroadcasti32x4 %%T0, [keys + 16*13]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0

        ; round 14
        vbroadcasti32x4 %%T0, [keys + 16*14]
        vaesdeclast %%ST1, %%T0
        vaesdeclast %%ST2, %%T0

        ; xor Tweak values
        vpxorq  %%ST1, %%TW1
        vpxorq  %%ST2, %%TW2

        ; load next Tweak values
        vmovdqa32 %%TW1, zmm15
        vmovdqa32 %%TW2, zmm16
%endmacro


; Decrypt 16 blocks in parallel
; generate next 16 tweak values
%macro decrypt_by_16_zmm 10
%define %%ST1 %1        ; state 1
%define %%ST2 %2        ; state 2
%define %%ST3 %3        ; state 3
%define %%ST4 %4        ; state 4

%define %%TW1 %5        ; tweak 1
%define %%TW2 %6        ; tweak 2
%define %%TW3 %7        ; tweak 3
%define %%TW4 %8        ; tweak 4

%define %%T0 %9         ; temp register
%define %%last_eight %10

        ; xor Tweak values
        vpxorq  %%ST1, %%TW1
        vpxorq  %%ST2, %%TW2
        vpxorq  %%ST3, %%TW3
        vpxorq  %%ST4, %%TW4

        ; ARK
        vbroadcasti32x4 %%T0, [keys]
        vpxorq  %%ST1, %%T0
        vpxorq  %%ST2, %%T0
        vpxorq  %%ST3, %%T0
        vpxorq  %%ST4, %%T0

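; Tweak advance for the 16-block loop: zmm15/zmm16 = TW3/TW4 * x^8 and
; zmm17/zmm18 = zmm15/zmm16 * x^8, so every tweak moves forward by x^16 per
; iteration (same vpsrldq/vpclmulqdq/vpslldq reduction as the 8-block path).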
%if (0 == %%last_eight)
        vpsrldq zmm13, %%TW3, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm15, %%TW3, 1
        vpxord  zmm15, zmm15, zmm14
%endif
        ; round 1
        vbroadcasti32x4 %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 2
        vbroadcasti32x4 %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 3
        vbroadcasti32x4 %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
%if (0 == %%last_eight)
        vpsrldq zmm13, %%TW4, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm16, %%TW4, 1
        vpxord  zmm16, zmm16, zmm14
%endif
        ; round 4
        vbroadcasti32x4 %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 5
        vbroadcasti32x4 %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 6
        vbroadcasti32x4 %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
%if (0 == %%last_eight)
        vpsrldq zmm13, zmm15, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm17, zmm15, 1
        vpxord  zmm17, zmm17, zmm14
%endif
        ; round 7
        vbroadcasti32x4 %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 8
        vbroadcasti32x4 %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 9
        vbroadcasti32x4 %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
%if (0 == %%last_eight)
        vpsrldq zmm13, zmm16, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm18, zmm16, 1
        vpxord  zmm18, zmm18, zmm14
%endif
        ; round 10
        vbroadcasti32x4 %%T0, [keys + 16*10]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 11
        vbroadcasti32x4 %%T0, [keys + 16*11]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 12
        vbroadcasti32x4 %%T0, [keys + 16*12]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 13
        vbroadcasti32x4 %%T0, [keys + 16*13]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0

        ; round 14
        vbroadcasti32x4 %%T0, [keys + 16*14]
        vaesdeclast %%ST1, %%T0
        vaesdeclast %%ST2, %%T0
        vaesdeclast %%ST3, %%T0
        vaesdeclast %%ST4, %%T0

        ; xor Tweak values
        vpxorq  %%ST1, %%TW1
        vpxorq  %%ST2, %%TW2
        vpxorq  %%ST3, %%TW3
        vpxorq  %%ST4, %%TW4

        ; load next Tweak values
        vmovdqa32 %%TW1, zmm15
        vmovdqa32 %%TW2, zmm16
        vmovdqa32 %%TW3, zmm17
        vmovdqa32 %%TW4, zmm18
%endmacro


section .text

mk_global XTS_AES_256_dec_vaes, function
XTS_AES_256_dec_vaes:
        endbranch

%define ALIGN_STACK
%ifdef ALIGN_STACK
        push    rbp
        mov     rbp, rsp
        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63
%else
        sub     rsp, VARIABLE_OFFSET
%endif

        mov     [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
        mov     [_gpr + 8*1], rdi
        mov     [_gpr + 8*2], rsi

        vmovdqa [_xmm + 16*0], xmm6
        vmovdqa [_xmm + 16*1], xmm7
        vmovdqa [_xmm + 16*2], xmm8
        vmovdqa [_xmm + 16*3], xmm9
        vmovdqa [_xmm + 16*4], xmm10
        vmovdqa [_xmm + 16*5], xmm11
        vmovdqa [_xmm + 16*6], xmm12
        vmovdqa [_xmm + 16*7], xmm13
        vmovdqa [_xmm + 16*8], xmm14
        vmovdqa [_xmm + 16*9], xmm15
%endif

        mov     ghash_poly_8b, GHASH_POLY       ; load 0x87 to ghash_poly_8b

        vmovdqu xmm1, [T_val]                   ; read initial Tweak value
        vpxor   xmm4, xmm4                      ; for key expansion
        encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys

%ifidn __OUTPUT_FORMAT__, win64
        mov     ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]    ; plaintext pointer
        mov     ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]   ; ciphertext pointer
%endif

        cmp     N_val, 128
        jl      _less_than_128_bytes

        vpbroadcastq zpoly, ghash_poly_8b

        cmp     N_val, 256
        jge     _start_by16

        cmp     N_val, 128
        jge     _start_by8

_do_n_blocks:
        cmp     N_val, 0
        je      _ret_

        cmp     N_val, (7*16)
        jge     _remaining_num_blocks_is_7

        cmp     N_val, (6*16)
        jge     _remaining_num_blocks_is_6

        cmp     N_val, (5*16)
        jge     _remaining_num_blocks_is_5

        cmp     N_val, (4*16)
        jge     _remaining_num_blocks_is_4

        cmp     N_val, (3*16)
        jge     _remaining_num_blocks_is_3

        cmp     N_val, (2*16)
        jge     _remaining_num_blocks_is_2

        cmp     N_val, (1*16)
        jge     _remaining_num_blocks_is_1

;; _remaining_num_blocks_is_0:
        vmovdqu xmm1, [ptr_plaintext - 16]      ; re-do the last block with the next tweak
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
        vmovdqu [ptr_ciphertext - 16], xmm1
        vmovdqa xmm8, xmm1

        ; Calc previous tweak
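        ; (multiply the current tweak xmm9 by x^-1: if bit 0 is set, xor the
        ;  polynomial into the low qword and set bit 127 after the shift;
        ;  the vpshrdq funnel shift moves the 128-bit value right by one bit)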
        mov     tmp1, 1
        kmovq   k1, tmp1
        vpsllq  xmm13, xmm9, 63
        vpsraq  xmm14, xmm13, 63
        vpandq  xmm5, xmm14, XWORD(zpoly)
        vpxorq  xmm9 {k1}, xmm9, xmm5
        vpsrldq xmm10, xmm9, 8
        vpshrdq xmm0, xmm9, xmm10, 1
        vpslldq xmm13, xmm13, 8
        vpxorq  xmm0, xmm0, xmm13
        jmp     _steal_cipher

_remaining_num_blocks_is_7:
        mov     tmp1, -1
        shr     tmp1, 16
        kmovq   k1, tmp1
        vmovdqu8 zmm1, [ptr_plaintext+16*0]
        vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
        add     ptr_plaintext, 16*7
        and     N_val, 15
        je      _done_7_remain
        vextracti32x4 xmm12, zmm10, 2
        vextracti32x4 xmm13, zmm10, 3
        vinserti32x4 zmm10, xmm13, 2
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
        add     ptr_ciphertext, 16*7
        vextracti32x4 xmm8, zmm2, 0x2
        vmovdqa xmm0, xmm12
        jmp     _steal_cipher
_done_7_remain:
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
        jmp     _ret_

_remaining_num_blocks_is_6:
        vmovdqu8 zmm1, [ptr_plaintext+16*0]
        vmovdqu8 ymm2, [ptr_plaintext+16*4]
        add     ptr_plaintext, 16*6
        and     N_val, 15
        je      _done_6_remain
        vextracti32x4 xmm12, zmm10, 1
        vextracti32x4 xmm13, zmm10, 2
        vinserti32x4 zmm10, xmm13, 1
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu8 [ptr_ciphertext+16*4], ymm2
        add     ptr_ciphertext, 16*6
        vextracti32x4 xmm8, zmm2, 0x1
        vmovdqa xmm0, xmm12
        jmp     _steal_cipher
_done_6_remain:
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu8 [ptr_ciphertext+16*4], ymm2
        jmp     _ret_

_remaining_num_blocks_is_5:
        vmovdqu8 zmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*4]
        add     ptr_plaintext, 16*5
        and     N_val, 15
        je      _done_5_remain
        vmovdqa xmm12, xmm10
        vextracti32x4 xmm10, zmm10, 1
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu [ptr_ciphertext+16*4], xmm2
        add     ptr_ciphertext, 16*5
        vmovdqa xmm8, xmm2
        vmovdqa xmm0, xmm12
        jmp     _steal_cipher
_done_5_remain:
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu [ptr_ciphertext+16*4], xmm2
        jmp     _ret_

_remaining_num_blocks_is_4:
        vmovdqu8 zmm1, [ptr_plaintext+16*0]
        add     ptr_plaintext, 16*4
        and     N_val, 15
        je      _done_4_remain
        vextracti32x4 xmm12, zmm9, 3
        vinserti32x4 zmm9, xmm10, 3
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        add     ptr_ciphertext, 16*4
        vextracti32x4 xmm8, zmm1, 0x3
        vmovdqa xmm0, xmm12
        jmp     _steal_cipher
_done_4_remain:
        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        jmp     _ret_

_remaining_num_blocks_is_3:
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        add     ptr_plaintext, 16*3
        and     N_val, 15
        je      _done_3_remain
        vextracti32x4 xmm13, zmm9, 2
        vextracti32x4 xmm10, zmm9, 1
        vextracti32x4 xmm11, zmm9, 3
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        add     ptr_ciphertext, 16*3
        vmovdqa xmm8, xmm3
        vmovdqa xmm0, xmm13
        jmp     _steal_cipher
_done_3_remain:
        vextracti32x4 xmm10, zmm9, 1
        vextracti32x4 xmm11, zmm9, 2
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        jmp     _ret_

_remaining_num_blocks_is_2:
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        add     ptr_plaintext, 16*2
        and     N_val, 15
        je      _done_2_remain
        vextracti32x4 xmm10, zmm9, 2
        vextracti32x4 xmm12, zmm9, 1
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        add     ptr_ciphertext, 16*2
        vmovdqa xmm8, xmm2
        vmovdqa xmm0, xmm12
        jmp     _steal_cipher
_done_2_remain:
        vextracti32x4 xmm10, zmm9, 1
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        jmp     _ret_

_remaining_num_blocks_is_1:
        vmovdqu xmm1, [ptr_plaintext]
        add     ptr_plaintext, 16
        and     N_val, 15
        je      _done_1_remain
        vextracti32x4 xmm11, zmm9, 1
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
        vmovdqu [ptr_ciphertext], xmm1
        add     ptr_ciphertext, 16
        vmovdqa xmm8, xmm1
        vmovdqa xmm0, xmm9
        jmp     _steal_cipher
_done_1_remain:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
        vmovdqu [ptr_ciphertext], xmm1
        jmp     _ret_


_start_by16:
        ; Make the first 8 tweak values (TW * x^0 .. TW * x^7)
        vbroadcasti32x4 zmm0, [TW]
        vbroadcasti32x4 zmm8, [shufb_15_7]
        mov     tmp1, 0xaa
        kmovq   k2, tmp1

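        ; shufb_15_7 copies byte 15 of each tweak into byte 0 and byte 7
        ; into byte 8, so the variable right-shifts below recover exactly
        ; the bits that the variable left-shifts push out of each qword.
        ; The k2 = 0xaa mask merges those carry bits into the high qword of
        ; each lane, and vpclmulqdq with zpoly folds the bits shifted out
        ; of bit 127 back in, producing TW * x^i for i = 0..7 in zmm9:zmm10.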
        ; Mult tweak by 2^{3, 2, 1, 0}
        vpshufb zmm1, zmm0, zmm8                ; mov 15->0, 7->8
        vpsllvq zmm4, zmm0, [const_dq3210]      ; shift l 3,2,1,0
        vpsrlvq zmm2, zmm1, [const_dq5678]      ; shift r 5,6,7,8
        vpclmulqdq zmm3, zmm2, zpoly, 0x00
        vpxorq  zmm4 {k2}, zmm4, zmm2           ; tweaks shifted by 3-0
        vpxord  zmm9, zmm3, zmm4

        ; Mult tweak by 2^{7, 6, 5, 4}
        vpsllvq zmm5, zmm0, [const_dq7654]      ; shift l 7,6,5,4
        vpsrlvq zmm6, zmm1, [const_dq1234]      ; shift r 1,2,3,4
        vpclmulqdq zmm7, zmm6, zpoly, 0x00
        vpxorq  zmm5 {k2}, zmm5, zmm6           ; tweaks shifted by 7-4
        vpxord  zmm10, zmm7, zmm5

        ; Make the next 8 tweak values by multiplying all by x^8
        vpsrldq zmm13, zmm9, 15
        vpclmulqdq zmm14, zmm13, zpoly, 0
        vpslldq zmm11, zmm9, 1
        vpxord  zmm11, zmm11, zmm14

        vpsrldq zmm15, zmm10, 15
        vpclmulqdq zmm16, zmm15, zpoly, 0
        vpslldq zmm12, zmm10, 1
        vpxord  zmm12, zmm12, zmm16

_main_loop_run_16:
        vmovdqu8 zmm1, [ptr_plaintext+16*0]
        vmovdqu8 zmm2, [ptr_plaintext+16*4]
        vmovdqu8 zmm3, [ptr_plaintext+16*8]
        vmovdqu8 zmm4, [ptr_plaintext+16*12]
        add     ptr_plaintext, 256

        decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0

        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu8 [ptr_ciphertext+16*4], zmm2
        vmovdqu8 [ptr_ciphertext+16*8], zmm3
        vmovdqu8 [ptr_ciphertext+16*12], zmm4
        add     ptr_ciphertext, 256
        sub     N_val, 256
        cmp     N_val, 256
        jge     _main_loop_run_16

        cmp     N_val, 128
        jge     _main_loop_run_8

        jmp     _do_n_blocks

_start_by8:
        ; Make the first 8 tweak values (same construction as _start_by16)
        vbroadcasti32x4 zmm0, [TW]
        vbroadcasti32x4 zmm8, [shufb_15_7]
        mov     tmp1, 0xaa
        kmovq   k2, tmp1

        ; Mult tweak by 2^{3, 2, 1, 0}
        vpshufb zmm1, zmm0, zmm8                ; mov 15->0, 7->8
        vpsllvq zmm4, zmm0, [const_dq3210]      ; shift l 3,2,1,0
        vpsrlvq zmm2, zmm1, [const_dq5678]      ; shift r 5,6,7,8
        vpclmulqdq zmm3, zmm2, zpoly, 0x00
        vpxorq  zmm4 {k2}, zmm4, zmm2           ; tweaks shifted by 3-0
        vpxord  zmm9, zmm3, zmm4

        ; Mult tweak by 2^{7, 6, 5, 4}
        vpsllvq zmm5, zmm0, [const_dq7654]      ; shift l 7,6,5,4
        vpsrlvq zmm6, zmm1, [const_dq1234]      ; shift r 1,2,3,4
        vpclmulqdq zmm7, zmm6, zpoly, 0x00
        vpxorq  zmm5 {k2}, zmm5, zmm6           ; tweaks shifted by 7-4
        vpxord  zmm10, zmm7, zmm5

_main_loop_run_8:
        vmovdqu8 zmm1, [ptr_plaintext+16*0]
        vmovdqu8 zmm2, [ptr_plaintext+16*4]
        add     ptr_plaintext, 128

        decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0

        vmovdqu8 [ptr_ciphertext+16*0], zmm1
        vmovdqu8 [ptr_ciphertext+16*4], zmm2
        add     ptr_ciphertext, 128
        sub     N_val, 128
        cmp     N_val, 128
        jge     _main_loop_run_8

        jmp     _do_n_blocks

_steal_cipher:
        ; cipher stealing, simplified: xmm8 holds the last decrypted block,
        ; xmm0 the tweak for the final partial block
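        ; The last full block was already decrypted with the following
        ; tweak; its first N_val bytes become the final partial plaintext,
        ; while its tail is blended with the last N_val input bytes to
        ; rebuild one more 16-byte block, decrypted below with tweak xmm0.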
        vmovdqa xmm2, xmm8

        ; shift xmm8 to the left by 16-N_val bytes
        lea     twtempl, [vpshufb_shf_table]
        vmovdqu xmm10, [twtempl+N_val]
        vpshufb xmm8, xmm10

        vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
        vmovdqu [ptr_ciphertext - 16 + N_val], xmm8

        ; shift xmm3 to the right by 16-N_val bytes
        lea     twtempl, [vpshufb_shf_table + 16]
        sub     twtempl, N_val
        vmovdqu xmm10, [twtempl]
        vpxor   xmm10, [mask1]
        vpshufb xmm3, xmm10

        vpblendvb xmm3, xmm3, xmm2, xmm10

        ; xor Tweak value
        vpxor   xmm8, xmm3, xmm0

        ; decrypt the last block with cipher stealing
        vpxor   xmm8, [keys]                    ; ARK
        vaesdec xmm8, [keys + 16*1]             ; round 1
        vaesdec xmm8, [keys + 16*2]             ; round 2
        vaesdec xmm8, [keys + 16*3]             ; round 3
        vaesdec xmm8, [keys + 16*4]             ; round 4
        vaesdec xmm8, [keys + 16*5]             ; round 5
        vaesdec xmm8, [keys + 16*6]             ; round 6
        vaesdec xmm8, [keys + 16*7]             ; round 7
        vaesdec xmm8, [keys + 16*8]             ; round 8
        vaesdec xmm8, [keys + 16*9]             ; round 9
        vaesdec xmm8, [keys + 16*10]            ; round 10
        vaesdec xmm8, [keys + 16*11]            ; round 11
        vaesdec xmm8, [keys + 16*12]            ; round 12
        vaesdec xmm8, [keys + 16*13]            ; round 13
        vaesdeclast xmm8, [keys + 16*14]        ; round 14

        ; xor Tweak value
        vpxor   xmm8, xmm8, xmm0

_done:
        ; store last ciphertext value
        vmovdqu [ptr_ciphertext - 16], xmm8

_ret_:
        mov     rbx, [_gpr + 8*0]

%ifidn __OUTPUT_FORMAT__, win64
        mov     rdi, [_gpr + 8*1]
        mov     rsi, [_gpr + 8*2]

        vmovdqa xmm6, [_xmm + 16*0]
        vmovdqa xmm7, [_xmm + 16*1]
        vmovdqa xmm8, [_xmm + 16*2]
        vmovdqa xmm9, [_xmm + 16*3]
        vmovdqa xmm10, [_xmm + 16*4]
        vmovdqa xmm11, [_xmm + 16*5]
        vmovdqa xmm12, [_xmm + 16*6]
        vmovdqa xmm13, [_xmm + 16*7]
        vmovdqa xmm14, [_xmm + 16*8]
        vmovdqa xmm15, [_xmm + 16*9]
%endif

%ifndef ALIGN_STACK
        add     rsp, VARIABLE_OFFSET
%else
        mov     rsp, rbp
        pop     rbp
%endif
        ret

_less_than_128_bytes:
        cmp     N_val, 16
        jb      _ret_

        mov     tmp1, N_val
        and     tmp1, (7 << 4)
        cmp     tmp1, (6 << 4)
        je      _num_blocks_is_6
        cmp     tmp1, (5 << 4)
        je      _num_blocks_is_5
        cmp     tmp1, (4 << 4)
        je      _num_blocks_is_4
        cmp     tmp1, (3 << 4)
        je      _num_blocks_is_3
        cmp     tmp1, (2 << 4)
        je      _num_blocks_is_2
        cmp     tmp1, (1 << 4)
        je      _num_blocks_is_1

_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        add     ptr_plaintext, 16*7
        and     N_val, 15
        je      _done_7

_steal_cipher_7:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa64 xmm16, xmm15
        vmovdqa xmm15, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        add     ptr_ciphertext, 16*7
        vmovdqa64 xmm0, xmm16
        vmovdqa xmm8, xmm7
        jmp     _steal_cipher

_done_7:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        add     ptr_ciphertext, 16*7
        vmovdqa xmm8, xmm7
        jmp     _done

_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        add     ptr_plaintext, 16*6
        and     N_val, 15
        je      _done_6

_steal_cipher_6:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm15, xmm14
        vmovdqa xmm14, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        add     ptr_ciphertext, 16*6
        vmovdqa xmm0, xmm15
        vmovdqa xmm8, xmm6
        jmp     _steal_cipher

_done_6:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        add     ptr_ciphertext, 16*6
        vmovdqa xmm8, xmm6
        jmp     _done

_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        add     ptr_plaintext, 16*5
        and     N_val, 15
        je      _done_5

_steal_cipher_5:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm14, xmm13
        vmovdqa xmm13, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        add     ptr_ciphertext, 16*5
        vmovdqa xmm0, xmm14
        vmovdqa xmm8, xmm5
        jmp     _steal_cipher

_done_5:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        add     ptr_ciphertext, 16*5
        vmovdqa xmm8, xmm5
        jmp     _done

_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        add     ptr_plaintext, 16*4
        and     N_val, 15
        je      _done_4

_steal_cipher_4:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm13, xmm12
        vmovdqa xmm12, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        add     ptr_ciphertext, 16*4
        vmovdqa xmm0, xmm13
        vmovdqa xmm8, xmm4
        jmp     _steal_cipher

_done_4:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        add     ptr_ciphertext, 16*4
        vmovdqa xmm8, xmm4
        jmp     _done

_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        add     ptr_plaintext, 16*3
        and     N_val, 15
        je      _done_3

_steal_cipher_3:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm12, xmm11
        vmovdqa xmm11, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        add     ptr_ciphertext, 16*3
        vmovdqa xmm0, xmm12
        vmovdqa xmm8, xmm3
        jmp     _steal_cipher

_done_3:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        add     ptr_ciphertext, 16*3
        vmovdqa xmm8, xmm3
        jmp     _done

_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        add     ptr_plaintext, 16*2
        and     N_val, 15
        je      _done_2

_steal_cipher_2:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm11, xmm10
        vmovdqa xmm10, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        vmovdqu [ptr_ciphertext], xmm1
        add     ptr_ciphertext, 16*2
        vmovdqa xmm0, xmm11
        vmovdqa xmm8, xmm2
        jmp     _steal_cipher

_done_2:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        vmovdqu [ptr_ciphertext], xmm1
        add     ptr_ciphertext, 16*2
        vmovdqa xmm8, xmm2
        jmp     _done

_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        add     ptr_plaintext, 16*1
        and     N_val, 15
        je      _done_1

_steal_cipher_1:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm10, xmm9
        vmovdqa xmm9, [TW+16*1]

        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        add     ptr_ciphertext, 16*1
        vmovdqa xmm0, xmm10
        vmovdqa xmm8, xmm1
        jmp     _steal_cipher

_done_1:
        decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        add     ptr_ciphertext, 16*1
        vmovdqa xmm8, xmm1
        jmp     _done

section .data
align 16

vpshufb_shf_table:
; use these values for shift constants for the vpshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89      ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a      ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b      ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c      ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d      ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e      ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f      ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100      ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201      ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302      ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403      ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504      ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605      ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706      ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807      ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask1:
dq 0x8080808080808080, 0x8080808080808080

const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1

shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

%else   ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_XTS_AES_256_dec_vaes
no_XTS_AES_256_dec_vaes:
%endif
%endif  ; (AS_FEATURE_LEVEL) >= 10