]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2020 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; XTS decrypt function with 128-bit AES
30 | ; input keys are not aligned | |
31 | ; keys are expanded in parallel with the tweak encryption | |
32 | ; plaintext and ciphertext are not aligned | |
33 | ; second key is stored in the stack as aligned to 16 Bytes | |
34 | ; first key is required only once, no need for storage of this key | |
35 | ||
36 | %include "reg_sizes.asm" | |
37 | ||
38 | %if (AS_FEATURE_LEVEL) >= 10 | |
39 | ||
40 | default rel | |
41 | %define TW rsp ; store 8 tweak values | |
42 | %define keys rsp + 16*8 ; store 15 expanded keys | |
43 | ||
44 | %ifidn __OUTPUT_FORMAT__, win64 | |
45 | %define _xmm rsp + 16*23 ; store xmm6:xmm15 | |
46 | %endif | |
47 | ||
48 | %ifidn __OUTPUT_FORMAT__, elf64 | |
49 | %define _gpr rsp + 16*23 ; store rbx | |
50 | %define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 | |
51 | %else | |
52 | %define _gpr rsp + 16*33 ; store rdi, rsi, rbx | |
53 | %define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 | |
54 | %endif | |
55 | ||
56 | %define GHASH_POLY 0x87 | |
57 | ||
58 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
59 | ;void XTS_AES_256_dec_vavx( | |
60 | ; UINT8 *k2, // key used for tweaking, 16*2 bytes | |
61 | ; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes | |
62 | ; UINT8 *TW_initial, // initial tweak value, 16 bytes | |
63 | ; UINT64 N, // sector size, in bytes | |
64 | ; const UINT8 *pt, // plaintext sector input data | |
65 | ; UINT8 *ct); // ciphertext sector output data | |
66 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
67 | ||
68 | ; arguments for input parameters | |
69 | %ifidn __OUTPUT_FORMAT__, elf64 | |
70 | %xdefine ptr_key2 rdi | |
71 | %xdefine ptr_key1 rsi | |
72 | %xdefine T_val rdx | |
73 | %xdefine N_val rcx | |
74 | %xdefine ptr_plaintext r8 | |
75 | %xdefine ptr_ciphertext r9 | |
76 | %else | |
77 | %xdefine ptr_key2 rcx | |
78 | %xdefine ptr_key1 rdx | |
79 | %xdefine T_val r8 | |
80 | %xdefine N_val r9 | |
81 | %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] | |
82 | %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] | |
83 | %endif | |
84 | ||
85 | ; arguments for temp parameters | |
86 | %ifidn __OUTPUT_FORMAT__, elf64 | |
87 | %define tmp1 rdi | |
88 | %define ghash_poly_8b r10 | |
89 | %define ghash_poly_8b_temp r11 | |
90 | %else | |
91 | %define tmp1 rcx | |
92 | %define ghash_poly_8b rdi | |
93 | %define ghash_poly_8b_temp rsi | |
94 | %endif | |
95 | ||
96 | %define twtempl rax ; global temp registers used for tweak computation | |
97 | %define twtemph rbx | |
98 | %define zpoly zmm25 | |
99 | ||
100 | ; produce the key for the next round | |
101 | ; raw_key is the output of vaeskeygenassist instruction | |
102 | ; round_key value before this key_expansion_128 macro is current round key | |
103 | ; round_key value after this key_expansion_128 macro is next round key | |
104 | %macro key_expansion_128 3 | |
105 | %define %%xraw_key %1 | |
106 | %define %%xtmp %2 | |
107 | %define %%xround_key %3 | |
108 | vpshufd %%xraw_key, %%xraw_key, 11111111b | |
109 | vshufps %%xtmp, %%xround_key, 00010000b | |
110 | vpxor %%xround_key, %%xtmp | |
111 | vshufps %%xtmp, %%xround_key, 10001100b | |
112 | vpxor %%xround_key, %%xtmp | |
113 | vpxor %%xround_key, %%xraw_key | |
114 | %endmacro | |
115 | ||
116 | ||
117 | ||
118 | ; macro to encrypt the tweak value in parallel with key generation of both keys | |
119 | ||
120 | %macro encrypt_T 9 | |
121 | %define %%xkey2 %1 | |
122 | %define %%xstate_tweak %2 | |
123 | %define %%xkey1 %3 | |
124 | %define %%xraw_key %4 | |
125 | %define %%xtmp %5 | |
126 | %define %%xtmp2 %6 | |
127 | %define %%ptr_key2 %7 | |
128 | %define %%ptr_key1 %8 | |
129 | %define %%ptr_expanded_keys %9 | |
130 | ||
131 | ||
132 | vmovdqu %%xkey2, [%%ptr_key2] | |
133 | vmovdqu %%xkey1, [%%ptr_key1] | |
134 | vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 | |
135 | ||
136 | vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption | |
137 | ||
138 | vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 | |
139 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
140 | vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 | |
141 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
142 | vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption | |
143 | vaesimc %%xtmp2, %%xkey1 | |
144 | vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 | |
145 | ||
146 | vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 | |
147 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
148 | vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 | |
149 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
150 | vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption | |
151 | vaesimc %%xtmp2, %%xkey1 | |
152 | vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 | |
153 | ||
154 | vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 | |
155 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
156 | vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 | |
157 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
158 | vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption | |
159 | vaesimc %%xtmp2, %%xkey1 | |
160 | vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 | |
161 | ||
162 | vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 | |
163 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
164 | vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 | |
165 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
166 | vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption | |
167 | vaesimc %%xtmp2, %%xkey1 | |
168 | vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 | |
169 | ||
170 | vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 | |
171 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
172 | vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 | |
173 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
174 | vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption | |
175 | vaesimc %%xtmp2, %%xkey1 | |
176 | vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 | |
177 | ||
178 | vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 | |
179 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
180 | vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 | |
181 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
182 | vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption | |
183 | vaesimc %%xtmp2, %%xkey1 | |
184 | vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2 | |
185 | ||
186 | vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 | |
187 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
188 | vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 | |
189 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
190 | vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption | |
191 | vaesimc %%xtmp2, %%xkey1 | |
192 | vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 | |
193 | ||
194 | vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 | |
195 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
196 | vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 | |
197 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
198 | vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption | |
199 | vaesimc %%xtmp2, %%xkey1 | |
200 | vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 | |
201 | ||
202 | vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 | |
203 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
204 | vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 | |
205 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
206 | vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption | |
207 | vaesimc %%xtmp2, %%xkey1 | |
208 | vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 | |
209 | ||
210 | vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 | |
211 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 | |
212 | vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 | |
213 | key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 | |
214 | vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption | |
215 | vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1 | |
216 | ||
217 | vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value | |
218 | %endmacro | |
219 | ||
220 | ||
221 | ; Original way to generate initial tweak values and load plaintext values | |
222 | ; only used for small blocks | |
223 | %macro initialize 16 | |
224 | ||
225 | %define %%ST1 %1 ; state 1 | |
226 | %define %%ST2 %2 ; state 2 | |
227 | %define %%ST3 %3 ; state 3 | |
228 | %define %%ST4 %4 ; state 4 | |
229 | %define %%ST5 %5 ; state 5 | |
230 | %define %%ST6 %6 ; state 6 | |
231 | %define %%ST7 %7 ; state 7 | |
232 | %define %%ST8 %8 ; state 8 | |
233 | ||
234 | %define %%TW1 %9 ; tweak 1 | |
235 | %define %%TW2 %10 ; tweak 2 | |
236 | %define %%TW3 %11 ; tweak 3 | |
237 | %define %%TW4 %12 ; tweak 4 | |
238 | %define %%TW5 %13 ; tweak 5 | |
239 | %define %%TW6 %14 ; tweak 6 | |
240 | %define %%TW7 %15 ; tweak 7 | |
241 | ||
242 | %define %%num_initial_blocks %16 | |
243 | ||
244 | ||
245 | ; generate next Tweak values | |
246 | vmovdqa %%TW1, [TW+16*0] | |
247 | mov twtempl, [TW+8*0] | |
248 | mov twtemph, [TW+8*1] | |
249 | vmovdqu %%ST1, [ptr_plaintext+16*0] | |
250 | %if (%%num_initial_blocks>=2) | |
251 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
252 | shl twtempl, 1 | |
253 | adc twtemph, twtemph | |
254 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
255 | xor twtempl, ghash_poly_8b_temp | |
256 | mov [TW+8*2], twtempl | |
257 | mov [TW+8*3], twtemph; | |
258 | vmovdqa %%TW2, [TW+16*1] | |
259 | vmovdqu %%ST2, [ptr_plaintext+16*1] | |
260 | %endif | |
261 | %if (%%num_initial_blocks>=3) | |
262 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
263 | shl twtempl, 1 | |
264 | adc twtemph, twtemph | |
265 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
266 | xor twtempl, ghash_poly_8b_temp | |
267 | mov [TW+8*4], twtempl | |
268 | mov [TW+8*5], twtemph; | |
269 | vmovdqa %%TW3, [TW+16*2] | |
270 | vmovdqu %%ST3, [ptr_plaintext+16*2] | |
271 | %endif | |
272 | %if (%%num_initial_blocks>=4) | |
273 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
274 | shl twtempl, 1 | |
275 | adc twtemph, twtemph | |
276 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
277 | xor twtempl, ghash_poly_8b_temp | |
278 | mov [TW+8*6], twtempl | |
279 | mov [TW+8*7], twtemph; | |
280 | vmovdqa %%TW4, [TW+16*3] | |
281 | vmovdqu %%ST4, [ptr_plaintext+16*3] | |
282 | %endif | |
283 | %if (%%num_initial_blocks>=5) | |
284 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
285 | shl twtempl, 1 | |
286 | adc twtemph, twtemph | |
287 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
288 | xor twtempl, ghash_poly_8b_temp | |
289 | mov [TW+8*8], twtempl | |
290 | mov [TW+8*9], twtemph; | |
291 | vmovdqa %%TW5, [TW+16*4] | |
292 | vmovdqu %%ST5, [ptr_plaintext+16*4] | |
293 | %endif | |
294 | %if (%%num_initial_blocks>=6) | |
295 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
296 | shl twtempl, 1 | |
297 | adc twtemph, twtemph | |
298 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
299 | xor twtempl, ghash_poly_8b_temp | |
300 | mov [TW+8*10], twtempl | |
301 | mov [TW+8*11], twtemph; | |
302 | vmovdqa %%TW6, [TW+16*5] | |
303 | vmovdqu %%ST6, [ptr_plaintext+16*5] | |
304 | %endif | |
305 | %if (%%num_initial_blocks>=7) | |
306 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
307 | shl twtempl, 1 | |
308 | adc twtemph, twtemph | |
309 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
310 | xor twtempl, ghash_poly_8b_temp | |
311 | mov [TW+8*12], twtempl | |
312 | mov [TW+8*13], twtemph; | |
313 | vmovdqa %%TW7, [TW+16*6] | |
314 | vmovdqu %%ST7, [ptr_plaintext+16*6] | |
315 | %endif | |
316 | ||
317 | %endmacro | |
318 | ||
319 | ||
320 | ; Original decrypt initial blocks of AES | |
321 | ; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted | |
322 | ; next 8 Tweak values can be generated | |
323 | %macro decrypt_initial 18 | |
324 | %define %%ST1 %1 ; state 1 | |
325 | %define %%ST2 %2 ; state 2 | |
326 | %define %%ST3 %3 ; state 3 | |
327 | %define %%ST4 %4 ; state 4 | |
328 | %define %%ST5 %5 ; state 5 | |
329 | %define %%ST6 %6 ; state 6 | |
330 | %define %%ST7 %7 ; state 7 | |
331 | %define %%ST8 %8 ; state 8 | |
332 | ||
333 | %define %%TW1 %9 ; tweak 1 | |
334 | %define %%TW2 %10 ; tweak 2 | |
335 | %define %%TW3 %11 ; tweak 3 | |
336 | %define %%TW4 %12 ; tweak 4 | |
337 | %define %%TW5 %13 ; tweak 5 | |
338 | %define %%TW6 %14 ; tweak 6 | |
339 | %define %%TW7 %15 ; tweak 7 | |
340 | %define %%T0 %16 ; Temp register | |
341 | %define %%num_blocks %17 | |
342 | ; %%num_blocks blocks decrypted | |
343 | ; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 | |
344 | ||
345 | %define %%lt128 %18 ; less than 128 bytes | |
346 | ||
347 | ; xor Tweak value | |
348 | vpxor %%ST1, %%TW1 | |
349 | %if (%%num_blocks>=2) | |
350 | vpxor %%ST2, %%TW2 | |
351 | %endif | |
352 | %if (%%num_blocks>=3) | |
353 | vpxor %%ST3, %%TW3 | |
354 | %endif | |
355 | %if (%%num_blocks>=4) | |
356 | vpxor %%ST4, %%TW4 | |
357 | %endif | |
358 | %if (%%num_blocks>=5) | |
359 | vpxor %%ST5, %%TW5 | |
360 | %endif | |
361 | %if (%%num_blocks>=6) | |
362 | vpxor %%ST6, %%TW6 | |
363 | %endif | |
364 | %if (%%num_blocks>=7) | |
365 | vpxor %%ST7, %%TW7 | |
366 | %endif | |
367 | ||
368 | ||
369 | ; ARK | |
370 | vmovdqa %%T0, [keys] | |
371 | vpxor %%ST1, %%T0 | |
372 | %if (%%num_blocks>=2) | |
373 | vpxor %%ST2, %%T0 | |
374 | %endif | |
375 | %if (%%num_blocks>=3) | |
376 | vpxor %%ST3, %%T0 | |
377 | %endif | |
378 | %if (%%num_blocks>=4) | |
379 | vpxor %%ST4, %%T0 | |
380 | %endif | |
381 | %if (%%num_blocks>=5) | |
382 | vpxor %%ST5, %%T0 | |
383 | %endif | |
384 | %if (%%num_blocks>=6) | |
385 | vpxor %%ST6, %%T0 | |
386 | %endif | |
387 | %if (%%num_blocks>=7) | |
388 | vpxor %%ST7, %%T0 | |
389 | %endif | |
390 | ||
391 | ||
392 | %if (0 == %%lt128) | |
393 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
394 | shl twtempl, 1 | |
395 | adc twtemph, twtemph | |
396 | %endif | |
397 | ||
398 | ; round 1 | |
399 | vmovdqa %%T0, [keys + 16*1] | |
400 | vaesdec %%ST1, %%T0 | |
401 | %if (%%num_blocks>=2) | |
402 | vaesdec %%ST2, %%T0 | |
403 | %endif | |
404 | %if (%%num_blocks>=3) | |
405 | vaesdec %%ST3, %%T0 | |
406 | %endif | |
407 | %if (%%num_blocks>=4) | |
408 | vaesdec %%ST4, %%T0 | |
409 | %endif | |
410 | %if (%%num_blocks>=5) | |
411 | vaesdec %%ST5, %%T0 | |
412 | %endif | |
413 | %if (%%num_blocks>=6) | |
414 | vaesdec %%ST6, %%T0 | |
415 | %endif | |
416 | %if (%%num_blocks>=7) | |
417 | vaesdec %%ST7, %%T0 | |
418 | %endif | |
419 | %if (0 == %%lt128) | |
420 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
421 | xor twtempl, ghash_poly_8b_temp | |
422 | mov [TW + 8*0], twtempl ; next Tweak1 generated | |
423 | mov [TW + 8*1], twtemph | |
424 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
425 | %endif | |
426 | ||
427 | ; round 2 | |
428 | vmovdqa %%T0, [keys + 16*2] | |
429 | vaesdec %%ST1, %%T0 | |
430 | %if (%%num_blocks>=2) | |
431 | vaesdec %%ST2, %%T0 | |
432 | %endif | |
433 | %if (%%num_blocks>=3) | |
434 | vaesdec %%ST3, %%T0 | |
435 | %endif | |
436 | %if (%%num_blocks>=4) | |
437 | vaesdec %%ST4, %%T0 | |
438 | %endif | |
439 | %if (%%num_blocks>=5) | |
440 | vaesdec %%ST5, %%T0 | |
441 | %endif | |
442 | %if (%%num_blocks>=6) | |
443 | vaesdec %%ST6, %%T0 | |
444 | %endif | |
445 | %if (%%num_blocks>=7) | |
446 | vaesdec %%ST7, %%T0 | |
447 | %endif | |
448 | ||
449 | %if (0 == %%lt128) | |
450 | shl twtempl, 1 | |
451 | adc twtemph, twtemph | |
452 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
453 | xor twtempl, ghash_poly_8b_temp | |
454 | mov [TW + 8*2], twtempl ; next Tweak2 generated | |
455 | %endif | |
456 | ||
457 | ; round 3 | |
458 | vmovdqa %%T0, [keys + 16*3] | |
459 | vaesdec %%ST1, %%T0 | |
460 | %if (%%num_blocks>=2) | |
461 | vaesdec %%ST2, %%T0 | |
462 | %endif | |
463 | %if (%%num_blocks>=3) | |
464 | vaesdec %%ST3, %%T0 | |
465 | %endif | |
466 | %if (%%num_blocks>=4) | |
467 | vaesdec %%ST4, %%T0 | |
468 | %endif | |
469 | %if (%%num_blocks>=5) | |
470 | vaesdec %%ST5, %%T0 | |
471 | %endif | |
472 | %if (%%num_blocks>=6) | |
473 | vaesdec %%ST6, %%T0 | |
474 | %endif | |
475 | %if (%%num_blocks>=7) | |
476 | vaesdec %%ST7, %%T0 | |
477 | %endif | |
478 | %if (0 == %%lt128) | |
479 | mov [TW + 8*3], twtemph | |
480 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
481 | shl twtempl, 1 | |
482 | adc twtemph, twtemph | |
483 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
484 | %endif | |
485 | ||
486 | ; round 4 | |
487 | vmovdqa %%T0, [keys + 16*4] | |
488 | vaesdec %%ST1, %%T0 | |
489 | %if (%%num_blocks>=2) | |
490 | vaesdec %%ST2, %%T0 | |
491 | %endif | |
492 | %if (%%num_blocks>=3) | |
493 | vaesdec %%ST3, %%T0 | |
494 | %endif | |
495 | %if (%%num_blocks>=4) | |
496 | vaesdec %%ST4, %%T0 | |
497 | %endif | |
498 | %if (%%num_blocks>=5) | |
499 | vaesdec %%ST5, %%T0 | |
500 | %endif | |
501 | %if (%%num_blocks>=6) | |
502 | vaesdec %%ST6, %%T0 | |
503 | %endif | |
504 | %if (%%num_blocks>=7) | |
505 | vaesdec %%ST7, %%T0 | |
506 | %endif | |
507 | ||
508 | %if (0 == %%lt128) | |
509 | xor twtempl, ghash_poly_8b_temp | |
510 | mov [TW + 8*4], twtempl ; next Tweak3 generated | |
511 | mov [TW + 8*5], twtemph | |
512 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
513 | shl twtempl, 1 | |
514 | %endif | |
515 | ||
516 | ; round 5 | |
517 | vmovdqa %%T0, [keys + 16*5] | |
518 | vaesdec %%ST1, %%T0 | |
519 | %if (%%num_blocks>=2) | |
520 | vaesdec %%ST2, %%T0 | |
521 | %endif | |
522 | %if (%%num_blocks>=3) | |
523 | vaesdec %%ST3, %%T0 | |
524 | %endif | |
525 | %if (%%num_blocks>=4) | |
526 | vaesdec %%ST4, %%T0 | |
527 | %endif | |
528 | %if (%%num_blocks>=5) | |
529 | vaesdec %%ST5, %%T0 | |
530 | %endif | |
531 | %if (%%num_blocks>=6) | |
532 | vaesdec %%ST6, %%T0 | |
533 | %endif | |
534 | %if (%%num_blocks>=7) | |
535 | vaesdec %%ST7, %%T0 | |
536 | %endif | |
537 | ||
538 | %if (0 == %%lt128) | |
539 | adc twtemph, twtemph | |
540 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
541 | xor twtempl, ghash_poly_8b_temp | |
542 | mov [TW + 8*6], twtempl ; next Tweak4 generated | |
543 | mov [TW + 8*7], twtemph | |
544 | %endif | |
545 | ||
546 | ; round 6 | |
547 | vmovdqa %%T0, [keys + 16*6] | |
548 | vaesdec %%ST1, %%T0 | |
549 | %if (%%num_blocks>=2) | |
550 | vaesdec %%ST2, %%T0 | |
551 | %endif | |
552 | %if (%%num_blocks>=3) | |
553 | vaesdec %%ST3, %%T0 | |
554 | %endif | |
555 | %if (%%num_blocks>=4) | |
556 | vaesdec %%ST4, %%T0 | |
557 | %endif | |
558 | %if (%%num_blocks>=5) | |
559 | vaesdec %%ST5, %%T0 | |
560 | %endif | |
561 | %if (%%num_blocks>=6) | |
562 | vaesdec %%ST6, %%T0 | |
563 | %endif | |
564 | %if (%%num_blocks>=7) | |
565 | vaesdec %%ST7, %%T0 | |
566 | %endif | |
567 | ||
568 | %if (0 == %%lt128) | |
569 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
570 | shl twtempl, 1 | |
571 | adc twtemph, twtemph | |
572 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
573 | xor twtempl, ghash_poly_8b_temp | |
574 | mov [TW + 8*8], twtempl ; next Tweak5 generated | |
575 | mov [TW + 8*9], twtemph | |
576 | %endif | |
577 | ||
578 | ; round 7 | |
579 | vmovdqa %%T0, [keys + 16*7] | |
580 | vaesdec %%ST1, %%T0 | |
581 | %if (%%num_blocks>=2) | |
582 | vaesdec %%ST2, %%T0 | |
583 | %endif | |
584 | %if (%%num_blocks>=3) | |
585 | vaesdec %%ST3, %%T0 | |
586 | %endif | |
587 | %if (%%num_blocks>=4) | |
588 | vaesdec %%ST4, %%T0 | |
589 | %endif | |
590 | %if (%%num_blocks>=5) | |
591 | vaesdec %%ST5, %%T0 | |
592 | %endif | |
593 | %if (%%num_blocks>=6) | |
594 | vaesdec %%ST6, %%T0 | |
595 | %endif | |
596 | %if (%%num_blocks>=7) | |
597 | vaesdec %%ST7, %%T0 | |
598 | %endif | |
599 | ||
600 | %if (0 == %%lt128) | |
601 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
602 | shl twtempl, 1 | |
603 | adc twtemph, twtemph | |
604 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
605 | xor twtempl, ghash_poly_8b_temp | |
606 | mov [TW + 8*10], twtempl ; next Tweak6 generated | |
607 | mov [TW + 8*11], twtemph | |
608 | %endif | |
609 | ; round 8 | |
610 | vmovdqa %%T0, [keys + 16*8] | |
611 | vaesdec %%ST1, %%T0 | |
612 | %if (%%num_blocks>=2) | |
613 | vaesdec %%ST2, %%T0 | |
614 | %endif | |
615 | %if (%%num_blocks>=3) | |
616 | vaesdec %%ST3, %%T0 | |
617 | %endif | |
618 | %if (%%num_blocks>=4) | |
619 | vaesdec %%ST4, %%T0 | |
620 | %endif | |
621 | %if (%%num_blocks>=5) | |
622 | vaesdec %%ST5, %%T0 | |
623 | %endif | |
624 | %if (%%num_blocks>=6) | |
625 | vaesdec %%ST6, %%T0 | |
626 | %endif | |
627 | %if (%%num_blocks>=7) | |
628 | vaesdec %%ST7, %%T0 | |
629 | %endif | |
630 | ||
631 | %if (0 == %%lt128) | |
632 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
633 | shl twtempl, 1 | |
634 | adc twtemph, twtemph | |
635 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
636 | xor twtempl, ghash_poly_8b_temp | |
637 | mov [TW + 8*12], twtempl ; next Tweak7 generated | |
638 | mov [TW + 8*13], twtemph | |
639 | %endif | |
640 | ; round 9 | |
641 | vmovdqa %%T0, [keys + 16*9] | |
642 | vaesdec %%ST1, %%T0 | |
643 | %if (%%num_blocks>=2) | |
644 | vaesdec %%ST2, %%T0 | |
645 | %endif | |
646 | %if (%%num_blocks>=3) | |
647 | vaesdec %%ST3, %%T0 | |
648 | %endif | |
649 | %if (%%num_blocks>=4) | |
650 | vaesdec %%ST4, %%T0 | |
651 | %endif | |
652 | %if (%%num_blocks>=5) | |
653 | vaesdec %%ST5, %%T0 | |
654 | %endif | |
655 | %if (%%num_blocks>=6) | |
656 | vaesdec %%ST6, %%T0 | |
657 | %endif | |
658 | %if (%%num_blocks>=7) | |
659 | vaesdec %%ST7, %%T0 | |
660 | %endif | |
661 | ||
662 | %if (0 == %%lt128) | |
663 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
664 | shl twtempl, 1 | |
665 | adc twtemph, twtemph | |
666 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
667 | xor twtempl, ghash_poly_8b_temp | |
668 | mov [TW + 8*14], twtempl ; next Tweak8 generated | |
669 | mov [TW + 8*15], twtemph | |
670 | %endif | |
671 | ||
672 | ; round 10 | |
673 | vmovdqa %%T0, [keys + 16*10] | |
674 | vaesdeclast %%ST1, %%T0 | |
675 | %if (%%num_blocks>=2) | |
676 | vaesdeclast %%ST2, %%T0 | |
677 | %endif | |
678 | %if (%%num_blocks>=3) | |
679 | vaesdeclast %%ST3, %%T0 | |
680 | %endif | |
681 | %if (%%num_blocks>=4) | |
682 | vaesdeclast %%ST4, %%T0 | |
683 | %endif | |
684 | %if (%%num_blocks>=5) | |
685 | vaesdeclast %%ST5, %%T0 | |
686 | %endif | |
687 | %if (%%num_blocks>=6) | |
688 | vaesdeclast %%ST6, %%T0 | |
689 | %endif | |
690 | %if (%%num_blocks>=7) | |
691 | vaesdeclast %%ST7, %%T0 | |
692 | %endif | |
693 | ||
694 | ||
695 | ; xor Tweak values | |
696 | vpxor %%ST1, %%TW1 | |
697 | %if (%%num_blocks>=2) | |
698 | vpxor %%ST2, %%TW2 | |
699 | %endif | |
700 | %if (%%num_blocks>=3) | |
701 | vpxor %%ST3, %%TW3 | |
702 | %endif | |
703 | %if (%%num_blocks>=4) | |
704 | vpxor %%ST4, %%TW4 | |
705 | %endif | |
706 | %if (%%num_blocks>=5) | |
707 | vpxor %%ST5, %%TW5 | |
708 | %endif | |
709 | %if (%%num_blocks>=6) | |
710 | vpxor %%ST6, %%TW6 | |
711 | %endif | |
712 | %if (%%num_blocks>=7) | |
713 | vpxor %%ST7, %%TW7 | |
714 | %endif | |
715 | ||
716 | ||
717 | %if (0 == %%lt128) | |
718 | ; load next Tweak values | |
719 | vmovdqa %%TW1, [TW + 16*0] | |
720 | vmovdqa %%TW2, [TW + 16*1] | |
721 | vmovdqa %%TW3, [TW + 16*2] | |
722 | vmovdqa %%TW4, [TW + 16*3] | |
723 | vmovdqa %%TW5, [TW + 16*4] | |
724 | vmovdqa %%TW6, [TW + 16*5] | |
725 | vmovdqa %%TW7, [TW + 16*6] | |
726 | ||
727 | %endif | |
728 | ||
729 | %endmacro | |
730 | ||
731 | ||
732 | ||
733 | ; Decrypt 8 blocks in parallel | |
734 | ; generate next 8 tweak values | |
735 | %macro decrypt_by_eight_zmm 6 | |
736 | %define %%ST1 %1 ; state 1 | |
737 | %define %%ST2 %2 ; state 2 | |
738 | %define %%TW1 %3 ; tweak 1 | |
739 | %define %%TW2 %4 ; tweak 2 | |
740 | %define %%T0 %5 ; Temp register | |
741 | %define %%last_eight %6 | |
742 | ||
743 | ; xor Tweak values | |
744 | vpxorq %%ST1, %%TW1 | |
745 | vpxorq %%ST2, %%TW2 | |
746 | ||
747 | ; ARK | |
748 | vbroadcasti32x4 %%T0, [keys] | |
749 | vpxorq %%ST1, %%T0 | |
750 | vpxorq %%ST2, %%T0 | |
751 | ||
752 | %if (0 == %%last_eight) | |
753 | vpsrldq zmm13, %%TW1, 15 | |
754 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
755 | vpslldq zmm15, %%TW1, 1 | |
756 | vpxord zmm15, zmm15, zmm14 | |
757 | %endif | |
758 | ; round 1 | |
759 | vbroadcasti32x4 %%T0, [keys + 16*1] | |
760 | vaesdec %%ST1, %%T0 | |
761 | vaesdec %%ST2, %%T0 | |
762 | ||
763 | ; round 2 | |
764 | vbroadcasti32x4 %%T0, [keys + 16*2] | |
765 | vaesdec %%ST1, %%T0 | |
766 | vaesdec %%ST2, %%T0 | |
767 | ||
768 | ; round 3 | |
769 | vbroadcasti32x4 %%T0, [keys + 16*3] | |
770 | vaesdec %%ST1, %%T0 | |
771 | vaesdec %%ST2, %%T0 | |
772 | %if (0 == %%last_eight) | |
773 | vpsrldq zmm13, %%TW2, 15 | |
774 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
775 | vpslldq zmm16, %%TW2, 1 | |
776 | vpxord zmm16, zmm16, zmm14 | |
777 | %endif | |
778 | ; round 4 | |
779 | vbroadcasti32x4 %%T0, [keys + 16*4] | |
780 | vaesdec %%ST1, %%T0 | |
781 | vaesdec %%ST2, %%T0 | |
782 | ||
783 | ; round 5 | |
784 | vbroadcasti32x4 %%T0, [keys + 16*5] | |
785 | vaesdec %%ST1, %%T0 | |
786 | vaesdec %%ST2, %%T0 | |
787 | ||
788 | ; round 6 | |
789 | vbroadcasti32x4 %%T0, [keys + 16*6] | |
790 | vaesdec %%ST1, %%T0 | |
791 | vaesdec %%ST2, %%T0 | |
792 | ||
793 | ; round 7 | |
794 | vbroadcasti32x4 %%T0, [keys + 16*7] | |
795 | vaesdec %%ST1, %%T0 | |
796 | vaesdec %%ST2, %%T0 | |
797 | ||
798 | ; round 8 | |
799 | vbroadcasti32x4 %%T0, [keys + 16*8] | |
800 | vaesdec %%ST1, %%T0 | |
801 | vaesdec %%ST2, %%T0 | |
802 | ||
803 | ; round 9 | |
804 | vbroadcasti32x4 %%T0, [keys + 16*9] | |
805 | vaesdec %%ST1, %%T0 | |
806 | vaesdec %%ST2, %%T0 | |
807 | ||
808 | ; round 10 | |
809 | vbroadcasti32x4 %%T0, [keys + 16*10] | |
810 | vaesdeclast %%ST1, %%T0 | |
811 | vaesdeclast %%ST2, %%T0 | |
812 | ||
813 | ; xor Tweak values | |
814 | vpxorq %%ST1, %%TW1 | |
815 | vpxorq %%ST2, %%TW2 | |
816 | ||
817 | ; load next Tweak values | |
818 | vmovdqa32 %%TW1, zmm15 | |
819 | vmovdqa32 %%TW2, zmm16 | |
820 | %endmacro | |
821 | ||
822 | ||
823 | ; Decrypt 16 blocks in parallel | |
824 | ; generate next 8 tweak values | |
825 | %macro decrypt_by_16_zmm 10 | |
826 | %define %%ST1 %1 ; state 1 | |
827 | %define %%ST2 %2 ; state 2 | |
828 | %define %%ST3 %3 ; state 3 | |
829 | %define %%ST4 %4 ; state 4 | |
830 | ||
831 | %define %%TW1 %5 ; tweak 1 | |
832 | %define %%TW2 %6 ; tweak 2 | |
833 | %define %%TW3 %7 ; tweak 3 | |
834 | %define %%TW4 %8 ; tweak 4 | |
835 | ||
836 | %define %%T0 %9 ; Temp register | |
837 | %define %%last_eight %10 | |
838 | ||
839 | ; xor Tweak values | |
840 | vpxorq %%ST1, %%TW1 | |
841 | vpxorq %%ST2, %%TW2 | |
842 | vpxorq %%ST3, %%TW3 | |
843 | vpxorq %%ST4, %%TW4 | |
844 | ||
845 | ; ARK | |
846 | vbroadcasti32x4 %%T0, [keys] | |
847 | vpxorq %%ST1, %%T0 | |
848 | vpxorq %%ST2, %%T0 | |
849 | vpxorq %%ST3, %%T0 | |
850 | vpxorq %%ST4, %%T0 | |
851 | ||
852 | %if (0 == %%last_eight) | |
853 | vpsrldq zmm13, %%TW3, 15 | |
854 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
855 | vpslldq zmm15, %%TW3, 1 | |
856 | vpxord zmm15, zmm15, zmm14 | |
857 | %endif | |
858 | ; round 1 | |
859 | vbroadcasti32x4 %%T0, [keys + 16*1] | |
860 | vaesdec %%ST1, %%T0 | |
861 | vaesdec %%ST2, %%T0 | |
862 | vaesdec %%ST3, %%T0 | |
863 | vaesdec %%ST4, %%T0 | |
864 | ||
865 | ; round 2 | |
866 | vbroadcasti32x4 %%T0, [keys + 16*2] | |
867 | vaesdec %%ST1, %%T0 | |
868 | vaesdec %%ST2, %%T0 | |
869 | vaesdec %%ST3, %%T0 | |
870 | vaesdec %%ST4, %%T0 | |
871 | ||
872 | ; round 3 | |
873 | vbroadcasti32x4 %%T0, [keys + 16*3] | |
874 | vaesdec %%ST1, %%T0 | |
875 | vaesdec %%ST2, %%T0 | |
876 | vaesdec %%ST3, %%T0 | |
877 | vaesdec %%ST4, %%T0 | |
878 | %if (0 == %%last_eight) | |
879 | vpsrldq zmm13, %%TW4, 15 | |
880 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
881 | vpslldq zmm16, %%TW4, 1 | |
882 | vpxord zmm16, zmm16, zmm14 | |
883 | %endif | |
884 | ; round 4 | |
885 | vbroadcasti32x4 %%T0, [keys + 16*4] | |
886 | vaesdec %%ST1, %%T0 | |
887 | vaesdec %%ST2, %%T0 | |
888 | vaesdec %%ST3, %%T0 | |
889 | vaesdec %%ST4, %%T0 | |
890 | ||
891 | ; round 5 | |
892 | vbroadcasti32x4 %%T0, [keys + 16*5] | |
893 | vaesdec %%ST1, %%T0 | |
894 | vaesdec %%ST2, %%T0 | |
895 | vaesdec %%ST3, %%T0 | |
896 | vaesdec %%ST4, %%T0 | |
897 | ||
898 | ; round 6 | |
899 | vbroadcasti32x4 %%T0, [keys + 16*6] | |
900 | vaesdec %%ST1, %%T0 | |
901 | vaesdec %%ST2, %%T0 | |
902 | vaesdec %%ST3, %%T0 | |
903 | vaesdec %%ST4, %%T0 | |
904 | %if (0 == %%last_eight) | |
905 | vpsrldq zmm13, zmm15, 15 | |
906 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
907 | vpslldq zmm17, zmm15, 1 | |
908 | vpxord zmm17, zmm17, zmm14 | |
909 | %endif | |
910 | ; round 7 | |
911 | vbroadcasti32x4 %%T0, [keys + 16*7] | |
912 | vaesdec %%ST1, %%T0 | |
913 | vaesdec %%ST2, %%T0 | |
914 | vaesdec %%ST3, %%T0 | |
915 | vaesdec %%ST4, %%T0 | |
916 | ||
917 | ; round 8 | |
918 | vbroadcasti32x4 %%T0, [keys + 16*8] | |
919 | vaesdec %%ST1, %%T0 | |
920 | vaesdec %%ST2, %%T0 | |
921 | vaesdec %%ST3, %%T0 | |
922 | vaesdec %%ST4, %%T0 | |
923 | ||
924 | ; round 9 | |
925 | vbroadcasti32x4 %%T0, [keys + 16*9] | |
926 | vaesdec %%ST1, %%T0 | |
927 | vaesdec %%ST2, %%T0 | |
928 | vaesdec %%ST3, %%T0 | |
929 | vaesdec %%ST4, %%T0 | |
930 | %if (0 == %%last_eight) | |
931 | vpsrldq zmm13, zmm16, 15 | |
932 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
933 | vpslldq zmm18, zmm16, 1 | |
934 | vpxord zmm18, zmm18, zmm14 | |
935 | %endif | |
936 | ; round 10 | |
937 | vbroadcasti32x4 %%T0, [keys + 16*10] | |
938 | vaesdeclast %%ST1, %%T0 | |
939 | vaesdeclast %%ST2, %%T0 | |
940 | vaesdeclast %%ST3, %%T0 | |
941 | vaesdeclast %%ST4, %%T0 | |
942 | ||
943 | ; xor Tweak values | |
944 | vpxorq %%ST1, %%TW1 | |
945 | vpxorq %%ST2, %%TW2 | |
946 | vpxorq %%ST3, %%TW3 | |
947 | vpxorq %%ST4, %%TW4 | |
948 | ||
949 | ; load next Tweak values | |
950 | vmovdqa32 %%TW1, zmm15 | |
951 | vmovdqa32 %%TW2, zmm16 | |
952 | vmovdqa32 %%TW3, zmm17 | |
953 | vmovdqa32 %%TW4, zmm18 | |
954 | %endmacro | |
955 | ||
956 | ||
957 | section .text | |
958 | ||
; AES-128-XTS decrypt, VAES/AVX-512 implementation.
; Symbols such as keys, TW, T_val, N_val, ptr_key1/2, ptr_plaintext,
; ptr_ciphertext, _gpr, _xmm, VARIABLE_OFFSET and the macros
; (encrypt_T, decrypt_initial, decrypt_by_eight_zmm, ...) are %defined
; earlier in this file (outside this view).
959 | mk_global XTS_AES_128_dec_vaes, function | |
960 | XTS_AES_128_dec_vaes: | |
961 | endbranch | |
962 | ||
; Build a 64-byte-aligned stack frame; rbp keeps the caller's rsp so the
; epilogue can undo the alignment.
963 | %define ALIGN_STACK | |
964 | %ifdef ALIGN_STACK | |
965 | push rbp | |
966 | mov rbp, rsp | |
967 | sub rsp, VARIABLE_OFFSET | |
968 | and rsp, ~63 | |
969 | %else | |
970 | sub rsp, VARIABLE_OFFSET | |
971 | %endif | |
972 | ||
; Save callee-saved registers; win64 additionally requires rdi/rsi and
; xmm6-xmm15 to be preserved (Microsoft x64 ABI).
973 | mov [_gpr + 8*0], rbx | |
974 | %ifidn __OUTPUT_FORMAT__, win64 | |
975 | mov [_gpr + 8*1], rdi | |
976 | mov [_gpr + 8*2], rsi | |
977 | ||
978 | vmovdqa [_xmm + 16*0], xmm6 | |
979 | vmovdqa [_xmm + 16*1], xmm7 | |
980 | vmovdqa [_xmm + 16*2], xmm8 | |
981 | vmovdqa [_xmm + 16*3], xmm9 | |
982 | vmovdqa [_xmm + 16*4], xmm10 | |
983 | vmovdqa [_xmm + 16*5], xmm11 | |
984 | vmovdqa [_xmm + 16*6], xmm12 | |
985 | vmovdqa [_xmm + 16*7], xmm13 | |
986 | vmovdqa [_xmm + 16*8], xmm14 | |
987 | vmovdqa [_xmm + 16*9], xmm15 | |
988 | %endif | |
989 | ||
990 | mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b | |
991 | ||
992 | ||
; Encrypt the initial tweak with key2 and expand key1 (decrypt schedule)
; into [keys]; xmm0 receives the encrypted tweak.
993 | vmovdqu xmm1, [T_val] ; read initial Tweak value | |
994 | vpxor xmm4, xmm4 ; for key expansion | |
995 | encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys | |
996 | ||
997 | ||
998 | %ifidn __OUTPUT_FORMAT__, win64 | |
999 | mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer | |
1000 | mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer | |
1001 | %endif | |
1002 | ||
; Dispatch on total byte count: <128 bytes uses the xmm path, >=256
; the 16-block zmm main loop, otherwise the 8-block zmm loop.
1003 | cmp N_val, 128 | |
1004 | jl _less_than_128_bytes | |
1005 | ||
1006 | vpbroadcastq zpoly, ghash_poly_8b | |
1007 | ||
1008 | cmp N_val, 256 | |
1009 | jge _start_by16 | |
1010 | ||
; NOTE(review): N_val >= 128 is already guaranteed here (the jl above was
; not taken), so this branch is always taken; kept for clarity/safety.
1011 | cmp N_val, 128 | |
1012 | jge _start_by8 | |
1013 | ||
; Tail dispatcher after the main loops: handle the remaining 0-7 whole
; 16-byte blocks plus an optional partial block (ciphertext stealing).
1014 | _do_n_blocks: | |
1015 | cmp N_val, 0 | |
1016 | je _ret_ | |
1017 | ||
1018 | cmp N_val, (7*16) | |
1019 | jge _remaining_num_blocks_is_7 | |
1020 | ||
1021 | cmp N_val, (6*16) | |
1022 | jge _remaining_num_blocks_is_6 | |
1023 | ||
1024 | cmp N_val, (5*16) | |
1025 | jge _remaining_num_blocks_is_5 | |
1026 | ||
1027 | cmp N_val, (4*16) | |
1028 | jge _remaining_num_blocks_is_4 | |
1029 | ||
1030 | cmp N_val, (3*16) | |
1031 | jge _remaining_num_blocks_is_3 | |
1032 | ||
1033 | cmp N_val, (2*16) | |
1034 | jge _remaining_num_blocks_is_2 | |
1035 | ||
1036 | cmp N_val, (1*16) | |
1037 | jge _remaining_num_blocks_is_1 | |
1038 | ||
; Only a partial block remains: the previous full block must be
; re-decrypted with the NEXT tweak, then stolen from.
1039 | ;; _remaining_num_blocks_is_0: | |
1040 | vmovdqu xmm1, [ptr_plaintext - 16] ; Redo last block with next tweak | |
1041 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 | |
1042 | vmovdqu [ptr_ciphertext - 16], xmm1 | |
1043 | vmovdqa xmm8, xmm1 | |
1044 | ||
; Step the tweak in xmm9 BACKWARDS by one doubling (multiply by x^-1 in
; GF(2^128)): undo the conditional poly-xor, then rotate right by 1 bit
; across the 128-bit value via vpshrdq. Result -> xmm0 for _steal_cipher.
1045 | ; Calc previous tweak | |
1046 | mov tmp1, 1 | |
1047 | kmovq k1, tmp1 | |
1048 | vpsllq xmm13, xmm9, 63 | |
1049 | vpsraq xmm14, xmm13, 63 | |
1050 | vpandq xmm5, xmm14, XWORD(zpoly) | |
1051 | vpxorq xmm9 {k1}, xmm9, xmm5 | |
1052 | vpsrldq xmm10, xmm9, 8 | |
1053 | vpshrdq xmm0, xmm9, xmm10, 1 | |
1054 | vpslldq xmm13, xmm13, 8 | |
1055 | vpxorq xmm0, xmm0, xmm13 | |
1056 | jmp _steal_cipher | |
1057 | ||
; 7 whole blocks remain: load 4+3 blocks (k1 masks the zmm2 load/store to
; its low 3 lanes). If a partial block follows, tweak lane 2 of zmm10 is
; saved (xmm12) and replaced by lane 3 so the 7th block uses the correct
; tweak ordering for ciphertext stealing.
1058 | _remaining_num_blocks_is_7: | |
1059 | mov tmp1, -1 | |
1060 | shr tmp1, 16 | |
1061 | kmovq k1, tmp1 | |
1062 | vmovdqu8 zmm1, [ptr_plaintext+16*0] | |
1063 | vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] | |
1064 | add ptr_plaintext, 16*7 | |
1065 | and N_val, 15 | |
1066 | je _done_7_remain | |
1067 | vextracti32x4 xmm12, zmm10, 2 | |
1068 | vextracti32x4 xmm13, zmm10, 3 | |
1069 | vinserti32x4 zmm10, xmm13, 2 | |
1070 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1071 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1072 | vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 | |
1073 | add ptr_ciphertext, 16*7 | |
1074 | vextracti32x4 xmm8, zmm2, 0x2 | |
1075 | vmovdqa xmm0, xmm12 | |
1076 | jmp _steal_cipher | |
1077 | _done_7_remain: | |
1078 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1079 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1080 | vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 | |
1081 | jmp _ret_ | |
1082 | ||
; 6 whole blocks remain (4 in zmm1, 2 in ymm2). With a trailing partial
; block, tweak lane 1 of zmm10 is saved and swapped with lane 2 so the
; stealing tweak order is preserved.
1083 | _remaining_num_blocks_is_6: | |
1084 | vmovdqu8 zmm1, [ptr_plaintext+16*0] | |
1085 | vmovdqu8 ymm2, [ptr_plaintext+16*4] | |
1086 | add ptr_plaintext, 16*6 | |
1087 | and N_val, 15 | |
1088 | je _done_6_remain | |
1089 | vextracti32x4 xmm12, zmm10, 1 | |
1090 | vextracti32x4 xmm13, zmm10, 2 | |
1091 | vinserti32x4 zmm10, xmm13, 1 | |
1092 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1093 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1094 | vmovdqu8 [ptr_ciphertext+16*4], ymm2 | |
1095 | add ptr_ciphertext, 16*6 | |
1096 | vextracti32x4 xmm8, zmm2, 0x1 | |
1097 | vmovdqa xmm0, xmm12 | |
1098 | jmp _steal_cipher | |
1099 | _done_6_remain: | |
1100 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1101 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1102 | vmovdqu8 [ptr_ciphertext+16*4], ymm2 | |
1103 | jmp _ret_ | |
1104 | ||
; 5 whole blocks remain (4 in zmm1, 1 in xmm2). With a trailing partial
; block, the would-be 5th tweak (xmm10) is saved for stealing and the
; next one (lane 1 of zmm10) used for the 5th block instead.
1105 | _remaining_num_blocks_is_5: | |
1106 | vmovdqu8 zmm1, [ptr_plaintext+16*0] | |
1107 | vmovdqu xmm2, [ptr_plaintext+16*4] | |
1108 | add ptr_plaintext, 16*5 | |
1109 | and N_val, 15 | |
1110 | je _done_5_remain | |
1111 | vmovdqa xmm12, xmm10 | |
1112 | vextracti32x4 xmm10, zmm10, 1 | |
1113 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1114 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1115 | vmovdqu [ptr_ciphertext+16*4], xmm2 | |
1116 | add ptr_ciphertext, 16*5 | |
1117 | vmovdqa xmm8, xmm2 | |
1118 | vmovdqa xmm0, xmm12 | |
1119 | jmp _steal_cipher | |
1120 | _done_5_remain: | |
1121 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1122 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1123 | vmovdqu [ptr_ciphertext+16*4], xmm2 | |
1124 | jmp _ret_ | |
1125 | ||
; 4 whole blocks remain (one zmm). With a trailing partial block, tweak
; lane 3 of zmm9 is saved for stealing and replaced by the next tweak
; (low lane of zmm10).
1126 | _remaining_num_blocks_is_4: | |
1127 | vmovdqu8 zmm1, [ptr_plaintext+16*0] | |
1128 | add ptr_plaintext, 16*4 | |
1129 | and N_val, 15 | |
1130 | je _done_4_remain | |
1131 | vextracti32x4 xmm12, zmm9, 3 | |
1132 | vinserti32x4 zmm9, xmm10, 3 | |
1133 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1134 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1135 | add ptr_ciphertext, 16*4 | |
1136 | vextracti32x4 xmm8, zmm1, 0x3 | |
1137 | vmovdqa xmm0, xmm12 | |
1138 | jmp _steal_cipher | |
1139 | _done_4_remain: | |
1140 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 | |
1141 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1142 | jmp _ret_ | |
1143 | ||
; 3 whole blocks remain; switch to the xmm (decrypt_initial) path.
; With a trailing partial block, tweak lane 2 of zmm9 is saved (xmm13)
; for stealing and lane 3 used for the 3rd block.
1144 | _remaining_num_blocks_is_3: | |
1145 | vmovdqu xmm1, [ptr_plaintext+16*0] | |
1146 | vmovdqu xmm2, [ptr_plaintext+16*1] | |
1147 | vmovdqu xmm3, [ptr_plaintext+16*2] | |
1148 | add ptr_plaintext, 16*3 | |
1149 | and N_val, 15 | |
1150 | je _done_3_remain | |
1151 | vextracti32x4 xmm13, zmm9, 2 | |
1152 | vextracti32x4 xmm10, zmm9, 1 | |
1153 | vextracti32x4 xmm11, zmm9, 3 | |
1154 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 | |
1155 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1156 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1157 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1158 | add ptr_ciphertext, 16*3 | |
1159 | vmovdqa xmm8, xmm3 | |
1160 | vmovdqa xmm0, xmm13 | |
1161 | jmp _steal_cipher | |
1162 | _done_3_remain: | |
1163 | vextracti32x4 xmm10, zmm9, 1 | |
1164 | vextracti32x4 xmm11, zmm9, 2 | |
1165 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 | |
1166 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1167 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1168 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1169 | jmp _ret_ | |
1170 | ||
; 2 whole blocks remain. With a trailing partial block, tweak lane 1 of
; zmm9 is saved (xmm12) for stealing and lane 2 used for the 2nd block.
1171 | _remaining_num_blocks_is_2: | |
1172 | vmovdqu xmm1, [ptr_plaintext+16*0] | |
1173 | vmovdqu xmm2, [ptr_plaintext+16*1] | |
1174 | add ptr_plaintext, 16*2 | |
1175 | and N_val, 15 | |
1176 | je _done_2_remain | |
1177 | vextracti32x4 xmm10, zmm9, 2 | |
1178 | vextracti32x4 xmm12, zmm9, 1 | |
1179 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 | |
1180 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1181 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1182 | add ptr_ciphertext, 16*2 | |
1183 | vmovdqa xmm8, xmm2 | |
1184 | vmovdqa xmm0, xmm12 | |
1185 | jmp _steal_cipher | |
1186 | _done_2_remain: | |
1187 | vextracti32x4 xmm10, zmm9, 1 | |
1188 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 | |
1189 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1190 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1191 | jmp _ret_ | |
1192 | ||
; 1 whole block remains. With a trailing partial block, the block is
; decrypted with the NEXT tweak (lane 1) and the current tweak (xmm9)
; is kept in xmm0 for ciphertext stealing.
1193 | _remaining_num_blocks_is_1: | |
1194 | vmovdqu xmm1, [ptr_plaintext] | |
1195 | add ptr_plaintext, 16 | |
1196 | and N_val, 15 | |
1197 | je _done_1_remain | |
1198 | vextracti32x4 xmm11, zmm9, 1 | |
1199 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 | |
1200 | vmovdqu [ptr_ciphertext], xmm1 | |
1201 | add ptr_ciphertext, 16 | |
1202 | vmovdqa xmm8, xmm1 | |
1203 | vmovdqa xmm0, xmm9 | |
1204 | jmp _steal_cipher | |
1205 | _done_1_remain: | |
1206 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 | |
1207 | vmovdqu [ptr_ciphertext], xmm1 | |
1208 | jmp _ret_ | |
1209 | ||
1210 | ||
1211 | ||
; 16-blocks-at-a-time path: build tweaks T*2^0..2^7 in zmm9/zmm10 (bit-
; sliced doubling via shift + carry-fold with the GF polynomial), then
; derive T*2^8..2^15 in zmm11/zmm12 by a byte-shift/pclmul step.
1212 | _start_by16: | |
1213 | ; Make first 7 tweak values | |
1214 | vbroadcasti32x4 zmm0, [TW] | |
1215 | vbroadcasti32x4 zmm8, [shufb_15_7] | |
1216 | mov tmp1, 0xaa | |
1217 | kmovq k2, tmp1 | |
1218 | ||
1219 | ; Mult tweak by 2^{3, 2, 1, 0} | |
1220 | vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 | |
1221 | vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 | |
1222 | vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 | |
1223 | vpclmulqdq zmm3, zmm2, zpoly, 0x00 | |
1224 | vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 | |
1225 | vpxord zmm9, zmm3, zmm4 | |
1226 | ||
1227 | ; Mult tweak by 2^{7, 6, 5, 4} | |
1228 | vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 | |
1229 | vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 | |
1230 | vpclmulqdq zmm7, zmm6, zpoly, 0x00 | |
1231 | vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 | |
1232 | vpxord zmm10, zmm7, zmm5 | |
1233 | ||
1234 | ; Make next 8 tweak values by all x 2^8 | |
1235 | vpsrldq zmm13, zmm9, 15 | |
1236 | vpclmulqdq zmm14, zmm13, zpoly, 0 | |
1237 | vpslldq zmm11, zmm9, 1 | |
1238 | vpxord zmm11, zmm11, zmm14 | |
1239 | ||
1240 | vpsrldq zmm15, zmm10, 15 | |
1241 | vpclmulqdq zmm16, zmm15, zpoly, 0 | |
1242 | vpslldq zmm12, zmm10, 1 | |
1243 | vpxord zmm12, zmm12, zmm16 | |
1244 | ||
; Main loop: 256 bytes (16 blocks) per iteration; decrypt_by_16_zmm
; also advances the tweaks for the next iteration (last arg 0 = not the
; final 16 blocks... tweak advance enabled).
1245 | _main_loop_run_16: | |
1246 | vmovdqu8 zmm1, [ptr_plaintext+16*0] | |
1247 | vmovdqu8 zmm2, [ptr_plaintext+16*4] | |
1248 | vmovdqu8 zmm3, [ptr_plaintext+16*8] | |
1249 | vmovdqu8 zmm4, [ptr_plaintext+16*12] | |
1250 | add ptr_plaintext, 256 | |
1251 | ||
1252 | decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 | |
1253 | ||
1254 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1255 | vmovdqu8 [ptr_ciphertext+16*4], zmm2 | |
1256 | vmovdqu8 [ptr_ciphertext+16*8], zmm3 | |
1257 | vmovdqu8 [ptr_ciphertext+16*12], zmm4 | |
1258 | add ptr_ciphertext, 256 | |
1259 | sub N_val, 256 | |
1260 | cmp N_val, 256 | |
1261 | jge _main_loop_run_16 | |
1262 | ||
1263 | cmp N_val, 128 | |
1264 | jge _main_loop_run_8 | |
1265 | ||
1266 | jmp _do_n_blocks | |
1267 | ||
; 8-blocks-at-a-time path: same tweak construction as _start_by16 but
; only T*2^0..2^7 (zmm9/zmm10) are needed.
1268 | _start_by8: | |
1269 | ; Make first 7 tweak values | |
1270 | vbroadcasti32x4 zmm0, [TW] | |
1271 | vbroadcasti32x4 zmm8, [shufb_15_7] | |
1272 | mov tmp1, 0xaa | |
1273 | kmovq k2, tmp1 | |
1274 | ||
1275 | ; Mult tweak by 2^{3, 2, 1, 0} | |
1276 | vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 | |
1277 | vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 | |
1278 | vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 | |
1279 | vpclmulqdq zmm3, zmm2, zpoly, 0x00 | |
1280 | vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 | |
1281 | vpxord zmm9, zmm3, zmm4 | |
1282 | ||
1283 | ; Mult tweak by 2^{7, 6, 5, 4} | |
1284 | vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 | |
1285 | vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 | |
1286 | vpclmulqdq zmm7, zmm6, zpoly, 0x00 | |
1287 | vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 | |
1288 | vpxord zmm10, zmm7, zmm5 | |
1289 | ||
; Main loop: 128 bytes (8 blocks) per iteration.
1290 | _main_loop_run_8: | |
1291 | vmovdqu8 zmm1, [ptr_plaintext+16*0] | |
1292 | vmovdqu8 zmm2, [ptr_plaintext+16*4] | |
1293 | add ptr_plaintext, 128 | |
1294 | ||
1295 | decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 | |
1296 | ||
1297 | vmovdqu8 [ptr_ciphertext+16*0], zmm1 | |
1298 | vmovdqu8 [ptr_ciphertext+16*4], zmm2 | |
1299 | add ptr_ciphertext, 128 | |
1300 | sub N_val, 128 | |
1301 | cmp N_val, 128 | |
1302 | jge _main_loop_run_8 | |
1303 | ||
1304 | jmp _do_n_blocks | |
1305 | ||
; Ciphertext stealing for the final partial block.
; In:  xmm8 = last decrypted full block, xmm0 = tweak for the stolen
;      block, N_val = partial length (1-15).
; The partial plaintext is merged with the tail of xmm8 via
; vpshufb_shf_table shuffles, decrypted with a full AES-128 pass
; (ARK + 9 rounds + last), then stored at ptr_ciphertext-16 in _done.
1306 | _steal_cipher: | |
1307 | ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak | |
1308 | vmovdqa xmm2, xmm8 | |
1309 | ||
1310 | ; shift xmm8 to the left by 16-N_val bytes | |
1311 | lea twtempl, [vpshufb_shf_table] | |
1312 | vmovdqu xmm10, [twtempl+N_val] | |
1313 | vpshufb xmm8, xmm10 | |
1314 | ||
1315 | vmovdqu xmm3, [ptr_plaintext - 16 + N_val] | |
1316 | vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 | |
1317 | ||
1318 | ; shift xmm3 to the right by 16-N_val bytes | |
1319 | lea twtempl, [vpshufb_shf_table +16] | |
1320 | sub twtempl, N_val | |
1321 | vmovdqu xmm10, [twtempl] | |
1322 | vpxor xmm10, [mask1] | |
1323 | vpshufb xmm3, xmm10 | |
1324 | ||
1325 | vpblendvb xmm3, xmm3, xmm2, xmm10 | |
1326 | ||
1327 | ; xor Tweak value | |
1328 | vpxor xmm8, xmm3, xmm0 | |
1329 | ||
1330 | ;decrypt last block with cipher stealing | |
1331 | vpxor xmm8, [keys] ; ARK | |
1332 | vaesdec xmm8, [keys + 16*1] ; round 1 | |
1333 | vaesdec xmm8, [keys + 16*2] ; round 2 | |
1334 | vaesdec xmm8, [keys + 16*3] ; round 3 | |
1335 | vaesdec xmm8, [keys + 16*4] ; round 4 | |
1336 | vaesdec xmm8, [keys + 16*5] ; round 5 | |
1337 | vaesdec xmm8, [keys + 16*6] ; round 6 | |
1338 | vaesdec xmm8, [keys + 16*7] ; round 7 | |
1339 | vaesdec xmm8, [keys + 16*8] ; round 8 | |
1340 | vaesdec xmm8, [keys + 16*9] ; round 9 | |
1341 | vaesdeclast xmm8, [keys + 16*10] ; round 10 | |
1342 | ||
1343 | ; xor Tweak value | |
1344 | vpxor xmm8, xmm8, xmm0 | |
1345 | ||
1346 | _done: | |
1347 | ; store last ciphertext value | |
1348 | vmovdqu [ptr_ciphertext - 16], xmm8 | |
1349 | ||
; Common exit: restore callee-saved state and the stack frame.
1350 | _ret_: | |
1351 | mov rbx, [_gpr + 8*0] | |
1352 | ||
1353 | %ifidn __OUTPUT_FORMAT__, win64 | |
1354 | mov rdi, [_gpr + 8*1] | |
1355 | mov rsi, [_gpr + 8*2] | |
1356 | ||
1357 | vmovdqa xmm6, [_xmm + 16*0] | |
1358 | vmovdqa xmm7, [_xmm + 16*1] | |
1359 | vmovdqa xmm8, [_xmm + 16*2] | |
1360 | vmovdqa xmm9, [_xmm + 16*3] | |
1361 | vmovdqa xmm10, [_xmm + 16*4] | |
1362 | vmovdqa xmm11, [_xmm + 16*5] | |
1363 | vmovdqa xmm12, [_xmm + 16*6] | |
1364 | vmovdqa xmm13, [_xmm + 16*7] | |
1365 | vmovdqa xmm14, [_xmm + 16*8] | |
1366 | vmovdqa xmm15, [_xmm + 16*9] | |
1367 | %endif | |
1368 | ||
1369 | %ifndef ALIGN_STACK | |
1370 | add rsp, VARIABLE_OFFSET | |
1371 | %else | |
1372 | mov rsp, rbp | |
1373 | pop rbp | |
1374 | %endif | |
1375 | ret | |
1376 | ||
1377 | ||
; Input shorter than 128 bytes: dispatch on the whole-block count
; (bits 6:4 of N_val). Fewer than 16 bytes is not a valid XTS input,
; so it returns without writing output. Fall-through = 7 blocks.
1378 | _less_than_128_bytes: | |
1379 | cmp N_val, 16 | |
1380 | jb _ret_ | |
1381 | ||
1382 | mov tmp1, N_val | |
1383 | and tmp1, (7 << 4) | |
1384 | cmp tmp1, (6 << 4) | |
1385 | je _num_blocks_is_6 | |
1386 | cmp tmp1, (5 << 4) | |
1387 | je _num_blocks_is_5 | |
1388 | cmp tmp1, (4 << 4) | |
1389 | je _num_blocks_is_4 | |
1390 | cmp tmp1, (3 << 4) | |
1391 | je _num_blocks_is_3 | |
1392 | cmp tmp1, (2 << 4) | |
1393 | je _num_blocks_is_2 | |
1394 | cmp tmp1, (1 << 4) | |
1395 | je _num_blocks_is_1 | |
1396 | ||
; 7 blocks (<128 bytes total). initialize computes tweaks xmm9-xmm15
; scalar-wise (twtempl/twtemph). If a partial block follows, one more
; tweak doubling is done in GPRs (shl/adc + conditional poly xor) and
; the 7th block is decrypted with that next tweak; the displaced tweak
; goes to xmm0 for _steal_cipher.
1397 | _num_blocks_is_7: | |
1398 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 | |
1399 | add ptr_plaintext, 16*7 | |
1400 | and N_val, 15 | |
1401 | je _done_7 | |
1402 | ||
1403 | _steal_cipher_7: | |
1404 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1405 | shl twtempl, 1 | |
1406 | adc twtemph, twtemph | |
1407 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1408 | xor twtempl, ghash_poly_8b_temp | |
1409 | mov [TW+8*2], twtempl | |
1410 | mov [TW+8*3], twtemph | |
1411 | vmovdqa64 xmm16, xmm15 | |
1412 | vmovdqa xmm15, [TW+16*1] | |
1413 | ||
1414 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 | |
1415 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1416 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1417 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1418 | vmovdqu [ptr_ciphertext+16*3], xmm4 | |
1419 | vmovdqu [ptr_ciphertext+16*4], xmm5 | |
1420 | vmovdqu [ptr_ciphertext+16*5], xmm6 | |
1421 | add ptr_ciphertext, 16*7 | |
1422 | vmovdqa64 xmm0, xmm16 | |
1423 | vmovdqa xmm8, xmm7 | |
1424 | jmp _steal_cipher | |
1425 | ||
; Exact multiple of 16: decrypt all 7 blocks, store the first 6 here;
; the 7th (xmm7 -> xmm8) is stored by _done at ptr_ciphertext-16.
1426 | _done_7: | |
1427 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 | |
1428 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1429 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1430 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1431 | vmovdqu [ptr_ciphertext+16*3], xmm4 | |
1432 | vmovdqu [ptr_ciphertext+16*4], xmm5 | |
1433 | vmovdqu [ptr_ciphertext+16*5], xmm6 | |
1434 | add ptr_ciphertext, 16*7 | |
1435 | vmovdqa xmm8, xmm7 | |
1436 | jmp _done | |
1437 | ||
; 6 blocks (<128 bytes total); same pattern as _num_blocks_is_7 with
; the extra tweak landing in xmm14 and the displaced one in xmm15.
1438 | _num_blocks_is_6: | |
1439 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 | |
1440 | add ptr_plaintext, 16*6 | |
1441 | and N_val, 15 | |
1442 | je _done_6 | |
1443 | ||
1444 | _steal_cipher_6: | |
1445 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1446 | shl twtempl, 1 | |
1447 | adc twtemph, twtemph | |
1448 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1449 | xor twtempl, ghash_poly_8b_temp | |
1450 | mov [TW+8*2], twtempl | |
1451 | mov [TW+8*3], twtemph | |
1452 | vmovdqa xmm15, xmm14 | |
1453 | vmovdqa xmm14, [TW+16*1] | |
1454 | ||
1455 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 | |
1456 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1457 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1458 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1459 | vmovdqu [ptr_ciphertext+16*3], xmm4 | |
1460 | vmovdqu [ptr_ciphertext+16*4], xmm5 | |
1461 | add ptr_ciphertext, 16*6 | |
1462 | vmovdqa xmm0, xmm15 | |
1463 | vmovdqa xmm8, xmm6 | |
1464 | jmp _steal_cipher | |
1465 | ||
1466 | _done_6: | |
1467 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 | |
1468 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1469 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1470 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1471 | vmovdqu [ptr_ciphertext+16*3], xmm4 | |
1472 | vmovdqu [ptr_ciphertext+16*4], xmm5 | |
1473 | add ptr_ciphertext, 16*6 | |
1474 | vmovdqa xmm8, xmm6 | |
1475 | jmp _done | |
1476 | ||
; 5 blocks (<128 bytes total); extra tweak -> xmm13, displaced -> xmm14.
1477 | _num_blocks_is_5: | |
1478 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 | |
1479 | add ptr_plaintext, 16*5 | |
1480 | and N_val, 15 | |
1481 | je _done_5 | |
1482 | ||
1483 | _steal_cipher_5: | |
1484 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1485 | shl twtempl, 1 | |
1486 | adc twtemph, twtemph | |
1487 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1488 | xor twtempl, ghash_poly_8b_temp | |
1489 | mov [TW+8*2], twtempl | |
1490 | mov [TW+8*3], twtemph | |
1491 | vmovdqa xmm14, xmm13 | |
1492 | vmovdqa xmm13, [TW+16*1] | |
1493 | ||
1494 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 | |
1495 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1496 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1497 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1498 | vmovdqu [ptr_ciphertext+16*3], xmm4 | |
1499 | add ptr_ciphertext, 16*5 | |
1500 | vmovdqa xmm0, xmm14 | |
1501 | vmovdqa xmm8, xmm5 | |
1502 | jmp _steal_cipher | |
1503 | ||
1504 | _done_5: | |
1505 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 | |
1506 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1507 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1508 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1509 | vmovdqu [ptr_ciphertext+16*3], xmm4 | |
1510 | add ptr_ciphertext, 16*5 | |
1511 | vmovdqa xmm8, xmm5 | |
1512 | jmp _done | |
1513 | ||
; 4 blocks (<128 bytes total); extra tweak -> xmm12, displaced -> xmm13.
1514 | _num_blocks_is_4: | |
1515 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 | |
1516 | add ptr_plaintext, 16*4 | |
1517 | and N_val, 15 | |
1518 | je _done_4 | |
1519 | ||
1520 | _steal_cipher_4: | |
1521 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1522 | shl twtempl, 1 | |
1523 | adc twtemph, twtemph | |
1524 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1525 | xor twtempl, ghash_poly_8b_temp | |
1526 | mov [TW+8*2], twtempl | |
1527 | mov [TW+8*3], twtemph | |
1528 | vmovdqa xmm13, xmm12 | |
1529 | vmovdqa xmm12, [TW+16*1] | |
1530 | ||
1531 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 | |
1532 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1533 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1534 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1535 | add ptr_ciphertext, 16*4 | |
1536 | vmovdqa xmm0, xmm13 | |
1537 | vmovdqa xmm8, xmm4 | |
1538 | jmp _steal_cipher | |
1539 | ||
1540 | _done_4: | |
1541 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 | |
1542 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1543 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1544 | vmovdqu [ptr_ciphertext+16*2], xmm3 | |
1545 | add ptr_ciphertext, 16*4 | |
1546 | vmovdqa xmm8, xmm4 | |
1547 | jmp _done | |
1548 | ||
; 3 blocks (<128 bytes total); extra tweak -> xmm11, displaced -> xmm12.
1549 | _num_blocks_is_3: | |
1550 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 | |
1551 | add ptr_plaintext, 16*3 | |
1552 | and N_val, 15 | |
1553 | je _done_3 | |
1554 | ||
1555 | _steal_cipher_3: | |
1556 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1557 | shl twtempl, 1 | |
1558 | adc twtemph, twtemph | |
1559 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1560 | xor twtempl, ghash_poly_8b_temp | |
1561 | mov [TW+8*2], twtempl | |
1562 | mov [TW+8*3], twtemph | |
1563 | vmovdqa xmm12, xmm11 | |
1564 | vmovdqa xmm11, [TW+16*1] | |
1565 | ||
1566 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 | |
1567 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1568 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1569 | add ptr_ciphertext, 16*3 | |
1570 | vmovdqa xmm0, xmm12 | |
1571 | vmovdqa xmm8, xmm3 | |
1572 | jmp _steal_cipher | |
1573 | ||
1574 | _done_3: | |
1575 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 | |
1576 | vmovdqu [ptr_ciphertext+16*0], xmm1 | |
1577 | vmovdqu [ptr_ciphertext+16*1], xmm2 | |
1578 | add ptr_ciphertext, 16*3 | |
1579 | vmovdqa xmm8, xmm3 | |
1580 | jmp _done | |
1581 | ||
; 2 blocks (<128 bytes total); extra tweak -> xmm10, displaced -> xmm11.
1582 | _num_blocks_is_2: | |
1583 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 | |
1584 | add ptr_plaintext, 16*2 | |
1585 | and N_val, 15 | |
1586 | je _done_2 | |
1587 | ||
1588 | _steal_cipher_2: | |
1589 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1590 | shl twtempl, 1 | |
1591 | adc twtemph, twtemph | |
1592 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1593 | xor twtempl, ghash_poly_8b_temp | |
1594 | mov [TW+8*2], twtempl | |
1595 | mov [TW+8*3], twtemph | |
1596 | vmovdqa xmm11, xmm10 | |
1597 | vmovdqa xmm10, [TW+16*1] | |
1598 | ||
1599 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 | |
1600 | vmovdqu [ptr_ciphertext], xmm1 | |
1601 | add ptr_ciphertext, 16*2 | |
1602 | vmovdqa xmm0, xmm11 | |
1603 | vmovdqa xmm8, xmm2 | |
1604 | jmp _steal_cipher | |
1605 | ||
1606 | _done_2: | |
1607 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 | |
1608 | vmovdqu [ptr_ciphertext], xmm1 | |
1609 | add ptr_ciphertext, 16*2 | |
1610 | vmovdqa xmm8, xmm2 | |
1611 | jmp _done | |
1612 | ||
; 1 block (<128 bytes total); extra tweak -> xmm9, displaced -> xmm10.
; The single decrypted block is only stored via _done/_steal_cipher
; (ptr_ciphertext is advanced first, stores use ptr_ciphertext-16).
1613 | _num_blocks_is_1: | |
1614 | initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 | |
1615 | add ptr_plaintext, 16*1 | |
1616 | and N_val, 15 | |
1617 | je _done_1 | |
1618 | ||
1619 | _steal_cipher_1: | |
1620 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1621 | shl twtempl, 1 | |
1622 | adc twtemph, twtemph | |
1623 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1624 | xor twtempl, ghash_poly_8b_temp | |
1625 | mov [TW+8*2], twtempl | |
1626 | mov [TW+8*3], twtemph | |
1627 | vmovdqa xmm10, xmm9 | |
1628 | vmovdqa xmm9, [TW+16*1] | |
1629 | ||
1630 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 | |
1631 | add ptr_ciphertext, 16*1 | |
1632 | vmovdqa xmm0, xmm10 | |
1633 | vmovdqa xmm8, xmm1 | |
1634 | jmp _steal_cipher | |
1635 | ||
1636 | _done_1: | |
1637 | decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 | |
1638 | add ptr_ciphertext, 16*1 | |
1639 | vmovdqa xmm8, xmm1 | |
1640 | jmp _done | |
1641 | ||
1642 | section .data | |
1643 | align 16 | |
1644 | ||
; Shuffle-control table for ciphertext stealing: indexing at +N_val
; yields a left-shift-by-(16-N) control; indexing at +16-N yields a
; right-shift control (high bit set = zero the lane, see mask1).
1645 | vpshufb_shf_table: | |
1646 | ; use these values for shift constants for the vpshufb instruction | |
1647 | ; different alignments result in values as shown: | |
1648 | ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 | |
1649 | ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 | |
1650 | ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 | |
1651 | ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 | |
1652 | ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 | |
1653 | ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 | |
1654 | ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 | |
1655 | ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 | |
1656 | ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 | |
1657 | ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 | |
1658 | ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 | |
1659 | ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 | |
1660 | ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 | |
1661 | ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 | |
1662 | ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 | |
1663 | dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 | |
1664 | dq 0x0706050403020100, 0x000e0d0c0b0a0908 | |
1665 | ||
; High-bit mask: xor'ed into a shuffle control to flip "keep" lanes into
; "zero" lanes; also used as the vpblendvb selector in _steal_cipher.
1666 | mask1: | |
1667 | dq 0x8080808080808080, 0x8080808080808080 | |
1668 | ||
; Per-lane qword shift counts used by vpsllvq/vpsrlvq when building the
; tweaks T*2^0..2^7 in _start_by16/_start_by8.
1669 | const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 | |
1670 | const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 | |
1671 | const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 | |
1672 | const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 | |
1673 | ||
; vpshufb control moving byte 15 -> byte 0 and byte 7 -> byte 8 of each
; lane (0xff lanes are zeroed) — isolates the carry-out bits per block.
1674 | shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | |
1676 | %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. | |
1677 | %ifidn __OUTPUT_FORMAT__, win64 | |
1678 | global no_XTS_AES_128_dec_vaes | |
1679 | no_XTS_AES_128_dec_vaes: | |
1680 | %endif | |
1681 | %endif ; (AS_FEATURE_LEVEL) >= 10 |