]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
1e59de90 | 5 | ; modification, are permitted provided that the following conditions |
7c673cae FG |
6 | ; are met: |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ; XTS decrypt function with 256-bit AES | |
30 | ; expanded keys are not aligned | |
31 | ; plaintext and ciphertext are not aligned | |
32 | ; second key is stored in the stack as aligned to 16 Bytes | |
33 | ; first key is required only once, no need for storage of this key | |
34 | ||
35 | %include "reg_sizes.asm" | |
36 | ||
37 | default rel | | ; RIP-relative addressing by default (PIC/PIE-safe)
38 | %define TW rsp ; store 8 tweak values | |
39 | %define keys rsp + 16*8 ; store 15 expanded keys | |
40 | ||
41 | %ifidn __OUTPUT_FORMAT__, win64 | |
42 | %define _xmm rsp + 16*23 ; store xmm6:xmm15 | | ; Win64 ABI: xmm6-xmm15 are callee-saved
43 | %endif | |
44 | ||
45 | %ifidn __OUTPUT_FORMAT__, elf64 | |
46 | %define _gpr rsp + 16*23 ; store rbx | |
47 | %define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 | | ; odd multiple of 8 so rsp is 16B-aligned after the sub (entry rsp % 16 == 8)
48 | %else | |
49 | %define _gpr rsp + 16*33 ; store rdi, rsi, rbx | | ; win64: placed after the 16*10 xmm save area
50 | %define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 | |
51 | %endif | |
52 | ||
53 | %define GHASH_POLY 0x87 | | ; low byte of the XTS reduction polynomial (x^128 + x^7 + x^2 + x + 1); XORed into the low qword on carry-out of the tweak shift
54 | ||
55 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
56 | ;void XTS_AES_256_dec_expanded_key_sse( | |
57 | ; UINT8 *k2, // key used for tweaking, 16*15 bytes | |
58 | ; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes | |
59 | ; UINT8 *TW_initial, // initial tweak value, 16 bytes | |
60 | ; UINT64 N, // sector size, in bytes | |
61 | ; const UINT8 *ct, // ciphertext sector input data | |
62 | ; UINT8 *pt); // plaintext sector output data | |
63 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
64 | ||
65 | ; arguments for input parameters | |
66 | %ifidn __OUTPUT_FORMAT__, elf64 | |
67 | %xdefine ptr_key2 rdi | |
68 | %xdefine ptr_key1 rsi | |
69 | %xdefine T_val rdx | |
70 | %xdefine N_val rcx | |
71 | %xdefine ptr_plaintext r8 | |
72 | %xdefine ptr_ciphertext r9 | |
73 | %else | |
74 | %xdefine ptr_key2 rcx | |
75 | %xdefine ptr_key1 rdx | |
76 | %xdefine T_val r8 | |
77 | %xdefine N_val r9 | |
78 | %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] | | ; win64 5th arg comes from the stack; kept in r10
79 | %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] | | ; win64 6th arg comes from the stack; kept in r11
80 | %endif | |
81 | ||
82 | ; arguments for temp parameters | | ; scratch registers chosen from ABI-volatile regs where possible
83 | %ifidn __OUTPUT_FORMAT__, elf64 | |
84 | %define tmp1 rdi | |
85 | %define target_ptr_val rsi | |
86 | %define ghash_poly_8b r10 | |
87 | %define ghash_poly_8b_temp r11 | |
88 | %else | |
89 | %define tmp1 rcx | |
90 | %define target_ptr_val rdx | |
91 | %define ghash_poly_8b rdi | | ; win64: rdi/rsi are callee-saved -> saved in _gpr area
92 | %define ghash_poly_8b_temp rsi | |
93 | %endif | |
94 | ||
95 | %define twtempl rax ; global temp registers used for tweak computation | | ; low 64 bits of the current tweak
96 | %define twtemph rbx | | ; high 64 bits of the current tweak
97 | ||
98 | ||
99 | ; macro to encrypt the tweak value | |
100 | ||
101 | %macro encrypt_T 8 | | ; encrypt the initial tweak with k2 (aesenc, 14 rounds) while copying k1's 15 round keys into aligned stack storage; result written to [TW]
102 | %define %%xkey2 %1 | | ; scratch xmm: holds k2 round keys
103 | %define %%xstate_tweak %2 | | ; in: initial tweak; out: encrypted tweak
104 | %define %%xkey1 %3 | | ; scratch xmm: holds k1 round keys being copied
105 | %define %%xraw_key %4 | | ; unused in this macro
106 | %define %%xtmp %5 | | ; unused in this macro
107 | %define %%ptr_key2 %6 | | ; k2 schedule (unaligned ok - movdqu)
108 | %define %%ptr_key1 %7 | | ; k1 schedule (unaligned ok - movdqu)
109 | %define %%ptr_expanded_keys %8 | | ; 16B-aligned stack area (movdqa stores)
110 | ||
111 | movdqu %%xkey2, [%%ptr_key2] | |
112 | pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption | |
113 | ||
114 | movdqu %%xkey1, [%%ptr_key1 + 16*14] | | ; k1 copies interleaved with tweak rounds; offsets preserved (slot N -> slot N)
115 | movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack | |
116 | ||
117 | movdqu %%xkey2, [%%ptr_key2 + 16*1] | |
118 | aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption | |
119 | ||
120 | movdqu %%xkey1, [%%ptr_key1 + 16*13] | |
121 | movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack | |
122 | ||
123 | ||
124 | movdqu %%xkey2, [%%ptr_key2 + 16*2] | |
125 | aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption | |
126 | ||
127 | movdqu %%xkey1, [%%ptr_key1 + 16*12] | |
128 | movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack | |
129 | ||
130 | movdqu %%xkey2, [%%ptr_key2 + 16*3] | |
131 | aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption | |
132 | ||
133 | movdqu %%xkey1, [%%ptr_key1 + 16*11] | |
134 | movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack | |
135 | ||
136 | movdqu %%xkey2, [%%ptr_key2 + 16*4] | |
137 | aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption | |
138 | ||
139 | movdqu %%xkey1, [%%ptr_key1 + 16*10] | |
140 | movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack | |
141 | ||
142 | movdqu %%xkey2, [%%ptr_key2 + 16*5] | |
143 | aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption | |
144 | ||
145 | movdqu %%xkey1, [%%ptr_key1 + 16*9] | |
146 | movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack | |
147 | ||
148 | movdqu %%xkey2, [%%ptr_key2 + 16*6] | |
149 | aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption | |
150 | ||
151 | movdqu %%xkey1, [%%ptr_key1 + 16*8] | |
152 | movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack | |
153 | ||
154 | movdqu %%xkey2, [%%ptr_key2 + 16*7] | |
155 | aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption | |
156 | ||
157 | movdqu %%xkey1, [%%ptr_key1 + 16*7] | |
158 | movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack | |
159 | ||
160 | ||
161 | movdqu %%xkey2, [%%ptr_key2 + 16*8] | |
162 | aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption | |
163 | ||
164 | movdqu %%xkey1, [%%ptr_key1 + 16*6] | |
165 | movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack | |
166 | ||
167 | ||
168 | movdqu %%xkey2, [%%ptr_key2 + 16*9] | |
169 | aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption | |
170 | ||
171 | movdqu %%xkey1, [%%ptr_key1 + 16*5] | |
172 | movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack | |
173 | ||
174 | ||
175 | movdqu %%xkey2, [%%ptr_key2 + 16*10] | |
176 | aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption | |
177 | ||
178 | movdqu %%xkey1, [%%ptr_key1 + 16*4] | |
179 | movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack | |
180 | ||
181 | ||
182 | movdqu %%xkey2, [%%ptr_key2 + 16*11] | |
183 | aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption | |
184 | ||
185 | movdqu %%xkey1, [%%ptr_key1 + 16*3] | |
186 | movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack | |
187 | ||
188 | movdqu %%xkey2, [%%ptr_key2 + 16*12] | |
189 | aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption | |
190 | ||
191 | movdqu %%xkey1, [%%ptr_key1 + 16*2] | |
192 | movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack | |
193 | ||
194 | movdqu %%xkey2, [%%ptr_key2 + 16*13] | |
195 | aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption | |
196 | ||
197 | movdqu %%xkey1, [%%ptr_key1 + 16*1] | |
198 | movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack | |
199 | ||
200 | movdqu %%xkey2, [%%ptr_key2 + 16*14] | |
201 | aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption | | ; tweak is always ENcrypted, even in the decrypt path
202 | ||
203 | movdqu %%xkey1, [%%ptr_key1 + 16*0] | |
204 | movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack | |
205 | ||
206 | movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value | | ; becomes tweak slot 0 of the 8-entry TW array
207 | %endmacro | |
208 | ||
209 | ||
210 | ; generate initial tweak values | |
211 | ; load initial plaintext values | |
212 | %macro initialize 16 | | ; load 1-7 ciphertext blocks (unaligned) and derive tweaks 2..7 from tweak 1 by repeated GF(2^128) multiply-by-alpha
213 | ||
214 | %define %%ST1 %1 ; state 1 | |
215 | %define %%ST2 %2 ; state 2 | |
216 | %define %%ST3 %3 ; state 3 | |
217 | %define %%ST4 %4 ; state 4 | |
218 | %define %%ST5 %5 ; state 5 | |
219 | %define %%ST6 %6 ; state 6 | |
220 | %define %%ST7 %7 ; state 7 | |
221 | %define %%ST8 %8 ; state 8 | | ; ST8 unused here (at most 7 initial blocks)
222 | ||
223 | %define %%TW1 %9 ; tweak 1 | |
224 | %define %%TW2 %10 ; tweak 2 | |
225 | %define %%TW3 %11 ; tweak 3 | |
226 | %define %%TW4 %12 ; tweak 4 | |
227 | %define %%TW5 %13 ; tweak 5 | |
228 | %define %%TW6 %14 ; tweak 6 | |
229 | %define %%TW7 %15 ; tweak 7 | |
230 | ||
231 | %define %%num_initial_blocks %16 | | ; compile-time count, 1..7
232 | ||
233 | ||
234 | ; generate next Tweak values | | ; each step: 128-bit shift left via shl/adc, XOR GHASH_POLY into low qword iff the shifted-out bit was set (branch-free via cmovc)
235 | movdqa %%TW1, [TW+16*0] | |
236 | mov twtempl, [TW+8*0] | | ; keep tweak 1 in twtempl:twtemph for the GF arithmetic
237 | mov twtemph, [TW+8*1] | |
238 | movdqu %%ST1, [ptr_plaintext+16*0] | |
239 | %if (%%num_initial_blocks>=2) | |
240 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
241 | shl twtempl, 1 | |
242 | adc twtemph, twtemph | | ; CF from shl must survive to here; no flag-clobbering insn between
243 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
244 | xor twtempl, ghash_poly_8b_temp | |
245 | mov [TW+8*2], twtempl | |
246 | mov [TW+8*3], twtemph; | |
247 | movdqa %%TW2, [TW+16*1] | |
248 | movdqu %%ST2, [ptr_plaintext+16*1] | |
249 | %endif | |
250 | %if (%%num_initial_blocks>=3) | |
251 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
252 | shl twtempl, 1 | |
253 | adc twtemph, twtemph | |
254 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
255 | xor twtempl, ghash_poly_8b_temp | |
256 | mov [TW+8*4], twtempl | |
257 | mov [TW+8*5], twtemph; | |
258 | movdqa %%TW3, [TW+16*2] | |
259 | movdqu %%ST3, [ptr_plaintext+16*2] | |
260 | %endif | |
261 | %if (%%num_initial_blocks>=4) | |
262 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
263 | shl twtempl, 1 | |
264 | adc twtemph, twtemph | |
265 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
266 | xor twtempl, ghash_poly_8b_temp | |
267 | mov [TW+8*6], twtempl | |
268 | mov [TW+8*7], twtemph; | |
269 | movdqa %%TW4, [TW+16*3] | |
270 | movdqu %%ST4, [ptr_plaintext+16*3] | |
271 | %endif | |
272 | %if (%%num_initial_blocks>=5) | |
273 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
274 | shl twtempl, 1 | |
275 | adc twtemph, twtemph | |
276 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
277 | xor twtempl, ghash_poly_8b_temp | |
278 | mov [TW+8*8], twtempl | |
279 | mov [TW+8*9], twtemph; | |
280 | movdqa %%TW5, [TW+16*4] | |
281 | movdqu %%ST5, [ptr_plaintext+16*4] | |
282 | %endif | |
283 | %if (%%num_initial_blocks>=6) | |
284 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
285 | shl twtempl, 1 | |
286 | adc twtemph, twtemph | |
287 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
288 | xor twtempl, ghash_poly_8b_temp | |
289 | mov [TW+8*10], twtempl | |
290 | mov [TW+8*11], twtemph; | |
291 | movdqa %%TW6, [TW+16*5] | |
292 | movdqu %%ST6, [ptr_plaintext+16*5] | |
293 | %endif | |
294 | %if (%%num_initial_blocks>=7) | |
295 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
296 | shl twtempl, 1 | |
297 | adc twtemph, twtemph | |
298 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
299 | xor twtempl, ghash_poly_8b_temp | |
300 | mov [TW+8*12], twtempl | |
301 | mov [TW+8*13], twtemph; | |
302 | movdqa %%TW7, [TW+16*6] | |
303 | movdqu %%ST7, [ptr_plaintext+16*6] | |
304 | %endif | |
305 | ||
306 | ||
307 | ||
308 | %endmacro | |
309 | ||
310 | ||
311 | ; encrypt initial blocks of AES | |
312 | ; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted | |
313 | ; next 8 Tweak values are generated | |
314 | %macro encrypt_initial 18 | | ; decrypt 1-7 blocks (aesdec, 14 rounds); when %%lt128==0, also generate the NEXT 8 tweaks, interleaved across the rounds to hide latency
315 | %define %%ST1 %1 ; state 1 | |
316 | %define %%ST2 %2 ; state 2 | |
317 | %define %%ST3 %3 ; state 3 | |
318 | %define %%ST4 %4 ; state 4 | |
319 | %define %%ST5 %5 ; state 5 | |
320 | %define %%ST6 %6 ; state 6 | |
321 | %define %%ST7 %7 ; state 7 | |
322 | %define %%ST8 %8 ; state 8 | | ; ST8 unused here (at most 7 blocks)
323 | ||
324 | %define %%TW1 %9 ; tweak 1 | |
325 | %define %%TW2 %10 ; tweak 2 | |
326 | %define %%TW3 %11 ; tweak 3 | |
327 | %define %%TW4 %12 ; tweak 4 | |
328 | %define %%TW5 %13 ; tweak 5 | |
329 | %define %%TW6 %14 ; tweak 6 | |
330 | %define %%TW7 %15 ; tweak 7 | |
331 | %define %%T0 %16 ; Temp register | | ; holds the current round key from [keys]
332 | %define %%num_blocks %17 | |
333 | ; %%num_blocks blocks encrypted | |
334 | ; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 | |
335 | ||
336 | %define %%lt128 %18 ; less than 128 bytes | | ; nonzero => final partial batch: skip generating next tweaks
337 | ||
338 | ; xor Tweak value | | ; XTS pre-whitening: C XOR T before the block cipher
339 | pxor %%ST1, %%TW1 | |
340 | %if (%%num_blocks>=2) | |
341 | pxor %%ST2, %%TW2 | |
342 | %endif | |
343 | %if (%%num_blocks>=3) | |
344 | pxor %%ST3, %%TW3 | |
345 | %endif | |
346 | %if (%%num_blocks>=4) | |
347 | pxor %%ST4, %%TW4 | |
348 | %endif | |
349 | %if (%%num_blocks>=5) | |
350 | pxor %%ST5, %%TW5 | |
351 | %endif | |
352 | %if (%%num_blocks>=6) | |
353 | pxor %%ST6, %%TW6 | |
354 | %endif | |
355 | %if (%%num_blocks>=7) | |
356 | pxor %%ST7, %%TW7 | |
357 | %endif | |
358 | ||
359 | ||
360 | ; ARK | | ; AddRoundKey with round-0 key from the aligned stack schedule
361 | movdqa %%T0, [keys] | |
362 | pxor %%ST1, %%T0 | |
363 | %if (%%num_blocks>=2) | |
364 | pxor %%ST2, %%T0 | |
365 | %endif | |
366 | %if (%%num_blocks>=3) | |
367 | pxor %%ST3, %%T0 | |
368 | %endif | |
369 | %if (%%num_blocks>=4) | |
370 | pxor %%ST4, %%T0 | |
371 | %endif | |
372 | %if (%%num_blocks>=5) | |
373 | pxor %%ST5, %%T0 | |
374 | %endif | |
375 | %if (%%num_blocks>=6) | |
376 | pxor %%ST6, %%T0 | |
377 | %endif | |
378 | %if (%%num_blocks>=7) | |
379 | pxor %%ST7, %%T0 | |
380 | %endif | |
381 | ||
382 | ||
383 | %if (0 == %%lt128) | | ; begin GF multiply-by-alpha of the running tweak; shl/adc CF-chain is split across rounds (SSE insns below do not touch RFLAGS)
384 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
385 | shl twtempl, 1 | |
386 | adc twtemph, twtemph | |
387 | %endif | |
388 | ||
389 | ; round 1 | |
390 | movdqa %%T0, [keys + 16*1] | |
391 | aesdec %%ST1, %%T0 | |
392 | %if (%%num_blocks>=2) | |
393 | aesdec %%ST2, %%T0 | |
394 | %endif | |
395 | %if (%%num_blocks>=3) | |
396 | aesdec %%ST3, %%T0 | |
397 | %endif | |
398 | %if (%%num_blocks>=4) | |
399 | aesdec %%ST4, %%T0 | |
400 | %endif | |
401 | %if (%%num_blocks>=5) | |
402 | aesdec %%ST5, %%T0 | |
403 | %endif | |
404 | %if (%%num_blocks>=6) | |
405 | aesdec %%ST6, %%T0 | |
406 | %endif | |
407 | %if (%%num_blocks>=7) | |
408 | aesdec %%ST7, %%T0 | |
409 | %endif | |
410 | %if (0 == %%lt128) | |
411 | cmovc ghash_poly_8b_temp, ghash_poly_8b | | ; CF still valid from the adc before round 1
412 | xor twtempl, ghash_poly_8b_temp | |
413 | mov [TW + 8*0], twtempl ; next Tweak1 generated | |
414 | mov [TW + 8*1], twtemph | |
415 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
416 | %endif | |
417 | ||
418 | ; round 2 | |
419 | movdqa %%T0, [keys + 16*2] | |
420 | aesdec %%ST1, %%T0 | |
421 | %if (%%num_blocks>=2) | |
422 | aesdec %%ST2, %%T0 | |
423 | %endif | |
424 | %if (%%num_blocks>=3) | |
425 | aesdec %%ST3, %%T0 | |
426 | %endif | |
427 | %if (%%num_blocks>=4) | |
428 | aesdec %%ST4, %%T0 | |
429 | %endif | |
430 | %if (%%num_blocks>=5) | |
431 | aesdec %%ST5, %%T0 | |
432 | %endif | |
433 | %if (%%num_blocks>=6) | |
434 | aesdec %%ST6, %%T0 | |
435 | %endif | |
436 | %if (%%num_blocks>=7) | |
437 | aesdec %%ST7, %%T0 | |
438 | %endif | |
439 | ||
440 | %if (0 == %%lt128) | |
441 | shl twtempl, 1 | |
442 | adc twtemph, twtemph | |
443 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
444 | xor twtempl, ghash_poly_8b_temp | |
445 | mov [TW + 8*2], twtempl ; next Tweak2 generated | |
446 | %endif | |
447 | ||
448 | ; round 3 | |
449 | movdqa %%T0, [keys + 16*3] | |
450 | aesdec %%ST1, %%T0 | |
451 | %if (%%num_blocks>=2) | |
452 | aesdec %%ST2, %%T0 | |
453 | %endif | |
454 | %if (%%num_blocks>=3) | |
455 | aesdec %%ST3, %%T0 | |
456 | %endif | |
457 | %if (%%num_blocks>=4) | |
458 | aesdec %%ST4, %%T0 | |
459 | %endif | |
460 | %if (%%num_blocks>=5) | |
461 | aesdec %%ST5, %%T0 | |
462 | %endif | |
463 | %if (%%num_blocks>=6) | |
464 | aesdec %%ST6, %%T0 | |
465 | %endif | |
466 | %if (%%num_blocks>=7) | |
467 | aesdec %%ST7, %%T0 | |
468 | %endif | |
469 | %if (0 == %%lt128) | |
470 | mov [TW + 8*3], twtemph | | ; finish Tweak2's high half, then start the next multiply
471 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
472 | shl twtempl, 1 | |
473 | adc twtemph, twtemph | |
474 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
475 | %endif | |
476 | ||
477 | ; round 4 | |
478 | movdqa %%T0, [keys + 16*4] | |
479 | aesdec %%ST1, %%T0 | |
480 | %if (%%num_blocks>=2) | |
481 | aesdec %%ST2, %%T0 | |
482 | %endif | |
483 | %if (%%num_blocks>=3) | |
484 | aesdec %%ST3, %%T0 | |
485 | %endif | |
486 | %if (%%num_blocks>=4) | |
487 | aesdec %%ST4, %%T0 | |
488 | %endif | |
489 | %if (%%num_blocks>=5) | |
490 | aesdec %%ST5, %%T0 | |
491 | %endif | |
492 | %if (%%num_blocks>=6) | |
493 | aesdec %%ST6, %%T0 | |
494 | %endif | |
495 | %if (%%num_blocks>=7) | |
496 | aesdec %%ST7, %%T0 | |
497 | %endif | |
498 | ||
499 | %if (0 == %%lt128) | |
500 | xor twtempl, ghash_poly_8b_temp | |
501 | mov [TW + 8*4], twtempl ; next Tweak3 generated | |
502 | mov [TW + 8*5], twtemph | |
503 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
504 | shl twtempl, 1 | | ; CF consumed by the adc at the top of round 5's %if block
505 | %endif | |
506 | ||
507 | ; round 5 | |
508 | movdqa %%T0, [keys + 16*5] | |
509 | aesdec %%ST1, %%T0 | |
510 | %if (%%num_blocks>=2) | |
511 | aesdec %%ST2, %%T0 | |
512 | %endif | |
513 | %if (%%num_blocks>=3) | |
514 | aesdec %%ST3, %%T0 | |
515 | %endif | |
516 | %if (%%num_blocks>=4) | |
517 | aesdec %%ST4, %%T0 | |
518 | %endif | |
519 | %if (%%num_blocks>=5) | |
520 | aesdec %%ST5, %%T0 | |
521 | %endif | |
522 | %if (%%num_blocks>=6) | |
523 | aesdec %%ST6, %%T0 | |
524 | %endif | |
525 | %if (%%num_blocks>=7) | |
526 | aesdec %%ST7, %%T0 | |
527 | %endif | |
528 | ||
529 | %if (0 == %%lt128) | |
530 | adc twtemph, twtemph | |
531 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
532 | xor twtempl, ghash_poly_8b_temp | |
533 | mov [TW + 8*6], twtempl ; next Tweak4 generated | |
534 | mov [TW + 8*7], twtemph | |
535 | %endif | |
536 | ||
537 | ; round 6 | |
538 | movdqa %%T0, [keys + 16*6] | |
539 | aesdec %%ST1, %%T0 | |
540 | %if (%%num_blocks>=2) | |
541 | aesdec %%ST2, %%T0 | |
542 | %endif | |
543 | %if (%%num_blocks>=3) | |
544 | aesdec %%ST3, %%T0 | |
545 | %endif | |
546 | %if (%%num_blocks>=4) | |
547 | aesdec %%ST4, %%T0 | |
548 | %endif | |
549 | %if (%%num_blocks>=5) | |
550 | aesdec %%ST5, %%T0 | |
551 | %endif | |
552 | %if (%%num_blocks>=6) | |
553 | aesdec %%ST6, %%T0 | |
554 | %endif | |
555 | %if (%%num_blocks>=7) | |
556 | aesdec %%ST7, %%T0 | |
557 | %endif | |
558 | ||
559 | %if (0 == %%lt128) | |
560 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
561 | shl twtempl, 1 | |
562 | adc twtemph, twtemph | |
563 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
564 | xor twtempl, ghash_poly_8b_temp | |
565 | mov [TW + 8*8], twtempl ; next Tweak5 generated | |
566 | mov [TW + 8*9], twtemph | |
567 | %endif | |
568 | ||
569 | ; round 7 | |
570 | movdqa %%T0, [keys + 16*7] | |
571 | aesdec %%ST1, %%T0 | |
572 | %if (%%num_blocks>=2) | |
573 | aesdec %%ST2, %%T0 | |
574 | %endif | |
575 | %if (%%num_blocks>=3) | |
576 | aesdec %%ST3, %%T0 | |
577 | %endif | |
578 | %if (%%num_blocks>=4) | |
579 | aesdec %%ST4, %%T0 | |
580 | %endif | |
581 | %if (%%num_blocks>=5) | |
582 | aesdec %%ST5, %%T0 | |
583 | %endif | |
584 | %if (%%num_blocks>=6) | |
585 | aesdec %%ST6, %%T0 | |
586 | %endif | |
587 | %if (%%num_blocks>=7) | |
588 | aesdec %%ST7, %%T0 | |
589 | %endif | |
590 | ||
591 | %if (0 == %%lt128) | |
592 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
593 | shl twtempl, 1 | |
594 | adc twtemph, twtemph | |
595 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
596 | xor twtempl, ghash_poly_8b_temp | |
597 | mov [TW + 8*10], twtempl ; next Tweak6 generated | |
598 | mov [TW + 8*11], twtemph | |
599 | %endif | |
600 | ; round 8 | |
601 | movdqa %%T0, [keys + 16*8] | |
602 | aesdec %%ST1, %%T0 | |
603 | %if (%%num_blocks>=2) | |
604 | aesdec %%ST2, %%T0 | |
605 | %endif | |
606 | %if (%%num_blocks>=3) | |
607 | aesdec %%ST3, %%T0 | |
608 | %endif | |
609 | %if (%%num_blocks>=4) | |
610 | aesdec %%ST4, %%T0 | |
611 | %endif | |
612 | %if (%%num_blocks>=5) | |
613 | aesdec %%ST5, %%T0 | |
614 | %endif | |
615 | %if (%%num_blocks>=6) | |
616 | aesdec %%ST6, %%T0 | |
617 | %endif | |
618 | %if (%%num_blocks>=7) | |
619 | aesdec %%ST7, %%T0 | |
620 | %endif | |
621 | ||
622 | %if (0 == %%lt128) | |
623 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
624 | shl twtempl, 1 | |
625 | adc twtemph, twtemph | |
626 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
627 | xor twtempl, ghash_poly_8b_temp | |
628 | mov [TW + 8*12], twtempl ; next Tweak7 generated | |
629 | mov [TW + 8*13], twtemph | |
630 | %endif | |
631 | ; round 9 | |
632 | movdqa %%T0, [keys + 16*9] | |
633 | aesdec %%ST1, %%T0 | |
634 | %if (%%num_blocks>=2) | |
635 | aesdec %%ST2, %%T0 | |
636 | %endif | |
637 | %if (%%num_blocks>=3) | |
638 | aesdec %%ST3, %%T0 | |
639 | %endif | |
640 | %if (%%num_blocks>=4) | |
641 | aesdec %%ST4, %%T0 | |
642 | %endif | |
643 | %if (%%num_blocks>=5) | |
644 | aesdec %%ST5, %%T0 | |
645 | %endif | |
646 | %if (%%num_blocks>=6) | |
647 | aesdec %%ST6, %%T0 | |
648 | %endif | |
649 | %if (%%num_blocks>=7) | |
650 | aesdec %%ST7, %%T0 | |
651 | %endif | |
652 | ||
653 | %if (0 == %%lt128) | |
654 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
655 | shl twtempl, 1 | |
656 | adc twtemph, twtemph | |
657 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
658 | xor twtempl, ghash_poly_8b_temp | |
659 | mov [TW + 8*14], twtempl ; next Tweak8 generated | | ; all 8 next tweaks now in [TW]; twtempl:twtemph carries Tweak8 into the next batch
660 | mov [TW + 8*15], twtemph | |
661 | %endif | |
662 | ; round 10 | |
663 | movdqa %%T0, [keys + 16*10] | |
664 | aesdec %%ST1, %%T0 | |
665 | %if (%%num_blocks>=2) | |
666 | aesdec %%ST2, %%T0 | |
667 | %endif | |
668 | %if (%%num_blocks>=3) | |
669 | aesdec %%ST3, %%T0 | |
670 | %endif | |
671 | %if (%%num_blocks>=4) | |
672 | aesdec %%ST4, %%T0 | |
673 | %endif | |
674 | %if (%%num_blocks>=5) | |
675 | aesdec %%ST5, %%T0 | |
676 | %endif | |
677 | %if (%%num_blocks>=6) | |
678 | aesdec %%ST6, %%T0 | |
679 | %endif | |
680 | %if (%%num_blocks>=7) | |
681 | aesdec %%ST7, %%T0 | |
682 | %endif | |
683 | ; round 11 | |
684 | movdqa %%T0, [keys + 16*11] | |
685 | aesdec %%ST1, %%T0 | |
686 | %if (%%num_blocks>=2) | |
687 | aesdec %%ST2, %%T0 | |
688 | %endif | |
689 | %if (%%num_blocks>=3) | |
690 | aesdec %%ST3, %%T0 | |
691 | %endif | |
692 | %if (%%num_blocks>=4) | |
693 | aesdec %%ST4, %%T0 | |
694 | %endif | |
695 | %if (%%num_blocks>=5) | |
696 | aesdec %%ST5, %%T0 | |
697 | %endif | |
698 | %if (%%num_blocks>=6) | |
699 | aesdec %%ST6, %%T0 | |
700 | %endif | |
701 | %if (%%num_blocks>=7) | |
702 | aesdec %%ST7, %%T0 | |
703 | %endif | |
704 | ||
705 | ; round 12 | |
706 | movdqa %%T0, [keys + 16*12] | |
707 | aesdec %%ST1, %%T0 | |
708 | %if (%%num_blocks>=2) | |
709 | aesdec %%ST2, %%T0 | |
710 | %endif | |
711 | %if (%%num_blocks>=3) | |
712 | aesdec %%ST3, %%T0 | |
713 | %endif | |
714 | %if (%%num_blocks>=4) | |
715 | aesdec %%ST4, %%T0 | |
716 | %endif | |
717 | %if (%%num_blocks>=5) | |
718 | aesdec %%ST5, %%T0 | |
719 | %endif | |
720 | %if (%%num_blocks>=6) | |
721 | aesdec %%ST6, %%T0 | |
722 | %endif | |
723 | %if (%%num_blocks>=7) | |
724 | aesdec %%ST7, %%T0 | |
725 | %endif | |
726 | ||
727 | ; round 13 | |
728 | movdqa %%T0, [keys + 16*13] | |
729 | aesdec %%ST1, %%T0 | |
730 | %if (%%num_blocks>=2) | |
731 | aesdec %%ST2, %%T0 | |
732 | %endif | |
733 | %if (%%num_blocks>=3) | |
734 | aesdec %%ST3, %%T0 | |
735 | %endif | |
736 | %if (%%num_blocks>=4) | |
737 | aesdec %%ST4, %%T0 | |
738 | %endif | |
739 | %if (%%num_blocks>=5) | |
740 | aesdec %%ST5, %%T0 | |
741 | %endif | |
742 | %if (%%num_blocks>=6) | |
743 | aesdec %%ST6, %%T0 | |
744 | %endif | |
745 | %if (%%num_blocks>=7) | |
746 | aesdec %%ST7, %%T0 | |
747 | %endif | |
748 | ||
749 | ; round 14 | |
750 | movdqa %%T0, [keys + 16*14] | |
751 | aesdeclast %%ST1, %%T0 | |
752 | %if (%%num_blocks>=2) | |
753 | aesdeclast %%ST2, %%T0 | |
754 | %endif | |
755 | %if (%%num_blocks>=3) | |
756 | aesdeclast %%ST3, %%T0 | |
757 | %endif | |
758 | %if (%%num_blocks>=4) | |
759 | aesdeclast %%ST4, %%T0 | |
760 | %endif | |
761 | %if (%%num_blocks>=5) | |
762 | aesdeclast %%ST5, %%T0 | |
763 | %endif | |
764 | %if (%%num_blocks>=6) | |
765 | aesdeclast %%ST6, %%T0 | |
766 | %endif | |
767 | %if (%%num_blocks>=7) | |
768 | aesdeclast %%ST7, %%T0 | |
769 | %endif | |
770 | ||
771 | ; xor Tweak values | | ; XTS post-whitening: P = D_k1(C XOR T) XOR T
772 | pxor %%ST1, %%TW1 | |
773 | %if (%%num_blocks>=2) | |
774 | pxor %%ST2, %%TW2 | |
775 | %endif | |
776 | %if (%%num_blocks>=3) | |
777 | pxor %%ST3, %%TW3 | |
778 | %endif | |
779 | %if (%%num_blocks>=4) | |
780 | pxor %%ST4, %%TW4 | |
781 | %endif | |
782 | %if (%%num_blocks>=5) | |
783 | pxor %%ST5, %%TW5 | |
784 | %endif | |
785 | %if (%%num_blocks>=6) | |
786 | pxor %%ST6, %%TW6 | |
787 | %endif | |
788 | %if (%%num_blocks>=7) | |
789 | pxor %%ST7, %%TW7 | |
790 | %endif | |
791 | ||
792 | ||
793 | %if (0 == %%lt128) | |
794 | ; load next Tweak values | | ; only 7 loaded into regs; slot 8 stays in [TW+16*7] / twtempl:twtemph
795 | movdqa %%TW1, [TW + 16*0] | |
796 | movdqa %%TW2, [TW + 16*1] | |
797 | movdqa %%TW3, [TW + 16*2] | |
798 | movdqa %%TW4, [TW + 16*3] | |
799 | movdqa %%TW5, [TW + 16*4] | |
800 | movdqa %%TW6, [TW + 16*5] | |
801 | movdqa %%TW7, [TW + 16*6] | |
802 | ||
803 | %endif | |
804 | ||
805 | %endmacro | |
806 | ||
807 | ||
808 | ; Encrypt 8 blocks in parallel | |
809 | ; generate next 8 tweak values | |
810 | %macro encrypt_by_eight 18 | |
811 | %define %%ST1 %1 ; state 1 | |
812 | %define %%ST2 %2 ; state 2 | |
813 | %define %%ST3 %3 ; state 3 | |
814 | %define %%ST4 %4 ; state 4 | |
815 | %define %%ST5 %5 ; state 5 | |
816 | %define %%ST6 %6 ; state 6 | |
817 | %define %%ST7 %7 ; state 7 | |
818 | %define %%ST8 %8 ; state 8 | |
819 | %define %%TW1 %9 ; tweak 1 | |
820 | %define %%TW2 %10 ; tweak 2 | |
821 | %define %%TW3 %11 ; tweak 3 | |
822 | %define %%TW4 %12 ; tweak 4 | |
823 | %define %%TW5 %13 ; tweak 5 | |
824 | %define %%TW6 %14 ; tweak 6 | |
825 | %define %%TW7 %15 ; tweak 7 | |
826 | %define %%TW8 %16 ; tweak 8 | |
827 | %define %%T0 %17 ; Temp register | |
828 | %define %%last_eight %18 | |
829 | ||
830 | ; xor Tweak values | |
831 | pxor %%ST1, %%TW1 | |
832 | pxor %%ST2, %%TW2 | |
833 | pxor %%ST3, %%TW3 | |
834 | pxor %%ST4, %%TW4 | |
835 | pxor %%ST5, %%TW5 | |
836 | pxor %%ST6, %%TW6 | |
837 | pxor %%ST7, %%TW7 | |
838 | pxor %%ST8, %%TW8 | |
839 | ||
840 | ; ARK | |
841 | movdqa %%T0, [keys] | |
842 | pxor %%ST1, %%T0 | |
843 | pxor %%ST2, %%T0 | |
844 | pxor %%ST3, %%T0 | |
845 | pxor %%ST4, %%T0 | |
846 | pxor %%ST5, %%T0 | |
847 | pxor %%ST6, %%T0 | |
848 | pxor %%ST7, %%T0 | |
849 | pxor %%ST8, %%T0 | |
850 | ||
851 | %if (0 == %%last_eight) | |
852 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
853 | shl twtempl, 1 | |
854 | adc twtemph, twtemph | |
855 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
856 | %endif | |
857 | ; round 1 | |
858 | movdqa %%T0, [keys + 16*1] | |
859 | aesdec %%ST1, %%T0 | |
860 | aesdec %%ST2, %%T0 | |
861 | aesdec %%ST3, %%T0 | |
862 | aesdec %%ST4, %%T0 | |
863 | aesdec %%ST5, %%T0 | |
864 | aesdec %%ST6, %%T0 | |
865 | aesdec %%ST7, %%T0 | |
866 | aesdec %%ST8, %%T0 | |
867 | %if (0 == %%last_eight) | |
868 | xor twtempl, ghash_poly_8b_temp | |
869 | mov [TW + 8*0], twtempl | |
870 | mov [TW + 8*1], twtemph | |
871 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
872 | %endif | |
873 | ; round 2 | |
874 | movdqa %%T0, [keys + 16*2] | |
875 | aesdec %%ST1, %%T0 | |
876 | aesdec %%ST2, %%T0 | |
877 | aesdec %%ST3, %%T0 | |
878 | aesdec %%ST4, %%T0 | |
879 | aesdec %%ST5, %%T0 | |
880 | aesdec %%ST6, %%T0 | |
881 | aesdec %%ST7, %%T0 | |
882 | aesdec %%ST8, %%T0 | |
883 | %if (0 == %%last_eight) | |
884 | shl twtempl, 1 | |
885 | adc twtemph, twtemph | |
886 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
887 | xor twtempl, ghash_poly_8b_temp | |
888 | ||
889 | %endif | |
890 | ; round 3 | |
891 | movdqa %%T0, [keys + 16*3] | |
892 | aesdec %%ST1, %%T0 | |
893 | aesdec %%ST2, %%T0 | |
894 | aesdec %%ST3, %%T0 | |
895 | aesdec %%ST4, %%T0 | |
896 | aesdec %%ST5, %%T0 | |
897 | aesdec %%ST6, %%T0 | |
898 | aesdec %%ST7, %%T0 | |
899 | aesdec %%ST8, %%T0 | |
900 | %if (0 == %%last_eight) | |
901 | mov [TW + 8*2], twtempl | |
902 | mov [TW + 8*3], twtemph | |
903 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
904 | shl twtempl, 1 | |
905 | %endif | |
906 | ; round 4 | |
907 | movdqa %%T0, [keys + 16*4] | |
908 | aesdec %%ST1, %%T0 | |
909 | aesdec %%ST2, %%T0 | |
910 | aesdec %%ST3, %%T0 | |
911 | aesdec %%ST4, %%T0 | |
912 | aesdec %%ST5, %%T0 | |
913 | aesdec %%ST6, %%T0 | |
914 | aesdec %%ST7, %%T0 | |
915 | aesdec %%ST8, %%T0 | |
916 | %if (0 == %%last_eight) | |
917 | adc twtemph, twtemph | |
918 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
919 | xor twtempl, ghash_poly_8b_temp | |
920 | mov [TW + 8*4], twtempl | |
921 | %endif | |
922 | ; round 5 | |
923 | movdqa %%T0, [keys + 16*5] | |
924 | aesdec %%ST1, %%T0 | |
925 | aesdec %%ST2, %%T0 | |
926 | aesdec %%ST3, %%T0 | |
927 | aesdec %%ST4, %%T0 | |
928 | aesdec %%ST5, %%T0 | |
929 | aesdec %%ST6, %%T0 | |
930 | aesdec %%ST7, %%T0 | |
931 | aesdec %%ST8, %%T0 | |
932 | %if (0 == %%last_eight) | |
933 | mov [TW + 8*5], twtemph | |
934 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
935 | shl twtempl, 1 | |
936 | adc twtemph, twtemph | |
937 | %endif | |
938 | ; round 6 | |
939 | movdqa %%T0, [keys + 16*6] | |
940 | aesdec %%ST1, %%T0 | |
941 | aesdec %%ST2, %%T0 | |
942 | aesdec %%ST3, %%T0 | |
943 | aesdec %%ST4, %%T0 | |
944 | aesdec %%ST5, %%T0 | |
945 | aesdec %%ST6, %%T0 | |
946 | aesdec %%ST7, %%T0 | |
947 | aesdec %%ST8, %%T0 | |
948 | %if (0 == %%last_eight) | |
949 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
950 | xor twtempl, ghash_poly_8b_temp | |
951 | mov [TW + 8*6], twtempl | |
952 | mov [TW + 8*7], twtemph | |
953 | %endif | |
954 | ; round 7 | |
955 | movdqa %%T0, [keys + 16*7] | |
956 | aesdec %%ST1, %%T0 | |
957 | aesdec %%ST2, %%T0 | |
958 | aesdec %%ST3, %%T0 | |
959 | aesdec %%ST4, %%T0 | |
960 | aesdec %%ST5, %%T0 | |
961 | aesdec %%ST6, %%T0 | |
962 | aesdec %%ST7, %%T0 | |
963 | aesdec %%ST8, %%T0 | |
964 | %if (0 == %%last_eight) | |
965 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
966 | shl twtempl, 1 | |
967 | adc twtemph, twtemph | |
968 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
969 | %endif | |
970 | ; round 8 | |
971 | movdqa %%T0, [keys + 16*8] | |
972 | aesdec %%ST1, %%T0 | |
973 | aesdec %%ST2, %%T0 | |
974 | aesdec %%ST3, %%T0 | |
975 | aesdec %%ST4, %%T0 | |
976 | aesdec %%ST5, %%T0 | |
977 | aesdec %%ST6, %%T0 | |
978 | aesdec %%ST7, %%T0 | |
979 | aesdec %%ST8, %%T0 | |
980 | %if (0 == %%last_eight) | |
981 | xor twtempl, ghash_poly_8b_temp | |
982 | mov [TW + 8*8], twtempl | |
983 | mov [TW + 8*9], twtemph | |
984 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
985 | %endif | |
986 | ; round 9 | |
987 | movdqa %%T0, [keys + 16*9] | |
988 | aesdec %%ST1, %%T0 | |
989 | aesdec %%ST2, %%T0 | |
990 | aesdec %%ST3, %%T0 | |
991 | aesdec %%ST4, %%T0 | |
992 | aesdec %%ST5, %%T0 | |
993 | aesdec %%ST6, %%T0 | |
994 | aesdec %%ST7, %%T0 | |
995 | aesdec %%ST8, %%T0 | |
996 | %if (0 == %%last_eight) | |
997 | shl twtempl, 1 | |
998 | adc twtemph, twtemph | |
999 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1000 | xor twtempl, ghash_poly_8b_temp | |
1001 | %endif | |
1002 | ; round 10 | |
1003 | movdqa %%T0, [keys + 16*10] | |
1004 | aesdec %%ST1, %%T0 | |
1005 | aesdec %%ST2, %%T0 | |
1006 | aesdec %%ST3, %%T0 | |
1007 | aesdec %%ST4, %%T0 | |
1008 | aesdec %%ST5, %%T0 | |
1009 | aesdec %%ST6, %%T0 | |
1010 | aesdec %%ST7, %%T0 | |
1011 | aesdec %%ST8, %%T0 | |
1012 | %if (0 == %%last_eight) | |
1013 | mov [TW + 8*10], twtempl | |
1014 | mov [TW + 8*11], twtemph | |
1015 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1016 | shl twtempl, 1 | |
1017 | %endif | |
1018 | ; round 11 | |
1019 | movdqa %%T0, [keys + 16*11] | |
1020 | aesdec %%ST1, %%T0 | |
1021 | aesdec %%ST2, %%T0 | |
1022 | aesdec %%ST3, %%T0 | |
1023 | aesdec %%ST4, %%T0 | |
1024 | aesdec %%ST5, %%T0 | |
1025 | aesdec %%ST6, %%T0 | |
1026 | aesdec %%ST7, %%T0 | |
1027 | aesdec %%ST8, %%T0 | |
1028 | %if (0 == %%last_eight) | |
1029 | adc twtemph, twtemph | |
1030 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1031 | xor twtempl, ghash_poly_8b_temp | |
1032 | mov [TW + 8*12], twtempl | |
1033 | %endif | |
1034 | ; round 12 | |
1035 | movdqa %%T0, [keys + 16*12] | |
1036 | aesdec %%ST1, %%T0 | |
1037 | aesdec %%ST2, %%T0 | |
1038 | aesdec %%ST3, %%T0 | |
1039 | aesdec %%ST4, %%T0 | |
1040 | aesdec %%ST5, %%T0 | |
1041 | aesdec %%ST6, %%T0 | |
1042 | aesdec %%ST7, %%T0 | |
1043 | aesdec %%ST8, %%T0 | |
1044 | %if (0 == %%last_eight) | |
1045 | mov [TW + 8*13], twtemph | |
1046 | xor ghash_poly_8b_temp, ghash_poly_8b_temp | |
1047 | shl twtempl, 1 | |
1048 | adc twtemph, twtemph | |
1049 | %endif | |
1050 | ; round 13 | |
1051 | movdqa %%T0, [keys + 16*13] | |
1052 | aesdec %%ST1, %%T0 | |
1053 | aesdec %%ST2, %%T0 | |
1054 | aesdec %%ST3, %%T0 | |
1055 | aesdec %%ST4, %%T0 | |
1056 | aesdec %%ST5, %%T0 | |
1057 | aesdec %%ST6, %%T0 | |
1058 | aesdec %%ST7, %%T0 | |
1059 | aesdec %%ST8, %%T0 | |
1060 | %if (0 == %%last_eight) | |
1061 | cmovc ghash_poly_8b_temp, ghash_poly_8b | |
1062 | xor twtempl, ghash_poly_8b_temp | |
1063 | ; mov [TW + 8*14], twtempl | |
1064 | ; mov [TW + 8*15], twtemph | |
1065 | %endif | |
1066 | ; round 14 | |
1067 | movdqa %%T0, [keys + 16*14] | |
1068 | aesdeclast %%ST1, %%T0 | |
1069 | aesdeclast %%ST2, %%T0 | |
1070 | aesdeclast %%ST3, %%T0 | |
1071 | aesdeclast %%ST4, %%T0 | |
1072 | aesdeclast %%ST5, %%T0 | |
1073 | aesdeclast %%ST6, %%T0 | |
1074 | aesdeclast %%ST7, %%T0 | |
1075 | aesdeclast %%ST8, %%T0 | |
1076 | ||
1077 | ; xor Tweak values | |
1078 | pxor %%ST1, %%TW1 | |
1079 | pxor %%ST2, %%TW2 | |
1080 | pxor %%ST3, %%TW3 | |
1081 | pxor %%ST4, %%TW4 | |
1082 | pxor %%ST5, %%TW5 | |
1083 | pxor %%ST6, %%TW6 | |
1084 | pxor %%ST7, %%TW7 | |
1085 | pxor %%ST8, %%TW8 | |
1086 | ||
1087 | mov [TW + 8*14], twtempl | |
1088 | mov [TW + 8*15], twtemph | |
1089 | ; load next Tweak values | |
1090 | movdqa %%TW1, [TW + 16*0] | |
1091 | movdqa %%TW2, [TW + 16*1] | |
1092 | movdqa %%TW3, [TW + 16*2] | |
1093 | movdqa %%TW4, [TW + 16*3] | |
1094 | movdqa %%TW5, [TW + 16*4] | |
1095 | movdqa %%TW6, [TW + 16*5] | |
1096 | movdqa %%TW7, [TW + 16*6] | |
1097 | ||
1098 | %endmacro | |
1099 | ||
1100 | ||
section .text

;-----------------------------------------------------------------------
; XTS_AES_256_dec_expanded_key_sse(key2, key1, T_val, N, ct, pt)
; AES-XTS 256-bit decrypt, SSE + AES-NI, with pre-expanded round keys.
; Register aliases (ptr_key1/ptr_key2, T_val, N_val, ptr_plaintext,
; ptr_ciphertext, TW, keys, _gpr, _xmm, twtempl/twtemph, ghash_poly_8b,
; ghash_poly_8b_temp, target_ptr_val, tmp1, VARIABLE_OFFSET, GHASH_POLY)
; and the helper macros (encrypt_T, initialize, encrypt_initial,
; encrypt_by_eight) are defined earlier in this file (not visible in this
; chunk).
; NOTE(review): the "encrypt_*" / "ptr_plaintext" names are reused from the
; encrypt variant of this file; here the data path performs AES decryption
; (aesdec/aesdeclast) and ptr_plaintext is really the ciphertext input.
;-----------------------------------------------------------------------
mk_global XTS_AES_256_dec_expanded_key_sse, function
XTS_AES_256_dec_expanded_key_sse:
	endbranch

	sub	rsp, VARIABLE_OFFSET		; local frame: tweak table (TW) + register save area

	; save callee-saved registers (Win64 additionally owns rdi/rsi/xmm6-15)
	mov	[_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
	mov	[_gpr + 8*1], rdi
	mov	[_gpr + 8*2], rsi

	movdqa	[_xmm + 16*0], xmm6
	movdqa	[_xmm + 16*1], xmm7
	movdqa	[_xmm + 16*2], xmm8
	movdqa	[_xmm + 16*3], xmm9
	movdqa	[_xmm + 16*4], xmm10
	movdqa	[_xmm + 16*5], xmm11
	movdqa	[_xmm + 16*6], xmm12
	movdqa	[_xmm + 16*7], xmm13
	movdqa	[_xmm + 16*8], xmm14
	movdqa	[_xmm + 16*9], xmm15
%endif

	mov	ghash_poly_8b, GHASH_POLY	; load 0x87 to ghash_poly_8b (GF(2^128) feedback byte for tweak doubling)


	movdqu	xmm1, [T_val]			; read initial Tweak value
	pxor	xmm4, xmm4			; for key expansion
	encrypt_T	xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys


%ifidn __OUTPUT_FORMAT__, win64
	mov	ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]	; plaintext pointer (5th arg, on stack in Win64 ABI)
	mov	ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]	; ciphertext pointer (6th arg)
%endif



	; target_ptr_val = ciphertext address where the 8-block main loop must
	; stop: round N down to whole blocks, then hold back 128 bytes — the
	; final 8 blocks are processed separately (so cipher stealing can be
	; handled there) and are not stitched with tweak calculations.
	mov	target_ptr_val, N_val
	and	target_ptr_val, -16		; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
	sub	target_ptr_val, 128		; adjust: the last 8 blocks are not stitched with Tweak calculations
	jl	_less_than_128_bytes		; fewer than 8 whole blocks remain after rounding

	add	target_ptr_val, ptr_ciphertext


	; dispatch on (whole-block count) mod 8 to align the bulk loop;
	; (N_val & (7<<4)) >> 4 = block count mod 8.  Falls through to the
	; 7-block case when no earlier compare matches.
	mov	tmp1, N_val
	and	tmp1, (7 << 4)
	jz	_initial_num_blocks_is_0

	cmp	tmp1, (4 << 4)
	je	_initial_num_blocks_is_4



	cmp	tmp1, (6 << 4)
	je	_initial_num_blocks_is_6

	cmp	tmp1, (5 << 4)
	je	_initial_num_blocks_is_5



	cmp	tmp1, (3 << 4)
	je	_initial_num_blocks_is_3

	cmp	tmp1, (2 << 4)
	je	_initial_num_blocks_is_2

	cmp	tmp1, (1 << 4)
	je	_initial_num_blocks_is_1

; Leading partial group: process (block count mod 8) blocks once, so the
; remaining length is a multiple of 8 blocks for _main_loop.  Each case:
; load+decrypt k blocks (initialize also prepares the first tweaks —
; macro defined earlier in the file), advance both pointers by 16*k, then
; enter the main loop (or jump straight to the final-8 handling).
_initial_num_blocks_is_7:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
	add	ptr_plaintext, 16*7
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6
	movdqu	[ptr_ciphertext+16*6], xmm7
	add	ptr_ciphertext, 16*7

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_6:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
	add	ptr_plaintext, 16*6
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6
	add	ptr_ciphertext, 16*6

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_5:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
	add	ptr_plaintext, 16*5
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	add	ptr_ciphertext, 16*5

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_4:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
	add	ptr_plaintext, 16*4
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	add	ptr_ciphertext, 16*4

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop


_initial_num_blocks_is_3:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
	add	ptr_plaintext, 16*3
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	add	ptr_ciphertext, 16*3

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_2:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
	add	ptr_plaintext, 16*2
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
	; store ciphertext
	movdqu	[ptr_ciphertext], xmm1
	movdqu	[ptr_ciphertext+16], xmm2
	add	ptr_ciphertext, 16*2

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop

_initial_num_blocks_is_1:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
	add	ptr_plaintext, 16*1
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
	; store ciphertext
	movdqu	[ptr_ciphertext], xmm1
	add	ptr_ciphertext, 16

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop

_initial_num_blocks_is_0:
	; No leading partial group.  Prime twtempl/twtemph with tweak 0 and
	; expand tweaks 1..7 into the TW table; xmm9..xmm15 cache tweaks 0..6
	; (tweak 7 stays memory-only — SSE has no xmm16).
	; Tweak-doubling idiom (repeated below): shift the 128-bit tweak left
	; by 1 (shl low / adc high) and, if a bit carried out, xor GHASH_POLY
	; (0x87) into the low byte — GF(2^128) multiplication by x.
	mov	twtempl, [TW+8*0]
	mov	twtemph, [TW+8*1]
	movdqa	xmm9, [TW+16*0]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph
	movdqa	xmm10, [TW+16*1]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*4], twtempl
	mov	[TW+8*5], twtemph
	movdqa	xmm11, [TW+16*2]


	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*6], twtempl
	mov	[TW+8*7], twtemph
	movdqa	xmm12, [TW+16*3]


	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*8], twtempl
	mov	[TW+8*9], twtemph
	movdqa	xmm13, [TW+16*4]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*10], twtempl
	mov	[TW+8*11], twtemph
	movdqa	xmm14, [TW+16*5]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*12], twtempl
	mov	[TW+8*13], twtemph
	movdqa	xmm15, [TW+16*6]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*14], twtempl
	mov	[TW+8*15], twtemph
	;movdqa	xmm16, [TW+16*7]	; tweak 7 has no register home in SSE

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight
_main_loop:
	; Bulk path: decrypt 8 blocks per iteration, stitched with computation
	; of the next 8 tweaks (done inside encrypt_by_eight, last_eight=0).
	; load plaintext (XTS input data; this is the decrypt direction)
	movdqu	xmm1, [ptr_plaintext+16*0]
	movdqu	xmm2, [ptr_plaintext+16*1]
	movdqu	xmm3, [ptr_plaintext+16*2]
	movdqu	xmm4, [ptr_plaintext+16*3]
	movdqu	xmm5, [ptr_plaintext+16*4]
	movdqu	xmm6, [ptr_plaintext+16*5]
	movdqu	xmm7, [ptr_plaintext+16*6]
	movdqu	xmm8, [ptr_plaintext+16*7]

	add	ptr_plaintext, 128

	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0

	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6
	movdqu	[ptr_ciphertext+16*6], xmm7
	movdqu	[ptr_ciphertext+16*7], xmm8
	add	ptr_ciphertext, 128

	cmp	ptr_ciphertext, target_ptr_val
	jne	_main_loop

_last_eight:
	; Final 8 whole blocks.  If there is also a partial tail block,
	; cipher stealing applies.
	and	N_val, 15		; N_val = N_val mod 16 = bytes in partial tail
	je	_done_final		; no partial block -> plain final 8 blocks

	; Partial tail present: generate one more Tweak value and reorder —
	; for XTS decrypt with stealing, the second-to-last (full) block uses
	; the NEW last tweak while the stolen block uses the previous one.
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	movdqa	xmm1, [TW + 16*7]
	movdqa	[TW + 16*0], xmm1	; swap tweak values for cipher stealing for decrypt

	mov	[TW + 16*7], twtempl
	mov	[TW + 16*7+8], twtemph

	; load plaintext
	movdqu	xmm1, [ptr_plaintext+16*0]
	movdqu	xmm2, [ptr_plaintext+16*1]
	movdqu	xmm3, [ptr_plaintext+16*2]
	movdqu	xmm4, [ptr_plaintext+16*3]
	movdqu	xmm5, [ptr_plaintext+16*4]
	movdqu	xmm6, [ptr_plaintext+16*5]
	movdqu	xmm7, [ptr_plaintext+16*6]
	movdqu	xmm8, [ptr_plaintext+16*7]
	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

	; store ciphertext for the first 7 blocks; block 8 (xmm8) is finished
	; and stored by the _steal_cipher path
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6
	movdqu	[ptr_ciphertext+16*6], xmm7
	jmp	_steal_cipher


_done_final:
	; Length was an exact multiple of 16: decrypt the final 8 blocks with
	; no stealing (last_eight=1 skips further tweak generation).
	; load plaintext
	movdqu	xmm1, [ptr_plaintext+16*0]
	movdqu	xmm2, [ptr_plaintext+16*1]
	movdqu	xmm3, [ptr_plaintext+16*2]
	movdqu	xmm4, [ptr_plaintext+16*3]
	movdqu	xmm5, [ptr_plaintext+16*4]
	movdqu	xmm6, [ptr_plaintext+16*5]
	movdqu	xmm7, [ptr_plaintext+16*6]
	movdqu	xmm8, [ptr_plaintext+16*7]
	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

	; store ciphertext; block 8 (xmm8) is stored at _done
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6
	movdqu	[ptr_ciphertext+16*6], xmm7

	jmp	_done

1447 | ||
_steal_cipher:
	; Ciphertext stealing for the trailing partial block.
	; On entry: xmm8 = decrypted second-to-last block, N_val = tail bytes,
	; pointers positioned so the last full block is at offset 112 (16*7).


	movdqa	xmm2, xmm8		; keep full decrypted block; its tail bytes are blended in below

	; shift xmm8 to the left by 16-N_val bytes
	lea	twtempl, [pshufb_shf_table]
	movdqu	xmm0, [twtempl+N_val]
	pshufb	xmm8, xmm0


	movdqu	xmm3, [ptr_plaintext + 112 + N_val]	; state register is temporarily xmm3 to eliminate a move
	movdqu	[ptr_ciphertext + 112 + N_val], xmm8	; emit the N_val stolen output bytes

	; shift xmm3 to the right by 16-N_val bytes
	lea	twtempl, [pshufb_shf_table +16]
	sub	twtempl, N_val
	movdqu	xmm0, [twtempl]
	pxor	xmm0, [mask1]		; set high bit of pad lanes so pblendvb selects from xmm2 there
	pshufb	xmm3, xmm0

	pblendvb	xmm3, xmm2	; xmm0 is implicit blend mask

	; xor Tweak value
	movdqa	xmm8, [TW]
	pxor	xmm8, xmm3	; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped


	; decrypt last block with cipher stealing (full 14-round AES-256)
	pxor	xmm8, [keys]			; ARK
	aesdec	xmm8, [keys + 16*1]		; round 1
	aesdec	xmm8, [keys + 16*2]		; round 2
	aesdec	xmm8, [keys + 16*3]		; round 3
	aesdec	xmm8, [keys + 16*4]		; round 4
	aesdec	xmm8, [keys + 16*5]		; round 5
	aesdec	xmm8, [keys + 16*6]		; round 6
	aesdec	xmm8, [keys + 16*7]		; round 7
	aesdec	xmm8, [keys + 16*8]		; round 8
	aesdec	xmm8, [keys + 16*9]		; round 9
	aesdec	xmm8, [keys + 16*10]		; round 10 (comment fixed; was mislabeled "round 9")
	aesdec	xmm8, [keys + 16*11]		; round 11
	aesdec	xmm8, [keys + 16*12]		; round 12
	aesdec	xmm8, [keys + 16*13]		; round 13
	aesdeclast	xmm8, [keys + 16*14]	; round 14 (final)

	; xor Tweak value
	pxor	xmm8, [TW]

_done:
	; store last ciphertext value
	movdqu	[ptr_ciphertext+16*7], xmm8

_ret_:
	; restore callee-saved state and return
	mov	rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
	mov	rdi, [_gpr + 8*1]
	mov	rsi, [_gpr + 8*2]


	movdqa	xmm6, [_xmm + 16*0]
	movdqa	xmm7, [_xmm + 16*1]
	movdqa	xmm8, [_xmm + 16*2]
	movdqa	xmm9, [_xmm + 16*3]
	movdqa	xmm10, [_xmm + 16*4]
	movdqa	xmm11, [_xmm + 16*5]
	movdqa	xmm12, [_xmm + 16*6]
	movdqa	xmm13, [_xmm + 16*7]
	movdqa	xmm14, [_xmm + 16*8]
	movdqa	xmm15, [_xmm + 16*9]
%endif

	add	rsp, VARIABLE_OFFSET

	ret
1524 | ||
1525 | ||
1526 | ||
1527 | ||
1528 | ||
_less_than_128_bytes:
	; Short-input path: fewer than 8 whole blocks in total.
	; XTS requires at least one full block; bail out otherwise.
	cmp	N_val, 16
	jb	_ret_

	; dispatch on whole-block count (1..7); falls through to 7
	mov	tmp1, N_val
	and	tmp1, (7 << 4)
	cmp	tmp1, (6 << 4)
	je	_num_blocks_is_6
	cmp	tmp1, (5 << 4)
	je	_num_blocks_is_5
	cmp	tmp1, (4 << 4)
	je	_num_blocks_is_4
	cmp	tmp1, (3 << 4)
	je	_num_blocks_is_3
	cmp	tmp1, (2 << 4)
	je	_num_blocks_is_2
	cmp	tmp1, (1 << 4)
	je	_num_blocks_is_1




; Each _num_blocks_is_k case below follows the same pattern:
;  - initialize/decrypt k blocks;
;  - bias both pointers back by 16*(8-k) so the shared _steal_cipher/_done
;    code, which addresses the final block at offset 16*7 (112+N_val),
;    lands on the correct location;
;  - if a partial tail exists (_steal_cipher_k): generate one more tweak
;    into TW[1], swap it with the last block's tweak (decrypt stealing
;    order), re-run the last block, and finish in _steal_cipher;
;  - otherwise (_done_k): store k-1 blocks and finish block k in _done.
_num_blocks_is_7:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7

	sub	ptr_plaintext, 16*1

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_7

_steal_cipher_7:
	; next tweak = 2*tweak (GF(2^128)); see doubling idiom above
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm15	; swap tweaks: stolen block uses old last tweak
	movdqa	xmm15, [TW+16*1]	; last full block uses the new tweak

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6

	sub	ptr_ciphertext, 16*1
	movdqa	xmm8, xmm7
	jmp	_steal_cipher

_done_7:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5
	movdqu	[ptr_ciphertext+16*5], xmm6

	sub	ptr_ciphertext, 16*1
	movdqa	xmm8, xmm7
	jmp	_done






_num_blocks_is_6:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6

	sub	ptr_plaintext, 16*2

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_6

_steal_cipher_6:
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm14
	movdqa	xmm14, [TW+16*1]

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5

	sub	ptr_ciphertext, 16*2
	movdqa	xmm8, xmm6
	jmp	_steal_cipher

_done_6:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4
	movdqu	[ptr_ciphertext+16*4], xmm5

	sub	ptr_ciphertext, 16*2
	movdqa	xmm8, xmm6
	jmp	_done





_num_blocks_is_5:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5

	sub	ptr_plaintext, 16*3

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_5

_steal_cipher_5:
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm13
	movdqa	xmm13, [TW+16*1]

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4

	sub	ptr_ciphertext, 16*3
	movdqa	xmm8, xmm5
	jmp	_steal_cipher

_done_5:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3
	movdqu	[ptr_ciphertext+16*3], xmm4

	sub	ptr_ciphertext, 16*3
	movdqa	xmm8, xmm5
	jmp	_done





_num_blocks_is_4:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4

	sub	ptr_plaintext, 16*4

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_4

_steal_cipher_4:
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm12
	movdqa	xmm12, [TW+16*1]

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3

	sub	ptr_ciphertext, 16*4
	movdqa	xmm8, xmm4
	jmp	_steal_cipher

_done_4:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2
	movdqu	[ptr_ciphertext+16*2], xmm3

	sub	ptr_ciphertext, 16*4
	movdqa	xmm8, xmm4
	jmp	_done




_num_blocks_is_3:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3

	sub	ptr_plaintext, 16*5

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_3

_steal_cipher_3:
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm11
	movdqa	xmm11, [TW+16*1]

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2

	sub	ptr_ciphertext, 16*5
	movdqa	xmm8, xmm3
	jmp	_steal_cipher

_done_3:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
	; store ciphertext
	movdqu	[ptr_ciphertext+16*0], xmm1
	movdqu	[ptr_ciphertext+16*1], xmm2

	sub	ptr_ciphertext, 16*5
	movdqa	xmm8, xmm3
	jmp	_done






_num_blocks_is_2:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2

	sub	ptr_plaintext, 16*6

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_2

_steal_cipher_2:
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm10
	movdqa	xmm10, [TW+16*1]

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
	; store ciphertext
	movdqu	[ptr_ciphertext], xmm1

	sub	ptr_ciphertext, 16*6
	movdqa	xmm8, xmm2
	jmp	_steal_cipher

_done_2:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
	; store ciphertext
	movdqu	[ptr_ciphertext], xmm1

	sub	ptr_ciphertext, 16*6
	movdqa	xmm8, xmm2
	jmp	_done













_num_blocks_is_1:
	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1

	sub	ptr_plaintext, 16*7

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_1

_steal_cipher_1:
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	movdqa	[TW + 16*0], xmm9
	movdqa	xmm9, [TW+16*1]

	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
	; no full-block stores here: the single block is finished in _steal_cipher

	sub	ptr_ciphertext, 16*7
	movdqa	xmm8, xmm1
	jmp	_steal_cipher

_done_1:
	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
	; single block is stored at _done

	sub	ptr_ciphertext, 16*7
	movdqa	xmm8, xmm1
	jmp	_done

section .data
align 16

pshufb_shf_table:
; Shuffle-control constants for byte-wise left/right shifts via pshufb.
; Loading 16 bytes at offset k gives a "shl (16-k)" control; loading at
; offset (16 - k) from the second row gives a "shr k" control.  Lanes with
; the high bit set (0x8x) produce zero in pshufb output.
; Different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c	; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d	; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e	; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f	; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100	; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201	; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302	; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403	; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504	; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605	; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706	; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807	; shl 1  (16-15) / shr15
	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
	dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask1:
	; 0x80 in every byte: xor'ed with a shuffle control to flip which
	; lanes pshufb zeroes / pblendvb selects
	dq 0x8080808080808080, 0x8080808080808080
1897 | ||
1898 |