;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; XTS decrypt function with 256-bit AES
; expanded keys are not aligned
; plaintext and ciphertext are not aligned
; second key is stored in the stack as aligned to 16 Bytes
; first key is required only once, no need for storage of this key

%include "reg_sizes.asm"

default rel
%define TW   rsp        ; store 8 tweak values
%define keys rsp + 16*8 ; store 15 expanded keys

%ifidn __OUTPUT_FORMAT__, win64
        %define _xmm rsp + 16*23 ; store xmm6:xmm15
%endif

%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr rsp + 16*23 ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
%else
%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
%endif

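; Stack frame sketch (offsets from rsp after the prologue), following the
; defines above:
;   rsp + 0      .. rsp + 16*8  - 1 : TW,   8 precomputed tweak values
;   rsp + 16*8   .. rsp + 16*23 - 1 : keys, 15 round keys, 16B aligned
;   elf64: _gpr at rsp + 16*23 (rbx); frame = 16*8 + 16*15 + 8 = 376
;   win64: _xmm at rsp + 16*23 (xmm6:xmm15), _gpr at rsp + 16*33
;          (rdi, rsi, rbx); frame = 552
; Both frame sizes are odd multiples of 8: the call pushed 8 bytes, so
; rsp - VARIABLE_OFFSET is 16-byte aligned and the vmovdqa accesses to
; TW/keys are legal.
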
%define GHASH_POLY 0x87

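; XTS steps its tweak from block to block by multiplying it by x (the
; "alpha" of IEEE P1619) in GF(2^128) reduced by x^128 + x^7 + x^2 + x + 1;
; GHASH_POLY above is the low byte (0x87) of that reduction polynomial.
; Throughout this file the multiply is done in scalar registers as a
; 128-bit left shift plus a conditional xor. Equivalent C sketch
; (illustrative only, not part of the build):
;
;   void xts_mul_alpha(uint64_t t[2])            /* t[0]=low, t[1]=high */
;   {
;       uint64_t carry = t[1] >> 63;             /* bit shifted out     */
;       t[1] = (t[1] << 1) | (t[0] >> 63);       /* adc twtemph,twtemph */
;       t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0); /* shl + cmovc + xor   */
;   }
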
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_256_dec_expanded_key_avx(
;               UINT8 *k2,              // key used for tweaking, 16*15 bytes
;               UINT8 *k1,              // key used for "ECB" decryption, 16*15 bytes
;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
;               UINT64 N,               // sector size, in bytes
;               const UINT8 *ct,        // ciphertext sector input data
;               UINT8 *pt);             // plaintext sector output data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; arguments for input parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %xdefine ptr_key2 rdi
        %xdefine ptr_key1 rsi
        %xdefine T_val rdx
        %xdefine N_val rcx
        %xdefine ptr_plaintext r8
        %xdefine ptr_ciphertext r9
%else
        %xdefine ptr_key2 rcx
        %xdefine ptr_key1 rdx
        %xdefine T_val r8
        %xdefine N_val r9
        %xdefine ptr_plaintext r10      ; [rsp + VARIABLE_OFFSET + 8*5]
        %xdefine ptr_ciphertext r11     ; [rsp + VARIABLE_OFFSET + 8*6]
%endif

; arguments for temp parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %define tmp1 rdi
        %define target_ptr_val rsi
        %define ghash_poly_8b r10
        %define ghash_poly_8b_temp r11
%else
        %define tmp1 rcx
        %define target_ptr_val rdx
        %define ghash_poly_8b rdi
        %define ghash_poly_8b_temp rsi
%endif

%define twtempl rax     ; global temp registers used for tweak computation
%define twtemph rbx

; macro to encrypt the tweak value

%macro  encrypt_T 8
%define %%xkey2             %1
%define %%xstate_tweak      %2
%define %%xkey1             %3
%define %%xraw_key          %4
%define %%xtmp              %5
%define %%ptr_key2          %6
%define %%ptr_key1          %7
%define %%ptr_expanded_keys %8

        vmovdqu %%xkey2, [%%ptr_key2]
        vpxor   %%xstate_tweak, %%xkey2                 ; ARK for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
        vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1    ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 1 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
        vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1    ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 2 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
        vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1    ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 3 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
        vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1    ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 4 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
        vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1    ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 5 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
        vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 6 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
        vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 7 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
        vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 8 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
        vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 9 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
        vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 10 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
        vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 11 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
        vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 12 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
        vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 13 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
        vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
        vaesenclast %%xstate_tweak, %%xkey2             ; round 14 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
        vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1     ; store round keys in stack

        vmovdqa [TW], %%xstate_tweak                    ; Store the encrypted Tweak value
%endmacro
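
; Note on encrypt_T: the tweak is always run through the AES *encryption*
; rounds of k2 (vaesenc), even in this decrypt routine, because P1619
; defines the tweak as T = AES-enc(k2, sector number). Only the data path
; uses the decryption schedule of k1 (vaesdec below); those round keys
; are copied 1:1 onto the aligned stack area, interleaved with the tweak
; rounds so the copies hide under the aesenc latency. Per-block data
; flow, as an illustrative C-style sketch (not part of the build):
;
;   /* one 16-byte block of XTS-AES-256 decryption (sketch) */
;   PP = C ^ T;                     /* tweak pre-whitening        */
;   P  = AES256_dec(k1, PP) ^ T;    /* 14 rounds + post-whitening */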


; generate initial tweak values
; load initial plaintext values
%macro  initialize 16

%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8

%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7

%define %%num_initial_blocks    %16


        ; generate next Tweak values
        vmovdqa %%TW1, [TW+16*0]
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        vmovdqu %%ST1, [ptr_plaintext+16*0]
%if (%%num_initial_blocks>=2)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa %%TW2, [TW+16*1]
        vmovdqu %%ST2, [ptr_plaintext+16*1]
%endif
%if (%%num_initial_blocks>=3)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        vmovdqa %%TW3, [TW+16*2]
        vmovdqu %%ST3, [ptr_plaintext+16*2]
%endif
%if (%%num_initial_blocks>=4)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        vmovdqa %%TW4, [TW+16*3]
        vmovdqu %%ST4, [ptr_plaintext+16*3]
%endif
%if (%%num_initial_blocks>=5)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        vmovdqa %%TW5, [TW+16*4]
        vmovdqu %%ST5, [ptr_plaintext+16*4]
%endif
%if (%%num_initial_blocks>=6)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        vmovdqa %%TW6, [TW+16*5]
        vmovdqu %%ST6, [ptr_plaintext+16*5]
%endif
%if (%%num_initial_blocks>=7)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        vmovdqa %%TW7, [TW+16*6]
        vmovdqu %%ST7, [ptr_plaintext+16*6]
%endif

%endmacro
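
; initialize loads the first num_initial_blocks input blocks and steps the
; tweak once per block with the shl/adc/cmovc/xor idiom sketched near
; GHASH_POLY, spilling each new tweak to the TW area before reloading it
; into an xmm register. Worked one-step example: for T = 0x8000...0001,
; T*alpha = (T << 1) ^ 0x87 = 0x0000...0085, because the shifted-out top
; bit selects the 0x87 reduction constant.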


; decrypt the initial blocks of AES; the macro keeps its encrypt_* name
; for symmetry with the encrypt variant, but the data rounds are vaesdec
; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
; next 8 Tweak values are generated
%macro  encrypt_initial 18
%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8

%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7
%define %%T0    %16     ; Temp register
%define %%num_blocks    %17
; %%num_blocks blocks decrypted
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7

%define %%lt128 %18     ; less than 128 bytes

        ; xor Tweak value
        vpxor   %%ST1, %%TW1
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%TW7
%endif

        ; ARK
        vmovdqa %%T0, [keys]
        vpxor   %%ST1, %%T0
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

        ; round 1
        vmovdqa %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl     ; next Tweak1 generated
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 2
        vmovdqa %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*2], twtempl     ; next Tweak2 generated
%endif

        ; round 3
        vmovdqa %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 4
        vmovdqa %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl     ; next Tweak3 generated
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif

        ; round 5
        vmovdqa %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl     ; next Tweak4 generated
        mov     [TW + 8*7], twtemph
%endif

        ; round 6
        vmovdqa %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl     ; next Tweak5 generated
        mov     [TW + 8*9], twtemph
%endif

        ; round 7
        vmovdqa %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*10], twtempl    ; next Tweak6 generated
        mov     [TW + 8*11], twtemph
%endif
        ; round 8
        vmovdqa %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl    ; next Tweak7 generated
        mov     [TW + 8*13], twtemph
%endif
        ; round 9
        vmovdqa %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*14], twtempl    ; next Tweak8 generated
        mov     [TW + 8*15], twtemph
%endif
        ; round 10
        vmovdqa %%T0, [keys + 16*10]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
        ; round 11
        vmovdqa %%T0, [keys + 16*11]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

        ; round 12
        vmovdqa %%T0, [keys + 16*12]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

        ; round 13
        vmovdqa %%T0, [keys + 16*13]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

        ; round 14
        vmovdqa %%T0, [keys + 16*14]
        vaesdeclast %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdeclast %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdeclast %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdeclast %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdeclast %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdeclast %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdeclast %%ST7, %%T0
%endif

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%TW7
%endif

%if (0 == %%lt128)
        ; load next Tweak values
        vmovdqa %%TW1, [TW + 16*0]
        vmovdqa %%TW2, [TW + 16*1]
        vmovdqa %%TW3, [TW + 16*2]
        vmovdqa %%TW4, [TW + 16*3]
        vmovdqa %%TW5, [TW + 16*4]
        vmovdqa %%TW6, [TW + 16*5]
        vmovdqa %%TW7, [TW + 16*6]
%endif

%endmacro
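
; The scalar tweak updates above are deliberately stitched between the
; vaesdec rounds: the shl/adc/cmovc/xor chain runs on the integer ports
; while the AES unit is busy, so the next tweak set costs no extra
; latency. Passing %%lt128 == 1 skips all of the updates on the final,
; non-stitched pass.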


; Decrypt 8 blocks in parallel (vaesdec; macro name kept from the
; encrypt variant)
; generate next 8 tweak values
%macro  encrypt_by_eight 18
%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8
%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7
%define %%TW8   %16     ; tweak 8
%define %%T0    %17     ; Temp register
%define %%last_eight    %18

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
        vpxor   %%ST2, %%TW2
        vpxor   %%ST3, %%TW3
        vpxor   %%ST4, %%TW4
        vpxor   %%ST5, %%TW5
        vpxor   %%ST6, %%TW6
        vpxor   %%ST7, %%TW7
        vpxor   %%ST8, %%TW8

        ; ARK
        vmovdqa %%T0, [keys]
        vpxor   %%ST1, %%T0
        vpxor   %%ST2, %%T0
        vpxor   %%ST3, %%T0
        vpxor   %%ST4, %%T0
        vpxor   %%ST5, %%T0
        vpxor   %%ST6, %%T0
        vpxor   %%ST7, %%T0
        vpxor   %%ST8, %%T0

%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif
        ; round 1
        vmovdqa %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
        ; round 2
        vmovdqa %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif
        ; round 3
        vmovdqa %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*2], twtempl
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif
        ; round 4
        vmovdqa %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl
%endif
        ; round 5
        vmovdqa %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif
        ; round 6
        vmovdqa %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl
        mov     [TW + 8*7], twtemph
%endif
        ; round 7
        vmovdqa %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif
        ; round 8
        vmovdqa %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl
        mov     [TW + 8*9], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
        ; round 9
        vmovdqa %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif
        ; round 10
        vmovdqa %%T0, [keys + 16*10]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*10], twtempl
        mov     [TW + 8*11], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif
        ; round 11
        vmovdqa %%T0, [keys + 16*11]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl
%endif
        ; round 12
        vmovdqa %%T0, [keys + 16*12]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*13], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif
        ; round 13
        vmovdqa %%T0, [keys + 16*13]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
;       mov     [TW + 8*14], twtempl
;       mov     [TW + 8*15], twtemph
%endif
        ; round 14
        vmovdqa %%T0, [keys + 16*14]
        vaesdeclast %%ST1, %%T0
        vaesdeclast %%ST2, %%T0
        vaesdeclast %%ST3, %%T0
        vaesdeclast %%ST4, %%T0
        vaesdeclast %%ST5, %%T0
        vaesdeclast %%ST6, %%T0
        vaesdeclast %%ST7, %%T0
        vaesdeclast %%ST8, %%T0

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
        vpxor   %%ST2, %%TW2
        vpxor   %%ST3, %%TW3
        vpxor   %%ST4, %%TW4
        vpxor   %%ST5, %%TW5
        vpxor   %%ST6, %%TW6
        vpxor   %%ST7, %%TW7
        vpxor   %%ST8, %%TW8

        mov     [TW + 8*14], twtempl
        mov     [TW + 8*15], twtemph
        ; load next Tweak values
        vmovdqa %%TW1, [TW + 16*0]
        vmovdqa %%TW2, [TW + 16*1]
        vmovdqa %%TW3, [TW + 16*2]
        vmovdqa %%TW4, [TW + 16*3]
        vmovdqa %%TW5, [TW + 16*4]
        vmovdqa %%TW6, [TW + 16*5]
        vmovdqa %%TW7, [TW + 16*6]

%endmacro
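
; encrypt_by_eight (again, vaesdec inside) is the steady-state kernel.
; Eight independent blocks ride the AES pipeline together, which should
; be enough to cover the multi-cycle vaesdec latency on the AVX targets
; this file is written for, while the next eight tweaks are computed in
; rax/rbx between rounds; %%last_eight == 1 suppresses that generation.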


section .text

global XTS_AES_256_dec_expanded_key_avx:function
XTS_AES_256_dec_expanded_key_avx:

        sub     rsp, VARIABLE_OFFSET

        mov     [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
        mov     [_gpr + 8*1], rdi
        mov     [_gpr + 8*2], rsi

        vmovdqa [_xmm + 16*0], xmm6
        vmovdqa [_xmm + 16*1], xmm7
        vmovdqa [_xmm + 16*2], xmm8
        vmovdqa [_xmm + 16*3], xmm9
        vmovdqa [_xmm + 16*4], xmm10
        vmovdqa [_xmm + 16*5], xmm11
        vmovdqa [_xmm + 16*6], xmm12
        vmovdqa [_xmm + 16*7], xmm13
        vmovdqa [_xmm + 16*8], xmm14
        vmovdqa [_xmm + 16*9], xmm15
%endif

        mov     ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b

        vmovdqu xmm1, [T_val]   ; read initial Tweak value
        vpxor   xmm4, xmm4      ; zero scratch register passed to encrypt_T
                                ; (used for key expansion in the non-expanded-key variant)
        encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys

%ifidn __OUTPUT_FORMAT__, win64
        mov     ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]    ; plaintext pointer
        mov     ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]   ; ciphertext pointer
%endif

        mov     target_ptr_val, N_val
        and     target_ptr_val, -16     ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
        sub     target_ptr_val, 128     ; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
        jl      _less_than_128_bytes
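
; Dispatch sketch: N_val mod 16 is the ciphertext-stealing tail, and
; ((N_val mod 128) / 16) blocks cannot fill a whole 8-block iteration,
; so they are handled first and _main_loop only ever sees 128-byte
; chunks. In C terms (illustrative only, not part of the build):
;
;   size_t initial = (N / 16) % 8;   /* tmp1 = initial << 4 below   */
;   size_t tail    = N % 16;         /* handled after _last_eight   */
;   /* target_ptr_val = ct + (N & ~15) - 128: the final eight full  */
;   /* blocks stay out of the stitched loop because ciphertext      */
;   /* stealing may need to re-pair their tweaks (see _last_eight). */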

        add     target_ptr_val, ptr_ciphertext

        mov     tmp1, N_val
        and     tmp1, (7 << 4)
        jz      _initial_num_blocks_is_0

        cmp     tmp1, (4 << 4)
        je      _initial_num_blocks_is_4

        cmp     tmp1, (6 << 4)
        je      _initial_num_blocks_is_6

        cmp     tmp1, (5 << 4)
        je      _initial_num_blocks_is_5

        cmp     tmp1, (3 << 4)
        je      _initial_num_blocks_is_3

        cmp     tmp1, (2 << 4)
        je      _initial_num_blocks_is_2

        cmp     tmp1, (1 << 4)
        je      _initial_num_blocks_is_1

_initial_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        add     ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7
        add     ptr_ciphertext, 16*7

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        add     ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        add     ptr_ciphertext, 16*6

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        add     ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        add     ptr_ciphertext, 16*5

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        add     ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        add     ptr_ciphertext, 16*4

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        add     ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        add     ptr_ciphertext, 16*3

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        add     ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1
        vmovdqu [ptr_ciphertext+16], xmm2
        add     ptr_ciphertext, 16*2

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        add     ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1
        add     ptr_ciphertext, 16

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_0:
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        vmovdqa xmm9, [TW+16*0]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm10, [TW+16*1]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        vmovdqa xmm11, [TW+16*2]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        vmovdqa xmm12, [TW+16*3]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        vmovdqa xmm13, [TW+16*4]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        vmovdqa xmm14, [TW+16*5]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        vmovdqa xmm15, [TW+16*6]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*14], twtempl
        mov     [TW+8*15], twtemph
        ;vmovdqa xmm16, [TW+16*7]

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight
_main_loop:
        ; load plaintext
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        vmovdqu xmm4, [ptr_plaintext+16*3]
        vmovdqu xmm5, [ptr_plaintext+16*4]
        vmovdqu xmm6, [ptr_plaintext+16*5]
        vmovdqu xmm7, [ptr_plaintext+16*6]
        vmovdqu xmm8, [ptr_plaintext+16*7]

        add     ptr_plaintext, 128

        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0

        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7
        vmovdqu [ptr_ciphertext+16*7], xmm8
        add     ptr_ciphertext, 128

        cmp     ptr_ciphertext, target_ptr_val
        jne     _main_loop

_last_eight:

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_final

        ; generate next Tweak value
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        vmovdqa xmm1, [TW + 16*7]
        vmovdqa [TW + 16*0], xmm1       ; swap tweak values for cipher stealing for decrypt

        mov     [TW + 16*7], twtempl
        mov     [TW + 16*7+8], twtemph
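
; Decrypt-side ciphertext stealing reverses the tweak order: per P1619,
; the last full ciphertext block is decrypted with the *later* tweak
; T(m), recovering the final partial plaintext plus the stolen bytes,
; and the re-assembled block is then decrypted with T(m-1). Hence the
; swap above: the old eighth tweak T(m-1) is parked at TW+16*0 for
; _steal_cipher, while the freshly generated T(m) goes to TW+16*7 so the
; eighth block below already uses it.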

        ; load plaintext
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        vmovdqu xmm4, [ptr_plaintext+16*3]
        vmovdqu xmm5, [ptr_plaintext+16*4]
        vmovdqu xmm6, [ptr_plaintext+16*5]
        vmovdqu xmm7, [ptr_plaintext+16*6]
        vmovdqu xmm8, [ptr_plaintext+16*7]
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7
        jmp     _steal_cipher


_done_final:
        ; load plaintext
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        vmovdqu xmm4, [ptr_plaintext+16*3]
        vmovdqu xmm5, [ptr_plaintext+16*4]
        vmovdqu xmm6, [ptr_plaintext+16*5]
        vmovdqu xmm7, [ptr_plaintext+16*6]
        vmovdqu xmm8, [ptr_plaintext+16*7]
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7

        jmp     _done


_steal_cipher:
        ; start cipher stealing

        vmovdqa xmm2, xmm8

        ; shift xmm8 to the left by 16-N_val bytes
        lea     twtempl, [vpshufb_shf_table]
        vmovdqu xmm0, [twtempl+N_val]
        vpshufb xmm8, xmm0

        vmovdqu xmm3, [ptr_plaintext + 112 + N_val]     ; state register is temporarily xmm3 to eliminate a move
        vmovdqu [ptr_ciphertext + 112 + N_val], xmm8

        ; shift xmm3 to the right by 16-N_val bytes
        lea     twtempl, [vpshufb_shf_table + 16]
        sub     twtempl, N_val
        vmovdqu xmm0, [twtempl]
        vpxor   xmm0, [mask1]
        vpshufb xmm3, xmm0

        vpblendvb xmm3, xmm3, xmm2, xmm0        ; blend controlled by the sign bits of xmm0

        ; xor Tweak value
        vmovdqa xmm8, [TW]
        vpxor   xmm8, xmm3      ; xmm8 becomes the state register: rather than moving xmm3
                                ; to xmm8, the vpxor destination is swapped

        ; decrypt the last block with cipher stealing
        vpxor   xmm8, [keys]            ; ARK
        vaesdec xmm8, [keys + 16*1]     ; round 1
        vaesdec xmm8, [keys + 16*2]     ; round 2
        vaesdec xmm8, [keys + 16*3]     ; round 3
        vaesdec xmm8, [keys + 16*4]     ; round 4
        vaesdec xmm8, [keys + 16*5]     ; round 5
        vaesdec xmm8, [keys + 16*6]     ; round 6
        vaesdec xmm8, [keys + 16*7]     ; round 7
        vaesdec xmm8, [keys + 16*8]     ; round 8
        vaesdec xmm8, [keys + 16*9]     ; round 9
        vaesdec xmm8, [keys + 16*10]    ; round 10
        vaesdec xmm8, [keys + 16*11]    ; round 11
        vaesdec xmm8, [keys + 16*12]    ; round 12
        vaesdec xmm8, [keys + 16*13]    ; round 13
        vaesdeclast xmm8, [keys + 16*14] ; round 14

        ; xor Tweak value
        vpxor   xmm8, [TW]

_done:
        ; store last ciphertext value
        vmovdqu [ptr_ciphertext+16*7], xmm8

_ret_:

        mov     rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rdi, [_gpr + 8*1]
        mov     rsi, [_gpr + 8*2]

        vmovdqa xmm6, [_xmm + 16*0]
        vmovdqa xmm7, [_xmm + 16*1]
        vmovdqa xmm8, [_xmm + 16*2]
        vmovdqa xmm9, [_xmm + 16*3]
        vmovdqa xmm10, [_xmm + 16*4]
        vmovdqa xmm11, [_xmm + 16*5]
        vmovdqa xmm12, [_xmm + 16*6]
        vmovdqa xmm13, [_xmm + 16*7]
        vmovdqa xmm14, [_xmm + 16*8]
        vmovdqa xmm15, [_xmm + 16*9]
%endif

        add     rsp, VARIABLE_OFFSET

        ret

_less_than_128_bytes:
        cmp     N_val, 16
        jb      _ret_

        mov     tmp1, N_val
        and     tmp1, (7 << 4)
        cmp     tmp1, (6 << 4)
        je      _num_blocks_is_6
        cmp     tmp1, (5 << 4)
        je      _num_blocks_is_5
        cmp     tmp1, (4 << 4)
        je      _num_blocks_is_4
        cmp     tmp1, (3 << 4)
        je      _num_blocks_is_3
        cmp     tmp1, (2 << 4)
        je      _num_blocks_is_2
        cmp     tmp1, (1 << 4)
        je      _num_blocks_is_1

_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7

        sub     ptr_plaintext, 16*1

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_7

_steal_cipher_7:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm15
        vmovdqa xmm15, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6

        sub     ptr_ciphertext, 16*1
        vmovdqa xmm8, xmm7
        jmp     _steal_cipher

_done_7:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6

        sub     ptr_ciphertext, 16*1
        vmovdqa xmm8, xmm7
        jmp     _done

_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6

        sub     ptr_plaintext, 16*2

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_6

_steal_cipher_6:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm14
        vmovdqa xmm14, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5

        sub     ptr_ciphertext, 16*2
        vmovdqa xmm8, xmm6
        jmp     _steal_cipher

_done_6:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5

        sub     ptr_ciphertext, 16*2
        vmovdqa xmm8, xmm6
        jmp     _done

_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5

        sub     ptr_plaintext, 16*3

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_5

_steal_cipher_5:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm13
        vmovdqa xmm13, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4

        sub     ptr_ciphertext, 16*3
        vmovdqa xmm8, xmm5
        jmp     _steal_cipher

_done_5:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4

        sub     ptr_ciphertext, 16*3
        vmovdqa xmm8, xmm5
        jmp     _done

_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4

        sub     ptr_plaintext, 16*4

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_4

_steal_cipher_4:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm12
        vmovdqa xmm12, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3

        sub     ptr_ciphertext, 16*4
        vmovdqa xmm8, xmm4
        jmp     _steal_cipher

_done_4:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3

        sub     ptr_ciphertext, 16*4
        vmovdqa xmm8, xmm4
        jmp     _done

_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3

        sub     ptr_plaintext, 16*5

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_3

_steal_cipher_3:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm11
        vmovdqa xmm11, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2

        sub     ptr_ciphertext, 16*5
        vmovdqa xmm8, xmm3
        jmp     _steal_cipher

_done_3:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2

        sub     ptr_ciphertext, 16*5
        vmovdqa xmm8, xmm3
        jmp     _done

_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2

        sub     ptr_plaintext, 16*6

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_2

_steal_cipher_2:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm10
        vmovdqa xmm10, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1

        sub     ptr_ciphertext, 16*6
        vmovdqa xmm8, xmm2
        jmp     _steal_cipher

_done_2:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1

        sub     ptr_ciphertext, 16*6
        vmovdqa xmm8, xmm2
        jmp     _done

_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1

        sub     ptr_plaintext, 16*7

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done_1

_steal_cipher_1:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm9
        vmovdqa xmm9, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        ; store ciphertext (the single block leaves through xmm8 below)

        sub     ptr_ciphertext, 16*7
        vmovdqa xmm8, xmm1
        jmp     _steal_cipher

_done_1:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        ; store ciphertext (the single block leaves through xmm8 below)

        sub     ptr_ciphertext, 16*7
        vmovdqa xmm8, xmm1
        jmp     _done

section .data
align 16

vpshufb_shf_table:
; use these values for shift constants for the vpshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask1:
        dq 0x8080808080808080, 0x8080808080808080
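
; mask1 pairs with vpshufb_shf_table in _steal_cipher: vpshufb writes
; zero to any lane whose index byte has the top bit set (the 0x80..0x8f
; bytes above), so the table entries double as shift-and-zero masks.
; Xoring an entry with mask1 flips those sign bits, and the very same
; register then drives vpblendvb, which selects per byte on the sign
; bit, to merge the shifted tail with the saved full block.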