;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; XTS decrypt function with 128-bit AES
; the expanded keys are not assumed to be aligned
; the plaintext and ciphertext buffers are not assumed to be aligned
; the data key (k1, the second argument) is copied to the stack, aligned to 16 bytes
; the tweak key (k2, the first argument) is needed only once, so it is never stored

%include "reg_sizes.asm"

default rel
%define TW   rsp        ; store 8 tweak values
%define keys rsp + 16*8 ; store 11 expanded keys

%ifidn __OUTPUT_FORMAT__, win64
        %define _xmm rsp + 16*19 ; store xmm6:xmm15
%endif

%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr rsp + 16*19 ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
%else
%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
%endif

%define GHASH_POLY 0x87
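
; GHASH_POLY (0x87) is the low byte of the XTS reduction polynomial
; x^128 + x^7 + x^2 + x + 1: whenever the 128-bit tweak is multiplied by x
; (a one-bit left shift) and a bit carries out of the top, this constant is
; xored into the low qword to reduce the result back into GF(2^128).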

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_128_dec_expanded_key_avx(
;               UINT8 *k2,              // key used for tweaking, 16*11 bytes
;               UINT8 *k1,              // key used for "ECB" decryption, 16*11 bytes
;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
;               UINT64 N,               // sector size, in bytes
;               const UINT8 *ct,        // ciphertext sector input data
;               UINT8 *pt);             // plaintext sector output data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
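
; Minimal C-side usage sketch (illustrative only; the variable names are
; hypothetical, and both keys must already be expanded into 11 round keys,
; with k1 in the order expected by AESDEC):
;
;       uint8_t k2_exp[16*11], k1_exp[16*11];   // expanded tweak / data keys
;       uint8_t iv[16];                         // initial tweak value
;       uint8_t ct[512], pt[512];               // one 512-byte sector
;
;       XTS_AES_128_dec_expanded_key_avx(k2_exp, k1_exp, iv, 512, ct, pt);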

; arguments for input parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %xdefine ptr_key2 rdi
        %xdefine ptr_key1 rsi
        %xdefine T_val rdx
        %xdefine N_val rcx
        %xdefine ptr_plaintext r8
        %xdefine ptr_ciphertext r9
%else
        %xdefine ptr_key2 rcx
        %xdefine ptr_key1 rdx
        %xdefine T_val r8
        %xdefine N_val r9
        %xdefine ptr_plaintext r10      ; [rsp + VARIABLE_OFFSET + 8*5]
        %xdefine ptr_ciphertext r11     ; [rsp + VARIABLE_OFFSET + 8*6]
%endif
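
; note: the ptr_plaintext / ptr_ciphertext names are inherited from the
; sibling encrypt file; in this decrypt variant ptr_plaintext walks the
; ciphertext input (ct) and ptr_ciphertext the plaintext output (pt).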

; arguments for temp parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %define tmp1 rdi
        %define target_ptr_val rsi
        %define ghash_poly_8b r10
        %define ghash_poly_8b_temp r11
%else
        %define tmp1 rcx
        %define target_ptr_val rdx
        %define ghash_poly_8b rdi
        %define ghash_poly_8b_temp rsi
%endif

%define twtempl rax ; global temp registers used for tweak computation
%define twtemph rbx
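
; Throughout this file the next tweak is derived by multiplying the current
; 128-bit tweak by x in GF(2^128), using this four-instruction idiom:
;       shl     twtempl, 1              ; shift the low qword left one bit
;       adc     twtemph, twtemph        ; shift the high qword, pulling in the carry
;       cmovc   ghash_poly_8b_temp, ghash_poly_8b ; select 0x87 if a bit fell off the top
;       xor     twtempl, ghash_poly_8b_temp       ; reduce modulo the XTS polynomial
; The idiom is interleaved between AES rounds below to hide its latency.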


; macro to encrypt the tweak value and to copy the k1 round keys
; into the aligned stack area

%macro  encrypt_T 8
%define %%xkey2                 %1
%define %%xstate_tweak          %2
%define %%xkey1                 %3
%define %%xraw_key              %4
%define %%xtmp                  %5
%define %%ptr_key2              %6
%define %%ptr_key1              %7
%define %%ptr_expanded_keys     %8

        vmovdqu %%xkey2, [%%ptr_key2]
        vpxor   %%xstate_tweak, %%xkey2                 ; ARK for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
        vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1    ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 1 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
        vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 2 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
        vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 3 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
        vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 4 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
        vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 5 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
        vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 6 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
        vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 7 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
        vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 8 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
        vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
        vaesenc %%xstate_tweak, %%xkey2                 ; round 9 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
        vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1     ; store round keys in stack

        vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
        vaesenclast %%xstate_tweak, %%xkey2             ; round 10 for tweak encryption

        vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
        vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1     ; store round keys in stack

        vmovdqa [TW], %%xstate_tweak                    ; store the encrypted tweak value
%endmacro
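
; note: the k1 round keys are copied 1:1 into the stack area, so the decrypt
; rounds below consume them exactly in the order the caller supplied them;
; k1 must therefore already be an AES-128 decryption key schedule suitable
; for vaesdec (this macro does not invert an encryption schedule).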


; generate initial tweak values
; load the initial input blocks (ciphertext for this decrypt variant)
%macro  initialize 16

%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8

%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7

%define %%num_initial_blocks %16


        ; generate next Tweak values
        vmovdqa %%TW1, [TW+16*0]
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        vmovdqu %%ST1, [ptr_plaintext+16*0]
%if (%%num_initial_blocks>=2)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa %%TW2, [TW+16*1]
        vmovdqu %%ST2, [ptr_plaintext+16*1]
%endif
%if (%%num_initial_blocks>=3)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        vmovdqa %%TW3, [TW+16*2]
        vmovdqu %%ST3, [ptr_plaintext+16*2]
%endif
%if (%%num_initial_blocks>=4)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        vmovdqa %%TW4, [TW+16*3]
        vmovdqu %%ST4, [ptr_plaintext+16*3]
%endif
%if (%%num_initial_blocks>=5)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        vmovdqa %%TW5, [TW+16*4]
        vmovdqu %%ST5, [ptr_plaintext+16*4]
%endif
%if (%%num_initial_blocks>=6)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        vmovdqa %%TW6, [TW+16*5]
        vmovdqu %%ST6, [ptr_plaintext+16*5]
%endif
%if (%%num_initial_blocks>=7)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        vmovdqa %%TW7, [TW+16*6]
        vmovdqu %%ST7, [ptr_plaintext+16*6]
%endif

%endmacro


; decrypt the initial blocks of AES
; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted (the macro keeps the encrypt_*
; name of its sibling encrypt file)
; next 8 Tweak values are generated
%macro  encrypt_initial 18
%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8

%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7
%define %%T0    %16     ; temp register
%define %%num_blocks %17
; %%num_blocks blocks decrypted
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7

%define %%lt128 %18     ; less than 128 bytes

        ; xor Tweak value
        vpxor   %%ST1, %%TW1
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%TW7
%endif

        ; ARK
        vmovdqa %%T0, [keys]
        vpxor   %%ST1, %%T0
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

        ; round 1
        vmovdqa %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl     ; next Tweak1 generated
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 2
        vmovdqa %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*2], twtempl     ; next Tweak2 generated
%endif

        ; round 3
        vmovdqa %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif
%if (0 == %%lt128)
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 4
        vmovdqa %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl     ; next Tweak3 generated
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif

        ; round 5
        vmovdqa %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl     ; next Tweak4 generated
        mov     [TW + 8*7], twtemph
%endif

        ; round 6
        vmovdqa %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl     ; next Tweak5 generated
        mov     [TW + 8*9], twtemph
%endif

        ; round 7
        vmovdqa %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*10], twtempl    ; next Tweak6 generated
        mov     [TW + 8*11], twtemph
%endif
        ; round 8
        vmovdqa %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl    ; next Tweak7 generated
        mov     [TW + 8*13], twtemph
%endif
        ; round 9
        vmovdqa %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdec %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdec %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdec %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdec %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdec %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdec %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*14], twtempl    ; next Tweak8 generated
        mov     [TW + 8*15], twtemph
%endif

        ; round 10
        vmovdqa %%T0, [keys + 16*10]
        vaesdeclast %%ST1, %%T0
%if (%%num_blocks>=2)
        vaesdeclast %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        vaesdeclast %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        vaesdeclast %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        vaesdeclast %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        vaesdeclast %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        vaesdeclast %%ST7, %%T0
%endif

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
%if (%%num_blocks>=2)
        vpxor   %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        vpxor   %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        vpxor   %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        vpxor   %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        vpxor   %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        vpxor   %%ST7, %%TW7
%endif

%if (0 == %%lt128)
        ; load next Tweak values
        vmovdqa %%TW1, [TW + 16*0]
        vmovdqa %%TW2, [TW + 16*1]
        vmovdqa %%TW3, [TW + 16*2]
        vmovdqa %%TW4, [TW + 16*3]
        vmovdqa %%TW5, [TW + 16*4]
        vmovdqa %%TW6, [TW + 16*5]
        vmovdqa %%TW7, [TW + 16*6]
%endif

%endmacro
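
; note on %%lt128: when it is 0 the next eight tweak values are computed in
; the scalar units between the AES rounds above; when it is 1 this is the
; final pass, so the tweak arithmetic is skipped entirely.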


; decrypt 8 blocks in parallel
; generate next 8 tweak values
%macro  encrypt_by_eight 18
%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8
%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7
%define %%TW8   %16     ; tweak 8
%define %%T0    %17     ; temp register
%define %%last_eight %18

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
        vpxor   %%ST2, %%TW2
        vpxor   %%ST3, %%TW3
        vpxor   %%ST4, %%TW4
        vpxor   %%ST5, %%TW5
        vpxor   %%ST6, %%TW6
        vpxor   %%ST7, %%TW7
        vpxor   %%ST8, %%TW8

        ; ARK
        vmovdqa %%T0, [keys]
        vpxor   %%ST1, %%T0
        vpxor   %%ST2, %%T0
        vpxor   %%ST3, %%T0
        vpxor   %%ST4, %%T0
        vpxor   %%ST5, %%T0
        vpxor   %%ST6, %%T0
        vpxor   %%ST7, %%T0
        vpxor   %%ST8, %%T0

%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif
        ; round 1
        vmovdqa %%T0, [keys + 16*1]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
        ; round 2
        vmovdqa %%T0, [keys + 16*2]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif
        ; round 3
        vmovdqa %%T0, [keys + 16*3]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*2], twtempl
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif
        ; round 4
        vmovdqa %%T0, [keys + 16*4]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl
%endif
        ; round 5
        vmovdqa %%T0, [keys + 16*5]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif
        ; round 6
        vmovdqa %%T0, [keys + 16*6]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl
        mov     [TW + 8*7], twtemph
%endif
        ; round 7
        vmovdqa %%T0, [keys + 16*7]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif
        ; round 8
        vmovdqa %%T0, [keys + 16*8]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl
        mov     [TW + 8*9], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
        ; round 9
        vmovdqa %%T0, [keys + 16*9]
        vaesdec %%ST1, %%T0
        vaesdec %%ST2, %%T0
        vaesdec %%ST3, %%T0
        vaesdec %%ST4, %%T0
        vaesdec %%ST5, %%T0
        vaesdec %%ST6, %%T0
        vaesdec %%ST7, %%T0
        vaesdec %%ST8, %%T0
%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif

%if (0 == %%last_eight)
        mov     [TW + 8*10], twtempl
        mov     [TW + 8*11], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif

%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl
%endif

%if (0 == %%last_eight)
        mov     [TW + 8*13], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
;       mov     [TW + 8*14], twtempl
;       mov     [TW + 8*15], twtemph
%endif
        ; round 10
        vmovdqa %%T0, [keys + 16*10]
        vaesdeclast %%ST1, %%T0
        vaesdeclast %%ST2, %%T0
        vaesdeclast %%ST3, %%T0
        vaesdeclast %%ST4, %%T0
        vaesdeclast %%ST5, %%T0
        vaesdeclast %%ST6, %%T0
        vaesdeclast %%ST7, %%T0
        vaesdeclast %%ST8, %%T0

        ; xor Tweak values
        vpxor   %%ST1, %%TW1
        vpxor   %%ST2, %%TW2
        vpxor   %%ST3, %%TW3
        vpxor   %%ST4, %%TW4
        vpxor   %%ST5, %%TW5
        vpxor   %%ST6, %%TW6
        vpxor   %%ST7, %%TW7
        vpxor   %%ST8, %%TW8

        mov     [TW + 8*14], twtempl
        mov     [TW + 8*15], twtemph
        ; load next Tweak values
        vmovdqa %%TW1, [TW + 16*0]
        vmovdqa %%TW2, [TW + 16*1]
        vmovdqa %%TW3, [TW + 16*2]
        vmovdqa %%TW4, [TW + 16*3]
        vmovdqa %%TW5, [TW + 16*4]
        vmovdqa %%TW6, [TW + 16*5]
        vmovdqa %%TW7, [TW + 16*6]

%endmacro
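
; note: in every call below the %%TW8 argument is passed as the memory
; operand [TW + 16*7], so the eighth tweak never occupies a register, and
; xmm0 is reused as the round-key temporary %%T0.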


section .text

global XTS_AES_128_dec_expanded_key_avx:function
XTS_AES_128_dec_expanded_key_avx:

        sub     rsp, VARIABLE_OFFSET

        mov     [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
        mov     [_gpr + 8*1], rdi
        mov     [_gpr + 8*2], rsi

        vmovdqa [_xmm + 16*0], xmm6
        vmovdqa [_xmm + 16*1], xmm7
        vmovdqa [_xmm + 16*2], xmm8
        vmovdqa [_xmm + 16*3], xmm9
        vmovdqa [_xmm + 16*4], xmm10
        vmovdqa [_xmm + 16*5], xmm11
        vmovdqa [_xmm + 16*6], xmm12
        vmovdqa [_xmm + 16*7], xmm13
        vmovdqa [_xmm + 16*8], xmm14
        vmovdqa [_xmm + 16*9], xmm15
%endif

        mov     ghash_poly_8b, GHASH_POLY       ; load 0x87 to ghash_poly_8b

        vmovdqu xmm1, [T_val]                   ; read initial Tweak value
        vpxor   xmm4, xmm4                      ; for key expansion
        encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys

%ifidn __OUTPUT_FORMAT__, win64
        mov     ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]    ; plaintext pointer
        mov     ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]   ; ciphertext pointer
%endif

        mov     target_ptr_val, N_val
        and     target_ptr_val, -16     ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
        sub     target_ptr_val, 128     ; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
        jl      _less_than_128_bytes

        add     target_ptr_val, ptr_ciphertext

        mov     tmp1, N_val
        and     tmp1, (7 << 4)
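        ; tmp1 = N & 0x70 = 16 * ((N/16) mod 8): the number of whole blocks
        ; left over after the trailing group of eight, which selects how many
        ; initial blocks to process before entering the 8-block main loop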
        jz      _initial_num_blocks_is_0

        cmp     tmp1, (4 << 4)
        je      _initial_num_blocks_is_4

        cmp     tmp1, (6 << 4)
        je      _initial_num_blocks_is_6

        cmp     tmp1, (5 << 4)
        je      _initial_num_blocks_is_5

        cmp     tmp1, (3 << 4)
        je      _initial_num_blocks_is_3

        cmp     tmp1, (2 << 4)
        je      _initial_num_blocks_is_2

        cmp     tmp1, (1 << 4)
        je      _initial_num_blocks_is_1

_initial_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        add     ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7
        add     ptr_ciphertext, 16*7

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        add     ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        add     ptr_ciphertext, 16*6

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        add     ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        add     ptr_ciphertext, 16*5

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        add     ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        add     ptr_ciphertext, 16*4

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop


_initial_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        add     ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        add     ptr_ciphertext, 16*3

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        add     ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1
        vmovdqu [ptr_ciphertext+16], xmm2
        add     ptr_ciphertext, 16*2

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        add     ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1
        add     ptr_ciphertext, 16

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_0:
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        vmovdqa xmm9, [TW+16*0]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        vmovdqa xmm10, [TW+16*1]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        vmovdqa xmm11, [TW+16*2]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        vmovdqa xmm12, [TW+16*3]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        vmovdqa xmm13, [TW+16*4]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        vmovdqa xmm14, [TW+16*5]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        vmovdqa xmm15, [TW+16*6]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*14], twtempl
        mov     [TW+8*15], twtemph
        ;vmovdqa xmm16, [TW+16*7]

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight
_main_loop:
        ; load plaintext
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        vmovdqu xmm4, [ptr_plaintext+16*3]
        vmovdqu xmm5, [ptr_plaintext+16*4]
        vmovdqu xmm6, [ptr_plaintext+16*5]
        vmovdqu xmm7, [ptr_plaintext+16*6]
        vmovdqu xmm8, [ptr_plaintext+16*7]

        add     ptr_plaintext, 128

        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0

        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7
        vmovdqu [ptr_ciphertext+16*7], xmm8
        add     ptr_ciphertext, 128

        cmp     ptr_ciphertext, target_ptr_val
        jne     _main_loop

_last_eight:

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_final

        ; generate next Tweak value
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        vmovdqa xmm1, [TW + 16*7]
        vmovdqa [TW + 16*0], xmm1       ; swap tweak values for cipher stealing for decrypt

        mov     [TW + 16*7], twtempl
        mov     [TW + 16*7+8], twtemph
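
        ; for ciphertext stealing in decrypt mode the tweak order is reversed:
        ; the last full block is decrypted with the tweak just generated above
        ; (now at [TW + 16*7]), while the partial tail block is finished in
        ; _steal_cipher using the previous tweak, saved to [TW + 16*0]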

        ; load plaintext
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        vmovdqu xmm4, [ptr_plaintext+16*3]
        vmovdqu xmm5, [ptr_plaintext+16*4]
        vmovdqu xmm6, [ptr_plaintext+16*5]
        vmovdqu xmm7, [ptr_plaintext+16*6]
        vmovdqu xmm8, [ptr_plaintext+16*7]
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7
        jmp     _steal_cipher


_done_final:
        ; load plaintext
        vmovdqu xmm1, [ptr_plaintext+16*0]
        vmovdqu xmm2, [ptr_plaintext+16*1]
        vmovdqu xmm3, [ptr_plaintext+16*2]
        vmovdqu xmm4, [ptr_plaintext+16*3]
        vmovdqu xmm5, [ptr_plaintext+16*4]
        vmovdqu xmm6, [ptr_plaintext+16*5]
        vmovdqu xmm7, [ptr_plaintext+16*6]
        vmovdqu xmm8, [ptr_plaintext+16*7]
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6
        vmovdqu [ptr_ciphertext+16*6], xmm7

        jmp     _done


_steal_cipher:
        ; start cipher stealing

        vmovdqa xmm2, xmm8

        ; shift xmm8 to the left by 16-N_val bytes
        lea     twtempl, [vpshufb_shf_table]
        vmovdqu xmm0, [twtempl+N_val]
        vpshufb xmm8, xmm0

        vmovdqu xmm3, [ptr_plaintext + 112 + N_val]     ; state register is temporarily xmm3 to eliminate a move
        vmovdqu [ptr_ciphertext + 112 + N_val], xmm8

        ; shift xmm3 to the right by 16-N_val bytes
        lea     twtempl, [vpshufb_shf_table +16]
        sub     twtempl, N_val
        vmovdqu xmm0, [twtempl]
        vpxor   xmm0, [mask1]
        vpshufb xmm3, xmm0

        vpblendvb xmm3, xmm3, xmm2, xmm0        ; xmm0 is implicit
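        ; vpblendvb picks, per byte, xmm2 (a copy of the block decrypted above)
        ; where the mask's top bit is set and the shifted tail bytes elsewhere,
        ; assembling the full 16-byte block for the final decryption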

        ; xor Tweak value
        vmovdqa xmm8, [TW]
        vpxor   xmm8, xmm3      ; keep the state in xmm8: swapping the vpxor destination saves a move from xmm3

        ; decrypt the last block with cipher stealing
        vpxor   xmm8, [keys]            ; ARK
        vaesdec xmm8, [keys + 16*1]     ; round 1
        vaesdec xmm8, [keys + 16*2]     ; round 2
        vaesdec xmm8, [keys + 16*3]     ; round 3
        vaesdec xmm8, [keys + 16*4]     ; round 4
        vaesdec xmm8, [keys + 16*5]     ; round 5
        vaesdec xmm8, [keys + 16*6]     ; round 6
        vaesdec xmm8, [keys + 16*7]     ; round 7
        vaesdec xmm8, [keys + 16*8]     ; round 8
        vaesdec xmm8, [keys + 16*9]     ; round 9
        vaesdeclast xmm8, [keys + 16*10] ; round 10

        ; xor Tweak value
        vpxor   xmm8, [TW]

_done:
        ; store the last output block
        vmovdqu [ptr_ciphertext+16*7], xmm8

_ret_:

        mov     rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rdi, [_gpr + 8*1]
        mov     rsi, [_gpr + 8*2]

        vmovdqa xmm6, [_xmm + 16*0]
        vmovdqa xmm7, [_xmm + 16*1]
        vmovdqa xmm8, [_xmm + 16*2]
        vmovdqa xmm9, [_xmm + 16*3]
        vmovdqa xmm10, [_xmm + 16*4]
        vmovdqa xmm11, [_xmm + 16*5]
        vmovdqa xmm12, [_xmm + 16*6]
        vmovdqa xmm13, [_xmm + 16*7]
        vmovdqa xmm14, [_xmm + 16*8]
        vmovdqa xmm15, [_xmm + 16*9]
%endif

        add     rsp, VARIABLE_OFFSET

        ret


_less_than_128_bytes:
        cmp     N_val, 16
        jb      _ret_

        mov     tmp1, N_val
        and     tmp1, (7 << 4)
        cmp     tmp1, (6 << 4)
        je      _num_blocks_is_6
        cmp     tmp1, (5 << 4)
        je      _num_blocks_is_5
        cmp     tmp1, (4 << 4)
        je      _num_blocks_is_4
        cmp     tmp1, (3 << 4)
        je      _num_blocks_is_3
        cmp     tmp1, (2 << 4)
        je      _num_blocks_is_2
        cmp     tmp1, (1 << 4)
        je      _num_blocks_is_1


_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7

        sub     ptr_plaintext, 16*1
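        ; rewind the pointer so that [ptr_plaintext + 112 + N_val] addresses
        ; the last 16 input bytes, letting every block count share the same
        ; _steal_cipher tail code (the output pointer is rewound likewise below)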

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_7

_steal_cipher_7:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm15
        vmovdqa xmm15, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6

        sub     ptr_ciphertext, 16*1
        vmovdqa xmm8, xmm7
        jmp     _steal_cipher

_done_7:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5
        vmovdqu [ptr_ciphertext+16*5], xmm6

        sub     ptr_ciphertext, 16*1
        vmovdqa xmm8, xmm7
        jmp     _done


_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6

        sub     ptr_plaintext, 16*2

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_6

_steal_cipher_6:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm14
        vmovdqa xmm14, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5

        sub     ptr_ciphertext, 16*2
        vmovdqa xmm8, xmm6
        jmp     _steal_cipher

_done_6:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4
        vmovdqu [ptr_ciphertext+16*4], xmm5

        sub     ptr_ciphertext, 16*2
        vmovdqa xmm8, xmm6
        jmp     _done


_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5

        sub     ptr_plaintext, 16*3

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_5

_steal_cipher_5:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm13
        vmovdqa xmm13, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4

        sub     ptr_ciphertext, 16*3
        vmovdqa xmm8, xmm5
        jmp     _steal_cipher

_done_5:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3
        vmovdqu [ptr_ciphertext+16*3], xmm4

        sub     ptr_ciphertext, 16*3
        vmovdqa xmm8, xmm5
        jmp     _done


_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4

        sub     ptr_plaintext, 16*4

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_4

_steal_cipher_4:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm12
        vmovdqa xmm12, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3

        sub     ptr_ciphertext, 16*4
        vmovdqa xmm8, xmm4
        jmp     _steal_cipher

_done_4:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2
        vmovdqu [ptr_ciphertext+16*2], xmm3

        sub     ptr_ciphertext, 16*4
        vmovdqa xmm8, xmm4
        jmp     _done


_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3

        sub     ptr_plaintext, 16*5

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_3

_steal_cipher_3:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm11
        vmovdqa xmm11, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2

        sub     ptr_ciphertext, 16*5
        vmovdqa xmm8, xmm3
        jmp     _steal_cipher

_done_3:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext+16*0], xmm1
        vmovdqu [ptr_ciphertext+16*1], xmm2

        sub     ptr_ciphertext, 16*5
        vmovdqa xmm8, xmm3
        jmp     _done


_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2

        sub     ptr_plaintext, 16*6

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_2

_steal_cipher_2:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm10
        vmovdqa xmm10, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1

        sub     ptr_ciphertext, 16*6
        vmovdqa xmm8, xmm2
        jmp     _steal_cipher

_done_2:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        ; store ciphertext
        vmovdqu [ptr_ciphertext], xmm1

        sub     ptr_ciphertext, 16*6
        vmovdqa xmm8, xmm2
        jmp     _done


_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1

        sub     ptr_plaintext, 16*7

        and     N_val, 15               ; N_val = N_val mod 16
        je      _done_1

_steal_cipher_1:
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph

        vmovdqa [TW + 16*0], xmm9
        vmovdqa xmm9, [TW+16*1]

        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        ; store ciphertext

        sub     ptr_ciphertext, 16*7
        vmovdqa xmm8, xmm1
        jmp     _steal_cipher

_done_1:
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        ; store ciphertext

        sub     ptr_ciphertext, 16*7
        vmovdqa xmm8, xmm1
        jmp     _done

section .data
align 16

vpshufb_shf_table:
; use these values for shift constants for the vpshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

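; mask1 flips the top bit of every byte of the shuffle constant (via the
; vpxor in _steal_cipher), turning the left-shift pattern into the
; complementary right-shift/blend mask used to stitch the stolen tail bytes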
mask1:
        dq 0x8080808080808080, 0x8080808080808080