]> git.proxmox.com Git - ceph.git/blame - ceph/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / aes / XTS_AES_256_dec_expanded_key_sse.asm
CommitLineData
7c673cae
FG
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
1e59de90 5; modification, are permitted provided that the following conditions
7c673cae
FG
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29; XTS decrypt function with 256-bit AES
30; expanded keys are not aligned
31; plaintext and ciphertext are not aligned
32; second key is stored in the stack as aligned to 16 Bytes
33; first key is required only once, no need for storage of this key
34
35%include "reg_sizes.asm"
36
37default rel
; Stack frame layout (relative to rsp after "sub rsp, VARIABLE_OFFSET"):
;   TW   = rsp          : scratch for 8 x 16B tweak values
;   keys = rsp + 16*8   : 16B-aligned copy of the 15 expanded round keys
;   then the xmm (win64 only) and GPR save areas defined below.
38%define TW rsp ; store 8 tweak values
39%define keys rsp + 16*8 ; store 15 expanded keys
40
41%ifidn __OUTPUT_FORMAT__, win64
42 %define _xmm rsp + 16*23 ; store xmm6:xmm15
43%endif
44
45%ifidn __OUTPUT_FORMAT__, elf64
46%define _gpr rsp + 16*23 ; store rbx
47%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
48%else
49%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
50%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
51%endif
52
; Low byte of the XTS/GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1;
; XORed into the low tweak qword when GF(2^128) doubling carries out.
53%define GHASH_POLY 0x87
54
55;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
56;void XTS_AES_256_dec_expanded_key_sse(
57; UINT8 *k2, // key used for tweaking, 16*15 bytes
58; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
59; UINT8 *TW_initial, // initial tweak value, 16 bytes
60; UINT64 N, // sector size, in bytes
61; const UINT8 *ct, // ciphertext sector input data
62; UINT8 *pt); // plaintext sector output data
63;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
64
; Argument registers per ABI. Note: the pointer names keep the encrypt-variant
; naming, so for this decrypt routine ptr_plaintext actually walks the
; ciphertext INPUT and ptr_ciphertext the plaintext OUTPUT.
65; arguments for input parameters
66%ifidn __OUTPUT_FORMAT__, elf64
67 %xdefine ptr_key2 rdi
68 %xdefine ptr_key1 rsi
69 %xdefine T_val rdx
70 %xdefine N_val rcx
71 %xdefine ptr_plaintext r8
72 %xdefine ptr_ciphertext r9
73%else
74 %xdefine ptr_key2 rcx
75 %xdefine ptr_key1 rdx
76 %xdefine T_val r8
77 %xdefine N_val r9
78 %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
79 %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
80%endif
81
82; arguments for temp parameters
83%ifidn __OUTPUT_FORMAT__, elf64
84 %define tmp1 rdi
85 %define target_ptr_val rsi
86 %define ghash_poly_8b r10
87 %define ghash_poly_8b_temp r11
88%else
89 %define tmp1 rcx
90 %define target_ptr_val rdx
91 %define ghash_poly_8b rdi
92 %define ghash_poly_8b_temp rsi
93%endif
94
95%define twtempl rax ; global temp registers used for tweak computation
96%define twtemph rbx
97
98
99; macro to encrypt the tweak value
100
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; encrypt_T: AES-256-encrypt the initial tweak value in %%xstate_tweak
; with the pre-expanded tweak key schedule at %%ptr_key2 (ARK + 13 x
; aesenc + aesenclast), and — interleaved for scheduling, not for any
; data dependency — copy each of the 15 expanded data-key round keys
; from the (possibly unaligned) %%ptr_key1 into the 16B-aligned stack
; area %%ptr_expanded_keys at the same index ([keys+16*i] = [k1+16*i]).
; The decrypt rounds later read these with aesdec, so the caller is
; expected to supply k1 already expanded for decryption — TODO confirm
; against the key-expansion helper used by callers.
; The final encrypted tweak is stored to [TW].
; NOTE(review): %%xraw_key and %%xtmp are accepted but never referenced
; here — presumably kept for interface parity with the non-expanded-key
; variant of this macro.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
101%macro encrypt_T 8
102%define %%xkey2 %1
103%define %%xstate_tweak %2
104%define %%xkey1 %3
105%define %%xraw_key %4
106%define %%xtmp %5
107%define %%ptr_key2 %6
108%define %%ptr_key1 %7
109%define %%ptr_expanded_keys %8
110
111 movdqu %%xkey2, [%%ptr_key2]
112 pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
113
114 movdqu %%xkey1, [%%ptr_key1 + 16*14]
115 movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
116
117 movdqu %%xkey2, [%%ptr_key2 + 16*1]
118 aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
119
120 movdqu %%xkey1, [%%ptr_key1 + 16*13]
121 movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
122
123
124 movdqu %%xkey2, [%%ptr_key2 + 16*2]
125 aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
126
127 movdqu %%xkey1, [%%ptr_key1 + 16*12]
128 movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
129
130 movdqu %%xkey2, [%%ptr_key2 + 16*3]
131 aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
132
133 movdqu %%xkey1, [%%ptr_key1 + 16*11]
134 movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
135
136 movdqu %%xkey2, [%%ptr_key2 + 16*4]
137 aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
138
139 movdqu %%xkey1, [%%ptr_key1 + 16*10]
140 movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
141
142 movdqu %%xkey2, [%%ptr_key2 + 16*5]
143 aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
144
145 movdqu %%xkey1, [%%ptr_key1 + 16*9]
146 movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
147
148 movdqu %%xkey2, [%%ptr_key2 + 16*6]
149 aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
150
151 movdqu %%xkey1, [%%ptr_key1 + 16*8]
152 movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
153
154 movdqu %%xkey2, [%%ptr_key2 + 16*7]
155 aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
156
157 movdqu %%xkey1, [%%ptr_key1 + 16*7]
158 movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
159
160
161 movdqu %%xkey2, [%%ptr_key2 + 16*8]
162 aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
163
164 movdqu %%xkey1, [%%ptr_key1 + 16*6]
165 movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
166
167
168 movdqu %%xkey2, [%%ptr_key2 + 16*9]
169 aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
170
171 movdqu %%xkey1, [%%ptr_key1 + 16*5]
172 movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
173
174
175 movdqu %%xkey2, [%%ptr_key2 + 16*10]
176 aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
177
178 movdqu %%xkey1, [%%ptr_key1 + 16*4]
179 movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
180
181
182 movdqu %%xkey2, [%%ptr_key2 + 16*11]
183 aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
184
185 movdqu %%xkey1, [%%ptr_key1 + 16*3]
186 movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
187
188 movdqu %%xkey2, [%%ptr_key2 + 16*12]
189 aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
190
191 movdqu %%xkey1, [%%ptr_key1 + 16*2]
192 movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
193
194 movdqu %%xkey2, [%%ptr_key2 + 16*13]
195 aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
196
197 movdqu %%xkey1, [%%ptr_key1 + 16*1]
198 movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
199
200 movdqu %%xkey2, [%%ptr_key2 + 16*14]
201 aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
202
203 movdqu %%xkey1, [%%ptr_key1 + 16*0]
204 movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
205
206 movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
207%endmacro
208
209
210; generate initial tweak values
211; load initial plaintext values
; initialize: set up the first %%num_initial_blocks (1..7) of a sector.
; Loads tweak 1 from [TW] into %%TW1 and the first input blocks into
; %%ST1.. (input is the ciphertext for this decrypt routine, addressed
; through the encrypt-named ptr_plaintext register). Tweaks 2..N are
; derived one after another by GF(2^128) doubling: shl/adc doubles the
; 128-bit value through twtempl:twtemph, cmovc selects GHASH_POLY on
; carry-out and xor folds it into the low qword; each new tweak is
; written to the [TW] scratch area and reloaded as an xmm.
; On exit twtempl:twtemph hold the last tweak produced, which the
; encrypt_initial macro continues doubling from.
; NOTE(review): %%ST8 (%8) is declared but never used here — only up to
; 7 initial blocks are ever loaded.
212%macro initialize 16
213
214%define %%ST1 %1 ; state 1
215%define %%ST2 %2 ; state 2
216%define %%ST3 %3 ; state 3
217%define %%ST4 %4 ; state 4
218%define %%ST5 %5 ; state 5
219%define %%ST6 %6 ; state 6
220%define %%ST7 %7 ; state 7
221%define %%ST8 %8 ; state 8
222
223%define %%TW1 %9 ; tweak 1
224%define %%TW2 %10 ; tweak 2
225%define %%TW3 %11 ; tweak 3
226%define %%TW4 %12 ; tweak 4
227%define %%TW5 %13 ; tweak 5
228%define %%TW6 %14 ; tweak 6
229%define %%TW7 %15 ; tweak 7
230
231%define %%num_initial_blocks %16
232
233
234 ; generate next Tweak values
235 movdqa %%TW1, [TW+16*0]
236 mov twtempl, [TW+8*0]
237 mov twtemph, [TW+8*1]
238 movdqu %%ST1, [ptr_plaintext+16*0]
239%if (%%num_initial_blocks>=2)
240 xor ghash_poly_8b_temp, ghash_poly_8b_temp
241 shl twtempl, 1
242 adc twtemph, twtemph
243 cmovc ghash_poly_8b_temp, ghash_poly_8b
244 xor twtempl, ghash_poly_8b_temp
245 mov [TW+8*2], twtempl
246 mov [TW+8*3], twtemph;
247 movdqa %%TW2, [TW+16*1]
248 movdqu %%ST2, [ptr_plaintext+16*1]
249%endif
250%if (%%num_initial_blocks>=3)
251 xor ghash_poly_8b_temp, ghash_poly_8b_temp
252 shl twtempl, 1
253 adc twtemph, twtemph
254 cmovc ghash_poly_8b_temp, ghash_poly_8b
255 xor twtempl, ghash_poly_8b_temp
256 mov [TW+8*4], twtempl
257 mov [TW+8*5], twtemph;
258 movdqa %%TW3, [TW+16*2]
259 movdqu %%ST3, [ptr_plaintext+16*2]
260%endif
261%if (%%num_initial_blocks>=4)
262 xor ghash_poly_8b_temp, ghash_poly_8b_temp
263 shl twtempl, 1
264 adc twtemph, twtemph
265 cmovc ghash_poly_8b_temp, ghash_poly_8b
266 xor twtempl, ghash_poly_8b_temp
267 mov [TW+8*6], twtempl
268 mov [TW+8*7], twtemph;
269 movdqa %%TW4, [TW+16*3]
270 movdqu %%ST4, [ptr_plaintext+16*3]
271%endif
272%if (%%num_initial_blocks>=5)
273 xor ghash_poly_8b_temp, ghash_poly_8b_temp
274 shl twtempl, 1
275 adc twtemph, twtemph
276 cmovc ghash_poly_8b_temp, ghash_poly_8b
277 xor twtempl, ghash_poly_8b_temp
278 mov [TW+8*8], twtempl
279 mov [TW+8*9], twtemph;
280 movdqa %%TW5, [TW+16*4]
281 movdqu %%ST5, [ptr_plaintext+16*4]
282%endif
283%if (%%num_initial_blocks>=6)
284 xor ghash_poly_8b_temp, ghash_poly_8b_temp
285 shl twtempl, 1
286 adc twtemph, twtemph
287 cmovc ghash_poly_8b_temp, ghash_poly_8b
288 xor twtempl, ghash_poly_8b_temp
289 mov [TW+8*10], twtempl
290 mov [TW+8*11], twtemph;
291 movdqa %%TW6, [TW+16*5]
292 movdqu %%ST6, [ptr_plaintext+16*5]
293%endif
294%if (%%num_initial_blocks>=7)
295 xor ghash_poly_8b_temp, ghash_poly_8b_temp
296 shl twtempl, 1
297 adc twtemph, twtemph
298 cmovc ghash_poly_8b_temp, ghash_poly_8b
299 xor twtempl, ghash_poly_8b_temp
300 mov [TW+8*12], twtempl
301 mov [TW+8*13], twtemph;
302 movdqa %%TW7, [TW+16*6]
303 movdqu %%ST7, [ptr_plaintext+16*6]
304%endif
305
306
307
308%endmacro
309
310
311; encrypt initial blocks of AES
312; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
313; next 8 Tweak values are generated
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; encrypt_initial: despite the (historical, shared-with-encrypt) name,
; this DECRYPTS %%num_blocks (1..7) blocks held in %%ST1..%%ST7 using
; aesdec/aesdeclast with the round keys cached at [keys], applying the
; XTS tweak XOR before the first round and after the last.
; When %%lt128 == 0, generation of the next 8 tweak values is stitched
; into the gaps between AES rounds: the shl/adc/cmovc/xor steps of each
; GF(2^128) doubling are deliberately SPLIT ACROSS rounds, so the carry
; flag produced in one gap is consumed in the next — the instruction
; order here is load-bearing and must not be rearranged. The fresh
; tweaks are written to [TW] and reloaded into %%TW1..%%TW7 at the end.
; Clobbers: twtempl/twtemph (rax/rbx), ghash_poly_8b_temp, %%T0, flags.
; NOTE(review): %%ST8 (%8) is declared but unused; at most 7 blocks are
; processed by this macro.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
314%macro encrypt_initial 18
315%define %%ST1 %1 ; state 1
316%define %%ST2 %2 ; state 2
317%define %%ST3 %3 ; state 3
318%define %%ST4 %4 ; state 4
319%define %%ST5 %5 ; state 5
320%define %%ST6 %6 ; state 6
321%define %%ST7 %7 ; state 7
322%define %%ST8 %8 ; state 8
323
324%define %%TW1 %9 ; tweak 1
325%define %%TW2 %10 ; tweak 2
326%define %%TW3 %11 ; tweak 3
327%define %%TW4 %12 ; tweak 4
328%define %%TW5 %13 ; tweak 5
329%define %%TW6 %14 ; tweak 6
330%define %%TW7 %15 ; tweak 7
331%define %%T0 %16 ; Temp register
332%define %%num_blocks %17
333; %%num_blocks blocks encrypted
334; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
335
336%define %%lt128 %18 ; less than 128 bytes
337
338 ; xor Tweak value
339 pxor %%ST1, %%TW1
340%if (%%num_blocks>=2)
341 pxor %%ST2, %%TW2
342%endif
343%if (%%num_blocks>=3)
344 pxor %%ST3, %%TW3
345%endif
346%if (%%num_blocks>=4)
347 pxor %%ST4, %%TW4
348%endif
349%if (%%num_blocks>=5)
350 pxor %%ST5, %%TW5
351%endif
352%if (%%num_blocks>=6)
353 pxor %%ST6, %%TW6
354%endif
355%if (%%num_blocks>=7)
356 pxor %%ST7, %%TW7
357%endif
358
359
360 ; ARK
361 movdqa %%T0, [keys]
362 pxor %%ST1, %%T0
363%if (%%num_blocks>=2)
364 pxor %%ST2, %%T0
365%endif
366%if (%%num_blocks>=3)
367 pxor %%ST3, %%T0
368%endif
369%if (%%num_blocks>=4)
370 pxor %%ST4, %%T0
371%endif
372%if (%%num_blocks>=5)
373 pxor %%ST5, %%T0
374%endif
375%if (%%num_blocks>=6)
376 pxor %%ST6, %%T0
377%endif
378%if (%%num_blocks>=7)
379 pxor %%ST7, %%T0
380%endif
381
382
383 %if (0 == %%lt128)
384 xor ghash_poly_8b_temp, ghash_poly_8b_temp
385 shl twtempl, 1
386 adc twtemph, twtemph
387 %endif
388
389 ; round 1
390 movdqa %%T0, [keys + 16*1]
391 aesdec %%ST1, %%T0
392%if (%%num_blocks>=2)
393 aesdec %%ST2, %%T0
394%endif
395%if (%%num_blocks>=3)
396 aesdec %%ST3, %%T0
397%endif
398%if (%%num_blocks>=4)
399 aesdec %%ST4, %%T0
400%endif
401%if (%%num_blocks>=5)
402 aesdec %%ST5, %%T0
403%endif
404%if (%%num_blocks>=6)
405 aesdec %%ST6, %%T0
406%endif
407%if (%%num_blocks>=7)
408 aesdec %%ST7, %%T0
409%endif
410 %if (0 == %%lt128)
411 cmovc ghash_poly_8b_temp, ghash_poly_8b
412 xor twtempl, ghash_poly_8b_temp
413 mov [TW + 8*0], twtempl ; next Tweak1 generated
414 mov [TW + 8*1], twtemph
415 xor ghash_poly_8b_temp, ghash_poly_8b_temp
416 %endif
417
418 ; round 2
419 movdqa %%T0, [keys + 16*2]
420 aesdec %%ST1, %%T0
421%if (%%num_blocks>=2)
422 aesdec %%ST2, %%T0
423%endif
424%if (%%num_blocks>=3)
425 aesdec %%ST3, %%T0
426%endif
427%if (%%num_blocks>=4)
428 aesdec %%ST4, %%T0
429%endif
430%if (%%num_blocks>=5)
431 aesdec %%ST5, %%T0
432%endif
433%if (%%num_blocks>=6)
434 aesdec %%ST6, %%T0
435%endif
436%if (%%num_blocks>=7)
437 aesdec %%ST7, %%T0
438%endif
439
440 %if (0 == %%lt128)
441 shl twtempl, 1
442 adc twtemph, twtemph
443 cmovc ghash_poly_8b_temp, ghash_poly_8b
444 xor twtempl, ghash_poly_8b_temp
445 mov [TW + 8*2], twtempl ; next Tweak2 generated
446 %endif
447
448 ; round 3
449 movdqa %%T0, [keys + 16*3]
450 aesdec %%ST1, %%T0
451%if (%%num_blocks>=2)
452 aesdec %%ST2, %%T0
453%endif
454%if (%%num_blocks>=3)
455 aesdec %%ST3, %%T0
456%endif
457%if (%%num_blocks>=4)
458 aesdec %%ST4, %%T0
459%endif
460%if (%%num_blocks>=5)
461 aesdec %%ST5, %%T0
462%endif
463%if (%%num_blocks>=6)
464 aesdec %%ST6, %%T0
465%endif
466%if (%%num_blocks>=7)
467 aesdec %%ST7, %%T0
468%endif
469 %if (0 == %%lt128)
470 mov [TW + 8*3], twtemph
471 xor ghash_poly_8b_temp, ghash_poly_8b_temp
472 shl twtempl, 1
473 adc twtemph, twtemph
474 cmovc ghash_poly_8b_temp, ghash_poly_8b
475 %endif
476
477 ; round 4
478 movdqa %%T0, [keys + 16*4]
479 aesdec %%ST1, %%T0
480%if (%%num_blocks>=2)
481 aesdec %%ST2, %%T0
482%endif
483%if (%%num_blocks>=3)
484 aesdec %%ST3, %%T0
485%endif
486%if (%%num_blocks>=4)
487 aesdec %%ST4, %%T0
488%endif
489%if (%%num_blocks>=5)
490 aesdec %%ST5, %%T0
491%endif
492%if (%%num_blocks>=6)
493 aesdec %%ST6, %%T0
494%endif
495%if (%%num_blocks>=7)
496 aesdec %%ST7, %%T0
497%endif
498
499 %if (0 == %%lt128)
500 xor twtempl, ghash_poly_8b_temp
501 mov [TW + 8*4], twtempl ; next Tweak3 generated
502 mov [TW + 8*5], twtemph
503 xor ghash_poly_8b_temp, ghash_poly_8b_temp
504 shl twtempl, 1
505 %endif
506
507 ; round 5
508 movdqa %%T0, [keys + 16*5]
509 aesdec %%ST1, %%T0
510%if (%%num_blocks>=2)
511 aesdec %%ST2, %%T0
512%endif
513%if (%%num_blocks>=3)
514 aesdec %%ST3, %%T0
515%endif
516%if (%%num_blocks>=4)
517 aesdec %%ST4, %%T0
518%endif
519%if (%%num_blocks>=5)
520 aesdec %%ST5, %%T0
521%endif
522%if (%%num_blocks>=6)
523 aesdec %%ST6, %%T0
524%endif
525%if (%%num_blocks>=7)
526 aesdec %%ST7, %%T0
527%endif
528
529 %if (0 == %%lt128)
530 adc twtemph, twtemph ; consumes CF from the shl in the round-4 gap above
531 cmovc ghash_poly_8b_temp, ghash_poly_8b
532 xor twtempl, ghash_poly_8b_temp
533 mov [TW + 8*6], twtempl ; next Tweak4 generated
534 mov [TW + 8*7], twtemph
535 %endif
536
537 ; round 6
538 movdqa %%T0, [keys + 16*6]
539 aesdec %%ST1, %%T0
540%if (%%num_blocks>=2)
541 aesdec %%ST2, %%T0
542%endif
543%if (%%num_blocks>=3)
544 aesdec %%ST3, %%T0
545%endif
546%if (%%num_blocks>=4)
547 aesdec %%ST4, %%T0
548%endif
549%if (%%num_blocks>=5)
550 aesdec %%ST5, %%T0
551%endif
552%if (%%num_blocks>=6)
553 aesdec %%ST6, %%T0
554%endif
555%if (%%num_blocks>=7)
556 aesdec %%ST7, %%T0
557%endif
558
559 %if (0 == %%lt128)
560 xor ghash_poly_8b_temp, ghash_poly_8b_temp
561 shl twtempl, 1
562 adc twtemph, twtemph
563 cmovc ghash_poly_8b_temp, ghash_poly_8b
564 xor twtempl, ghash_poly_8b_temp
565 mov [TW + 8*8], twtempl ; next Tweak5 generated
566 mov [TW + 8*9], twtemph
567 %endif
568
569 ; round 7
570 movdqa %%T0, [keys + 16*7]
571 aesdec %%ST1, %%T0
572%if (%%num_blocks>=2)
573 aesdec %%ST2, %%T0
574%endif
575%if (%%num_blocks>=3)
576 aesdec %%ST3, %%T0
577%endif
578%if (%%num_blocks>=4)
579 aesdec %%ST4, %%T0
580%endif
581%if (%%num_blocks>=5)
582 aesdec %%ST5, %%T0
583%endif
584%if (%%num_blocks>=6)
585 aesdec %%ST6, %%T0
586%endif
587%if (%%num_blocks>=7)
588 aesdec %%ST7, %%T0
589%endif
590
591 %if (0 == %%lt128)
592 xor ghash_poly_8b_temp, ghash_poly_8b_temp
593 shl twtempl, 1
594 adc twtemph, twtemph
595 cmovc ghash_poly_8b_temp, ghash_poly_8b
596 xor twtempl, ghash_poly_8b_temp
597 mov [TW + 8*10], twtempl ; next Tweak6 generated
598 mov [TW + 8*11], twtemph
599 %endif
600 ; round 8
601 movdqa %%T0, [keys + 16*8]
602 aesdec %%ST1, %%T0
603%if (%%num_blocks>=2)
604 aesdec %%ST2, %%T0
605%endif
606%if (%%num_blocks>=3)
607 aesdec %%ST3, %%T0
608%endif
609%if (%%num_blocks>=4)
610 aesdec %%ST4, %%T0
611%endif
612%if (%%num_blocks>=5)
613 aesdec %%ST5, %%T0
614%endif
615%if (%%num_blocks>=6)
616 aesdec %%ST6, %%T0
617%endif
618%if (%%num_blocks>=7)
619 aesdec %%ST7, %%T0
620%endif
621
622 %if (0 == %%lt128)
623 xor ghash_poly_8b_temp, ghash_poly_8b_temp
624 shl twtempl, 1
625 adc twtemph, twtemph
626 cmovc ghash_poly_8b_temp, ghash_poly_8b
627 xor twtempl, ghash_poly_8b_temp
628 mov [TW + 8*12], twtempl ; next Tweak7 generated
629 mov [TW + 8*13], twtemph
630 %endif
631 ; round 9
632 movdqa %%T0, [keys + 16*9]
633 aesdec %%ST1, %%T0
634%if (%%num_blocks>=2)
635 aesdec %%ST2, %%T0
636%endif
637%if (%%num_blocks>=3)
638 aesdec %%ST3, %%T0
639%endif
640%if (%%num_blocks>=4)
641 aesdec %%ST4, %%T0
642%endif
643%if (%%num_blocks>=5)
644 aesdec %%ST5, %%T0
645%endif
646%if (%%num_blocks>=6)
647 aesdec %%ST6, %%T0
648%endif
649%if (%%num_blocks>=7)
650 aesdec %%ST7, %%T0
651%endif
652
653 %if (0 == %%lt128)
654 xor ghash_poly_8b_temp, ghash_poly_8b_temp
655 shl twtempl, 1
656 adc twtemph, twtemph
657 cmovc ghash_poly_8b_temp, ghash_poly_8b
658 xor twtempl, ghash_poly_8b_temp
659 mov [TW + 8*14], twtempl ; next Tweak8 generated
660 mov [TW + 8*15], twtemph
661 %endif
662 ; round 10
663 movdqa %%T0, [keys + 16*10]
664 aesdec %%ST1, %%T0
665%if (%%num_blocks>=2)
666 aesdec %%ST2, %%T0
667%endif
668%if (%%num_blocks>=3)
669 aesdec %%ST3, %%T0
670%endif
671%if (%%num_blocks>=4)
672 aesdec %%ST4, %%T0
673%endif
674%if (%%num_blocks>=5)
675 aesdec %%ST5, %%T0
676%endif
677%if (%%num_blocks>=6)
678 aesdec %%ST6, %%T0
679%endif
680%if (%%num_blocks>=7)
681 aesdec %%ST7, %%T0
682%endif
683 ; round 11
684 movdqa %%T0, [keys + 16*11]
685 aesdec %%ST1, %%T0
686%if (%%num_blocks>=2)
687 aesdec %%ST2, %%T0
688%endif
689%if (%%num_blocks>=3)
690 aesdec %%ST3, %%T0
691%endif
692%if (%%num_blocks>=4)
693 aesdec %%ST4, %%T0
694%endif
695%if (%%num_blocks>=5)
696 aesdec %%ST5, %%T0
697%endif
698%if (%%num_blocks>=6)
699 aesdec %%ST6, %%T0
700%endif
701%if (%%num_blocks>=7)
702 aesdec %%ST7, %%T0
703%endif
704
705 ; round 12
706 movdqa %%T0, [keys + 16*12]
707 aesdec %%ST1, %%T0
708%if (%%num_blocks>=2)
709 aesdec %%ST2, %%T0
710%endif
711%if (%%num_blocks>=3)
712 aesdec %%ST3, %%T0
713%endif
714%if (%%num_blocks>=4)
715 aesdec %%ST4, %%T0
716%endif
717%if (%%num_blocks>=5)
718 aesdec %%ST5, %%T0
719%endif
720%if (%%num_blocks>=6)
721 aesdec %%ST6, %%T0
722%endif
723%if (%%num_blocks>=7)
724 aesdec %%ST7, %%T0
725%endif
726
727 ; round 13
728 movdqa %%T0, [keys + 16*13]
729 aesdec %%ST1, %%T0
730%if (%%num_blocks>=2)
731 aesdec %%ST2, %%T0
732%endif
733%if (%%num_blocks>=3)
734 aesdec %%ST3, %%T0
735%endif
736%if (%%num_blocks>=4)
737 aesdec %%ST4, %%T0
738%endif
739%if (%%num_blocks>=5)
740 aesdec %%ST5, %%T0
741%endif
742%if (%%num_blocks>=6)
743 aesdec %%ST6, %%T0
744%endif
745%if (%%num_blocks>=7)
746 aesdec %%ST7, %%T0
747%endif
748
749 ; round 14
750 movdqa %%T0, [keys + 16*14]
751 aesdeclast %%ST1, %%T0
752%if (%%num_blocks>=2)
753 aesdeclast %%ST2, %%T0
754%endif
755%if (%%num_blocks>=3)
756 aesdeclast %%ST3, %%T0
757%endif
758%if (%%num_blocks>=4)
759 aesdeclast %%ST4, %%T0
760%endif
761%if (%%num_blocks>=5)
762 aesdeclast %%ST5, %%T0
763%endif
764%if (%%num_blocks>=6)
765 aesdeclast %%ST6, %%T0
766%endif
767%if (%%num_blocks>=7)
768 aesdeclast %%ST7, %%T0
769%endif
770
771 ; xor Tweak values
772 pxor %%ST1, %%TW1
773%if (%%num_blocks>=2)
774 pxor %%ST2, %%TW2
775%endif
776%if (%%num_blocks>=3)
777 pxor %%ST3, %%TW3
778%endif
779%if (%%num_blocks>=4)
780 pxor %%ST4, %%TW4
781%endif
782%if (%%num_blocks>=5)
783 pxor %%ST5, %%TW5
784%endif
785%if (%%num_blocks>=6)
786 pxor %%ST6, %%TW6
787%endif
788%if (%%num_blocks>=7)
789 pxor %%ST7, %%TW7
790%endif
791
792
793%if (0 == %%lt128)
794 ; load next Tweak values
795 movdqa %%TW1, [TW + 16*0]
796 movdqa %%TW2, [TW + 16*1]
797 movdqa %%TW3, [TW + 16*2]
798 movdqa %%TW4, [TW + 16*3]
799 movdqa %%TW5, [TW + 16*4]
800 movdqa %%TW6, [TW + 16*5]
801 movdqa %%TW7, [TW + 16*6]
802
803%endif
804
805%endmacro
806
807
808; Encrypt 8 blocks in parallel
809; generate next 8 tweak values
; encrypt_by_eight: main-loop body — DECRYPTS (aesdec/aesdeclast,
; despite the shared encrypt-variant name) 8 full blocks in parallel
; using the stack-cached round keys, with the XTS tweak XOR applied
; before the first round and after the last. Unless %%last_eight is
; set, the GF(2^128) doubling for the next 8 tweaks is interleaved
; into the gaps between AES rounds; as in encrypt_initial, a doubling
; step's shl/adc/cmovc sequence is split across consecutive gaps, so
; the carry flag is live BETWEEN %if blocks — do not reorder.
; %%TW1..%%TW7 are reloaded from [TW] at the end; %%TW8 stays in
; memory at [TW + 16*7] — presumably re-read by the caller's main
; loop (not visible in this chunk; confirm against the caller).
; Clobbers: twtempl/twtemph (rax/rbx), ghash_poly_8b_temp, %%T0, flags.
810%macro encrypt_by_eight 18
811%define %%ST1 %1 ; state 1
812%define %%ST2 %2 ; state 2
813%define %%ST3 %3 ; state 3
814%define %%ST4 %4 ; state 4
815%define %%ST5 %5 ; state 5
816%define %%ST6 %6 ; state 6
817%define %%ST7 %7 ; state 7
818%define %%ST8 %8 ; state 8
819%define %%TW1 %9 ; tweak 1
820%define %%TW2 %10 ; tweak 2
821%define %%TW3 %11 ; tweak 3
822%define %%TW4 %12 ; tweak 4
823%define %%TW5 %13 ; tweak 5
824%define %%TW6 %14 ; tweak 6
825%define %%TW7 %15 ; tweak 7
826%define %%TW8 %16 ; tweak 8
827%define %%T0 %17 ; Temp register
828%define %%last_eight %18
829
830 ; xor Tweak values
831 pxor %%ST1, %%TW1
832 pxor %%ST2, %%TW2
833 pxor %%ST3, %%TW3
834 pxor %%ST4, %%TW4
835 pxor %%ST5, %%TW5
836 pxor %%ST6, %%TW6
837 pxor %%ST7, %%TW7
838 pxor %%ST8, %%TW8
839
840 ; ARK
841 movdqa %%T0, [keys]
842 pxor %%ST1, %%T0
843 pxor %%ST2, %%T0
844 pxor %%ST3, %%T0
845 pxor %%ST4, %%T0
846 pxor %%ST5, %%T0
847 pxor %%ST6, %%T0
848 pxor %%ST7, %%T0
849 pxor %%ST8, %%T0
850
851%if (0 == %%last_eight)
852 xor ghash_poly_8b_temp, ghash_poly_8b_temp
853 shl twtempl, 1
854 adc twtemph, twtemph
855 cmovc ghash_poly_8b_temp, ghash_poly_8b
856%endif
857 ; round 1
858 movdqa %%T0, [keys + 16*1]
859 aesdec %%ST1, %%T0
860 aesdec %%ST2, %%T0
861 aesdec %%ST3, %%T0
862 aesdec %%ST4, %%T0
863 aesdec %%ST5, %%T0
864 aesdec %%ST6, %%T0
865 aesdec %%ST7, %%T0
866 aesdec %%ST8, %%T0
867%if (0 == %%last_eight)
868 xor twtempl, ghash_poly_8b_temp
869 mov [TW + 8*0], twtempl
870 mov [TW + 8*1], twtemph
871 xor ghash_poly_8b_temp, ghash_poly_8b_temp
872%endif
873 ; round 2
874 movdqa %%T0, [keys + 16*2]
875 aesdec %%ST1, %%T0
876 aesdec %%ST2, %%T0
877 aesdec %%ST3, %%T0
878 aesdec %%ST4, %%T0
879 aesdec %%ST5, %%T0
880 aesdec %%ST6, %%T0
881 aesdec %%ST7, %%T0
882 aesdec %%ST8, %%T0
883%if (0 == %%last_eight)
884 shl twtempl, 1
885 adc twtemph, twtemph
886 cmovc ghash_poly_8b_temp, ghash_poly_8b
887 xor twtempl, ghash_poly_8b_temp
888
889%endif
890 ; round 3
891 movdqa %%T0, [keys + 16*3]
892 aesdec %%ST1, %%T0
893 aesdec %%ST2, %%T0
894 aesdec %%ST3, %%T0
895 aesdec %%ST4, %%T0
896 aesdec %%ST5, %%T0
897 aesdec %%ST6, %%T0
898 aesdec %%ST7, %%T0
899 aesdec %%ST8, %%T0
900%if (0 == %%last_eight)
901 mov [TW + 8*2], twtempl
902 mov [TW + 8*3], twtemph
903 xor ghash_poly_8b_temp, ghash_poly_8b_temp
904 shl twtempl, 1
905%endif
906 ; round 4
907 movdqa %%T0, [keys + 16*4]
908 aesdec %%ST1, %%T0
909 aesdec %%ST2, %%T0
910 aesdec %%ST3, %%T0
911 aesdec %%ST4, %%T0
912 aesdec %%ST5, %%T0
913 aesdec %%ST6, %%T0
914 aesdec %%ST7, %%T0
915 aesdec %%ST8, %%T0
916%if (0 == %%last_eight)
917 adc twtemph, twtemph ; CF carried over from the shl in the round-3 gap
918 cmovc ghash_poly_8b_temp, ghash_poly_8b
919 xor twtempl, ghash_poly_8b_temp
920 mov [TW + 8*4], twtempl
921%endif
922 ; round 5
923 movdqa %%T0, [keys + 16*5]
924 aesdec %%ST1, %%T0
925 aesdec %%ST2, %%T0
926 aesdec %%ST3, %%T0
927 aesdec %%ST4, %%T0
928 aesdec %%ST5, %%T0
929 aesdec %%ST6, %%T0
930 aesdec %%ST7, %%T0
931 aesdec %%ST8, %%T0
932%if (0 == %%last_eight)
933 mov [TW + 8*5], twtemph
934 xor ghash_poly_8b_temp, ghash_poly_8b_temp
935 shl twtempl, 1
936 adc twtemph, twtemph
937%endif
938 ; round 6
939 movdqa %%T0, [keys + 16*6]
940 aesdec %%ST1, %%T0
941 aesdec %%ST2, %%T0
942 aesdec %%ST3, %%T0
943 aesdec %%ST4, %%T0
944 aesdec %%ST5, %%T0
945 aesdec %%ST6, %%T0
946 aesdec %%ST7, %%T0
947 aesdec %%ST8, %%T0
948%if (0 == %%last_eight)
949 cmovc ghash_poly_8b_temp, ghash_poly_8b
950 xor twtempl, ghash_poly_8b_temp
951 mov [TW + 8*6], twtempl
952 mov [TW + 8*7], twtemph
953%endif
954 ; round 7
955 movdqa %%T0, [keys + 16*7]
956 aesdec %%ST1, %%T0
957 aesdec %%ST2, %%T0
958 aesdec %%ST3, %%T0
959 aesdec %%ST4, %%T0
960 aesdec %%ST5, %%T0
961 aesdec %%ST6, %%T0
962 aesdec %%ST7, %%T0
963 aesdec %%ST8, %%T0
964%if (0 == %%last_eight)
965 xor ghash_poly_8b_temp, ghash_poly_8b_temp
966 shl twtempl, 1
967 adc twtemph, twtemph
968 cmovc ghash_poly_8b_temp, ghash_poly_8b
969%endif
970 ; round 8
971 movdqa %%T0, [keys + 16*8]
972 aesdec %%ST1, %%T0
973 aesdec %%ST2, %%T0
974 aesdec %%ST3, %%T0
975 aesdec %%ST4, %%T0
976 aesdec %%ST5, %%T0
977 aesdec %%ST6, %%T0
978 aesdec %%ST7, %%T0
979 aesdec %%ST8, %%T0
980%if (0 == %%last_eight)
981 xor twtempl, ghash_poly_8b_temp
982 mov [TW + 8*8], twtempl
983 mov [TW + 8*9], twtemph
984 xor ghash_poly_8b_temp, ghash_poly_8b_temp
985%endif
986 ; round 9
987 movdqa %%T0, [keys + 16*9]
988 aesdec %%ST1, %%T0
989 aesdec %%ST2, %%T0
990 aesdec %%ST3, %%T0
991 aesdec %%ST4, %%T0
992 aesdec %%ST5, %%T0
993 aesdec %%ST6, %%T0
994 aesdec %%ST7, %%T0
995 aesdec %%ST8, %%T0
996%if (0 == %%last_eight)
997 shl twtempl, 1
998 adc twtemph, twtemph
999 cmovc ghash_poly_8b_temp, ghash_poly_8b
1000 xor twtempl, ghash_poly_8b_temp
1001%endif
1002 ; round 10
1003 movdqa %%T0, [keys + 16*10]
1004 aesdec %%ST1, %%T0
1005 aesdec %%ST2, %%T0
1006 aesdec %%ST3, %%T0
1007 aesdec %%ST4, %%T0
1008 aesdec %%ST5, %%T0
1009 aesdec %%ST6, %%T0
1010 aesdec %%ST7, %%T0
1011 aesdec %%ST8, %%T0
1012%if (0 == %%last_eight)
1013 mov [TW + 8*10], twtempl
1014 mov [TW + 8*11], twtemph
1015 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1016 shl twtempl, 1
1017%endif
1018 ; round 11
1019 movdqa %%T0, [keys + 16*11]
1020 aesdec %%ST1, %%T0
1021 aesdec %%ST2, %%T0
1022 aesdec %%ST3, %%T0
1023 aesdec %%ST4, %%T0
1024 aesdec %%ST5, %%T0
1025 aesdec %%ST6, %%T0
1026 aesdec %%ST7, %%T0
1027 aesdec %%ST8, %%T0
1028%if (0 == %%last_eight)
1029 adc twtemph, twtemph ; CF carried over from the shl in the round-10 gap
1030 cmovc ghash_poly_8b_temp, ghash_poly_8b
1031 xor twtempl, ghash_poly_8b_temp
1032 mov [TW + 8*12], twtempl
1033%endif
1034 ; round 12
1035 movdqa %%T0, [keys + 16*12]
1036 aesdec %%ST1, %%T0
1037 aesdec %%ST2, %%T0
1038 aesdec %%ST3, %%T0
1039 aesdec %%ST4, %%T0
1040 aesdec %%ST5, %%T0
1041 aesdec %%ST6, %%T0
1042 aesdec %%ST7, %%T0
1043 aesdec %%ST8, %%T0
1044%if (0 == %%last_eight)
1045 mov [TW + 8*13], twtemph
1046 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1047 shl twtempl, 1
1048 adc twtemph, twtemph
1049%endif
1050 ; round 13
1051 movdqa %%T0, [keys + 16*13]
1052 aesdec %%ST1, %%T0
1053 aesdec %%ST2, %%T0
1054 aesdec %%ST3, %%T0
1055 aesdec %%ST4, %%T0
1056 aesdec %%ST5, %%T0
1057 aesdec %%ST6, %%T0
1058 aesdec %%ST7, %%T0
1059 aesdec %%ST8, %%T0
1060%if (0 == %%last_eight)
1061 cmovc ghash_poly_8b_temp, ghash_poly_8b
1062 xor twtempl, ghash_poly_8b_temp
1063; mov [TW + 8*14], twtempl
1064; mov [TW + 8*15], twtemph
1065%endif
1066 ; round 14
1067 movdqa %%T0, [keys + 16*14]
1068 aesdeclast %%ST1, %%T0
1069 aesdeclast %%ST2, %%T0
1070 aesdeclast %%ST3, %%T0
1071 aesdeclast %%ST4, %%T0
1072 aesdeclast %%ST5, %%T0
1073 aesdeclast %%ST6, %%T0
1074 aesdeclast %%ST7, %%T0
1075 aesdeclast %%ST8, %%T0
1076
1077 ; xor Tweak values
1078 pxor %%ST1, %%TW1
1079 pxor %%ST2, %%TW2
1080 pxor %%ST3, %%TW3
1081 pxor %%ST4, %%TW4
1082 pxor %%ST5, %%TW5
1083 pxor %%ST6, %%TW6
1084 pxor %%ST7, %%TW7
1085 pxor %%ST8, %%TW8
1086
; NOTE(review): the tweak-8 store below (deferred from the round-13 gap)
; runs even when %%last_eight != 0, writing stale twtempl/twtemph to
; [TW + 8*14..15] — harmless only if callers never read those slots
; after the last eight blocks; confirm against the (out-of-view) caller.
1087 mov [TW + 8*14], twtempl
1088 mov [TW + 8*15], twtemph
1089 ; load next Tweak values
1090 movdqa %%TW1, [TW + 16*0]
1091 movdqa %%TW2, [TW + 16*1]
1092 movdqa %%TW3, [TW + 16*2]
1093 movdqa %%TW4, [TW + 16*3]
1094 movdqa %%TW5, [TW + 16*4]
1095 movdqa %%TW6, [TW + 16*5]
1096 movdqa %%TW7, [TW + 16*6]
1097
1098%endmacro
1099
1100
1101section .text
1102
1e59de90 1103mk_global XTS_AES_256_dec_expanded_key_sse, function
7c673cae 1104XTS_AES_256_dec_expanded_key_sse:
1e59de90 1105 endbranch
7c673cae
FG
1106
1107 sub rsp, VARIABLE_OFFSET
1108
1109 mov [_gpr + 8*0], rbx
1110%ifidn __OUTPUT_FORMAT__, win64
1111 mov [_gpr + 8*1], rdi
1112 mov [_gpr + 8*2], rsi
1113
1114 movdqa [_xmm + 16*0], xmm6
1115 movdqa [_xmm + 16*1], xmm7
1116 movdqa [_xmm + 16*2], xmm8
1117 movdqa [_xmm + 16*3], xmm9
1118 movdqa [_xmm + 16*4], xmm10
1119 movdqa [_xmm + 16*5], xmm11
1120 movdqa [_xmm + 16*6], xmm12
1121 movdqa [_xmm + 16*7], xmm13
1122 movdqa [_xmm + 16*8], xmm14
1123 movdqa [_xmm + 16*9], xmm15
1124%endif
1125
1126 mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
1127
1128
1129 movdqu xmm1, [T_val] ; read initial Tweak value
1130 pxor xmm4, xmm4 ; for key expansion
1131 encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
1132
1133
1134%ifidn __OUTPUT_FORMAT__, win64
1135 mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
1136 mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
1137%endif
1138
1139
1140
1141 mov target_ptr_val, N_val
1142 and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
1143 sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
1144 jl _less_than_128_bytes
1145
1146 add target_ptr_val, ptr_ciphertext
1147
1148
1149 mov tmp1, N_val
1150 and tmp1, (7 << 4)
1151 jz _initial_num_blocks_is_0
1152
1153 cmp tmp1, (4 << 4)
1154 je _initial_num_blocks_is_4
1155
1156
1157
1158 cmp tmp1, (6 << 4)
1159 je _initial_num_blocks_is_6
1160
1161 cmp tmp1, (5 << 4)
1162 je _initial_num_blocks_is_5
1163
1164
1165
1166 cmp tmp1, (3 << 4)
1167 je _initial_num_blocks_is_3
1168
1169 cmp tmp1, (2 << 4)
1170 je _initial_num_blocks_is_2
1171
1172 cmp tmp1, (1 << 4)
1173 je _initial_num_blocks_is_1
1174
; Leading-block stanzas: each processes k = 7..1 blocks before entering the
; 8-blocks-per-iteration main loop.  Pattern per stanza:
;   initialize      - load k plaintext blocks and compute their tweaks (macro)
;   encrypt_initial - AES-XTS transform of the k blocks (macro; aesdec path here,
;                     so output is decrypted plaintext despite the inherited
;                     "ciphertext" naming from the encrypt variant)
;   store k output blocks, advance both pointers, and either fall into the
;   main loop or jump to the 8-block tail when the target address is reached.
1175_initial_num_blocks_is_7:
1176	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
1177	add	ptr_plaintext, 16*7
1178	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
1179	; store ciphertext
1180	movdqu	[ptr_ciphertext+16*0], xmm1
1181	movdqu	[ptr_ciphertext+16*1], xmm2
1182	movdqu	[ptr_ciphertext+16*2], xmm3
1183	movdqu	[ptr_ciphertext+16*3], xmm4
1184	movdqu	[ptr_ciphertext+16*4], xmm5
1185	movdqu	[ptr_ciphertext+16*5], xmm6
1186	movdqu	[ptr_ciphertext+16*6], xmm7
1187	add	ptr_ciphertext, 16*7
1188
1189	cmp	ptr_ciphertext, target_ptr_val
1190	je	_last_eight
1191
1192	jmp	_main_loop
1193_initial_num_blocks_is_6:
1194	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
1195	add	ptr_plaintext, 16*6
1196	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
1197	; store ciphertext
1198	movdqu	[ptr_ciphertext+16*0], xmm1
1199	movdqu	[ptr_ciphertext+16*1], xmm2
1200	movdqu	[ptr_ciphertext+16*2], xmm3
1201	movdqu	[ptr_ciphertext+16*3], xmm4
1202	movdqu	[ptr_ciphertext+16*4], xmm5
1203	movdqu	[ptr_ciphertext+16*5], xmm6
1204	add	ptr_ciphertext, 16*6
1205
1206	cmp	ptr_ciphertext, target_ptr_val
1207	je	_last_eight
1208
1209	jmp	_main_loop
1210_initial_num_blocks_is_5:
1211	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
1212	add	ptr_plaintext, 16*5
1213	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
1214	; store ciphertext
1215	movdqu	[ptr_ciphertext+16*0], xmm1
1216	movdqu	[ptr_ciphertext+16*1], xmm2
1217	movdqu	[ptr_ciphertext+16*2], xmm3
1218	movdqu	[ptr_ciphertext+16*3], xmm4
1219	movdqu	[ptr_ciphertext+16*4], xmm5
1220	add	ptr_ciphertext, 16*5
1221
1222	cmp	ptr_ciphertext, target_ptr_val
1223	je	_last_eight
1224
1225	jmp	_main_loop
1226_initial_num_blocks_is_4:
1227	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
1228	add	ptr_plaintext, 16*4
1229	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
1230	; store ciphertext
1231	movdqu	[ptr_ciphertext+16*0], xmm1
1232	movdqu	[ptr_ciphertext+16*1], xmm2
1233	movdqu	[ptr_ciphertext+16*2], xmm3
1234	movdqu	[ptr_ciphertext+16*3], xmm4
1235	add	ptr_ciphertext, 16*4
1236
1237	cmp	ptr_ciphertext, target_ptr_val
1238	je	_last_eight
1239
1240	jmp	_main_loop
1241
1242
1243_initial_num_blocks_is_3:
1244	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
1245	add	ptr_plaintext, 16*3
1246	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
1247	; store ciphertext
1248	movdqu	[ptr_ciphertext+16*0], xmm1
1249	movdqu	[ptr_ciphertext+16*1], xmm2
1250	movdqu	[ptr_ciphertext+16*2], xmm3
1251	add	ptr_ciphertext, 16*3
1252
1253	cmp	ptr_ciphertext, target_ptr_val
1254	je	_last_eight
1255
1256	jmp	_main_loop
1257_initial_num_blocks_is_2:
1258	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
1259	add	ptr_plaintext, 16*2
1260	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
1261	; store ciphertext
1262	movdqu	[ptr_ciphertext], xmm1
1263	movdqu	[ptr_ciphertext+16], xmm2
1264	add	ptr_ciphertext, 16*2
1265
1266	cmp	ptr_ciphertext, target_ptr_val
1267	je	_last_eight
1268
1269	jmp	_main_loop
1270
1271_initial_num_blocks_is_1:
1272	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
1273	add	ptr_plaintext, 16*1
1274	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
1275	; store ciphertext
1276	movdqu	[ptr_ciphertext], xmm1
1277	add	ptr_ciphertext, 16
1278
1279	cmp	ptr_ciphertext, target_ptr_val
1280	je	_last_eight
1281
1282	jmp	_main_loop
1283
; No leading blocks: pre-expand tweaks TW[1..6] from TW[0] into the TW stack
; area and load TW[0..6] into xmm9..xmm15 for the first main-loop iteration.
; Each doubling step is the GF(2^128) multiply-by-x of the XTS tweak:
;   shl twtempl,1 / adc twtemph,twtemph  - 128-bit left shift (CF chains the halves)
;   cmovc/xor with ghash_poly_8b         - conditional reduction by the XTS
;                                          polynomial when a bit shifted out.
; NOTE: shl sets CF and adc/cmovc consume it - this flag chain must not be
; interrupted, which is why no rewrite of instruction order is safe here.
1284_initial_num_blocks_is_0:
1285	mov	twtempl, [TW+8*0]
1286	mov	twtemph, [TW+8*1]
1287	movdqa	xmm9, [TW+16*0]
1288
1289	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1290	shl	twtempl, 1
1291	adc	twtemph, twtemph
1292	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1293	xor	twtempl, ghash_poly_8b_temp
1294	mov	[TW+8*2], twtempl
1295	mov	[TW+8*3], twtemph
1296	movdqa	xmm10, [TW+16*1]
1297
1298	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1299	shl	twtempl, 1
1300	adc	twtemph, twtemph
1301	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1302	xor	twtempl, ghash_poly_8b_temp
1303	mov	[TW+8*4], twtempl
1304	mov	[TW+8*5], twtemph
1305	movdqa	xmm11, [TW+16*2]
1306
1307
1308	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1309	shl	twtempl, 1
1310	adc	twtemph, twtemph
1311	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1312	xor	twtempl, ghash_poly_8b_temp
1313	mov	[TW+8*6], twtempl
1314	mov	[TW+8*7], twtemph
1315	movdqa	xmm12, [TW+16*3]
1316
1317
1318	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1319	shl	twtempl, 1
1320	adc	twtemph, twtemph
1321	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1322	xor	twtempl, ghash_poly_8b_temp
1323	mov	[TW+8*8], twtempl
1324	mov	[TW+8*9], twtemph
1325	movdqa	xmm13, [TW+16*4]
1326
1327	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1328	shl	twtempl, 1
1329	adc	twtemph, twtemph
1330	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1331	xor	twtempl, ghash_poly_8b_temp
1332	mov	[TW+8*10], twtempl
1333	mov	[TW+8*11], twtemph
1334	movdqa	xmm14, [TW+16*5]
1335
1336	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1337	shl	twtempl, 1
1338	adc	twtemph, twtemph
1339	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1340	xor	twtempl, ghash_poly_8b_temp
1341	mov	[TW+8*12], twtempl
1342	mov	[TW+8*13], twtemph
1343	movdqa	xmm15, [TW+16*6]
1344
1345	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1346	shl	twtempl, 1
1347	adc	twtemph, twtemph
1348	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1349	xor	twtempl, ghash_poly_8b_temp
1350	mov	[TW+8*14], twtempl
1351	mov	[TW+8*15], twtemph
1352	;movdqa	xmm16, [TW+16*7]	; 8th tweak stays in memory (no xmm16 in SSE)
1353
1354	cmp	ptr_ciphertext, target_ptr_val
1355	je	_last_eight
; Main loop: decrypt 8 blocks (128 B) per iteration.  Tweaks for the next
; iteration are computed inside encrypt_by_eight, stitched with the AES
; rounds; the 8th tweak lives at [TW+16*7].  Loop exits when the output
; pointer reaches target_ptr_val (128 B before the true end).
1356_main_loop:
1357	; load plaintext
1358	movdqu	xmm1, [ptr_plaintext+16*0]
1359	movdqu	xmm2, [ptr_plaintext+16*1]
1360	movdqu	xmm3, [ptr_plaintext+16*2]
1361	movdqu	xmm4, [ptr_plaintext+16*3]
1362	movdqu	xmm5, [ptr_plaintext+16*4]
1363	movdqu	xmm6, [ptr_plaintext+16*5]
1364	movdqu	xmm7, [ptr_plaintext+16*6]
1365	movdqu	xmm8, [ptr_plaintext+16*7]
1366
1367	add	ptr_plaintext, 128
1368
1369	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
1370
1371	; store ciphertext
1372	movdqu	[ptr_ciphertext+16*0], xmm1
1373	movdqu	[ptr_ciphertext+16*1], xmm2
1374	movdqu	[ptr_ciphertext+16*2], xmm3
1375	movdqu	[ptr_ciphertext+16*3], xmm4
1376	movdqu	[ptr_ciphertext+16*4], xmm5
1377	movdqu	[ptr_ciphertext+16*5], xmm6
1378	movdqu	[ptr_ciphertext+16*6], xmm7
1379	movdqu	[ptr_ciphertext+16*7], xmm8
1380	add	ptr_ciphertext, 128
1381
1382	cmp	ptr_ciphertext, target_ptr_val
1383	jne	_main_loop
1384
; Tail: the final 8 blocks.  If the total length is a multiple of 16
; (N_val mod 16 == 0) go to _done_final; otherwise prepare decrypt-side
; ciphertext stealing: for decryption the LAST FULL block must be decrypted
; with the NEXT tweak, and the stolen partial block with the current one,
; so the tweak values are swapped via [TW+16*0] / [TW+16*7] below.
1385_last_eight:
1386
1387	and	N_val, 15		; N_val = N_val mod 16
1388	je	_done_final
1389
1390	; generate next Tweak value
1391	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1392	shl	twtempl, 1
1393	adc	twtemph, twtemph
1394	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1395	xor	twtempl, ghash_poly_8b_temp
1396	movdqa	xmm1, [TW + 16*7]
1397	movdqa	[TW + 16*0], xmm1			; swap tweak values for cipher stealing for decrypt
1398
1399	mov	[TW + 16*7], twtempl
1400	mov	[TW + 16*7+8], twtemph
1401
1402	; load plaintext
1403	movdqu	xmm1, [ptr_plaintext+16*0]
1404	movdqu	xmm2, [ptr_plaintext+16*1]
1405	movdqu	xmm3, [ptr_plaintext+16*2]
1406	movdqu	xmm4, [ptr_plaintext+16*3]
1407	movdqu	xmm5, [ptr_plaintext+16*4]
1408	movdqu	xmm6, [ptr_plaintext+16*5]
1409	movdqu	xmm7, [ptr_plaintext+16*6]
1410	movdqu	xmm8, [ptr_plaintext+16*7]
1411	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
1412
1413	; store ciphertext (xmm8 = last full block, kept for _steal_cipher)
1414	movdqu	[ptr_ciphertext+16*0], xmm1
1415	movdqu	[ptr_ciphertext+16*1], xmm2
1416	movdqu	[ptr_ciphertext+16*2], xmm3
1417	movdqu	[ptr_ciphertext+16*3], xmm4
1418	movdqu	[ptr_ciphertext+16*4], xmm5
1419	movdqu	[ptr_ciphertext+16*5], xmm6
1420	movdqu	[ptr_ciphertext+16*6], xmm7
1421	jmp	_steal_cipher
1422
1423
; Length is an exact multiple of 16: decrypt the final 8 blocks normally.
; xmm8 (the very last block) is stored later at _done.
1424_done_final:
1425	; load plaintext
1426	movdqu	xmm1, [ptr_plaintext+16*0]
1427	movdqu	xmm2, [ptr_plaintext+16*1]
1428	movdqu	xmm3, [ptr_plaintext+16*2]
1429	movdqu	xmm4, [ptr_plaintext+16*3]
1430	movdqu	xmm5, [ptr_plaintext+16*4]
1431	movdqu	xmm6, [ptr_plaintext+16*5]
1432	movdqu	xmm7, [ptr_plaintext+16*6]
1433	movdqu	xmm8, [ptr_plaintext+16*7]
1434	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
1435
1436	; store ciphertext
1437	movdqu	[ptr_ciphertext+16*0], xmm1
1438	movdqu	[ptr_ciphertext+16*1], xmm2
1439	movdqu	[ptr_ciphertext+16*2], xmm3
1440	movdqu	[ptr_ciphertext+16*3], xmm4
1441	movdqu	[ptr_ciphertext+16*4], xmm5
1442	movdqu	[ptr_ciphertext+16*5], xmm6
1443	movdqu	[ptr_ciphertext+16*6], xmm7
1444
1445	jmp	_done
1446
1447
; Ciphertext stealing for the trailing partial block (N_val = 1..15 bytes).
; xmm8 holds the already-decrypted last full block; its first N_val bytes
; become the final partial output, and the stolen tail of the input's last
; full block is appended to form the block decrypted below with the
; swapped tweak saved at [TW].
1448_steal_cipher:
1449	; start cipher stealing
1450
1451
1452	movdqa	xmm2, xmm8
1453
1454	; shift xmm8 to the left by 16-N_val bytes
1455	lea	twtempl, [pshufb_shf_table]
1456	movdqu	xmm0, [twtempl+N_val]
1457	pshufb	xmm8, xmm0
1458
1459
1460	movdqu	xmm3, [ptr_plaintext + 112 + N_val]	; state register is temporarily xmm3 to eliminate a move
1461	movdqu	[ptr_ciphertext + 112 + N_val], xmm8
1462
1463	; shift xmm3 to the right by 16-N_val bytes
1464	lea	twtempl, [pshufb_shf_table +16]
1465	sub	twtempl, N_val
1466	movdqu	xmm0, [twtempl]
1467	pxor	xmm0, [mask1]
1468	pshufb	xmm3, xmm0
1469
1470	pblendvb	xmm3, xmm2	;xmm0 is implicit
1471
1472	; xor Tweak value
1473	movdqa	xmm8, [TW]
1474	pxor	xmm8, xmm3	; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
1475
1476
1477	;decrypt last block with cipher stealing (AES-256: 14 rounds)
1478	pxor	xmm8, [keys]		; ARK
1479	aesdec	xmm8, [keys + 16*1]	; round 1
1480	aesdec	xmm8, [keys + 16*2]	; round 2
1481	aesdec	xmm8, [keys + 16*3]	; round 3
1482	aesdec	xmm8, [keys + 16*4]	; round 4
1483	aesdec	xmm8, [keys + 16*5]	; round 5
1484	aesdec	xmm8, [keys + 16*6]	; round 6
1485	aesdec	xmm8, [keys + 16*7]	; round 7
1486	aesdec	xmm8, [keys + 16*8]	; round 8
1487	aesdec	xmm8, [keys + 16*9]	; round 9
1488	aesdec	xmm8, [keys + 16*10]	; round 10
1489	aesdec	xmm8, [keys + 16*11]	; round 11
1490	aesdec	xmm8, [keys + 16*12]	; round 12
1491	aesdec	xmm8, [keys + 16*13]	; round 13
1492	aesdeclast	xmm8, [keys + 16*14]	; round 14
1493
1494	; xor Tweak value
1495	pxor	xmm8, [TW]
1496
; Common exit: store the final (possibly stealing-adjusted) block, restore
; callee-saved GPRs and - on Win64 only - xmm6..xmm15 (callee-saved in the
; Microsoft x64 ABI), release the stack frame, and return.
1497_done:
1498	; store last ciphertext value
1499	movdqu	[ptr_ciphertext+16*7], xmm8
1500
1501_ret_:
1502
1503	mov	rbx, [_gpr + 8*0]
1504%ifidn __OUTPUT_FORMAT__, win64
1505	mov	rdi, [_gpr + 8*1]
1506	mov	rsi, [_gpr + 8*2]
1507
1508
1509	movdqa	xmm6, [_xmm + 16*0]
1510	movdqa	xmm7, [_xmm + 16*1]
1511	movdqa	xmm8, [_xmm + 16*2]
1512	movdqa	xmm9, [_xmm + 16*3]
1513	movdqa	xmm10, [_xmm + 16*4]
1514	movdqa	xmm11, [_xmm + 16*5]
1515	movdqa	xmm12, [_xmm + 16*6]
1516	movdqa	xmm13, [_xmm + 16*7]
1517	movdqa	xmm14, [_xmm + 16*8]
1518	movdqa	xmm15, [_xmm + 16*9]
1519%endif
1520
1521	add	rsp, VARIABLE_OFFSET
1522
1523	ret
1524
1525
1526
1527
1528
; Short-input path (total length < 128 B).  Lengths below one block (16 B)
; are invalid for XTS and return immediately.  Dispatch on the full block
; count (bits 6:4 of N_val); tmp1 == (7 << 4) falls through to
; _num_blocks_is_7.
1529_less_than_128_bytes:
1530	cmp	N_val, 16
1531	jb	_ret_
1532
1533	mov	tmp1, N_val
1534	and	tmp1, (7 << 4)		; tmp1 = (number of 16B blocks) << 4
1535	cmp	tmp1, (6 << 4)
1536	je	_num_blocks_is_6
1537	cmp	tmp1, (5 << 4)
1538	je	_num_blocks_is_5
1539	cmp	tmp1, (4 << 4)
1540	je	_num_blocks_is_4
1541	cmp	tmp1, (3 << 4)
1542	je	_num_blocks_is_3
1543	cmp	tmp1, (2 << 4)
1544	je	_num_blocks_is_2
1545	cmp	tmp1, (1 << 4)
1546	je	_num_blocks_is_1
1547
1548
1549
1550
; Short-input stanzas: decrypt exactly k = 7..1 blocks (plus an optional
; stolen partial block).  Shared pattern:
;   - initialize loads k blocks and tweaks; the pointers are then rewound by
;     16*(8-k) because the stanza reuses the shared _steal_cipher/_done code,
;     which addresses the last block at offset 112 / 16*7.
;   - If N_val mod 16 != 0, one extra tweak is generated (shl/adc/cmovc
;     GF(2^128) doubling, CF-chained), the current last tweak is parked at
;     [TW+16*0] and replaced by the new one - the decrypt-side tweak swap
;     for cipher stealing - before jumping to _steal_cipher with the last
;     decrypted block copied into xmm8.
;   - Otherwise the blocks are decrypted and stored directly and control
;     jumps to _done.
1551_num_blocks_is_7:
1552	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
1553
1554	sub	ptr_plaintext, 16*1
1555
1556	and	N_val, 15		; N_val = N_val mod 16
1557	je	_done_7
1558
1559_steal_cipher_7:
1560	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1561	shl	twtempl, 1
1562	adc	twtemph, twtemph
1563	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1564	xor	twtempl, ghash_poly_8b_temp
1565	mov	[TW+8*2], twtempl
1566	mov	[TW+8*3], twtemph
1567
1568	movdqa	[TW + 16*0] , xmm15	; park current 7th tweak (stolen block uses it)
1569	movdqa	xmm15, [TW+16*1]	; last full block uses the freshly generated tweak
1570
1571	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
1572	; store ciphertext
1573	movdqu	[ptr_ciphertext+16*0], xmm1
1574	movdqu	[ptr_ciphertext+16*1], xmm2
1575	movdqu	[ptr_ciphertext+16*2], xmm3
1576	movdqu	[ptr_ciphertext+16*3], xmm4
1577	movdqu	[ptr_ciphertext+16*4], xmm5
1578	movdqu	[ptr_ciphertext+16*5], xmm6
1579
1580	sub	ptr_ciphertext, 16*1
1581	movdqa	xmm8, xmm7
1582	jmp	_steal_cipher
1583
1584_done_7:
1585	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
1586	; store ciphertext
1587	movdqu	[ptr_ciphertext+16*0], xmm1
1588	movdqu	[ptr_ciphertext+16*1], xmm2
1589	movdqu	[ptr_ciphertext+16*2], xmm3
1590	movdqu	[ptr_ciphertext+16*3], xmm4
1591	movdqu	[ptr_ciphertext+16*4], xmm5
1592	movdqu	[ptr_ciphertext+16*5], xmm6
1593
1594	sub	ptr_ciphertext, 16*1
1595	movdqa	xmm8, xmm7
1596	jmp	_done
1597
1598
1599
1600
1601
1602
1603_num_blocks_is_6:
1604	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
1605
1606	sub	ptr_plaintext, 16*2
1607
1608	and	N_val, 15		; N_val = N_val mod 16
1609	je	_done_6
1610
1611_steal_cipher_6:
1612	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1613	shl	twtempl, 1
1614	adc	twtemph, twtemph
1615	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1616	xor	twtempl, ghash_poly_8b_temp
1617	mov	[TW+8*2], twtempl
1618	mov	[TW+8*3], twtemph
1619
1620	movdqa	[TW + 16*0] , xmm14
1621	movdqa	xmm14, [TW+16*1]
1622
1623	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
1624	; store ciphertext
1625	movdqu	[ptr_ciphertext+16*0], xmm1
1626	movdqu	[ptr_ciphertext+16*1], xmm2
1627	movdqu	[ptr_ciphertext+16*2], xmm3
1628	movdqu	[ptr_ciphertext+16*3], xmm4
1629	movdqu	[ptr_ciphertext+16*4], xmm5
1630
1631	sub	ptr_ciphertext, 16*2
1632	movdqa	xmm8, xmm6
1633	jmp	_steal_cipher
1634
1635_done_6:
1636	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
1637	; store ciphertext
1638	movdqu	[ptr_ciphertext+16*0], xmm1
1639	movdqu	[ptr_ciphertext+16*1], xmm2
1640	movdqu	[ptr_ciphertext+16*2], xmm3
1641	movdqu	[ptr_ciphertext+16*3], xmm4
1642	movdqu	[ptr_ciphertext+16*4], xmm5
1643
1644	sub	ptr_ciphertext, 16*2
1645	movdqa	xmm8, xmm6
1646	jmp	_done
1647
1648
1649
1650
1651
1652_num_blocks_is_5:
1653	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
1654
1655	sub	ptr_plaintext, 16*3
1656
1657	and	N_val, 15		; N_val = N_val mod 16
1658	je	_done_5
1659
1660_steal_cipher_5:
1661	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1662	shl	twtempl, 1
1663	adc	twtemph, twtemph
1664	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1665	xor	twtempl, ghash_poly_8b_temp
1666	mov	[TW+8*2], twtempl
1667	mov	[TW+8*3], twtemph
1668
1669	movdqa	[TW + 16*0] , xmm13
1670	movdqa	xmm13, [TW+16*1]
1671
1672	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
1673	; store ciphertext
1674	movdqu	[ptr_ciphertext+16*0], xmm1
1675	movdqu	[ptr_ciphertext+16*1], xmm2
1676	movdqu	[ptr_ciphertext+16*2], xmm3
1677	movdqu	[ptr_ciphertext+16*3], xmm4
1678
1679	sub	ptr_ciphertext, 16*3
1680	movdqa	xmm8, xmm5
1681	jmp	_steal_cipher
1682
1683_done_5:
1684	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
1685	; store ciphertext
1686	movdqu	[ptr_ciphertext+16*0], xmm1
1687	movdqu	[ptr_ciphertext+16*1], xmm2
1688	movdqu	[ptr_ciphertext+16*2], xmm3
1689	movdqu	[ptr_ciphertext+16*3], xmm4
1690
1691	sub	ptr_ciphertext, 16*3
1692	movdqa	xmm8, xmm5
1693	jmp	_done
1694
1695
1696
1697
1698
1699_num_blocks_is_4:
1700	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
1701
1702	sub	ptr_plaintext, 16*4
1703
1704	and	N_val, 15		; N_val = N_val mod 16
1705	je	_done_4
1706
1707_steal_cipher_4:
1708	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1709	shl	twtempl, 1
1710	adc	twtemph, twtemph
1711	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1712	xor	twtempl, ghash_poly_8b_temp
1713	mov	[TW+8*2], twtempl
1714	mov	[TW+8*3], twtemph
1715
1716	movdqa	[TW + 16*0] , xmm12
1717	movdqa	xmm12, [TW+16*1]
1718
1719	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
1720	; store ciphertext
1721	movdqu	[ptr_ciphertext+16*0], xmm1
1722	movdqu	[ptr_ciphertext+16*1], xmm2
1723	movdqu	[ptr_ciphertext+16*2], xmm3
1724
1725	sub	ptr_ciphertext, 16*4
1726	movdqa	xmm8, xmm4
1727	jmp	_steal_cipher
1728
1729_done_4:
1730	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
1731	; store ciphertext
1732	movdqu	[ptr_ciphertext+16*0], xmm1
1733	movdqu	[ptr_ciphertext+16*1], xmm2
1734	movdqu	[ptr_ciphertext+16*2], xmm3
1735
1736	sub	ptr_ciphertext, 16*4
1737	movdqa	xmm8, xmm4
1738	jmp	_done
1739
1740
1741
1742
1743_num_blocks_is_3:
1744	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
1745
1746	sub	ptr_plaintext, 16*5
1747
1748	and	N_val, 15		; N_val = N_val mod 16
1749	je	_done_3
1750
1751_steal_cipher_3:
1752	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1753	shl	twtempl, 1
1754	adc	twtemph, twtemph
1755	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1756	xor	twtempl, ghash_poly_8b_temp
1757	mov	[TW+8*2], twtempl
1758	mov	[TW+8*3], twtemph
1759
1760	movdqa	[TW + 16*0] , xmm11
1761	movdqa	xmm11, [TW+16*1]
1762
1763	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1764	; store ciphertext
1765	movdqu	[ptr_ciphertext+16*0], xmm1
1766	movdqu	[ptr_ciphertext+16*1], xmm2
1767
1768	sub	ptr_ciphertext, 16*5
1769	movdqa	xmm8, xmm3
1770	jmp	_steal_cipher
1771
1772_done_3:
1773	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1774	; store ciphertext
1775	movdqu	[ptr_ciphertext+16*0], xmm1
1776	movdqu	[ptr_ciphertext+16*1], xmm2
1777
1778	sub	ptr_ciphertext, 16*5
1779	movdqa	xmm8, xmm3
1780	jmp	_done
1781
1782
1783
1784
1785
1786
1787_num_blocks_is_2:
1788	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
1789
1790	sub	ptr_plaintext, 16*6
1791
1792	and	N_val, 15		; N_val = N_val mod 16
1793	je	_done_2
1794
1795_steal_cipher_2:
1796	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1797	shl	twtempl, 1
1798	adc	twtemph, twtemph
1799	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1800	xor	twtempl, ghash_poly_8b_temp
1801	mov	[TW+8*2], twtempl
1802	mov	[TW+8*3], twtemph
1803
1804	movdqa	[TW + 16*0] , xmm10
1805	movdqa	xmm10, [TW+16*1]
1806
1807	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1808	; store ciphertext
1809	movdqu	[ptr_ciphertext], xmm1
1810
1811	sub	ptr_ciphertext, 16*6
1812	movdqa	xmm8, xmm2
1813	jmp	_steal_cipher
1814
1815_done_2:
1816	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1817	; store ciphertext
1818	movdqu	[ptr_ciphertext], xmm1
1819
1820	sub	ptr_ciphertext, 16*6
1821	movdqa	xmm8, xmm2
1822	jmp	_done
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836_num_blocks_is_1:
1837	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
1838
1839	sub	ptr_plaintext, 16*7
1840
1841	and	N_val, 15		; N_val = N_val mod 16
1842	je	_done_1
1843
1844_steal_cipher_1:
1845	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
1846	shl	twtempl, 1
1847	adc	twtemph, twtemph
1848	cmovc	ghash_poly_8b_temp, ghash_poly_8b
1849	xor	twtempl, ghash_poly_8b_temp
1850	mov	[TW+8*2], twtempl
1851	mov	[TW+8*3], twtemph
1852
1853	movdqa	[TW + 16*0] , xmm9
1854	movdqa	xmm9, [TW+16*1]
1855
1856	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1857	; store ciphertext (single block goes out via _steal_cipher)
1858
1859	sub	ptr_ciphertext, 16*7
1860	movdqa	xmm8, xmm1
1861	jmp	_steal_cipher
1862
1863_done_1:
1864	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1865	; store ciphertext (single block goes out via _done)
1866
1867	sub	ptr_ciphertext, 16*7
1868	movdqa	xmm8, xmm1
1869	jmp	_done
1870
1871section .data
1872align 16
1873
; Sliding byte-shift masks for pshufb: reading 16 bytes at offset N into
; this table yields a mask that shifts a register left by 16-N bytes
; (bytes with bit 7 set are zeroed by pshufb).  XORing with mask1 flips
; the pattern into the matching right-shift / blend-select mask.
1874pshufb_shf_table:
1875; use these values for shift constants for the pshufb instruction
1876; different alignments result in values as shown:
1877;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1) / shr1
1878;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
1879;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
1880;	dq 0x8b8a898887868584, 0x030201008f8e8d8c	; shl 12 (16-4) / shr4
1881;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d	; shl 11 (16-5) / shr5
1882;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e	; shl 10 (16-6) / shr6
1883;	dq 0x8e8d8c8b8a898887, 0x060504030201008f	; shl 9  (16-7) / shr7
1884;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100	; shl 8  (16-8) / shr8
1885;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201	; shl 7  (16-9) / shr9
1886;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302	; shl 6  (16-10) / shr10
1887;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403	; shl 5  (16-11) / shr11
1888;	dq 0x030201008f8e8d8c, 0x0b0a090807060504	; shl 4  (16-12) / shr12
1889;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605	; shl 3  (16-13) / shr13
1890;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706	; shl 2  (16-14) / shr14
1891;	dq 0x060504030201008f, 0x0e0d0c0b0a090807	; shl 1  (16-15) / shr15
1892dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
1893dq 0x0706050403020100, 0x000e0d0c0b0a0908
1894
; All-0x80 mask: XORed with a shift mask above to build the blend-control
; mask consumed by pblendvb in _steal_cipher.
1895mask1:
1896dq 0x8080808080808080, 0x8080808080808080
1897
1898