;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; XTS encrypt function with 128-bit AES
; caller-expanded keys may be unaligned
; plaintext and ciphertext may be unaligned
; the data key schedule (k1, the second argument) is copied to the stack, 16-byte aligned
; the tweak key schedule (k2, the first argument) is used only once, so it is not stored

%include "reg_sizes.asm"

default rel
%define TW      rsp             ; store 8 tweak values
%define keys    rsp + 16*8      ; store 11 expanded keys

%ifidn __OUTPUT_FORMAT__, win64
        %define _xmm    rsp + 16*19     ; store xmm6:xmm15
%endif

%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr    rsp + 16*19     ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1              ; VARIABLE_OFFSET has to be an odd multiple of 8
%else
%define _gpr    rsp + 16*29     ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3      ; VARIABLE_OFFSET has to be an odd multiple of 8
%endif

%define GHASH_POLY 0x87

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_128_enc_expanded_key_sse(
;               UINT8 *k2,              // key used for tweaking, 16*11 bytes
;               UINT8 *k1,              // key used for "ECB" encryption, 16*11 bytes
;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
;               UINT64 N,               // sector size, in bytes
;               const UINT8 *pt,        // plaintext sector input data
;               UINT8 *ct);             // ciphertext sector output data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
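
; A minimal C-side usage sketch (the key-expansion step and the variable names
; below are illustrative, not defined in this file; both schedules must already
; be expanded to 11 round keys):
;
;     uint8_t k2_exp[16*11], k1_exp[16*11];   // expanded tweak / data key schedules
;     uint8_t iv[16];                         // initial tweak, e.g. the sector number
;     XTS_AES_128_enc_expanded_key_sse(k2_exp, k1_exp, iv, N, pt, ct);
;
; N need not be a multiple of 16 (the tail is handled by ciphertext stealing),
; but sector sizes below 16 bytes are not processed (see _ret_ below).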

; arguments for input parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %xdefine ptr_key2 rdi
        %xdefine ptr_key1 rsi
        %xdefine T_val rdx
        %xdefine N_val rcx
        %xdefine ptr_plaintext r8
        %xdefine ptr_ciphertext r9
%else
        %xdefine ptr_key2 rcx
        %xdefine ptr_key1 rdx
        %xdefine T_val r8
        %xdefine N_val r9
        %xdefine ptr_plaintext r10      ; [rsp + VARIABLE_OFFSET + 8*5]
        %xdefine ptr_ciphertext r11     ; [rsp + VARIABLE_OFFSET + 8*6]
%endif

; arguments for temp parameters
%ifidn __OUTPUT_FORMAT__, elf64
        %define tmp1                    rdi
        %define target_ptr_val          rsi
        %define ghash_poly_8b           r10
        %define ghash_poly_8b_temp      r11
%else
        %define tmp1                    rcx
        %define target_ptr_val          rdx
        %define ghash_poly_8b           rdi
        %define ghash_poly_8b_temp      rsi
%endif

%define twtempl rax     ; global temp registers used for tweak computation
%define twtemph rbx
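
; Tweak arithmetic used throughout this file: each successive tweak is the
; previous one multiplied by x in GF(2^128), reduced modulo
; x^128 + x^7 + x^2 + x + 1. GHASH_POLY (0x87) is the low byte of that
; reduction polynomial. The 128-bit tweak lives in twtempl:twtemph, and each
; shl/adc/cmovc/xor sequence below is one doubling step; a rough C equivalent
; (variable names are illustrative):
;
;     uint64_t carry = twh >> 63;
;     twh = (twh << 1) | (twl >> 63);
;     twl = (twl << 1) ^ (carry ? 0x87 : 0);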


; macro to encrypt the tweak value

%macro  encrypt_T 8
%define %%xkey2                 %1
%define %%xstate_tweak          %2
%define %%xkey1                 %3
%define %%xraw_key              %4
%define %%xtmp                  %5
%define %%ptr_key2              %6
%define %%ptr_key1              %7
%define %%ptr_expanded_keys     %8

        movdqu  %%xkey2, [%%ptr_key2]
        pxor    %%xstate_tweak, %%xkey2                 ; ARK for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1]
        movdqa  [%%ptr_expanded_keys+16*0], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*1]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 1 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*1]
        movdqa  [%%ptr_expanded_keys+16*1], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*2]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 2 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*2]
        movdqa  [%%ptr_expanded_keys+16*2], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*3]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 3 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*3]
        movdqa  [%%ptr_expanded_keys+16*3], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*4]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 4 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*4]
        movdqa  [%%ptr_expanded_keys+16*4], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*5]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 5 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*5]
        movdqa  [%%ptr_expanded_keys+16*5], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*6]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 6 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*6]
        movdqa  [%%ptr_expanded_keys+16*6], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*7]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 7 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*7]
        movdqa  [%%ptr_expanded_keys+16*7], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*8]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 8 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*8]
        movdqa  [%%ptr_expanded_keys+16*8], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*9]
        aesenc  %%xstate_tweak, %%xkey2                 ; round 9 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*9]
        movdqa  [%%ptr_expanded_keys+16*9], %%xkey1     ; store round keys in stack

        movdqu  %%xkey2, [%%ptr_key2 + 16*10]
        aesenclast %%xstate_tweak, %%xkey2              ; round 10 for tweak encryption

        movdqu  %%xkey1, [%%ptr_key1 + 16*10]
        movdqa  [%%ptr_expanded_keys+16*10], %%xkey1    ; store round keys in stack

        movdqa  [TW], %%xstate_tweak                    ; store the encrypted Tweak value
%endmacro
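
; encrypt_T computes the initial tweak T = AES-128-Enc(k2, TW_initial) with the
; caller-expanded k2 schedule and, interleaved with that, copies the eleven k1
; round keys into the 16-byte-aligned stack area at `keys`, so the bulk loops
; below can use aligned movdqa loads. k2 is consumed once and never stored.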


; generate initial tweak values
; load initial plaintext values
%macro  initialize 16

%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8

%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7

%define %%num_initial_blocks    %16


        ; generate next Tweak values
        movdqa  %%TW1, [TW+16*0]
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        movdqu  %%ST1, [ptr_plaintext+16*0]
%if (%%num_initial_blocks>=2)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        movdqa  %%TW2, [TW+16*1]
        movdqu  %%ST2, [ptr_plaintext+16*1]
%endif
%if (%%num_initial_blocks>=3)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        movdqa  %%TW3, [TW+16*2]
        movdqu  %%ST3, [ptr_plaintext+16*2]
%endif
%if (%%num_initial_blocks>=4)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        movdqa  %%TW4, [TW+16*3]
        movdqu  %%ST4, [ptr_plaintext+16*3]
%endif
%if (%%num_initial_blocks>=5)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        movdqa  %%TW5, [TW+16*4]
        movdqu  %%ST5, [ptr_plaintext+16*4]
%endif
%if (%%num_initial_blocks>=6)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        movdqa  %%TW6, [TW+16*5]
        movdqu  %%ST6, [ptr_plaintext+16*5]
%endif
%if (%%num_initial_blocks>=7)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        movdqa  %%TW7, [TW+16*6]
        movdqu  %%ST7, [ptr_plaintext+16*6]
%endif

%endmacro


; encrypt initial blocks of AES
; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
; next 8 Tweak values are generated
%macro  encrypt_initial 18
%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8

%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7
%define %%T0    %16     ; temp register
%define %%num_blocks    %17
; %%num_blocks blocks encrypted
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7

%define %%lt128 %18     ; less than 128 bytes

        ; xor Tweak value
        pxor    %%ST1, %%TW1
%if (%%num_blocks>=2)
        pxor    %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        pxor    %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        pxor    %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        pxor    %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        pxor    %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        pxor    %%ST7, %%TW7
%endif

        ; ARK
        movdqa  %%T0, [keys]
        pxor    %%ST1, %%T0
%if (%%num_blocks>=2)
        pxor    %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        pxor    %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        pxor    %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        pxor    %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        pxor    %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        pxor    %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

        ; round 1
        movdqa  %%T0, [keys + 16*1]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif
%if (0 == %%lt128)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl     ; next Tweak1 generated
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 2
        movdqa  %%T0, [keys + 16*2]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*2], twtempl     ; next Tweak2 generated
%endif

        ; round 3
        movdqa  %%T0, [keys + 16*3]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif
%if (0 == %%lt128)
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 4
        movdqa  %%T0, [keys + 16*4]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl     ; next Tweak3 generated
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif

        ; round 5
        movdqa  %%T0, [keys + 16*5]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl     ; next Tweak4 generated
        mov     [TW + 8*7], twtemph
%endif

        ; round 6
        movdqa  %%T0, [keys + 16*6]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl     ; next Tweak5 generated
        mov     [TW + 8*9], twtemph
%endif

        ; round 7
        movdqa  %%T0, [keys + 16*7]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*10], twtempl    ; next Tweak6 generated
        mov     [TW + 8*11], twtemph
%endif
        ; round 8
        movdqa  %%T0, [keys + 16*8]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl    ; next Tweak7 generated
        mov     [TW + 8*13], twtemph
%endif
        ; round 9
        movdqa  %%T0, [keys + 16*9]
        aesenc  %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenc  %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenc  %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenc  %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenc  %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenc  %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenc  %%ST7, %%T0
%endif

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*14], twtempl    ; next Tweak8 generated
        mov     [TW + 8*15], twtemph
%endif

        ; round 10
        movdqa  %%T0, [keys + 16*10]
        aesenclast %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenclast %%ST2, %%T0
%endif
%if (%%num_blocks>=3)
        aesenclast %%ST3, %%T0
%endif
%if (%%num_blocks>=4)
        aesenclast %%ST4, %%T0
%endif
%if (%%num_blocks>=5)
        aesenclast %%ST5, %%T0
%endif
%if (%%num_blocks>=6)
        aesenclast %%ST6, %%T0
%endif
%if (%%num_blocks>=7)
        aesenclast %%ST7, %%T0
%endif

        ; xor Tweak values
        pxor    %%ST1, %%TW1
%if (%%num_blocks>=2)
        pxor    %%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
        pxor    %%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
        pxor    %%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
        pxor    %%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
        pxor    %%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
        pxor    %%ST7, %%TW7
%endif

%if (0 == %%lt128)
        ; load next Tweak values
        movdqa  %%TW1, [TW + 16*0]
        movdqa  %%TW2, [TW + 16*1]
        movdqa  %%TW3, [TW + 16*2]
        movdqa  %%TW4, [TW + 16*3]
        movdqa  %%TW5, [TW + 16*4]
        movdqa  %%TW6, [TW + 16*5]
        movdqa  %%TW7, [TW + 16*6]
%endif

%endmacro
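
; Structural note: when %%lt128 is 0, the scalar doubling steps that produce
; the next eight tweak values are threaded between the aesenc rounds above.
; The integer work (shl/adc/cmovc/xor) and the AES-NI rounds run on largely
; independent execution resources, so the tweak computation is effectively
; hidden behind the encryption rounds.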


; Encrypt 8 blocks in parallel
; generate next 8 tweak values
%macro  encrypt_by_eight 18
%define %%ST1   %1      ; state 1
%define %%ST2   %2      ; state 2
%define %%ST3   %3      ; state 3
%define %%ST4   %4      ; state 4
%define %%ST5   %5      ; state 5
%define %%ST6   %6      ; state 6
%define %%ST7   %7      ; state 7
%define %%ST8   %8      ; state 8
%define %%TW1   %9      ; tweak 1
%define %%TW2   %10     ; tweak 2
%define %%TW3   %11     ; tweak 3
%define %%TW4   %12     ; tweak 4
%define %%TW5   %13     ; tweak 5
%define %%TW6   %14     ; tweak 6
%define %%TW7   %15     ; tweak 7
%define %%TW8   %16     ; tweak 8
%define %%T0    %17     ; temp register
%define %%last_eight    %18

        ; xor Tweak values
        pxor    %%ST1, %%TW1
        pxor    %%ST2, %%TW2
        pxor    %%ST3, %%TW3
        pxor    %%ST4, %%TW4
        pxor    %%ST5, %%TW5
        pxor    %%ST6, %%TW6
        pxor    %%ST7, %%TW7
        pxor    %%ST8, %%TW8

        ; ARK
        movdqa  %%T0, [keys]
        pxor    %%ST1, %%T0
        pxor    %%ST2, %%T0
        pxor    %%ST3, %%T0
        pxor    %%ST4, %%T0
        pxor    %%ST5, %%T0
        pxor    %%ST6, %%T0
        pxor    %%ST7, %%T0
        pxor    %%ST8, %%T0

%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif
        ; round 1
        movdqa  %%T0, [keys + 16*1]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
        ; round 2
        movdqa  %%T0, [keys + 16*2]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif
        ; round 3
        movdqa  %%T0, [keys + 16*3]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*2], twtempl
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif
        ; round 4
        movdqa  %%T0, [keys + 16*4]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl
%endif
        ; round 5
        movdqa  %%T0, [keys + 16*5]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif
        ; round 6
        movdqa  %%T0, [keys + 16*6]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl
        mov     [TW + 8*7], twtemph
%endif
        ; round 7
        movdqa  %%T0, [keys + 16*7]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif
        ; round 8
        movdqa  %%T0, [keys + 16*8]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl
        mov     [TW + 8*9], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
        ; round 9
        movdqa  %%T0, [keys + 16*9]
        aesenc  %%ST1, %%T0
        aesenc  %%ST2, %%T0
        aesenc  %%ST3, %%T0
        aesenc  %%ST4, %%T0
        aesenc  %%ST5, %%T0
        aesenc  %%ST6, %%T0
        aesenc  %%ST7, %%T0
        aesenc  %%ST8, %%T0
%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif

%if (0 == %%last_eight)
        mov     [TW + 8*10], twtempl
        mov     [TW + 8*11], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
%endif

%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl
%endif

%if (0 == %%last_eight)
        mov     [TW + 8*13], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        ; mov   [TW + 8*14], twtempl
        ; mov   [TW + 8*15], twtemph
%endif
        ; round 10
        movdqa  %%T0, [keys + 16*10]
        aesenclast %%ST1, %%T0
        aesenclast %%ST2, %%T0
        aesenclast %%ST3, %%T0
        aesenclast %%ST4, %%T0
        aesenclast %%ST5, %%T0
        aesenclast %%ST6, %%T0
        aesenclast %%ST7, %%T0
        aesenclast %%ST8, %%T0

        ; xor Tweak values
        pxor    %%ST1, %%TW1
        pxor    %%ST2, %%TW2
        pxor    %%ST3, %%TW3
        pxor    %%ST4, %%TW4
        pxor    %%ST5, %%TW5
        pxor    %%ST6, %%TW6
        pxor    %%ST7, %%TW7
        pxor    %%ST8, %%TW8

        mov     [TW + 8*14], twtempl
        mov     [TW + 8*15], twtemph
        ; load next Tweak values
        movdqa  %%TW1, [TW + 16*0]
        movdqa  %%TW2, [TW + 16*1]
        movdqa  %%TW3, [TW + 16*2]
        movdqa  %%TW4, [TW + 16*3]
        movdqa  %%TW5, [TW + 16*4]
        movdqa  %%TW6, [TW + 16*5]
        movdqa  %%TW7, [TW + 16*6]

%endmacro
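
; encrypt_by_eight keeps eight independent AES states in flight, which helps
; cover the multi-cycle latency of aesenc, and generates the next eight tweaks
; in the gaps between rounds; %%last_eight suppresses that tweak work on the
; final iteration, where no further tweaks are needed.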

section .text

global XTS_AES_128_enc_expanded_key_sse:function
XTS_AES_128_enc_expanded_key_sse:

        sub     rsp, VARIABLE_OFFSET

        mov     [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
        mov     [_gpr + 8*1], rdi
        mov     [_gpr + 8*2], rsi

        movdqa  [_xmm + 16*0], xmm6
        movdqa  [_xmm + 16*1], xmm7
        movdqa  [_xmm + 16*2], xmm8
        movdqa  [_xmm + 16*3], xmm9
        movdqa  [_xmm + 16*4], xmm10
        movdqa  [_xmm + 16*5], xmm11
        movdqa  [_xmm + 16*6], xmm12
        movdqa  [_xmm + 16*7], xmm13
        movdqa  [_xmm + 16*8], xmm14
        movdqa  [_xmm + 16*9], xmm15
%endif

        mov     ghash_poly_8b, GHASH_POLY       ; load 0x87 to ghash_poly_8b

        movdqu  xmm1, [T_val]                   ; read initial Tweak value
        pxor    xmm4, xmm4                      ; for key expansion
        encrypt_T       xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys

%ifidn __OUTPUT_FORMAT__, win64
        mov     ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]    ; plaintext pointer
        mov     ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]   ; ciphertext pointer
%endif

        mov     target_ptr_val, N_val
        and     target_ptr_val, -16             ; target_ptr_val = N_val - (N_val mod 16)
        sub     target_ptr_val, 128             ; adjust target_ptr_val because the last 8 blocks are not stitched with Tweak calculations
        jl      _less_than_128_bytes

        add     target_ptr_val, ptr_ciphertext

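        ; Dispatch on the initial block count: bits 4..6 of N select how many
        ; blocks (0..7) are encrypted before the 8-block main loop, so the
        ; remaining length is a multiple of 128 bytes. Roughly, in C:
        ;
        ;     size_t initial_blocks = (N >> 4) & 7;   // (N / 16) mod 8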
        mov     tmp1, N_val
        and     tmp1, (7 << 4)
        jz      _initial_num_blocks_is_0

        cmp     tmp1, (4 << 4)
        je      _initial_num_blocks_is_4

        cmp     tmp1, (6 << 4)
        je      _initial_num_blocks_is_6

        cmp     tmp1, (5 << 4)
        je      _initial_num_blocks_is_5

        cmp     tmp1, (3 << 4)
        je      _initial_num_blocks_is_3

        cmp     tmp1, (2 << 4)
        je      _initial_num_blocks_is_2

        cmp     tmp1, (1 << 4)
        je      _initial_num_blocks_is_1

_initial_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        add     ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5
        movdqu  [ptr_ciphertext+16*5], xmm6
        movdqu  [ptr_ciphertext+16*6], xmm7
        add     ptr_ciphertext, 16*7

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        add     ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5
        movdqu  [ptr_ciphertext+16*5], xmm6
        add     ptr_ciphertext, 16*6

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        add     ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5
        add     ptr_ciphertext, 16*5

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        add     ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        add     ptr_ciphertext, 16*4

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop


_initial_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        add     ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        add     ptr_ciphertext, 16*3

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop
_initial_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        add     ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext], xmm1
        movdqu  [ptr_ciphertext+16], xmm2
        add     ptr_ciphertext, 16*2

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        add     ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
        ; store ciphertext
        movdqu  [ptr_ciphertext], xmm1
        add     ptr_ciphertext, 16

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight

        jmp     _main_loop

_initial_num_blocks_is_0:
        mov     twtempl, [TW+8*0]
        mov     twtemph, [TW+8*1]
        movdqa  xmm9, [TW+16*0]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*2], twtempl
        mov     [TW+8*3], twtemph
        movdqa  xmm10, [TW+16*1]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*4], twtempl
        mov     [TW+8*5], twtemph
        movdqa  xmm11, [TW+16*2]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*6], twtempl
        mov     [TW+8*7], twtemph
        movdqa  xmm12, [TW+16*3]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*8], twtempl
        mov     [TW+8*9], twtemph
        movdqa  xmm13, [TW+16*4]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*10], twtempl
        mov     [TW+8*11], twtemph
        movdqa  xmm14, [TW+16*5]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*12], twtempl
        mov     [TW+8*13], twtemph
        movdqa  xmm15, [TW+16*6]

        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW+8*14], twtempl
        mov     [TW+8*15], twtemph
        ;movdqa xmm16, [TW+16*7]

        cmp     ptr_ciphertext, target_ptr_val
        je      _last_eight
_main_loop:
        ; load plaintext
        movdqu  xmm1, [ptr_plaintext+16*0]
        movdqu  xmm2, [ptr_plaintext+16*1]
        movdqu  xmm3, [ptr_plaintext+16*2]
        movdqu  xmm4, [ptr_plaintext+16*3]
        movdqu  xmm5, [ptr_plaintext+16*4]
        movdqu  xmm6, [ptr_plaintext+16*5]
        movdqu  xmm7, [ptr_plaintext+16*6]
        movdqu  xmm8, [ptr_plaintext+16*7]

        add     ptr_plaintext, 128

        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0

        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5
        movdqu  [ptr_ciphertext+16*5], xmm6
        movdqu  [ptr_ciphertext+16*6], xmm7
        movdqu  [ptr_ciphertext+16*7], xmm8
        add     ptr_ciphertext, 128

        cmp     ptr_ciphertext, target_ptr_val
        jne     _main_loop

_last_eight:
        ; load plaintext
        movdqu  xmm1, [ptr_plaintext+16*0]
        movdqu  xmm2, [ptr_plaintext+16*1]
        movdqu  xmm3, [ptr_plaintext+16*2]
        movdqu  xmm4, [ptr_plaintext+16*3]
        movdqu  xmm5, [ptr_plaintext+16*4]
        movdqu  xmm6, [ptr_plaintext+16*5]
        movdqu  xmm7, [ptr_plaintext+16*6]
        movdqu  xmm8, [ptr_plaintext+16*7]
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5
        movdqu  [ptr_ciphertext+16*5], xmm6
        movdqu  [ptr_ciphertext+16*6], xmm7

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
_steal_cipher:
        ; start cipher stealing
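        ; XTS ciphertext stealing for a tail of n = N mod 16 bytes (1..15):
        ; xmm8 holds the ciphertext of the last full block. Its first n bytes
        ; become the final partial ciphertext; its remaining 16-n bytes are
        ; appended to the n tail plaintext bytes to build one more full block,
        ; which is encrypted under the next tweak and stored as the last full
        ; ciphertext block. A rough C sketch (illustrative names):
        ;
        ;     memcpy(ct + len - n, prev_ct, n);        // partial block out
        ;     memcpy(block, pt + len - n, n);          // tail plaintext in
        ;     memcpy(block + n, prev_ct + n, 16 - n);  // steal from prev block
        ;     // block is then tweak-xored, AES-encrypted, tweak-xored again
        ;
        ; The pshufb/pblendvb sequence below performs these byte moves branchlessly.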

        ; generate next Tweak value
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW], twtempl
        mov     [TW + 8], twtemph

        movdqa  xmm2, xmm8

        ; shift xmm8 to the left by 16-N_val bytes
        lea     twtempl, [pshufb_shf_table]
        movdqu  xmm0, [twtempl+N_val]
        pshufb  xmm8, xmm0

        movdqu  xmm3, [ptr_plaintext + 112 + N_val]     ; state register is temporarily xmm3 to eliminate a move
        movdqu  [ptr_ciphertext + 112 + N_val], xmm8

        ; shift xmm3 to the right by 16-N_val bytes
        lea     twtempl, [pshufb_shf_table + 16]
        sub     twtempl, N_val
        movdqu  xmm0, [twtempl]
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0

        pblendvb xmm3, xmm2     ; xmm0 is implicit

        ; xor Tweak value
        movdqa  xmm8, [TW]
        pxor    xmm8, xmm3      ; state register is xmm8; instead of a move from xmm3 to xmm8, the destination register of the pxor instruction is swapped

        ; encrypt last block with cipher stealing
        pxor    xmm8, [keys]                    ; ARK
        aesenc  xmm8, [keys + 16*1]             ; round 1
        aesenc  xmm8, [keys + 16*2]             ; round 2
        aesenc  xmm8, [keys + 16*3]             ; round 3
        aesenc  xmm8, [keys + 16*4]             ; round 4
        aesenc  xmm8, [keys + 16*5]             ; round 5
        aesenc  xmm8, [keys + 16*6]             ; round 6
        aesenc  xmm8, [keys + 16*7]             ; round 7
        aesenc  xmm8, [keys + 16*8]             ; round 8
        aesenc  xmm8, [keys + 16*9]             ; round 9
        aesenclast xmm8, [keys + 16*10]         ; round 10

        ; xor Tweak value
        pxor    xmm8, [TW]

_done:
        ; store last ciphertext value
        movdqu  [ptr_ciphertext+16*7], xmm8

_ret_:

        mov     rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rdi, [_gpr + 8*1]
        mov     rsi, [_gpr + 8*2]

        movdqa  xmm6, [_xmm + 16*0]
        movdqa  xmm7, [_xmm + 16*1]
        movdqa  xmm8, [_xmm + 16*2]
        movdqa  xmm9, [_xmm + 16*3]
        movdqa  xmm10, [_xmm + 16*4]
        movdqa  xmm11, [_xmm + 16*5]
        movdqa  xmm12, [_xmm + 16*6]
        movdqa  xmm13, [_xmm + 16*7]
        movdqa  xmm14, [_xmm + 16*8]
        movdqa  xmm15, [_xmm + 16*9]
%endif

        add     rsp, VARIABLE_OFFSET

        ret


_less_than_128_bytes:
        cmp     N_val, 16
        jb      _ret_

        mov     tmp1, N_val
        and     tmp1, (7 << 4)
        cmp     tmp1, (6 << 4)
        je      _num_blocks_is_6
        cmp     tmp1, (5 << 4)
        je      _num_blocks_is_5
        cmp     tmp1, (4 << 4)
        je      _num_blocks_is_4
        cmp     tmp1, (3 << 4)
        je      _num_blocks_is_3
        cmp     tmp1, (2 << 4)
        je      _num_blocks_is_2
        cmp     tmp1, (1 << 4)
        je      _num_blocks_is_1

_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        sub     ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5
        movdqu  [ptr_ciphertext+16*5], xmm6

        sub     ptr_ciphertext, 16*1
        movdqa  xmm8, xmm7

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher
_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        sub     ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4
        movdqu  [ptr_ciphertext+16*4], xmm5

        sub     ptr_ciphertext, 16*2
        movdqa  xmm8, xmm6

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher
_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        sub     ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3
        movdqu  [ptr_ciphertext+16*3], xmm4

        sub     ptr_ciphertext, 16*3
        movdqa  xmm8, xmm5

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher
_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        sub     ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2
        movdqu  [ptr_ciphertext+16*2], xmm3

        sub     ptr_ciphertext, 16*4
        movdqa  xmm8, xmm4

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher
_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        sub     ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        ; store ciphertext
        movdqu  [ptr_ciphertext+16*0], xmm1
        movdqu  [ptr_ciphertext+16*1], xmm2

        sub     ptr_ciphertext, 16*5
        movdqa  xmm8, xmm3

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher

_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        sub     ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        ; store ciphertext
        movdqu  [ptr_ciphertext], xmm1

        sub     ptr_ciphertext, 16*6
        movdqa  xmm8, xmm2

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher

_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1

        sub     ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        ; store ciphertext

        sub     ptr_ciphertext, 16*7
        movdqa  xmm8, xmm1

        and     N_val, 15       ; N_val = N_val mod 16
        je      _done
        jmp     _steal_cipher

section .data
align 16

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89       ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a       ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b       ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c       ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d       ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e       ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f       ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100       ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201       ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302       ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403       ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504       ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605       ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706       ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807       ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
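
; Only the two rows above are stored; the commented table is obtained by
; indexing into them. A 16-byte read at [pshufb_shf_table + k] is the pshufb
; mask for a left shift by 16-k bytes; XORing that same read with mask1
; (0x80 in every byte) turns it into the right-shift-by-k mask, because
; pshufb writes zero for any mask byte whose top bit is set.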

mask1:
dq 0x8080808080808080, 0x8080808080808080
