1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 ; XTS encrypt function with 128-bit AES
30 ; input keys are not aligned
31 ; keys are expanded in parallel with the tweak encryption
32 ; plaintext and ciphertext are not aligned
33 ; the second key is stored on the stack, aligned to 16 bytes
34 ; the first key is required only once, so it is not stored
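;
; Informal sketch of the XTS-AES-128 flow implemented below:
;     T = AES-128-Enc(k2, TW_initial)                      ; see encrypt_T
;     for each full 16-byte block j:
;         C[j] = AES-128-Enc(k1, P[j] XOR T) XOR T
;         T    = T * x  in GF(2^128) mod x^128 + x^7 + x^2 + x + 1
;     a trailing partial block (N mod 16 bytes) is handled with
;     ciphertext stealing (see _steal_cipher)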
35
36 %include "reg_sizes.asm"
37
38 %if (AS_FEATURE_LEVEL) >= 10
39
40 default rel
41 %define TW rsp ; store 8 tweak values
42 %define keys rsp + 16*8 ; space reserved for 15 expanded keys (11 used by AES-128)
43
44 %ifidn __OUTPUT_FORMAT__, win64
45 %define _xmm rsp + 16*23 ; store xmm6:xmm15
46 %endif
47
48 %ifidn __OUTPUT_FORMAT__, elf64
49 %define _gpr rsp + 16*23 ; store rbx
50 %define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
51 %else
52 %define _gpr rsp + 16*33 ; store rdi, rsi, rbx
53 %define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
54 %endif
55
56 %define GHASH_POLY 0x87
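; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1.
; Whenever the tweak is multiplied by x and a bit falls out of bit 127, the
; carry is folded back in by XORing 0x87 into the low byte (scalar paths) or
; via vpclmulqdq against zpoly (vector paths).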
57
58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
59 ;void XTS_AES_128_enc_vaes(
60 ; UINT8 *k2, // key used for tweaking, 16 bytes
61 ; UINT8 *k1, // key used for "ECB" encryption, 16 bytes
62 ; UINT8 *TW_initial, // initial tweak value, 16 bytes
63 ; UINT64 N, // sector size, in bytes
64 ; const UINT8 *pt, // plaintext sector input data
65 ; UINT8 *ct); // ciphertext sector output data
66 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
67
68 ; arguments for input parameters
69 %ifidn __OUTPUT_FORMAT__, elf64
70 %xdefine ptr_key2 rdi
71 %xdefine ptr_key1 rsi
72 %xdefine T_val rdx
73 %xdefine N_val rcx
74 %xdefine ptr_plaintext r8
75 %xdefine ptr_ciphertext r9
76 %else
77 %xdefine ptr_key2 rcx
78 %xdefine ptr_key1 rdx
79 %xdefine T_val r8
80 %xdefine N_val r9
81 %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
82 %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
83 %endif
84
85 ; arguments for temp parameters
86 %ifidn __OUTPUT_FORMAT__, elf64
87 %define tmp1 rdi
88 %define ghash_poly_8b r10
89 %define ghash_poly_8b_temp r11
90 %else
91 %define tmp1 rcx
92 %define ghash_poly_8b rdi
93 %define ghash_poly_8b_temp rsi
94 %endif
95
96 %define twtempl rax ; global temp registers used for tweak computation
97 %define twtemph rbx
98 %define zpoly zmm25
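; twtempl:twtemph hold the current 128-bit tweak as a low/high qword pair.
; The recurring scalar sequence
;     shl   twtempl, 1                          ; low qword * 2, CF = old bit 63
;     adc   twtemph, twtemph                    ; high qword * 2 + carry, CF = old bit 127
;     cmovc ghash_poly_8b_temp, ghash_poly_8b
;     xor   twtempl, ghash_poly_8b_temp
; multiplies the tweak by x in GF(2^128), reducing with 0x87 when bit 127
; was set.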
99
100
101 ; produce the key for the next round
102 ; raw_key is the output of the vaeskeygenassist instruction
103 ; on entry, round_key holds the current round key
104 ; on exit, round_key holds the next round key
105 %macro key_expansion_128 3
106 %define %%xraw_key %1
107 %define %%xtmp %2
108 %define %%xround_key %3
109 vpshufd %%xraw_key, %%xraw_key, 11111111b
110 vshufps %%xtmp, %%xround_key, 00010000b
111 vpxor %%xround_key, %%xtmp
112 vshufps %%xtmp, %%xround_key, 10001100b
113 vpxor %%xround_key, %%xtmp
114 vpxor %%xround_key, %%xraw_key
115 %endmacro
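; Note on the shuffle/XOR sequence above: with the previous round key words
; (w0,w1,w2,w3) and xtmp starting from an all-zero register, the two
; vshufps/vpxor pairs leave round_key holding the prefix XORs
; (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3); XORing in the broadcast
; SubWord/RotWord/rcon value from vaeskeygenassist then yields the next
; AES-128 round key (this is why xmm4 is zeroed before encrypt_T).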
116
117
118
119 ; macro to encrypt the tweak value in parallel with the expansion of both keys
120
121 %macro encrypt_T 8
122 %define %%xkey2 %1
123 %define %%xstate_tweak %2
124 %define %%xkey1 %3
125 %define %%xraw_key %4
126 %define %%xtmp %5
127 %define %%ptr_key2 %6
128 %define %%ptr_key1 %7
129 %define %%ptr_expanded_keys %8
130
131
132 vmovdqu %%xkey2, [%%ptr_key2]
133 vmovdqu %%xkey1, [%%ptr_key1]
134 vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
135
136 vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
137
138 vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
139 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
140 vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
141 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
142 vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
143 vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1
144
145 vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
146 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
147 vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
148 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
149 vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
150 vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
151
152 vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
153 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
154 vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
155 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
156 vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
157 vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1
158
159 vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
160 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
161 vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
162 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
163 vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
164 vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1
165
166 vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
167 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
168 vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
169 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
170 vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
171 vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1
172
173 vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
174 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
175 vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
176 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
177 vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
178 vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1
179
180 vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
181 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
182 vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
183 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
184 vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
185 vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1
186
187 vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
188 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
189 vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
190 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
191 vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
192 vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1
193
194 vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
195 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
196 vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
197 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
198 vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
199 vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1
200
201 vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
202 key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
203 vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
204 key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
205 vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
206 vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1
207
208 vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
209 %endmacro
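; On exit from encrypt_T: [TW] holds the tweak for the first data block
; (TW_initial encrypted under key2), and [keys + 16*0 .. 16*10] hold the
; 11 AES-128 round keys expanded from key1. key2's schedule is consumed
; on the fly and never stored.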
210
211
212 ; generate initial tweak values
213 ; load initial plaintext values
214 %macro initialize 16
215
216 %define %%ST1 %1 ; state 1
217 %define %%ST2 %2 ; state 2
218 %define %%ST3 %3 ; state 3
219 %define %%ST4 %4 ; state 4
220 %define %%ST5 %5 ; state 5
221 %define %%ST6 %6 ; state 6
222 %define %%ST7 %7 ; state 7
223 %define %%ST8 %8 ; state 8
224
225 %define %%TW1 %9 ; tweak 1
226 %define %%TW2 %10 ; tweak 2
227 %define %%TW3 %11 ; tweak 3
228 %define %%TW4 %12 ; tweak 4
229 %define %%TW5 %13 ; tweak 5
230 %define %%TW6 %14 ; tweak 6
231 %define %%TW7 %15 ; tweak 7
232
233 %define %%num_initial_blocks %16
234
235
236 ; generate next Tweak values
237 vmovdqa %%TW1, [TW+16*0]
238 mov twtempl, [TW+8*0]
239 mov twtemph, [TW+8*1]
240 vmovdqu %%ST1, [ptr_plaintext+16*0]
241 %if (%%num_initial_blocks>=2)
242 xor ghash_poly_8b_temp, ghash_poly_8b_temp
243 shl twtempl, 1
244 adc twtemph, twtemph
245 cmovc ghash_poly_8b_temp, ghash_poly_8b
246 xor twtempl, ghash_poly_8b_temp
247 mov [TW+8*2], twtempl
248 mov [TW+8*3], twtemph;
249 vmovdqa %%TW2, [TW+16*1]
250 vmovdqu %%ST2, [ptr_plaintext+16*1]
251 %endif
252 %if (%%num_initial_blocks>=3)
253 xor ghash_poly_8b_temp, ghash_poly_8b_temp
254 shl twtempl, 1
255 adc twtemph, twtemph
256 cmovc ghash_poly_8b_temp, ghash_poly_8b
257 xor twtempl, ghash_poly_8b_temp
258 mov [TW+8*4], twtempl
259 mov [TW+8*5], twtemph;
260 vmovdqa %%TW3, [TW+16*2]
261 vmovdqu %%ST3, [ptr_plaintext+16*2]
262 %endif
263 %if (%%num_initial_blocks>=4)
264 xor ghash_poly_8b_temp, ghash_poly_8b_temp
265 shl twtempl, 1
266 adc twtemph, twtemph
267 cmovc ghash_poly_8b_temp, ghash_poly_8b
268 xor twtempl, ghash_poly_8b_temp
269 mov [TW+8*6], twtempl
270 mov [TW+8*7], twtemph;
271 vmovdqa %%TW4, [TW+16*3]
272 vmovdqu %%ST4, [ptr_plaintext+16*3]
273 %endif
274 %if (%%num_initial_blocks>=5)
275 xor ghash_poly_8b_temp, ghash_poly_8b_temp
276 shl twtempl, 1
277 adc twtemph, twtemph
278 cmovc ghash_poly_8b_temp, ghash_poly_8b
279 xor twtempl, ghash_poly_8b_temp
280 mov [TW+8*8], twtempl
281 mov [TW+8*9], twtemph;
282 vmovdqa %%TW5, [TW+16*4]
283 vmovdqu %%ST5, [ptr_plaintext+16*4]
284 %endif
285 %if (%%num_initial_blocks>=6)
286 xor ghash_poly_8b_temp, ghash_poly_8b_temp
287 shl twtempl, 1
288 adc twtemph, twtemph
289 cmovc ghash_poly_8b_temp, ghash_poly_8b
290 xor twtempl, ghash_poly_8b_temp
291 mov [TW+8*10], twtempl
292 mov [TW+8*11], twtemph;
293 vmovdqa %%TW6, [TW+16*5]
294 vmovdqu %%ST6, [ptr_plaintext+16*5]
295 %endif
296 %if (%%num_initial_blocks>=7)
297 xor ghash_poly_8b_temp, ghash_poly_8b_temp
298 shl twtempl, 1
299 adc twtemph, twtemph
300 cmovc ghash_poly_8b_temp, ghash_poly_8b
301 xor twtempl, ghash_poly_8b_temp
302 mov [TW+8*12], twtempl
303 mov [TW+8*13], twtemph;
304 vmovdqa %%TW7, [TW+16*6]
305 vmovdqu %%ST7, [ptr_plaintext+16*6]
306 %endif
307
308 %endmacro
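; initialize leaves tweak values 1..num_initial_blocks in %%TW1..%%TW7 (and
; in [TW + 16*0..]) and the matching plaintext blocks in %%ST1..%%ST7;
; twtempl:twtemph end up holding the last tweak generated, ready for the
; interleaved tweak updates in encrypt_initial.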
309
310
311 ; encrypt initial blocks of AES
312 ; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
313 ; next 8 Tweak values are generated
314 %macro encrypt_initial 18
315 %define %%ST1 %1 ; state 1
316 %define %%ST2 %2 ; state 2
317 %define %%ST3 %3 ; state 3
318 %define %%ST4 %4 ; state 4
319 %define %%ST5 %5 ; state 5
320 %define %%ST6 %6 ; state 6
321 %define %%ST7 %7 ; state 7
322 %define %%ST8 %8 ; state 8
323
324 %define %%TW1 %9 ; tweak 1
325 %define %%TW2 %10 ; tweak 2
326 %define %%TW3 %11 ; tweak 3
327 %define %%TW4 %12 ; tweak 4
328 %define %%TW5 %13 ; tweak 5
329 %define %%TW6 %14 ; tweak 6
330 %define %%TW7 %15 ; tweak 7
331 %define %%T0 %16 ; Temp register
332 %define %%num_blocks %17
333 ; %%num_blocks blocks encrypted
334 ; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
335
336 %define %%lt128 %18 ; less than 128 bytes
337
338 ; xor Tweak value
339 vpxor %%ST1, %%TW1
340 %if (%%num_blocks>=2)
341 vpxor %%ST2, %%TW2
342 %endif
343 %if (%%num_blocks>=3)
344 vpxor %%ST3, %%TW3
345 %endif
346 %if (%%num_blocks>=4)
347 vpxor %%ST4, %%TW4
348 %endif
349 %if (%%num_blocks>=5)
350 vpxor %%ST5, %%TW5
351 %endif
352 %if (%%num_blocks>=6)
353 vpxor %%ST6, %%TW6
354 %endif
355 %if (%%num_blocks>=7)
356 vpxor %%ST7, %%TW7
357 %endif
358
359
360 ; ARK
361 vmovdqa %%T0, [keys]
362 vpxor %%ST1, %%T0
363 %if (%%num_blocks>=2)
364 vpxor %%ST2, %%T0
365 %endif
366 %if (%%num_blocks>=3)
367 vpxor %%ST3, %%T0
368 %endif
369 %if (%%num_blocks>=4)
370 vpxor %%ST4, %%T0
371 %endif
372 %if (%%num_blocks>=5)
373 vpxor %%ST5, %%T0
374 %endif
375 %if (%%num_blocks>=6)
376 vpxor %%ST6, %%T0
377 %endif
378 %if (%%num_blocks>=7)
379 vpxor %%ST7, %%T0
380 %endif
381
382
383 %if (0 == %%lt128)
384 xor ghash_poly_8b_temp, ghash_poly_8b_temp
385 shl twtempl, 1
386 adc twtemph, twtemph
387 %endif
388
389 ; round 1
390 vmovdqa %%T0, [keys + 16*1]
391 vaesenc %%ST1, %%T0
392 %if (%%num_blocks>=2)
393 vaesenc %%ST2, %%T0
394 %endif
395 %if (%%num_blocks>=3)
396 vaesenc %%ST3, %%T0
397 %endif
398 %if (%%num_blocks>=4)
399 vaesenc %%ST4, %%T0
400 %endif
401 %if (%%num_blocks>=5)
402 vaesenc %%ST5, %%T0
403 %endif
404 %if (%%num_blocks>=6)
405 vaesenc %%ST6, %%T0
406 %endif
407 %if (%%num_blocks>=7)
408 vaesenc %%ST7, %%T0
409 %endif
410 %if (0 == %%lt128)
411 cmovc ghash_poly_8b_temp, ghash_poly_8b
412 xor twtempl, ghash_poly_8b_temp
413 mov [TW + 8*0], twtempl ; next Tweak1 generated
414 mov [TW + 8*1], twtemph
415 xor ghash_poly_8b_temp, ghash_poly_8b_temp
416 %endif
417
418 ; round 2
419 vmovdqa %%T0, [keys + 16*2]
420 vaesenc %%ST1, %%T0
421 %if (%%num_blocks>=2)
422 vaesenc %%ST2, %%T0
423 %endif
424 %if (%%num_blocks>=3)
425 vaesenc %%ST3, %%T0
426 %endif
427 %if (%%num_blocks>=4)
428 vaesenc %%ST4, %%T0
429 %endif
430 %if (%%num_blocks>=5)
431 vaesenc %%ST5, %%T0
432 %endif
433 %if (%%num_blocks>=6)
434 vaesenc %%ST6, %%T0
435 %endif
436 %if (%%num_blocks>=7)
437 vaesenc %%ST7, %%T0
438 %endif
439
440 %if (0 == %%lt128)
441 shl twtempl, 1
442 adc twtemph, twtemph
443 cmovc ghash_poly_8b_temp, ghash_poly_8b
444 xor twtempl, ghash_poly_8b_temp
445 mov [TW + 8*2], twtempl ; next Tweak2 generated
446 %endif
447
448 ; round 3
449 vmovdqa %%T0, [keys + 16*3]
450 vaesenc %%ST1, %%T0
451 %if (%%num_blocks>=2)
452 vaesenc %%ST2, %%T0
453 %endif
454 %if (%%num_blocks>=3)
455 vaesenc %%ST3, %%T0
456 %endif
457 %if (%%num_blocks>=4)
458 vaesenc %%ST4, %%T0
459 %endif
460 %if (%%num_blocks>=5)
461 vaesenc %%ST5, %%T0
462 %endif
463 %if (%%num_blocks>=6)
464 vaesenc %%ST6, %%T0
465 %endif
466 %if (%%num_blocks>=7)
467 vaesenc %%ST7, %%T0
468 %endif
469 %if (0 == %%lt128)
470 mov [TW + 8*3], twtemph
471 xor ghash_poly_8b_temp, ghash_poly_8b_temp
472 shl twtempl, 1
473 adc twtemph, twtemph
474 cmovc ghash_poly_8b_temp, ghash_poly_8b
475 %endif
476
477 ; round 4
478 vmovdqa %%T0, [keys + 16*4]
479 vaesenc %%ST1, %%T0
480 %if (%%num_blocks>=2)
481 vaesenc %%ST2, %%T0
482 %endif
483 %if (%%num_blocks>=3)
484 vaesenc %%ST3, %%T0
485 %endif
486 %if (%%num_blocks>=4)
487 vaesenc %%ST4, %%T0
488 %endif
489 %if (%%num_blocks>=5)
490 vaesenc %%ST5, %%T0
491 %endif
492 %if (%%num_blocks>=6)
493 vaesenc %%ST6, %%T0
494 %endif
495 %if (%%num_blocks>=7)
496 vaesenc %%ST7, %%T0
497 %endif
498
499 %if (0 == %%lt128)
500 xor twtempl, ghash_poly_8b_temp
501 mov [TW + 8*4], twtempl ; next Tweak3 generated
502 mov [TW + 8*5], twtemph
503 xor ghash_poly_8b_temp, ghash_poly_8b_temp
504 shl twtempl, 1
505 %endif
506
507 ; round 5
508 vmovdqa %%T0, [keys + 16*5]
509 vaesenc %%ST1, %%T0
510 %if (%%num_blocks>=2)
511 vaesenc %%ST2, %%T0
512 %endif
513 %if (%%num_blocks>=3)
514 vaesenc %%ST3, %%T0
515 %endif
516 %if (%%num_blocks>=4)
517 vaesenc %%ST4, %%T0
518 %endif
519 %if (%%num_blocks>=5)
520 vaesenc %%ST5, %%T0
521 %endif
522 %if (%%num_blocks>=6)
523 vaesenc %%ST6, %%T0
524 %endif
525 %if (%%num_blocks>=7)
526 vaesenc %%ST7, %%T0
527 %endif
528
529 %if (0 == %%lt128)
530 adc twtemph, twtemph
531 cmovc ghash_poly_8b_temp, ghash_poly_8b
532 xor twtempl, ghash_poly_8b_temp
533 mov [TW + 8*6], twtempl ; next Tweak4 generated
534 mov [TW + 8*7], twtemph
535 %endif
536
537 ; round 6
538 vmovdqa %%T0, [keys + 16*6]
539 vaesenc %%ST1, %%T0
540 %if (%%num_blocks>=2)
541 vaesenc %%ST2, %%T0
542 %endif
543 %if (%%num_blocks>=3)
544 vaesenc %%ST3, %%T0
545 %endif
546 %if (%%num_blocks>=4)
547 vaesenc %%ST4, %%T0
548 %endif
549 %if (%%num_blocks>=5)
550 vaesenc %%ST5, %%T0
551 %endif
552 %if (%%num_blocks>=6)
553 vaesenc %%ST6, %%T0
554 %endif
555 %if (%%num_blocks>=7)
556 vaesenc %%ST7, %%T0
557 %endif
558
559 %if (0 == %%lt128)
560 xor ghash_poly_8b_temp, ghash_poly_8b_temp
561 shl twtempl, 1
562 adc twtemph, twtemph
563 cmovc ghash_poly_8b_temp, ghash_poly_8b
564 xor twtempl, ghash_poly_8b_temp
565 mov [TW + 8*8], twtempl ; next Tweak5 generated
566 mov [TW + 8*9], twtemph
567 %endif
568
569 ; round 7
570 vmovdqa %%T0, [keys + 16*7]
571 vaesenc %%ST1, %%T0
572 %if (%%num_blocks>=2)
573 vaesenc %%ST2, %%T0
574 %endif
575 %if (%%num_blocks>=3)
576 vaesenc %%ST3, %%T0
577 %endif
578 %if (%%num_blocks>=4)
579 vaesenc %%ST4, %%T0
580 %endif
581 %if (%%num_blocks>=5)
582 vaesenc %%ST5, %%T0
583 %endif
584 %if (%%num_blocks>=6)
585 vaesenc %%ST6, %%T0
586 %endif
587 %if (%%num_blocks>=7)
588 vaesenc %%ST7, %%T0
589 %endif
590
591 %if (0 == %%lt128)
592 xor ghash_poly_8b_temp, ghash_poly_8b_temp
593 shl twtempl, 1
594 adc twtemph, twtemph
595 cmovc ghash_poly_8b_temp, ghash_poly_8b
596 xor twtempl, ghash_poly_8b_temp
597 mov [TW + 8*10], twtempl ; next Tweak6 generated
598 mov [TW + 8*11], twtemph
599 %endif
600 ; round 8
601 vmovdqa %%T0, [keys + 16*8]
602 vaesenc %%ST1, %%T0
603 %if (%%num_blocks>=2)
604 vaesenc %%ST2, %%T0
605 %endif
606 %if (%%num_blocks>=3)
607 vaesenc %%ST3, %%T0
608 %endif
609 %if (%%num_blocks>=4)
610 vaesenc %%ST4, %%T0
611 %endif
612 %if (%%num_blocks>=5)
613 vaesenc %%ST5, %%T0
614 %endif
615 %if (%%num_blocks>=6)
616 vaesenc %%ST6, %%T0
617 %endif
618 %if (%%num_blocks>=7)
619 vaesenc %%ST7, %%T0
620 %endif
621
622 %if (0 == %%lt128)
623 xor ghash_poly_8b_temp, ghash_poly_8b_temp
624 shl twtempl, 1
625 adc twtemph, twtemph
626 cmovc ghash_poly_8b_temp, ghash_poly_8b
627 xor twtempl, ghash_poly_8b_temp
628 mov [TW + 8*12], twtempl ; next Tweak7 generated
629 mov [TW + 8*13], twtemph
630 %endif
631 ; round 9
632 vmovdqa %%T0, [keys + 16*9]
633 vaesenc %%ST1, %%T0
634 %if (%%num_blocks>=2)
635 vaesenc %%ST2, %%T0
636 %endif
637 %if (%%num_blocks>=3)
638 vaesenc %%ST3, %%T0
639 %endif
640 %if (%%num_blocks>=4)
641 vaesenc %%ST4, %%T0
642 %endif
643 %if (%%num_blocks>=5)
644 vaesenc %%ST5, %%T0
645 %endif
646 %if (%%num_blocks>=6)
647 vaesenc %%ST6, %%T0
648 %endif
649 %if (%%num_blocks>=7)
650 vaesenc %%ST7, %%T0
651 %endif
652
653 %if (0 == %%lt128)
654 xor ghash_poly_8b_temp, ghash_poly_8b_temp
655 shl twtempl, 1
656 adc twtemph, twtemph
657 cmovc ghash_poly_8b_temp, ghash_poly_8b
658 xor twtempl, ghash_poly_8b_temp
659 mov [TW + 8*14], twtempl ; next Tweak8 generated
660 mov [TW + 8*15], twtemph
661 %endif
662
663
664 ; round 10
665 vmovdqa %%T0, [keys + 16*10]
666 vaesenclast %%ST1, %%T0
667 %if (%%num_blocks>=2)
668 vaesenclast %%ST2, %%T0
669 %endif
670 %if (%%num_blocks>=3)
671 vaesenclast %%ST3, %%T0
672 %endif
673 %if (%%num_blocks>=4)
674 vaesenclast %%ST4, %%T0
675 %endif
676 %if (%%num_blocks>=5)
677 vaesenclast %%ST5, %%T0
678 %endif
679 %if (%%num_blocks>=6)
680 vaesenclast %%ST6, %%T0
681 %endif
682 %if (%%num_blocks>=7)
683 vaesenclast %%ST7, %%T0
684 %endif
685
686
687 ; xor Tweak values
688 vpxor %%ST1, %%TW1
689 %if (%%num_blocks>=2)
690 vpxor %%ST2, %%TW2
691 %endif
692 %if (%%num_blocks>=3)
693 vpxor %%ST3, %%TW3
694 %endif
695 %if (%%num_blocks>=4)
696 vpxor %%ST4, %%TW4
697 %endif
698 %if (%%num_blocks>=5)
699 vpxor %%ST5, %%TW5
700 %endif
701 %if (%%num_blocks>=6)
702 vpxor %%ST6, %%TW6
703 %endif
704 %if (%%num_blocks>=7)
705 vpxor %%ST7, %%TW7
706 %endif
707
708
709 %if (0 == %%lt128)
710 ; load next Tweak values
711 vmovdqa %%TW1, [TW + 16*0]
712 vmovdqa %%TW2, [TW + 16*1]
713 vmovdqa %%TW3, [TW + 16*2]
714 vmovdqa %%TW4, [TW + 16*3]
715 vmovdqa %%TW5, [TW + 16*4]
716 vmovdqa %%TW6, [TW + 16*5]
717 vmovdqa %%TW7, [TW + 16*6]
718
719 %endif
720
721 %endmacro
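; encrypt_initial interleaves the scalar generation of the next 8 tweak
; values (only when %%lt128 == 0) with the AES rounds, so the tweak updates
; hide behind the vaesenc latency; for tail processing (%%lt128 != 0) the
; tweak chain is not advanced.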
722
723
724
725
726 ; Encrypt 8 blocks in parallel
727 ; generate next 8 tweak values
728 %macro encrypt_by_eight_zmm 6
729 %define %%ST1 %1 ; state 1
730 %define %%ST2 %2 ; state 2
731 %define %%TW1 %3 ; tweak 1
732 %define %%TW2 %4 ; tweak 2
733 %define %%T0 %5 ; Temp register
734 %define %%last_eight %6
735
736 ; xor Tweak values
737 vpxorq %%ST1, %%TW1
738 vpxorq %%ST2, %%TW2
739
740 ; ARK
741 vbroadcasti32x4 %%T0, [keys]
742 vpxorq %%ST1, %%T0
743 vpxorq %%ST2, %%T0
744
745 %if (0 == %%last_eight)
746 vpsrldq zmm13, %%TW1, 15
747 vpclmulqdq zmm14, zmm13, zpoly, 0
748 vpslldq zmm15, %%TW1, 1
749 vpxord zmm15, zmm15, zmm14
750 %endif
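	; when %%last_eight == 0, the block above computes zmm15 = %%TW1 * x^8
	; per 128-bit lane: vpslldq shifts each tweak left by one byte, vpsrldq
	; isolates the top byte that the shift pushes out of bit 127, and
	; vpclmulqdq multiplies that byte by 0x87 (zpoly) to form the reduction
	; term XORed back in.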
751 ; round 1
752 vbroadcasti32x4 %%T0, [keys + 16*1]
753 vaesenc %%ST1, %%T0
754 vaesenc %%ST2, %%T0
755
756 ; round 2
757 vbroadcasti32x4 %%T0, [keys + 16*2]
758 vaesenc %%ST1, %%T0
759 vaesenc %%ST2, %%T0
760
761 ; round 3
762 vbroadcasti32x4 %%T0, [keys + 16*3]
763 vaesenc %%ST1, %%T0
764 vaesenc %%ST2, %%T0
765 %if (0 == %%last_eight)
766 vpsrldq zmm13, %%TW2, 15
767 vpclmulqdq zmm14, zmm13, zpoly, 0
768 vpslldq zmm16, %%TW2, 1
769 vpxord zmm16, zmm16, zmm14
770 %endif
771 ; round 4
772 vbroadcasti32x4 %%T0, [keys + 16*4]
773 vaesenc %%ST1, %%T0
774 vaesenc %%ST2, %%T0
775
776 ; round 5
777 vbroadcasti32x4 %%T0, [keys + 16*5]
778 vaesenc %%ST1, %%T0
779 vaesenc %%ST2, %%T0
780
781 ; round 6
782 vbroadcasti32x4 %%T0, [keys + 16*6]
783 vaesenc %%ST1, %%T0
784 vaesenc %%ST2, %%T0
785
786 ; round 7
787 vbroadcasti32x4 %%T0, [keys + 16*7]
788 vaesenc %%ST1, %%T0
789 vaesenc %%ST2, %%T0
790
791 ; round 8
792 vbroadcasti32x4 %%T0, [keys + 16*8]
793 vaesenc %%ST1, %%T0
794 vaesenc %%ST2, %%T0
795
796 ; round 9
797 vbroadcasti32x4 %%T0, [keys + 16*9]
798 vaesenc %%ST1, %%T0
799 vaesenc %%ST2, %%T0
800
801 ; round 10
802 vbroadcasti32x4 %%T0, [keys + 16*10]
803 vaesenclast %%ST1, %%T0
804 vaesenclast %%ST2, %%T0
805
806 ; xor Tweak values
807 vpxorq %%ST1, %%TW1
808 vpxorq %%ST2, %%TW2
809
810 ; load next Tweak values
811 vmovdqa32 %%TW1, zmm15
812 vmovdqa32 %%TW2, zmm16
813 %endmacro
814
815
816 ; Encrypt 16 blocks in parallel
817 ; generate next 16 tweak values
818 %macro encrypt_by_16_zmm 10
819 %define %%ST1 %1 ; state 1
820 %define %%ST2 %2 ; state 2
821 %define %%ST3 %3 ; state 3
822 %define %%ST4 %4 ; state 4
823
824 %define %%TW1 %5 ; tweak 1
825 %define %%TW2 %6 ; tweak 2
826 %define %%TW3 %7 ; tweak 3
827 %define %%TW4 %8 ; tweak 4
828
829 %define %%T0 %9 ; Temp register
830 %define %%last_eight %10
831
832 ; xor Tweak values
833 vpxorq %%ST1, %%TW1
834 vpxorq %%ST2, %%TW2
835 vpxorq %%ST3, %%TW3
836 vpxorq %%ST4, %%TW4
837
838 ; ARK
839 vbroadcasti32x4 %%T0, [keys]
840 vpxorq %%ST1, %%T0
841 vpxorq %%ST2, %%T0
842 vpxorq %%ST3, %%T0
843 vpxorq %%ST4, %%T0
844
845 %if (0 == %%last_eight)
846 vpsrldq zmm13, %%TW3, 15
847 vpclmulqdq zmm14, zmm13, zpoly, 0
848 vpslldq zmm15, %%TW3, 1
849 vpxord zmm15, zmm15, zmm14
850 %endif
851 ; round 1
852 vbroadcasti32x4 %%T0, [keys + 16*1]
853 vaesenc %%ST1, %%T0
854 vaesenc %%ST2, %%T0
855 vaesenc %%ST3, %%T0
856 vaesenc %%ST4, %%T0
857
858 ; round 2
859 vbroadcasti32x4 %%T0, [keys + 16*2]
860 vaesenc %%ST1, %%T0
861 vaesenc %%ST2, %%T0
862 vaesenc %%ST3, %%T0
863 vaesenc %%ST4, %%T0
864
865 ; round 3
866 vbroadcasti32x4 %%T0, [keys + 16*3]
867 vaesenc %%ST1, %%T0
868 vaesenc %%ST2, %%T0
869 vaesenc %%ST3, %%T0
870 vaesenc %%ST4, %%T0
871 %if (0 == %%last_eight)
872 vpsrldq zmm13, %%TW4, 15
873 vpclmulqdq zmm14, zmm13, zpoly, 0
874 vpslldq zmm16, %%TW4, 1
875 vpxord zmm16, zmm16, zmm14
876 %endif
877 ; round 4
878 vbroadcasti32x4 %%T0, [keys + 16*4]
879 vaesenc %%ST1, %%T0
880 vaesenc %%ST2, %%T0
881 vaesenc %%ST3, %%T0
882 vaesenc %%ST4, %%T0
883
884 ; round 5
885 vbroadcasti32x4 %%T0, [keys + 16*5]
886 vaesenc %%ST1, %%T0
887 vaesenc %%ST2, %%T0
888 vaesenc %%ST3, %%T0
889 vaesenc %%ST4, %%T0
890
891 ; round 6
892 vbroadcasti32x4 %%T0, [keys + 16*6]
893 vaesenc %%ST1, %%T0
894 vaesenc %%ST2, %%T0
895 vaesenc %%ST3, %%T0
896 vaesenc %%ST4, %%T0
897 %if (0 == %%last_eight)
898 vpsrldq zmm13, zmm15, 15
899 vpclmulqdq zmm14, zmm13, zpoly, 0
900 vpslldq zmm17, zmm15, 1
901 vpxord zmm17, zmm17, zmm14
902 %endif
903 ; round 7
904 vbroadcasti32x4 %%T0, [keys + 16*7]
905 vaesenc %%ST1, %%T0
906 vaesenc %%ST2, %%T0
907 vaesenc %%ST3, %%T0
908 vaesenc %%ST4, %%T0
909
910 ; round 8
911 vbroadcasti32x4 %%T0, [keys + 16*8]
912 vaesenc %%ST1, %%T0
913 vaesenc %%ST2, %%T0
914 vaesenc %%ST3, %%T0
915 vaesenc %%ST4, %%T0
916
917 ; round 9
918 vbroadcasti32x4 %%T0, [keys + 16*9]
919 vaesenc %%ST1, %%T0
920 vaesenc %%ST2, %%T0
921 vaesenc %%ST3, %%T0
922 vaesenc %%ST4, %%T0
923 %if (0 == %%last_eight)
924 vpsrldq zmm13, zmm16, 15
925 vpclmulqdq zmm14, zmm13, zpoly, 0
926 vpslldq zmm18, zmm16, 1
927 vpxord zmm18, zmm18, zmm14
928 %endif
929 ; round 10
930 vbroadcasti32x4 %%T0, [keys + 16*10]
931 vaesenclast %%ST1, %%T0
932 vaesenclast %%ST2, %%T0
933 vaesenclast %%ST3, %%T0
934 vaesenclast %%ST4, %%T0
935
936 ; xor Tweak values
937 vpxorq %%ST1, %%TW1
938 vpxorq %%ST2, %%TW2
939 vpxorq %%ST3, %%TW3
940 vpxorq %%ST4, %%TW4
941
942 ; load next Tweak values
943 vmovdqa32 %%TW1, zmm15
944 vmovdqa32 %%TW2, zmm16
945 vmovdqa32 %%TW3, zmm17
946 vmovdqa32 %%TW4, zmm18
947 %endmacro
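; Tweak bookkeeping for the 16-block path: zmm15/zmm16 are the old TW3/TW4
; advanced by x^8, and zmm17/zmm18 advance those again by x^8, so after the
; copy-back %%TW1..%%TW4 hold the tweaks for the next 16 blocks.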
948
949
950 section .text
951
952 mk_global XTS_AES_128_enc_vaes, function
953 XTS_AES_128_enc_vaes:
954 endbranch
955
956 %define ALIGN_STACK
957 %ifdef ALIGN_STACK
958 push rbp
959 mov rbp, rsp
960 sub rsp, VARIABLE_OFFSET
961 and rsp, ~63
962 %else
963 sub rsp, VARIABLE_OFFSET
964 %endif
965
966 mov [_gpr + 8*0], rbx
967 %ifidn __OUTPUT_FORMAT__, win64
968 mov [_gpr + 8*1], rdi
969 mov [_gpr + 8*2], rsi
970
971 vmovdqa [_xmm + 16*0], xmm6
972 vmovdqa [_xmm + 16*1], xmm7
973 vmovdqa [_xmm + 16*2], xmm8
974 vmovdqa [_xmm + 16*3], xmm9
975 vmovdqa [_xmm + 16*4], xmm10
976 vmovdqa [_xmm + 16*5], xmm11
977 vmovdqa [_xmm + 16*6], xmm12
978 vmovdqa [_xmm + 16*7], xmm13
979 vmovdqa [_xmm + 16*8], xmm14
980 vmovdqa [_xmm + 16*9], xmm15
981 %endif
982
983 mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
984
985
986 vmovdqu xmm1, [T_val] ; read initial Tweak value
987 vpxor xmm4, xmm4 ; for key expansion
988 encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
989
990
991 %ifidn __OUTPUT_FORMAT__, win64
992 mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
993 mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
994 %endif
995
996 cmp N_val, 128
997 jl _less_than_128_bytes
998
999 vpbroadcastq zpoly, ghash_poly_8b
1000
1001 cmp N_val, 256
1002 jge _start_by16
1003
1004 cmp N_val, 128
1005 jge _start_by8
1006
1007 _do_n_blocks:
1008 cmp N_val, 0
1009 je _ret_
1010
1011 cmp N_val, (7*16)
1012 jge _remaining_num_blocks_is_7
1013
1014 cmp N_val, (6*16)
1015 jge _remaining_num_blocks_is_6
1016
1017 cmp N_val, (5*16)
1018 jge _remaining_num_blocks_is_5
1019
1020 cmp N_val, (4*16)
1021 jge _remaining_num_blocks_is_4
1022
1023 cmp N_val, (3*16)
1024 jge _remaining_num_blocks_is_3
1025
1026 cmp N_val, (2*16)
1027 jge _remaining_num_blocks_is_2
1028
1029 cmp N_val, (1*16)
1030 jge _remaining_num_blocks_is_1
1031
1032 ;; _remaining_num_blocks_is_0:
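	; fewer than 16 bytes remain: xmm0 still holds the last ciphertext block
	; from the main loop and xmm9 the next tweak, so move them into the
	; _steal_cipher convention (xmm8 = last ciphertext block, xmm0 = next tweak)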
1033 vmovdqa xmm8, xmm0
1034 vmovdqa xmm0, xmm9
1035 jmp _steal_cipher
1036
1037 _remaining_num_blocks_is_7:
1038 mov tmp1, -1
1039 shr tmp1, 16
1040 kmovq k1, tmp1
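	; tmp1 = 0x0000ffffffffffff, so k1 enables the low 48 bytes of a zmm,
	; i.e. the 3 remaining blocks loaded/stored through zmm2 {k1}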
1041 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1042 vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
1043 add ptr_plaintext, 16*7
1044 encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1045 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1046 vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
1047 add ptr_ciphertext, 16*7
1048
1049 vextracti32x4 xmm8, zmm2, 0x2
1050 vextracti32x4 xmm0, zmm10, 0x3
1051 and N_val, 15
1052 je _ret_
1053 jmp _steal_cipher
1054
1055 _remaining_num_blocks_is_6:
1056 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1057 vmovdqu8 ymm2, [ptr_plaintext+16*4]
1058 add ptr_plaintext, 16*6
1059 encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1060 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1061 vmovdqu8 [ptr_ciphertext+16*4], ymm2
1062 add ptr_ciphertext, 16*6
1063
1064 vextracti32x4 xmm8, zmm2, 0x1
1065 vextracti32x4 xmm0, zmm10, 0x2
1066 and N_val, 15
1067 je _ret_
1068 jmp _steal_cipher
1069
1070 _remaining_num_blocks_is_5:
1071 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1072 vmovdqu xmm2, [ptr_plaintext+16*4]
1073 add ptr_plaintext, 16*5
1074 encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1075 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1076 vmovdqu [ptr_ciphertext+16*4], xmm2
1077 add ptr_ciphertext, 16*5
1078
1079 vmovdqa xmm8, xmm2 ; VEX form, consistent with the surrounding AVX code
1080 vextracti32x4 xmm0, zmm10, 0x1
1081 and N_val, 15
1082 je _ret_
1083 jmp _steal_cipher
1084
1085 _remaining_num_blocks_is_4:
1086 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1087 add ptr_plaintext, 16*4
1088 encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
1089 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1090 add ptr_ciphertext, 16*4
1091
1092 vextracti32x4 xmm8, zmm1, 0x3
1093 vextracti32x4 xmm0, zmm10, 0x0
1094 and N_val, 15
1095 je _ret_
1096 jmp _steal_cipher
1097
1098 _remaining_num_blocks_is_3:
1099 vextracti32x4 xmm10, zmm9, 1
1100 vextracti32x4 xmm11, zmm9, 2
1101 vmovdqu xmm1, [ptr_plaintext+16*0]
1102 vmovdqu xmm2, [ptr_plaintext+16*1]
1103 vmovdqu xmm3, [ptr_plaintext+16*2]
1104 add ptr_plaintext, 16*3
1105 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1106 vmovdqu [ptr_ciphertext+16*0], xmm1
1107 vmovdqu [ptr_ciphertext+16*1], xmm2
1108 vmovdqu [ptr_ciphertext+16*2], xmm3
1109 add ptr_ciphertext, 16*3
1110
1111 vmovdqa xmm8, xmm3
1112 vextracti32x4 xmm0, zmm9, 3
1113 and N_val, 15
1114 je _ret_
1115 jmp _steal_cipher
1116
1117 _remaining_num_blocks_is_2:
1118 vextracti32x4 xmm10, zmm9, 1
1119 vmovdqu xmm1, [ptr_plaintext+16*0]
1120 vmovdqu xmm2, [ptr_plaintext+16*1]
1121 add ptr_plaintext, 16*2
1122 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1123 vmovdqu [ptr_ciphertext+16*0], xmm1
1124 vmovdqu [ptr_ciphertext+16*1], xmm2
1125 add ptr_ciphertext, 16*2
1126
1127 vmovdqa xmm8, xmm2
1128 vextracti32x4 xmm0, zmm9, 2
1129 and N_val, 15
1130 je _ret_
1131 jmp _steal_cipher
1132
1133 _remaining_num_blocks_is_1:
1134 vmovdqu xmm1, [ptr_plaintext]
1135 add ptr_plaintext, 16
1136 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1137 vmovdqu [ptr_ciphertext], xmm1
1138 add ptr_ciphertext, 16
1139
1140 vmovdqa xmm8, xmm1
1141 vextracti32x4 xmm0, zmm9, 1
1142 and N_val, 15
1143 je _ret_
1144 jmp _steal_cipher
1145
1146
1147 _start_by16:
1148 ; Make the first 8 tweak values (initial tweak x 2^0 .. 2^7)
1149 vbroadcasti32x4 zmm0, [TW]
1150 vbroadcasti32x4 zmm8, [shufb_15_7]
1151 mov tmp1, 0xaa
1152 kmovq k2, tmp1
1153
1154 ; Mult tweak by 2^{3, 2, 1, 0}
1155 vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
1156 vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
1157 vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
1158 vpclmulqdq zmm3, zmm2, zpoly, 0x00
1159 vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
1160 vpxord zmm9, zmm3, zmm4
1161
1162 ; Mult tweak by 2^{7, 6, 5, 4}
1163 vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
1164 vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
1165 vpclmulqdq zmm7, zmm6, zpoly, 0x00
1166 vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
1167 vpxord zmm10, zmm7, zmm5
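; The two blocks above compute T * x^i for i = 0..7 without a serial chain:
; vpsllvq shifts both qwords of each tweak copy left by i, vpsrlvq (applied
; to the shufb_15_7-permuted copy) recovers the i bits that crossed each
; qword boundary, the k2 = 0xaa merge places the low-to-high carry into the
; upper qword, and vpclmulqdq against zpoly folds the bits shifted out of
; bit 127 back into the low byte. Result: zmm9 = T*{x^0..x^3},
; zmm10 = T*{x^4..x^7}.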
1168
1169 ; Make next 8 tweak values by all x 2^8
1170 vpsrldq zmm13, zmm9, 15
1171 vpclmulqdq zmm14, zmm13, zpoly, 0
1172 vpslldq zmm11, zmm9, 1
1173 vpxord zmm11, zmm11, zmm14
1174
1175 vpsrldq zmm15, zmm10, 15
1176 vpclmulqdq zmm16, zmm15, zpoly, 0
1177 vpslldq zmm12, zmm10, 1
1178 vpxord zmm12, zmm12, zmm16
1179
1180 _main_loop_run_16:
1181 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1182 vmovdqu8 zmm2, [ptr_plaintext+16*4]
1183 vmovdqu8 zmm3, [ptr_plaintext+16*8]
1184 vmovdqu8 zmm4, [ptr_plaintext+16*12]
1185 add ptr_plaintext, 256
1186
1187 encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
1188
1189 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1190 vmovdqu8 [ptr_ciphertext+16*4], zmm2
1191 vmovdqu8 [ptr_ciphertext+16*8], zmm3
1192 vmovdqu8 [ptr_ciphertext+16*12], zmm4
1193 add ptr_ciphertext, 256
1194 sub N_val, 256
1195
1196 cmp N_val, 256
1197 jge _main_loop_run_16
1198
1199 cmp N_val, 128
1200 jge _main_loop_run_8
1201
1202 vextracti32x4 xmm0, zmm4, 0x3 ; keep the last encrypted block
1203 jmp _do_n_blocks
1204
1205 _start_by8:
1206 ; Make the first 8 tweak values (initial tweak x 2^0 .. 2^7)
1207 vbroadcasti32x4 zmm0, [TW]
1208 vbroadcasti32x4 zmm8, [shufb_15_7]
1209 mov tmp1, 0xaa
1210 kmovq k2, tmp1
1211
1212 ; Mult tweak by 2^{3, 2, 1, 0}
1213 vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
1214 vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
1215 vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
1216 vpclmulqdq zmm3, zmm2, zpoly, 0x00
1217 vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
1218 vpxord zmm9, zmm3, zmm4
1219
1220 ; Mult tweak by 2^{7, 6, 5, 4}
1221 vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
1222 vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
1223 vpclmulqdq zmm7, zmm6, zpoly, 0x00
1224 vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
1225 vpxord zmm10, zmm7, zmm5
1226
1227 _main_loop_run_8:
1228 ; load plaintext
1229 vmovdqu8 zmm1, [ptr_plaintext+16*0]
1230 vmovdqu8 zmm2, [ptr_plaintext+16*4]
1231 add ptr_plaintext, 128
1232
1233 encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
1234
1235 ; store ciphertext
1236 vmovdqu8 [ptr_ciphertext+16*0], zmm1
1237 vmovdqu8 [ptr_ciphertext+16*4], zmm2
1238 add ptr_ciphertext, 128
1239 sub N_val, 128
1240
1241 cmp N_val, 128
1242 jge _main_loop_run_8
1243
1244 vextracti32x4 xmm0, zmm2, 0x3 ; keep the last encrypted block
1245 jmp _do_n_blocks
1246
1247 _steal_cipher_next:
1248 ; generate next Tweak value
1249 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1250 shl twtempl, 1
1251 adc twtemph, twtemph
1252 cmovc ghash_poly_8b_temp, ghash_poly_8b
1253 xor twtempl, ghash_poly_8b_temp
1254 mov [TW], twtempl
1255 mov [TW + 8], twtemph
1256 vmovdqa xmm0, [TW]
1257
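; Ciphertext stealing for the final partial block (N_val = N mod 16, 1..15):
; xmm8 is the last full ciphertext block and xmm0 the next tweak. The first
; N_val bytes of xmm8 become the short final ciphertext; the stolen
; 16 - N_val bytes are appended to the N_val trailing plaintext bytes
; (merged via vpshufb_shf_table and vpblendvb) to build one more full block,
; which is encrypted with tweak xmm0 and stored as the last full ciphertext
; block.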
1258 _steal_cipher:
1259 ; ciphertext stealing: xmm8 = last full ciphertext block, xmm0 = next tweak
1260 vmovdqa xmm2, xmm8
1261
1262 ; shift xmm8 to the left by 16-N_val bytes
1263 lea twtempl, [vpshufb_shf_table]
1264 vmovdqu xmm10, [twtempl+N_val]
1265 vpshufb xmm8, xmm10
1266
1267 vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
1268 vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
1269
1270 ; shift xmm3 to the right by 16-N_val bytes
1271 lea twtempl, [vpshufb_shf_table +16]
1272 sub twtempl, N_val
1273 vmovdqu xmm10, [twtempl]
1274 vpxor xmm10, [mask1]
1275 vpshufb xmm3, xmm10
1276
1277 vpblendvb xmm3, xmm3, xmm2, xmm10
1278
1279 ; xor Tweak value
1280 vpxor xmm8, xmm3, xmm0
1281
1282 ;encrypt last block with cipher stealing
1283 vpxor xmm8, [keys] ; ARK
1284 vaesenc xmm8, [keys + 16*1] ; round 1
1285 vaesenc xmm8, [keys + 16*2] ; round 2
1286 vaesenc xmm8, [keys + 16*3] ; round 3
1287 vaesenc xmm8, [keys + 16*4] ; round 4
1288 vaesenc xmm8, [keys + 16*5] ; round 5
1289 vaesenc xmm8, [keys + 16*6] ; round 6
1290 vaesenc xmm8, [keys + 16*7] ; round 7
1291 vaesenc xmm8, [keys + 16*8] ; round 8
1292 vaesenc xmm8, [keys + 16*9] ; round 9
1293 vaesenclast xmm8, [keys + 16*10] ; round 10
1294
1295 ; xor Tweak value
1296 vpxor xmm8, xmm8, xmm0
1297
1298 ; store last ciphertext value
1299 vmovdqu [ptr_ciphertext - 16], xmm8
1300
1301 _ret_:
1302 mov rbx, [_gpr + 8*0]
1303
1304 %ifidn __OUTPUT_FORMAT__, win64
1305 mov rdi, [_gpr + 8*1]
1306 mov rsi, [_gpr + 8*2]
1307
1308 vmovdqa xmm6, [_xmm + 16*0]
1309 vmovdqa xmm7, [_xmm + 16*1]
1310 vmovdqa xmm8, [_xmm + 16*2]
1311 vmovdqa xmm9, [_xmm + 16*3]
1312 vmovdqa xmm10, [_xmm + 16*4]
1313 vmovdqa xmm11, [_xmm + 16*5]
1314 vmovdqa xmm12, [_xmm + 16*6]
1315 vmovdqa xmm13, [_xmm + 16*7]
1316 vmovdqa xmm14, [_xmm + 16*8]
1317 vmovdqa xmm15, [_xmm + 16*9]
1318 %endif
1319
1320 %ifndef ALIGN_STACK
1321 add rsp, VARIABLE_OFFSET
1322 %else
1323 mov rsp, rbp
1324 pop rbp
1325 %endif
1326 ret
1327
1328
1329 _less_than_128_bytes:
1330 cmp N_val, 16
1331 jb _ret_
1332
1333 mov tmp1, N_val
1334 and tmp1, (7*16)
1335 cmp tmp1, (6*16)
1336 je _num_blocks_is_6
1337 cmp tmp1, (5*16)
1338 je _num_blocks_is_5
1339 cmp tmp1, (4*16)
1340 je _num_blocks_is_4
1341 cmp tmp1, (3*16)
1342 je _num_blocks_is_3
1343 cmp tmp1, (2*16)
1344 je _num_blocks_is_2
1345 cmp tmp1, (1*16)
1346 je _num_blocks_is_1
1347
1348 _num_blocks_is_7:
1349 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
1350 add ptr_plaintext, 16*7
1351 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
1352 ; store ciphertext
1353 vmovdqu [ptr_ciphertext+16*0], xmm1
1354 vmovdqu [ptr_ciphertext+16*1], xmm2
1355 vmovdqu [ptr_ciphertext+16*2], xmm3
1356 vmovdqu [ptr_ciphertext+16*3], xmm4
1357 vmovdqu [ptr_ciphertext+16*4], xmm5
1358 vmovdqu [ptr_ciphertext+16*5], xmm6
1359 vmovdqu [ptr_ciphertext+16*6], xmm7
1360 add ptr_ciphertext, 16*7
1361 vmovdqa xmm8, xmm7
1362
1363 and N_val, 15 ; N_val = N_val mod 16
1364 je _ret_
1365 jmp _steal_cipher_next
1366 _num_blocks_is_6:
1367 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
1368 add ptr_plaintext, 16*6
1369 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
1370 ; store ciphertext
1371 vmovdqu [ptr_ciphertext+16*0], xmm1
1372 vmovdqu [ptr_ciphertext+16*1], xmm2
1373 vmovdqu [ptr_ciphertext+16*2], xmm3
1374 vmovdqu [ptr_ciphertext+16*3], xmm4
1375 vmovdqu [ptr_ciphertext+16*4], xmm5
1376 vmovdqu [ptr_ciphertext+16*5], xmm6
1377
1378 add ptr_ciphertext, 16*6
1379 vmovdqa xmm8, xmm6
1380
1381 and N_val, 15 ; N_val = N_val mod 16
1382 je _ret_
1383 jmp _steal_cipher_next
1384 _num_blocks_is_5:
1385 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
1386 add ptr_plaintext, 16*5
1387 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
1388 ; store ciphertext
1389 vmovdqu [ptr_ciphertext+16*0], xmm1
1390 vmovdqu [ptr_ciphertext+16*1], xmm2
1391 vmovdqu [ptr_ciphertext+16*2], xmm3
1392 vmovdqu [ptr_ciphertext+16*3], xmm4
1393 vmovdqu [ptr_ciphertext+16*4], xmm5
1394
1395 add ptr_ciphertext, 16*5
1396 vmovdqa xmm8, xmm5
1397
1398 and N_val, 15 ; N_val = N_val mod 16
1399 je _ret_
1400 jmp _steal_cipher_next
1401 _num_blocks_is_4:
1402 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
1403 add ptr_plaintext, 16*4
1404 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
1405 ; store ciphertext
1406 vmovdqu [ptr_ciphertext+16*0], xmm1
1407 vmovdqu [ptr_ciphertext+16*1], xmm2
1408 vmovdqu [ptr_ciphertext+16*2], xmm3
1409 vmovdqu [ptr_ciphertext+16*3], xmm4
1410
1411 add ptr_ciphertext, 16*4
1412 vmovdqa xmm8, xmm4
1413
1414 and N_val, 15 ; N_val = N_val mod 16
1415 je _ret_
1416 jmp _steal_cipher_next
1417 _num_blocks_is_3:
1418 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
1419 add ptr_plaintext, 16*3
1420 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1421 ; store ciphertext
1422 vmovdqu [ptr_ciphertext+16*0], xmm1
1423 vmovdqu [ptr_ciphertext+16*1], xmm2
1424 vmovdqu [ptr_ciphertext+16*2], xmm3
1425
1426 add ptr_ciphertext, 16*3
1427 vmovdqa xmm8, xmm3
1428
1429 and N_val, 15 ; N_val = N_val mod 16
1430 je _ret_
1431 jmp _steal_cipher_next
1432
1433 _num_blocks_is_2:
1434 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
1435 add ptr_plaintext, 16*2
1436 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1437 ; store ciphertext
1438 vmovdqu [ptr_ciphertext], xmm1
1439 vmovdqu [ptr_ciphertext+16], xmm2
1440
1441 add ptr_ciphertext, 16*2
1442 vmovdqa xmm8, xmm2
1443
1444 and N_val, 15 ; N_val = N_val mod 16
1445 je _ret_
1446 jmp _steal_cipher_next
1447
1448 _num_blocks_is_1:
1449 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
1450 add ptr_plaintext, 16*1
1451 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1452 vmovdqu [ptr_ciphertext], xmm1
1453 add ptr_ciphertext, 16*1
1454 vmovdqa xmm8, xmm1
1455 and N_val, 15 ; N_val = N_val mod 16
1456 je _ret_
1457 jmp _steal_cipher_next
1458
1459 section .data
1460 align 16
1461
1462 vpshufb_shf_table:
1463 ; use these values for shift constants for the vpshufb instruction
1464 ; different alignments result in values as shown:
1465 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
1466 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
1467 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
1468 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
1469 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
1470 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
1471 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
1472 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
1473 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
1474 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
1475 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
1476 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
1477 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
1478 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
1479 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
1480 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
1481 dq 0x0706050403020100, 0x000e0d0c0b0a0908
1482
1483 mask1:
1484 dq 0x8080808080808080, 0x8080808080808080
1485
1486 const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
1487 const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
1488 const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
1489 const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
1490
1491 shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1492
1493 %else ; Assembler does not understand these opcodes. Define an empty symbol for Windows builds.
1494 %ifidn __OUTPUT_FORMAT__, win64
1495 global no_XTS_AES_128_enc_vaes
1496 no_XTS_AES_128_enc_vaes:
1497 %endif
1498 %endif ; (AS_FEATURE_LEVEL) >= 10