git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / aes / XTS_AES_256_enc_sse.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 ; XTS encrypt function with 256-bit AES
30 ; input keys are not aligned
31 ; keys are expanded in parallel with the tweak encryption
32 ; plaintext and ciphertext are not aligned
33 ; second key is stored in the stack as aligned to 16 Bytes
34 ; first key is required only once, no need for storage of this key
35
36 %include "reg_sizes.asm"
37
38 default rel
; Stack frame layout, all offsets relative to rsp after `sub rsp, VARIABLE_OFFSET`:
;   [TW   .. TW+16*8)     8 tweak values (128 bytes)
;   [keys .. keys+16*15)  15 expanded AES-256 round keys (240 bytes)
;   win64 only: [_xmm .. _xmm+16*10)  saved xmm6..xmm15 (callee-saved on win64)
;   [_gpr ..)             saved callee-saved GPRs (rbx; plus rdi, rsi on win64)
39 %define TW rsp ; store 8 tweak values
40 %define keys rsp + 16*8 ; store 15 expanded keys
41
42 %ifidn __OUTPUT_FORMAT__, win64
43 %define _xmm rsp + 16*23 ; store xmm6:xmm15
44 %endif
45
46 %ifidn __OUTPUT_FORMAT__, elf64
47 %define _gpr rsp + 16*23 ; store rbx
; elf64 frame: 128 (TW) + 240 (keys) + 8 (rbx) = 376 = 8*47 (odd multiple of 8,
; so rsp is 16-byte aligned after the sub: on entry rsp % 16 == 8)
48 %define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
49 %else
50 %define _gpr rsp + 16*33 ; store rdi, rsi, rbx
; win64 frame: 128 + 240 + 160 (xmm6..15) + 24 (rdi,rsi,rbx) = 552 = 8*69
51 %define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
52 %endif
53
; XTS reduction constant: when doubling the tweak in GF(2^128), a carry out of
; bit 127 is folded back in by XORing 0x87 into the low byte (x^128 = x^7+x^2+x+1)
54 %define GHASH_POLY 0x87
55
56 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
57 ;void XTS_AES_256_enc_sse(
58 ; UINT8 *k2, // key used for tweaking, 16*2 bytes
59 ; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
60 ; UINT8 *TW_initial, // initial tweak value, 16 bytes
61 ; UINT64 N, // sector size, in bytes
62 ; const UINT8 *pt, // plaintext sector input data
63 ; UINT8 *ct); // ciphertext sector output data
64 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
65
66 ; arguments for input parameters
; elf64 = System V AMD64 argument registers; win64 branch maps the Microsoft
; x64 registers (args 5 and 6 arrive on the stack and are loaded after encrypt_T,
; which is why r10/r11 stand in for them here)
67 %ifidn __OUTPUT_FORMAT__, elf64
68 %xdefine ptr_key2 rdi
69 %xdefine ptr_key1 rsi
70 %xdefine T_val rdx
71 %xdefine N_val rcx
72 %xdefine ptr_plaintext r8
73 %xdefine ptr_ciphertext r9
74 %else
75 %xdefine ptr_key2 rcx
76 %xdefine ptr_key1 rdx
77 %xdefine T_val r8
78 %xdefine N_val r9
79 %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
80 %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
81 %endif
82
83 ; arguments for temp parameters
; scratch registers chosen so they never collide with the argument registers
; of the active ABI branch above
84 %ifidn __OUTPUT_FORMAT__, elf64
85 %define tmp1 rdi
86 %define target_ptr_val rsi
87 %define ghash_poly_8b r10
88 %define ghash_poly_8b_temp r11
89 %else
90 %define tmp1 rcx
91 %define target_ptr_val rdx
92 %define ghash_poly_8b rdi
93 %define ghash_poly_8b_temp rsi
94 %endif
95
; twtempl:twtemph hold the low:high 64-bit halves of the tweak currently being
; doubled; note rax/rbx are live across the whole stitched computation
96 %define twtempl rax ; global temp registers used for tweak computation
97 %define twtemph rbx
98
99
100 ; produce the key for the next round
101 ; raw_key is the output of aeskeygenassist instruction
102 ; round_key value before this key_expansion_256_flip macro is current round key
103 ; round_key value after this key_expansion_256_flip macro is next round key
104 ; 2 macros will be used for key generation in a flip-flopped fashion
; "flip" produces the even-index round keys of the AES-256 schedule: it uses
; dword 3 of the aeskeygenassist result, i.e. RotWord(SubWord(w)) ^ Rcon
105 %macro key_expansion_256_flip 3
106 %define %%xraw_key %1
107 %define %%xtmp %2
108 %define %%xround_key %3
; broadcast dword 3 = RotWord(SubWord(...)) ^ Rcon into all four lanes
109 pshufd %%xraw_key, %%xraw_key, 11111111b
; the two shufps/pxor pairs build the running (prefix) XOR of the four dwords
; of the previous round key, as required by the AES key schedule, without
; needing byte shifts — the standard AESKEYGENASSIST expansion trick
110 shufps %%xtmp, %%xround_key, 00010000b
111 pxor %%xround_key, %%xtmp
112 shufps %%xtmp, %%xround_key, 10001100b
113 pxor %%xround_key, %%xtmp
114 pxor %%xround_key, %%xraw_key
115 %endmacro
116
; "flop" produces the odd-index round keys of the AES-256 schedule: it uses
; dword 2 of the aeskeygenassist result, i.e. SubWord(w) only (no RotWord,
; no Rcon), applied to the other half of the 256-bit key
117 %macro key_expansion_256_flop 3
118 %define %%xraw_key %1
119 %define %%xtmp %2
120 %define %%xround_key %3
; broadcast dword 2 = SubWord(...) into all four lanes
121 pshufd %%xraw_key, %%xraw_key, 10101010b
; same prefix-XOR construction as key_expansion_256_flip
122 shufps %%xtmp, %%xround_key, 00010000b
123 pxor %%xround_key, %%xtmp
124 shufps %%xtmp, %%xround_key, 10001100b
125 pxor %%xround_key, %%xtmp
126 pxor %%xround_key, %%xraw_key
127 %endmacro
128
129
130
131
132 ; macro to encrypt the tweak value in parallel with key generation of both keys
;
; Expands BOTH AES-256 key schedules (key1 and key2) simultaneously, and
; stitches the 14-round encryption of the initial tweak (with key2's schedule)
; into the expansion so key2's round keys never need to be stored.
;   - key1's 15 round keys are written to [%%ptr_expanded_keys + 16*0..16*14]
;   - the encrypted tweak ends up in %%xstate_tweak and is stored to [TW]
; key2's schedule alternates between %%xkey2 / %%xkey2_2 (flip/flop), and
; key1's between %%xkey1 / %%xkey1_2, so each register always holds the most
; recent round key of its half of the schedule.
134 %macro encrypt_T 10
135 %define %%xkey2 %1
136 %define %%xkey2_2 %2
137 %define %%xstate_tweak %3
138 %define %%xkey1 %4
139 %define %%xkey1_2 %5
140 %define %%xraw_key %6
141 %define %%xtmp %7
142 %define %%ptr_key2 %8
143 %define %%ptr_key1 %9
144 %define %%ptr_expanded_keys %10
145
146
; round keys 0 and 1 come directly from the user key halves (unaligned loads)
147 movdqu %%xkey2, [%%ptr_key2]
148 pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
149
150 movdqu %%xkey1, [%%ptr_key1]
151 movdqa [%%ptr_expanded_keys+16*0], %%xkey1
152
153 movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
154 aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
155
156 movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
157 movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
158
159
160
161
; from here on: generate round key N for key2, use it on the tweak, and in the
; same step generate+store round key N for key1; Rcon doubles every flip pair
162 aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
163 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
164 aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
165 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
166 aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
167 movdqa [%%ptr_expanded_keys+16*2], %%xkey1
168
169 aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
170 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
171 aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
172 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
173 aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
174 movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
175
176
177
178 aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
179 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
180 aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
181 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
182 aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
183 movdqa [%%ptr_expanded_keys+16*4], %%xkey1
184
185 aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
186 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
187 aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
188 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
189 aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
190 movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
191
192
193
194 aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
195 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
196 aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
197 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
198 aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
199 movdqa [%%ptr_expanded_keys+16*6], %%xkey1
200
201 aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
202 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
203 aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
204 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
205 aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
206 movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
207
208
209 aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
210 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
211 aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
212 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
213 aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
214 movdqa [%%ptr_expanded_keys+16*8], %%xkey1
215
216 aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
217 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
218 aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
219 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
220 aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
221 movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
222
223
224 aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
225 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
226 aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
227 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
228 aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
229 movdqa [%%ptr_expanded_keys+16*10], %%xkey1
230
231 aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
232 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
233 aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
234 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
235 aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
236 movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
237
238
239 aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
240 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
241 aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
242 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
243 aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
244 movdqa [%%ptr_expanded_keys+16*12], %%xkey1
245
246 aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
247 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
248 aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
249 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
250 aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
251 movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
252
253
254 aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
255 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
256 aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
257 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
258 aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
259 movdqa [%%ptr_expanded_keys+16*14], %%xkey1
260
261 movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
262 %endmacro
263
264
265 ; generate initial tweak values
266 ; load initial plaintext values
;
; Loads up to %%num_initial_blocks plaintext blocks into %%ST1..%%ST7 and
; derives the corresponding tweaks. Tweak i+1 is tweak i multiplied by x in
; GF(2^128), computed on 64-bit halves in twtempl:twtemph:
;   shl twtempl,1  ->  CF = bit 63 of low half
;   adc twtemph,twtemph  ->  double high half, shifting the carry in;
;                            CF out = bit 127 of the tweak
;   cmovc/xor  ->  fold GHASH_POLY (0x87) into the low half iff CF was set
; Each new tweak is stored into the TW scratch area and then reloaded as an
; xmm value; twtempl/twtemph are left holding the LAST generated tweak so the
; caller can continue the doubling chain.
267 %macro initialize 16
268
269 %define %%ST1 %1 ; state 1
270 %define %%ST2 %2 ; state 2
271 %define %%ST3 %3 ; state 3
272 %define %%ST4 %4 ; state 4
273 %define %%ST5 %5 ; state 5
274 %define %%ST6 %6 ; state 6
275 %define %%ST7 %7 ; state 7
276 %define %%ST8 %8 ; state 8
277
278 %define %%TW1 %9 ; tweak 1
279 %define %%TW2 %10 ; tweak 2
280 %define %%TW3 %11 ; tweak 3
281 %define %%TW4 %12 ; tweak 4
282 %define %%TW5 %13 ; tweak 5
283 %define %%TW6 %14 ; tweak 6
284 %define %%TW7 %15 ; tweak 7
285
286 %define %%num_initial_blocks %16
287
288
289 ; generate next Tweak values
290 movdqa %%TW1, [TW+16*0]
291 mov twtempl, [TW+8*0]
292 mov twtemph, [TW+8*1]
293 movdqu %%ST1, [ptr_plaintext+16*0]
294 %if (%%num_initial_blocks>=2)
295 xor ghash_poly_8b_temp, ghash_poly_8b_temp
296 shl twtempl, 1
297 adc twtemph, twtemph
298 cmovc ghash_poly_8b_temp, ghash_poly_8b
299 xor twtempl, ghash_poly_8b_temp
300 mov [TW+8*2], twtempl
301 mov [TW+8*3], twtemph;
302 movdqa %%TW2, [TW+16*1]
303 movdqu %%ST2, [ptr_plaintext+16*1]
304 %endif
305 %if (%%num_initial_blocks>=3)
306 xor ghash_poly_8b_temp, ghash_poly_8b_temp
307 shl twtempl, 1
308 adc twtemph, twtemph
309 cmovc ghash_poly_8b_temp, ghash_poly_8b
310 xor twtempl, ghash_poly_8b_temp
311 mov [TW+8*4], twtempl
312 mov [TW+8*5], twtemph;
313 movdqa %%TW3, [TW+16*2]
314 movdqu %%ST3, [ptr_plaintext+16*2]
315 %endif
316 %if (%%num_initial_blocks>=4)
317 xor ghash_poly_8b_temp, ghash_poly_8b_temp
318 shl twtempl, 1
319 adc twtemph, twtemph
320 cmovc ghash_poly_8b_temp, ghash_poly_8b
321 xor twtempl, ghash_poly_8b_temp
322 mov [TW+8*6], twtempl
323 mov [TW+8*7], twtemph;
324 movdqa %%TW4, [TW+16*3]
325 movdqu %%ST4, [ptr_plaintext+16*3]
326 %endif
327 %if (%%num_initial_blocks>=5)
328 xor ghash_poly_8b_temp, ghash_poly_8b_temp
329 shl twtempl, 1
330 adc twtemph, twtemph
331 cmovc ghash_poly_8b_temp, ghash_poly_8b
332 xor twtempl, ghash_poly_8b_temp
333 mov [TW+8*8], twtempl
334 mov [TW+8*9], twtemph;
335 movdqa %%TW5, [TW+16*4]
336 movdqu %%ST5, [ptr_plaintext+16*4]
337 %endif
338 %if (%%num_initial_blocks>=6)
339 xor ghash_poly_8b_temp, ghash_poly_8b_temp
340 shl twtempl, 1
341 adc twtemph, twtemph
342 cmovc ghash_poly_8b_temp, ghash_poly_8b
343 xor twtempl, ghash_poly_8b_temp
344 mov [TW+8*10], twtempl
345 mov [TW+8*11], twtemph;
346 movdqa %%TW6, [TW+16*5]
347 movdqu %%ST6, [ptr_plaintext+16*5]
348 %endif
349 %if (%%num_initial_blocks>=7)
350 xor ghash_poly_8b_temp, ghash_poly_8b_temp
351 shl twtempl, 1
352 adc twtemph, twtemph
353 cmovc ghash_poly_8b_temp, ghash_poly_8b
354 xor twtempl, ghash_poly_8b_temp
355 mov [TW+8*12], twtempl
356 mov [TW+8*13], twtemph;
357 movdqa %%TW7, [TW+16*6]
358 movdqu %%ST7, [ptr_plaintext+16*6]
359 %endif
360
361
; NOTE(review): %%ST8 is declared but never used here — only up to 7 initial
; blocks are ever loaded by this macro.
362
363 %endmacro
364
365
366 ; encrypt initial blocks of AES
367 ; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
368 ; next 8 Tweak values are generated
;
; Runs the full 14-round AES-256 encryption on 1..7 blocks. When %%lt128 == 0,
; the generation of the NEXT 8 tweak values is stitched between the AES rounds
; to hide latency. The GF(2^128) doubling is split across rounds; crucially,
; the carry flag produced by shl/adc in one %if block is consumed by adc/cmovc
; in a LATER %if block — this is safe because the interleaved aesenc, movdqa
; and mov instructions do not modify RFLAGS. Do not reorder or insert
; flag-modifying instructions between these fragments.
369 %macro encrypt_initial 18
370 %define %%ST1 %1 ; state 1
371 %define %%ST2 %2 ; state 2
372 %define %%ST3 %3 ; state 3
373 %define %%ST4 %4 ; state 4
374 %define %%ST5 %5 ; state 5
375 %define %%ST6 %6 ; state 6
376 %define %%ST7 %7 ; state 7
377 %define %%ST8 %8 ; state 8
378
379 %define %%TW1 %9 ; tweak 1
380 %define %%TW2 %10 ; tweak 2
381 %define %%TW3 %11 ; tweak 3
382 %define %%TW4 %12 ; tweak 4
383 %define %%TW5 %13 ; tweak 5
384 %define %%TW6 %14 ; tweak 6
385 %define %%TW7 %15 ; tweak 7
386 %define %%T0 %16 ; Temp register
387 %define %%num_blocks %17
388 ; %%num_blocks blocks encrypted
389 ; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
390
391 %define %%lt128 %18 ; less than 128 bytes
392
393 ; xor Tweak value
394 pxor %%ST1, %%TW1
395 %if (%%num_blocks>=2)
396 pxor %%ST2, %%TW2
397 %endif
398 %if (%%num_blocks>=3)
399 pxor %%ST3, %%TW3
400 %endif
401 %if (%%num_blocks>=4)
402 pxor %%ST4, %%TW4
403 %endif
404 %if (%%num_blocks>=5)
405 pxor %%ST5, %%TW5
406 %endif
407 %if (%%num_blocks>=6)
408 pxor %%ST6, %%TW6
409 %endif
410 %if (%%num_blocks>=7)
411 pxor %%ST7, %%TW7
412 %endif
413
414
415 ; ARK
416 movdqa %%T0, [keys]
417 pxor %%ST1, %%T0
418 %if (%%num_blocks>=2)
419 pxor %%ST2, %%T0
420 %endif
421 %if (%%num_blocks>=3)
422 pxor %%ST3, %%T0
423 %endif
424 %if (%%num_blocks>=4)
425 pxor %%ST4, %%T0
426 %endif
427 %if (%%num_blocks>=5)
428 pxor %%ST5, %%T0
429 %endif
430 %if (%%num_blocks>=6)
431 pxor %%ST6, %%T0
432 %endif
433 %if (%%num_blocks>=7)
434 pxor %%ST7, %%T0
435 %endif
436
437
; begin doubling chain for next Tweak1 (CF live into round 1's cmovc below)
438 %if (0 == %%lt128)
439 xor ghash_poly_8b_temp, ghash_poly_8b_temp
440 shl twtempl, 1
441 adc twtemph, twtemph
442 %endif
443
444 ; round 1
445 movdqa %%T0, [keys + 16*1]
446 aesenc %%ST1, %%T0
447 %if (%%num_blocks>=2)
448 aesenc %%ST2, %%T0
449 %endif
450 %if (%%num_blocks>=3)
451 aesenc %%ST3, %%T0
452 %endif
453 %if (%%num_blocks>=4)
454 aesenc %%ST4, %%T0
455 %endif
456 %if (%%num_blocks>=5)
457 aesenc %%ST5, %%T0
458 %endif
459 %if (%%num_blocks>=6)
460 aesenc %%ST6, %%T0
461 %endif
462 %if (%%num_blocks>=7)
463 aesenc %%ST7, %%T0
464 %endif
465 %if (0 == %%lt128)
466 cmovc ghash_poly_8b_temp, ghash_poly_8b
467 xor twtempl, ghash_poly_8b_temp
468 mov [TW + 8*0], twtempl ; next Tweak1 generated
469 mov [TW + 8*1], twtemph
470 xor ghash_poly_8b_temp, ghash_poly_8b_temp
471 %endif
472
473 ; round 2
474 movdqa %%T0, [keys + 16*2]
475 aesenc %%ST1, %%T0
476 %if (%%num_blocks>=2)
477 aesenc %%ST2, %%T0
478 %endif
479 %if (%%num_blocks>=3)
480 aesenc %%ST3, %%T0
481 %endif
482 %if (%%num_blocks>=4)
483 aesenc %%ST4, %%T0
484 %endif
485 %if (%%num_blocks>=5)
486 aesenc %%ST5, %%T0
487 %endif
488 %if (%%num_blocks>=6)
489 aesenc %%ST6, %%T0
490 %endif
491 %if (%%num_blocks>=7)
492 aesenc %%ST7, %%T0
493 %endif
494
495 %if (0 == %%lt128)
496 shl twtempl, 1
497 adc twtemph, twtemph
498 cmovc ghash_poly_8b_temp, ghash_poly_8b
499 xor twtempl, ghash_poly_8b_temp
; low half of Tweak2 stored here; the high half is stored in round 3
500 mov [TW + 8*2], twtempl ; next Tweak2 generated
501 %endif
502
503 ; round 3
504 movdqa %%T0, [keys + 16*3]
505 aesenc %%ST1, %%T0
506 %if (%%num_blocks>=2)
507 aesenc %%ST2, %%T0
508 %endif
509 %if (%%num_blocks>=3)
510 aesenc %%ST3, %%T0
511 %endif
512 %if (%%num_blocks>=4)
513 aesenc %%ST4, %%T0
514 %endif
515 %if (%%num_blocks>=5)
516 aesenc %%ST5, %%T0
517 %endif
518 %if (%%num_blocks>=6)
519 aesenc %%ST6, %%T0
520 %endif
521 %if (%%num_blocks>=7)
522 aesenc %%ST7, %%T0
523 %endif
524 %if (0 == %%lt128)
525 mov [TW + 8*3], twtemph
526 xor ghash_poly_8b_temp, ghash_poly_8b_temp
527 shl twtempl, 1
528 adc twtemph, twtemph
; CF from adc above is dead here; this cmovc keys off it within the same block
529 cmovc ghash_poly_8b_temp, ghash_poly_8b
530 %endif
531
532 ; round 4
533 movdqa %%T0, [keys + 16*4]
534 aesenc %%ST1, %%T0
535 %if (%%num_blocks>=2)
536 aesenc %%ST2, %%T0
537 %endif
538 %if (%%num_blocks>=3)
539 aesenc %%ST3, %%T0
540 %endif
541 %if (%%num_blocks>=4)
542 aesenc %%ST4, %%T0
543 %endif
544 %if (%%num_blocks>=5)
545 aesenc %%ST5, %%T0
546 %endif
547 %if (%%num_blocks>=6)
548 aesenc %%ST6, %%T0
549 %endif
550 %if (%%num_blocks>=7)
551 aesenc %%ST7, %%T0
552 %endif
553
554 %if (0 == %%lt128)
555 xor twtempl, ghash_poly_8b_temp
556 mov [TW + 8*4], twtempl ; next Tweak3 generated
557 mov [TW + 8*5], twtemph
558 xor ghash_poly_8b_temp, ghash_poly_8b_temp
; CF produced by this shl is consumed by the adc in round 5's block —
; the aesenc/movdqa in between do not touch flags
559 shl twtempl, 1
560 %endif
561
562 ; round 5
563 movdqa %%T0, [keys + 16*5]
564 aesenc %%ST1, %%T0
565 %if (%%num_blocks>=2)
566 aesenc %%ST2, %%T0
567 %endif
568 %if (%%num_blocks>=3)
569 aesenc %%ST3, %%T0
570 %endif
571 %if (%%num_blocks>=4)
572 aesenc %%ST4, %%T0
573 %endif
574 %if (%%num_blocks>=5)
575 aesenc %%ST5, %%T0
576 %endif
577 %if (%%num_blocks>=6)
578 aesenc %%ST6, %%T0
579 %endif
580 %if (%%num_blocks>=7)
581 aesenc %%ST7, %%T0
582 %endif
583
584 %if (0 == %%lt128)
585 adc twtemph, twtemph
586 cmovc ghash_poly_8b_temp, ghash_poly_8b
587 xor twtempl, ghash_poly_8b_temp
588 mov [TW + 8*6], twtempl ; next Tweak4 generated
589 mov [TW + 8*7], twtemph
590 %endif
591
592 ; round 6
593 movdqa %%T0, [keys + 16*6]
594 aesenc %%ST1, %%T0
595 %if (%%num_blocks>=2)
596 aesenc %%ST2, %%T0
597 %endif
598 %if (%%num_blocks>=3)
599 aesenc %%ST3, %%T0
600 %endif
601 %if (%%num_blocks>=4)
602 aesenc %%ST4, %%T0
603 %endif
604 %if (%%num_blocks>=5)
605 aesenc %%ST5, %%T0
606 %endif
607 %if (%%num_blocks>=6)
608 aesenc %%ST6, %%T0
609 %endif
610 %if (%%num_blocks>=7)
611 aesenc %%ST7, %%T0
612 %endif
613
614 %if (0 == %%lt128)
615 xor ghash_poly_8b_temp, ghash_poly_8b_temp
616 shl twtempl, 1
617 adc twtemph, twtemph
618 cmovc ghash_poly_8b_temp, ghash_poly_8b
619 xor twtempl, ghash_poly_8b_temp
620 mov [TW + 8*8], twtempl ; next Tweak5 generated
621 mov [TW + 8*9], twtemph
622 %endif
623
624 ; round 7
625 movdqa %%T0, [keys + 16*7]
626 aesenc %%ST1, %%T0
627 %if (%%num_blocks>=2)
628 aesenc %%ST2, %%T0
629 %endif
630 %if (%%num_blocks>=3)
631 aesenc %%ST3, %%T0
632 %endif
633 %if (%%num_blocks>=4)
634 aesenc %%ST4, %%T0
635 %endif
636 %if (%%num_blocks>=5)
637 aesenc %%ST5, %%T0
638 %endif
639 %if (%%num_blocks>=6)
640 aesenc %%ST6, %%T0
641 %endif
642 %if (%%num_blocks>=7)
643 aesenc %%ST7, %%T0
644 %endif
645
646 %if (0 == %%lt128)
647 xor ghash_poly_8b_temp, ghash_poly_8b_temp
648 shl twtempl, 1
649 adc twtemph, twtemph
650 cmovc ghash_poly_8b_temp, ghash_poly_8b
651 xor twtempl, ghash_poly_8b_temp
652 mov [TW + 8*10], twtempl ; next Tweak6 generated
653 mov [TW + 8*11], twtemph
654 %endif
655 ; round 8
656 movdqa %%T0, [keys + 16*8]
657 aesenc %%ST1, %%T0
658 %if (%%num_blocks>=2)
659 aesenc %%ST2, %%T0
660 %endif
661 %if (%%num_blocks>=3)
662 aesenc %%ST3, %%T0
663 %endif
664 %if (%%num_blocks>=4)
665 aesenc %%ST4, %%T0
666 %endif
667 %if (%%num_blocks>=5)
668 aesenc %%ST5, %%T0
669 %endif
670 %if (%%num_blocks>=6)
671 aesenc %%ST6, %%T0
672 %endif
673 %if (%%num_blocks>=7)
674 aesenc %%ST7, %%T0
675 %endif
676
677 %if (0 == %%lt128)
678 xor ghash_poly_8b_temp, ghash_poly_8b_temp
679 shl twtempl, 1
680 adc twtemph, twtemph
681 cmovc ghash_poly_8b_temp, ghash_poly_8b
682 xor twtempl, ghash_poly_8b_temp
683 mov [TW + 8*12], twtempl ; next Tweak7 generated
684 mov [TW + 8*13], twtemph
685 %endif
686 ; round 9
687 movdqa %%T0, [keys + 16*9]
688 aesenc %%ST1, %%T0
689 %if (%%num_blocks>=2)
690 aesenc %%ST2, %%T0
691 %endif
692 %if (%%num_blocks>=3)
693 aesenc %%ST3, %%T0
694 %endif
695 %if (%%num_blocks>=4)
696 aesenc %%ST4, %%T0
697 %endif
698 %if (%%num_blocks>=5)
699 aesenc %%ST5, %%T0
700 %endif
701 %if (%%num_blocks>=6)
702 aesenc %%ST6, %%T0
703 %endif
704 %if (%%num_blocks>=7)
705 aesenc %%ST7, %%T0
706 %endif
707
708 %if (0 == %%lt128)
709 xor ghash_poly_8b_temp, ghash_poly_8b_temp
710 shl twtempl, 1
711 adc twtemph, twtemph
712 cmovc ghash_poly_8b_temp, ghash_poly_8b
713 xor twtempl, ghash_poly_8b_temp
714 mov [TW + 8*14], twtempl ; next Tweak8 generated
715 mov [TW + 8*15], twtemph
716 %endif
717 ; round 10
718 movdqa %%T0, [keys + 16*10]
719 aesenc %%ST1, %%T0
720 %if (%%num_blocks>=2)
721 aesenc %%ST2, %%T0
722 %endif
723 %if (%%num_blocks>=3)
724 aesenc %%ST3, %%T0
725 %endif
726 %if (%%num_blocks>=4)
727 aesenc %%ST4, %%T0
728 %endif
729 %if (%%num_blocks>=5)
730 aesenc %%ST5, %%T0
731 %endif
732 %if (%%num_blocks>=6)
733 aesenc %%ST6, %%T0
734 %endif
735 %if (%%num_blocks>=7)
736 aesenc %%ST7, %%T0
737 %endif
738 ; round 11
739 movdqa %%T0, [keys + 16*11]
740 aesenc %%ST1, %%T0
741 %if (%%num_blocks>=2)
742 aesenc %%ST2, %%T0
743 %endif
744 %if (%%num_blocks>=3)
745 aesenc %%ST3, %%T0
746 %endif
747 %if (%%num_blocks>=4)
748 aesenc %%ST4, %%T0
749 %endif
750 %if (%%num_blocks>=5)
751 aesenc %%ST5, %%T0
752 %endif
753 %if (%%num_blocks>=6)
754 aesenc %%ST6, %%T0
755 %endif
756 %if (%%num_blocks>=7)
757 aesenc %%ST7, %%T0
758 %endif
759
760 ; round 12
761 movdqa %%T0, [keys + 16*12]
762 aesenc %%ST1, %%T0
763 %if (%%num_blocks>=2)
764 aesenc %%ST2, %%T0
765 %endif
766 %if (%%num_blocks>=3)
767 aesenc %%ST3, %%T0
768 %endif
769 %if (%%num_blocks>=4)
770 aesenc %%ST4, %%T0
771 %endif
772 %if (%%num_blocks>=5)
773 aesenc %%ST5, %%T0
774 %endif
775 %if (%%num_blocks>=6)
776 aesenc %%ST6, %%T0
777 %endif
778 %if (%%num_blocks>=7)
779 aesenc %%ST7, %%T0
780 %endif
781
782 ; round 13
783 movdqa %%T0, [keys + 16*13]
784 aesenc %%ST1, %%T0
785 %if (%%num_blocks>=2)
786 aesenc %%ST2, %%T0
787 %endif
788 %if (%%num_blocks>=3)
789 aesenc %%ST3, %%T0
790 %endif
791 %if (%%num_blocks>=4)
792 aesenc %%ST4, %%T0
793 %endif
794 %if (%%num_blocks>=5)
795 aesenc %%ST5, %%T0
796 %endif
797 %if (%%num_blocks>=6)
798 aesenc %%ST6, %%T0
799 %endif
800 %if (%%num_blocks>=7)
801 aesenc %%ST7, %%T0
802 %endif
803
804 ; round 14
805 movdqa %%T0, [keys + 16*14]
806 aesenclast %%ST1, %%T0
807 %if (%%num_blocks>=2)
808 aesenclast %%ST2, %%T0
809 %endif
810 %if (%%num_blocks>=3)
811 aesenclast %%ST3, %%T0
812 %endif
813 %if (%%num_blocks>=4)
814 aesenclast %%ST4, %%T0
815 %endif
816 %if (%%num_blocks>=5)
817 aesenclast %%ST5, %%T0
818 %endif
819 %if (%%num_blocks>=6)
820 aesenclast %%ST6, %%T0
821 %endif
822 %if (%%num_blocks>=7)
823 aesenclast %%ST7, %%T0
824 %endif
825
826 ; xor Tweak values
827 pxor %%ST1, %%TW1
828 %if (%%num_blocks>=2)
829 pxor %%ST2, %%TW2
830 %endif
831 %if (%%num_blocks>=3)
832 pxor %%ST3, %%TW3
833 %endif
834 %if (%%num_blocks>=4)
835 pxor %%ST4, %%TW4
836 %endif
837 %if (%%num_blocks>=5)
838 pxor %%ST5, %%TW5
839 %endif
840 %if (%%num_blocks>=6)
841 pxor %%ST6, %%TW6
842 %endif
843 %if (%%num_blocks>=7)
844 pxor %%ST7, %%TW7
845 %endif
846
847
848 %if (0 == %%lt128)
849 ; load next Tweak values
850 movdqa %%TW1, [TW + 16*0]
851 movdqa %%TW2, [TW + 16*1]
852 movdqa %%TW3, [TW + 16*2]
853 movdqa %%TW4, [TW + 16*3]
854 movdqa %%TW5, [TW + 16*4]
855 movdqa %%TW6, [TW + 16*5]
856 movdqa %%TW7, [TW + 16*6]
857
858 %endif
859
860 %endmacro
861
862
863 ; Encrypt 8 blocks in parallel
864 ; generate next 8 tweak values
;
; Full 14-round AES-256 on 8 blocks. When %%last_eight == 0, the next 8 tweak
; values are computed between the AES rounds using the same split GF(2^128)
; doubling as encrypt_initial (shl/adc/cmovc on twtempl:twtemph, reduction by
; GHASH_POLY on carry-out). As there, the carry flag is intentionally carried
; across %if boundaries between rounds — aesenc/movdqa/mov leave RFLAGS
; untouched, so the fragments of each doubling chain stay correct.
865 %macro encrypt_by_eight 18
866 %define %%ST1 %1 ; state 1
867 %define %%ST2 %2 ; state 2
868 %define %%ST3 %3 ; state 3
869 %define %%ST4 %4 ; state 4
870 %define %%ST5 %5 ; state 5
871 %define %%ST6 %6 ; state 6
872 %define %%ST7 %7 ; state 7
873 %define %%ST8 %8 ; state 8
874 %define %%TW1 %9 ; tweak 1
875 %define %%TW2 %10 ; tweak 2
876 %define %%TW3 %11 ; tweak 3
877 %define %%TW4 %12 ; tweak 4
878 %define %%TW5 %13 ; tweak 5
879 %define %%TW6 %14 ; tweak 6
880 %define %%TW7 %15 ; tweak 7
881 %define %%TW8 %16 ; tweak 8
882 %define %%T0 %17 ; Temp register
883 %define %%last_eight %18
884
885 ; xor Tweak values
886 pxor %%ST1, %%TW1
887 pxor %%ST2, %%TW2
888 pxor %%ST3, %%TW3
889 pxor %%ST4, %%TW4
890 pxor %%ST5, %%TW5
891 pxor %%ST6, %%TW6
892 pxor %%ST7, %%TW7
893 pxor %%ST8, %%TW8
894
895 ; ARK
896 movdqa %%T0, [keys]
897 pxor %%ST1, %%T0
898 pxor %%ST2, %%T0
899 pxor %%ST3, %%T0
900 pxor %%ST4, %%T0
901 pxor %%ST5, %%T0
902 pxor %%ST6, %%T0
903 pxor %%ST7, %%T0
904 pxor %%ST8, %%T0
905
; begin doubling chain for next Tweak1
906 %if (0 == %%last_eight)
907 xor ghash_poly_8b_temp, ghash_poly_8b_temp
908 shl twtempl, 1
909 adc twtemph, twtemph
910 cmovc ghash_poly_8b_temp, ghash_poly_8b
911 %endif
912 ; round 1
913 movdqa %%T0, [keys + 16*1]
914 aesenc %%ST1, %%T0
915 aesenc %%ST2, %%T0
916 aesenc %%ST3, %%T0
917 aesenc %%ST4, %%T0
918 aesenc %%ST5, %%T0
919 aesenc %%ST6, %%T0
920 aesenc %%ST7, %%T0
921 aesenc %%ST8, %%T0
922 %if (0 == %%last_eight)
923 xor twtempl, ghash_poly_8b_temp
924 mov [TW + 8*0], twtempl
925 mov [TW + 8*1], twtemph
926 xor ghash_poly_8b_temp, ghash_poly_8b_temp
927 %endif
928 ; round 2
929 movdqa %%T0, [keys + 16*2]
930 aesenc %%ST1, %%T0
931 aesenc %%ST2, %%T0
932 aesenc %%ST3, %%T0
933 aesenc %%ST4, %%T0
934 aesenc %%ST5, %%T0
935 aesenc %%ST6, %%T0
936 aesenc %%ST7, %%T0
937 aesenc %%ST8, %%T0
938 %if (0 == %%last_eight)
939 shl twtempl, 1
940 adc twtemph, twtemph
941 cmovc ghash_poly_8b_temp, ghash_poly_8b
942 xor twtempl, ghash_poly_8b_temp
943
944 %endif
945 ; round 3
946 movdqa %%T0, [keys + 16*3]
947 aesenc %%ST1, %%T0
948 aesenc %%ST2, %%T0
949 aesenc %%ST3, %%T0
950 aesenc %%ST4, %%T0
951 aesenc %%ST5, %%T0
952 aesenc %%ST6, %%T0
953 aesenc %%ST7, %%T0
954 aesenc %%ST8, %%T0
955 %if (0 == %%last_eight)
956 mov [TW + 8*2], twtempl
957 mov [TW + 8*3], twtemph
958 xor ghash_poly_8b_temp, ghash_poly_8b_temp
; CF from this shl is consumed by the adc in round 4's block
959 shl twtempl, 1
960 %endif
961 ; round 4
962 movdqa %%T0, [keys + 16*4]
963 aesenc %%ST1, %%T0
964 aesenc %%ST2, %%T0
965 aesenc %%ST3, %%T0
966 aesenc %%ST4, %%T0
967 aesenc %%ST5, %%T0
968 aesenc %%ST6, %%T0
969 aesenc %%ST7, %%T0
970 aesenc %%ST8, %%T0
971 %if (0 == %%last_eight)
972 adc twtemph, twtemph
973 cmovc ghash_poly_8b_temp, ghash_poly_8b
974 xor twtempl, ghash_poly_8b_temp
975 mov [TW + 8*4], twtempl
976 %endif
977 ; round 5
978 movdqa %%T0, [keys + 16*5]
979 aesenc %%ST1, %%T0
980 aesenc %%ST2, %%T0
981 aesenc %%ST3, %%T0
982 aesenc %%ST4, %%T0
983 aesenc %%ST5, %%T0
984 aesenc %%ST6, %%T0
985 aesenc %%ST7, %%T0
986 aesenc %%ST8, %%T0
987 %if (0 == %%last_eight)
988 mov [TW + 8*5], twtemph
989 xor ghash_poly_8b_temp, ghash_poly_8b_temp
990 shl twtempl, 1
; CF from this adc is consumed by round 6's cmovc
991 adc twtemph, twtemph
992 %endif
993 ; round 6
994 movdqa %%T0, [keys + 16*6]
995 aesenc %%ST1, %%T0
996 aesenc %%ST2, %%T0
997 aesenc %%ST3, %%T0
998 aesenc %%ST4, %%T0
999 aesenc %%ST5, %%T0
1000 aesenc %%ST6, %%T0
1001 aesenc %%ST7, %%T0
1002 aesenc %%ST8, %%T0
1003 %if (0 == %%last_eight)
1004 cmovc ghash_poly_8b_temp, ghash_poly_8b
1005 xor twtempl, ghash_poly_8b_temp
1006 mov [TW + 8*6], twtempl
1007 mov [TW + 8*7], twtemph
1008 %endif
1009 ; round 7
1010 movdqa %%T0, [keys + 16*7]
1011 aesenc %%ST1, %%T0
1012 aesenc %%ST2, %%T0
1013 aesenc %%ST3, %%T0
1014 aesenc %%ST4, %%T0
1015 aesenc %%ST5, %%T0
1016 aesenc %%ST6, %%T0
1017 aesenc %%ST7, %%T0
1018 aesenc %%ST8, %%T0
1019 %if (0 == %%last_eight)
1020 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1021 shl twtempl, 1
1022 adc twtemph, twtemph
1023 cmovc ghash_poly_8b_temp, ghash_poly_8b
1024 %endif
1025 ; round 8
1026 movdqa %%T0, [keys + 16*8]
1027 aesenc %%ST1, %%T0
1028 aesenc %%ST2, %%T0
1029 aesenc %%ST3, %%T0
1030 aesenc %%ST4, %%T0
1031 aesenc %%ST5, %%T0
1032 aesenc %%ST6, %%T0
1033 aesenc %%ST7, %%T0
1034 aesenc %%ST8, %%T0
1035 %if (0 == %%last_eight)
1036 xor twtempl, ghash_poly_8b_temp
1037 mov [TW + 8*8], twtempl
1038 mov [TW + 8*9], twtemph
1039 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1040 %endif
1041 ; round 9
1042 movdqa %%T0, [keys + 16*9]
1043 aesenc %%ST1, %%T0
1044 aesenc %%ST2, %%T0
1045 aesenc %%ST3, %%T0
1046 aesenc %%ST4, %%T0
1047 aesenc %%ST5, %%T0
1048 aesenc %%ST6, %%T0
1049 aesenc %%ST7, %%T0
1050 aesenc %%ST8, %%T0
1051 %if (0 == %%last_eight)
1052 shl twtempl, 1
1053 adc twtemph, twtemph
1054 cmovc ghash_poly_8b_temp, ghash_poly_8b
1055 xor twtempl, ghash_poly_8b_temp
1056 %endif
1057 ; round 10
1058 movdqa %%T0, [keys + 16*10]
1059 aesenc %%ST1, %%T0
1060 aesenc %%ST2, %%T0
1061 aesenc %%ST3, %%T0
1062 aesenc %%ST4, %%T0
1063 aesenc %%ST5, %%T0
1064 aesenc %%ST6, %%T0
1065 aesenc %%ST7, %%T0
1066 aesenc %%ST8, %%T0
1067 %if (0 == %%last_eight)
1068 mov [TW + 8*10], twtempl
1069 mov [TW + 8*11], twtemph
1070 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1071 shl twtempl, 1
1072 %endif
1073 ; round 11
1074 movdqa %%T0, [keys + 16*11]
1075 aesenc %%ST1, %%T0
1076 aesenc %%ST2, %%T0
1077 aesenc %%ST3, %%T0
1078 aesenc %%ST4, %%T0
1079 aesenc %%ST5, %%T0
1080 aesenc %%ST6, %%T0
1081 aesenc %%ST7, %%T0
1082 aesenc %%ST8, %%T0
1083 %if (0 == %%last_eight)
1084 adc twtemph, twtemph
1085 cmovc ghash_poly_8b_temp, ghash_poly_8b
1086 xor twtempl, ghash_poly_8b_temp
1087 mov [TW + 8*12], twtempl
1088 %endif
1089 ; round 12
1090 movdqa %%T0, [keys + 16*12]
1091 aesenc %%ST1, %%T0
1092 aesenc %%ST2, %%T0
1093 aesenc %%ST3, %%T0
1094 aesenc %%ST4, %%T0
1095 aesenc %%ST5, %%T0
1096 aesenc %%ST6, %%T0
1097 aesenc %%ST7, %%T0
1098 aesenc %%ST8, %%T0
1099 %if (0 == %%last_eight)
1100 mov [TW + 8*13], twtemph
1101 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1102 shl twtempl, 1
1103 adc twtemph, twtemph
1104 %endif
1105 ; round 13
1106 movdqa %%T0, [keys + 16*13]
1107 aesenc %%ST1, %%T0
1108 aesenc %%ST2, %%T0
1109 aesenc %%ST3, %%T0
1110 aesenc %%ST4, %%T0
1111 aesenc %%ST5, %%T0
1112 aesenc %%ST6, %%T0
1113 aesenc %%ST7, %%T0
1114 aesenc %%ST8, %%T0
1115 %if (0 == %%last_eight)
1116 cmovc ghash_poly_8b_temp, ghash_poly_8b
1117 xor twtempl, ghash_poly_8b_temp
; Tweak8's store is deferred to after aesenclast (see below) to keep the
; store away from the final round's port pressure
1118 ; mov [TW + 8*14], twtempl
1119 ; mov [TW + 8*15], twtemph
1120 %endif
1121 ; round 14
1122 movdqa %%T0, [keys + 16*14]
1123 aesenclast %%ST1, %%T0
1124 aesenclast %%ST2, %%T0
1125 aesenclast %%ST3, %%T0
1126 aesenclast %%ST4, %%T0
1127 aesenclast %%ST5, %%T0
1128 aesenclast %%ST6, %%T0
1129 aesenclast %%ST7, %%T0
1130 aesenclast %%ST8, %%T0
1131
1132 ; xor Tweak values
1133 pxor %%ST1, %%TW1
1134 pxor %%ST2, %%TW2
1135 pxor %%ST3, %%TW3
1136 pxor %%ST4, %%TW4
1137 pxor %%ST5, %%TW5
1138 pxor %%ST6, %%TW6
1139 pxor %%ST7, %%TW7
1140 pxor %%ST8, %%TW8
1141
; deferred store of Tweak8 (computed in round 13); note these execute even
; when %%last_eight != 0, writing whatever stale values twtempl/twtemph hold —
; harmless since no further tweaks are consumed on that path
1142 mov [TW + 8*14], twtempl
1143 mov [TW + 8*15], twtemph
1144 ; load next Tweak values
1145 movdqa %%TW1, [TW + 16*0]
1146 movdqa %%TW2, [TW + 16*1]
1147 movdqa %%TW3, [TW + 16*2]
1148 movdqa %%TW4, [TW + 16*3]
1149 movdqa %%TW5, [TW + 16*4]
1150 movdqa %%TW6, [TW + 16*5]
; NOTE(review): %%TW8 is not reloaded here — presumably the caller reloads
; [TW + 16*7] itself before the next iteration; confirm against the main loop
1151 movdqa %%TW7, [TW + 16*6]
1152
1153 %endmacro
1154
1155
1156 section .text
1157 
;-----------------------------------------------------------------------
; XTS_AES_256_enc_sse: XTS-AES-256 bulk encryption (SSE / AES-NI).
; Symbols used below (ptr_key1, ptr_key2, T_val, N_val, ptr_plaintext,
; ptr_ciphertext, TW, keys, _gpr, _xmm, tmp1, twtempl, twtemph,
; ghash_poly_8b, ghash_poly_8b_temp, target_ptr_val, VARIABLE_OFFSET,
; GHASH_POLY) and the macros initialize / encrypt_initial /
; encrypt_by_eight / encrypt_T are %defines/%macros declared earlier in
; this file, outside this excerpt.
; NOTE(review): C prototype assumed to be
;   XTS_AES_256_enc_sse(k2, k1, initial_tweak, N_bytes, pt, ct)
; per the isa-l_crypto aes_xts API -- confirm against aes_xts.h.
;-----------------------------------------------------------------------
1158 global XTS_AES_256_enc_sse:function
1159 XTS_AES_256_enc_sse:
1160 
1161 sub rsp, VARIABLE_OFFSET ; local frame: tweak scratch (TW) + saved GPR/XMM slots
1162 
1163 mov [_gpr + 8*0], rbx ; rbx is callee-saved in both SysV and win64 ABIs
1164 %ifidn __OUTPUT_FORMAT__, win64
1165 mov [_gpr + 8*1], rdi ; rdi/rsi are callee-saved on win64 only
1166 mov [_gpr + 8*2], rsi
1167 
1168 movdqa [_xmm + 16*0], xmm6 ; xmm6-xmm15 are callee-saved on win64
1169 movdqa [_xmm + 16*1], xmm7
1170 movdqa [_xmm + 16*2], xmm8
1171 movdqa [_xmm + 16*3], xmm9
1172 movdqa [_xmm + 16*4], xmm10
1173 movdqa [_xmm + 16*5], xmm11
1174 movdqa [_xmm + 16*6], xmm12
1175 movdqa [_xmm + 16*7], xmm13
1176 movdqa [_xmm + 16*8], xmm14
1177 movdqa [_xmm + 16*9], xmm15
1178 %endif
1179 
1180 mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b (GF(2^128) reduction constant for tweak doubling)
1181 
1182 
1183 movdqu xmm1, [T_val] ; read initial Tweak value
1184 pxor xmm4, xmm4 ; for key expansion
1185 encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys ; expand key1 into [keys]; encrypt tweak with key2
1186 
1187 
1188 %ifidn __OUTPUT_FORMAT__, win64
1189 mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer (5th arg, from stack on win64)
1190 mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer (6th arg, from stack on win64)
1191 %endif
1192 
1193 
1194 
1195 mov target_ptr_val, N_val
1196 and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
1197 sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
1198 jl _less_than_128_bytes ; fewer than 8 whole blocks: take the short path
1199 
1200 add target_ptr_val, ptr_ciphertext ; target_ptr_val = ciphertext address where the stitched loop must stop
1201
1202
; Dispatch on (N/16) mod 8: encrypt that many blocks up front so the
; remaining whole blocks are an exact multiple of 8 for _main_loop.
; Each handler: initialize (load n plaintext blocks + their tweaks),
; advance the source pointer, encrypt, store, then fall into the loop.
1203 mov tmp1, N_val
1204 and tmp1, (7 << 4) ; tmp1 = (block count mod 8) << 4, i.e. byte count of the leading partial group
1205 jz _initial_num_blocks_is_0
1206 
1207 cmp tmp1, (4 << 4)
1208 je _initial_num_blocks_is_4
1209 
1210 
1211 
1212 cmp tmp1, (6 << 4)
1213 je _initial_num_blocks_is_6
1214 
1215 cmp tmp1, (5 << 4)
1216 je _initial_num_blocks_is_5
1217 
1218 
1219 
1220 cmp tmp1, (3 << 4)
1221 je _initial_num_blocks_is_3
1222 
1223 cmp tmp1, (2 << 4)
1224 je _initial_num_blocks_is_2
1225 
1226 cmp tmp1, (1 << 4)
1227 je _initial_num_blocks_is_1
1228 
1229 _initial_num_blocks_is_7:
1230 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
1231 add ptr_plaintext, 16*7
1232 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
1233 ; store ciphertext
1234 movdqu [ptr_ciphertext+16*0], xmm1
1235 movdqu [ptr_ciphertext+16*1], xmm2
1236 movdqu [ptr_ciphertext+16*2], xmm3
1237 movdqu [ptr_ciphertext+16*3], xmm4
1238 movdqu [ptr_ciphertext+16*4], xmm5
1239 movdqu [ptr_ciphertext+16*5], xmm6
1240 movdqu [ptr_ciphertext+16*6], xmm7
1241 add ptr_ciphertext, 16*7
1242 
1243 cmp ptr_ciphertext, target_ptr_val
1244 je _last_eight ; exactly 8 whole blocks remain: finish without stitched tweak gen
1245 
1246 jmp _main_loop
1247 _initial_num_blocks_is_6:
1248 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
1249 add ptr_plaintext, 16*6
1250 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
1251 ; store ciphertext
1252 movdqu [ptr_ciphertext+16*0], xmm1
1253 movdqu [ptr_ciphertext+16*1], xmm2
1254 movdqu [ptr_ciphertext+16*2], xmm3
1255 movdqu [ptr_ciphertext+16*3], xmm4
1256 movdqu [ptr_ciphertext+16*4], xmm5
1257 movdqu [ptr_ciphertext+16*5], xmm6
1258 add ptr_ciphertext, 16*6
1259 
1260 cmp ptr_ciphertext, target_ptr_val
1261 je _last_eight
1262 
1263 jmp _main_loop
1264 _initial_num_blocks_is_5:
1265 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
1266 add ptr_plaintext, 16*5
1267 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
1268 ; store ciphertext
1269 movdqu [ptr_ciphertext+16*0], xmm1
1270 movdqu [ptr_ciphertext+16*1], xmm2
1271 movdqu [ptr_ciphertext+16*2], xmm3
1272 movdqu [ptr_ciphertext+16*3], xmm4
1273 movdqu [ptr_ciphertext+16*4], xmm5
1274 add ptr_ciphertext, 16*5
1275 
1276 cmp ptr_ciphertext, target_ptr_val
1277 je _last_eight
1278 
1279 jmp _main_loop
1280 _initial_num_blocks_is_4:
1281 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
1282 add ptr_plaintext, 16*4
1283 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
1284 ; store ciphertext
1285 movdqu [ptr_ciphertext+16*0], xmm1
1286 movdqu [ptr_ciphertext+16*1], xmm2
1287 movdqu [ptr_ciphertext+16*2], xmm3
1288 movdqu [ptr_ciphertext+16*3], xmm4
1289 add ptr_ciphertext, 16*4
1290 
1291 cmp ptr_ciphertext, target_ptr_val
1292 je _last_eight
1293 
1294 jmp _main_loop
1295 
1296 
1297 _initial_num_blocks_is_3:
1298 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
1299 add ptr_plaintext, 16*3
1300 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
1301 ; store ciphertext
1302 movdqu [ptr_ciphertext+16*0], xmm1
1303 movdqu [ptr_ciphertext+16*1], xmm2
1304 movdqu [ptr_ciphertext+16*2], xmm3
1305 add ptr_ciphertext, 16*3
1306 
1307 cmp ptr_ciphertext, target_ptr_val
1308 je _last_eight
1309 
1310 jmp _main_loop
1311 _initial_num_blocks_is_2:
1312 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
1313 add ptr_plaintext, 16*2
1314 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
1315 ; store ciphertext
1316 movdqu [ptr_ciphertext], xmm1
1317 movdqu [ptr_ciphertext+16], xmm2
1318 add ptr_ciphertext, 16*2
1319 
1320 cmp ptr_ciphertext, target_ptr_val
1321 je _last_eight
1322 
1323 jmp _main_loop
1324 
1325 _initial_num_blocks_is_1:
1326 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
1327 add ptr_plaintext, 16*1
1328 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
1329 ; store ciphertext
1330 movdqu [ptr_ciphertext], xmm1
1331 add ptr_ciphertext, 16
1332 
1333 cmp ptr_ciphertext, target_ptr_val
1334 je _last_eight
1335 
1336 jmp _main_loop
1337 
1338 _initial_num_blocks_is_0:
; No leading partial group: just precompute tweaks 2..8 from tweak 1.
; Each 4-instruction group (xor/shl/adc/cmovc/xor) multiplies the
; 128-bit tweak by x in GF(2^128), reducing with 0x87 (ghash_poly_8b):
; shl/adc shift the low/high qwords with carry propagation; on carry-out
; the polynomial is XORed into the low qword. Tweaks 1..7 are loaded
; into xmm9-xmm15; tweak 8 stays in memory at [TW+16*7] (no xmm16 in SSE).
1339 mov twtempl, [TW+8*0]
1340 mov twtemph, [TW+8*1]
1341 movdqa xmm9, [TW+16*0] ; tweak 1
1342 
1343 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1344 shl twtempl, 1
1345 adc twtemph, twtemph
1346 cmovc ghash_poly_8b_temp, ghash_poly_8b
1347 xor twtempl, ghash_poly_8b_temp
1348 mov [TW+8*2], twtempl
1349 mov [TW+8*3], twtemph
1350 movdqa xmm10, [TW+16*1] ; tweak 2
1351 
1352 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1353 shl twtempl, 1
1354 adc twtemph, twtemph
1355 cmovc ghash_poly_8b_temp, ghash_poly_8b
1356 xor twtempl, ghash_poly_8b_temp
1357 mov [TW+8*4], twtempl
1358 mov [TW+8*5], twtemph
1359 movdqa xmm11, [TW+16*2] ; tweak 3
1360 
1361 
1362 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1363 shl twtempl, 1
1364 adc twtemph, twtemph
1365 cmovc ghash_poly_8b_temp, ghash_poly_8b
1366 xor twtempl, ghash_poly_8b_temp
1367 mov [TW+8*6], twtempl
1368 mov [TW+8*7], twtemph
1369 movdqa xmm12, [TW+16*3] ; tweak 4
1370 
1371 
1372 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1373 shl twtempl, 1
1374 adc twtemph, twtemph
1375 cmovc ghash_poly_8b_temp, ghash_poly_8b
1376 xor twtempl, ghash_poly_8b_temp
1377 mov [TW+8*8], twtempl
1378 mov [TW+8*9], twtemph
1379 movdqa xmm13, [TW+16*4] ; tweak 5
1380 
1381 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1382 shl twtempl, 1
1383 adc twtemph, twtemph
1384 cmovc ghash_poly_8b_temp, ghash_poly_8b
1385 xor twtempl, ghash_poly_8b_temp
1386 mov [TW+8*10], twtempl
1387 mov [TW+8*11], twtemph
1388 movdqa xmm14, [TW+16*5] ; tweak 6
1389 
1390 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1391 shl twtempl, 1
1392 adc twtemph, twtemph
1393 cmovc ghash_poly_8b_temp, ghash_poly_8b
1394 xor twtempl, ghash_poly_8b_temp
1395 mov [TW+8*12], twtempl
1396 mov [TW+8*13], twtemph
1397 movdqa xmm15, [TW+16*6] ; tweak 7
1398 
1399 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1400 shl twtempl, 1
1401 adc twtemph, twtemph
1402 cmovc ghash_poly_8b_temp, ghash_poly_8b
1403 xor twtempl, ghash_poly_8b_temp
1404 mov [TW+8*14], twtempl
1405 mov [TW+8*15], twtemph
1406 ;movdqa xmm16, [TW+16*7]
1407 
1408 cmp ptr_ciphertext, target_ptr_val ; zero whole blocks beyond the last 8?
1409 je _last_eight
1410 _main_loop:
; Bulk loop: 8 blocks per iteration. encrypt_by_eight (macro defined
; earlier) stitches the next iteration's tweak generation into the AES
; rounds (last arg 0 = keep generating tweaks).
1411 ; load plaintext
1412 movdqu xmm1, [ptr_plaintext+16*0]
1413 movdqu xmm2, [ptr_plaintext+16*1]
1414 movdqu xmm3, [ptr_plaintext+16*2]
1415 movdqu xmm4, [ptr_plaintext+16*3]
1416 movdqu xmm5, [ptr_plaintext+16*4]
1417 movdqu xmm6, [ptr_plaintext+16*5]
1418 movdqu xmm7, [ptr_plaintext+16*6]
1419 movdqu xmm8, [ptr_plaintext+16*7]
1420 
1421 add ptr_plaintext, 128
1422 
1423 encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
1424 
1425 ; store ciphertext
1426 movdqu [ptr_ciphertext+16*0], xmm1
1427 movdqu [ptr_ciphertext+16*1], xmm2
1428 movdqu [ptr_ciphertext+16*2], xmm3
1429 movdqu [ptr_ciphertext+16*3], xmm4
1430 movdqu [ptr_ciphertext+16*4], xmm5
1431 movdqu [ptr_ciphertext+16*5], xmm6
1432 movdqu [ptr_ciphertext+16*6], xmm7
1433 movdqu [ptr_ciphertext+16*7], xmm8
1434 add ptr_ciphertext, 128
1435 
1436 cmp ptr_ciphertext, target_ptr_val ; stop when only the final 8 blocks remain
1437 jne _main_loop
1438 
1439 _last_eight:
; Final 8 whole blocks; last arg 1 = no further tweak generation stitched in.
1440 ; load plaintext
1441 movdqu xmm1, [ptr_plaintext+16*0]
1442 movdqu xmm2, [ptr_plaintext+16*1]
1443 movdqu xmm3, [ptr_plaintext+16*2]
1444 movdqu xmm4, [ptr_plaintext+16*3]
1445 movdqu xmm5, [ptr_plaintext+16*4]
1446 movdqu xmm6, [ptr_plaintext+16*5]
1447 movdqu xmm7, [ptr_plaintext+16*6]
1448 movdqu xmm8, [ptr_plaintext+16*7]
1449 encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
1450 
1451 ; store ciphertext (only 7 blocks; the 8th stays in xmm8 in case
1452 ; ciphertext stealing must fold the partial tail into it; it is
1453 ; written at _done)
1452 movdqu [ptr_ciphertext+16*0], xmm1
1453 movdqu [ptr_ciphertext+16*1], xmm2
1454 movdqu [ptr_ciphertext+16*2], xmm3
1455 movdqu [ptr_ciphertext+16*3], xmm4
1456 movdqu [ptr_ciphertext+16*4], xmm5
1457 movdqu [ptr_ciphertext+16*5], xmm6
1458 movdqu [ptr_ciphertext+16*6], xmm7
1459 
1460 
1461 and N_val, 15 ; N_val = N_val mod 16
1462 je _done ; no partial tail block: skip ciphertext stealing
1463 _steal_cipher:
1464 ; start cipher stealing (XTS tail handling for a final partial block
1465 ; of N_val bytes: the last full ciphertext block in xmm8 donates its
1466 ; trailing bytes to complete the partial block)
1467 
1466 ; generate next Tweak value (tweak *= x in GF(2^128), poly 0x87)
1467 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1468 shl twtempl, 1
1469 adc twtemph, twtemph
1470 cmovc ghash_poly_8b_temp, ghash_poly_8b
1471 xor twtempl, ghash_poly_8b_temp
1472 mov [TW], twtempl
1473 mov [TW + 8], twtemph
1474 
1475 movdqa xmm2, xmm8 ; keep an unshifted copy of the last full ciphertext block
1476 
1477 ; shift xmm8 to the left by 16-N_val bytes
1478 lea twtempl, [pshufb_shf_table]
1479 movdqu xmm0, [twtempl+N_val]
1480 pshufb xmm8, xmm0
1481 
1482 
1483 movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
1484 movdqu [ptr_ciphertext + 112 + N_val], xmm8 ; C_last = first N_val bytes of the stolen block
1485 
1486 ; shift xmm3 to the right by 16-N_val bytes
1487 lea twtempl, [pshufb_shf_table +16]
1488 sub twtempl, N_val
1489 movdqu xmm0, [twtempl]
1490 pxor xmm0, [mask1] ; flip the pshufb "zero byte" bit (0x80) in the shuffle control
1491 pshufb xmm3, xmm0
1492 
1493 pblendvb xmm3, xmm2 ;xmm0 is implicit blend mask: merge tail plaintext with stolen ciphertext bytes
1494 
1495 ; xor Tweak value
1496 movdqa xmm8, [TW]
1497 pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
1498 
1499 
1500 ;encrypt last block with cipher stealing (full 14-round AES-256)
1501 pxor xmm8, [keys] ; ARK
1502 aesenc xmm8, [keys + 16*1] ; round 1
1503 aesenc xmm8, [keys + 16*2] ; round 2
1504 aesenc xmm8, [keys + 16*3] ; round 3
1505 aesenc xmm8, [keys + 16*4] ; round 4
1506 aesenc xmm8, [keys + 16*5] ; round 5
1507 aesenc xmm8, [keys + 16*6] ; round 6
1508 aesenc xmm8, [keys + 16*7] ; round 7
1509 aesenc xmm8, [keys + 16*8] ; round 8
1510 aesenc xmm8, [keys + 16*9] ; round 9
1511 aesenc xmm8, [keys + 16*10] ; round 10
1512 aesenc xmm8, [keys + 16*11] ; round 11
1513 aesenc xmm8, [keys + 16*12] ; round 12
1514 aesenc xmm8, [keys + 16*13] ; round 13
1515 aesenclast xmm8, [keys + 16*14] ; round 14
1516 
1517 ; xor Tweak value
1518 pxor xmm8, [TW]
1519 
1520 _done:
1521 ; store last ciphertext value (either the plain 8th block from
1522 ; _last_eight or the stolen-cipher block from _steal_cipher)
1522 movdqu [ptr_ciphertext+16*7], xmm8
1523 
1524 _ret_:
1525 
; Restore callee-saved registers and tear down the local frame.
1526 mov rbx, [_gpr + 8*0]
1527 %ifidn __OUTPUT_FORMAT__, win64
1528 mov rdi, [_gpr + 8*1]
1529 mov rsi, [_gpr + 8*2]
1530 
1531 
1532 movdqa xmm6, [_xmm + 16*0]
1533 movdqa xmm7, [_xmm + 16*1]
1534 movdqa xmm8, [_xmm + 16*2]
1535 movdqa xmm9, [_xmm + 16*3]
1536 movdqa xmm10, [_xmm + 16*4]
1537 movdqa xmm11, [_xmm + 16*5]
1538 movdqa xmm12, [_xmm + 16*6]
1539 movdqa xmm13, [_xmm + 16*7]
1540 movdqa xmm14, [_xmm + 16*8]
1541 movdqa xmm15, [_xmm + 16*9]
1542 %endif
1543 
1544 add rsp, VARIABLE_OFFSET
1545 
1546 ret
1547 
1548 
1549 
1550 
1551
1552 _less_than_128_bytes:
; Short path: total input is 16..127 bytes (1..7 whole blocks plus an
; optional partial tail). Each handler below encrypts its n blocks,
; stores n-1 of them, keeps the last full ciphertext block in xmm8 for
; possible ciphertext stealing, and rewinds both pointers by 16*(8-n)
; so the generic _steal_cipher/_done offsets (which assume 8 blocks,
; i.e. +112 / +16*7) land on the right addresses.
1553 cmp N_val, 16
1554 jb _ret_ ; inputs shorter than one AES block are unsupported: return with no output
1555 
1556 mov tmp1, N_val
1557 and tmp1, (7 << 4) ; whole-block count (1..7) << 4; falls through to 7
1558 cmp tmp1, (6 << 4)
1559 je _num_blocks_is_6
1560 cmp tmp1, (5 << 4)
1561 je _num_blocks_is_5
1562 cmp tmp1, (4 << 4)
1563 je _num_blocks_is_4
1564 cmp tmp1, (3 << 4)
1565 je _num_blocks_is_3
1566 cmp tmp1, (2 << 4)
1567 je _num_blocks_is_2
1568 cmp tmp1, (1 << 4)
1569 je _num_blocks_is_1
1570 
1571 _num_blocks_is_7:
1572 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
1573 sub ptr_plaintext, 16*1 ; rewind so ptr_plaintext+112+N_val addresses the real tail
1574 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
1575 ; store ciphertext (first 6 blocks; the 7th is held in xmm8 below)
1576 movdqu [ptr_ciphertext+16*0], xmm1
1577 movdqu [ptr_ciphertext+16*1], xmm2
1578 movdqu [ptr_ciphertext+16*2], xmm3
1579 movdqu [ptr_ciphertext+16*3], xmm4
1580 movdqu [ptr_ciphertext+16*4], xmm5
1581 movdqu [ptr_ciphertext+16*5], xmm6
1582 
1583 sub ptr_ciphertext, 16*1 ; rewind so [ptr_ciphertext+16*7] is block 7's slot
1584 movdqa xmm8, xmm7 ; last full ciphertext block -> xmm8 (steal candidate)
1585 
1586 and N_val, 15 ; N_val = N_val mod 16
1587 je _done ; no partial tail
1588 jmp _steal_cipher
1589 _num_blocks_is_6:
1590 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
1591 sub ptr_plaintext, 16*2
1592 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
1593 ; store ciphertext
1594 movdqu [ptr_ciphertext+16*0], xmm1
1595 movdqu [ptr_ciphertext+16*1], xmm2
1596 movdqu [ptr_ciphertext+16*2], xmm3
1597 movdqu [ptr_ciphertext+16*3], xmm4
1598 movdqu [ptr_ciphertext+16*4], xmm5
1599 
1600 sub ptr_ciphertext, 16*2
1601 movdqa xmm8, xmm6
1602 
1603 and N_val, 15 ; N_val = N_val mod 16
1604 je _done
1605 jmp _steal_cipher
1606 _num_blocks_is_5:
1607 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
1608 sub ptr_plaintext, 16*3
1609 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
1610 ; store ciphertext
1611 movdqu [ptr_ciphertext+16*0], xmm1
1612 movdqu [ptr_ciphertext+16*1], xmm2
1613 movdqu [ptr_ciphertext+16*2], xmm3
1614 movdqu [ptr_ciphertext+16*3], xmm4
1615 
1616 sub ptr_ciphertext, 16*3
1617 movdqa xmm8, xmm5
1618 
1619 and N_val, 15 ; N_val = N_val mod 16
1620 je _done
1621 jmp _steal_cipher
1622 _num_blocks_is_4:
1623 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
1624 sub ptr_plaintext, 16*4
1625 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
1626 ; store ciphertext
1627 movdqu [ptr_ciphertext+16*0], xmm1
1628 movdqu [ptr_ciphertext+16*1], xmm2
1629 movdqu [ptr_ciphertext+16*2], xmm3
1630 
1631 sub ptr_ciphertext, 16*4
1632 movdqa xmm8, xmm4
1633 
1634 and N_val, 15 ; N_val = N_val mod 16
1635 je _done
1636 jmp _steal_cipher
1637 _num_blocks_is_3:
1638 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
1639 sub ptr_plaintext, 16*5
1640 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
1641 ; store ciphertext
1642 movdqu [ptr_ciphertext+16*0], xmm1
1643 movdqu [ptr_ciphertext+16*1], xmm2
1644 
1645 sub ptr_ciphertext, 16*5
1646 movdqa xmm8, xmm3
1647 
1648 and N_val, 15 ; N_val = N_val mod 16
1649 je _done
1650 jmp _steal_cipher
1651 
1652 _num_blocks_is_2:
1653 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
1654 sub ptr_plaintext, 16*6
1655 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
1656 ; store ciphertext
1657 movdqu [ptr_ciphertext], xmm1
1658 
1659 sub ptr_ciphertext, 16*6
1660 movdqa xmm8, xmm2
1661 
1662 and N_val, 15 ; N_val = N_val mod 16
1663 je _done
1664 jmp _steal_cipher
1665 
1666 
1667 _num_blocks_is_1:
1668 initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
1669 
1670 sub ptr_plaintext, 16*7
1671 encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
1672 ; store ciphertext (none yet: the single block is held in xmm8 and
1673 ; written at _done or merged by _steal_cipher)
1674 sub ptr_ciphertext, 16*7
1675 movdqa xmm8, xmm1
1676 
1677 and N_val, 15 ; N_val = N_val mod 16
1678 je _done
1679 jmp _steal_cipher
1680 
1681 section .data
1682 align 16
1683 
1684 pshufb_shf_table:
1685 ; use these values for shift constants for the pshufb instruction
1686 ; different alignments result in values as shown:
1687 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
1688 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
1689 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
1690 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
1691 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
1692 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
1693 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
1694 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
1695 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
1696 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
1697 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
1698 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
1699 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
1700 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
1701 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; Indexing [pshufb_shf_table + N] (N = tail length) yields the byte-shift
; control above; 0x80 in a control byte makes pshufb zero that lane.
1702 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
1703 dq 0x0706050403020100, 0x000e0d0c0b0a0908
1704 
1705 mask1:
; 0x80 in every byte: XORed into a shuffle control to flip pshufb's
; "zero this byte" bit, and doubles as the pblendvb select mask source.
1706 dq 0x8080808080808080, 0x8080808080808080
1707 
1707