]> git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
3904c8a545026cb68930d126ff15b605ee290080
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / aes / XTS_AES_256_dec_sse.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 ; XTS decrypt function with 256-bit AES
30 ; input keys are not aligned
31 ; keys are expanded in parallel with the tweak encryption
32 ; plaintext and ciphertext are not aligned
33 ; second key is stored in the stack as aligned to 16 Bytes
34 ; first key is required only once, no need for storage of this key
35
36 %include "reg_sizes.asm"
37
38 default rel
;; Stack frame layout, relative to rsp after "sub rsp, VARIABLE_OFFSET":
;;   [rsp + 0      .. +16*8 )  TW    : 8 tweak values (one 16B tweak per block)
;;   [rsp + 16*8   .. +16*23)  keys  : 15 expanded AES-256 round keys
;;   win64 only: [rsp + 16*23 .. +16*33)  _xmm : saved xmm6-xmm15 (callee-saved on Win64)
;;   _gpr : saved callee-saved GPRs (rbx; plus rdi/rsi on win64)
39 %define TW rsp ; store 8 tweak values
40 %define keys rsp + 16*8 ; store 15 expanded keys
41
42 %ifidn __OUTPUT_FORMAT__, win64
43 %define _xmm rsp + 16*23 ; store xmm6:xmm15
44 %endif
45
46 %ifidn __OUTPUT_FORMAT__, elf64
47 %define _gpr rsp + 16*23 ; store rbx
48 %define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
49 %else
50 %define _gpr rsp + 16*33 ; store rdi, rsi, rbx
51 %define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
52 %endif
53
;; Low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1;
;; xor-ed into the low qword when the tweak shift-left carries out of bit 127.
54 %define GHASH_POLY 0x87
55
56 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
57 ;void XTS_AES_256_dec_sse(
58 ; UINT8 *k2, // key used for tweaking, 16*2 bytes
59 ; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
60 ; UINT8 *TW_initial, // initial tweak value, 16 bytes
61 ; UINT64 N, // sector size, in bytes
62 ; const UINT8 *ct, // ciphertext sector input data
63 ; UINT8 *pt); // plaintext sector output data
64 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
65
66 ; arguments for input parameters
;; NOTE: ptr_plaintext is bound to the 5th argument (ct) and ptr_ciphertext to
;; the 6th (pt) — in this DEcrypt routine the "plaintext" pointer is the input
;; (ciphertext) and the "ciphertext" pointer is the output, keeping the register
;; assignment identical to the encrypt variant.
67 %ifidn __OUTPUT_FORMAT__, elf64
68 %xdefine ptr_key2 rdi
69 %xdefine ptr_key1 rsi
70 %xdefine T_val rdx
71 %xdefine N_val rcx
72 %xdefine ptr_plaintext r8
73 %xdefine ptr_ciphertext r9
74 %else
75 %xdefine ptr_key2 rcx
76 %xdefine ptr_key1 rdx
77 %xdefine T_val r8
78 %xdefine N_val r9
79 %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
80 %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
81 %endif
82
83 ; arguments for temp parameters
84 %ifidn __OUTPUT_FORMAT__, elf64
85 %define tmp1 rdi
86 %define target_ptr_val rsi
87 %define ghash_poly_8b r10
88 %define ghash_poly_8b_temp r11
89 %else
90 %define tmp1 rcx
91 %define target_ptr_val rdx
92 %define ghash_poly_8b rdi
93 %define ghash_poly_8b_temp rsi
94 %endif
95
;; twtempl:twtemph hold the low/high 64 bits of the tweak currently being
;; multiplied by alpha; they live in rax/rbx across the whole function.
96 %define twtempl rax ; global temp registers used for tweak computation
97 %define twtemph rbx
98
99
100 ; produce the key for the next round
101 ; raw_key is the output of aeskeygenassist instruction
102 ; round_key value before this key_expansion_256 macro is current round key
103 ; round_key value after this key_expansion_256 macro is next round key
104 ; 2 macros will be used for key generation in a flip-flopped fashion
;; AES-256 key expansion, "flip" half: produces the next even-indexed round key.
;; Inputs:  %%xraw_key  = aeskeygenassist output, %%xround_key = round key N-2
;; Output:  %%xround_key = round key N;  %%xtmp is clobbered scratch.
105 %macro key_expansion_256_flip 3
106 %define %%xraw_key %1
107 %define %%xtmp %2
108 %define %%xround_key %3
; broadcast dword 3 of the keygenassist result (SubWord+RotWord+Rcon term)
109 pshufd %%xraw_key, %%xraw_key, 11111111b
; the two shufps/pxor pairs accumulate the sliding xor of the four key words
110 shufps %%xtmp, %%xround_key, 00010000b
111 pxor %%xround_key, %%xtmp
112 shufps %%xtmp, %%xround_key, 10001100b
113 pxor %%xround_key, %%xtmp
114 pxor %%xround_key, %%xraw_key
115 %endmacro
116
;; AES-256 key expansion, "flop" half: produces the next odd-indexed round key.
;; Identical to the flip half except it broadcasts dword 2 of the keygenassist
;; output (SubWord only, no rotate/Rcon — the AES-256 intermediate step).
117 %macro key_expansion_256_flop 3
118 %define %%xraw_key %1
119 %define %%xtmp %2
120 %define %%xround_key %3
; select dword 2 (SubWord result) instead of dword 3
121 pshufd %%xraw_key, %%xraw_key, 10101010b
122 shufps %%xtmp, %%xround_key, 00010000b
123 pxor %%xround_key, %%xtmp
124 shufps %%xtmp, %%xround_key, 10001100b
125 pxor %%xround_key, %%xtmp
126 pxor %%xround_key, %%xraw_key
127 %endmacro
128
129 ; macro to encrypt the tweak value in parallel with key generation of both keys
130
;; Encrypt the initial tweak with key2 (14 aesenc rounds) while, in parallel,
;; expanding both AES-256 key schedules on the fly.
;; key1's schedule is stored for DEcryption (aesdec / Equivalent Inverse
;; Cipher): each middle round key passes through aesimc, and the keys are laid
;; out in REVERSE order — slot 16*14 holds the raw first key (used by
;; aesdeclast), slot 16*0 holds the raw last round key (used for the initial
;; ARK xor). The encrypted tweak is stored to [TW] at the end.
131 %macro encrypt_T 11
132 %define %%xkey2 %1
133 %define %%xkey2_2 %2
134 %define %%xstate_tweak %3
135 %define %%xkey1 %4
136 %define %%xkey1_2 %5
137 %define %%xraw_key %6
138 %define %%xtmp %7
139 %define %%xtmp2 %8
140 %define %%ptr_key2 %9
141 %define %%ptr_key1 %10
142 %define %%ptr_expanded_keys %11
143
144
145 movdqu %%xkey2, [%%ptr_key2]
146 pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
147
; first 16 bytes of key1 are the round-0 key: stored raw, no aesimc
148 movdqu %%xkey1, [%%ptr_key1]
149 movdqa [%%ptr_expanded_keys+16*14], %%xkey1
150
151 movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
152 aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
153
154 movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
155 aesimc %%xtmp2, %%xkey1_2
156 movdqa [%%ptr_expanded_keys+16*13], %%xtmp2
157
158
159
160
161 aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
162 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
163 aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
164 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
165 aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
166 aesimc %%xtmp2, %%xkey1
167 movdqa [%%ptr_expanded_keys+16*12], %%xtmp2
168
169 aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
170 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
171 aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
172 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
173 aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
174 aesimc %%xtmp2, %%xkey1_2
175 movdqa [%%ptr_expanded_keys+16*11], %%xtmp2
176
177
178
179 aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
180 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
181 aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
182 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
183 aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
184 aesimc %%xtmp2, %%xkey1
185 movdqa [%%ptr_expanded_keys+16*10], %%xtmp2
186
187 aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
188 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
189 aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
190 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
191 aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
192 aesimc %%xtmp2, %%xkey1_2
193 movdqa [%%ptr_expanded_keys+16*9], %%xtmp2
194
195
196
197 aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
198 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
199 aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
200 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
201 aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
202 aesimc %%xtmp2, %%xkey1
203 movdqa [%%ptr_expanded_keys+16*8], %%xtmp2
204
205 aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
206 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
207 aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
208 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
209 aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
210 aesimc %%xtmp2, %%xkey1_2
211 movdqa [%%ptr_expanded_keys+16*7], %%xtmp2
212
213
214 aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
215 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
216 aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
217 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
218 aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
219 aesimc %%xtmp2, %%xkey1
220 movdqa [%%ptr_expanded_keys+16*6], %%xtmp2
221
222 aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
223 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
224 aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
225 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
226 aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
227 aesimc %%xtmp2, %%xkey1_2
228 movdqa [%%ptr_expanded_keys+16*5], %%xtmp2
229
230
231 aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
232 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
233 aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
234 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
235 aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
236 aesimc %%xtmp2, %%xkey1
237 movdqa [%%ptr_expanded_keys+16*4], %%xtmp2
238
239 aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
240 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
241 aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
242 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
243 aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
244 aesimc %%xtmp2, %%xkey1_2
245 movdqa [%%ptr_expanded_keys+16*3], %%xtmp2
246
247
248 aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
249 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
250 aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
251 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
252 aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
253 aesimc %%xtmp2, %%xkey1
254 movdqa [%%ptr_expanded_keys+16*2], %%xtmp2
255
256 aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
257 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
258 aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
259 key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
260 aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
261 aesimc %%xtmp2, %%xkey1_2
262 movdqa [%%ptr_expanded_keys+16*1], %%xtmp2
263
264
265 aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
266 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
267 aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
268 key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
269 aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
; last round key is used for the decrypt ARK: stored raw, no aesimc
270 movdqa [%%ptr_expanded_keys+16*0], %%xkey1
271
272 movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
273 %endmacro
274
275
276 ; generate initial tweak values
277 ; load initial plaintext values
;; Load the first %%num_initial_blocks input blocks and generate the matching
;; tweaks. Tweak i+1 is tweak i multiplied by alpha in GF(2^128): shift the
;; 128-bit value left by 1 (shl low / adc high) and, if bit 127 carried out,
;; xor GHASH_POLY (0x87) into the low qword (selected branch-free via cmovc).
;; Each new tweak is written to the TW stack area and re-read as an xmm.
;; Note: ptr_plaintext is the INPUT pointer (ciphertext in this decrypt file).
278 %macro initialize 16
279
280 %define %%ST1 %1 ; state 1
281 %define %%ST2 %2 ; state 2
282 %define %%ST3 %3 ; state 3
283 %define %%ST4 %4 ; state 4
284 %define %%ST5 %5 ; state 5
285 %define %%ST6 %6 ; state 6
286 %define %%ST7 %7 ; state 7
287 %define %%ST8 %8 ; state 8
288
289 %define %%TW1 %9 ; tweak 1
290 %define %%TW2 %10 ; tweak 2
291 %define %%TW3 %11 ; tweak 3
292 %define %%TW4 %12 ; tweak 4
293 %define %%TW5 %13 ; tweak 5
294 %define %%TW6 %14 ; tweak 6
295 %define %%TW7 %15 ; tweak 7
296
297 %define %%num_initial_blocks %16
298
299
300 ; generate next Tweak values
301 movdqa %%TW1, [TW+16*0]
302 mov twtempl, [TW+8*0]
303 mov twtemph, [TW+8*1]
304 movdqu %%ST1, [ptr_plaintext+16*0]
305 %if (%%num_initial_blocks>=2)
306 xor ghash_poly_8b_temp, ghash_poly_8b_temp
307 shl twtempl, 1
308 adc twtemph, twtemph
; CF here is bit 127 of the previous tweak; select 0x87 or 0 without a branch
309 cmovc ghash_poly_8b_temp, ghash_poly_8b
310 xor twtempl, ghash_poly_8b_temp
311 mov [TW+8*2], twtempl
312 mov [TW+8*3], twtemph;
313 movdqa %%TW2, [TW+16*1]
314 movdqu %%ST2, [ptr_plaintext+16*1]
315 %endif
316 %if (%%num_initial_blocks>=3)
317 xor ghash_poly_8b_temp, ghash_poly_8b_temp
318 shl twtempl, 1
319 adc twtemph, twtemph
320 cmovc ghash_poly_8b_temp, ghash_poly_8b
321 xor twtempl, ghash_poly_8b_temp
322 mov [TW+8*4], twtempl
323 mov [TW+8*5], twtemph;
324 movdqa %%TW3, [TW+16*2]
325 movdqu %%ST3, [ptr_plaintext+16*2]
326 %endif
327 %if (%%num_initial_blocks>=4)
328 xor ghash_poly_8b_temp, ghash_poly_8b_temp
329 shl twtempl, 1
330 adc twtemph, twtemph
331 cmovc ghash_poly_8b_temp, ghash_poly_8b
332 xor twtempl, ghash_poly_8b_temp
333 mov [TW+8*6], twtempl
334 mov [TW+8*7], twtemph;
335 movdqa %%TW4, [TW+16*3]
336 movdqu %%ST4, [ptr_plaintext+16*3]
337 %endif
338 %if (%%num_initial_blocks>=5)
339 xor ghash_poly_8b_temp, ghash_poly_8b_temp
340 shl twtempl, 1
341 adc twtemph, twtemph
342 cmovc ghash_poly_8b_temp, ghash_poly_8b
343 xor twtempl, ghash_poly_8b_temp
344 mov [TW+8*8], twtempl
345 mov [TW+8*9], twtemph;
346 movdqa %%TW5, [TW+16*4]
347 movdqu %%ST5, [ptr_plaintext+16*4]
348 %endif
349 %if (%%num_initial_blocks>=6)
350 xor ghash_poly_8b_temp, ghash_poly_8b_temp
351 shl twtempl, 1
352 adc twtemph, twtemph
353 cmovc ghash_poly_8b_temp, ghash_poly_8b
354 xor twtempl, ghash_poly_8b_temp
355 mov [TW+8*10], twtempl
356 mov [TW+8*11], twtemph;
357 movdqa %%TW6, [TW+16*5]
358 movdqu %%ST6, [ptr_plaintext+16*5]
359 %endif
360 %if (%%num_initial_blocks>=7)
361 xor ghash_poly_8b_temp, ghash_poly_8b_temp
362 shl twtempl, 1
363 adc twtemph, twtemph
364 cmovc ghash_poly_8b_temp, ghash_poly_8b
365 xor twtempl, ghash_poly_8b_temp
366 mov [TW+8*12], twtempl
367 mov [TW+8*13], twtemph;
368 movdqa %%TW7, [TW+16*6]
369 movdqu %%ST7, [ptr_plaintext+16*6]
370 %endif
371
372
373
374 %endmacro
375
376
377 ; decrypt initial blocks of AES (macro keeps the name encrypt_initial for
; symmetry with the encrypt variant of this file)
378 ; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
379 ; next 8 Tweak values are generated
;; Decrypt %%num_blocks (1..7) blocks: xor tweak, ARK with [keys+16*0]
;; (the last encryption round key), aesdec rounds 1..13 with the aesimc-
;; transformed keys, aesdeclast with [keys+16*14], then xor the tweak again.
;; When %%lt128 == 0 the computation of the NEXT 8 tweaks is interleaved
;; between AES rounds to hide latency. CAUTION: a single GF multiply
;; (shl/adc/cmovc/xor) is deliberately split across several "%if (0 == %%lt128)"
;; sections, so the carry flag produced in one section is consumed in the next;
;; the aesdec/movdqa instructions in between do not touch flags. Do not reorder.
380 %macro encrypt_initial 18
381 %define %%ST1 %1 ; state 1
382 %define %%ST2 %2 ; state 2
383 %define %%ST3 %3 ; state 3
384 %define %%ST4 %4 ; state 4
385 %define %%ST5 %5 ; state 5
386 %define %%ST6 %6 ; state 6
387 %define %%ST7 %7 ; state 7
388 %define %%ST8 %8 ; state 8
389
390 %define %%TW1 %9 ; tweak 1
391 %define %%TW2 %10 ; tweak 2
392 %define %%TW3 %11 ; tweak 3
393 %define %%TW4 %12 ; tweak 4
394 %define %%TW5 %13 ; tweak 5
395 %define %%TW6 %14 ; tweak 6
396 %define %%TW7 %15 ; tweak 7
397 %define %%T0 %16 ; Temp register
398 %define %%num_blocks %17
399 ; %%num_blocks blocks encrypted
400 ; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
401
402 %define %%lt128 %18 ; less than 128 bytes
403
404 ; xor Tweak value
405 pxor %%ST1, %%TW1
406 %if (%%num_blocks>=2)
407 pxor %%ST2, %%TW2
408 %endif
409 %if (%%num_blocks>=3)
410 pxor %%ST3, %%TW3
411 %endif
412 %if (%%num_blocks>=4)
413 pxor %%ST4, %%TW4
414 %endif
415 %if (%%num_blocks>=5)
416 pxor %%ST5, %%TW5
417 %endif
418 %if (%%num_blocks>=6)
419 pxor %%ST6, %%TW6
420 %endif
421 %if (%%num_blocks>=7)
422 pxor %%ST7, %%TW7
423 %endif
424
425
426 ; ARK
427 movdqa %%T0, [keys]
428 pxor %%ST1, %%T0
429 %if (%%num_blocks>=2)
430 pxor %%ST2, %%T0
431 %endif
432 %if (%%num_blocks>=3)
433 pxor %%ST3, %%T0
434 %endif
435 %if (%%num_blocks>=4)
436 pxor %%ST4, %%T0
437 %endif
438 %if (%%num_blocks>=5)
439 pxor %%ST5, %%T0
440 %endif
441 %if (%%num_blocks>=6)
442 pxor %%ST6, %%T0
443 %endif
444 %if (%%num_blocks>=7)
445 pxor %%ST7, %%T0
446 %endif
447
448
; begin GF multiply for next Tweak1; CF from this adc is consumed after round 1
449 %if (0 == %%lt128)
450 xor ghash_poly_8b_temp, ghash_poly_8b_temp
451 shl twtempl, 1
452 adc twtemph, twtemph
453 %endif
454
455 ; round 1
456 movdqa %%T0, [keys + 16*1]
457 aesdec %%ST1, %%T0
458 %if (%%num_blocks>=2)
459 aesdec %%ST2, %%T0
460 %endif
461 %if (%%num_blocks>=3)
462 aesdec %%ST3, %%T0
463 %endif
464 %if (%%num_blocks>=4)
465 aesdec %%ST4, %%T0
466 %endif
467 %if (%%num_blocks>=5)
468 aesdec %%ST5, %%T0
469 %endif
470 %if (%%num_blocks>=6)
471 aesdec %%ST6, %%T0
472 %endif
473 %if (%%num_blocks>=7)
474 aesdec %%ST7, %%T0
475 %endif
476 %if (0 == %%lt128)
477 cmovc ghash_poly_8b_temp, ghash_poly_8b
478 xor twtempl, ghash_poly_8b_temp
479 mov [TW + 8*0], twtempl ; next Tweak1 generated
480 mov [TW + 8*1], twtemph
481 xor ghash_poly_8b_temp, ghash_poly_8b_temp
482 %endif
483
484 ; round 2
485 movdqa %%T0, [keys + 16*2]
486 aesdec %%ST1, %%T0
487 %if (%%num_blocks>=2)
488 aesdec %%ST2, %%T0
489 %endif
490 %if (%%num_blocks>=3)
491 aesdec %%ST3, %%T0
492 %endif
493 %if (%%num_blocks>=4)
494 aesdec %%ST4, %%T0
495 %endif
496 %if (%%num_blocks>=5)
497 aesdec %%ST5, %%T0
498 %endif
499 %if (%%num_blocks>=6)
500 aesdec %%ST6, %%T0
501 %endif
502 %if (%%num_blocks>=7)
503 aesdec %%ST7, %%T0
504 %endif
505
506 %if (0 == %%lt128)
507 shl twtempl, 1
508 adc twtemph, twtemph
509 cmovc ghash_poly_8b_temp, ghash_poly_8b
510 xor twtempl, ghash_poly_8b_temp
511 mov [TW + 8*2], twtempl ; next Tweak2 generated
512 %endif
513
514 ; round 3
515 movdqa %%T0, [keys + 16*3]
516 aesdec %%ST1, %%T0
517 %if (%%num_blocks>=2)
518 aesdec %%ST2, %%T0
519 %endif
520 %if (%%num_blocks>=3)
521 aesdec %%ST3, %%T0
522 %endif
523 %if (%%num_blocks>=4)
524 aesdec %%ST4, %%T0
525 %endif
526 %if (%%num_blocks>=5)
527 aesdec %%ST5, %%T0
528 %endif
529 %if (%%num_blocks>=6)
530 aesdec %%ST6, %%T0
531 %endif
532 %if (%%num_blocks>=7)
533 aesdec %%ST7, %%T0
534 %endif
; Tweak3 multiply starts here; its cmovc result is applied after round 4
535 %if (0 == %%lt128)
536 mov [TW + 8*3], twtemph
537 xor ghash_poly_8b_temp, ghash_poly_8b_temp
538 shl twtempl, 1
539 adc twtemph, twtemph
540 cmovc ghash_poly_8b_temp, ghash_poly_8b
541 %endif
542
543 ; round 4
544 movdqa %%T0, [keys + 16*4]
545 aesdec %%ST1, %%T0
546 %if (%%num_blocks>=2)
547 aesdec %%ST2, %%T0
548 %endif
549 %if (%%num_blocks>=3)
550 aesdec %%ST3, %%T0
551 %endif
552 %if (%%num_blocks>=4)
553 aesdec %%ST4, %%T0
554 %endif
555 %if (%%num_blocks>=5)
556 aesdec %%ST5, %%T0
557 %endif
558 %if (%%num_blocks>=6)
559 aesdec %%ST6, %%T0
560 %endif
561 %if (%%num_blocks>=7)
562 aesdec %%ST7, %%T0
563 %endif
564
; Tweak4 multiply: shl here, adc/cmovc after round 5 (flags preserved by aesdec)
565 %if (0 == %%lt128)
566 xor twtempl, ghash_poly_8b_temp
567 mov [TW + 8*4], twtempl ; next Tweak3 generated
568 mov [TW + 8*5], twtemph
569 xor ghash_poly_8b_temp, ghash_poly_8b_temp
570 shl twtempl, 1
571 %endif
572
573 ; round 5
574 movdqa %%T0, [keys + 16*5]
575 aesdec %%ST1, %%T0
576 %if (%%num_blocks>=2)
577 aesdec %%ST2, %%T0
578 %endif
579 %if (%%num_blocks>=3)
580 aesdec %%ST3, %%T0
581 %endif
582 %if (%%num_blocks>=4)
583 aesdec %%ST4, %%T0
584 %endif
585 %if (%%num_blocks>=5)
586 aesdec %%ST5, %%T0
587 %endif
588 %if (%%num_blocks>=6)
589 aesdec %%ST6, %%T0
590 %endif
591 %if (%%num_blocks>=7)
592 aesdec %%ST7, %%T0
593 %endif
594
595 %if (0 == %%lt128)
596 adc twtemph, twtemph
597 cmovc ghash_poly_8b_temp, ghash_poly_8b
598 xor twtempl, ghash_poly_8b_temp
599 mov [TW + 8*6], twtempl ; next Tweak4 generated
600 mov [TW + 8*7], twtemph
601 %endif
602
603 ; round 6
604 movdqa %%T0, [keys + 16*6]
605 aesdec %%ST1, %%T0
606 %if (%%num_blocks>=2)
607 aesdec %%ST2, %%T0
608 %endif
609 %if (%%num_blocks>=3)
610 aesdec %%ST3, %%T0
611 %endif
612 %if (%%num_blocks>=4)
613 aesdec %%ST4, %%T0
614 %endif
615 %if (%%num_blocks>=5)
616 aesdec %%ST5, %%T0
617 %endif
618 %if (%%num_blocks>=6)
619 aesdec %%ST6, %%T0
620 %endif
621 %if (%%num_blocks>=7)
622 aesdec %%ST7, %%T0
623 %endif
624
625 %if (0 == %%lt128)
626 xor ghash_poly_8b_temp, ghash_poly_8b_temp
627 shl twtempl, 1
628 adc twtemph, twtemph
629 cmovc ghash_poly_8b_temp, ghash_poly_8b
630 xor twtempl, ghash_poly_8b_temp
631 mov [TW + 8*8], twtempl ; next Tweak5 generated
632 mov [TW + 8*9], twtemph
633 %endif
634
635 ; round 7
636 movdqa %%T0, [keys + 16*7]
637 aesdec %%ST1, %%T0
638 %if (%%num_blocks>=2)
639 aesdec %%ST2, %%T0
640 %endif
641 %if (%%num_blocks>=3)
642 aesdec %%ST3, %%T0
643 %endif
644 %if (%%num_blocks>=4)
645 aesdec %%ST4, %%T0
646 %endif
647 %if (%%num_blocks>=5)
648 aesdec %%ST5, %%T0
649 %endif
650 %if (%%num_blocks>=6)
651 aesdec %%ST6, %%T0
652 %endif
653 %if (%%num_blocks>=7)
654 aesdec %%ST7, %%T0
655 %endif
656
657 %if (0 == %%lt128)
658 xor ghash_poly_8b_temp, ghash_poly_8b_temp
659 shl twtempl, 1
660 adc twtemph, twtemph
661 cmovc ghash_poly_8b_temp, ghash_poly_8b
662 xor twtempl, ghash_poly_8b_temp
663 mov [TW + 8*10], twtempl ; next Tweak6 generated
664 mov [TW + 8*11], twtemph
665 %endif
666 ; round 8
667 movdqa %%T0, [keys + 16*8]
668 aesdec %%ST1, %%T0
669 %if (%%num_blocks>=2)
670 aesdec %%ST2, %%T0
671 %endif
672 %if (%%num_blocks>=3)
673 aesdec %%ST3, %%T0
674 %endif
675 %if (%%num_blocks>=4)
676 aesdec %%ST4, %%T0
677 %endif
678 %if (%%num_blocks>=5)
679 aesdec %%ST5, %%T0
680 %endif
681 %if (%%num_blocks>=6)
682 aesdec %%ST6, %%T0
683 %endif
684 %if (%%num_blocks>=7)
685 aesdec %%ST7, %%T0
686 %endif
687
688 %if (0 == %%lt128)
689 xor ghash_poly_8b_temp, ghash_poly_8b_temp
690 shl twtempl, 1
691 adc twtemph, twtemph
692 cmovc ghash_poly_8b_temp, ghash_poly_8b
693 xor twtempl, ghash_poly_8b_temp
694 mov [TW + 8*12], twtempl ; next Tweak7 generated
695 mov [TW + 8*13], twtemph
696 %endif
697 ; round 9
698 movdqa %%T0, [keys + 16*9]
699 aesdec %%ST1, %%T0
700 %if (%%num_blocks>=2)
701 aesdec %%ST2, %%T0
702 %endif
703 %if (%%num_blocks>=3)
704 aesdec %%ST3, %%T0
705 %endif
706 %if (%%num_blocks>=4)
707 aesdec %%ST4, %%T0
708 %endif
709 %if (%%num_blocks>=5)
710 aesdec %%ST5, %%T0
711 %endif
712 %if (%%num_blocks>=6)
713 aesdec %%ST6, %%T0
714 %endif
715 %if (%%num_blocks>=7)
716 aesdec %%ST7, %%T0
717 %endif
718
719 %if (0 == %%lt128)
720 xor ghash_poly_8b_temp, ghash_poly_8b_temp
721 shl twtempl, 1
722 adc twtemph, twtemph
723 cmovc ghash_poly_8b_temp, ghash_poly_8b
724 xor twtempl, ghash_poly_8b_temp
725 mov [TW + 8*14], twtempl ; next Tweak8 generated
726 mov [TW + 8*15], twtemph
727 %endif
728 ; round 10
729 movdqa %%T0, [keys + 16*10]
730 aesdec %%ST1, %%T0
731 %if (%%num_blocks>=2)
732 aesdec %%ST2, %%T0
733 %endif
734 %if (%%num_blocks>=3)
735 aesdec %%ST3, %%T0
736 %endif
737 %if (%%num_blocks>=4)
738 aesdec %%ST4, %%T0
739 %endif
740 %if (%%num_blocks>=5)
741 aesdec %%ST5, %%T0
742 %endif
743 %if (%%num_blocks>=6)
744 aesdec %%ST6, %%T0
745 %endif
746 %if (%%num_blocks>=7)
747 aesdec %%ST7, %%T0
748 %endif
749 ; round 11
750 movdqa %%T0, [keys + 16*11]
751 aesdec %%ST1, %%T0
752 %if (%%num_blocks>=2)
753 aesdec %%ST2, %%T0
754 %endif
755 %if (%%num_blocks>=3)
756 aesdec %%ST3, %%T0
757 %endif
758 %if (%%num_blocks>=4)
759 aesdec %%ST4, %%T0
760 %endif
761 %if (%%num_blocks>=5)
762 aesdec %%ST5, %%T0
763 %endif
764 %if (%%num_blocks>=6)
765 aesdec %%ST6, %%T0
766 %endif
767 %if (%%num_blocks>=7)
768 aesdec %%ST7, %%T0
769 %endif
770
771 ; round 12
772 movdqa %%T0, [keys + 16*12]
773 aesdec %%ST1, %%T0
774 %if (%%num_blocks>=2)
775 aesdec %%ST2, %%T0
776 %endif
777 %if (%%num_blocks>=3)
778 aesdec %%ST3, %%T0
779 %endif
780 %if (%%num_blocks>=4)
781 aesdec %%ST4, %%T0
782 %endif
783 %if (%%num_blocks>=5)
784 aesdec %%ST5, %%T0
785 %endif
786 %if (%%num_blocks>=6)
787 aesdec %%ST6, %%T0
788 %endif
789 %if (%%num_blocks>=7)
790 aesdec %%ST7, %%T0
791 %endif
792
793 ; round 13
794 movdqa %%T0, [keys + 16*13]
795 aesdec %%ST1, %%T0
796 %if (%%num_blocks>=2)
797 aesdec %%ST2, %%T0
798 %endif
799 %if (%%num_blocks>=3)
800 aesdec %%ST3, %%T0
801 %endif
802 %if (%%num_blocks>=4)
803 aesdec %%ST4, %%T0
804 %endif
805 %if (%%num_blocks>=5)
806 aesdec %%ST5, %%T0
807 %endif
808 %if (%%num_blocks>=6)
809 aesdec %%ST6, %%T0
810 %endif
811 %if (%%num_blocks>=7)
812 aesdec %%ST7, %%T0
813 %endif
814
815 ; round 14
816 movdqa %%T0, [keys + 16*14]
817 aesdeclast %%ST1, %%T0
818 %if (%%num_blocks>=2)
819 aesdeclast %%ST2, %%T0
820 %endif
821 %if (%%num_blocks>=3)
822 aesdeclast %%ST3, %%T0
823 %endif
824 %if (%%num_blocks>=4)
825 aesdeclast %%ST4, %%T0
826 %endif
827 %if (%%num_blocks>=5)
828 aesdeclast %%ST5, %%T0
829 %endif
830 %if (%%num_blocks>=6)
831 aesdeclast %%ST6, %%T0
832 %endif
833 %if (%%num_blocks>=7)
834 aesdeclast %%ST7, %%T0
835 %endif
836
837 ; xor Tweak values
838 pxor %%ST1, %%TW1
839 %if (%%num_blocks>=2)
840 pxor %%ST2, %%TW2
841 %endif
842 %if (%%num_blocks>=3)
843 pxor %%ST3, %%TW3
844 %endif
845 %if (%%num_blocks>=4)
846 pxor %%ST4, %%TW4
847 %endif
848 %if (%%num_blocks>=5)
849 pxor %%ST5, %%TW5
850 %endif
851 %if (%%num_blocks>=6)
852 pxor %%ST6, %%TW6
853 %endif
854 %if (%%num_blocks>=7)
855 pxor %%ST7, %%TW7
856 %endif
857
858
859 %if (0 == %%lt128)
860 ; load next Tweak values
861 movdqa %%TW1, [TW + 16*0]
862 movdqa %%TW2, [TW + 16*1]
863 movdqa %%TW3, [TW + 16*2]
864 movdqa %%TW4, [TW + 16*3]
865 movdqa %%TW5, [TW + 16*4]
866 movdqa %%TW6, [TW + 16*5]
867 movdqa %%TW7, [TW + 16*6]
868
869 %endif
870
871 %endmacro
872
873
874 ; Decrypt 8 blocks in parallel (aesdec with the inverse key schedule)
875 ; generate next 8 tweak values
;; Decrypt 8 blocks in parallel: tweak xor, ARK, aesdec rounds 1..13,
;; aesdeclast, tweak xor again. Unless %%last_eight is set, the next 8 tweaks
;; are computed between rounds (same flags-across-sections interleaving as
;; encrypt_initial: the carry produced by shl/adc in one "%if (0==%%last_eight)"
;; section is consumed by cmovc in a later one — do not reorder).
876 %macro encrypt_by_eight 18
877 %define %%ST1 %1 ; state 1
878 %define %%ST2 %2 ; state 2
879 %define %%ST3 %3 ; state 3
880 %define %%ST4 %4 ; state 4
881 %define %%ST5 %5 ; state 5
882 %define %%ST6 %6 ; state 6
883 %define %%ST7 %7 ; state 7
884 %define %%ST8 %8 ; state 8
885 %define %%TW1 %9 ; tweak 1
886 %define %%TW2 %10 ; tweak 2
887 %define %%TW3 %11 ; tweak 3
888 %define %%TW4 %12 ; tweak 4
889 %define %%TW5 %13 ; tweak 5
890 %define %%TW6 %14 ; tweak 6
891 %define %%TW7 %15 ; tweak 7
892 %define %%TW8 %16 ; tweak 8
893 %define %%T0 %17 ; Temp register
894 %define %%last_eight %18
895
896 ; xor Tweak values
897 pxor %%ST1, %%TW1
898 pxor %%ST2, %%TW2
899 pxor %%ST3, %%TW3
900 pxor %%ST4, %%TW4
901 pxor %%ST5, %%TW5
902 pxor %%ST6, %%TW6
903 pxor %%ST7, %%TW7
904 pxor %%ST8, %%TW8
905
; ARK: [keys] holds the last encryption round key (decrypt schedule is reversed)
906 ; ARK
907 movdqa %%T0, [keys]
908 pxor %%ST1, %%T0
909 pxor %%ST2, %%T0
910 pxor %%ST3, %%T0
911 pxor %%ST4, %%T0
912 pxor %%ST5, %%T0
913 pxor %%ST6, %%T0
914 pxor %%ST7, %%T0
915 pxor %%ST8, %%T0
916
; next Tweak1 = current tweak * alpha; finished after round 1
917 %if (0 == %%last_eight)
918 xor ghash_poly_8b_temp, ghash_poly_8b_temp
919 shl twtempl, 1
920 adc twtemph, twtemph
921 cmovc ghash_poly_8b_temp, ghash_poly_8b
922 %endif
923 ; round 1
924 movdqa %%T0, [keys + 16*1]
925 aesdec %%ST1, %%T0
926 aesdec %%ST2, %%T0
927 aesdec %%ST3, %%T0
928 aesdec %%ST4, %%T0
929 aesdec %%ST5, %%T0
930 aesdec %%ST6, %%T0
931 aesdec %%ST7, %%T0
932 aesdec %%ST8, %%T0
933 %if (0 == %%last_eight)
934 xor twtempl, ghash_poly_8b_temp
935 mov [TW + 8*0], twtempl
936 mov [TW + 8*1], twtemph
937 xor ghash_poly_8b_temp, ghash_poly_8b_temp
938 %endif
939 ; round 2
940 movdqa %%T0, [keys + 16*2]
941 aesdec %%ST1, %%T0
942 aesdec %%ST2, %%T0
943 aesdec %%ST3, %%T0
944 aesdec %%ST4, %%T0
945 aesdec %%ST5, %%T0
946 aesdec %%ST6, %%T0
947 aesdec %%ST7, %%T0
948 aesdec %%ST8, %%T0
949 %if (0 == %%last_eight)
950 shl twtempl, 1
951 adc twtemph, twtemph
952 cmovc ghash_poly_8b_temp, ghash_poly_8b
953 xor twtempl, ghash_poly_8b_temp
954
955 %endif
956 ; round 3
957 movdqa %%T0, [keys + 16*3]
958 aesdec %%ST1, %%T0
959 aesdec %%ST2, %%T0
960 aesdec %%ST3, %%T0
961 aesdec %%ST4, %%T0
962 aesdec %%ST5, %%T0
963 aesdec %%ST6, %%T0
964 aesdec %%ST7, %%T0
965 aesdec %%ST8, %%T0
966 %if (0 == %%last_eight)
967 mov [TW + 8*2], twtempl
968 mov [TW + 8*3], twtemph
969 xor ghash_poly_8b_temp, ghash_poly_8b_temp
970 shl twtempl, 1
971 %endif
972 ; round 4
973 movdqa %%T0, [keys + 16*4]
974 aesdec %%ST1, %%T0
975 aesdec %%ST2, %%T0
976 aesdec %%ST3, %%T0
977 aesdec %%ST4, %%T0
978 aesdec %%ST5, %%T0
979 aesdec %%ST6, %%T0
980 aesdec %%ST7, %%T0
981 aesdec %%ST8, %%T0
982 %if (0 == %%last_eight)
983 adc twtemph, twtemph
984 cmovc ghash_poly_8b_temp, ghash_poly_8b
985 xor twtempl, ghash_poly_8b_temp
986 mov [TW + 8*4], twtempl
987 %endif
988 ; round 5
989 movdqa %%T0, [keys + 16*5]
990 aesdec %%ST1, %%T0
991 aesdec %%ST2, %%T0
992 aesdec %%ST3, %%T0
993 aesdec %%ST4, %%T0
994 aesdec %%ST5, %%T0
995 aesdec %%ST6, %%T0
996 aesdec %%ST7, %%T0
997 aesdec %%ST8, %%T0
998 %if (0 == %%last_eight)
999 mov [TW + 8*5], twtemph
1000 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1001 shl twtempl, 1
1002 adc twtemph, twtemph
1003 %endif
1004 ; round 6
1005 movdqa %%T0, [keys + 16*6]
1006 aesdec %%ST1, %%T0
1007 aesdec %%ST2, %%T0
1008 aesdec %%ST3, %%T0
1009 aesdec %%ST4, %%T0
1010 aesdec %%ST5, %%T0
1011 aesdec %%ST6, %%T0
1012 aesdec %%ST7, %%T0
1013 aesdec %%ST8, %%T0
1014 %if (0 == %%last_eight)
1015 cmovc ghash_poly_8b_temp, ghash_poly_8b
1016 xor twtempl, ghash_poly_8b_temp
1017 mov [TW + 8*6], twtempl
1018 mov [TW + 8*7], twtemph
1019 %endif
1020 ; round 7
1021 movdqa %%T0, [keys + 16*7]
1022 aesdec %%ST1, %%T0
1023 aesdec %%ST2, %%T0
1024 aesdec %%ST3, %%T0
1025 aesdec %%ST4, %%T0
1026 aesdec %%ST5, %%T0
1027 aesdec %%ST6, %%T0
1028 aesdec %%ST7, %%T0
1029 aesdec %%ST8, %%T0
1030 %if (0 == %%last_eight)
1031 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1032 shl twtempl, 1
1033 adc twtemph, twtemph
1034 cmovc ghash_poly_8b_temp, ghash_poly_8b
1035 %endif
1036 ; round 8
1037 movdqa %%T0, [keys + 16*8]
1038 aesdec %%ST1, %%T0
1039 aesdec %%ST2, %%T0
1040 aesdec %%ST3, %%T0
1041 aesdec %%ST4, %%T0
1042 aesdec %%ST5, %%T0
1043 aesdec %%ST6, %%T0
1044 aesdec %%ST7, %%T0
1045 aesdec %%ST8, %%T0
1046 %if (0 == %%last_eight)
1047 xor twtempl, ghash_poly_8b_temp
1048 mov [TW + 8*8], twtempl
1049 mov [TW + 8*9], twtemph
1050 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1051 %endif
1052 ; round 9
1053 movdqa %%T0, [keys + 16*9]
1054 aesdec %%ST1, %%T0
1055 aesdec %%ST2, %%T0
1056 aesdec %%ST3, %%T0
1057 aesdec %%ST4, %%T0
1058 aesdec %%ST5, %%T0
1059 aesdec %%ST6, %%T0
1060 aesdec %%ST7, %%T0
1061 aesdec %%ST8, %%T0
1062 %if (0 == %%last_eight)
1063 shl twtempl, 1
1064 adc twtemph, twtemph
1065 cmovc ghash_poly_8b_temp, ghash_poly_8b
1066 xor twtempl, ghash_poly_8b_temp
1067 %endif
1068 ; round 10
1069 movdqa %%T0, [keys + 16*10]
1070 aesdec %%ST1, %%T0
1071 aesdec %%ST2, %%T0
1072 aesdec %%ST3, %%T0
1073 aesdec %%ST4, %%T0
1074 aesdec %%ST5, %%T0
1075 aesdec %%ST6, %%T0
1076 aesdec %%ST7, %%T0
1077 aesdec %%ST8, %%T0
1078 %if (0 == %%last_eight)
1079 mov [TW + 8*10], twtempl
1080 mov [TW + 8*11], twtemph
1081 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1082 shl twtempl, 1
1083 %endif
1084 ; round 11
1085 movdqa %%T0, [keys + 16*11]
1086 aesdec %%ST1, %%T0
1087 aesdec %%ST2, %%T0
1088 aesdec %%ST3, %%T0
1089 aesdec %%ST4, %%T0
1090 aesdec %%ST5, %%T0
1091 aesdec %%ST6, %%T0
1092 aesdec %%ST7, %%T0
1093 aesdec %%ST8, %%T0
1094 %if (0 == %%last_eight)
1095 adc twtemph, twtemph
1096 cmovc ghash_poly_8b_temp, ghash_poly_8b
1097 xor twtempl, ghash_poly_8b_temp
1098 mov [TW + 8*12], twtempl
1099 %endif
1100 ; round 12
1101 movdqa %%T0, [keys + 16*12]
1102 aesdec %%ST1, %%T0
1103 aesdec %%ST2, %%T0
1104 aesdec %%ST3, %%T0
1105 aesdec %%ST4, %%T0
1106 aesdec %%ST5, %%T0
1107 aesdec %%ST6, %%T0
1108 aesdec %%ST7, %%T0
1109 aesdec %%ST8, %%T0
1110 %if (0 == %%last_eight)
1111 mov [TW + 8*13], twtemph
1112 xor ghash_poly_8b_temp, ghash_poly_8b_temp
1113 shl twtempl, 1
1114 adc twtemph, twtemph
1115 %endif
1116 ; round 13
1117 movdqa %%T0, [keys + 16*13]
1118 aesdec %%ST1, %%T0
1119 aesdec %%ST2, %%T0
1120 aesdec %%ST3, %%T0
1121 aesdec %%ST4, %%T0
1122 aesdec %%ST5, %%T0
1123 aesdec %%ST6, %%T0
1124 aesdec %%ST7, %%T0
1125 aesdec %%ST8, %%T0
1126 %if (0 == %%last_eight)
1127 cmovc ghash_poly_8b_temp, ghash_poly_8b
1128 xor twtempl, ghash_poly_8b_temp
; Tweak8's qwords stay in twtempl/twtemph; stored after the last round below
1129 ; mov [TW + 8*14], twtempl
1130 ; mov [TW + 8*15], twtemph
1131 %endif
1132 ; round 14
1133 movdqa %%T0, [keys + 16*14]
1134 aesdeclast %%ST1, %%T0
1135 aesdeclast %%ST2, %%T0
1136 aesdeclast %%ST3, %%T0
1137 aesdeclast %%ST4, %%T0
1138 aesdeclast %%ST5, %%T0
1139 aesdeclast %%ST6, %%T0
1140 aesdeclast %%ST7, %%T0
1141 aesdeclast %%ST8, %%T0
1142
1143 ; xor Tweak values
1144 pxor %%ST1, %%TW1
1145 pxor %%ST2, %%TW2
1146 pxor %%ST3, %%TW3
1147 pxor %%ST4, %%TW4
1148 pxor %%ST5, %%TW5
1149 pxor %%ST6, %%TW6
1150 pxor %%ST7, %%TW7
1151 pxor %%ST8, %%TW8
1152
; store deferred Tweak8, then reload TW1..TW7 for the next iteration
; (TW8 is reloaded by the caller / stays in memory)
1153 mov [TW + 8*14], twtempl
1154 mov [TW + 8*15], twtemph
1155 ; load next Tweak values
1156 movdqa %%TW1, [TW + 16*0]
1157 movdqa %%TW2, [TW + 16*1]
1158 movdqa %%TW3, [TW + 16*2]
1159 movdqa %%TW4, [TW + 16*3]
1160 movdqa %%TW5, [TW + 16*4]
1161 movdqa %%TW6, [TW + 16*5]
1162 movdqa %%TW7, [TW + 16*6]
1163
1164 %endmacro
1165
1166
section .text

;; XTS_AES_256_dec_sse — AES-256-XTS decrypt, SSE / AES-NI implementation.
;; NOTE(review): the ptr_plaintext / ptr_ciphertext register names appear to be
;; inherited from the encrypt variant: in this decrypt routine the loads through
;; ptr_plaintext read the input (ciphertext) and the stores through
;; ptr_ciphertext write the decrypted output — confirm against the arg %defines
;; earlier in the file.
mk_global XTS_AES_256_dec_sse, function
XTS_AES_256_dec_sse:
endbranch

sub rsp, VARIABLE_OFFSET ; local frame: tweak scratch area (TW) + register save slots

mov [_gpr + 8*0], rbx ; rbx is callee-saved on both ABIs
%ifidn __OUTPUT_FORMAT__, win64
mov [_gpr + 8*1], rdi ; rdi/rsi are callee-saved on Win64
mov [_gpr + 8*2], rsi

; xmm6-xmm15 are callee-saved on Win64 and must be preserved
movdqa [_xmm + 16*0], xmm6
movdqa [_xmm + 16*1], xmm7
movdqa [_xmm + 16*2], xmm8
movdqa [_xmm + 16*3], xmm9
movdqa [_xmm + 16*4], xmm10
movdqa [_xmm + 16*5], xmm11
movdqa [_xmm + 16*6], xmm12
movdqa [_xmm + 16*7], xmm13
movdqa [_xmm + 16*8], xmm14
movdqa [_xmm + 16*9], xmm15
%endif

mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b (GF(2^128) reduction byte for tweak doubling)


movdqu xmm1, [T_val] ; read initial Tweak value
pxor xmm4, xmm4 ; for key expansion
encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys ; expand round keys and encrypt the tweak


%ifidn __OUTPUT_FORMAT__, win64
; Win64 passes the 5th/6th arguments on the stack
mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
%endif



mov target_ptr_val, N_val
and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
jl _less_than_128_bytes ; fewer than 8 whole blocks: take the short path

add target_ptr_val, ptr_ciphertext ; convert remaining byte count into an end-of-output address


; dispatch on (whole-block count) mod 8 so the main loop can run in
; groups of exactly 8 blocks
mov tmp1, N_val
and tmp1, (7 << 4)
jz _initial_num_blocks_is_0

cmp tmp1, (4 << 4)
je _initial_num_blocks_is_4



cmp tmp1, (6 << 4)
je _initial_num_blocks_is_6

cmp tmp1, (5 << 4)
je _initial_num_blocks_is_5



cmp tmp1, (3 << 4)
je _initial_num_blocks_is_3

cmp tmp1, (2 << 4)
je _initial_num_blocks_is_2

cmp tmp1, (1 << 4)
je _initial_num_blocks_is_1
; fall-through from the dispatch above: remainder is 7 blocks.
; Each _initial_num_blocks_is_K stanza decrypts the leading K blocks
; (K = N_blocks mod 8) so the main loop can process exactly 8 per pass.
_initial_num_blocks_is_7:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
add ptr_plaintext, 16*7
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
; store ciphertext (decrypted output blocks)
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6
movdqu [ptr_ciphertext+16*6], xmm7
add ptr_ciphertext, 16*7

cmp ptr_ciphertext, target_ptr_val
je _last_eight ; exactly 8 blocks remain after the prologue group

jmp _main_loop
_initial_num_blocks_is_6:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
add ptr_plaintext, 16*6
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6
add ptr_ciphertext, 16*6

cmp ptr_ciphertext, target_ptr_val
je _last_eight

jmp _main_loop
_initial_num_blocks_is_5:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
add ptr_plaintext, 16*5
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
add ptr_ciphertext, 16*5

cmp ptr_ciphertext, target_ptr_val
je _last_eight

jmp _main_loop
_initial_num_blocks_is_4:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
add ptr_plaintext, 16*4
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
add ptr_ciphertext, 16*4

cmp ptr_ciphertext, target_ptr_val
je _last_eight

jmp _main_loop


_initial_num_blocks_is_3:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
add ptr_plaintext, 16*3
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
add ptr_ciphertext, 16*3

cmp ptr_ciphertext, target_ptr_val
je _last_eight

jmp _main_loop
_initial_num_blocks_is_2:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
add ptr_plaintext, 16*2
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
; store ciphertext
movdqu [ptr_ciphertext], xmm1
movdqu [ptr_ciphertext+16], xmm2
add ptr_ciphertext, 16*2

cmp ptr_ciphertext, target_ptr_val
je _last_eight

jmp _main_loop

_initial_num_blocks_is_1:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
add ptr_plaintext, 16*1
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
; store ciphertext
movdqu [ptr_ciphertext], xmm1
add ptr_ciphertext, 16

cmp ptr_ciphertext, target_ptr_val
je _last_eight

jmp _main_loop
_initial_num_blocks_is_0:
; No leading partial group: only pre-compute tweaks 2..8 into the TW
; scratch area (xmm9-xmm15 get tweaks 1..7).  Each step doubles the
; 128-bit tweak in GF(2^128): shl/adc propagate the carry across the
; two 64-bit halves, and the 0x87 polynomial byte is xored into the
; low half when the top bit falls off (cmovc selects 0x87 vs 0).
mov twtempl, [TW+8*0]
mov twtemph, [TW+8*1]
movdqa xmm9, [TW+16*0]

xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph
movdqa xmm10, [TW+16*1]

xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*4], twtempl
mov [TW+8*5], twtemph
movdqa xmm11, [TW+16*2]


xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*6], twtempl
mov [TW+8*7], twtemph
movdqa xmm12, [TW+16*3]


xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*8], twtempl
mov [TW+8*9], twtemph
movdqa xmm13, [TW+16*4]

xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*10], twtempl
mov [TW+8*11], twtemph
movdqa xmm14, [TW+16*5]

xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*12], twtempl
mov [TW+8*13], twtemph
movdqa xmm15, [TW+16*6]

; 8th tweak stays in memory only; twtempl/twtemph keep the running value
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*14], twtempl
mov [TW+8*15], twtemph
;movdqa xmm16, [TW+16*7]

cmp ptr_ciphertext, target_ptr_val
je _last_eight
_main_loop:
; Steady state: decrypt 8 blocks per iteration.  The tweak computations
; for the NEXT group are stitched into the AES rounds inside
; encrypt_by_eight (last_eight flag = 0).
; load plaintext (input blocks)
movdqu xmm1, [ptr_plaintext+16*0]
movdqu xmm2, [ptr_plaintext+16*1]
movdqu xmm3, [ptr_plaintext+16*2]
movdqu xmm4, [ptr_plaintext+16*3]
movdqu xmm5, [ptr_plaintext+16*4]
movdqu xmm6, [ptr_plaintext+16*5]
movdqu xmm7, [ptr_plaintext+16*6]
movdqu xmm8, [ptr_plaintext+16*7]

add ptr_plaintext, 128

encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0

; store ciphertext (decrypted output blocks)
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6
movdqu [ptr_ciphertext+16*6], xmm7
movdqu [ptr_ciphertext+16*7], xmm8
add ptr_ciphertext, 128

cmp ptr_ciphertext, target_ptr_val
jne _main_loop ; loop until only the final 8 blocks (+ possible tail) remain
_last_eight:
; Final group of 8 blocks.  If N is not a multiple of 16 we must do
; ciphertext stealing; for DECRYPT the last two tweaks are swapped:
; the last full block is processed with tweak m+1 and the stolen
; block with tweak m (hence the TW shuffling below).
and N_val, 15 ; N_val = N_val mod 16
je _done_final ; whole number of blocks: no stealing needed

; generate next Tweak value (tweak m+1, used for the last full block)
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
movdqa xmm1, [TW + 16*7]
movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt

mov [TW + 16*7], twtempl
mov [TW + 16*7+8], twtemph

; load plaintext (input blocks)
movdqu xmm1, [ptr_plaintext+16*0]
movdqu xmm2, [ptr_plaintext+16*1]
movdqu xmm3, [ptr_plaintext+16*2]
movdqu xmm4, [ptr_plaintext+16*3]
movdqu xmm5, [ptr_plaintext+16*4]
movdqu xmm6, [ptr_plaintext+16*5]
movdqu xmm7, [ptr_plaintext+16*6]
movdqu xmm8, [ptr_plaintext+16*7]
encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

; store ciphertext (first 7 output blocks; block 8 is finished in _steal_cipher)
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6
movdqu [ptr_ciphertext+16*6], xmm7
jmp _steal_cipher


_done_final:
; No tail bytes: decrypt the last 8 blocks normally.
; load plaintext (input blocks)
movdqu xmm1, [ptr_plaintext+16*0]
movdqu xmm2, [ptr_plaintext+16*1]
movdqu xmm3, [ptr_plaintext+16*2]
movdqu xmm4, [ptr_plaintext+16*3]
movdqu xmm5, [ptr_plaintext+16*4]
movdqu xmm6, [ptr_plaintext+16*5]
movdqu xmm7, [ptr_plaintext+16*6]
movdqu xmm8, [ptr_plaintext+16*7]
encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

; store ciphertext (block 8 is stored by _done)
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6
movdqu [ptr_ciphertext+16*6], xmm7

jmp _done
1513
_steal_cipher:
; start cipher stealing
; xmm8 holds the decryption of the last FULL input block; the final
; N_val tail bytes are merged with it per the XTS ciphertext-stealing
; construction, then the combined block is decrypted with tweak [TW].

movdqa xmm2, xmm8

; shift xmm8 to the left by 16-N_val bytes
lea twtempl, [pshufb_shf_table]
movdqu xmm0, [twtempl+N_val]
pshufb xmm8, xmm0


movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
movdqu [ptr_ciphertext + 112 + N_val], xmm8 ; emit the partial output block (tail of the stolen data)

; shift xmm3 to the right by 16-N_val bytes
lea twtempl, [pshufb_shf_table +16]
sub twtempl, N_val
movdqu xmm0, [twtempl]
pxor xmm0, [mask1] ; set top bits so pshufb zeroes the vacated byte lanes
pshufb xmm3, xmm0

pblendvb xmm3, xmm2 ;xmm0 is implicit

; xor Tweak value
movdqa xmm8, [TW]
pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped


;decrypt last block with cipher stealing (full 14-round AES-256 schedule)
pxor xmm8, [keys] ; ARK
aesdec xmm8, [keys + 16*1] ; round 1
aesdec xmm8, [keys + 16*2] ; round 2
aesdec xmm8, [keys + 16*3] ; round 3
aesdec xmm8, [keys + 16*4] ; round 4
aesdec xmm8, [keys + 16*5] ; round 5
aesdec xmm8, [keys + 16*6] ; round 6
aesdec xmm8, [keys + 16*7] ; round 7
aesdec xmm8, [keys + 16*8] ; round 8
aesdec xmm8, [keys + 16*9] ; round 9
aesdec xmm8, [keys + 16*10] ; round 10
aesdec xmm8, [keys + 16*11] ; round 11
aesdec xmm8, [keys + 16*12] ; round 12
aesdec xmm8, [keys + 16*13] ; round 13
aesdeclast xmm8, [keys + 16*14] ; round 14

; xor Tweak value
pxor xmm8, [TW]

_done:
; store last ciphertext value (final output block)
movdqu [ptr_ciphertext+16*7], xmm8

_ret_:
; restore callee-saved state and return
mov rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
mov rdi, [_gpr + 8*1]
mov rsi, [_gpr + 8*2]


movdqa xmm6, [_xmm + 16*0]
movdqa xmm7, [_xmm + 16*1]
movdqa xmm8, [_xmm + 16*2]
movdqa xmm9, [_xmm + 16*3]
movdqa xmm10, [_xmm + 16*4]
movdqa xmm11, [_xmm + 16*5]
movdqa xmm12, [_xmm + 16*6]
movdqa xmm13, [_xmm + 16*7]
movdqa xmm14, [_xmm + 16*8]
movdqa xmm15, [_xmm + 16*9]
%endif

add rsp, VARIABLE_OFFSET

ret
1591
1592
1593
1594
_less_than_128_bytes:
; Short-input path: fewer than 8 whole blocks total.
cmp N_val, 16
jb _ret_ ; less than one block: XTS undefined, return without output

; dispatch on block count; a count of 7 falls through to _num_blocks_is_7
mov tmp1, N_val
and tmp1, (7 << 4)
cmp tmp1, (6 << 4)
je _num_blocks_is_6
cmp tmp1, (5 << 4)
je _num_blocks_is_5
cmp tmp1, (4 << 4)
je _num_blocks_is_4
cmp tmp1, (3 << 4)
je _num_blocks_is_3
cmp tmp1, (2 << 4)
je _num_blocks_is_2
cmp tmp1, (1 << 4)
je _num_blocks_is_1



; Each _num_blocks_is_K stanza below decrypts K blocks.  The input and
; output pointers are rewound by 16*(8-K) so that the fixed +112
; offsets used by _steal_cipher and the [ptr_ciphertext+16*7] store in
; _done address the last block regardless of K.  When N has tail bytes
; the last two tweaks are swapped (decrypt cipher-stealing order):
; TW[0] receives the tweak for the stolen block and the last full
; block is processed with the newly generated tweak from TW[1].
_num_blocks_is_7:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7

sub ptr_plaintext, 16*1

and N_val, 15 ; N_val = N_val mod 16
je _done_7

_steal_cipher_7:
; generate one more tweak for the last full block
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm15 ; stolen-block tweak goes to TW[0]
movdqa xmm15, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6

sub ptr_ciphertext, 16*1
movdqa xmm8, xmm7 ; last decrypted block into xmm8 for the steal path
jmp _steal_cipher

_done_7:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5
movdqu [ptr_ciphertext+16*5], xmm6

sub ptr_ciphertext, 16*1
movdqa xmm8, xmm7 ; _done stores xmm8 at +16*7
jmp _done






_num_blocks_is_6:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6

sub ptr_plaintext, 16*2

and N_val, 15 ; N_val = N_val mod 16
je _done_6

_steal_cipher_6:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm14
movdqa xmm14, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5

sub ptr_ciphertext, 16*2
movdqa xmm8, xmm6
jmp _steal_cipher

_done_6:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4
movdqu [ptr_ciphertext+16*4], xmm5

sub ptr_ciphertext, 16*2
movdqa xmm8, xmm6
jmp _done





_num_blocks_is_5:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5

sub ptr_plaintext, 16*3

and N_val, 15 ; N_val = N_val mod 16
je _done_5

_steal_cipher_5:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm13
movdqa xmm13, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4

sub ptr_ciphertext, 16*3
movdqa xmm8, xmm5
jmp _steal_cipher

_done_5:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3
movdqu [ptr_ciphertext+16*3], xmm4

sub ptr_ciphertext, 16*3
movdqa xmm8, xmm5
jmp _done

1761
1762
1763
1764
_num_blocks_is_4:
; decrypt 4 blocks; pointers rewound by 16*4 so the +112 offsets in
; _steal_cipher / _done land on the last block
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4

sub ptr_plaintext, 16*4

and N_val, 15 ; N_val = N_val mod 16
je _done_4

_steal_cipher_4:
; generate one more tweak; swap tweak order for decrypt cipher stealing
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm12
movdqa xmm12, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3

sub ptr_ciphertext, 16*4
movdqa xmm8, xmm4
jmp _steal_cipher

_done_4:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2
movdqu [ptr_ciphertext+16*2], xmm3

sub ptr_ciphertext, 16*4
movdqa xmm8, xmm4
jmp _done




_num_blocks_is_3:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3

sub ptr_plaintext, 16*5

and N_val, 15 ; N_val = N_val mod 16
je _done_3

_steal_cipher_3:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm11
movdqa xmm11, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2

sub ptr_ciphertext, 16*5
movdqa xmm8, xmm3
jmp _steal_cipher

_done_3:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
; store ciphertext
movdqu [ptr_ciphertext+16*0], xmm1
movdqu [ptr_ciphertext+16*1], xmm2

sub ptr_ciphertext, 16*5
movdqa xmm8, xmm3
jmp _done

1848
1849
1850
1851
1852
_num_blocks_is_2:
; decrypt 2 blocks; pointers rewound by 16*6 so the +112 offsets in
; _steal_cipher / _done land on the last block
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2

sub ptr_plaintext, 16*6

and N_val, 15 ; N_val = N_val mod 16
je _done_2

_steal_cipher_2:
; generate one more tweak; swap tweak order for decrypt cipher stealing
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm10
movdqa xmm10, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
; store ciphertext
movdqu [ptr_ciphertext], xmm1

sub ptr_ciphertext, 16*6
movdqa xmm8, xmm2
jmp _steal_cipher

_done_2:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
; store ciphertext
movdqu [ptr_ciphertext], xmm1

sub ptr_ciphertext, 16*6
movdqa xmm8, xmm2
jmp _done













_num_blocks_is_1:
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1

sub ptr_plaintext, 16*7

and N_val, 15 ; N_val = N_val mod 16
je _done_1

_steal_cipher_1:
xor ghash_poly_8b_temp, ghash_poly_8b_temp
shl twtempl, 1
adc twtemph, twtemph
cmovc ghash_poly_8b_temp, ghash_poly_8b
xor twtempl, ghash_poly_8b_temp
mov [TW+8*2], twtempl
mov [TW+8*3], twtemph

movdqa [TW + 16*0] , xmm9
movdqa xmm9, [TW+16*1]

encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
; store ciphertext (single block: stored by _steal_cipher / _done)

sub ptr_ciphertext, 16*7
movdqa xmm8, xmm1
jmp _steal_cipher

_done_1:
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
; store ciphertext (single block: stored by _done)

sub ptr_ciphertext, 16*7
movdqa xmm8, xmm1
jmp _done

section .data
align 16

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; _steal_cipher indexes into this pair of quadword rows with +N_val
; (left shift) or +16-N_val (right shift) to build pshufb controls
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask1:
; 0x80 in every byte: xored into the shuffle control so pshufb zeroes
; the vacated lanes, and used as the implicit-xmm0 mask for pblendvb
dq 0x8080808080808080, 0x8080808080808080