;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; XTS encrypt function with 256-bit AES
; expanded keys are not aligned
; plaintext and ciphertext are not aligned
; the data-encryption key (k1, the second argument) is copied onto the stack, aligned to 16 bytes
; the tweak key (k2, the first argument) is used only once, so it is not stored
%include "reg_sizes.asm"
%define TW rsp ; store 8 tweak values
%define keys rsp + 16*8 ; store 15 expanded keys
%ifidn __OUTPUT_FORMAT__, win64
%define _xmm rsp + 16*23 ; store xmm6:xmm15
%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr rsp + 16*23 ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
%define GHASH_POLY 0x87
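; 0x87 is the low byte of the XTS/GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1.
; Each tweak update multiplies the 128-bit tweak by alpha (i.e. by x) in GF(2^128):
; the two 64-bit halves are doubled with add/adc so the carry propagates from the low
; to the high qword, and if a bit falls off the top, 0x87 is XORed into the low qword.
; A rough C sketch of one doubling step (illustration only, not part of the build):
;     uint64_t carry = hi >> 63;
;     hi = (hi << 1) | (lo >> 63);
;     lo = (lo << 1) ^ (carry ? 0x87 : 0);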
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_256_enc_expanded_key_sse(
;               UINT8 *k2,              // key used for tweaking, 16*15 bytes
;               UINT8 *k1,              // key used for "ECB" encryption, 16*15 bytes
;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
;               UINT64 N,               // sector size, in bytes
;               const UINT8 *pt,        // plaintext sector input data
;               UINT8 *ct);             // ciphertext sector output data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
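; Illustrative call from C (a sketch only; it assumes the caller has already
; expanded both AES-256 keys into 15 round keys each, e.g. via the library's
; key-expansion helper, and that pt/ct each hold N bytes):
;
;     uint8_t k2_exp[16*15], k1_exp[16*15];   // expanded tweak and data keys
;     uint8_t tweak[16];                      // initial tweak, e.g. the sector number
;     XTS_AES_256_enc_expanded_key_sse(k2_exp, k1_exp, tweak, N, pt, ct);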
; arguments for input parameters
%ifidn __OUTPUT_FORMAT__, elf64
%xdefine ptr_plaintext r8
%xdefine ptr_ciphertext r9
%xdefine ptr_plaintext r10 ; [rsp + VARIABLE_OFFSET + 8*5]
%xdefine ptr_ciphertext r11 ; [rsp + VARIABLE_OFFSET + 8*6]
; arguments for temp parameters
%ifidn __OUTPUT_FORMAT__, elf64
%define target_ptr_val rsi
%define ghash_poly_8b r10
%define ghash_poly_8b_temp r11
%define target_ptr_val rdx
%define ghash_poly_8b rdi
%define ghash_poly_8b_temp rsi
%define twtempl rax ; global temp registers used for tweak computation
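; twtempl and its companion twtemph hold the low and high 64-bit halves of the
; tweak currently being advanced; ghash_poly_8b holds the 0x87 reduction constant,
; and ghash_poly_8b_temp is either zero or that constant, chosen by cmovc from the
; carry out of the doubling.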
; macro to encrypt the tweak value and to copy the key1 round keys onto the stack
%define %%xstate_tweak %2
%define %%xraw_key %4
%define %%ptr_key2 %6
%define %%ptr_key1 %7
%define %%ptr_expanded_keys %8
        movdqu %%xkey2, [%%ptr_key2]
        pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
        movdqu %%xkey1, [%%ptr_key1]
        movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*1]
        aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*1]
        movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*2]
        aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*2]
        movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*3]
        aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*3]
        movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*4]
        aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*4]
        movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*5]
        aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*5]
        movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*6]
        aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*6]
        movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*7]
        aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*7]
        movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*8]
        aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*8]
        movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*9]
        aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*9]
        movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*10]
        aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*10]
        movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*11]
        aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*11]
        movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*12]
        aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*12]
        movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*13]
        aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*13]
        movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
        movdqu %%xkey2, [%%ptr_key2 + 16*14]
        aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
        movdqu %%xkey1, [%%ptr_key1 + 16*14]
        movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
        movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
; generate initial tweak values
; load initial plaintext values
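; initialize loads up to 7 plaintext blocks with unaligned movdqu and derives the
; matching tweaks T2..T7 from T1 by repeated GF(2^128) doubling of the two 64-bit
; tweak halves held in twtempl/twtemph (add/adc, then a conditional XOR of 0x87
; selected with cmovc), storing each tweak in the TW scratch area on the stack.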
%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%ST3 %3 ; state 3
%define %%ST4 %4 ; state 4
%define %%ST5 %5 ; state 5
%define %%ST6 %6 ; state 6
%define %%ST7 %7 ; state 7
%define %%ST8 %8 ; state 8
%define %%TW1 %9 ; tweak 1
%define %%TW2 %10 ; tweak 2
%define %%TW3 %11 ; tweak 3
%define %%TW4 %12 ; tweak 4
%define %%TW5 %13 ; tweak 5
%define %%TW6 %14 ; tweak 6
%define %%TW7 %15 ; tweak 7
%define %%num_initial_blocks %16
; generate next Tweak values
        movdqa %%TW1, [TW+16*0]
        mov twtempl, [TW+8*0]
        mov twtemph, [TW+8*1]
        movdqu %%ST1, [ptr_plaintext+16*0]
%if (%%num_initial_blocks>=2)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*2], twtempl
        mov [TW+8*3], twtemph
        movdqa %%TW2, [TW+16*1]
        movdqu %%ST2, [ptr_plaintext+16*1]
%if (%%num_initial_blocks>=3)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*4], twtempl
        mov [TW+8*5], twtemph
        movdqa %%TW3, [TW+16*2]
        movdqu %%ST3, [ptr_plaintext+16*2]
%if (%%num_initial_blocks>=4)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*6], twtempl
        mov [TW+8*7], twtemph
        movdqa %%TW4, [TW+16*3]
        movdqu %%ST4, [ptr_plaintext+16*3]
%if (%%num_initial_blocks>=5)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*8], twtempl
        mov [TW+8*9], twtemph
        movdqa %%TW5, [TW+16*4]
        movdqu %%ST5, [ptr_plaintext+16*4]
%if (%%num_initial_blocks>=6)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*10], twtempl
        mov [TW+8*11], twtemph
        movdqa %%TW6, [TW+16*5]
        movdqu %%ST6, [ptr_plaintext+16*5]
%if (%%num_initial_blocks>=7)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*12], twtempl
        mov [TW+8*13], twtemph
        movdqa %%TW7, [TW+16*6]
        movdqu %%ST7, [ptr_plaintext+16*6]
; encrypt initial blocks of AES
; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
; next 8 Tweak values are generated
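; The AES rounds for the 1 to 7 initial blocks are interleaved ("stitched") with the
; integer computation of the next 8 tweak values, so the GF(2^128) doublings execute
; in the shadow of the aesenc latency rather than serially before or after it.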
%macro encrypt_initial 18
%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%ST3 %3 ; state 3
%define %%ST4 %4 ; state 4
%define %%ST5 %5 ; state 5
%define %%ST6 %6 ; state 6
%define %%ST7 %7 ; state 7
%define %%ST8 %8 ; state 8
%define %%TW1 %9 ; tweak 1
%define %%TW2 %10 ; tweak 2
%define %%TW3 %11 ; tweak 3
%define %%TW4 %12 ; tweak 4
%define %%TW5 %13 ; tweak 5
%define %%TW6 %14 ; tweak 6
%define %%TW7 %15 ; tweak 7
%define %%T0 %16 ; Temp register
%define %%num_blocks %17
; %%num_blocks blocks encrypted
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
%define %%lt128 %18 ; less than 128 bytes
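; %%lt128 is set on the final pass over a tail of fewer than 128 remaining bytes;
; on that pass no further block tweaks are needed beyond those already computed.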
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*1]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*0], twtempl ; next Tweak1 generated
        mov [TW + 8*1], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*2]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*2], twtempl ; next Tweak2 generated
        movdqa %%T0, [keys + 16*3]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        mov [TW + 8*3], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        movdqa %%T0, [keys + 16*4]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*4], twtempl ; next Tweak3 generated
        mov [TW + 8*5], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*5]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*6], twtempl ; next Tweak4 generated
        mov [TW + 8*7], twtemph
        movdqa %%T0, [keys + 16*6]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*8], twtempl ; next Tweak5 generated
        mov [TW + 8*9], twtemph
        movdqa %%T0, [keys + 16*7]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*10], twtempl ; next Tweak6 generated
        mov [TW + 8*11], twtemph
        movdqa %%T0, [keys + 16*8]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*12], twtempl ; next Tweak7 generated
        mov [TW + 8*13], twtemph
        movdqa %%T0, [keys + 16*9]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*14], twtempl ; next Tweak8 generated
        mov [TW + 8*15], twtemph
        movdqa %%T0, [keys + 16*10]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        movdqa %%T0, [keys + 16*11]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        movdqa %%T0, [keys + 16*12]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        movdqa %%T0, [keys + 16*13]
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
        movdqa %%T0, [keys + 16*14]
        aesenclast %%ST1, %%T0
%if (%%num_blocks>=2)
        aesenclast %%ST2, %%T0
%if (%%num_blocks>=3)
        aesenclast %%ST3, %%T0
%if (%%num_blocks>=4)
        aesenclast %%ST4, %%T0
%if (%%num_blocks>=5)
        aesenclast %%ST5, %%T0
%if (%%num_blocks>=6)
        aesenclast %%ST6, %%T0
%if (%%num_blocks>=7)
        aesenclast %%ST7, %%T0
%if (%%num_blocks>=2)
%if (%%num_blocks>=3)
%if (%%num_blocks>=4)
%if (%%num_blocks>=5)
%if (%%num_blocks>=6)
%if (%%num_blocks>=7)
; load next Tweak values
        movdqa %%TW1, [TW + 16*0]
        movdqa %%TW2, [TW + 16*1]
        movdqa %%TW3, [TW + 16*2]
        movdqa %%TW4, [TW + 16*3]
        movdqa %%TW5, [TW + 16*4]
        movdqa %%TW6, [TW + 16*5]
        movdqa %%TW7, [TW + 16*6]
; Encrypt 8 blocks in parallel
; generate next 8 tweak values
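; Steady-state loop body: the 14 AES-256 rounds for 8 blocks are interleaved with
; the integer generation of the following 8 tweaks. When %%last_eight is non-zero
; (the final 8 blocks before ciphertext stealing) the tweak generation is skipped.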
%macro encrypt_by_eight 18
%define %%ST1 %1 ; state 1
%define %%ST2 %2 ; state 2
%define %%ST3 %3 ; state 3
%define %%ST4 %4 ; state 4
%define %%ST5 %5 ; state 5
%define %%ST6 %6 ; state 6
%define %%ST7 %7 ; state 7
%define %%ST8 %8 ; state 8
%define %%TW1 %9 ; tweak 1
%define %%TW2 %10 ; tweak 2
%define %%TW3 %11 ; tweak 3
%define %%TW4 %12 ; tweak 4
%define %%TW5 %13 ; tweak 5
%define %%TW6 %14 ; tweak 6
%define %%TW7 %15 ; tweak 7
%define %%TW8 %16 ; tweak 8
%define %%T0 %17 ; Temp register
%define %%last_eight %18
%if (0 == %%last_eight)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        movdqa %%T0, [keys + 16*1]
%if (0 == %%last_eight)
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*0], twtempl
        mov [TW + 8*1], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*2]
%if (0 == %%last_eight)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*3]
%if (0 == %%last_eight)
        mov [TW + 8*2], twtempl
        mov [TW + 8*3], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*4]
%if (0 == %%last_eight)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*4], twtempl
        movdqa %%T0, [keys + 16*5]
%if (0 == %%last_eight)
        mov [TW + 8*5], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*6]
%if (0 == %%last_eight)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*6], twtempl
        mov [TW + 8*7], twtemph
        movdqa %%T0, [keys + 16*7]
%if (0 == %%last_eight)
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        movdqa %%T0, [keys + 16*8]
%if (0 == %%last_eight)
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*8], twtempl
        mov [TW + 8*9], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*9]
%if (0 == %%last_eight)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*10]
%if (0 == %%last_eight)
        mov [TW + 8*10], twtempl
        mov [TW + 8*11], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        movdqa %%T0, [keys + 16*11]
%if (0 == %%last_eight)
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8*12], twtempl
        movdqa %%T0, [keys + 16*12]
%if (0 == %%last_eight)
        mov [TW + 8*13], twtemph
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        movdqa %%T0, [keys + 16*13]
%if (0 == %%last_eight)
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
;       mov [TW + 8*14], twtempl
;       mov [TW + 8*15], twtemph
        movdqa %%T0, [keys + 16*14]
        aesenclast %%ST1, %%T0
        aesenclast %%ST2, %%T0
        aesenclast %%ST3, %%T0
        aesenclast %%ST4, %%T0
        aesenclast %%ST5, %%T0
        aesenclast %%ST6, %%T0
        aesenclast %%ST7, %%T0
        aesenclast %%ST8, %%T0
        mov [TW + 8*14], twtempl
        mov [TW + 8*15], twtemph
; load next Tweak values
        movdqa %%TW1, [TW + 16*0]
        movdqa %%TW2, [TW + 16*1]
        movdqa %%TW3, [TW + 16*2]
        movdqa %%TW4, [TW + 16*3]
        movdqa %%TW5, [TW + 16*4]
        movdqa %%TW6, [TW + 16*5]
        movdqa %%TW7, [TW + 16*6]
mk_global XTS_AES_256_enc_expanded_key_sse, function
XTS_AES_256_enc_expanded_key_sse:
        sub rsp, VARIABLE_OFFSET
        mov [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
        mov [_gpr + 8*1], rdi
        mov [_gpr + 8*2], rsi
        movdqa [_xmm + 16*0], xmm6
        movdqa [_xmm + 16*1], xmm7
        movdqa [_xmm + 16*2], xmm8
        movdqa [_xmm + 16*3], xmm9
        movdqa [_xmm + 16*4], xmm10
        movdqa [_xmm + 16*5], xmm11
        movdqa [_xmm + 16*6], xmm12
        movdqa [_xmm + 16*7], xmm13
        movdqa [_xmm + 16*8], xmm14
        movdqa [_xmm + 16*9], xmm15
        mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
        movdqu xmm1, [T_val] ; read initial Tweak value
        pxor xmm4, xmm4 ; for key expansion
        encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
%ifidn __OUTPUT_FORMAT__, win64
        mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
        mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
        mov target_ptr_val, N_val
        and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
        sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) will not be stitched with Tweak calculations
        jl _less_than_128_bytes
        add target_ptr_val, ptr_ciphertext
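; target_ptr_val now points just past the ciphertext for all but the final 128
; bytes of whole blocks: the code below first handles 0-7 "initial" blocks so the
; remaining length is a multiple of 128, then loops 8 blocks at a time until the
; target is reached, and finally runs the last 8 blocks without tweak stitching
; before the ciphertext-stealing tail.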
        jz _initial_num_blocks_is_0
        je _initial_num_blocks_is_4
        je _initial_num_blocks_is_6
        je _initial_num_blocks_is_5
        je _initial_num_blocks_is_3
        je _initial_num_blocks_is_2
        je _initial_num_blocks_is_1
_initial_num_blocks_is_7:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        add ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        movdqu [ptr_ciphertext+16*5], xmm6
        movdqu [ptr_ciphertext+16*6], xmm7
        add ptr_ciphertext, 16*7
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_6:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        add ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        movdqu [ptr_ciphertext+16*5], xmm6
        add ptr_ciphertext, 16*6
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_5:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        add ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        add ptr_ciphertext, 16*5
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_4:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        add ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        add ptr_ciphertext, 16*4
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_3:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        add ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        add ptr_ciphertext, 16*3
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_2:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        add ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
        movdqu [ptr_ciphertext], xmm1
        movdqu [ptr_ciphertext+16], xmm2
        add ptr_ciphertext, 16*2
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_1:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        add ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
        movdqu [ptr_ciphertext], xmm1
        add ptr_ciphertext, 16
        cmp ptr_ciphertext, target_ptr_val
_initial_num_blocks_is_0:
        mov twtempl, [TW+8*0]
        mov twtemph, [TW+8*1]
        movdqa xmm9, [TW+16*0]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*2], twtempl
        mov [TW+8*3], twtemph
        movdqa xmm10, [TW+16*1]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*4], twtempl
        mov [TW+8*5], twtemph
        movdqa xmm11, [TW+16*2]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*6], twtempl
        mov [TW+8*7], twtemph
        movdqa xmm12, [TW+16*3]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*8], twtempl
        mov [TW+8*9], twtemph
        movdqa xmm13, [TW+16*4]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*10], twtempl
        mov [TW+8*11], twtemph
        movdqa xmm14, [TW+16*5]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*12], twtempl
        mov [TW+8*13], twtemph
        movdqa xmm15, [TW+16*6]
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW+8*14], twtempl
        mov [TW+8*15], twtemph
;       movdqa xmm16, [TW+16*7]
        cmp ptr_ciphertext, target_ptr_val
        movdqu xmm1, [ptr_plaintext+16*0]
        movdqu xmm2, [ptr_plaintext+16*1]
        movdqu xmm3, [ptr_plaintext+16*2]
        movdqu xmm4, [ptr_plaintext+16*3]
        movdqu xmm5, [ptr_plaintext+16*4]
        movdqu xmm6, [ptr_plaintext+16*5]
        movdqu xmm7, [ptr_plaintext+16*6]
        movdqu xmm8, [ptr_plaintext+16*7]
        add ptr_plaintext, 128
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        movdqu [ptr_ciphertext+16*5], xmm6
        movdqu [ptr_ciphertext+16*6], xmm7
        movdqu [ptr_ciphertext+16*7], xmm8
        add ptr_ciphertext, 128
        cmp ptr_ciphertext, target_ptr_val
        movdqu xmm1, [ptr_plaintext+16*0]
        movdqu xmm2, [ptr_plaintext+16*1]
        movdqu xmm3, [ptr_plaintext+16*2]
        movdqu xmm4, [ptr_plaintext+16*3]
        movdqu xmm5, [ptr_plaintext+16*4]
        movdqu xmm6, [ptr_plaintext+16*5]
        movdqu xmm7, [ptr_plaintext+16*6]
        movdqu xmm8, [ptr_plaintext+16*7]
        encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        movdqu [ptr_ciphertext+16*5], xmm6
        movdqu [ptr_ciphertext+16*6], xmm7
        and N_val, 15 ; N_val = N_val mod 16
; start cipher stealing
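; Ciphertext stealing for a sector that is not a multiple of 16 bytes: the last
; full ciphertext block (xmm8) donates its first N_val bytes as the final partial
; output, the N_val trailing plaintext bytes are merged with the remaining
; 16-N_val ciphertext bytes, and that combined block is encrypted with the last
; tweak to produce the final full ciphertext block.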
; generate next Tweak value
        xor ghash_poly_8b_temp, ghash_poly_8b_temp
        adc twtemph, twtemph
        cmovc ghash_poly_8b_temp, ghash_poly_8b
        xor twtempl, ghash_poly_8b_temp
        mov [TW + 8], twtemph
; shift xmm8 to the left by 16-N_val bytes
        lea twtempl, [pshufb_shf_table]
        movdqu xmm0, [twtempl+N_val]
        movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
        movdqu [ptr_ciphertext + 112 + N_val], xmm8
; shift xmm3 to the right by 16-N_val bytes
        lea twtempl, [pshufb_shf_table +16]
        movdqu xmm0, [twtempl]
        pblendvb xmm3, xmm2 ; xmm0 is implicit
        pxor xmm8, xmm3 ; state register is xmm8; swapping the pxor destination avoids a move from xmm3 to xmm8
; encrypt last block with cipher stealing
        pxor xmm8, [keys] ; ARK
        aesenc xmm8, [keys + 16*1] ; round 1
        aesenc xmm8, [keys + 16*2] ; round 2
        aesenc xmm8, [keys + 16*3] ; round 3
        aesenc xmm8, [keys + 16*4] ; round 4
        aesenc xmm8, [keys + 16*5] ; round 5
        aesenc xmm8, [keys + 16*6] ; round 6
        aesenc xmm8, [keys + 16*7] ; round 7
        aesenc xmm8, [keys + 16*8] ; round 8
        aesenc xmm8, [keys + 16*9] ; round 9
        aesenc xmm8, [keys + 16*10] ; round 10
        aesenc xmm8, [keys + 16*11] ; round 11
        aesenc xmm8, [keys + 16*12] ; round 12
        aesenc xmm8, [keys + 16*13] ; round 13
        aesenclast xmm8, [keys + 16*14] ; round 14
; store last ciphertext value
        movdqu [ptr_ciphertext+16*7], xmm8
        mov rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
        mov rdi, [_gpr + 8*1]
        mov rsi, [_gpr + 8*2]
        movdqa xmm6, [_xmm + 16*0]
        movdqa xmm7, [_xmm + 16*1]
        movdqa xmm8, [_xmm + 16*2]
        movdqa xmm9, [_xmm + 16*3]
        movdqa xmm10, [_xmm + 16*4]
        movdqa xmm11, [_xmm + 16*5]
        movdqa xmm12, [_xmm + 16*6]
        movdqa xmm13, [_xmm + 16*7]
        movdqa xmm14, [_xmm + 16*8]
        movdqa xmm15, [_xmm + 16*9]
        add rsp, VARIABLE_OFFSET
_less_than_128_bytes:
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        sub ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        movdqu [ptr_ciphertext+16*5], xmm6
        sub ptr_ciphertext, 16*1
        and N_val, 15 ; N_val = N_val mod 16
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        sub ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        movdqu [ptr_ciphertext+16*4], xmm5
        sub ptr_ciphertext, 16*2
        and N_val, 15 ; N_val = N_val mod 16
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        sub ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        movdqu [ptr_ciphertext+16*3], xmm4
        sub ptr_ciphertext, 16*3
        and N_val, 15 ; N_val = N_val mod 16
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        sub ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        movdqu [ptr_ciphertext+16*2], xmm3
        sub ptr_ciphertext, 16*4
        and N_val, 15 ; N_val = N_val mod 16
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        sub ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        movdqu [ptr_ciphertext+16*0], xmm1
        movdqu [ptr_ciphertext+16*1], xmm2
        sub ptr_ciphertext, 16*5
        and N_val, 15 ; N_val = N_val mod 16
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        sub ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        movdqu [ptr_ciphertext], xmm1
        sub ptr_ciphertext, 16*6
        and N_val, 15 ; N_val = N_val mod 16
        initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        sub ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        sub ptr_ciphertext, 16*7
        and N_val, 15 ; N_val = N_val mod 16
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
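; In a pshufb control mask, any byte with its most significant bit set writes zero
; into the corresponding destination lane; the 0x8x entries above therefore clear
; the lanes that fall outside the shifted data, while the low entries select the
; source bytes to keep.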
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908
        dq 0x8080808080808080, 0x8080808080808080
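; A constant with the top bit set in every byte: as a pshufb control it zeroes every
; lane, and as a pblendvb control it selects the source operand in every lane; it
; presumably supports the byte-merge in the cipher-stealing path above.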