/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned). Since Nehalem (the original Core i7) there is no
 * performance difference between the two, but movaps is one byte shorter,
 * so that is the one we use for now (likewise for the unaligned variants).
 */
# Constants live in mergeable sections, so the linker can reorder and
# merge them.
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
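/*
 * A hedged C sketch of the operation this mask supports: multiplying an
 * XTS tweak by x in GF(2^128) with the low-first ("ble") byte order.
 * Names here are illustrative, not part of this file:
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };	// lo = bytes 0..7 of the tweak
 *
 *	static void gf128mul_x_ble_ref(struct le128 *t)
 *	{
 *		uint64_t carry = t->hi >> 63;		// bit 127 falls off
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * 0x87 (= x^7 + x^2 + x + 1) is the reduction term of the XTS polynomial
 * x^128 + x^7 + x^2 + x + 1, and the 0x...01 half of the mask handles the
 * cross-qword carry; the branch-free SSE version is the
 * _aesni_gf128mul_x_ble macro further down.
 */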
.section	.rodata.cst16.POLY, "aM", @progbits, 16
POLY:		.octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
TWOONE:		.octa 0x00000001000000000000000000000001
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
SHUF_MASK:	.octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
MASK1:		.octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
MASK2:		.octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
ONE:		.octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
F_MIN_MASK:	.octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.section	.rodata.cst16.enc, "aM", @progbits, 16

# The order of these constants must not change: ALL_F should follow
# SHIFT_MASK, and zero should follow ALL_F.
.section	.rodata, "a", @progbits
SHIFT_MASK:	.octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:		.octa 0xffffffffffffffffffffffffffffffff
		.octa 0x00000000000000000000000000000000
#define	STACK_OFFSET	8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8
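/*
 * For orientation, a hedged C view of the scratch area these offsets
 * describe (illustrative only; the assembly addresses it via %rsp):
 *
 *	struct hash_key_table {			// eight 16-byte slots
 *		uint8_t hash_key[16];		// HashKey<<1 mod poly
 *		uint8_t hash_key_2[16];		// HashKey^2<<1 mod poly
 *		uint8_t hash_key_3[16];		// HashKey^3<<1 mod poly
 *		uint8_t hash_key_4[16];		// HashKey^4<<1 mod poly
 *		uint8_t hash_key_k[16];		// hi64 XOR lo64 of HashKey<<1
 *		uint8_t hash_key_2_k[16];	// same, for HashKey^2<<1
 *		uint8_t hash_key_3_k[16];	// same, for HashKey^3<<1
 *		uint8_t hash_key_4_k[16];	// same, for HashKey^4<<1
 *	};					// sizeof == VARIABLE_OFFSET
 */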
#define arg7	STACK_OFFSET+8(%r14)
#define arg8	STACK_OFFSET+16(%r14)
#define arg9	STACK_OFFSET+24(%r14)
#define arg10	STACK_OFFSET+32(%r14)
#define keysize	2*15*16(%arg1)

#define BSWAP_MASK %xmm10
#define GF128MUL_MASK %xmm10
# The states of %xmm registers %xmm6:%xmm15 are not saved;
# all %xmm registers are clobbered.
	sub	$VARIABLE_OFFSET, %rsp
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers %rax, %r10-%r13 and %xmm0-%xmm6, %xmm13
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13
	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	movdqa	%xmm13, HashKey(%rsp)	# %xmm13 holds HashKey<<1 (mod poly)
	mov	%arg4, %r13		# %r13 = number of bytes to process
/*
 * GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1 )
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
 * GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
 */
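/*
 * One level of Karatsuba is used: with A = a1:a0 and B = b1:b0 split
 * into 64-bit halves, and all additions being XOR,
 *
 *	A*B = a1*b1 * x^128
 *	    + ((a1 + a0)*(b1 + b0) + a1*b1 + a0*b0) * x^64
 *	    + a0*b0
 *
 * so three PCLMULQDQ invocations replace four. The pshufd $78 below
 * swaps the two qwords of an operand, letting a1+a0 (and b1+b0) be
 * formed with a single pxor.
 */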
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\TMP1, \TMP2		# TMP2 holds the middle Karatsuba terms
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform three shifts
					# independently
	pslld	$31, \TMP2		# packed left shift <<31
	pslld	$30, \TMP3		# packed left shift <<30
	pslld	$25, \TMP4		# packed left shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs

	# second phase of the reduction

	movdqa	\GH, \TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform three shifts
					# independently
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP1, \GH		# result is in GH
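/*
 * Reduction sketch: the 256-bit product TMP1:GH is reduced modulo
 * x^128 + x^127 + x^126 + x^121 + 1 without carries. In the first
 * phase the low 128 bits are multiplied by x^31 + x^30 + x^25 per
 * 32-bit lane (pslld $31/$30/$25): the shift counts are 32 minus the
 * distances 1, 2 and 7 of x^127, x^126 and x^121 from x^128. The
 * second phase applies the complementary psrld $1/$2/$7 shifts and
 * folds everything back into GH. Two phases are needed because the
 * 32-bit lane shifts spill bits across lane boundaries, which the
 * pslldq/psrldq byte shifts put back into place.
 */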
# READ_PARTIAL_BLOCK: Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	MOVQ_R64_XMM %rax, \XMMDst
	jz	_done_read_partial_block_\@
	mov	7(\DPTR, \DLEN, 1), %al
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	jmp	_done_read_partial_block_\@
_read_next_byte_lt8_\@:
	mov	-1(\DPTR, \DLEN, 1), %al
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
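/*
 * Behaviourally, READ_PARTIAL_BLOCK is equivalent to this hedged C
 * sketch (names are illustrative). The key property is that it never
 * dereferences bytes past DPTR+DLEN, which may sit on an unmapped page:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void read_partial_block_ref(const uint8_t *dptr,
 *					   unsigned int dlen,
 *					   uint8_t dst[16])
 *	{
 *		memset(dst, 0, 16);		// zero-pad the block
 *		memcpy(dst, dptr, dlen);	// dlen is 1..15
 *	}
 *
 * The assembly builds the value in %rax instead: one qword load when at
 * least 8 bytes are available, then a byte-at-a-time loop for the rest.
 */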
/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 */
.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	arg7, %r10		# %r10 = AAD
	mov	arg8, %r11		# %r11 = aadLen
	movdqu	(%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	jge	_get_AAD_blocks\@

	/* read the last <16B of AAD */
	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

	xor	%r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), \XMM0		# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	ONE(%RIP), \TMP1
	MOVADQ	0(%arg1), \TMP2
	paddd	\TMP1, \XMM0		# INCR Y0
	movdqa	\XMM0, %xmm\index
	MOVADQ	\XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
	shr	$2, %eax		# 128->4, 192->6, 256->8
	add	$5, %eax		# 128->9, 192->11, 256->13
	AESENC	\TMP1, %xmm\index
	jnz	aes_loop_initial_\@
	AESENCLAST \TMP1, %xmm\index	# Last Round
	movdqu	(%arg3 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	movdqa	\TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation

	# apply GHASH on num_initial_blocks blocks
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i.
 */
	MOVADQ	ONE(%RIP), \TMP1
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	0(%arg1), \TMP1
	pshufd	$78, \TMP3, \TMP1
	movdqa	\TMP1, HashKey_k(%rsp)
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
					# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%rsp)
					# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
					# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%rsp)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
					# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%rsp)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%rsp)
	shr	$2, %eax		# 128->4, 192->6, 256->8
	sub	$4, %eax		# 128->0, 192->2, 256->4
	jz	aes_loop_pre_done\@
	AESENC	\TMP2, %xmm\index
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	16*1(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	16*2(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	16*3(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM4, 16*3(%arg2 , %r11 , 1)
	movdqu	\XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg2 , %r11 , 1)

	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\@:
/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
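/*
 * Shape of one 4-wide iteration, as a hedged C sketch (hypothetical
 * names; the real code below interleaves the AES rounds of the four
 * counter blocks with the PCLMULQDQ steps of GHASHing the previous
 * four ciphertext blocks, so the multiplier and AES units stay busy
 * at the same time):
 *
 *	#include <stdint.h>
 *
 *	typedef void (*aes_block_fn)(const void *ks, const uint8_t in[16],
 *				     uint8_t out[16]);
 *
 *	static void ctr_4blocks_ref(aes_block_fn aes, const void *ks,
 *				    uint8_t ctr[16], const uint8_t *in,
 *				    uint8_t *out)
 *	{
 *		for (int b = 0; b < 4; b++) {
 *			uint8_t keystream[16];
 *
 *			// bump the 32-bit big-endian counter in bytes
 *			// 12..15; the asm keeps the block byte-reflected
 *			// and uses paddd ONE instead
 *			for (int i = 15; i >= 12 && ++ctr[i] == 0; i--)
 *				;
 *			aes(ks, ctr, keystream);
 *			for (int i = 0; i < 16; i++)
 *				out[16*b + i] = in[16*b + i] ^ keystream[i];
 *		}
 *	}
 */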
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba
	pshufd	$78, \XMM5, \TMP6
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	movdqa	HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 1
	movaps	0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 2
	pshufd	$78, \XMM6, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 3
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 4
	movdqa	HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 5
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pshufd	$78, \XMM7, \TMP2
	movdqa	HashKey_2(%rsp), \TMP5

	# multiply TMP5 * HashKey using Karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 6
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 7
	movdqa	HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 8
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	# multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	pshufd	$78, \XMM8, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 9
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	shr	$2, %eax		# 128->4, 192->6, 256->8
	sub	$4, %eax		# 128->0, 192->2, 256->4
	jz	aes_loop_par_enc_done
	AESENC	\TMP3, %xmm\index
aes_loop_par_enc_done:
	AESENCLAST \TMP3, \XMM1		# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	16(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	32(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	48(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs

	# second phase of reduction

	movdqa	\XMM5, \TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP1, \XMM5		# result is in XMM5
/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba
	pshufd	$78, \XMM5, \TMP6
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	movdqa	HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 1
	movaps	0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 2
	pshufd	$78, \XMM6, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 3
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 4
	movdqa	HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 5
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pshufd	$78, \XMM7, \TMP2
	movdqa	HashKey_2(%rsp), \TMP5

	# multiply TMP5 * HashKey using Karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 6
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 7
	movdqa	HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 8
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	# multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	pshufd	$78, \XMM8, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 9
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	shr	$2, %eax		# 128->4, 192->6, 256->8
	sub	$4, %eax		# 128->0, 192->2, 256->4
	jz	aes_loop_par_dec_done
	AESENC	\TMP3, %xmm\index
aes_loop_par_dec_done:
	AESENCLAST \TMP3, \XMM1		# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	16(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	32(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	48(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs

	# second phase of reduction

	movdqa	\XMM5, \TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP1, \XMM5		# result is in XMM5
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	pshufd	$78, \XMM1, \TMP2
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa	HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	pshufd	$78, \XMM2, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa	HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	pshufd	$78, \XMM3, \TMP2
	movdqa	HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa	HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)

	pshufd	$78, \XMM4, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa	HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	# middle section of the temp results combined as in Karatsuba algorithm
	pslldq	$8, \TMP4		# left shift TMP4 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	# TMP6:XMMDst holds the result of the accumulated carry-less multiplications

	# first phase of the reduction

	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2		# packed left shifting << 31
	pslld	$30, \TMP3		# packed left shifting << 30
	pslld	$25, \TMP4		# packed left shifting << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP7		# right shift TMP7 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs

	# second phase of the reduction

	movdqa	\XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2		# packed right shift >> 1
	psrld	$2, \TMP3		# packed right shift >> 2
	psrld	$7, \TMP4		# packed right shift >> 7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
/* Encryption of a single block */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	shr	$2, %eax		# 128->4, 192->6, 256->8
	add	$5, %eax		# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address
	AESENCLAST \TMP1, \XMM0
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,           // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,      // Ciphertext input
*                    u64 plaintext_len, // Length of data in bytes for decryption.
*                    u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                       // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                       // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,     // Additional Authentication Data (AAD)
*                    u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                       // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                       // (most likely), 12 or 8.
*
* Keys are pre-expanded and aligned to 16 bytes. We are using the first
* set of 11 keys in the data structure void *aes_ctx.
*
*             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                             Salt  (From the SA)               |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                     Initialization Vector                     |
*             |         (This is the sequence number from IPSec header)      |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                              0x1                              |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD padded to 128 bits with 0
* for example, assume AAD is a u32 vector
*
* if AAD is 8 bytes:
* AAD[3] = {A0, A1};
* padded AAD in xmm register = {A1 A0 0 0}
*
*             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                               SPI (A1)                        |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                 32-bit Sequence Number (A0)                   |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                              0x0                              |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 32-bit Sequence Number
*
* if AAD is 12 bytes:
* AAD[3] = {A0, A1, A2};
* padded AAD in xmm register = {A2 A1 A0 0}
*
*             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                               SPI (A2)                        |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                 64-bit Extended Sequence Number {A1,A0}       |
*             |                                                               |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                              0x0                              |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
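/*
 * Hedged usage sketch from C (kernel context; buffers and error handling
 * are illustrative). The routine clobbers XMM state, so the caller must
 * hold an FPU section:
 *
 *	u8 hash_subkey[16];	// H = AES_K(0^128), precomputed by the caller
 *	u8 iv[16];		// 4-byte salt | 8-byte IV | 0x00000001
 *	u8 tag[16];
 *
 *	kernel_fpu_begin();
 *	aesni_gcm_dec(aes_ctx, dst, src, src_len, iv, hash_subkey,
 *		      aad, aad_len, tag, 16);
 *	kernel_fpu_end();
 *	// the caller then compares `tag` with the received tag, e.g.
 *	// using crypto_memneq(), before releasing the plaintext
 */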
ENTRY(aesni_gcm_dec)

	# Decrypt first few blocks

	jz	_initial_num_blocks_is_0_decrypt
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	je	_zero_cipher_left_decrypt
	je	_four_cipher_left_decrypt
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt
	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0	# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)

	lea	(%arg3,%r11,1), %r10
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm2
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	MOVQ_R64_XMM %xmm0, %rax
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	MOVQ_R64_XMM %xmm0, %rax
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
					# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)

	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	MOVQ_R64_XMM %xmm0, %rax
	je	_return_T_done_decrypt
	je	_return_T_done_decrypt
	je	_return_T_done_decrypt
	jmp	_return_T_done_decrypt
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
ENDPROC(aesni_gcm_dec)
/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,           // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,      // Plaintext input
*                    u64 plaintext_len, // Length of data in bytes for encryption.
*                    u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                       // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                       // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,     // Additional Authentication Data (AAD)
*                    u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,      // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                       // 12 or 8.
*
* Keys are pre-expanded and aligned to 16 bytes. We are using the
* first set of 11 keys in the data structure void *aes_ctx.
*
*             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                             Salt  (From the SA)               |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                     Initialization Vector                     |
*             |         (This is the sequence number from IPSec header)      |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                              0x1                              |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD padded to 128 bits with 0
* for example, assume AAD is a u32 vector
*
* if AAD is 8 bytes:
* AAD[3] = {A0, A1};
* padded AAD in xmm register = {A1 A0 0 0}
*
*             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                               SPI (A1)                        |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                 32-bit Sequence Number (A0)                   |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                              0x0                              |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 32-bit Sequence Number
*
* if AAD is 12 bytes:
* AAD[3] = {A0, A1, A2};
* padded AAD in xmm register = {A2 A1 A0 0}
*
*             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                               SPI (A2)                        |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                 64-bit Extended Sequence Number {A1,A0}       |
*             |                                                               |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*             |                              0x0                              |
*             +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)

	# Encrypt first few blocks

	jz	_initial_num_blocks_is_0_encrypt
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	je	_zero_cipher_left_encrypt
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt
	# Handle the last <16 Byte block separately

	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)

	lea	(%arg3,%r11,1), %r10
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
					# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0
	# shuffle xmm0 back to output as ciphertext

	MOVQ_R64_XMM %xmm0, %rax
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2 , %r11, 1)
	MOVQ_R64_XMM %xmm0, %rax
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
					# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15	# Encrypt(K, Y0)

	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	MOVQ_R64_XMM %xmm0, %rax
	je	_return_T_done_encrypt
	je	_return_T_done_encrypt
	je	_return_T_done_encrypt
	jmp	_return_T_done_encrypt
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
ENDPROC(aesni_gcm_enc)
_key_expansion_256a:
	pshufd	$0b11111111, %xmm1, %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	shufps	$0b10001100, %xmm0, %xmm4
	movaps	%xmm0, (TKEYP)
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
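/*
 * The shufps/pxor cascade above implements the AES-128 key-schedule
 * recurrence; a hedged C reference (hypothetical names), where `t` is
 * the rotated, substituted and rcon-XORed word that AESKEYGENASSIST
 * produces and w[0..3] is the previous round key as 32-bit words:
 *
 *	#include <stdint.h>
 *
 *	static void aes128_expand_step_ref(uint32_t w[4], uint32_t t)
 *	{
 *		w[0] ^= t;	// each new word is the previous round's
 *		w[1] ^= w[0];	// word XORed with the freshly computed
 *		w[2] ^= w[1];	// neighbour -- four chained XORs, done
 *		w[3] ^= w[2];	// above with shufps shuffles and pxors
 *	}
 */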
_key_expansion_192a:
	pshufd	$0b01010101, %xmm1, %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	shufps	$0b10001100, %xmm0, %xmm4

	pshufd	$0b11111111, %xmm0, %xmm3

	shufps	$0b01000100, %xmm0, %xmm6
	movaps	%xmm6, (TKEYP)
	shufps	$0b01001110, %xmm2, %xmm1
	movaps	%xmm1, 0x10(TKEYP)
ENDPROC(_key_expansion_192a)

_key_expansion_192b:
	pshufd	$0b01010101, %xmm1, %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	shufps	$0b10001100, %xmm0, %xmm4

	pshufd	$0b11111111, %xmm0, %xmm3
	movaps	%xmm0, (TKEYP)
ENDPROC(_key_expansion_192b)

_key_expansion_256b:
	pshufd	$0b10101010, %xmm1, %xmm1
	shufps	$0b00010000, %xmm2, %xmm4
	shufps	$0b10001100, %xmm2, %xmm4
	movaps	%xmm2, (TKEYP)
ENDPROC(_key_expansion_256b)
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 */
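/*
 * Hedged usage sketch (kernel context; AESKEYGENASSIST requires an FPU
 * section just like the bulk routines):
 *
 *	struct crypto_aes_ctx ctx;	// 16-byte aligned
 *	int err;
 *
 *	kernel_fpu_begin();
 *	err = aesni_set_key(&ctx, user_key, 16);	// 16, 24 or 32
 *	kernel_fpu_end();
 *	// on success, ctx.key_enc and ctx.key_dec hold the expanded
 *	// schedules and ctx.key_length the key size in bytes
 */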
ENTRY(aesni_set_key)
	movl	(FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl	(FRAME_OFFSET+16)(%esp), %edx	# key_len
	movups	(UKEYP), %xmm0			# user key (first 16 bytes)
	movaps	%xmm0, (KEYP)
	lea	0x10(KEYP), TKEYP		# key addr
	movl	%edx, 480(KEYP)
	pxor	%xmm4, %xmm4			# xmm4 is assumed 0 in _key_expansion_x
	movups	0x10(UKEYP), %xmm2		# other user key
	movaps	%xmm2, (TKEYP)
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_256a
	movq	0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call	_key_expansion_192b
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call	_key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call	_key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call	_key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call	_key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call	_key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call	_key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call	_key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call	_key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call	_key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call	_key_expansion_128

	# build the decryption key schedule by mirroring the encryption
	# round keys in reverse order
	movaps	(KEYP), %xmm0
	movaps	(TKEYP), %xmm1
	movaps	%xmm0, 240(TKEYP)
	movaps	%xmm1, 240(KEYP)
	lea	240-16(TKEYP), UKEYP
	movaps	(KEYP), %xmm0
	movaps	%xmm1, (UKEYP)
ENDPROC(aesni_set_key)
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
	movl	(FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+20)(%esp), INP	# src
	movl	480(KEYP), KLEN			# key length
	movups	(INP), STATE			# input
	movups	STATE, (OUTP)			# output
/*
 * _aesni_enc1:		internal ABI
 *	KEYP:		key struct pointer
 *	STATE:		initial state (input)
 *	STATE:		final state (output)
 */
	movaps	(KEYP), KEY		# key
	pxor	KEY, STATE		# round 0
	lea	0x20(TKEYP), TKEYP
	movaps	-0x60(TKEYP), KEY
	movaps	-0x50(TKEYP), KEY
	movaps	-0x40(TKEYP), KEY
	movaps	-0x30(TKEYP), KEY
	movaps	-0x20(TKEYP), KEY
	movaps	-0x10(TKEYP), KEY
	movaps	0x10(TKEYP), KEY
	movaps	0x20(TKEYP), KEY
	movaps	0x30(TKEYP), KEY
	movaps	0x40(TKEYP), KEY
	movaps	0x50(TKEYP), KEY
	movaps	0x60(TKEYP), KEY
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE
ENDPROC(_aesni_enc1)
/*
 * _aesni_enc4:		internal ABI
 *	KEYP:		key struct pointer
 *	STATE1:		initial state (input)
 *	STATE1:		final state (output)
 */
	movaps	(KEYP), KEY		# key
	pxor	KEY, STATE1		# round 0
	lea	0x20(TKEYP), TKEYP
	movaps	-0x60(TKEYP), KEY
	movaps	-0x50(TKEYP), KEY
	movaps	-0x40(TKEYP), KEY
	movaps	-0x30(TKEYP), KEY
	movaps	-0x20(TKEYP), KEY
	movaps	-0x10(TKEYP), KEY
	movaps	0x10(TKEYP), KEY
	movaps	0x20(TKEYP), KEY
	movaps	0x30(TKEYP), KEY
	movaps	0x40(TKEYP), KEY
	movaps	0x50(TKEYP), KEY
	movaps	0x60(TKEYP), KEY
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
ENDPROC(_aesni_enc4)
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
	movl	(FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+20)(%esp), INP	# src
	mov	480(KEYP), KLEN			# key length
	movups	(INP), STATE			# input
	movups	STATE, (OUTP)			# output

/*
 * _aesni_dec1:		internal ABI
 *	KEYP:		key struct pointer
 *	STATE:		initial state (input)
 *	STATE:		final state (output)
 */
	movaps	(KEYP), KEY		# key
	pxor	KEY, STATE		# round 0
	lea	0x20(TKEYP), TKEYP
	movaps	-0x60(TKEYP), KEY
	movaps	-0x50(TKEYP), KEY
	movaps	-0x40(TKEYP), KEY
	movaps	-0x30(TKEYP), KEY
	movaps	-0x20(TKEYP), KEY
	movaps	-0x10(TKEYP), KEY
	movaps	0x10(TKEYP), KEY
	movaps	0x20(TKEYP), KEY
	movaps	0x30(TKEYP), KEY
	movaps	0x40(TKEYP), KEY
	movaps	0x50(TKEYP), KEY
	movaps	0x60(TKEYP), KEY
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE
ENDPROC(_aesni_dec1)
/*
 * _aesni_dec4:		internal ABI
 *	KEYP:		key struct pointer
 *	STATE1:		initial state (input)
 *	STATE1:		final state (output)
 */
	movaps	(KEYP), KEY		# key
	pxor	KEY, STATE1		# round 0
	lea	0x20(TKEYP), TKEYP
	movaps	-0x60(TKEYP), KEY
	movaps	-0x50(TKEYP), KEY
	movaps	-0x40(TKEYP), KEY
	movaps	-0x30(TKEYP), KEY
	movaps	-0x20(TKEYP), KEY
	movaps	-0x10(TKEYP), KEY
	movaps	0x10(TKEYP), KEY
	movaps	0x20(TKEYP), KEY
	movaps	0x30(TKEYP), KEY
	movaps	0x40(TKEYP), KEY
	movaps	0x50(TKEYP), KEY
	movaps	0x60(TKEYP), KEY
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
ENDPROC(_aesni_dec4)
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
	movl	(FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+24)(%esp), INP	# src
	movl	(FRAME_OFFSET+28)(%esp), LEN	# len
	test	LEN, LEN		# check length
	movups	(INP), STATE1
	movups	0x10(INP), STATE2
	movups	0x20(INP), STATE3
	movups	0x30(INP), STATE4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	movups	(INP), STATE1
	movups	STATE1, (OUTP)
ENDPROC(aesni_ecb_enc)
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_dec)
	movl	(FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+24)(%esp), INP	# src
	movl	(FRAME_OFFSET+28)(%esp), LEN	# len
	movups	(INP), STATE1
	movups	0x10(INP), STATE2
	movups	0x20(INP), STATE3
	movups	0x30(INP), STATE4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	movups	(INP), STATE1
	movups	STATE1, (OUTP)
ENDPROC(aesni_ecb_dec)
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
	movl	(FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+28)(%esp), INP	# src
	movl	(FRAME_OFFSET+32)(%esp), LEN	# len
	movl	(FRAME_OFFSET+36)(%esp), IVP	# iv
	movups	(IVP), STATE	# load iv as initial state
	movups	(INP), IN	# load input
	movups	STATE, (OUTP)	# store output
ENDPROC(aesni_cbc_enc)
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
	movl	(FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl	(FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl	(FRAME_OFFSET+28)(%esp), INP	# src
	movl	(FRAME_OFFSET+32)(%esp), LEN	# len
	movl	(FRAME_OFFSET+36)(%esp), IVP	# iv
	jb	.Lcbc_dec_just_ret
	movups	0x10(INP), IN2
	movups	0x20(INP), IN3
	movups	0x30(INP), IN4
	movups	0x20(INP), IN1
	movups	0x30(INP), IN2
	movups	0x10(INP), IN2
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	movups	STATE, (OUTP)
ENDPROC(aesni_cbc_dec)
.pushsection .rodata
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
	movaps	.Lbswap_mask, BSWAP_MASK
	PSHUFB_XMM BSWAP_MASK CTR
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
ENDPROC(_aesni_inc_init)
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
	PSHUFB_XMM BSWAP_MASK IV
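/*
 * A hedged C equivalent of the counter trick (illustrative names): keep
 * the counter byte-reflected so it can be bumped with an integer add,
 * and swap back to big endian only when the IV is consumed:
 *
 *	#include <stdint.h>
 *
 *	// ctr_le: low qword of the IV, already byte-reflected;
 *	// returns the big-endian qword to splice back into the IV
 *	static uint64_t aesni_inc_ref(uint64_t *ctr_le)
 *	{
 *		*ctr_le += 1;			// the add on TCTR_LOW
 *		return __builtin_bswap64(*ctr_le);	// PSHUFB back to BE
 *	}
 *
 * The assembly additionally handles the carry out of the low qword,
 * which this sketch does not model.
 */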
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	jb	.Lctr_enc_just_ret
	call	_aesni_inc_init
	movups	0x10(INP), IN2
	movups	0x20(INP), IN3
	movups	0x30(INP), IN4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	movups	STATE, (OUTP)
ENDPROC(aesni_ctr_enc)
/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 *	CTR == temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	pand GF128MUL_MASK, CTR; \
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
	leaq	_aesni_enc4, %r11
	leaq	_aesni_dec4, %rax
	movdqa	.Lgf128mul_x_ble_mask, GF128MUL_MASK

	movdqu	0x00(INP), INC
	movdqu	IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x10(INP), INC
	movdqu	IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x20(INP), INC
	movdqu	IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x30(INP), INC
	movdqu	IV, 0x30(OUTP)

	movdqu	0x00(OUTP), INC
	movdqu	STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x40(INP), INC
	movdqu	IV, 0x40(OUTP)

	movdqu	0x10(OUTP), INC
	movdqu	STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x50(INP), INC
	movdqu	IV, 0x50(OUTP)

	movdqu	0x20(OUTP), INC
	movdqu	STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x60(INP), INC
	movdqu	IV, 0x60(OUTP)

	movdqu	0x30(OUTP), INC
	movdqu	STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu	0x70(INP), INC
	movdqu	IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()

	movdqu	0x40(OUTP), INC
	movdqu	STATE1, 0x40(OUTP)

	movdqu	0x50(OUTP), INC
	movdqu	STATE2, 0x50(OUTP)

	movdqu	0x60(OUTP), INC
	movdqu	STATE3, 0x60(OUTP)

	movdqu	0x70(OUTP), INC
	movdqu	STATE4, 0x70(OUTP)

ENDPROC(aesni_xts_crypt8)