/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement the AES algorithm using Intel AES-NI instructions.
 *
 * The white paper on the AES-NI instructions can be downloaded from:
 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned). Which instruction is used has made no performance
 * difference since Nehalem (the original Core i7) was released, but movaps
 * is one byte shorter, so that is the one we use for now (same for the
 * unaligned variants).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
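
/*
 * For orientation, the offsets above describe the layout of the C-side
 * context that %arg2 points at. A minimal C sketch consistent with these
 * offsets (field names are illustrative; the authoritative definition is
 * struct gcm_context_data on the C side of the driver):
 *
 *	struct gcm_context_data {
 *		u8  aad_hash[16];		// AadHash,      offset 16*0
 *		u64 aad_length;			// AadLen,       offset 16*1
 *		u64 in_length;			// InLen,        offset 16*1+8
 *		u8  partial_block_enc_key[16];	// PBlockEncKey, offset 16*2
 *		u8  orig_IV[16];		// OrigIV,       offset 16*3
 *		u8  current_counter[16];	// CurCount,     offset 16*4
 *		u64 partial_block_len;		// PBlockLen,    offset 16*5
 *		u64 unused;			// pad up to HashKey at 16*6
 *		u8  hash_keys[16 * 8];		// HashKey .. HashKey_4_k
 *	};
 */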

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm
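
/*
 * Illustrative only (not assembled): the "HashKey<<1 mod poly" step above,
 * written as plain C on the two 64-bit halves of the byte-reflected H.
 * POLY is the bit-reflected GHASH polynomial constant defined earlier.
 *
 *	static void hashkey_shl1_mod_poly(u64 *hi, u64 *lo)
 *	{
 *		u64 carry = *hi >> 63;		// bit shifted out of the top
 *
 *		*hi = (*hi << 1) | (*lo >> 63);	// 128-bit left shift by one
 *		*lo <<= 1;
 *		if (carry) {			// reduce by the field polynomial
 *			*hi ^= 0xC200000000000000ULL;	// high half of POLY
 *			*lo ^= 0x0000000000000001ULL;	// low half of POLY
 *		}
 *	}
 *
 * The pshufd/pcmpeqd/pand sequence above performs the same conditional XOR
 * branchlessly, using TWOONE to turn the carried-out bit into an all-ones
 * mask for POLY.
 */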

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor %r11d, %r11d
	mov %r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11d, %r11d	# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0		# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al, (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
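
/*
 * Worked example of the dispatch above: with no partial block pending and
 * %arg5 = 112 bytes (7 blocks), %r13 = 112 and %r12 = 112 & (3<<4) = 0x30,
 * so 3 initial blocks are handled by INITIAL_BLOCKS_ENC_DEC. Since 64 bytes
 * then remain, the same macro also encrypts the next 4 counter blocks while
 * precomputing, the 4-wide main loop is skipped (%r13 hits zero), and
 * GHASH_LAST_4 folds those 4 blocks into the hash.
 */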

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*8)
	MOVQ_R64_XMM	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
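
/*
 * In GCM terms, GCM_COMPLETE finishes the tag as follows: with S the running
 * GHASH over AAD and ciphertext, the 128-bit lengths block len(A)||len(C)
 * (both in bits) is folded in, so
 *
 *	T = E(K, Y0) XOR ((S XOR (len(A)||len(C))) * H  mod poly)
 *
 * truncated to auth_tag_len bytes by the _T_* paths above.
 */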

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1)
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 *
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = a0*b1 + a1*b0 (middle term)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2		# packed left shift <<31
	pslld     $30, \TMP3		# packed left shift <<30
	pslld     $25, \TMP4		# packed left shift <<25
	pxor      \TMP3, \TMP2		# xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5		# right shift TMP5 1 DW
	pslldq    $12, \TMP2		# left shift TMP2 3 DWs
	pxor      \TMP2, \GH

	# second phase of the reduction

	movdqa    \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2		# packed right shift >>1
	psrld     $2,\TMP3		# packed right shift >>2
	psrld     $7,\TMP4		# packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH		# result is in GH
.endm
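
/*
 * The Karatsuba split used by GHASH_MUL: writing a = a1*x^64 + a0 and
 * b = b1*x^64 + b0 over GF(2),
 *
 *	a*b = a1b1*x^128 + (a1b0 + a0b1)*x^64 + a0b0
 *	      with  a1b0 + a0b1 = (a1+a0)(b1+b0) + a1b1 + a0b0,
 *
 * so three PCLMULQDQs replace four. An illustrative C intrinsics sketch of
 * the same 256-bit carry-less product, prior to reduction (not part of this
 * file's build; all intrinsics are standard SSE2/PCLMUL ones):
 *
 *	#include <emmintrin.h>
 *	#include <wmmintrin.h>	// _mm_clmulepi64_si128 (PCLMULQDQ)
 *
 *	static void clmul_karatsuba(__m128i a, __m128i b,
 *				    __m128i *hi, __m128i *lo)
 *	{
 *		__m128i t1 = _mm_clmulepi64_si128(a, b, 0x11); // a1*b1
 *		__m128i t0 = _mm_clmulepi64_si128(a, b, 0x00); // a0*b0
 *		__m128i am = _mm_xor_si128(a, _mm_shuffle_epi32(a, 78));
 *		__m128i bm = _mm_xor_si128(b, _mm_shuffle_epi32(b, 78));
 *		__m128i tm = _mm_clmulepi64_si128(am, bm, 0x00);
 *
 *		tm  = _mm_xor_si128(tm, _mm_xor_si128(t0, t1)); // a1b0+a0b1
 *		*lo = _mm_xor_si128(t0, _mm_slli_si128(tm, 8));
 *		*hi = _mm_xor_si128(t1, _mm_srli_si128(tm, 8));
 *	}
 *
 * The HashKey_*_k values precomputed above cache the (b1+b0) halves so the
 * bulk loops skip that XOR per block.
 */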

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp $8, \DLEN
	jl _read_lt8_\@
	mov (\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub $8, \DLEN
	jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
	shl $8, %rax
	mov 7(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1
	por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
	shl $8, %rax
	mov -1(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
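
/*
 * Illustrative only: the byte-gather above as C. The point of the two loops
 * is that exactly DLEN bytes are dereferenced, so a partial block at the
 * end of a buffer never reads past it.
 *
 *	static void read_partial_block(const u8 *p, unsigned int len, u8 *dst)
 *	{
 *		unsigned int i;
 *
 *		memset(dst, 0, 16);		// zero-pad the 128-bit result
 *		for (i = 0; i < len; i++)	// 0 < len < 16
 *			dst[i] = p[i];
 *	}
 */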

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11	# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm
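
/*
 * The recurrence CALC_AAD_HASH implements is the standard GHASH chain over
 * the (zero-padded) AAD blocks A_1..A_n:
 *
 *	X_0 = 0;   X_i = (X_{i-1} XOR A_i) * H   (carry-less, mod poly)
 *
 * with the final X_n stored as AadHash for GCM_ENC_DEC to continue from.
 */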

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm3
	PSHUFB_XMM %xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
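
/*
 * Example of the stitching PARTIAL_BLOCK does across update calls: an update
 * with 5 bytes leaves PBlockLen = 5, stores E(K, Yn) in PBlockEncKey, and
 * XORs the 5 ciphertext bytes into AadHash without multiplying by H yet. A
 * following 20-byte update first consumes 11 bytes here to complete that
 * 16-byte block, folds it into the hash with GHASH_MUL, clears PBlockLen,
 * and returns with DATA_OFFSET = 11, so GCM_ENC_DEC handles the remaining
 * 9 bytes through its normal block/tail paths.
 */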

/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 */


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14

	movdqu AadHash(%arg2), %xmm\i		# load the current hash

	# start AES for num_initial_blocks blocks

	movdqu CurCount(%arg2), \XMM0		# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
.irpc index, \i_seq
	paddd		\TMP1, \XMM0		# INCR Y0
.ifc \operation, dec
	movdqa		\XMM0, %xmm\index
.else
	MOVADQ		\XMM0, %xmm\index
.endif
	PSHUFB_XMM	%xmm14, %xmm\index	# perform a 16 byte swap
	pxor		\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg4 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11

.ifc \operation, dec
	movdqa	\TMP1, %xmm\index
.endif
	PSHUFB_XMM	%xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor %xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor %xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
 *
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 */
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
	pxor	\TMP1, \XMM1
	pxor	\TMP1, \XMM2
	pxor	\TMP1, \XMM3
	pxor	\TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	AESENC	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ	(%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM1
.ifc \operation, dec
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM1
.endif
	movdqu	16*1(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM2
.ifc \operation, dec
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM2
.endif
	movdqu	16*2(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM3
.ifc \operation, dec
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM3
.endif
	movdqu	16*3(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM4
.ifc \operation, dec
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM4
.else
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add	$64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
	# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\@:

.endm

/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg3, %arg4 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using Karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

aes_loop_par_enc\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * arg1, %arg3, %arg4 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using Karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqu	  HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqu	  HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqu	  HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqu	  HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in Karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
	# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm


/* Encryption of a single block
 * uses eax & r10
 */

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
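
/*
 * Round-count arithmetic used above and in the bulk macros: keysize holds
 * the AES key length in bytes (aesni_set_key below stores it at offset 480
 * of the key schedule). 16 >> 2 = 4, 24 >> 2 = 6, 32 >> 2 = 8; adding 5
 * gives 9, 11 or 13 AESENC rounds, and the final AESENCLAST brings the
 * total to the 10/12/14 rounds AES-128/192/256 require.
 */
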
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   struct gcm_context_data *data
*                                      // Context data
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                      // Context data
*                    u8 *out,          // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,     // Plaintext input
*                    u64 plaintext_len, // Length of data in bytes for encryption.
*                    u8 *iv,           // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,  // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,    // Additional Authentication Data (AAD)
*                    u64 aad_len,      // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,     // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                      // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                         // context data
*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,      // Additional Authentication Data (AAD)
*                     u64 aad_len)        // Length of AAD in bytes.
*/
ENTRY(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4, %arg5, %arg6
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                           struct gcm_context_data *data,
*                                               // context data
*                           u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                           const u8 *in,       // Plaintext input
*                           u64 plaintext_len)  // Length of data in bytes for encryption.
*/
ENTRY(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                           struct gcm_context_data *data,
*                                               // context data
*                           u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                           const u8 *in,       // Ciphertext input
*                           u64 plaintext_len)  // Length of data in bytes for decryption.
*/
ENTRY(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                         struct gcm_context_data *data,
*                                              // context data
*                         u8 *auth_tag,        // Authenticated Tag output.
*                         u64 auth_tag_len);   // Authenticated Tag Length in bytes. Valid values
*                                              // are 16 (most likely), 12 or 8.
*/
ENTRY(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_finalize)
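
/*
 * Typical C-side calling sequence for the incremental interface above (a
 * sketch only; the real callers live in the aesni glue code, and tag
 * comparison/error handling is omitted):
 *
 *	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *	while (more input)
 *		aesni_gcm_enc_update(aes_ctx, &data, out, in, chunk_len);
 *	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 */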

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
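
/*
 * Note on the shufps/pxor pairs in the key-expansion helpers: with %xmm4
 * zeroed by the caller, the two shufps+pxor steps turn the four dwords
 * [d0 d1 d2 d3] of %xmm0 into the running XOR
 * [d0, d0^d1, d0^d1^d2, d0^d1^d2^d3], which matches the
 * w[i] = w[i-1] ^ w[i-4] recurrence of the AES key schedule; the broadcast
 * SubWord/RotWord result in %xmm1 is then XORed on top.
 */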
1774
1775 .align 4
1776 _key_expansion_192a:
1777 pshufd $0b01010101, %xmm1, %xmm1
1778 shufps $0b00010000, %xmm0, %xmm4
1779 pxor %xmm4, %xmm0
1780 shufps $0b10001100, %xmm0, %xmm4
1781 pxor %xmm4, %xmm0
1782 pxor %xmm1, %xmm0
1783
1784 movaps %xmm2, %xmm5
1785 movaps %xmm2, %xmm6
1786 pslldq $4, %xmm5
1787 pshufd $0b11111111, %xmm0, %xmm3
1788 pxor %xmm3, %xmm2
1789 pxor %xmm5, %xmm2
1790
1791 movaps %xmm0, %xmm1
1792 shufps $0b01000100, %xmm0, %xmm6
1793 movaps %xmm6, (TKEYP)
1794 shufps $0b01001110, %xmm2, %xmm1
1795 movaps %xmm1, 0x10(TKEYP)
1796 add $0x20, TKEYP
1797 ret
1798 ENDPROC(_key_expansion_192a)
1799
1800 .align 4
1801 _key_expansion_192b:
1802 pshufd $0b01010101, %xmm1, %xmm1
1803 shufps $0b00010000, %xmm0, %xmm4
1804 pxor %xmm4, %xmm0
1805 shufps $0b10001100, %xmm0, %xmm4
1806 pxor %xmm4, %xmm0
1807 pxor %xmm1, %xmm0
1808
1809 movaps %xmm2, %xmm5
1810 pslldq $4, %xmm5
1811 pshufd $0b11111111, %xmm0, %xmm3
1812 pxor %xmm3, %xmm2
1813 pxor %xmm5, %xmm2
1814
1815 movaps %xmm0, (TKEYP)
1816 add $0x10, TKEYP
1817 ret
1818 ENDPROC(_key_expansion_192b)
1819
1820 .align 4
1821 _key_expansion_256b:
1822 pshufd $0b10101010, %xmm1, %xmm1
1823 shufps $0b00010000, %xmm2, %xmm4
1824 pxor %xmm4, %xmm2
1825 shufps $0b10001100, %xmm2, %xmm4
1826 pxor %xmm4, %xmm2
1827 pxor %xmm1, %xmm2
1828 movaps %xmm2, (TKEYP)
1829 add $0x10, TKEYP
1830 ret
1831 ENDPROC(_key_expansion_256b)
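/*
 * The helpers above implement the FIPS-197 key schedule recurrence:
 * AESKEYGENASSIST leaves SubWord(w) and RotWord(SubWord(w)) ^ rcon
 * variants in its result dwords, pshufd broadcasts whichever one the
 * step needs, and the shufps/pxor ladder builds the running prefix-xor
 * of the previous round key's four words. A plain C model of the
 * recurrence being computed (illustrative only; assumes an
 * aes_sbox[256] table):
 *
 *	static u32 rot_word(u32 w)	// rotate bytes left by one
 *	{
 *		return (w >> 8) | (w << 24);
 *	}
 *
 *	static u32 sub_word(u32 w)	// S-box each byte
 *	{
 *		return aes_sbox[w & 0xff] |
 *		       aes_sbox[(w >> 8) & 0xff] << 8 |
 *		       aes_sbox[(w >> 16) & 0xff] << 16 |
 *		       (u32)aes_sbox[w >> 24] << 24;
 *	}
 *
 *	// nk = key words (4/6/8), nr = nk + 6, w[] holds 4 * (nr + 1)
 *	for (i = nk; i < 4 * (nr + 1); i++) {
 *		u32 t = w[i - 1];
 *
 *		if (i % nk == 0)
 *			t = sub_word(rot_word(t)) ^ rcon[i / nk - 1];
 *		else if (nk == 8 && i % nk == 4)
 *			t = sub_word(t);	// extra step for AES-256
 *		w[i] = w[i - nk] ^ t;
 *	}
 */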
1832
1833 /*
1834 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1835 * unsigned int key_len)
1836 */
1837 ENTRY(aesni_set_key)
1838 FRAME_BEGIN
1839 #ifndef __x86_64__
1840 pushl KEYP
1841 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1842 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1843 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1844 #endif
1845 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1846 movaps %xmm0, (KEYP)
1847 lea 0x10(KEYP), TKEYP # key addr
1848 movl %edx, 480(KEYP)
1849 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1850 cmp $24, %dl
1851 jb .Lenc_key128
1852 je .Lenc_key192
1853 movups 0x10(UKEYP), %xmm2 # other user key
1854 movaps %xmm2, (TKEYP)
1855 add $0x10, TKEYP
1856 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1857 call _key_expansion_256a
1858 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1859 call _key_expansion_256b
1860 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1861 call _key_expansion_256a
1862 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1863 call _key_expansion_256b
1864 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1865 call _key_expansion_256a
1866 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1867 call _key_expansion_256b
1868 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1869 call _key_expansion_256a
1870 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1871 call _key_expansion_256b
1872 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1873 call _key_expansion_256a
1874 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1875 call _key_expansion_256b
1876 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1877 call _key_expansion_256a
1878 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1879 call _key_expansion_256b
1880 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1881 call _key_expansion_256a
1882 jmp .Ldec_key
1883 .Lenc_key192:
1884 movq 0x10(UKEYP), %xmm2 # other user key
1885 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1886 call _key_expansion_192a
1887 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1888 call _key_expansion_192b
1889 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1890 call _key_expansion_192a
1891 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1892 call _key_expansion_192b
1893 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1894 call _key_expansion_192a
1895 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1896 call _key_expansion_192b
1897 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1898 call _key_expansion_192a
1899 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1900 call _key_expansion_192b
1901 jmp .Ldec_key
1902 .Lenc_key128:
1903 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1904 call _key_expansion_128
1905 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1906 call _key_expansion_128
1907 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1908 call _key_expansion_128
1909 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1910 call _key_expansion_128
1911 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1912 call _key_expansion_128
1913 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1914 call _key_expansion_128
1915 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1916 call _key_expansion_128
1917 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1918 call _key_expansion_128
1919 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1920 call _key_expansion_128
1921 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1922 call _key_expansion_128
1923 .Ldec_key:
1924 sub $0x10, TKEYP
1925 movaps (KEYP), %xmm0
1926 movaps (TKEYP), %xmm1
1927 movaps %xmm0, 240(TKEYP)
1928 movaps %xmm1, 240(KEYP)
1929 add $0x10, KEYP
1930 lea 240-16(TKEYP), UKEYP
1931 .align 4
1932 .Ldec_key_loop:
1933 movaps (KEYP), %xmm0
1934 AESIMC %xmm0 %xmm1
1935 movaps %xmm1, (UKEYP)
1936 add $0x10, KEYP
1937 sub $0x10, UKEYP
1938 cmp TKEYP, KEYP
1939 jb .Ldec_key_loop
1940 xor AREG, AREG
1941 #ifndef __x86_64__
1942 popl KEYP
1943 #endif
1944 FRAME_END
1945 ret
1946 ENDPROC(aesni_set_key)
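/*
 * The fixed displacements used above follow the layout of
 * struct crypto_aes_ctx (see include/crypto/aes.h for the authoritative
 * definition); schematically:
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// bytes 0..239: encryption keys
 *		u32 key_dec[60];	// bytes 240..479: decryption keys
 *		u32 key_length;		// byte 480: 16, 24 or 32
 *	};
 *
 * which is why aesni_set_key stores the key length at 480(KEYP) and
 * .Ldec_key builds the AESIMC-transformed decryption schedule in the
 * 240-byte region above the encryption one.
 */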
1947
1948 /*
1949 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1950 */
1951 ENTRY(aesni_enc)
1952 FRAME_BEGIN
1953 #ifndef __x86_64__
1954 pushl KEYP
1955 pushl KLEN
1956 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1957 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1958 movl (FRAME_OFFSET+20)(%esp), INP # src
1959 #endif
1960 movl 480(KEYP), KLEN # key length
1961 movups (INP), STATE # input
1962 call _aesni_enc1
1963 movups STATE, (OUTP) # output
1964 #ifndef __x86_64__
1965 popl KLEN
1966 popl KEYP
1967 #endif
1968 FRAME_END
1969 ret
1970 ENDPROC(aesni_enc)
1971
1972 /*
1973 * _aesni_enc1: internal ABI
1974 * input:
1975 * KEYP: key struct pointer
1976 * KLEN: key length
1977 * STATE: initial state (input)
1978 * output:
1979 * STATE: final state (output)
1980 * changed:
1981 * KEY
1982 * TKEYP (T1)
1983 */
1984 .align 4
1985 _aesni_enc1:
1986 movaps (KEYP), KEY # key
1987 mov KEYP, TKEYP
1988 pxor KEY, STATE # round 0
1989 add $0x30, TKEYP
1990 cmp $24, KLEN
1991 jb .Lenc128
1992 lea 0x20(TKEYP), TKEYP
1993 je .Lenc192
1994 add $0x20, TKEYP
1995 movaps -0x60(TKEYP), KEY
1996 AESENC KEY STATE
1997 movaps -0x50(TKEYP), KEY
1998 AESENC KEY STATE
1999 .align 4
2000 .Lenc192:
2001 movaps -0x40(TKEYP), KEY
2002 AESENC KEY STATE
2003 movaps -0x30(TKEYP), KEY
2004 AESENC KEY STATE
2005 .align 4
2006 .Lenc128:
2007 movaps -0x20(TKEYP), KEY
2008 AESENC KEY STATE
2009 movaps -0x10(TKEYP), KEY
2010 AESENC KEY STATE
2011 movaps (TKEYP), KEY
2012 AESENC KEY STATE
2013 movaps 0x10(TKEYP), KEY
2014 AESENC KEY STATE
2015 movaps 0x20(TKEYP), KEY
2016 AESENC KEY STATE
2017 movaps 0x30(TKEYP), KEY
2018 AESENC KEY STATE
2019 movaps 0x40(TKEYP), KEY
2020 AESENC KEY STATE
2021 movaps 0x50(TKEYP), KEY
2022 AESENC KEY STATE
2023 movaps 0x60(TKEYP), KEY
2024 AESENC KEY STATE
2025 movaps 0x70(TKEYP), KEY
2026 AESENCLAST KEY STATE
2027 ret
2028 ENDPROC(_aesni_enc1)
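/*
 * The offset juggling above (add $0x30, then the lea/add before
 * .Lenc192/.Lenc128) biases TKEYP so that the final eleven rounds sit
 * at fixed displacements for all three key sizes, and the 256/192-bit
 * paths simply fall through after their extra rounds. A sketch of the
 * same flow with intrinsics (illustrative, not the kernel's code):
 *
 *	#include <wmmintrin.h>	// AES-NI intrinsics, build with -maes
 *
 *	// rk[0..nr] are the expanded round keys, nr = 10, 12 or 14
 *	static __m128i aes_encrypt_block(__m128i state,
 *					 const __m128i *rk, int nr)
 *	{
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);	// round 0
 *		for (i = 1; i < nr; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[nr]);
 *	}
 */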
2029
2030 /*
2031 * _aesni_enc4: internal ABI
2032 * input:
2033 * KEYP: key struct pointer
2034 * KLEN: key length
2035 * STATE1: initial state (input)
2036 * STATE2
2037 * STATE3
2038 * STATE4
2039 * output:
2040 * STATE1: final state (output)
2041 * STATE2
2042 * STATE3
2043 * STATE4
2044 * changed:
2045 * KEY
2046 * TKEYP (T1)
2047 */
2048 .align 4
2049 _aesni_enc4:
2050 movaps (KEYP), KEY # key
2051 mov KEYP, TKEYP
2052 pxor KEY, STATE1 # round 0
2053 pxor KEY, STATE2
2054 pxor KEY, STATE3
2055 pxor KEY, STATE4
2056 add $0x30, TKEYP
2057 cmp $24, KLEN
2058 jb .L4enc128
2059 lea 0x20(TKEYP), TKEYP
2060 je .L4enc192
2061 add $0x20, TKEYP
2062 movaps -0x60(TKEYP), KEY
2063 AESENC KEY STATE1
2064 AESENC KEY STATE2
2065 AESENC KEY STATE3
2066 AESENC KEY STATE4
2067 movaps -0x50(TKEYP), KEY
2068 AESENC KEY STATE1
2069 AESENC KEY STATE2
2070 AESENC KEY STATE3
2071 AESENC KEY STATE4
2072 #.align 4
2073 .L4enc192:
2074 movaps -0x40(TKEYP), KEY
2075 AESENC KEY STATE1
2076 AESENC KEY STATE2
2077 AESENC KEY STATE3
2078 AESENC KEY STATE4
2079 movaps -0x30(TKEYP), KEY
2080 AESENC KEY STATE1
2081 AESENC KEY STATE2
2082 AESENC KEY STATE3
2083 AESENC KEY STATE4
2084 #.align 4
2085 .L4enc128:
2086 movaps -0x20(TKEYP), KEY
2087 AESENC KEY STATE1
2088 AESENC KEY STATE2
2089 AESENC KEY STATE3
2090 AESENC KEY STATE4
2091 movaps -0x10(TKEYP), KEY
2092 AESENC KEY STATE1
2093 AESENC KEY STATE2
2094 AESENC KEY STATE3
2095 AESENC KEY STATE4
2096 movaps (TKEYP), KEY
2097 AESENC KEY STATE1
2098 AESENC KEY STATE2
2099 AESENC KEY STATE3
2100 AESENC KEY STATE4
2101 movaps 0x10(TKEYP), KEY
2102 AESENC KEY STATE1
2103 AESENC KEY STATE2
2104 AESENC KEY STATE3
2105 AESENC KEY STATE4
2106 movaps 0x20(TKEYP), KEY
2107 AESENC KEY STATE1
2108 AESENC KEY STATE2
2109 AESENC KEY STATE3
2110 AESENC KEY STATE4
2111 movaps 0x30(TKEYP), KEY
2112 AESENC KEY STATE1
2113 AESENC KEY STATE2
2114 AESENC KEY STATE3
2115 AESENC KEY STATE4
2116 movaps 0x40(TKEYP), KEY
2117 AESENC KEY STATE1
2118 AESENC KEY STATE2
2119 AESENC KEY STATE3
2120 AESENC KEY STATE4
2121 movaps 0x50(TKEYP), KEY
2122 AESENC KEY STATE1
2123 AESENC KEY STATE2
2124 AESENC KEY STATE3
2125 AESENC KEY STATE4
2126 movaps 0x60(TKEYP), KEY
2127 AESENC KEY STATE1
2128 AESENC KEY STATE2
2129 AESENC KEY STATE3
2130 AESENC KEY STATE4
2131 movaps 0x70(TKEYP), KEY
2132 AESENCLAST KEY STATE1 # last round
2133 AESENCLAST KEY STATE2
2134 AESENCLAST KEY STATE3
2135 AESENCLAST KEY STATE4
2136 ret
2137 ENDPROC(_aesni_enc4)
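/*
 * _aesni_enc4 runs four independent blocks through each round key
 * before advancing. AESENC has multi-cycle latency but pipelined
 * throughput, so interleaving four dependency chains keeps the AES
 * unit busy instead of stalling on each block's previous round.
 */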
2138
2139 /*
2140 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2141 */
2142 ENTRY(aesni_dec)
2143 FRAME_BEGIN
2144 #ifndef __x86_64__
2145 pushl KEYP
2146 pushl KLEN
2147 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2148 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2149 movl (FRAME_OFFSET+20)(%esp), INP # src
2150 #endif
2151 mov 480(KEYP), KLEN # key length
2152 add $240, KEYP
2153 movups (INP), STATE # input
2154 call _aesni_dec1
2155 movups STATE, (OUTP) # output
2156 #ifndef __x86_64__
2157 popl KLEN
2158 popl KEYP
2159 #endif
2160 FRAME_END
2161 ret
2162 ENDPROC(aesni_dec)
2163
2164 /*
2165 * _aesni_dec1: internal ABI
2166 * input:
2167 * KEYP: key struct pointer
2168 * KLEN: key length
2169 * STATE: initial state (input)
2170 * output:
2171 * STATE: final state (output)
2172 * changed:
2173 * KEY
2174 * TKEYP (T1)
2175 */
2176 .align 4
2177 _aesni_dec1:
2178 movaps (KEYP), KEY # key
2179 mov KEYP, TKEYP
2180 pxor KEY, STATE # round 0
2181 add $0x30, TKEYP
2182 cmp $24, KLEN
2183 jb .Ldec128
2184 lea 0x20(TKEYP), TKEYP
2185 je .Ldec192
2186 add $0x20, TKEYP
2187 movaps -0x60(TKEYP), KEY
2188 AESDEC KEY STATE
2189 movaps -0x50(TKEYP), KEY
2190 AESDEC KEY STATE
2191 .align 4
2192 .Ldec192:
2193 movaps -0x40(TKEYP), KEY
2194 AESDEC KEY STATE
2195 movaps -0x30(TKEYP), KEY
2196 AESDEC KEY STATE
2197 .align 4
2198 .Ldec128:
2199 movaps -0x20(TKEYP), KEY
2200 AESDEC KEY STATE
2201 movaps -0x10(TKEYP), KEY
2202 AESDEC KEY STATE
2203 movaps (TKEYP), KEY
2204 AESDEC KEY STATE
2205 movaps 0x10(TKEYP), KEY
2206 AESDEC KEY STATE
2207 movaps 0x20(TKEYP), KEY
2208 AESDEC KEY STATE
2209 movaps 0x30(TKEYP), KEY
2210 AESDEC KEY STATE
2211 movaps 0x40(TKEYP), KEY
2212 AESDEC KEY STATE
2213 movaps 0x50(TKEYP), KEY
2214 AESDEC KEY STATE
2215 movaps 0x60(TKEYP), KEY
2216 AESDEC KEY STATE
2217 movaps 0x70(TKEYP), KEY
2218 AESDECLAST KEY STATE
2219 ret
2220 ENDPROC(_aesni_dec1)
2221
2222 /*
2223 * _aesni_dec4: internal ABI
2224 * input:
2225 * KEYP: key struct pointer
2226 * KLEN: key length
2227 * STATE1: initial state (input)
2228 * STATE2
2229 * STATE3
2230 * STATE4
2231 * output:
2232 * STATE1: final state (output)
2233 * STATE2
2234 * STATE3
2235 * STATE4
2236 * changed:
2237 * KEY
2238 * TKEYP (T1)
2239 */
2240 .align 4
2241 _aesni_dec4:
2242 movaps (KEYP), KEY # key
2243 mov KEYP, TKEYP
2244 pxor KEY, STATE1 # round 0
2245 pxor KEY, STATE2
2246 pxor KEY, STATE3
2247 pxor KEY, STATE4
2248 add $0x30, TKEYP
2249 cmp $24, KLEN
2250 jb .L4dec128
2251 lea 0x20(TKEYP), TKEYP
2252 je .L4dec192
2253 add $0x20, TKEYP
2254 movaps -0x60(TKEYP), KEY
2255 AESDEC KEY STATE1
2256 AESDEC KEY STATE2
2257 AESDEC KEY STATE3
2258 AESDEC KEY STATE4
2259 movaps -0x50(TKEYP), KEY
2260 AESDEC KEY STATE1
2261 AESDEC KEY STATE2
2262 AESDEC KEY STATE3
2263 AESDEC KEY STATE4
2264 .align 4
2265 .L4dec192:
2266 movaps -0x40(TKEYP), KEY
2267 AESDEC KEY STATE1
2268 AESDEC KEY STATE2
2269 AESDEC KEY STATE3
2270 AESDEC KEY STATE4
2271 movaps -0x30(TKEYP), KEY
2272 AESDEC KEY STATE1
2273 AESDEC KEY STATE2
2274 AESDEC KEY STATE3
2275 AESDEC KEY STATE4
2276 .align 4
2277 .L4dec128:
2278 movaps -0x20(TKEYP), KEY
2279 AESDEC KEY STATE1
2280 AESDEC KEY STATE2
2281 AESDEC KEY STATE3
2282 AESDEC KEY STATE4
2283 movaps -0x10(TKEYP), KEY
2284 AESDEC KEY STATE1
2285 AESDEC KEY STATE2
2286 AESDEC KEY STATE3
2287 AESDEC KEY STATE4
2288 movaps (TKEYP), KEY
2289 AESDEC KEY STATE1
2290 AESDEC KEY STATE2
2291 AESDEC KEY STATE3
2292 AESDEC KEY STATE4
2293 movaps 0x10(TKEYP), KEY
2294 AESDEC KEY STATE1
2295 AESDEC KEY STATE2
2296 AESDEC KEY STATE3
2297 AESDEC KEY STATE4
2298 movaps 0x20(TKEYP), KEY
2299 AESDEC KEY STATE1
2300 AESDEC KEY STATE2
2301 AESDEC KEY STATE3
2302 AESDEC KEY STATE4
2303 movaps 0x30(TKEYP), KEY
2304 AESDEC KEY STATE1
2305 AESDEC KEY STATE2
2306 AESDEC KEY STATE3
2307 AESDEC KEY STATE4
2308 movaps 0x40(TKEYP), KEY
2309 AESDEC KEY STATE1
2310 AESDEC KEY STATE2
2311 AESDEC KEY STATE3
2312 AESDEC KEY STATE4
2313 movaps 0x50(TKEYP), KEY
2314 AESDEC KEY STATE1
2315 AESDEC KEY STATE2
2316 AESDEC KEY STATE3
2317 AESDEC KEY STATE4
2318 movaps 0x60(TKEYP), KEY
2319 AESDEC KEY STATE1
2320 AESDEC KEY STATE2
2321 AESDEC KEY STATE3
2322 AESDEC KEY STATE4
2323 movaps 0x70(TKEYP), KEY
2324 AESDECLAST KEY STATE1 # last round
2325 AESDECLAST KEY STATE2
2326 AESDECLAST KEY STATE3
2327 AESDECLAST KEY STATE4
2328 ret
2329 ENDPROC(_aesni_dec4)
2330
2331 /*
2332 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2333 * size_t len);
2334 */
2335 ENTRY(aesni_ecb_enc)
2336 FRAME_BEGIN
2337 #ifndef __x86_64__
2338 pushl LEN
2339 pushl KEYP
2340 pushl KLEN
2341 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2342 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2343 movl (FRAME_OFFSET+24)(%esp), INP # src
2344 movl (FRAME_OFFSET+28)(%esp), LEN # len
2345 #endif
2346 test LEN, LEN # check length
2347 jz .Lecb_enc_ret
2348 mov 480(KEYP), KLEN
2349 cmp $16, LEN
2350 jb .Lecb_enc_ret
2351 cmp $64, LEN
2352 jb .Lecb_enc_loop1
2353 .align 4
2354 .Lecb_enc_loop4:
2355 movups (INP), STATE1
2356 movups 0x10(INP), STATE2
2357 movups 0x20(INP), STATE3
2358 movups 0x30(INP), STATE4
2359 call _aesni_enc4
2360 movups STATE1, (OUTP)
2361 movups STATE2, 0x10(OUTP)
2362 movups STATE3, 0x20(OUTP)
2363 movups STATE4, 0x30(OUTP)
2364 sub $64, LEN
2365 add $64, INP
2366 add $64, OUTP
2367 cmp $64, LEN
2368 jge .Lecb_enc_loop4
2369 cmp $16, LEN
2370 jb .Lecb_enc_ret
2371 .align 4
2372 .Lecb_enc_loop1:
2373 movups (INP), STATE1
2374 call _aesni_enc1
2375 movups STATE1, (OUTP)
2376 sub $16, LEN
2377 add $16, INP
2378 add $16, OUTP
2379 cmp $16, LEN
2380 jge .Lecb_enc_loop1
2381 .Lecb_enc_ret:
2382 #ifndef __x86_64__
2383 popl KLEN
2384 popl KEYP
2385 popl LEN
2386 #endif
2387 FRAME_END
2388 ret
2389 ENDPROC(aesni_ecb_enc)
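/*
 * Both ECB paths share the same shape: consume 64-byte groups through
 * the four-wide helper while at least four blocks remain, then finish
 * 16 bytes at a time. The control flow in C terms (a sketch with
 * hypothetical one-call-per-group helpers, not the actual glue code):
 *
 *	while (len >= 64) {		// four blocks at a time
 *		encrypt4(ctx, out, in);	// hypothetical helper
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {		// leftover single blocks
 *		encrypt1(ctx, out, in);	// hypothetical helper
 *		in += 16; out += 16; len -= 16;
 *	}
 */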
2390
2391 /*
2392 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2393 * size_t len);
2394 */
2395 ENTRY(aesni_ecb_dec)
2396 FRAME_BEGIN
2397 #ifndef __x86_64__
2398 pushl LEN
2399 pushl KEYP
2400 pushl KLEN
2401 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2402 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2403 movl (FRAME_OFFSET+24)(%esp), INP # src
2404 movl (FRAME_OFFSET+28)(%esp), LEN # len
2405 #endif
2406 test LEN, LEN
2407 jz .Lecb_dec_ret
2408 mov 480(KEYP), KLEN
2409 add $240, KEYP
2410 cmp $16, LEN
2411 jb .Lecb_dec_ret
2412 cmp $64, LEN
2413 jb .Lecb_dec_loop1
2414 .align 4
2415 .Lecb_dec_loop4:
2416 movups (INP), STATE1
2417 movups 0x10(INP), STATE2
2418 movups 0x20(INP), STATE3
2419 movups 0x30(INP), STATE4
2420 call _aesni_dec4
2421 movups STATE1, (OUTP)
2422 movups STATE2, 0x10(OUTP)
2423 movups STATE3, 0x20(OUTP)
2424 movups STATE4, 0x30(OUTP)
2425 sub $64, LEN
2426 add $64, INP
2427 add $64, OUTP
2428 cmp $64, LEN
2429 jge .Lecb_dec_loop4
2430 cmp $16, LEN
2431 jb .Lecb_dec_ret
2432 .align 4
2433 .Lecb_dec_loop1:
2434 movups (INP), STATE1
2435 call _aesni_dec1
2436 movups STATE1, (OUTP)
2437 sub $16, LEN
2438 add $16, INP
2439 add $16, OUTP
2440 cmp $16, LEN
2441 jge .Lecb_dec_loop1
2442 .Lecb_dec_ret:
2443 #ifndef __x86_64__
2444 popl KLEN
2445 popl KEYP
2446 popl LEN
2447 #endif
2448 FRAME_END
2449 ret
2450 ENDPROC(aesni_ecb_dec)
2451
2452 /*
2453 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2454 * size_t len, u8 *iv);
2455 */
2456 ENTRY(aesni_cbc_enc)
2457 FRAME_BEGIN
2458 #ifndef __x86_64__
2459 pushl IVP
2460 pushl LEN
2461 pushl KEYP
2462 pushl KLEN
2463 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2464 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2465 movl (FRAME_OFFSET+28)(%esp), INP # src
2466 movl (FRAME_OFFSET+32)(%esp), LEN # len
2467 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2468 #endif
2469 cmp $16, LEN
2470 jb .Lcbc_enc_ret
2471 mov 480(KEYP), KLEN
2472 movups (IVP), STATE # load iv as initial state
2473 .align 4
2474 .Lcbc_enc_loop:
2475 movups (INP), IN # load input
2476 pxor IN, STATE
2477 call _aesni_enc1
2478 movups STATE, (OUTP) # store output
2479 sub $16, LEN
2480 add $16, INP
2481 add $16, OUTP
2482 cmp $16, LEN
2483 jge .Lcbc_enc_loop
2484 movups STATE, (IVP)
2485 .Lcbc_enc_ret:
2486 #ifndef __x86_64__
2487 popl KLEN
2488 popl KEYP
2489 popl LEN
2490 popl IVP
2491 #endif
2492 FRAME_END
2493 ret
2494 ENDPROC(aesni_cbc_enc)
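/*
 * CBC encryption is inherently serial: each plaintext block is xored
 * with the previous ciphertext block before encryption, so there is no
 * four-wide variant here; decryption (below) has no such dependency
 * and does use _aesni_dec4. The chaining in C (a sketch with a
 * hypothetical one-block helper; len must be a multiple of 16):
 *
 *	u8 c[16];
 *	memcpy(c, iv, 16);			// running chain value
 *	for (off = 0; off < len; off += 16) {
 *		for (i = 0; i < 16; i++)	// P_i ^ C_{i-1}
 *			c[i] ^= in[off + i];
 *		aes_encrypt_block(ctx, c, c);	// hypothetical helper
 *		memcpy(out + off, c, 16);
 *	}
 *	memcpy(iv, c, 16);			// write back running IV
 */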
2495
2496 /*
2497 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2498 * size_t len, u8 *iv);
2499 */
2500 ENTRY(aesni_cbc_dec)
2501 FRAME_BEGIN
2502 #ifndef __x86_64__
2503 pushl IVP
2504 pushl LEN
2505 pushl KEYP
2506 pushl KLEN
2507 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2508 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2509 movl (FRAME_OFFSET+28)(%esp), INP # src
2510 movl (FRAME_OFFSET+32)(%esp), LEN # len
2511 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2512 #endif
2513 cmp $16, LEN
2514 jb .Lcbc_dec_just_ret
2515 mov 480(KEYP), KLEN
2516 add $240, KEYP
2517 movups (IVP), IV
2518 cmp $64, LEN
2519 jb .Lcbc_dec_loop1
2520 .align 4
2521 .Lcbc_dec_loop4:
2522 movups (INP), IN1
2523 movaps IN1, STATE1
2524 movups 0x10(INP), IN2
2525 movaps IN2, STATE2
2526 #ifdef __x86_64__
2527 movups 0x20(INP), IN3
2528 movaps IN3, STATE3
2529 movups 0x30(INP), IN4
2530 movaps IN4, STATE4
2531 #else
2532 movups 0x20(INP), IN1
2533 movaps IN1, STATE3
2534 movups 0x30(INP), IN2
2535 movaps IN2, STATE4
2536 #endif
2537 call _aesni_dec4
2538 pxor IV, STATE1
2539 #ifdef __x86_64__
2540 pxor IN1, STATE2
2541 pxor IN2, STATE3
2542 pxor IN3, STATE4
2543 movaps IN4, IV
2544 #else
2545 pxor IN1, STATE4
2546 movaps IN2, IV
2547 movups (INP), IN1
2548 pxor IN1, STATE2
2549 movups 0x10(INP), IN2
2550 pxor IN2, STATE3
2551 #endif
2552 movups STATE1, (OUTP)
2553 movups STATE2, 0x10(OUTP)
2554 movups STATE3, 0x20(OUTP)
2555 movups STATE4, 0x30(OUTP)
2556 sub $64, LEN
2557 add $64, INP
2558 add $64, OUTP
2559 cmp $64, LEN
2560 jge .Lcbc_dec_loop4
2561 cmp $16, LEN
2562 jb .Lcbc_dec_ret
2563 .align 4
2564 .Lcbc_dec_loop1:
2565 movups (INP), IN
2566 movaps IN, STATE
2567 call _aesni_dec1
2568 pxor IV, STATE
2569 movups STATE, (OUTP)
2570 movaps IN, IV
2571 sub $16, LEN
2572 add $16, INP
2573 add $16, OUTP
2574 cmp $16, LEN
2575 jge .Lcbc_dec_loop1
2576 .Lcbc_dec_ret:
2577 movups IV, (IVP)
2578 .Lcbc_dec_just_ret:
2579 #ifndef __x86_64__
2580 popl KLEN
2581 popl KEYP
2582 popl LEN
2583 popl IVP
2584 #endif
2585 FRAME_END
2586 ret
2587 ENDPROC(aesni_cbc_dec)
2588
2589 #ifdef __x86_64__
2590 .pushsection .rodata
2591 .align 16
2592 .Lbswap_mask:
2593 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2594 .popsection
2595
2596 /*
2597 * _aesni_inc_init: internal ABI
2598 * set up the registers used by _aesni_inc
2599 * input:
2600 * IV
2601 * output:
2602 * CTR: == IV, in little endian
2603 * TCTR_LOW: == lower qword of CTR
2604 * INC: == 1, in little endian
2605 * BSWAP_MASK == endian swapping mask
2606 */
2607 .align 4
2608 _aesni_inc_init:
2609 movaps .Lbswap_mask, BSWAP_MASK
2610 movaps IV, CTR
2611 PSHUFB_XMM BSWAP_MASK CTR
2612 mov $1, TCTR_LOW
2613 MOVQ_R64_XMM TCTR_LOW INC
2614 MOVQ_R64_XMM CTR TCTR_LOW
2615 ret
2616 ENDPROC(_aesni_inc_init)
2617
2618 /*
2619 * _aesni_inc: internal ABI
2620 * Increment IV by 1; IV is big endian
2621 * input:
2622 * IV
2623 * CTR: == IV, in little endian
2624 * TCTR_LOW: == lower qword of CTR
2625 * INC: == 1, in little endian
2626 * BSWAP_MASK == endian swapping mask
2627 * output:
2628 * IV: incremented by 1
2629 * changed:
2630 * CTR: == output IV, in little endian
2631 * TCTR_LOW: == lower qword of CTR
2632 */
2633 .align 4
2634 _aesni_inc:
2635 paddq INC, CTR
2636 add $1, TCTR_LOW
2637 jnc .Linc_low
2638 pslldq $8, INC
2639 paddq INC, CTR
2640 psrldq $8, INC
2641 .Linc_low:
2642 movaps CTR, IV
2643 PSHUFB_XMM BSWAP_MASK IV
2644 ret
2645 ENDPROC(_aesni_inc)
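/*
 * _aesni_inc keeps the counter byte-swapped (little endian) in CTR so
 * paddq can do the arithmetic, and mirrors the low qword in TCTR_LOW
 * purely to detect carry: the add $1/jnc pair spots overflow out of
 * the low qword, in which case INC is slid up and added into the high
 * qword. Equivalent logic in C (a sketch):
 *
 *	u64 lo, hi;		// counter limbs, little endian
 *
 *	lo++;
 *	if (lo == 0)		// carried out of the low qword
 *		hi++;
 */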
2646
2647 /*
2648 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2649 * size_t len, u8 *iv);
2650 */
2651 ENTRY(aesni_ctr_enc)
2652 FRAME_BEGIN
2653 cmp $16, LEN
2654 jb .Lctr_enc_just_ret
2655 mov 480(KEYP), KLEN
2656 movups (IVP), IV
2657 call _aesni_inc_init
2658 cmp $64, LEN
2659 jb .Lctr_enc_loop1
2660 .align 4
2661 .Lctr_enc_loop4:
2662 movaps IV, STATE1
2663 call _aesni_inc
2664 movups (INP), IN1
2665 movaps IV, STATE2
2666 call _aesni_inc
2667 movups 0x10(INP), IN2
2668 movaps IV, STATE3
2669 call _aesni_inc
2670 movups 0x20(INP), IN3
2671 movaps IV, STATE4
2672 call _aesni_inc
2673 movups 0x30(INP), IN4
2674 call _aesni_enc4
2675 pxor IN1, STATE1
2676 movups STATE1, (OUTP)
2677 pxor IN2, STATE2
2678 movups STATE2, 0x10(OUTP)
2679 pxor IN3, STATE3
2680 movups STATE3, 0x20(OUTP)
2681 pxor IN4, STATE4
2682 movups STATE4, 0x30(OUTP)
2683 sub $64, LEN
2684 add $64, INP
2685 add $64, OUTP
2686 cmp $64, LEN
2687 jge .Lctr_enc_loop4
2688 cmp $16, LEN
2689 jb .Lctr_enc_ret
2690 .align 4
2691 .Lctr_enc_loop1:
2692 movaps IV, STATE
2693 call _aesni_inc
2694 movups (INP), IN
2695 call _aesni_enc1
2696 pxor IN, STATE
2697 movups STATE, (OUTP)
2698 sub $16, LEN
2699 add $16, INP
2700 add $16, OUTP
2701 cmp $16, LEN
2702 jge .Lctr_enc_loop1
2703 .Lctr_enc_ret:
2704 movups IV, (IVP)
2705 .Lctr_enc_just_ret:
2706 FRAME_END
2707 ret
2708 ENDPROC(aesni_ctr_enc)
2709
2710 /*
2711 * _aesni_gf128mul_x_ble: internal ABI
2712 * Multiply in GF(2^128) for XTS IVs
2713 * input:
2714 * IV: current IV
2715 * GF128MUL_MASK == mask with 0x87 and 0x01
2716 * output:
2717 * IV: next IV
2718 * changed:
2719 * CTR: == temporary value
2720 */
2721 #define _aesni_gf128mul_x_ble() \
2722 pshufd $0x13, IV, CTR; \
2723 paddq IV, IV; \
2724 psrad $31, CTR; \
2725 pand GF128MUL_MASK, CTR; \
2726 pxor CTR, IV;
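
/*
 * How the branch-free doubling above works: paddq IV, IV doubles each
 * qword independently, losing both the carry from bit 63 into the high
 * qword and the reduction when bit 127 falls off the top. The
 * pshufd $0x13/psrad $31/pand sequence rebuilds exactly those two
 * corrections from the original sign bits, and the final pxor applies
 * them. The same update in plain C (a sketch; compare gf128mul_x_ble()
 * in include/crypto/gf128mul.h):
 *
 *	// t[0] = low qword, t[1] = high qword, little endian
 *	u64 c127 = t[1] >> 63;		// bit shifted off the top
 *	u64 c63  = t[0] >> 63;		// carry between the qwords
 *
 *	t[0] = (t[0] << 1) ^ (c127 ? 0x87 : 0);
 *	t[1] = (t[1] << 1) | c63;
 */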
2727
2728 /*
2729 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2730 * bool enc, u8 *iv);
2731 */
2732 ENTRY(aesni_xts_crypt8)
2733 FRAME_BEGIN
2734 cmpb $0, %cl
2735 movl $0, %ecx
2736 movl $240, %r10d
2737 leaq _aesni_enc4, %r11
2738 leaq _aesni_dec4, %rax
2739 cmovel %r10d, %ecx
2740 cmoveq %rax, %r11
2741
2742 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2743 movups (IVP), IV
2744
2745 mov 480(KEYP), KLEN
2746 addq %rcx, KEYP
2747
2748 movdqa IV, STATE1
2749 movdqu 0x00(INP), INC
2750 pxor INC, STATE1
2751 movdqu IV, 0x00(OUTP)
2752
2753 _aesni_gf128mul_x_ble()
2754 movdqa IV, STATE2
2755 movdqu 0x10(INP), INC
2756 pxor INC, STATE2
2757 movdqu IV, 0x10(OUTP)
2758
2759 _aesni_gf128mul_x_ble()
2760 movdqa IV, STATE3
2761 movdqu 0x20(INP), INC
2762 pxor INC, STATE3
2763 movdqu IV, 0x20(OUTP)
2764
2765 _aesni_gf128mul_x_ble()
2766 movdqa IV, STATE4
2767 movdqu 0x30(INP), INC
2768 pxor INC, STATE4
2769 movdqu IV, 0x30(OUTP)
2770
2771 CALL_NOSPEC %r11
2772
2773 movdqu 0x00(OUTP), INC
2774 pxor INC, STATE1
2775 movdqu STATE1, 0x00(OUTP)
2776
2777 _aesni_gf128mul_x_ble()
2778 movdqa IV, STATE1
2779 movdqu 0x40(INP), INC
2780 pxor INC, STATE1
2781 movdqu IV, 0x40(OUTP)
2782
2783 movdqu 0x10(OUTP), INC
2784 pxor INC, STATE2
2785 movdqu STATE2, 0x10(OUTP)
2786
2787 _aesni_gf128mul_x_ble()
2788 movdqa IV, STATE2
2789 movdqu 0x50(INP), INC
2790 pxor INC, STATE2
2791 movdqu IV, 0x50(OUTP)
2792
2793 movdqu 0x20(OUTP), INC
2794 pxor INC, STATE3
2795 movdqu STATE3, 0x20(OUTP)
2796
2797 _aesni_gf128mul_x_ble()
2798 movdqa IV, STATE3
2799 movdqu 0x60(INP), INC
2800 pxor INC, STATE3
2801 movdqu IV, 0x60(OUTP)
2802
2803 movdqu 0x30(OUTP), INC
2804 pxor INC, STATE4
2805 movdqu STATE4, 0x30(OUTP)
2806
2807 _aesni_gf128mul_x_ble()
2808 movdqa IV, STATE4
2809 movdqu 0x70(INP), INC
2810 pxor INC, STATE4
2811 movdqu IV, 0x70(OUTP)
2812
2813 _aesni_gf128mul_x_ble()
2814 movups IV, (IVP)
2815
2816 CALL_NOSPEC %r11
2817
2818 movdqu 0x40(OUTP), INC
2819 pxor INC, STATE1
2820 movdqu STATE1, 0x40(OUTP)
2821
2822 movdqu 0x50(OUTP), INC
2823 pxor INC, STATE2
2824 movdqu STATE2, 0x50(OUTP)
2825
2826 movdqu 0x60(OUTP), INC
2827 pxor INC, STATE3
2828 movdqu STATE3, 0x60(OUTP)
2829
2830 movdqu 0x70(OUTP), INC
2831 pxor INC, STATE4
2832 movdqu STATE4, 0x70(OUTP)
2833
2834 FRAME_END
2835 ret
2836 ENDPROC(aesni_xts_crypt8)
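/*
 * aesni_xts_crypt8 covers eight consecutive XTS blocks per call: the
 * cmov pair selects _aesni_enc4 or _aesni_dec4 (and the matching key
 * schedule offset) once up front, the tweaks are generated by repeated
 * _aesni_gf128mul_x_ble doubling, and the output buffer doubles as
 * scratch space for the pending tweaks while the four-block helpers
 * run, avoiding extra stack traffic.
 */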
2837
2838 #endif