/*
 * Implement the AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of the AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>
#include <asm/frame.h>
/*
 * The following macros are used to move an (un)aligned 16-byte value to/from
 * an XMM register. This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned). Since Nehalem (the original Core i7) it makes no
 * performance difference which instruction is used, but movaps is one byte
 * shorter, so that is the one we'll use for now (same for unaligned moves).
 */
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.Lgf128mul_x_ble_mask:
        .octa 0x00000000000000010000000000000087
.section .rodata.cst16.POLY, "aM", @progbits, 16
POLY:   .octa 0xC2000000000000000000000000000001
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
TWOONE: .octa 0x00000001000000000000000000000001
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.MASK1, "aM", @progbits, 16
MASK1:  .octa 0x0000000000000000ffffffffffffffff
.section .rodata.cst16.MASK2, "aM", @progbits, 16
MASK2:  .octa 0xffffffffffffffff0000000000000000
.section .rodata.cst16.ONE, "aM", @progbits, 16
ONE:    .octa 0x00000000000000000000000000000001
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section .rodata.cst16.dec, "aM", @progbits, 16
.section .rodata.cst16.enc, "aM", @progbits, 16

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section .rodata, "a", @progbits
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000
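
/*
 * Why ALL_F must follow SHIFT_MASK, and the zero block must follow ALL_F:
 * the <16-byte tail handling in the GCM routines below loads 16 bytes
 * starting *inside* these tables to build both a PSHUFB byte-shift pattern
 * and a keep-mask for the last %r13 bytes. A C sketch of the same trick
 * (illustrative only, not part of the build; uses <stdint.h>/<string.h>):
 *
 *	// SHIFT_MASK | ALL_F | zero block, exactly as laid out above;
 *	// the trailing 16 bytes default-initialise to zero.
 *	static const uint8_t tbl[48] = {
 *		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 *		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *	};
 *
 *	// software PSHUFB: dst[i] = (sel[i] & 0x80) ? 0 : src[sel[i] & 15]
 *	static void pshufb(uint8_t *dst, const uint8_t *src,
 *			   const uint8_t *sel)
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = (sel[i] & 0x80) ? 0 : src[sel[i] & 15];
 *	}
 *
 *	// the shuffle mask loaded at SHIFT_MASK+16-r13 right-shifts a
 *	// block by 16-r13 bytes:   pshufb(out, in, &tbl[16 - r13]);
 *	// the mask loaded at ALL_F+16-r13 keeps only the low r13 bytes:
 *	//                          memcpy(keep, &tbl[32 - r13], 16);
 */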
#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

#define arg7    STACK_OFFSET+8(%r14)
#define arg8    STACK_OFFSET+16(%r14)
#define arg9    STACK_OFFSET+24(%r14)
#define arg10   STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)

#define BSWAP_MASK %xmm10
#define GF128MUL_MASK %xmm10
/*
 * GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1)
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
 * GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        pshufd $78, \GH, \TMP2
        pshufd $78, \HK, \TMP3
        pxor \GH, \TMP2                 # TMP2 = a1+a0
        pxor \HK, \TMP3                 # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1      # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH        # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2    # TMP2 = (a0+a1)*(b1+b0)
        pxor \TMP1, \TMP2               # fold a1*b1 into the Karatsuba middle term
        pslldq $8, \TMP3                # left shift TMP3 2 DWs
        psrldq $8, \TMP2                # right shift TMP2 2 DWs
        pxor \TMP2, \TMP1               # TMP2:GH holds the result of GH*HK

        # first phase of the reduction
        movdqa \GH, \TMP4               # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform 3 shifts independently
        pslld $31, \TMP2                # packed left shift <<31
        pslld $30, \TMP3                # packed left shift <<30
        pslld $25, \TMP4                # packed left shift <<25
        pxor \TMP3, \TMP2               # xor the shifted versions
        psrldq $4, \TMP5                # right shift TMP5 1 DW
        pslldq $12, \TMP2               # left shift TMP2 3 DWs

        # second phase of the reduction
        movdqa \GH,\TMP2                # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform 3 shifts independently
        psrld $1,\TMP2                  # packed right shift >>1
        psrld $2,\TMP3                  # packed right shift >>2
        psrld $7,\TMP4                  # packed right shift >>7
        pxor \TMP3,\TMP2                # xor the shifted versions
        pxor \TMP1, \GH                 # result is in GH
.endm
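
/*
 * For reference, the three-multiply Karatsuba structure that GHASH_MUL
 * implements with PCLMULQDQ, as a C sketch (clmul64() is a software
 * stand-in for the instruction; illustrative only, not part of the build):
 *
 *	// carry-less 64x64 -> 128 multiply
 *	static void clmul64(uint64_t a, uint64_t b,
 *			    uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t h = 0, l = 0;
 *		for (int i = 0; i < 64; i++)
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				if (i)
 *					h ^= a >> (64 - i);
 *			}
 *		*hi = h;
 *		*lo = l;
 *	}
 *
 *	// 128x128 carry-less multiply using three clmuls (Karatsuba)
 *	static void clmul128(const uint64_t a[2], const uint64_t b[2],
 *			     uint64_t out[4])
 *	{
 *		uint64_t hh[2], ll[2], mm[2];
 *
 *		clmul64(a[1], b[1], &hh[1], &hh[0]);	// a1*b1
 *		clmul64(a[0], b[0], &ll[1], &ll[0]);	// a0*b0
 *		clmul64(a[0] ^ a[1], b[0] ^ b[1],	// (a0+a1)*(b0+b1)
 *			&mm[1], &mm[0]);
 *		mm[0] ^= hh[0] ^ ll[0];			// middle term
 *		mm[1] ^= hh[1] ^ ll[1];
 *		out[0] = ll[0];				// the pslldq/psrldq
 *		out[1] = ll[1] ^ mm[0];			// pair above splits
 *		out[2] = hh[0] ^ mm[1];			// the middle term the
 *		out[3] = hh[1];				// same way
 *	}
 */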
/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ SHUF_MASK(%rip), %xmm14
        mov arg7, %r10                  # %r10 = AAD
        mov arg8, %r12                  # %r12 = aadLen

_get_AAD_loop\num_initial_blocks\operation:
        jne _get_AAD_loop\num_initial_blocks\operation
        je _get_AAD_loop2_done\num_initial_blocks\operation

_get_AAD_loop2\num_initial_blocks\operation:
        jne _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
        PSHUFB_XMM %xmm14, %xmm\i      # byte-reflect the AAD data
        xor %r11, %r11                  # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks
        mov %arg5, %rax                 # %rax = *Y0
        movdqu (%rax), \XMM0            # XMM0 = Y0
        PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
        MOVADQ ONE(%rip),\TMP1
        paddd \TMP1, \XMM0              # INCR Y0
        movdqa \XMM0, %xmm\index
        PSHUFB_XMM %xmm14, %xmm\index  # perform a 16 byte swap
        pxor \TMP2, %xmm\index
        shr $2,%eax                     # 128->4, 192->6, 256->8
        add $5,%eax                     # 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
        AESENC \TMP1, %xmm\index
        jnz aes_loop_initial_dec\num_initial_blocks

        AESENCLAST \TMP1, %xmm\index   # Last Round
        movdqu (%arg3 , %r11, 1), \TMP1
        pxor \TMP1, %xmm\index
        movdqu %xmm\index, (%arg2 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        movdqa \TMP1, %xmm\index
        PSHUFB_XMM %xmm14, %xmm\index
        # prepare plaintext/ciphertext for GHASH computation

        GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        jl _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i.
 */
        MOVADQ ONE(%rip), \TMP1
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
        MOVADQ 0(%arg1),\TMP1
        pshufd $78, \TMP3, \TMP1
        movdqa \TMP1, HashKey_k(%rsp)
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^2<<1 (mod poly)
        movdqa \TMP5, HashKey_2(%rsp)
                                        # HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd $78, \TMP5, \TMP1
        movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps 0x10*\index(%arg1), \TMP1
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^3<<1 (mod poly)
        movdqa \TMP5, HashKey_3(%rsp)
        pshufd $78, \TMP5, \TMP1
        movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps 0x10*\index(%arg1), \TMP1
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^4<<1 (mod poly)
        movdqa \TMP5, HashKey_4(%rsp)
        pshufd $78, \TMP5, \TMP1
        movdqa \TMP1, HashKey_4_k(%rsp)
        shr $2,%eax                     # 128->4, 192->6, 256->8
        sub $4,%eax                     # 128->0, 192->2, 256->4
        jz aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
        AESENC \TMP2, %xmm\index
        jnz aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu 16*0(%arg3 , %r11 , 1), \TMP1
        movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqu 16*1(%arg3 , %r11 , 1), \TMP1
        movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqu 16*2(%arg3 , %r11 , 1), \TMP1
        movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqu 16*3(%arg3 , %r11 , 1), \TMP1
        movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
        # combine GHASHed value with the corresponding ciphertext
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
.endm
/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ SHUF_MASK(%rip), %xmm14
        mov arg7, %r10                  # %r10 = AAD
        mov arg8, %r12                  # %r12 = aadLen

_get_AAD_loop\num_initial_blocks\operation:
        jne _get_AAD_loop\num_initial_blocks\operation
        je _get_AAD_loop2_done\num_initial_blocks\operation

_get_AAD_loop2\num_initial_blocks\operation:
        jne _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
        PSHUFB_XMM %xmm14, %xmm\i      # byte-reflect the AAD data
        xor %r11, %r11                  # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks
        mov %arg5, %rax                 # %rax = *Y0
        movdqu (%rax), \XMM0            # XMM0 = Y0
        PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
        MOVADQ ONE(%rip),\TMP1
        MOVADQ 0(%arg1),\TMP2
        paddd \TMP1, \XMM0              # INCR Y0
        MOVADQ \XMM0, %xmm\index
        PSHUFB_XMM %xmm14, %xmm\index  # perform a 16 byte swap
        pxor \TMP2, %xmm\index
        shr $2,%eax                     # 128->4, 192->6, 256->8
        add $5,%eax                     # 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
        AESENC \TMP1, %xmm\index
        jnz aes_loop_initial_enc\num_initial_blocks

        AESENCLAST \TMP1, %xmm\index   # Last Round
        movdqu (%arg3 , %r11, 1), \TMP1
        pxor \TMP1, %xmm\index
        movdqu %xmm\index, (%arg2 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        PSHUFB_XMM %xmm14, %xmm\index
        # prepare plaintext/ciphertext for GHASH computation

        GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        jl _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i.
 */
        MOVADQ ONE(%rip),\TMP1
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
        paddd \TMP1, \XMM0              # INCR Y0
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
        MOVADQ 0(%arg1),\TMP1
        pshufd $78, \TMP3, \TMP1
        movdqa \TMP1, HashKey_k(%rsp)
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^2<<1 (mod poly)
        movdqa \TMP5, HashKey_2(%rsp)
                                        # HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd $78, \TMP5, \TMP1
        movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps 0x10*\index(%arg1), \TMP1
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^3<<1 (mod poly)
        movdqa \TMP5, HashKey_3(%rsp)
        pshufd $78, \TMP5, \TMP1
        movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps 0x10*\index(%arg1), \TMP1
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^4<<1 (mod poly)
        movdqa \TMP5, HashKey_4(%rsp)
        pshufd $78, \TMP5, \TMP1
        movdqa \TMP1, HashKey_4_k(%rsp)
        shr $2,%eax                     # 128->4, 192->6, 256->8
        sub $4,%eax                     # 128->0, 192->2, 256->4
        jz aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
        AESENC \TMP2, %xmm\index
        jnz aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu 16*0(%arg3 , %r11 , 1), \TMP1
        movdqu 16*1(%arg3 , %r11 , 1), \TMP1
        movdqu 16*2(%arg3 , %r11 , 1), \TMP1
        movdqu 16*3(%arg3 , %r11 , 1), \TMP1
        movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
        # combine GHASHed value with the corresponding ciphertext
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
.endm
/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba
        pshufd $78, \XMM5, \TMP6
        paddd ONE(%rip), \XMM0          # INCR CNT
        movdqa HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
        paddd ONE(%rip), \XMM0          # INCR CNT
        paddd ONE(%rip), \XMM0          # INCR CNT
        paddd ONE(%rip), \XMM0          # INCR CNT
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
        movdqa HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC \TMP1, \XMM1             # Round 1
        movaps 0x20(%arg1), \TMP1
        AESENC \TMP1, \XMM1             # Round 2
        pshufd $78, \XMM6, \TMP2
        movdqa HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 3
        PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
        movaps 0x40(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 4
        movdqa HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 5
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pshufd $78, \XMM7, \TMP2
        movdqa HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps 0x60(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 6
        PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
        movaps 0x70(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 7
        movdqa HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 8
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands
        pshufd $78, \XMM8, \TMP2
        movdqa HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 9
        PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
        shr $2,%eax                     # 128->4, 192->6, 256->8
        sub $4,%eax                     # 128->0, 192->2, 256->4
        jz aes_loop_par_enc_done
        AESENC \TMP3, %xmm\index

aes_loop_par_enc_done:
        AESENCLAST \TMP3, \XMM1         # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqu (%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM1               # Ciphertext/Plaintext XOR EK
        movdqu 16(%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM2               # Ciphertext/Plaintext XOR EK
        movdqu 32(%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM3               # Ciphertext/Plaintext XOR EK
        movdqu 48(%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM4               # Ciphertext/Plaintext XOR EK
        movdqu \XMM1, (%arg2,%r11,1)    # Write to the ciphertext buffer
        movdqu \XMM2, 16(%arg2,%r11,1)  # Write to the ciphertext buffer
        movdqu \XMM3, 32(%arg2,%r11,1)  # Write to the ciphertext buffer
        movdqu \XMM4, 48(%arg2,%r11,1)  # Write to the ciphertext buffer
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
        pslldq $8, \TMP3                # left shift TMP3 2 DWs
        psrldq $8, \TMP2                # right shift TMP2 2 DWs
        pxor \TMP2, \TMP1               # accumulate the results in TMP1:XMM5

        # first phase of reduction
        # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld $31, \TMP2                # packed left shift << 31
        pslld $30, \TMP3                # packed left shift << 30
        pslld $25, \TMP4                # packed left shift << 25
        pxor \TMP3, \TMP2               # xor the shifted versions
        psrldq $4, \TMP5                # right shift T5 1 DW
        pslldq $12, \TMP2               # left shift T2 3 DWs

        # second phase of reduction
        movdqa \XMM5,\TMP2              # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        psrld $1, \TMP2                 # packed right shift >>1
        psrld $2, \TMP3                 # packed right shift >>2
        psrld $7, \TMP4                 # packed right shift >>7
        pxor \TMP3,\TMP2                # xor the shifted versions
        pxor \TMP1, \XMM5               # result is in XMM5
.endm
/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba
        pshufd $78, \XMM5, \TMP6
        paddd ONE(%rip), \XMM0          # INCR CNT
        movdqa HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
        paddd ONE(%rip), \XMM0          # INCR CNT
        paddd ONE(%rip), \XMM0          # INCR CNT
        paddd ONE(%rip), \XMM0          # INCR CNT
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
        movdqa HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC \TMP1, \XMM1             # Round 1
        movaps 0x20(%arg1), \TMP1
        AESENC \TMP1, \XMM1             # Round 2
        pshufd $78, \XMM6, \TMP2
        movdqa HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 3
        PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
        movaps 0x40(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 4
        movdqa HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 5
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pshufd $78, \XMM7, \TMP2
        movdqa HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps 0x60(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 6
        PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
        movaps 0x70(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 7
        movdqa HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 8
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands
        pshufd $78, \XMM8, \TMP2
        movdqa HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC \TMP3, \XMM1             # Round 9
        PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
        lea 0xa0(%arg1),%r10
        shr $2,%eax                     # 128->4, 192->6, 256->8
        sub $4,%eax                     # 128->0, 192->2, 256->4
        jz aes_loop_par_dec_done
        AESENC \TMP3, %xmm\index
        jnz aes_loop_par_dec

aes_loop_par_dec_done:
        MOVADQ (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1         # last round
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqu (%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM1               # Ciphertext/Plaintext XOR EK
        movdqu \XMM1, (%arg2,%r11,1)    # Write to plaintext buffer
        movdqu 16(%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM2               # Ciphertext/Plaintext XOR EK
        movdqu \XMM2, 16(%arg2,%r11,1)  # Write to plaintext buffer
        movdqu 32(%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM3               # Ciphertext/Plaintext XOR EK
        movdqu \XMM3, 32(%arg2,%r11,1)  # Write to plaintext buffer
        movdqu 48(%arg3,%r11,1), \TMP3
        pxor \TMP3, \XMM4               # Ciphertext/Plaintext XOR EK
        movdqu \XMM4, 48(%arg2,%r11,1)  # Write to plaintext buffer
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
        pslldq $8, \TMP3                # left shift TMP3 2 DWs
        psrldq $8, \TMP2                # right shift TMP2 2 DWs
        pxor \TMP2, \TMP1               # accumulate the results in TMP1:XMM5

        # first phase of reduction
        # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld $31, \TMP2                # packed left shift << 31
        pslld $30, \TMP3                # packed left shift << 30
        pslld $25, \TMP4                # packed left shift << 25
        pxor \TMP3, \TMP2               # xor the shifted versions
        psrldq $4, \TMP5                # right shift T5 1 DW
        pslldq $12, \TMP2               # left shift T2 3 DWs

        # second phase of reduction
        movdqa \XMM5,\TMP2              # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        psrld $1, \TMP2                 # packed right shift >>1
        psrld $2, \TMP3                 # packed right shift >>2
        psrld $7, \TMP4                 # packed right shift >>7
        pxor \TMP3,\TMP2                # xor the shifted versions
        pxor \TMP1, \XMM5               # result is in XMM5
.endm
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)
        pshufd $78, \XMM1, \TMP2
        movdqa HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6    # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1    # XMM1 = a0*b0
        movdqa HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqa \XMM1, \XMMDst
        movdqa \TMP2, \XMM1             # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
        pshufd $78, \XMM2, \TMP2
        movdqa HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2    # XMM2 = a0*b0
        movdqa HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
        pshufd $78, \XMM3, \TMP2
        movdqa HashKey_2(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3    # XMM3 = a0*b0
        movdqa HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor \TMP2, \XMM1               # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
        pshufd $78, \XMM4, \TMP2
        movdqa HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4    # XMM4 = a0*b0
        movdqa HashKey_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)

        # middle section of the temp results combined as in Karatsuba algorithm
        pslldq $8, \TMP4                # left shift TMP4 2 DWs
        psrldq $8, \TMP2                # right shift TMP2 2 DWs
        # TMP6:XMMDst holds the result of the accumulated carry-less multiplications

        # first phase of the reduction
        movdqa \XMMDst, \TMP2
        movdqa \XMMDst, \TMP3
        movdqa \XMMDst, \TMP4
        # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld $31, \TMP2                # packed left shifting << 31
        pslld $30, \TMP3                # packed left shifting << 30
        pslld $25, \TMP4                # packed left shifting << 25
        pxor \TMP3, \TMP2               # xor the shifted versions
        psrldq $4, \TMP7                # right shift TMP7 1 DW
        pslldq $12, \TMP2               # left shift TMP2 3 DWs

        # second phase of the reduction
        movdqa \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa \XMMDst, \TMP3
        movdqa \XMMDst, \TMP4
        psrld $1, \TMP2                 # packed right shift >> 1
        psrld $2, \TMP3                 # packed right shift >> 2
        psrld $7, \TMP4                 # packed right shift >> 7
        pxor \TMP3, \TMP2               # xor the shifted versions
        pxor \TMP6, \XMMDst             # reduced result is in XMMDst
.endm
/* Encryption of a single block */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
        shr $2,%eax                     # 128->4, 192->6, 256->8
        add $5,%eax                     # 128->9, 192->11, 256->13
        lea 16(%arg1), %r10             # get first expanded key address
        AESENCLAST \TMP1,\XMM0
.endm
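
/*
 * The shr $2/add $5 pair above derives the AES round count from the key
 * length in bytes stored in the context (16/24/32 -> 9/11/13 middle
 * rounds; the initial whitening XOR and the final AESENCLAST are handled
 * separately). A C sketch of the same mapping (illustrative only):
 *
 *	static inline int aes_middle_rounds(int key_len_bytes)
 *	{
 *		return (key_len_bytes >> 2) + 5;  // 16->9, 24->11, 32->13
 *	}
 */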
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len,  // Length of data in bytes for decryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output. The driver will compare this to the
*                                        // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16
*                                        // (most likely), 12 or 8.
*
* Assumptions:
* keys are pre-expanded and aligned to 16 bytes. we are using the first
* set of 11 keys in the data structure void *aes_ctx
*
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                   32-bit Sequence Number (A0)                 |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |            64-bit Extended Sequence Number {A1,A0}            |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too, but for other sizes the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
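
/*
 * For callers: a C sketch of building the pre-counter block j0 described
 * above, i.e. salt || IV || 0x00000001 (illustrative only; the caller
 * prepares this buffer before invoking aesni_gcm_dec/aesni_gcm_enc):
 *
 *	static void rfc4106_build_j0(uint8_t j0[16], const uint8_t salt[4],
 *				     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);	// 4-byte salt from the SA
 *		memcpy(j0 + 4, iv, 8);	// 8-byte IV from the ESP payload
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;		// big-endian 32-bit counter == 1
 *	}
 */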
ENTRY(aesni_gcm_dec)
/*
 * states of %xmm registers %xmm6:%xmm15 not saved
 * all %xmm registers are clobbered
 */
        sub $VARIABLE_OFFSET, %rsp
        and $~63, %rsp                  # align rsp to 64 bytes
        movdqu (%r12), %xmm13           # %xmm13 = HashKey
        movdqa SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13

        # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
        movdqa %xmm13, %xmm2
        pshufd $0x24, %xmm1, %xmm2
        pcmpeqd TWOONE(%rip), %xmm2
        pand POLY(%rip), %xmm2
        pxor %xmm2, %xmm13              # %xmm13 holds the HashKey<<1 (mod poly)

        # Decrypt first few blocks
        movdqa %xmm13, HashKey(%rsp)    # store HashKey<<1 (mod poly)
        mov %arg4, %r13                 # save the number of bytes of plaintext/ciphertext
        and $-16, %r13                  # %r13 = %r13 - (%r13 mod 16)
        jz _initial_num_blocks_is_0_decrypt
        jb _initial_num_blocks_is_1_decrypt
        je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
        INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        jmp _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
        INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        jmp _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
        INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        jmp _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
        INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
        je _zero_cipher_left_decrypt
        je _four_cipher_left_decrypt
        GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
_four_cipher_left_decrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
        and $15, %r13                   # %r13 = arg4 (mod 16)
        je _multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately
        paddd ONE(%rip), %xmm0          # increment CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
        movdqu (%arg3,%r11,1), %xmm1    # receive the last <16 byte block
        lea SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu (%r12), %xmm2            # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1         # right shift 16-%r13 bytes
        pxor %xmm1, %xmm0               # Ciphertext XOR E(K, Yn)
        movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand %xmm1, %xmm0               # mask out top 16-%r13 bytes of %xmm0
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm2
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        MOVQ_R64_XMM %xmm0, %rax
        jle _less_than_8_bytes_left_decrypt
        mov %rax, (%arg2 , %r11, 1)
        MOVQ_R64_XMM %xmm0, %rax
_less_than_8_bytes_left_decrypt:
        mov %al, (%arg2, %r11, 1)
        jne _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
        mov arg8, %r12                  # %r12 = aadLen (number of bytes)
        shl $3, %r12                    # convert into number of bits
        movd %r12d, %xmm15              # len(A) in %xmm15
        shl $3, %arg4                   # len(C) in bits (*8)
        MOVQ_R64_XMM %arg4, %xmm1
        pslldq $8, %xmm15               # %xmm15 = len(A)||0x0000000000000000
        pxor %xmm1, %xmm15              # %xmm15 = len(A)||len(C)
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8
        mov %arg5, %rax                 # %rax = *Y0
        movdqu (%rax), %xmm0            # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
        mov arg9, %r10                  # %r10 = authTag
        mov arg10, %r11                 # %r11 = auth_tag_len
        MOVQ_R64_XMM %xmm0, %rax
        jmp _return_T_done_decrypt
        MOVQ_R64_XMM %xmm0, %rax
        jmp _return_T_done_decrypt
        movdqu %xmm0, (%r10)
_return_T_done_decrypt:
ENDPROC(aesni_gcm_dec)
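
/*
 * The _multiple_of_16_bytes tail above folds the bit lengths of the AAD
 * and of the ciphertext into GHASH as the block len(A) || len(C). A C
 * sketch of that length block, low quadword first as it sits in the xmm
 * register (illustrative only):
 *
 *	static void gcm_length_block(uint64_t block[2], uint64_t aad_bytes,
 *				     uint64_t text_bytes)
 *	{
 *		block[0] = text_bytes * 8;	// low  qword: len(C) in bits
 *		block[1] = aad_bytes * 8;	// high qword: len(A) in bits
 *	}
 */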
/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
* keys are pre-expanded and aligned to 16 bytes. we are using the
* first set of 11 keys in the data structure void *aes_ctx
*
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                   32-bit Sequence Number (A0)                 |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |            64-bit Extended Sequence Number {A1,A0}            |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too, but for other sizes the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
/*
 * states of %xmm registers %xmm6:%xmm15 not saved
 * all %xmm registers are clobbered
 */
        sub $VARIABLE_OFFSET, %rsp
        movdqu (%r12), %xmm13
        movdqa SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13

        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
        movdqa %xmm13, %xmm2
        pshufd $0x24, %xmm1, %xmm2
        pcmpeqd TWOONE(%rip), %xmm2
        pand POLY(%rip), %xmm2
        movdqa %xmm13, HashKey(%rsp)    # %xmm13 holds HashKey<<1 (mod poly)
        mov %arg4, %r13                 # save the number of bytes of plaintext/ciphertext

        # Encrypt first few blocks
        jz _initial_num_blocks_is_0_encrypt
        jb _initial_num_blocks_is_1_encrypt
        je _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
        INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
        jmp _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
        INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
        jmp _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
        INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
        jmp _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
        INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks
        je _zero_cipher_left_encrypt
        je _four_cipher_left_encrypt
_encrypt_by_4_encrypt:
        GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
        jne _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
        and $15, %r13                   # %r13 = arg4 (mod 16)
        je _multiple_of_16_bytes_encrypt

        # Handle the last <16 Byte block separately
        paddd ONE(%rip), %xmm0          # INCR CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
        movdqu (%arg3,%r11,1), %xmm1    # receive the last <16 byte blocks
        lea SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu (%r12), %xmm2            # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1         # shift right 16-%r13 bytes
        pxor %xmm1, %xmm0               # Plaintext XOR Encrypt(K, Yn)
        movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of xmm0
        pand %xmm1, %xmm0               # mask out top 16-%r13 bytes of xmm0
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0
        # shuffle xmm0 back to output as ciphertext
        MOVQ_R64_XMM %xmm0, %rax
        jle _less_than_8_bytes_left_encrypt
        mov %rax, (%arg2 , %r11, 1)
        MOVQ_R64_XMM %xmm0, %rax
_less_than_8_bytes_left_encrypt:
        mov %al, (%arg2, %r11, 1)
        jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
        mov arg8, %r12                  # %r12 = aadLen (number of bytes)
        movd %r12d, %xmm15              # len(A) in %xmm15
        shl $3, %arg4                   # len(C) in bits (*8)
        MOVQ_R64_XMM %arg4, %xmm1
        pslldq $8, %xmm15               # %xmm15 = len(A)||0x0000000000000000
        pxor %xmm1, %xmm15              # %xmm15 = len(A)||len(C)
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8        # perform a 16 byte swap
        mov %arg5, %rax                 # %rax = *Y0
        movdqu (%rax), %xmm0            # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
        mov arg9, %r10                  # %r10 = authTag
        mov arg10, %r11                 # %r11 = auth_tag_len
        MOVQ_R64_XMM %xmm0, %rax
        jmp _return_T_done_encrypt
        MOVQ_R64_XMM %xmm0, %rax
        jmp _return_T_done_encrypt
        movdqu %xmm0, (%r10)
_return_T_done_encrypt:
ENDPROC(aesni_gcm_enc)
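
/*
 * Calling sketch from C, using the prototype documented above (buffer
 * preparation is the caller's job; the encrypt_one() helper name is
 * illustrative only):
 *
 *	void aesni_gcm_enc(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 *
 *	void encrypt_one(void *ctx, u8 *dst, const u8 *src, u64 len,
 *			 u8 iv[16], u8 hsubkey[16],
 *			 const u8 *aad, u64 aad_len, u8 tag[16])
 *	{
 *		// iv must hold salt||IV||0x00000001, hsubkey the GHASH key H
 *		aesni_gcm_enc(ctx, dst, src, len, iv, hsubkey,
 *			      aad, aad_len, tag, 16);
 *	}
 */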
_key_expansion_128:
_key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        shufps $0b10001100, %xmm0, %xmm4
        movaps %xmm0, (TKEYP)
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

_key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        shufps $0b10001100, %xmm0, %xmm4
        pshufd $0b11111111, %xmm0, %xmm3
        shufps $0b01000100, %xmm0, %xmm6
        movaps %xmm6, (TKEYP)
        shufps $0b01001110, %xmm2, %xmm1
        movaps %xmm1, 0x10(TKEYP)
ENDPROC(_key_expansion_192a)

_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        shufps $0b10001100, %xmm0, %xmm4
        pshufd $0b11111111, %xmm0, %xmm3
        movaps %xmm0, (TKEYP)
ENDPROC(_key_expansion_192b)

_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        shufps $0b10001100, %xmm2, %xmm4
        movaps %xmm2, (TKEYP)
ENDPROC(_key_expansion_256b)
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
        movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
        movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
        movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
        movups (UKEYP), %xmm0                   # user key (first 16 bytes)
        movaps %xmm0, (KEYP)
        lea 0x10(KEYP), TKEYP                   # key addr
        movl %edx, 480(KEYP)
        pxor %xmm4, %xmm4                       # xmm4 is assumed 0 in _key_expansion_x
        movups 0x10(UKEYP), %xmm2               # other user key
        movaps %xmm2, (TKEYP)
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_256a
        movq 0x10(UKEYP), %xmm2                 # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call _key_expansion_192b
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call _key_expansion_128
        movaps (KEYP), %xmm0
        movaps (TKEYP), %xmm1
        movaps %xmm0, 240(TKEYP)
        movaps %xmm1, 240(KEYP)
        lea 240-16(TKEYP), UKEYP
        movaps (KEYP), %xmm0
        movaps %xmm1, (UKEYP)
ENDPROC(aesni_set_key)
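
/*
 * What each AESKEYGENASSIST/_key_expansion_128 pair above computes, as a
 * C sketch of one AES-128 key-schedule step (sbox[] is assumed to be the
 * AES S-box; illustrative only, not part of the build):
 *
 *	extern const uint8_t sbox[256];
 *
 *	static uint32_t subword(uint32_t w)
 *	{
 *		return (uint32_t)sbox[w & 0xff] |
 *		       (uint32_t)sbox[(w >> 8) & 0xff] << 8 |
 *		       (uint32_t)sbox[(w >> 16) & 0xff] << 16 |
 *		       (uint32_t)sbox[(w >> 24) & 0xff] << 24;
 *	}
 *
 *	static void expand_128_step(const uint32_t prev[4], uint32_t next[4],
 *				    uint32_t rcon)
 *	{
 *		uint32_t t = prev[3];
 *
 *		t = (t >> 8) | (t << 24);	// RotWord
 *		t = subword(t) ^ rcon;		// SubWord + round constant
 *		next[0] = prev[0] ^ t;
 *		next[1] = prev[1] ^ next[0];
 *		next[2] = prev[2] ^ next[1];
 *		next[3] = prev[3] ^ next[2];
 *	}
 */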
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+20)(%esp), INP       # src
        movl 480(KEYP), KLEN                    # key length
        movups (INP), STATE                     # input
        movups STATE, (OUTP)                    # output

/*
 * _aesni_enc1: internal ABI
 * input:
 *      KEYP:   key struct pointer
 *      STATE:  initial state (input)
 * output:
 *      STATE:  final state (output)
 */
        movaps (KEYP), KEY              # key
        pxor KEY, STATE                 # round 0
        lea 0x20(TKEYP), TKEYP
        movaps -0x60(TKEYP), KEY
        movaps -0x50(TKEYP), KEY
        movaps -0x40(TKEYP), KEY
        movaps -0x30(TKEYP), KEY
        movaps -0x20(TKEYP), KEY
        movaps -0x10(TKEYP), KEY
        movaps 0x10(TKEYP), KEY
        movaps 0x20(TKEYP), KEY
        movaps 0x30(TKEYP), KEY
        movaps 0x40(TKEYP), KEY
        movaps 0x50(TKEYP), KEY
        movaps 0x60(TKEYP), KEY
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE
ENDPROC(_aesni_enc1)
/*
 * _aesni_enc4: internal ABI
 * input:
 *      KEYP:   key struct pointer
 *      STATE1: initial state (input)
 * output:
 *      STATE1: final state (output)
 */
        movaps (KEYP), KEY              # key
        pxor KEY, STATE1                # round 0
        lea 0x20(TKEYP), TKEYP
        movaps -0x60(TKEYP), KEY
        movaps -0x50(TKEYP), KEY
        movaps -0x40(TKEYP), KEY
        movaps -0x30(TKEYP), KEY
        movaps -0x20(TKEYP), KEY
        movaps -0x10(TKEYP), KEY
        movaps 0x10(TKEYP), KEY
        movaps 0x20(TKEYP), KEY
        movaps 0x30(TKEYP), KEY
        movaps 0x40(TKEYP), KEY
        movaps 0x50(TKEYP), KEY
        movaps 0x60(TKEYP), KEY
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
ENDPROC(_aesni_enc4)
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+20)(%esp), INP       # src
        mov 480(KEYP), KLEN                     # key length
        movups (INP), STATE                     # input
        movups STATE, (OUTP)                    # output

/*
 * _aesni_dec1: internal ABI
 * input:
 *      KEYP:   key struct pointer
 *      STATE:  initial state (input)
 * output:
 *      STATE:  final state (output)
 */
        movaps (KEYP), KEY              # key
        pxor KEY, STATE                 # round 0
        lea 0x20(TKEYP), TKEYP
        movaps -0x60(TKEYP), KEY
        movaps -0x50(TKEYP), KEY
        movaps -0x40(TKEYP), KEY
        movaps -0x30(TKEYP), KEY
        movaps -0x20(TKEYP), KEY
        movaps -0x10(TKEYP), KEY
        movaps 0x10(TKEYP), KEY
        movaps 0x20(TKEYP), KEY
        movaps 0x30(TKEYP), KEY
        movaps 0x40(TKEYP), KEY
        movaps 0x50(TKEYP), KEY
        movaps 0x60(TKEYP), KEY
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE
ENDPROC(_aesni_dec1)
/*
 * _aesni_dec4: internal ABI
 * input:
 *      KEYP:   key struct pointer
 *      STATE1: initial state (input)
 * output:
 *      STATE1: final state (output)
 */
        movaps (KEYP), KEY              # key
        pxor KEY, STATE1                # round 0
        lea 0x20(TKEYP), TKEYP
        movaps -0x60(TKEYP), KEY
        movaps -0x50(TKEYP), KEY
        movaps -0x40(TKEYP), KEY
        movaps -0x30(TKEYP), KEY
        movaps -0x20(TKEYP), KEY
        movaps -0x10(TKEYP), KEY
        movaps 0x10(TKEYP), KEY
        movaps 0x20(TKEYP), KEY
        movaps 0x30(TKEYP), KEY
        movaps 0x40(TKEYP), KEY
        movaps 0x50(TKEYP), KEY
        movaps 0x60(TKEYP), KEY
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
ENDPROC(_aesni_dec4)
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+24)(%esp), INP       # src
        movl (FRAME_OFFSET+28)(%esp), LEN       # len
        test LEN, LEN                           # check length
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        movups (INP), STATE1
        movups STATE1, (OUTP)
ENDPROC(aesni_ecb_enc)
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_dec)
        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+24)(%esp), INP       # src
        movl (FRAME_OFFSET+28)(%esp), LEN       # len
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        movups (INP), STATE1
        movups STATE1, (OUTP)
ENDPROC(aesni_ecb_dec)
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
        movups (IVP), STATE                     # load iv as initial state
        movups (INP), IN                        # load input
        movups STATE, (OUTP)                    # store output
ENDPROC(aesni_cbc_enc)
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
        jb .Lcbc_dec_just_ret
        movups 0x10(INP), IN2
        movups 0x20(INP), IN3
        movups 0x30(INP), IN4
        movups 0x20(INP), IN1
        movups 0x30(INP), IN2
        movups 0x10(INP), IN2
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        movups STATE, (OUTP)
ENDPROC(aesni_cbc_dec)
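
/*
 * The CBC chaining pattern implemented by the two routines above, as a C
 * sketch with aes_encrypt_block() standing in for _aesni_enc1
 * (illustrative only, not part of the build):
 *
 *	static void cbc_encrypt(const void *ctx, uint8_t *dst,
 *				const uint8_t *src, size_t len,
 *				uint8_t iv[16])
 *	{
 *		uint8_t state[16];
 *
 *		memcpy(state, iv, 16);
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			for (int i = 0; i < 16; i++)
 *				state[i] ^= src[off + i];  // P XOR prev C
 *			aes_encrypt_block(ctx, state, state);
 *			memcpy(dst + off, state, 16);	// C becomes the chain
 *		}
 *		memcpy(iv, state, 16);		// hand back the final IV
 *	}
 */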
.pushsection .rodata
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * output:
 *      CTR:    == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:    == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 */
        movaps .Lbswap_mask, BSWAP_MASK
        PSHUFB_XMM BSWAP_MASK CTR
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
ENDPROC(_aesni_inc_init)
/*
 * _aesni_inc: internal ABI
 * Increment IV by 1; IV is in big endian
 * input:
 *      CTR:    == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:    == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 * output:
 *      CTR:    == output IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 */
        PSHUFB_XMM BSWAP_MASK IV
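
/*
 * What _aesni_inc does, as a C sketch: the counter is kept byte-swapped
 * (little endian) in CTR, the low quadword is incremented, a carry
 * propagates into the high quadword, and the result is swapped back to
 * big endian for the output IV (illustrative only):
 *
 *	static void ctr_inc(uint64_t ctr[2])	// [0] = low, [1] = high
 *	{
 *		if (++ctr[0] == 0)	// carry out of the low qword
 *			++ctr[1];
 *	}
 */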
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
        jb .Lctr_enc_just_ret
        call _aesni_inc_init
        movups 0x10(INP), IN2
        movups 0x20(INP), IN3
        movups 0x30(INP), IN4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        movups STATE, (OUTP)
ENDPROC(aesni_ctr_enc)
/*
 * _aesni_gf128mul_x_ble: internal ABI
 * Multiply in GF(2^128) for XTS IVs
 * input:
 *      IV:     current IV
 *      GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *      IV:     next IV
 * changed:
 *      CTR:    == temporary value
 */
#define _aesni_gf128mul_x_ble() \
        pshufd $0x13, IV, CTR; \
        paddq IV, IV; \
        psrad $31, CTR; \
        pand GF128MUL_MASK, CTR; \
        pxor CTR, IV;
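
/*
 * The same multiply-by-x in C, on the 128-bit tweak held as two little
 * endian quadwords: shift left by one bit and, if a bit fell off the
 * top, fold it back in with the 0x87 reduction (illustrative only):
 *
 *	static void gf128mul_x_ble(uint64_t iv[2])  // [0] = low, [1] = high
 *	{
 *		uint64_t carry = iv[1] >> 63;	// bit shifted out of the top
 *
 *		iv[1] = (iv[1] << 1) | (iv[0] >> 63);
 *		iv[0] = (iv[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */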
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                       bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
        leaq _aesni_enc4, %r11
        leaq _aesni_dec4, %rax
        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
        movdqu 0x00(INP), INC
        movdqu IV, 0x00(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x10(INP), INC
        movdqu IV, 0x10(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x20(INP), INC
        movdqu IV, 0x20(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x30(INP), INC
        movdqu IV, 0x30(OUTP)
        movdqu 0x00(OUTP), INC
        movdqu STATE1, 0x00(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x40(INP), INC
        movdqu IV, 0x40(OUTP)
        movdqu 0x10(OUTP), INC
        movdqu STATE2, 0x10(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x50(INP), INC
        movdqu IV, 0x50(OUTP)
        movdqu 0x20(OUTP), INC
        movdqu STATE3, 0x20(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x60(INP), INC
        movdqu IV, 0x60(OUTP)
        movdqu 0x30(OUTP), INC
        movdqu STATE4, 0x30(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x70(INP), INC
        movdqu IV, 0x70(OUTP)
        _aesni_gf128mul_x_ble()
        movdqu 0x40(OUTP), INC
        movdqu STATE1, 0x40(OUTP)
        movdqu 0x50(OUTP), INC
        movdqu STATE2, 0x50(OUTP)
        movdqu 0x60(OUTP), INC
        movdqu STATE3, 0x60(OUTP)
        movdqu 0x70(OUTP), INC
        movdqu STATE4, 0x70(OUTP)
ENDPROC(aesni_xts_crypt8)