1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
b369e521 33#include <asm/inst.h>
8691ccd7 34#include <asm/frame.h>
54b6a1bd 35
36/*
37 * The following macros are used to move an (un)aligned 16 byte value to/from
 38 * an XMM register. This can be done for either FP or integer values, for FP use
39 * movaps (move aligned packed single) or integer use movdqa (move double quad
40 * aligned). It doesn't make a performance difference which instruction is used
41 * since Nehalem (original Core i7) was released. However, the movaps is a byte
42 * shorter, so that is the one we'll use for now. (same for unaligned).
43 */
44#define MOVADQ movaps
45#define MOVUDQ movups
46
559ad0ff 47#ifdef __x86_64__
e31ac32d 48
49# constants in mergeable sections, linker can reorder and merge
50.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
51.align 16
52.Lgf128mul_x_ble_mask:
53 .octa 0x00000000000000010000000000000087
54.section .rodata.cst16.POLY, "aM", @progbits, 16
55.align 16
0bd82f5f 56POLY: .octa 0xC2000000000000000000000000000001
57.section .rodata.cst16.TWOONE, "aM", @progbits, 16
58.align 16
59TWOONE: .octa 0x00000001000000000000000000000001
60
61.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
62.align 16
0bd82f5f 63SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
64.section .rodata.cst16.MASK1, "aM", @progbits, 16
65.align 16
0bd82f5f 66MASK1: .octa 0x0000000000000000ffffffffffffffff
67.section .rodata.cst16.MASK2, "aM", @progbits, 16
68.align 16
0bd82f5f 69MASK2: .octa 0xffffffffffffffff0000000000000000
70.section .rodata.cst16.ONE, "aM", @progbits, 16
71.align 16
0bd82f5f 72ONE: .octa 0x00000000000000000000000000000001
73.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
74.align 16
0bd82f5f 75F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
76.section .rodata.cst16.dec, "aM", @progbits, 16
77.align 16
0bd82f5f 78dec: .octa 0x1
79.section .rodata.cst16.enc, "aM", @progbits, 16
80.align 16
81enc: .octa 0x2
82
83# order of these constants should not change.
84# more specifically, ALL_F should follow SHIFT_MASK,
85# and zero should follow ALL_F
86.section .rodata, "a", @progbits
87.align 16
88SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
89ALL_F: .octa 0xffffffffffffffffffffffffffffffff
90 .octa 0x00000000000000000000000000000000
91
0bd82f5f 92
93.text
94
95
96#define STACK_OFFSET 8*3
97#define HashKey 16*0 // store HashKey <<1 mod poly here
98#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
99#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
100#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
101#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
102 // bits of HashKey <<1 mod poly here
103 //(for Karatsuba purposes)
104#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
105 // bits of HashKey^2 <<1 mod poly here
106 // (for Karatsuba purposes)
107#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^3 <<1 mod poly here
109 // (for Karatsuba purposes)
110#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^4 <<1 mod poly here
112 // (for Karatsuba purposes)
113#define VARIABLE_OFFSET 16*8
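/*
 * Illustration only: the VARIABLE_OFFSET bytes reserved on the aligned
 * stack can be pictured as the hypothetical C struct below (the field
 * names simply mirror the offsets defined above; no such struct exists
 * in the glue code):
 *
 *	struct hash_key_frame {
 *		u8 hash_key[16];	// HashKey<<1 mod poly
 *		u8 hash_key_2[16];	// HashKey^2<<1 mod poly
 *		u8 hash_key_3[16];	// HashKey^3<<1 mod poly
 *		u8 hash_key_4[16];	// HashKey^4<<1 mod poly
 *		u8 hash_key_k[16];	// high64 ^ low64 of HashKey<<1 (Karatsuba)
 *		u8 hash_key_2_k[16];	// likewise for HashKey^2<<1
 *		u8 hash_key_3_k[16];	// likewise for HashKey^3<<1
 *		u8 hash_key_4_k[16];	// likewise for HashKey^4<<1
 *	};				// sizeof == 16*8 == VARIABLE_OFFSET
 */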
114
115#define arg1 rdi
116#define arg2 rsi
117#define arg3 rdx
118#define arg4 rcx
119#define arg5 r8
120#define arg6 r9
121#define arg7 STACK_OFFSET+8(%r14)
122#define arg8 STACK_OFFSET+16(%r14)
123#define arg9 STACK_OFFSET+24(%r14)
124#define arg10 STACK_OFFSET+32(%r14)
e31ac32d 125#define keysize 2*15*16(%arg1)
559ad0ff 126#endif
127
128
129#define STATE1 %xmm0
130#define STATE2 %xmm4
131#define STATE3 %xmm5
132#define STATE4 %xmm6
133#define STATE STATE1
134#define IN1 %xmm1
135#define IN2 %xmm7
136#define IN3 %xmm8
137#define IN4 %xmm9
138#define IN IN1
139#define KEY %xmm2
140#define IV %xmm3
0d258efb 141
142#define BSWAP_MASK %xmm10
143#define CTR %xmm11
144#define INC %xmm12
54b6a1bd 145
146#define GF128MUL_MASK %xmm10
147
148#ifdef __x86_64__
149#define AREG %rax
150#define KEYP %rdi
151#define OUTP %rsi
0d258efb 152#define UKEYP OUTP
153#define INP %rdx
154#define LEN %rcx
155#define IVP %r8
156#define KLEN %r9d
157#define T1 %r10
158#define TKEYP T1
159#define T2 %r11
12387a46 160#define TCTR_LOW T2
161#else
162#define AREG %eax
163#define KEYP %edi
164#define OUTP AREG
165#define UKEYP OUTP
166#define INP %edx
167#define LEN %esi
168#define IVP %ebp
169#define KLEN %ebx
170#define T1 %ecx
171#define TKEYP T1
172#endif
54b6a1bd 173
0bd82f5f 174
559ad0ff 175#ifdef __x86_64__
176/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
177*
178*
179* Input: A and B (128-bits each, bit-reflected)
180* Output: C = A*B*x mod poly, (i.e. >>1 )
181* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
182* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
183*
184*/
185.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
186 movdqa \GH, \TMP1
187 pshufd $78, \GH, \TMP2
188 pshufd $78, \HK, \TMP3
189 pxor \GH, \TMP2 # TMP2 = a1+a0
190 pxor \HK, \TMP3 # TMP3 = b1+b0
191 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
192 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
193 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
194 pxor \GH, \TMP2
 195 pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1)
196 movdqa \TMP2, \TMP3
197 pslldq $8, \TMP3 # left shift TMP3 2 DWs
198 psrldq $8, \TMP2 # right shift TMP2 2 DWs
199 pxor \TMP3, \GH
 200 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
201
202 # first phase of the reduction
203
204 movdqa \GH, \TMP2
205 movdqa \GH, \TMP3
206 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
 207 # in order to perform
208 # independent shifts
 209 pslld $31, \TMP2 # packed left shift <<31
 210 pslld $30, \TMP3 # packed left shift <<30
 211 pslld $25, \TMP4 # packed left shift <<25
212 pxor \TMP3, \TMP2 # xor the shifted versions
213 pxor \TMP4, \TMP2
214 movdqa \TMP2, \TMP5
215 psrldq $4, \TMP5 # right shift TMP5 1 DW
216 pslldq $12, \TMP2 # left shift TMP2 3 DWs
217 pxor \TMP2, \GH
218
219 # second phase of the reduction
220
221 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
222 # in in order to perform
223 # independent shifts
224 movdqa \GH,\TMP3
225 movdqa \GH,\TMP4
 226 psrld $1,\TMP2 # packed right shift >>1
 227 psrld $2,\TMP3 # packed right shift >>2
 228 psrld $7,\TMP4 # packed right shift >>7
229 pxor \TMP3,\TMP2 # xor the shifted versions
230 pxor \TMP4,\TMP2
231 pxor \TMP5, \TMP2
232 pxor \TMP2, \GH
 233 pxor \TMP1, \GH # result is in GH
234.endm
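/*
 * Reference model (illustration only): the same GF(2^128) multiplication,
 * written as the plain bit-serial "right shift" algorithm from the GCM
 * specification rather than the Karatsuba/PCLMULQDQ form above.  It works
 * on the raw HashKey (no <<1 pre-shift) and uses a made-up function name;
 * it is only meant for checking the macro against test vectors.
 *
 *	static void ghash_gf128_mul(u8 Z[16], const u8 X[16], const u8 H[16])
 *	{
 *		u8 V[16], carry;
 *		int i, j;
 *
 *		memcpy(V, H, 16);
 *		memset(Z, 0, 16);
 *		for (i = 0; i < 128; i++) {
 *			// bit i of X, MSB first within each byte (GHASH bit order)
 *			if (X[i / 8] & (0x80 >> (i % 8)))
 *				for (j = 0; j < 16; j++)
 *					Z[j] ^= V[j];
 *			// V *= x, reducing by x^128 + x^127 + x^126 + x^121 + 1
 *			carry = V[15] & 1;
 *			for (j = 15; j > 0; j--)
 *				V[j] = (V[j] >> 1) | (V[j - 1] << 7);
 *			V[0] >>= 1;
 *			if (carry)
 *				V[0] ^= 0xe1;
 *		}
 *	}
 */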
235
236/*
237* if a = number of total plaintext bytes
238* b = floor(a/16)
239* num_initial_blocks = b mod 4
240* encrypt the initial num_initial_blocks blocks and apply ghash on
241* the ciphertext
242* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
243* are clobbered
 244* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
245*/
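/*
 * Worked example (illustrative): with a = 100 bytes of plaintext,
 * b = floor(100/16) = 6 full blocks, so num_initial_blocks = 6 mod 4 = 2;
 * the remaining 4 full blocks are then handled by the 4-blocks-at-a-time
 * main loop and the trailing 4 bytes by the partial-block path.
 */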
246
0bd82f5f 247
248.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
249XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
e31ac32d 250 MOVADQ SHUF_MASK(%rip), %xmm14
251 mov arg7, %r10 # %r10 = AAD
252 mov arg8, %r12 # %r12 = aadLen
253 mov %r12, %r11
254 pxor %xmm\i, %xmm\i
e31ac32d 255
256_get_AAD_loop\num_initial_blocks\operation:
257 movd (%r10), \TMP1
258 pslldq $12, \TMP1
259 psrldq $4, %xmm\i
260 pxor \TMP1, %xmm\i
261 add $4, %r10
262 sub $4, %r12
263 jne _get_AAD_loop\num_initial_blocks\operation
e31ac32d 264
265 cmp $16, %r11
266 je _get_AAD_loop2_done\num_initial_blocks\operation
e31ac32d 267
268 mov $16, %r12
269_get_AAD_loop2\num_initial_blocks\operation:
270 psrldq $4, %xmm\i
271 sub $4, %r12
272 cmp %r11, %r12
273 jne _get_AAD_loop2\num_initial_blocks\operation
e31ac32d 274
0bd82f5f 275_get_AAD_loop2_done\num_initial_blocks\operation:
276 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
277
278 xor %r11, %r11 # initialise the data pointer offset as zero
279
280 # start AES for num_initial_blocks blocks
281
282 mov %arg5, %rax # %rax = *Y0
283 movdqu (%rax), \XMM0 # XMM0 = Y0
284 PSHUFB_XMM %xmm14, \XMM0
285
286.if (\i == 5) || (\i == 6) || (\i == 7)
287 MOVADQ ONE(%RIP),\TMP1
288 MOVADQ (%arg1),\TMP2
0bd82f5f 289.irpc index, \i_seq
e31ac32d 290 paddd \TMP1, \XMM0 # INCR Y0
0bd82f5f 291 movdqa \XMM0, %xmm\index
3c097b80 292 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
e31ac32d 293 pxor \TMP2, %xmm\index
0bd82f5f 294.endr
295 lea 0x10(%arg1),%r10
296 mov keysize,%eax
297 shr $2,%eax # 128->4, 192->6, 256->8
298 add $5,%eax # 128->9, 192->11, 256->13
299
300aes_loop_initial_dec\num_initial_blocks:
301 MOVADQ (%r10),\TMP1
302.irpc index, \i_seq
303 AESENC \TMP1, %xmm\index
0bd82f5f 304.endr
305 add $16,%r10
306 sub $1,%eax
307 jnz aes_loop_initial_dec\num_initial_blocks
308
309 MOVADQ (%r10), \TMP1
0bd82f5f 310.irpc index, \i_seq
e31ac32d 311 AESENCLAST \TMP1, %xmm\index # Last Round
312.endr
313.irpc index, \i_seq
314 movdqu (%arg3 , %r11, 1), \TMP1
315 pxor \TMP1, %xmm\index
316 movdqu %xmm\index, (%arg2 , %r11, 1)
317 # write back plaintext/ciphertext for num_initial_blocks
318 add $16, %r11
3c097b80 319
0bd82f5f 320 movdqa \TMP1, %xmm\index
3c097b80 321 PSHUFB_XMM %xmm14, %xmm\index
e31ac32d 322 # prepare plaintext/ciphertext for GHASH computation
323.endr
324.endif
325 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326 # apply GHASH on num_initial_blocks blocks
327
328.if \i == 5
329 pxor %xmm5, %xmm6
330 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
331 pxor %xmm6, %xmm7
332 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
333 pxor %xmm7, %xmm8
334 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
335.elseif \i == 6
336 pxor %xmm6, %xmm7
337 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
338 pxor %xmm7, %xmm8
339 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
340.elseif \i == 7
341 pxor %xmm7, %xmm8
342 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
343.endif
344 cmp $64, %r13
345 jl _initial_blocks_done\num_initial_blocks\operation
346 # no need for precomputed values
347/*
348*
349* Precomputations for HashKey parallel with encryption of first 4 blocks.
 350* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
351*/
352 MOVADQ ONE(%rip), \TMP1
353 paddd \TMP1, \XMM0 # INCR Y0
354 MOVADQ \XMM0, \XMM1
355 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
356
357 paddd \TMP1, \XMM0 # INCR Y0
358 MOVADQ \XMM0, \XMM2
359 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
360
361 paddd \TMP1, \XMM0 # INCR Y0
362 MOVADQ \XMM0, \XMM3
363 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
364
365 paddd \TMP1, \XMM0 # INCR Y0
366 MOVADQ \XMM0, \XMM4
367 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
368
369 MOVADQ 0(%arg1),\TMP1
370 pxor \TMP1, \XMM1
371 pxor \TMP1, \XMM2
372 pxor \TMP1, \XMM3
373 pxor \TMP1, \XMM4
374 movdqa \TMP3, \TMP5
375 pshufd $78, \TMP3, \TMP1
376 pxor \TMP3, \TMP1
377 movdqa \TMP1, HashKey_k(%rsp)
378 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
379# TMP5 = HashKey^2<<1 (mod poly)
380 movdqa \TMP5, HashKey_2(%rsp)
381# HashKey_2 = HashKey^2<<1 (mod poly)
382 pshufd $78, \TMP5, \TMP1
383 pxor \TMP5, \TMP1
384 movdqa \TMP1, HashKey_2_k(%rsp)
385.irpc index, 1234 # do 4 rounds
386 movaps 0x10*\index(%arg1), \TMP1
387 AESENC \TMP1, \XMM1
388 AESENC \TMP1, \XMM2
389 AESENC \TMP1, \XMM3
390 AESENC \TMP1, \XMM4
391.endr
392 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
393# TMP5 = HashKey^3<<1 (mod poly)
394 movdqa \TMP5, HashKey_3(%rsp)
395 pshufd $78, \TMP5, \TMP1
396 pxor \TMP5, \TMP1
397 movdqa \TMP1, HashKey_3_k(%rsp)
398.irpc index, 56789 # do next 5 rounds
399 movaps 0x10*\index(%arg1), \TMP1
400 AESENC \TMP1, \XMM1
401 AESENC \TMP1, \XMM2
402 AESENC \TMP1, \XMM3
403 AESENC \TMP1, \XMM4
404.endr
405 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 406# TMP5 = HashKey^4<<1 (mod poly)
407 movdqa \TMP5, HashKey_4(%rsp)
408 pshufd $78, \TMP5, \TMP1
409 pxor \TMP5, \TMP1
410 movdqa \TMP1, HashKey_4_k(%rsp)
411 lea 0xa0(%arg1),%r10
412 mov keysize,%eax
413 shr $2,%eax # 128->4, 192->6, 256->8
414 sub $4,%eax # 128->0, 192->2, 256->4
415 jz aes_loop_pre_dec_done\num_initial_blocks
416
417aes_loop_pre_dec\num_initial_blocks:
418 MOVADQ (%r10),\TMP2
419.irpc index, 1234
420 AESENC \TMP2, %xmm\index
421.endr
422 add $16,%r10
423 sub $1,%eax
424 jnz aes_loop_pre_dec\num_initial_blocks
425
426aes_loop_pre_dec_done\num_initial_blocks:
427 MOVADQ (%r10), \TMP2
428 AESENCLAST \TMP2, \XMM1
429 AESENCLAST \TMP2, \XMM2
430 AESENCLAST \TMP2, \XMM3
431 AESENCLAST \TMP2, \XMM4
432 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
433 pxor \TMP1, \XMM1
434 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
435 movdqa \TMP1, \XMM1
436 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
437 pxor \TMP1, \XMM2
438 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
439 movdqa \TMP1, \XMM2
440 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
441 pxor \TMP1, \XMM3
442 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
443 movdqa \TMP1, \XMM3
444 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
445 pxor \TMP1, \XMM4
446 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
447 movdqa \TMP1, \XMM4
3c097b80 448 add $64, %r11
449 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
450 pxor \XMMDst, \XMM1
451# combine GHASHed value with the corresponding ciphertext
3c097b80 452 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
3c097b80 453 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
454 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
455
456_initial_blocks_done\num_initial_blocks\operation:
457
458.endm
459
460
461/*
462* if a = number of total plaintext bytes
463* b = floor(a/16)
464* num_initial_blocks = b mod 4
465* encrypt the initial num_initial_blocks blocks and apply ghash on
466* the ciphertext
467* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
468* are clobbered
 469* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
470*/
471
472
473.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
474XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
e31ac32d 475 MOVADQ SHUF_MASK(%rip), %xmm14
476 mov arg7, %r10 # %r10 = AAD
477 mov arg8, %r12 # %r12 = aadLen
478 mov %r12, %r11
479 pxor %xmm\i, %xmm\i
480_get_AAD_loop\num_initial_blocks\operation:
481 movd (%r10), \TMP1
482 pslldq $12, \TMP1
483 psrldq $4, %xmm\i
484 pxor \TMP1, %xmm\i
485 add $4, %r10
486 sub $4, %r12
487 jne _get_AAD_loop\num_initial_blocks\operation
488 cmp $16, %r11
489 je _get_AAD_loop2_done\num_initial_blocks\operation
490 mov $16, %r12
491_get_AAD_loop2\num_initial_blocks\operation:
492 psrldq $4, %xmm\i
493 sub $4, %r12
494 cmp %r11, %r12
495 jne _get_AAD_loop2\num_initial_blocks\operation
496_get_AAD_loop2_done\num_initial_blocks\operation:
497 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
498
499 xor %r11, %r11 # initialise the data pointer offset as zero
500
501 # start AES for num_initial_blocks blocks
502
503 mov %arg5, %rax # %rax = *Y0
504 movdqu (%rax), \XMM0 # XMM0 = Y0
505 PSHUFB_XMM %xmm14, \XMM0
506
507.if (\i == 5) || (\i == 6) || (\i == 7)
3c097b80 508
509 MOVADQ ONE(%RIP),\TMP1
510 MOVADQ 0(%arg1),\TMP2
3c097b80 511.irpc index, \i_seq
512 paddd \TMP1, \XMM0 # INCR Y0
513 MOVADQ \XMM0, %xmm\index
514 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
515 pxor \TMP2, %xmm\index
3c097b80 516.endr
517 lea 0x10(%arg1),%r10
518 mov keysize,%eax
519 shr $2,%eax # 128->4, 192->6, 256->8
520 add $5,%eax # 128->9, 192->11, 256->13
521
522aes_loop_initial_enc\num_initial_blocks:
523 MOVADQ (%r10),\TMP1
524.irpc index, \i_seq
525 AESENC \TMP1, %xmm\index
3c097b80 526.endr
527 add $16,%r10
528 sub $1,%eax
529 jnz aes_loop_initial_enc\num_initial_blocks
530
531 MOVADQ (%r10), \TMP1
3c097b80 532.irpc index, \i_seq
e31ac32d 533 AESENCLAST \TMP1, %xmm\index # Last Round
534.endr
535.irpc index, \i_seq
536 movdqu (%arg3 , %r11, 1), \TMP1
537 pxor \TMP1, %xmm\index
538 movdqu %xmm\index, (%arg2 , %r11, 1)
539 # write back plaintext/ciphertext for num_initial_blocks
540 add $16, %r11
541 PSHUFB_XMM %xmm14, %xmm\index
542
543 # prepare plaintext/ciphertext for GHASH computation
544.endr
545.endif
546 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
547 # apply GHASH on num_initial_blocks blocks
548
549.if \i == 5
550 pxor %xmm5, %xmm6
551 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
552 pxor %xmm6, %xmm7
553 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
554 pxor %xmm7, %xmm8
555 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
556.elseif \i == 6
557 pxor %xmm6, %xmm7
558 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
559 pxor %xmm7, %xmm8
560 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
561.elseif \i == 7
562 pxor %xmm7, %xmm8
563 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
564.endif
565 cmp $64, %r13
566 jl _initial_blocks_done\num_initial_blocks\operation
567 # no need for precomputed values
568/*
569*
570* Precomputations for HashKey parallel with encryption of first 4 blocks.
 571* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
572*/
573 MOVADQ ONE(%RIP),\TMP1
574 paddd \TMP1, \XMM0 # INCR Y0
575 MOVADQ \XMM0, \XMM1
576 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
577
578 paddd \TMP1, \XMM0 # INCR Y0
579 MOVADQ \XMM0, \XMM2
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd \TMP1, \XMM0 # INCR Y0
583 MOVADQ \XMM0, \XMM3
584 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
585
586 paddd \TMP1, \XMM0 # INCR Y0
587 MOVADQ \XMM0, \XMM4
588 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
589
590 MOVADQ 0(%arg1),\TMP1
591 pxor \TMP1, \XMM1
592 pxor \TMP1, \XMM2
593 pxor \TMP1, \XMM3
594 pxor \TMP1, \XMM4
595 movdqa \TMP3, \TMP5
596 pshufd $78, \TMP3, \TMP1
597 pxor \TMP3, \TMP1
598 movdqa \TMP1, HashKey_k(%rsp)
599 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
600# TMP5 = HashKey^2<<1 (mod poly)
601 movdqa \TMP5, HashKey_2(%rsp)
602# HashKey_2 = HashKey^2<<1 (mod poly)
603 pshufd $78, \TMP5, \TMP1
604 pxor \TMP5, \TMP1
605 movdqa \TMP1, HashKey_2_k(%rsp)
606.irpc index, 1234 # do 4 rounds
607 movaps 0x10*\index(%arg1), \TMP1
608 AESENC \TMP1, \XMM1
609 AESENC \TMP1, \XMM2
610 AESENC \TMP1, \XMM3
611 AESENC \TMP1, \XMM4
612.endr
613 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
614# TMP5 = HashKey^3<<1 (mod poly)
615 movdqa \TMP5, HashKey_3(%rsp)
616 pshufd $78, \TMP5, \TMP1
617 pxor \TMP5, \TMP1
618 movdqa \TMP1, HashKey_3_k(%rsp)
619.irpc index, 56789 # do next 5 rounds
620 movaps 0x10*\index(%arg1), \TMP1
621 AESENC \TMP1, \XMM1
622 AESENC \TMP1, \XMM2
623 AESENC \TMP1, \XMM3
624 AESENC \TMP1, \XMM4
625.endr
626 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 627# TMP5 = HashKey^4<<1 (mod poly)
628 movdqa \TMP5, HashKey_4(%rsp)
629 pshufd $78, \TMP5, \TMP1
630 pxor \TMP5, \TMP1
631 movdqa \TMP1, HashKey_4_k(%rsp)
632 lea 0xa0(%arg1),%r10
633 mov keysize,%eax
634 shr $2,%eax # 128->4, 192->6, 256->8
635 sub $4,%eax # 128->0, 192->2, 256->4
636 jz aes_loop_pre_enc_done\num_initial_blocks
637
638aes_loop_pre_enc\num_initial_blocks:
639 MOVADQ (%r10),\TMP2
640.irpc index, 1234
641 AESENC \TMP2, %xmm\index
642.endr
643 add $16,%r10
644 sub $1,%eax
645 jnz aes_loop_pre_enc\num_initial_blocks
646
647aes_loop_pre_enc_done\num_initial_blocks:
648 MOVADQ (%r10), \TMP2
649 AESENCLAST \TMP2, \XMM1
650 AESENCLAST \TMP2, \XMM2
651 AESENCLAST \TMP2, \XMM3
652 AESENCLAST \TMP2, \XMM4
653 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
654 pxor \TMP1, \XMM1
655 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
656 pxor \TMP1, \XMM2
657 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
658 pxor \TMP1, \XMM3
659 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
660 pxor \TMP1, \XMM4
661 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
662 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
663 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
664 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
3c097b80 665
0bd82f5f 666 add $64, %r11
3c097b80 667 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
668 pxor \XMMDst, \XMM1
669# combine GHASHed value with the corresponding ciphertext
3c097b80 670 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
3c097b80 671 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
672 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
673
0bd82f5f 674_initial_blocks_done\num_initial_blocks\operation:
3c097b80 675
676.endm
677
678/*
679* encrypt 4 blocks at a time
680* ghash the 4 previously encrypted ciphertext blocks
681* arg1, %arg2, %arg3 are used as pointers only, not modified
682* %r11 is the data offset value
683*/
684.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
685TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
686
687 movdqa \XMM1, \XMM5
688 movdqa \XMM2, \XMM6
689 movdqa \XMM3, \XMM7
690 movdqa \XMM4, \XMM8
691
692 movdqa SHUF_MASK(%rip), %xmm15
693 # multiply TMP5 * HashKey using karatsuba
694
695 movdqa \XMM5, \TMP4
696 pshufd $78, \XMM5, \TMP6
697 pxor \XMM5, \TMP6
698 paddd ONE(%rip), \XMM0 # INCR CNT
699 movdqa HashKey_4(%rsp), \TMP5
700 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
701 movdqa \XMM0, \XMM1
702 paddd ONE(%rip), \XMM0 # INCR CNT
703 movdqa \XMM0, \XMM2
704 paddd ONE(%rip), \XMM0 # INCR CNT
705 movdqa \XMM0, \XMM3
706 paddd ONE(%rip), \XMM0 # INCR CNT
707 movdqa \XMM0, \XMM4
708 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
709 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
710 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
711 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
712 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
713
714 pxor (%arg1), \XMM1
715 pxor (%arg1), \XMM2
716 pxor (%arg1), \XMM3
717 pxor (%arg1), \XMM4
718 movdqa HashKey_4_k(%rsp), \TMP5
719 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
720 movaps 0x10(%arg1), \TMP1
721 AESENC \TMP1, \XMM1 # Round 1
722 AESENC \TMP1, \XMM2
723 AESENC \TMP1, \XMM3
724 AESENC \TMP1, \XMM4
725 movaps 0x20(%arg1), \TMP1
726 AESENC \TMP1, \XMM1 # Round 2
727 AESENC \TMP1, \XMM2
728 AESENC \TMP1, \XMM3
729 AESENC \TMP1, \XMM4
730 movdqa \XMM6, \TMP1
731 pshufd $78, \XMM6, \TMP2
732 pxor \XMM6, \TMP2
733 movdqa HashKey_3(%rsp), \TMP5
734 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
735 movaps 0x30(%arg1), \TMP3
736 AESENC \TMP3, \XMM1 # Round 3
737 AESENC \TMP3, \XMM2
738 AESENC \TMP3, \XMM3
739 AESENC \TMP3, \XMM4
740 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
741 movaps 0x40(%arg1), \TMP3
742 AESENC \TMP3, \XMM1 # Round 4
743 AESENC \TMP3, \XMM2
744 AESENC \TMP3, \XMM3
745 AESENC \TMP3, \XMM4
746 movdqa HashKey_3_k(%rsp), \TMP5
747 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
748 movaps 0x50(%arg1), \TMP3
749 AESENC \TMP3, \XMM1 # Round 5
750 AESENC \TMP3, \XMM2
751 AESENC \TMP3, \XMM3
752 AESENC \TMP3, \XMM4
753 pxor \TMP1, \TMP4
754# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
755 pxor \XMM6, \XMM5
756 pxor \TMP2, \TMP6
757 movdqa \XMM7, \TMP1
758 pshufd $78, \XMM7, \TMP2
759 pxor \XMM7, \TMP2
760 movdqa HashKey_2(%rsp ), \TMP5
761
762 # Multiply TMP5 * HashKey using karatsuba
763
764 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
765 movaps 0x60(%arg1), \TMP3
766 AESENC \TMP3, \XMM1 # Round 6
767 AESENC \TMP3, \XMM2
768 AESENC \TMP3, \XMM3
769 AESENC \TMP3, \XMM4
770 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
771 movaps 0x70(%arg1), \TMP3
772 AESENC \TMP3, \XMM1 # Round 7
773 AESENC \TMP3, \XMM2
774 AESENC \TMP3, \XMM3
775 AESENC \TMP3, \XMM4
776 movdqa HashKey_2_k(%rsp), \TMP5
777 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
778 movaps 0x80(%arg1), \TMP3
779 AESENC \TMP3, \XMM1 # Round 8
780 AESENC \TMP3, \XMM2
781 AESENC \TMP3, \XMM3
782 AESENC \TMP3, \XMM4
783 pxor \TMP1, \TMP4
784# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
785 pxor \XMM7, \XMM5
786 pxor \TMP2, \TMP6
787
788 # Multiply XMM8 * HashKey
789 # XMM8 and TMP5 hold the values for the two operands
790
791 movdqa \XMM8, \TMP1
792 pshufd $78, \XMM8, \TMP2
793 pxor \XMM8, \TMP2
794 movdqa HashKey(%rsp), \TMP5
795 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
796 movaps 0x90(%arg1), \TMP3
797 AESENC \TMP3, \XMM1 # Round 9
798 AESENC \TMP3, \XMM2
799 AESENC \TMP3, \XMM3
800 AESENC \TMP3, \XMM4
801 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
802 lea 0xa0(%arg1),%r10
803 mov keysize,%eax
804 shr $2,%eax # 128->4, 192->6, 256->8
805 sub $4,%eax # 128->0, 192->2, 256->4
806 jz aes_loop_par_enc_done
807
808aes_loop_par_enc:
809 MOVADQ (%r10),\TMP3
810.irpc index, 1234
811 AESENC \TMP3, %xmm\index
812.endr
813 add $16,%r10
814 sub $1,%eax
815 jnz aes_loop_par_enc
816
817aes_loop_par_enc_done:
818 MOVADQ (%r10), \TMP3
819 AESENCLAST \TMP3, \XMM1 # Round 10
820 AESENCLAST \TMP3, \XMM2
821 AESENCLAST \TMP3, \XMM3
822 AESENCLAST \TMP3, \XMM4
823 movdqa HashKey_k(%rsp), \TMP5
824 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
825 movdqu (%arg3,%r11,1), \TMP3
826 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
827 movdqu 16(%arg3,%r11,1), \TMP3
828 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
829 movdqu 32(%arg3,%r11,1), \TMP3
830 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
831 movdqu 48(%arg3,%r11,1), \TMP3
832 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
833 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
834 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
835 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
836 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
837 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
838 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
839 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
840 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
841
842 pxor \TMP4, \TMP1
843 pxor \XMM8, \XMM5
844 pxor \TMP6, \TMP2
845 pxor \TMP1, \TMP2
846 pxor \XMM5, \TMP2
847 movdqa \TMP2, \TMP3
848 pslldq $8, \TMP3 # left shift TMP3 2 DWs
849 psrldq $8, \TMP2 # right shift TMP2 2 DWs
850 pxor \TMP3, \XMM5
851 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
852
853 # first phase of reduction
854
855 movdqa \XMM5, \TMP2
856 movdqa \XMM5, \TMP3
857 movdqa \XMM5, \TMP4
858# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 859 pslld $31, \TMP2 # packed left shift << 31
 860 pslld $30, \TMP3 # packed left shift << 30
 861 pslld $25, \TMP4 # packed left shift << 25
862 pxor \TMP3, \TMP2 # xor the shifted versions
863 pxor \TMP4, \TMP2
864 movdqa \TMP2, \TMP5
865 psrldq $4, \TMP5 # right shift T5 1 DW
866 pslldq $12, \TMP2 # left shift T2 3 DWs
867 pxor \TMP2, \XMM5
868
869 # second phase of reduction
870
871 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
872 movdqa \XMM5,\TMP3
873 movdqa \XMM5,\TMP4
 874 psrld $1, \TMP2 # packed right shift >>1
 875 psrld $2, \TMP3 # packed right shift >>2
 876 psrld $7, \TMP4 # packed right shift >>7
877 pxor \TMP3,\TMP2 # xor the shifted versions
878 pxor \TMP4,\TMP2
879 pxor \TMP5, \TMP2
880 pxor \TMP2, \XMM5
 881 pxor \TMP1, \XMM5 # result is in XMM5
882
883 pxor \XMM5, \XMM1
884.endm
885
886/*
887* decrypt 4 blocks at a time
888* ghash the 4 previously decrypted ciphertext blocks
889* arg1, %arg2, %arg3 are used as pointers only, not modified
890* %r11 is the data offset value
891*/
892.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
893TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
894
895 movdqa \XMM1, \XMM5
896 movdqa \XMM2, \XMM6
897 movdqa \XMM3, \XMM7
898 movdqa \XMM4, \XMM8
899
3c097b80 900 movdqa SHUF_MASK(%rip), %xmm15
901 # multiply TMP5 * HashKey using karatsuba
902
903 movdqa \XMM5, \TMP4
904 pshufd $78, \XMM5, \TMP6
905 pxor \XMM5, \TMP6
906 paddd ONE(%rip), \XMM0 # INCR CNT
907 movdqa HashKey_4(%rsp), \TMP5
908 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
909 movdqa \XMM0, \XMM1
910 paddd ONE(%rip), \XMM0 # INCR CNT
911 movdqa \XMM0, \XMM2
912 paddd ONE(%rip), \XMM0 # INCR CNT
913 movdqa \XMM0, \XMM3
914 paddd ONE(%rip), \XMM0 # INCR CNT
915 movdqa \XMM0, \XMM4
3c097b80 916 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
0bd82f5f 917 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
918 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
919 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
920 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
921
922 pxor (%arg1), \XMM1
923 pxor (%arg1), \XMM2
924 pxor (%arg1), \XMM3
925 pxor (%arg1), \XMM4
926 movdqa HashKey_4_k(%rsp), \TMP5
927 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
928 movaps 0x10(%arg1), \TMP1
929 AESENC \TMP1, \XMM1 # Round 1
930 AESENC \TMP1, \XMM2
931 AESENC \TMP1, \XMM3
932 AESENC \TMP1, \XMM4
933 movaps 0x20(%arg1), \TMP1
934 AESENC \TMP1, \XMM1 # Round 2
935 AESENC \TMP1, \XMM2
936 AESENC \TMP1, \XMM3
937 AESENC \TMP1, \XMM4
938 movdqa \XMM6, \TMP1
939 pshufd $78, \XMM6, \TMP2
940 pxor \XMM6, \TMP2
941 movdqa HashKey_3(%rsp), \TMP5
942 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
943 movaps 0x30(%arg1), \TMP3
944 AESENC \TMP3, \XMM1 # Round 3
945 AESENC \TMP3, \XMM2
946 AESENC \TMP3, \XMM3
947 AESENC \TMP3, \XMM4
948 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
949 movaps 0x40(%arg1), \TMP3
950 AESENC \TMP3, \XMM1 # Round 4
951 AESENC \TMP3, \XMM2
952 AESENC \TMP3, \XMM3
953 AESENC \TMP3, \XMM4
954 movdqa HashKey_3_k(%rsp), \TMP5
955 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
956 movaps 0x50(%arg1), \TMP3
957 AESENC \TMP3, \XMM1 # Round 5
958 AESENC \TMP3, \XMM2
959 AESENC \TMP3, \XMM3
960 AESENC \TMP3, \XMM4
961 pxor \TMP1, \TMP4
962# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
963 pxor \XMM6, \XMM5
964 pxor \TMP2, \TMP6
965 movdqa \XMM7, \TMP1
966 pshufd $78, \XMM7, \TMP2
967 pxor \XMM7, \TMP2
968 movdqa HashKey_2(%rsp ), \TMP5
969
970 # Multiply TMP5 * HashKey using karatsuba
971
972 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
973 movaps 0x60(%arg1), \TMP3
974 AESENC \TMP3, \XMM1 # Round 6
975 AESENC \TMP3, \XMM2
976 AESENC \TMP3, \XMM3
977 AESENC \TMP3, \XMM4
978 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
979 movaps 0x70(%arg1), \TMP3
980 AESENC \TMP3, \XMM1 # Round 7
981 AESENC \TMP3, \XMM2
982 AESENC \TMP3, \XMM3
983 AESENC \TMP3, \XMM4
984 movdqa HashKey_2_k(%rsp), \TMP5
985 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
986 movaps 0x80(%arg1), \TMP3
987 AESENC \TMP3, \XMM1 # Round 8
988 AESENC \TMP3, \XMM2
989 AESENC \TMP3, \XMM3
990 AESENC \TMP3, \XMM4
991 pxor \TMP1, \TMP4
992# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
993 pxor \XMM7, \XMM5
994 pxor \TMP2, \TMP6
995
996 # Multiply XMM8 * HashKey
997 # XMM8 and TMP5 hold the values for the two operands
998
999 movdqa \XMM8, \TMP1
1000 pshufd $78, \XMM8, \TMP2
1001 pxor \XMM8, \TMP2
1002 movdqa HashKey(%rsp), \TMP5
1003 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1004 movaps 0x90(%arg1), \TMP3
1005 AESENC \TMP3, \XMM1 # Round 9
1006 AESENC \TMP3, \XMM2
1007 AESENC \TMP3, \XMM3
1008 AESENC \TMP3, \XMM4
1009 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1010 lea 0xa0(%arg1),%r10
1011 mov keysize,%eax
1012 shr $2,%eax # 128->4, 192->6, 256->8
1013 sub $4,%eax # 128->0, 192->2, 256->4
1014 jz aes_loop_par_dec_done
1015
1016aes_loop_par_dec:
1017 MOVADQ (%r10),\TMP3
1018.irpc index, 1234
1019 AESENC \TMP3, %xmm\index
1020.endr
1021 add $16,%r10
1022 sub $1,%eax
1023 jnz aes_loop_par_dec
1024
1025aes_loop_par_dec_done:
1026 MOVADQ (%r10), \TMP3
1027 AESENCLAST \TMP3, \XMM1 # last round
1028 AESENCLAST \TMP3, \XMM2
1029 AESENCLAST \TMP3, \XMM3
1030 AESENCLAST \TMP3, \XMM4
1031 movdqa HashKey_k(%rsp), \TMP5
1032 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1033 movdqu (%arg3,%r11,1), \TMP3
1034 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1035 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1036 movdqa \TMP3, \XMM1
1037 movdqu 16(%arg3,%r11,1), \TMP3
1038 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1039 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1040 movdqa \TMP3, \XMM2
1041 movdqu 32(%arg3,%r11,1), \TMP3
1042 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1043 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1044 movdqa \TMP3, \XMM3
1045 movdqu 48(%arg3,%r11,1), \TMP3
1046 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1047 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1048 movdqa \TMP3, \XMM4
1049 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1050 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1051 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1052 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1053
1054 pxor \TMP4, \TMP1
1055 pxor \XMM8, \XMM5
1056 pxor \TMP6, \TMP2
1057 pxor \TMP1, \TMP2
1058 pxor \XMM5, \TMP2
1059 movdqa \TMP2, \TMP3
1060 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1061 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1062 pxor \TMP3, \XMM5
1063 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1064
1065 # first phase of reduction
1066
1067 movdqa \XMM5, \TMP2
1068 movdqa \XMM5, \TMP3
1069 movdqa \XMM5, \TMP4
1070# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1071 pslld $31, \TMP2 # packed left shift << 31
 1072 pslld $30, \TMP3 # packed left shift << 30
 1073 pslld $25, \TMP4 # packed left shift << 25
1074 pxor \TMP3, \TMP2 # xor the shifted versions
1075 pxor \TMP4, \TMP2
1076 movdqa \TMP2, \TMP5
1077 psrldq $4, \TMP5 # right shift T5 1 DW
1078 pslldq $12, \TMP2 # left shift T2 3 DWs
1079 pxor \TMP2, \XMM5
1080
1081 # second phase of reduction
1082
1083 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1084 movdqa \XMM5,\TMP3
1085 movdqa \XMM5,\TMP4
 1086 psrld $1, \TMP2 # packed right shift >>1
 1087 psrld $2, \TMP3 # packed right shift >>2
 1088 psrld $7, \TMP4 # packed right shift >>7
1089 pxor \TMP3,\TMP2 # xor the shifted versions
1090 pxor \TMP4,\TMP2
1091 pxor \TMP5, \TMP2
1092 pxor \TMP2, \XMM5
 1093 pxor \TMP1, \XMM5 # result is in XMM5
1094
1095 pxor \XMM5, \XMM1
1096.endm
1097
1098/* GHASH the last 4 ciphertext blocks. */
1099.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1100TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1101
1102 # Multiply TMP6 * HashKey (using Karatsuba)
1103
1104 movdqa \XMM1, \TMP6
1105 pshufd $78, \XMM1, \TMP2
1106 pxor \XMM1, \TMP2
1107 movdqa HashKey_4(%rsp), \TMP5
1108 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1109 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1110 movdqa HashKey_4_k(%rsp), \TMP4
1111 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1112 movdqa \XMM1, \XMMDst
1113 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1114
1115 # Multiply TMP1 * HashKey (using Karatsuba)
1116
1117 movdqa \XMM2, \TMP1
1118 pshufd $78, \XMM2, \TMP2
1119 pxor \XMM2, \TMP2
1120 movdqa HashKey_3(%rsp), \TMP5
1121 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1122 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1123 movdqa HashKey_3_k(%rsp), \TMP4
1124 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1125 pxor \TMP1, \TMP6
1126 pxor \XMM2, \XMMDst
1127 pxor \TMP2, \XMM1
1128# results accumulated in TMP6, XMMDst, XMM1
1129
1130 # Multiply TMP1 * HashKey (using Karatsuba)
1131
1132 movdqa \XMM3, \TMP1
1133 pshufd $78, \XMM3, \TMP2
1134 pxor \XMM3, \TMP2
1135 movdqa HashKey_2(%rsp), \TMP5
1136 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1137 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1138 movdqa HashKey_2_k(%rsp), \TMP4
1139 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1140 pxor \TMP1, \TMP6
1141 pxor \XMM3, \XMMDst
1142 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1143
1144 # Multiply TMP1 * HashKey (using Karatsuba)
1145 movdqa \XMM4, \TMP1
1146 pshufd $78, \XMM4, \TMP2
1147 pxor \XMM4, \TMP2
1148 movdqa HashKey(%rsp), \TMP5
1149 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1150 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1151 movdqa HashKey_k(%rsp), \TMP4
1152 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1153 pxor \TMP1, \TMP6
1154 pxor \XMM4, \XMMDst
1155 pxor \XMM1, \TMP2
1156 pxor \TMP6, \TMP2
1157 pxor \XMMDst, \TMP2
1158 # middle section of the temp results combined as in karatsuba algorithm
1159 movdqa \TMP2, \TMP4
1160 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1161 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1162 pxor \TMP4, \XMMDst
1163 pxor \TMP2, \TMP6
1164# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1165 # first phase of the reduction
1166 movdqa \XMMDst, \TMP2
1167 movdqa \XMMDst, \TMP3
1168 movdqa \XMMDst, \TMP4
1169# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
 1170 pslld $31, \TMP2 # packed left shifting << 31
 1171 pslld $30, \TMP3 # packed left shifting << 30
 1172 pslld $25, \TMP4 # packed left shifting << 25
1173 pxor \TMP3, \TMP2 # xor the shifted versions
1174 pxor \TMP4, \TMP2
1175 movdqa \TMP2, \TMP7
1176 psrldq $4, \TMP7 # right shift TMP7 1 DW
1177 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1178 pxor \TMP2, \XMMDst
1179
1180 # second phase of the reduction
1181 movdqa \XMMDst, \TMP2
1182 # make 3 copies of XMMDst for doing 3 shift operations
1183 movdqa \XMMDst, \TMP3
1184 movdqa \XMMDst, \TMP4
 1185 psrld $1, \TMP2 # packed right shift >> 1
 1186 psrld $2, \TMP3 # packed right shift >> 2
 1187 psrld $7, \TMP4 # packed right shift >> 7
1188 pxor \TMP3, \TMP2 # xor the shifted versions
1189 pxor \TMP4, \TMP2
1190 pxor \TMP7, \TMP2
1191 pxor \TMP2, \XMMDst
1192 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1193.endm
1194
0bd82f5f 1195
1196/* Encryption of a single block
1197* uses eax & r10
1198*/
0bd82f5f 1199
e31ac32d 1200.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
0bd82f5f 1201
1202 pxor (%arg1), \XMM0
1203 mov keysize,%eax
1204 shr $2,%eax # 128->4, 192->6, 256->8
1205 add $5,%eax # 128->9, 192->11, 256->13
1206 lea 16(%arg1), %r10 # get first expanded key address
1207
1208_esb_loop_\@:
1209 MOVADQ (%r10),\TMP1
1210 AESENC \TMP1,\XMM0
1211 add $16,%r10
1212 sub $1,%eax
1213 jnz _esb_loop_\@
1214
1215 MOVADQ (%r10),\TMP1
1216 AESENCLAST \TMP1,\XMM0
1217.endm
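/*
 * Round-count arithmetic, worked through for clarity: keysize holds the key
 * length in bytes (16/24/32 for AES-128/192/256), so the shr/add sequence in
 * the macro above gives 16>>2+5 = 9, 24>>2+5 = 11 and 32>>2+5 = 13 AESENC
 * iterations, each followed by one AESENCLAST (10/12/14 rounds in total).
 */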
1218/*****************************************************************************
1219* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1220* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1221* const u8 *in, // Ciphertext input
1222* u64 plaintext_len, // Length of data in bytes for decryption.
1223* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1224* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1225* // concatenated with 0x00000001. 16-byte aligned pointer.
1226* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1227* const u8 *aad, // Additional Authentication Data (AAD)
1228* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1229* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1230* // given authentication tag and only return the plaintext if they match.
1231* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1232* // (most likely), 12 or 8.
1233*
1234* Assumptions:
1235*
1236* keys:
1237* keys are pre-expanded and aligned to 16 bytes. we are using the first
1238* set of 11 keys in the data structure void *aes_ctx
1239*
1240* iv:
1241* 0 1 2 3
1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | Salt (From the SA) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | Initialization Vector |
1247* | (This is the sequence number from IPSec header) |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x1 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252*
1253*
1254* AAD:
1255* AAD padded to 128 bits with 0
1256* for example, assume AAD is a u32 vector
1257*
1258* if AAD is 8 bytes:
1259* AAD[3] = {A0, A1};
1260* padded AAD in xmm register = {A1 A0 0 0}
1261*
1262* 0 1 2 3
1263* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1264* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1265* | SPI (A1) |
1266* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1267* | 32-bit Sequence Number (A0) |
1268* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1269* | 0x0 |
1270* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1271*
1272* AAD Format with 32-bit Sequence Number
1273*
1274* if AAD is 12 bytes:
1275* AAD[3] = {A0, A1, A2};
1276* padded AAD in xmm register = {A2 A1 A0 0}
1277*
1278* 0 1 2 3
1279* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1280* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1281* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1282* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1283* | SPI (A2) |
1284* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1285* | 64-bit Extended Sequence Number {A1,A0} |
1286* | |
1287* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1288* | 0x0 |
1289* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1290*
1291* AAD Format with 64-bit Extended Sequence Number
1292*
1293* aadLen:
1294* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1295* The code supports 16 too but for other sizes, the code will fail.
1296*
1297* TLen:
1298* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1299* For other sizes, the code will fail.
1300*
1301* poly = x^128 + x^127 + x^126 + x^121 + 1
1302*
1303*****************************************************************************/
1304ENTRY(aesni_gcm_dec)
1305 push %r12
1306 push %r13
1307 push %r14
1308 mov %rsp, %r14
1309/*
1310* states of %xmm registers %xmm6:%xmm15 not saved
1311* all %xmm registers are clobbered
1312*/
1313 sub $VARIABLE_OFFSET, %rsp
1314 and $~63, %rsp # align rsp to 64 bytes
1315 mov %arg6, %r12
1316 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1317 movdqa SHUF_MASK(%rip), %xmm2
1318 PSHUFB_XMM %xmm2, %xmm13
1319
1320
1321# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1322
1323 movdqa %xmm13, %xmm2
1324 psllq $1, %xmm13
1325 psrlq $63, %xmm2
1326 movdqa %xmm2, %xmm1
1327 pslldq $8, %xmm2
1328 psrldq $8, %xmm1
1329 por %xmm2, %xmm13
1330
1331 # Reduction
1332
1333 pshufd $0x24, %xmm1, %xmm2
1334 pcmpeqd TWOONE(%rip), %xmm2
1335 pand POLY(%rip), %xmm2
1336 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1337
1338
1339 # Decrypt first few blocks
1340
1341 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1342 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1343 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1344 mov %r13, %r12
1345 and $(3<<4), %r12
1346 jz _initial_num_blocks_is_0_decrypt
1347 cmp $(2<<4), %r12
1348 jb _initial_num_blocks_is_1_decrypt
1349 je _initial_num_blocks_is_2_decrypt
1350_initial_num_blocks_is_3_decrypt:
3c097b80 1351 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1352%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1353 sub $48, %r13
1354 jmp _initial_blocks_decrypted
1355_initial_num_blocks_is_2_decrypt:
3c097b80 1356 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1357%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1358 sub $32, %r13
1359 jmp _initial_blocks_decrypted
1360_initial_num_blocks_is_1_decrypt:
3c097b80 1361 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1362%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1363 sub $16, %r13
1364 jmp _initial_blocks_decrypted
1365_initial_num_blocks_is_0_decrypt:
3c097b80 1366 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1367%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1368_initial_blocks_decrypted:
1369 cmp $0, %r13
1370 je _zero_cipher_left_decrypt
1371 sub $64, %r13
1372 je _four_cipher_left_decrypt
1373_decrypt_by_4:
3c097b80 1374 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1375%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1376 add $64, %r11
1377 sub $64, %r13
1378 jne _decrypt_by_4
1379_four_cipher_left_decrypt:
1380 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1381%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1382_zero_cipher_left_decrypt:
1383 mov %arg4, %r13
1384 and $15, %r13 # %r13 = arg4 (mod 16)
1385 je _multiple_of_16_bytes_decrypt
1386
0d2eb44f 1387 # Handle the last <16 byte block separately
1388
1389 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1390 movdqa SHUF_MASK(%rip), %xmm10
1391 PSHUFB_XMM %xmm10, %xmm0
1392
1393 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1394 sub $16, %r11
1395 add %r13, %r11
0d2eb44f 1396 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1397 lea SHIFT_MASK+16(%rip), %r12
1398 sub %r13, %r12
1399# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1400# (%r13 is the number of bytes in plaintext mod 16)
1401 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
 1402 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1403
1404 movdqa %xmm1, %xmm2
1405 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1406 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1407 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1408 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1409 pand %xmm1, %xmm2
1410 movdqa SHUF_MASK(%rip), %xmm10
1411 PSHUFB_XMM %xmm10 ,%xmm2
1412
1413 pxor %xmm2, %xmm8
1414 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1415 # GHASH computation for the last <16 byte block
1416 sub %r13, %r11
1417 add $16, %r11
1418
1419 # output %r13 bytes
3c097b80 1420 MOVQ_R64_XMM %xmm0, %rax
1421 cmp $8, %r13
1422 jle _less_than_8_bytes_left_decrypt
1423 mov %rax, (%arg2 , %r11, 1)
1424 add $8, %r11
1425 psrldq $8, %xmm0
3c097b80 1426 MOVQ_R64_XMM %xmm0, %rax
1427 sub $8, %r13
1428_less_than_8_bytes_left_decrypt:
1429 mov %al, (%arg2, %r11, 1)
1430 add $1, %r11
1431 shr $8, %rax
1432 sub $1, %r13
1433 jne _less_than_8_bytes_left_decrypt
1434_multiple_of_16_bytes_decrypt:
 1435 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1436 shl $3, %r12 # convert into number of bits
1437 movd %r12d, %xmm15 # len(A) in %xmm15
 1438 shl $3, %arg4 # len(C) in bits (*8)
3c097b80 1439 MOVQ_R64_XMM %arg4, %xmm1
1440 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1441 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1442 pxor %xmm15, %xmm8
1443 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1444 # final GHASH computation
1445 movdqa SHUF_MASK(%rip), %xmm10
1446 PSHUFB_XMM %xmm10, %xmm8
1447
1448 mov %arg5, %rax # %rax = *Y0
1449 movdqu (%rax), %xmm0 # %xmm0 = Y0
1450 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1451 pxor %xmm8, %xmm0
1452_return_T_decrypt:
1453 mov arg9, %r10 # %r10 = authTag
1454 mov arg10, %r11 # %r11 = auth_tag_len
1455 cmp $16, %r11
1456 je _T_16_decrypt
1457 cmp $12, %r11
1458 je _T_12_decrypt
1459_T_8_decrypt:
3c097b80 1460 MOVQ_R64_XMM %xmm0, %rax
1461 mov %rax, (%r10)
1462 jmp _return_T_done_decrypt
1463_T_12_decrypt:
3c097b80 1464 MOVQ_R64_XMM %xmm0, %rax
1465 mov %rax, (%r10)
1466 psrldq $8, %xmm0
1467 movd %xmm0, %eax
1468 mov %eax, 8(%r10)
1469 jmp _return_T_done_decrypt
1470_T_16_decrypt:
1471 movdqu %xmm0, (%r10)
1472_return_T_done_decrypt:
1473 mov %r14, %rsp
1474 pop %r14
1475 pop %r13
1476 pop %r12
1477 ret
8309b745 1478ENDPROC(aesni_gcm_dec)
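/*
 * Illustrative C-level usage, derived only from the prototype documented
 * above; the helper and its name are hypothetical (the real callers live in
 * arch/x86/crypto/aesni-intel_glue.c).  It shows the expected argument
 * layout: a pre-expanded, 16-byte-aligned key schedule, the pre-counter
 * block iv = salt || IV || 0x00000001, and an 8/12/16-byte tag that the
 * caller compares itself.
 *
 *	static int example_rfc4106_decrypt(void *aes_ctx, u8 *out, const u8 *in,
 *					   u64 len, u8 *iv, u8 *hash_subkey,
 *					   const u8 *aad, u64 aad_len,
 *					   const u8 *expected_tag, u64 tag_len)
 *	{
 *		u8 tag[16];
 *
 *		aesni_gcm_dec(aes_ctx, out, in, len, iv, hash_subkey,
 *			      aad, aad_len, tag, tag_len);
 *		// constant-time tag comparison is the caller's responsibility
 *		return crypto_memneq(tag, expected_tag, tag_len) ? -EBADMSG : 0;
 *	}
 */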
1479
1480
1481/*****************************************************************************
1482* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1483* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1484* const u8 *in, // Plaintext input
1485* u64 plaintext_len, // Length of data in bytes for encryption.
1486* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1487* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1488* // concatenated with 0x00000001. 16-byte aligned pointer.
1489* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1490* const u8 *aad, // Additional Authentication Data (AAD)
1491* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1492* u8 *auth_tag, // Authenticated Tag output.
1493* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1494* // 12 or 8.
1495*
1496* Assumptions:
1497*
1498* keys:
1499* keys are pre-expanded and aligned to 16 bytes. we are using the
1500* first set of 11 keys in the data structure void *aes_ctx
1501*
1502*
1503* iv:
1504* 0 1 2 3
1505* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | Salt (From the SA) |
1508* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1509* | Initialization Vector |
1510* | (This is the sequence number from IPSec header) |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512* | 0x1 |
1513* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1514*
1515*
1516*
1517* AAD:
1518* AAD padded to 128 bits with 0
1519* for example, assume AAD is a u32 vector
1520*
1521* if AAD is 8 bytes:
1522* AAD[3] = {A0, A1};
1523* padded AAD in xmm register = {A1 A0 0 0}
1524*
1525* 0 1 2 3
1526* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1527* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1528* | SPI (A1) |
1529* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1530* | 32-bit Sequence Number (A0) |
1531* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1532* | 0x0 |
1533* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1534*
1535* AAD Format with 32-bit Sequence Number
1536*
1537* if AAD is 12 bytes:
1538* AAD[3] = {A0, A1, A2};
1539* padded AAD in xmm register = {A2 A1 A0 0}
1540*
1541* 0 1 2 3
1542* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1543* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544* | SPI (A2) |
1545* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1546* | 64-bit Extended Sequence Number {A1,A0} |
1547* | |
1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549* | 0x0 |
1550* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1551*
1552* AAD Format with 64-bit Extended Sequence Number
1553*
1554* aadLen:
1555* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1556* The code supports 16 too but for other sizes, the code will fail.
1557*
1558* TLen:
1559* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1560* For other sizes, the code will fail.
1561*
1562* poly = x^128 + x^127 + x^126 + x^121 + 1
1563***************************************************************************/
1564ENTRY(aesni_gcm_enc)
1565 push %r12
1566 push %r13
1567 push %r14
1568 mov %rsp, %r14
1569#
1570# states of %xmm registers %xmm6:%xmm15 not saved
1571# all %xmm registers are clobbered
1572#
1573 sub $VARIABLE_OFFSET, %rsp
1574 and $~63, %rsp
1575 mov %arg6, %r12
1576 movdqu (%r12), %xmm13
1577 movdqa SHUF_MASK(%rip), %xmm2
1578 PSHUFB_XMM %xmm2, %xmm13
1579
1580
1581# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1582
1583 movdqa %xmm13, %xmm2
1584 psllq $1, %xmm13
1585 psrlq $63, %xmm2
1586 movdqa %xmm2, %xmm1
1587 pslldq $8, %xmm2
1588 psrldq $8, %xmm1
1589 por %xmm2, %xmm13
1590
1591 # reduce HashKey<<1
1592
1593 pshufd $0x24, %xmm1, %xmm2
1594 pcmpeqd TWOONE(%rip), %xmm2
1595 pand POLY(%rip), %xmm2
1596 pxor %xmm2, %xmm13
1597 movdqa %xmm13, HashKey(%rsp)
 1598 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1599 and $-16, %r13
1600 mov %r13, %r12
1601
1602 # Encrypt first few blocks
1603
1604 and $(3<<4), %r12
1605 jz _initial_num_blocks_is_0_encrypt
1606 cmp $(2<<4), %r12
1607 jb _initial_num_blocks_is_1_encrypt
1608 je _initial_num_blocks_is_2_encrypt
1609_initial_num_blocks_is_3_encrypt:
3c097b80 1610 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1611%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1612 sub $48, %r13
1613 jmp _initial_blocks_encrypted
1614_initial_num_blocks_is_2_encrypt:
3c097b80 1615 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1616%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1617 sub $32, %r13
1618 jmp _initial_blocks_encrypted
1619_initial_num_blocks_is_1_encrypt:
3c097b80 1620 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1621%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1622 sub $16, %r13
1623 jmp _initial_blocks_encrypted
1624_initial_num_blocks_is_0_encrypt:
3c097b80 1625 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1626%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1627_initial_blocks_encrypted:
1628
1629 # Main loop - Encrypt remaining blocks
1630
1631 cmp $0, %r13
1632 je _zero_cipher_left_encrypt
1633 sub $64, %r13
1634 je _four_cipher_left_encrypt
1635_encrypt_by_4_encrypt:
3c097b80 1636 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1637%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1638 add $64, %r11
1639 sub $64, %r13
1640 jne _encrypt_by_4_encrypt
1641_four_cipher_left_encrypt:
1642 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1643%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1644_zero_cipher_left_encrypt:
1645 mov %arg4, %r13
1646 and $15, %r13 # %r13 = arg4 (mod 16)
1647 je _multiple_of_16_bytes_encrypt
1648
0d2eb44f 1649 # Handle the last <16 Byte block separately
0bd82f5f 1650 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1651 movdqa SHUF_MASK(%rip), %xmm10
1652 PSHUFB_XMM %xmm10, %xmm0
1653
60af520c 1654
1655 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1656 sub $16, %r11
1657 add %r13, %r11
1658 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1659 lea SHIFT_MASK+16(%rip), %r12
1660 sub %r13, %r12
1661 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1662 # (%r13 is the number of bytes in plaintext mod 16)
1663 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
3c097b80 1664 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
0bd82f5f
TS
1665 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1666 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1667 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1668 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
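	# Net effect for the trailing partial block (C-style sketch, comment
	# only; rem = plaintext length mod 16, EKYn = Encrypt(K, Yn)):
	#
	#	for (i = 0; i < rem; i++)
	#		C_last[i] = P_last[i] ^ EKYn[i];
	#
	# The zero-padded C_last block is what gets folded into the GHASH
	# accumulator just below.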
3c097b80
TS
1669 movdqa SHUF_MASK(%rip), %xmm10
1670 PSHUFB_XMM %xmm10,%xmm0
0bd82f5f 1671
0bd82f5f
TS
1672 pxor %xmm0, %xmm8
1673 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1674 # GHASH computation for the last <16 byte block
1675 sub %r13, %r11
1676 add $16, %r11
60af520c
TS
1677
1678 movdqa SHUF_MASK(%rip), %xmm10
1679 PSHUFB_XMM %xmm10, %xmm0
3c097b80 1680
0bd82f5f
TS
1681 # shuffle xmm0 back to output as ciphertext
1682
1683 # Output %r13 bytes
3c097b80 1684 MOVQ_R64_XMM %xmm0, %rax
0bd82f5f
TS
1685 cmp $8, %r13
1686 jle _less_than_8_bytes_left_encrypt
1687 mov %rax, (%arg2 , %r11, 1)
1688 add $8, %r11
1689 psrldq $8, %xmm0
3c097b80 1690 MOVQ_R64_XMM %xmm0, %rax
0bd82f5f
TS
1691 sub $8, %r13
1692_less_than_8_bytes_left_encrypt:
1693 mov %al, (%arg2, %r11, 1)
1694 add $1, %r11
1695 shr $8, %rax
1696 sub $1, %r13
1697 jne _less_than_8_bytes_left_encrypt
1698_multiple_of_16_bytes_encrypt:
1699 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1700 shl $3, %r12
1701 movd %r12d, %xmm15 # len(A) in %xmm15
1702 shl $3, %arg4 # len(C) in bits (*8)
3c097b80 1703 MOVQ_R64_XMM %arg4, %xmm1
0bd82f5f
TS
1704 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1705 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1706 pxor %xmm15, %xmm8
1707 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1708 # final GHASH computation
3c097b80
TS
1709 movdqa SHUF_MASK(%rip), %xmm10
1710 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
0bd82f5f 1711
0bd82f5f
TS
1712 mov %arg5, %rax # %rax = *Y0
1713 movdqu (%rax), %xmm0 # %xmm0 = Y0
1714 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1715 pxor %xmm8, %xmm0
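	# %xmm0 now holds the untruncated tag
	#	T = Encrypt(K, Y0) XOR GHASH(H, AAD || C || len(A)||len(C))
	# and the code below stores its first 8, 12 or 16 bytes, as requested
	# by auth_tag_len.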
1716_return_T_encrypt:
1717 mov arg9, %r10 # %r10 = authTag
1718 mov arg10, %r11 # %r11 = auth_tag_len
1719 cmp $16, %r11
1720 je _T_16_encrypt
1721 cmp $12, %r11
1722 je _T_12_encrypt
1723_T_8_encrypt:
3c097b80 1724 MOVQ_R64_XMM %xmm0, %rax
0bd82f5f
TS
1725 mov %rax, (%r10)
1726 jmp _return_T_done_encrypt
1727_T_12_encrypt:
3c097b80 1728 MOVQ_R64_XMM %xmm0, %rax
0bd82f5f
TS
1729 mov %rax, (%r10)
1730 psrldq $8, %xmm0
1731 movd %xmm0, %eax
1732 mov %eax, 8(%r10)
1733 jmp _return_T_done_encrypt
1734_T_16_encrypt:
1735 movdqu %xmm0, (%r10)
1736_return_T_done_encrypt:
1737 mov %r14, %rsp
1738 pop %r14
1739 pop %r13
1740 pop %r12
1741 ret
8309b745 1742ENDPROC(aesni_gcm_enc)
3c097b80 1743
559ad0ff 1744#endif
0bd82f5f
TS
1745
1746
8309b745 1747.align 4
54b6a1bd
HY
1748_key_expansion_128:
1749_key_expansion_256a:
1750 pshufd $0b11111111, %xmm1, %xmm1
1751 shufps $0b00010000, %xmm0, %xmm4
1752 pxor %xmm4, %xmm0
1753 shufps $0b10001100, %xmm0, %xmm4
1754 pxor %xmm4, %xmm0
1755 pxor %xmm1, %xmm0
0d258efb
MK
1756 movaps %xmm0, (TKEYP)
1757 add $0x10, TKEYP
54b6a1bd 1758 ret
8309b745
JK
1759ENDPROC(_key_expansion_128)
1760ENDPROC(_key_expansion_256a)
54b6a1bd 1761
0d258efb 1762.align 4
54b6a1bd
HY
1763_key_expansion_192a:
1764 pshufd $0b01010101, %xmm1, %xmm1
1765 shufps $0b00010000, %xmm0, %xmm4
1766 pxor %xmm4, %xmm0
1767 shufps $0b10001100, %xmm0, %xmm4
1768 pxor %xmm4, %xmm0
1769 pxor %xmm1, %xmm0
1770
1771 movaps %xmm2, %xmm5
1772 movaps %xmm2, %xmm6
1773 pslldq $4, %xmm5
1774 pshufd $0b11111111, %xmm0, %xmm3
1775 pxor %xmm3, %xmm2
1776 pxor %xmm5, %xmm2
1777
1778 movaps %xmm0, %xmm1
1779 shufps $0b01000100, %xmm0, %xmm6
0d258efb 1780 movaps %xmm6, (TKEYP)
54b6a1bd 1781 shufps $0b01001110, %xmm2, %xmm1
0d258efb
MK
1782 movaps %xmm1, 0x10(TKEYP)
1783 add $0x20, TKEYP
54b6a1bd 1784 ret
8309b745 1785ENDPROC(_key_expansion_192a)
54b6a1bd 1786
0d258efb 1787.align 4
54b6a1bd
HY
1788_key_expansion_192b:
1789 pshufd $0b01010101, %xmm1, %xmm1
1790 shufps $0b00010000, %xmm0, %xmm4
1791 pxor %xmm4, %xmm0
1792 shufps $0b10001100, %xmm0, %xmm4
1793 pxor %xmm4, %xmm0
1794 pxor %xmm1, %xmm0
1795
1796 movaps %xmm2, %xmm5
1797 pslldq $4, %xmm5
1798 pshufd $0b11111111, %xmm0, %xmm3
1799 pxor %xmm3, %xmm2
1800 pxor %xmm5, %xmm2
1801
0d258efb
MK
1802 movaps %xmm0, (TKEYP)
1803 add $0x10, TKEYP
54b6a1bd 1804 ret
8309b745 1805ENDPROC(_key_expansion_192b)
54b6a1bd 1806
0d258efb 1807.align 4
54b6a1bd
HY
1808_key_expansion_256b:
1809 pshufd $0b10101010, %xmm1, %xmm1
1810 shufps $0b00010000, %xmm2, %xmm4
1811 pxor %xmm4, %xmm2
1812 shufps $0b10001100, %xmm2, %xmm4
1813 pxor %xmm4, %xmm2
1814 pxor %xmm1, %xmm2
0d258efb
MK
1815 movaps %xmm2, (TKEYP)
1816 add $0x10, TKEYP
54b6a1bd 1817 ret
8309b745 1818ENDPROC(_key_expansion_256b)
54b6a1bd
HY
1819
1820/*
1821 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1822 * unsigned int key_len)
1823 */
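/*
 * Layout of the ctx this file assumes (a sketch; the authoritative
 * definition is struct crypto_aes_ctx in the kernel headers):
 *
 *	offset 0x000: expanded encryption round keys
 *	offset 0x0f0: expanded decryption round keys	(KEYP + 240)
 *	offset 0x1e0: key length in bytes, 16/24/32	(480(KEYP))
 */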
1824ENTRY(aesni_set_key)
8691ccd7 1825 FRAME_BEGIN
0d258efb
MK
1826#ifndef __x86_64__
1827 pushl KEYP
8691ccd7
JP
1828 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1829 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1830 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
0d258efb
MK
1831#endif
1832 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1833 movaps %xmm0, (KEYP)
1834 lea 0x10(KEYP), TKEYP # key addr
1835 movl %edx, 480(KEYP)
54b6a1bd
HY
1836 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1837 cmp $24, %dl
1838 jb .Lenc_key128
1839 je .Lenc_key192
0d258efb
MK
1840 movups 0x10(UKEYP), %xmm2 # other user key
1841 movaps %xmm2, (TKEYP)
1842 add $0x10, TKEYP
b369e521 1843 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
54b6a1bd 1844 call _key_expansion_256a
b369e521 1845 AESKEYGENASSIST 0x1 %xmm0 %xmm1
54b6a1bd 1846 call _key_expansion_256b
b369e521 1847 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
54b6a1bd 1848 call _key_expansion_256a
b369e521 1849 AESKEYGENASSIST 0x2 %xmm0 %xmm1
54b6a1bd 1850 call _key_expansion_256b
b369e521 1851 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
54b6a1bd 1852 call _key_expansion_256a
b369e521 1853 AESKEYGENASSIST 0x4 %xmm0 %xmm1
54b6a1bd 1854 call _key_expansion_256b
b369e521 1855 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
54b6a1bd 1856 call _key_expansion_256a
b369e521 1857 AESKEYGENASSIST 0x8 %xmm0 %xmm1
54b6a1bd 1858 call _key_expansion_256b
b369e521 1859 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
54b6a1bd 1860 call _key_expansion_256a
b369e521 1861 AESKEYGENASSIST 0x10 %xmm0 %xmm1
54b6a1bd 1862 call _key_expansion_256b
b369e521 1863 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
54b6a1bd 1864 call _key_expansion_256a
b369e521 1865 AESKEYGENASSIST 0x20 %xmm0 %xmm1
54b6a1bd 1866 call _key_expansion_256b
b369e521 1867 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
54b6a1bd
HY
1868 call _key_expansion_256a
1869 jmp .Ldec_key
1870.Lenc_key192:
0d258efb 1871 movq 0x10(UKEYP), %xmm2 # other user key
b369e521 1872 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
54b6a1bd 1873 call _key_expansion_192a
b369e521 1874 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
54b6a1bd 1875 call _key_expansion_192b
b369e521 1876 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
54b6a1bd 1877 call _key_expansion_192a
b369e521 1878 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
54b6a1bd 1879 call _key_expansion_192b
b369e521 1880 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
54b6a1bd 1881 call _key_expansion_192a
b369e521 1882 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
54b6a1bd 1883 call _key_expansion_192b
b369e521 1884 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
54b6a1bd 1885 call _key_expansion_192a
b369e521 1886 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
54b6a1bd
HY
1887 call _key_expansion_192b
1888 jmp .Ldec_key
1889.Lenc_key128:
b369e521 1890 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
54b6a1bd 1891 call _key_expansion_128
b369e521 1892 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
54b6a1bd 1893 call _key_expansion_128
b369e521 1894 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
54b6a1bd 1895 call _key_expansion_128
b369e521 1896 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
54b6a1bd 1897 call _key_expansion_128
b369e521 1898 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
54b6a1bd 1899 call _key_expansion_128
b369e521 1900 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
54b6a1bd 1901 call _key_expansion_128
b369e521 1902 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
54b6a1bd 1903 call _key_expansion_128
b369e521 1904 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
54b6a1bd 1905 call _key_expansion_128
b369e521 1906 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
54b6a1bd 1907 call _key_expansion_128
b369e521 1908 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
54b6a1bd
HY
1909 call _key_expansion_128
1910.Ldec_key:
0d258efb
MK
1911 sub $0x10, TKEYP
1912 movaps (KEYP), %xmm0
1913 movaps (TKEYP), %xmm1
1914 movaps %xmm0, 240(TKEYP)
1915 movaps %xmm1, 240(KEYP)
1916 add $0x10, KEYP
1917 lea 240-16(TKEYP), UKEYP
54b6a1bd
HY
1918.align 4
1919.Ldec_key_loop:
0d258efb 1920 movaps (KEYP), %xmm0
b369e521 1921 AESIMC %xmm0 %xmm1
0d258efb
MK
1922 movaps %xmm1, (UKEYP)
1923 add $0x10, KEYP
1924 sub $0x10, UKEYP
1925 cmp TKEYP, KEYP
54b6a1bd 1926 jb .Ldec_key_loop
0d258efb
MK
1927 xor AREG, AREG
1928#ifndef __x86_64__
1929 popl KEYP
1930#endif
8691ccd7 1931 FRAME_END
54b6a1bd 1932 ret
8309b745 1933ENDPROC(aesni_set_key)
54b6a1bd
HY
1934
1935/*
1936 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1937 */
1938ENTRY(aesni_enc)
8691ccd7 1939 FRAME_BEGIN
0d258efb
MK
1940#ifndef __x86_64__
1941 pushl KEYP
1942 pushl KLEN
8691ccd7
JP
1943 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1944 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1945 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 1946#endif
54b6a1bd
HY
1947 movl 480(KEYP), KLEN # key length
1948 movups (INP), STATE # input
1949 call _aesni_enc1
1950 movups STATE, (OUTP) # output
0d258efb
MK
1951#ifndef __x86_64__
1952 popl KLEN
1953 popl KEYP
1954#endif
8691ccd7 1955 FRAME_END
54b6a1bd 1956 ret
8309b745 1957ENDPROC(aesni_enc)
54b6a1bd
HY
1958
1959/*
1960 * _aesni_enc1: internal ABI
1961 * input:
1962 * KEYP: key struct pointer
1963 * KLEN: key length
1964 * STATE: initial state (input)
1965 * output:
1966 * STATE: final state (output)
1967 * changed:
1968 * KEY
1969 * TKEYP (T1)
1970 */
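/*
 * What the unrolled sequence below computes, as a C-style sketch
 * (rounds = 10, 12 or 14 for 128/192/256-bit keys):
 *
 *	state ^= rk[0];
 *	for (r = 1; r < rounds; r++)
 *		state = AESENC(state, rk[r]);
 *	state = AESENCLAST(state, rk[rounds]);
 *
 * KLEN (16/24/32) only selects which entry point into the unrolled
 * sequence is taken.
 */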
0d258efb 1971.align 4
54b6a1bd
HY
1972_aesni_enc1:
1973 movaps (KEYP), KEY # key
1974 mov KEYP, TKEYP
1975 pxor KEY, STATE # round 0
1976 add $0x30, TKEYP
1977 cmp $24, KLEN
1978 jb .Lenc128
1979 lea 0x20(TKEYP), TKEYP
1980 je .Lenc192
1981 add $0x20, TKEYP
1982 movaps -0x60(TKEYP), KEY
b369e521 1983 AESENC KEY STATE
54b6a1bd 1984 movaps -0x50(TKEYP), KEY
b369e521 1985 AESENC KEY STATE
54b6a1bd
HY
1986.align 4
1987.Lenc192:
1988 movaps -0x40(TKEYP), KEY
b369e521 1989 AESENC KEY STATE
54b6a1bd 1990 movaps -0x30(TKEYP), KEY
b369e521 1991 AESENC KEY STATE
54b6a1bd
HY
1992.align 4
1993.Lenc128:
1994 movaps -0x20(TKEYP), KEY
b369e521 1995 AESENC KEY STATE
54b6a1bd 1996 movaps -0x10(TKEYP), KEY
b369e521 1997 AESENC KEY STATE
54b6a1bd 1998 movaps (TKEYP), KEY
b369e521 1999 AESENC KEY STATE
54b6a1bd 2000 movaps 0x10(TKEYP), KEY
b369e521 2001 AESENC KEY STATE
54b6a1bd 2002 movaps 0x20(TKEYP), KEY
b369e521 2003 AESENC KEY STATE
54b6a1bd 2004 movaps 0x30(TKEYP), KEY
b369e521 2005 AESENC KEY STATE
54b6a1bd 2006 movaps 0x40(TKEYP), KEY
b369e521 2007 AESENC KEY STATE
54b6a1bd 2008 movaps 0x50(TKEYP), KEY
b369e521 2009 AESENC KEY STATE
54b6a1bd 2010 movaps 0x60(TKEYP), KEY
b369e521 2011 AESENC KEY STATE
54b6a1bd 2012 movaps 0x70(TKEYP), KEY
b369e521 2013 AESENCLAST KEY STATE
54b6a1bd 2014 ret
8309b745 2015ENDPROC(_aesni_enc1)
54b6a1bd
HY
2016
2017/*
2018 * _aesni_enc4: internal ABI
2019 * input:
2020 * KEYP: key struct pointer
2021 * KLEN: key length
2022 * STATE1: initial state (input)
2023 * STATE2
2024 * STATE3
2025 * STATE4
2026 * output:
2027 * STATE1: final state (output)
2028 * STATE2
2029 * STATE3
2030 * STATE4
2031 * changed:
2032 * KEY
2033 * TKEYP (T1)
2034 */
0d258efb 2035.align 4
54b6a1bd
HY
2036_aesni_enc4:
2037 movaps (KEYP), KEY # key
2038 mov KEYP, TKEYP
2039 pxor KEY, STATE1 # round 0
2040 pxor KEY, STATE2
2041 pxor KEY, STATE3
2042 pxor KEY, STATE4
2043 add $0x30, TKEYP
2044 cmp $24, KLEN
2045 jb .L4enc128
2046 lea 0x20(TKEYP), TKEYP
2047 je .L4enc192
2048 add $0x20, TKEYP
2049 movaps -0x60(TKEYP), KEY
b369e521
HY
2050 AESENC KEY STATE1
2051 AESENC KEY STATE2
2052 AESENC KEY STATE3
2053 AESENC KEY STATE4
54b6a1bd 2054 movaps -0x50(TKEYP), KEY
b369e521
HY
2055 AESENC KEY STATE1
2056 AESENC KEY STATE2
2057 AESENC KEY STATE3
2058 AESENC KEY STATE4
54b6a1bd
HY
2059#.align 4
2060.L4enc192:
2061 movaps -0x40(TKEYP), KEY
b369e521
HY
2062 AESENC KEY STATE1
2063 AESENC KEY STATE2
2064 AESENC KEY STATE3
2065 AESENC KEY STATE4
54b6a1bd 2066 movaps -0x30(TKEYP), KEY
b369e521
HY
2067 AESENC KEY STATE1
2068 AESENC KEY STATE2
2069 AESENC KEY STATE3
2070 AESENC KEY STATE4
54b6a1bd
HY
2071#.align 4
2072.L4enc128:
2073 movaps -0x20(TKEYP), KEY
b369e521
HY
2074 AESENC KEY STATE1
2075 AESENC KEY STATE2
2076 AESENC KEY STATE3
2077 AESENC KEY STATE4
54b6a1bd 2078 movaps -0x10(TKEYP), KEY
b369e521
HY
2079 AESENC KEY STATE1
2080 AESENC KEY STATE2
2081 AESENC KEY STATE3
2082 AESENC KEY STATE4
54b6a1bd 2083 movaps (TKEYP), KEY
b369e521
HY
2084 AESENC KEY STATE1
2085 AESENC KEY STATE2
2086 AESENC KEY STATE3
2087 AESENC KEY STATE4
54b6a1bd 2088 movaps 0x10(TKEYP), KEY
b369e521
HY
2089 AESENC KEY STATE1
2090 AESENC KEY STATE2
2091 AESENC KEY STATE3
2092 AESENC KEY STATE4
54b6a1bd 2093 movaps 0x20(TKEYP), KEY
b369e521
HY
2094 AESENC KEY STATE1
2095 AESENC KEY STATE2
2096 AESENC KEY STATE3
2097 AESENC KEY STATE4
54b6a1bd 2098 movaps 0x30(TKEYP), KEY
b369e521
HY
2099 AESENC KEY STATE1
2100 AESENC KEY STATE2
2101 AESENC KEY STATE3
2102 AESENC KEY STATE4
54b6a1bd 2103 movaps 0x40(TKEYP), KEY
b369e521
HY
2104 AESENC KEY STATE1
2105 AESENC KEY STATE2
2106 AESENC KEY STATE3
2107 AESENC KEY STATE4
54b6a1bd 2108 movaps 0x50(TKEYP), KEY
b369e521
HY
2109 AESENC KEY STATE1
2110 AESENC KEY STATE2
2111 AESENC KEY STATE3
2112 AESENC KEY STATE4
54b6a1bd 2113 movaps 0x60(TKEYP), KEY
b369e521
HY
2114 AESENC KEY STATE1
2115 AESENC KEY STATE2
2116 AESENC KEY STATE3
2117 AESENC KEY STATE4
54b6a1bd 2118 movaps 0x70(TKEYP), KEY
b369e521
HY
2119 AESENCLAST KEY STATE1 # last round
2120 AESENCLAST KEY STATE2
2121 AESENCLAST KEY STATE3
2122 AESENCLAST KEY STATE4
54b6a1bd 2123 ret
8309b745 2124ENDPROC(_aesni_enc4)
54b6a1bd
HY
2125
2126/*
2127 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2128 */
2129ENTRY(aesni_dec)
8691ccd7 2130 FRAME_BEGIN
0d258efb
MK
2131#ifndef __x86_64__
2132 pushl KEYP
2133 pushl KLEN
8691ccd7
JP
2134 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2135 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2136 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 2137#endif
54b6a1bd
HY
2138 mov 480(KEYP), KLEN # key length
2139 add $240, KEYP
2140 movups (INP), STATE # input
2141 call _aesni_dec1
2142 movups STATE, (OUTP) #output
0d258efb
MK
2143#ifndef __x86_64__
2144 popl KLEN
2145 popl KEYP
2146#endif
8691ccd7 2147 FRAME_END
54b6a1bd 2148 ret
8309b745 2149ENDPROC(aesni_dec)
54b6a1bd
HY
2150
2151/*
2152 * _aesni_dec1: internal ABI
2153 * input:
2154 * KEYP: key struct pointer
2155 * KLEN: key length
2156 * STATE: initial state (input)
2157 * output:
2158 * STATE: final state (output)
2159 * changed:
2160 * KEY
2161 * TKEYP (T1)
2162 */
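/*
 * This is the AES Equivalent Inverse Cipher: AESDEC/AESDECLAST operate on
 * the inverse-MixColumns round keys that aesni_set_key prepared with AESIMC.
 */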
0d258efb 2163.align 4
54b6a1bd
HY
2164_aesni_dec1:
2165 movaps (KEYP), KEY # key
2166 mov KEYP, TKEYP
2167 pxor KEY, STATE # round 0
2168 add $0x30, TKEYP
2169 cmp $24, KLEN
2170 jb .Ldec128
2171 lea 0x20(TKEYP), TKEYP
2172 je .Ldec192
2173 add $0x20, TKEYP
2174 movaps -0x60(TKEYP), KEY
b369e521 2175 AESDEC KEY STATE
54b6a1bd 2176 movaps -0x50(TKEYP), KEY
b369e521 2177 AESDEC KEY STATE
54b6a1bd
HY
2178.align 4
2179.Ldec192:
2180 movaps -0x40(TKEYP), KEY
b369e521 2181 AESDEC KEY STATE
54b6a1bd 2182 movaps -0x30(TKEYP), KEY
b369e521 2183 AESDEC KEY STATE
54b6a1bd
HY
2184.align 4
2185.Ldec128:
2186 movaps -0x20(TKEYP), KEY
b369e521 2187 AESDEC KEY STATE
54b6a1bd 2188 movaps -0x10(TKEYP), KEY
b369e521 2189 AESDEC KEY STATE
54b6a1bd 2190 movaps (TKEYP), KEY
b369e521 2191 AESDEC KEY STATE
54b6a1bd 2192 movaps 0x10(TKEYP), KEY
b369e521 2193 AESDEC KEY STATE
54b6a1bd 2194 movaps 0x20(TKEYP), KEY
b369e521 2195 AESDEC KEY STATE
54b6a1bd 2196 movaps 0x30(TKEYP), KEY
b369e521 2197 AESDEC KEY STATE
54b6a1bd 2198 movaps 0x40(TKEYP), KEY
b369e521 2199 AESDEC KEY STATE
54b6a1bd 2200 movaps 0x50(TKEYP), KEY
b369e521 2201 AESDEC KEY STATE
54b6a1bd 2202 movaps 0x60(TKEYP), KEY
b369e521 2203 AESDEC KEY STATE
54b6a1bd 2204 movaps 0x70(TKEYP), KEY
b369e521 2205 AESDECLAST KEY STATE
54b6a1bd 2206 ret
8309b745 2207ENDPROC(_aesni_dec1)
54b6a1bd
HY
2208
2209/*
2210 * _aesni_dec4: internal ABI
2211 * input:
2212 * KEYP: key struct pointer
2213 * KLEN: key length
2214 * STATE1: initial state (input)
2215 * STATE2
2216 * STATE3
2217 * STATE4
2218 * output:
2219 * STATE1: final state (output)
2220 * STATE2
2221 * STATE3
2222 * STATE4
2223 * changed:
2224 * KEY
2225 * TKEYP (T1)
2226 */
0d258efb 2227.align 4
54b6a1bd
HY
2228_aesni_dec4:
2229 movaps (KEYP), KEY # key
2230 mov KEYP, TKEYP
2231 pxor KEY, STATE1 # round 0
2232 pxor KEY, STATE2
2233 pxor KEY, STATE3
2234 pxor KEY, STATE4
2235 add $0x30, TKEYP
2236 cmp $24, KLEN
2237 jb .L4dec128
2238 lea 0x20(TKEYP), TKEYP
2239 je .L4dec192
2240 add $0x20, TKEYP
2241 movaps -0x60(TKEYP), KEY
b369e521
HY
2242 AESDEC KEY STATE1
2243 AESDEC KEY STATE2
2244 AESDEC KEY STATE3
2245 AESDEC KEY STATE4
54b6a1bd 2246 movaps -0x50(TKEYP), KEY
b369e521
HY
2247 AESDEC KEY STATE1
2248 AESDEC KEY STATE2
2249 AESDEC KEY STATE3
2250 AESDEC KEY STATE4
54b6a1bd
HY
2251.align 4
2252.L4dec192:
2253 movaps -0x40(TKEYP), KEY
b369e521
HY
2254 AESDEC KEY STATE1
2255 AESDEC KEY STATE2
2256 AESDEC KEY STATE3
2257 AESDEC KEY STATE4
54b6a1bd 2258 movaps -0x30(TKEYP), KEY
b369e521
HY
2259 AESDEC KEY STATE1
2260 AESDEC KEY STATE2
2261 AESDEC KEY STATE3
2262 AESDEC KEY STATE4
54b6a1bd
HY
2263.align 4
2264.L4dec128:
2265 movaps -0x20(TKEYP), KEY
b369e521
HY
2266 AESDEC KEY STATE1
2267 AESDEC KEY STATE2
2268 AESDEC KEY STATE3
2269 AESDEC KEY STATE4
54b6a1bd 2270 movaps -0x10(TKEYP), KEY
b369e521
HY
2271 AESDEC KEY STATE1
2272 AESDEC KEY STATE2
2273 AESDEC KEY STATE3
2274 AESDEC KEY STATE4
54b6a1bd 2275 movaps (TKEYP), KEY
b369e521
HY
2276 AESDEC KEY STATE1
2277 AESDEC KEY STATE2
2278 AESDEC KEY STATE3
2279 AESDEC KEY STATE4
54b6a1bd 2280 movaps 0x10(TKEYP), KEY
b369e521
HY
2281 AESDEC KEY STATE1
2282 AESDEC KEY STATE2
2283 AESDEC KEY STATE3
2284 AESDEC KEY STATE4
54b6a1bd 2285 movaps 0x20(TKEYP), KEY
b369e521
HY
2286 AESDEC KEY STATE1
2287 AESDEC KEY STATE2
2288 AESDEC KEY STATE3
2289 AESDEC KEY STATE4
54b6a1bd 2290 movaps 0x30(TKEYP), KEY
b369e521
HY
2291 AESDEC KEY STATE1
2292 AESDEC KEY STATE2
2293 AESDEC KEY STATE3
2294 AESDEC KEY STATE4
54b6a1bd 2295 movaps 0x40(TKEYP), KEY
b369e521
HY
2296 AESDEC KEY STATE1
2297 AESDEC KEY STATE2
2298 AESDEC KEY STATE3
2299 AESDEC KEY STATE4
54b6a1bd 2300 movaps 0x50(TKEYP), KEY
b369e521
HY
2301 AESDEC KEY STATE1
2302 AESDEC KEY STATE2
2303 AESDEC KEY STATE3
2304 AESDEC KEY STATE4
54b6a1bd 2305 movaps 0x60(TKEYP), KEY
b369e521
HY
2306 AESDEC KEY STATE1
2307 AESDEC KEY STATE2
2308 AESDEC KEY STATE3
2309 AESDEC KEY STATE4
54b6a1bd 2310 movaps 0x70(TKEYP), KEY
b369e521
HY
2311 AESDECLAST KEY STATE1 # last round
2312 AESDECLAST KEY STATE2
2313 AESDECLAST KEY STATE3
2314 AESDECLAST KEY STATE4
54b6a1bd 2315 ret
8309b745 2316ENDPROC(_aesni_dec4)
54b6a1bd
HY
2317
2318/*
2319 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2320 * size_t len)
2321 */
2322ENTRY(aesni_ecb_enc)
8691ccd7 2323 FRAME_BEGIN
0d258efb
MK
2324#ifndef __x86_64__
2325 pushl LEN
2326 pushl KEYP
2327 pushl KLEN
8691ccd7
JP
2328 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2329 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2330 movl (FRAME_OFFSET+24)(%esp), INP # src
2331 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2332#endif
54b6a1bd
HY
2333 test LEN, LEN # check length
2334 jz .Lecb_enc_ret
2335 mov 480(KEYP), KLEN
2336 cmp $16, LEN
2337 jb .Lecb_enc_ret
2338 cmp $64, LEN
2339 jb .Lecb_enc_loop1
2340.align 4
2341.Lecb_enc_loop4:
2342 movups (INP), STATE1
2343 movups 0x10(INP), STATE2
2344 movups 0x20(INP), STATE3
2345 movups 0x30(INP), STATE4
2346 call _aesni_enc4
2347 movups STATE1, (OUTP)
2348 movups STATE2, 0x10(OUTP)
2349 movups STATE3, 0x20(OUTP)
2350 movups STATE4, 0x30(OUTP)
2351 sub $64, LEN
2352 add $64, INP
2353 add $64, OUTP
2354 cmp $64, LEN
2355 jge .Lecb_enc_loop4
2356 cmp $16, LEN
2357 jb .Lecb_enc_ret
2358.align 4
2359.Lecb_enc_loop1:
2360 movups (INP), STATE1
2361 call _aesni_enc1
2362 movups STATE1, (OUTP)
2363 sub $16, LEN
2364 add $16, INP
2365 add $16, OUTP
2366 cmp $16, LEN
2367 jge .Lecb_enc_loop1
2368.Lecb_enc_ret:
0d258efb
MK
2369#ifndef __x86_64__
2370 popl KLEN
2371 popl KEYP
2372 popl LEN
2373#endif
8691ccd7 2374 FRAME_END
54b6a1bd 2375 ret
8309b745 2376ENDPROC(aesni_ecb_enc)
54b6a1bd
HY
2377
2378/*
2379 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2380 * size_t len);
2381 */
2382ENTRY(aesni_ecb_dec)
8691ccd7 2383 FRAME_BEGIN
0d258efb
MK
2384#ifndef __x86_64__
2385 pushl LEN
2386 pushl KEYP
2387 pushl KLEN
8691ccd7
JP
2388 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2389 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2390 movl (FRAME_OFFSET+24)(%esp), INP # src
2391 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2392#endif
54b6a1bd
HY
2393 test LEN, LEN
2394 jz .Lecb_dec_ret
2395 mov 480(KEYP), KLEN
2396 add $240, KEYP
2397 cmp $16, LEN
2398 jb .Lecb_dec_ret
2399 cmp $64, LEN
2400 jb .Lecb_dec_loop1
2401.align 4
2402.Lecb_dec_loop4:
2403 movups (INP), STATE1
2404 movups 0x10(INP), STATE2
2405 movups 0x20(INP), STATE3
2406 movups 0x30(INP), STATE4
2407 call _aesni_dec4
2408 movups STATE1, (OUTP)
2409 movups STATE2, 0x10(OUTP)
2410 movups STATE3, 0x20(OUTP)
2411 movups STATE4, 0x30(OUTP)
2412 sub $64, LEN
2413 add $64, INP
2414 add $64, OUTP
2415 cmp $64, LEN
2416 jge .Lecb_dec_loop4
2417 cmp $16, LEN
2418 jb .Lecb_dec_ret
2419.align 4
2420.Lecb_dec_loop1:
2421 movups (INP), STATE1
2422 call _aesni_dec1
2423 movups STATE1, (OUTP)
2424 sub $16, LEN
2425 add $16, INP
2426 add $16, OUTP
2427 cmp $16, LEN
2428 jge .Lecb_dec_loop1
2429.Lecb_dec_ret:
0d258efb
MK
2430#ifndef __x86_64__
2431 popl KLEN
2432 popl KEYP
2433 popl LEN
2434#endif
8691ccd7 2435 FRAME_END
54b6a1bd 2436 ret
8309b745 2437ENDPROC(aesni_ecb_dec)
54b6a1bd
HY
2438
2439/*
2440 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2441 * size_t len, u8 *iv)
2442 */
2443ENTRY(aesni_cbc_enc)
8691ccd7 2444 FRAME_BEGIN
0d258efb
MK
2445#ifndef __x86_64__
2446 pushl IVP
2447 pushl LEN
2448 pushl KEYP
2449 pushl KLEN
8691ccd7
JP
2450 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2451 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2452 movl (FRAME_OFFSET+28)(%esp), INP # src
2453 movl (FRAME_OFFSET+32)(%esp), LEN # len
2454 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2455#endif
54b6a1bd
HY
2456 cmp $16, LEN
2457 jb .Lcbc_enc_ret
2458 mov 480(KEYP), KLEN
2459 movups (IVP), STATE # load iv as initial state
2460.align 4
2461.Lcbc_enc_loop:
2462 movups (INP), IN # load input
2463 pxor IN, STATE
2464 call _aesni_enc1
2465 movups STATE, (OUTP) # store output
2466 sub $16, LEN
2467 add $16, INP
2468 add $16, OUTP
2469 cmp $16, LEN
2470 jge .Lcbc_enc_loop
2471 movups STATE, (IVP)
2472.Lcbc_enc_ret:
0d258efb
MK
2473#ifndef __x86_64__
2474 popl KLEN
2475 popl KEYP
2476 popl LEN
2477 popl IVP
2478#endif
8691ccd7 2479 FRAME_END
54b6a1bd 2480 ret
8309b745 2481ENDPROC(aesni_cbc_enc)
54b6a1bd
HY
2482
2483/*
2484 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2485 * size_t len, u8 *iv)
2486 */
2487ENTRY(aesni_cbc_dec)
8691ccd7 2488 FRAME_BEGIN
0d258efb
MK
2489#ifndef __x86_64__
2490 pushl IVP
2491 pushl LEN
2492 pushl KEYP
2493 pushl KLEN
8691ccd7
JP
2494 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2495 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2496 movl (FRAME_OFFSET+28)(%esp), INP # src
2497 movl (FRAME_OFFSET+32)(%esp), LEN # len
2498 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2499#endif
54b6a1bd 2500 cmp $16, LEN
e6efaa02 2501 jb .Lcbc_dec_just_ret
54b6a1bd
HY
2502 mov 480(KEYP), KLEN
2503 add $240, KEYP
2504 movups (IVP), IV
2505 cmp $64, LEN
2506 jb .Lcbc_dec_loop1
2507.align 4
2508.Lcbc_dec_loop4:
2509 movups (INP), IN1
2510 movaps IN1, STATE1
2511 movups 0x10(INP), IN2
2512 movaps IN2, STATE2
0d258efb 2513#ifdef __x86_64__
54b6a1bd
HY
2514 movups 0x20(INP), IN3
2515 movaps IN3, STATE3
2516 movups 0x30(INP), IN4
2517 movaps IN4, STATE4
0d258efb
MK
2518#else
2519 movups 0x20(INP), IN1
2520 movaps IN1, STATE3
2521 movups 0x30(INP), IN2
2522 movaps IN2, STATE4
2523#endif
54b6a1bd
HY
2524 call _aesni_dec4
2525 pxor IV, STATE1
0d258efb 2526#ifdef __x86_64__
54b6a1bd
HY
2527 pxor IN1, STATE2
2528 pxor IN2, STATE3
2529 pxor IN3, STATE4
2530 movaps IN4, IV
0d258efb 2531#else
0d258efb
MK
2532 pxor IN1, STATE4
2533 movaps IN2, IV
7c8d5184
MK
2534 movups (INP), IN1
2535 pxor IN1, STATE2
2536 movups 0x10(INP), IN2
2537 pxor IN2, STATE3
0d258efb 2538#endif
54b6a1bd
HY
2539 movups STATE1, (OUTP)
2540 movups STATE2, 0x10(OUTP)
2541 movups STATE3, 0x20(OUTP)
2542 movups STATE4, 0x30(OUTP)
2543 sub $64, LEN
2544 add $64, INP
2545 add $64, OUTP
2546 cmp $64, LEN
2547 jge .Lcbc_dec_loop4
2548 cmp $16, LEN
2549 jb .Lcbc_dec_ret
2550.align 4
2551.Lcbc_dec_loop1:
2552 movups (INP), IN
2553 movaps IN, STATE
2554 call _aesni_dec1
2555 pxor IV, STATE
2556 movups STATE, (OUTP)
2557 movaps IN, IV
2558 sub $16, LEN
2559 add $16, INP
2560 add $16, OUTP
2561 cmp $16, LEN
2562 jge .Lcbc_dec_loop1
54b6a1bd 2563.Lcbc_dec_ret:
e6efaa02
HY
2564 movups IV, (IVP)
2565.Lcbc_dec_just_ret:
0d258efb
MK
2566#ifndef __x86_64__
2567 popl KLEN
2568 popl KEYP
2569 popl LEN
2570 popl IVP
2571#endif
8691ccd7 2572 FRAME_END
54b6a1bd 2573 ret
8309b745 2574ENDPROC(aesni_cbc_dec)
12387a46 2575
0d258efb 2576#ifdef __x86_64__
1253cab8 2577.pushsection .rodata
12387a46
HY
2578.align 16
2579.Lbswap_mask:
2580 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
1253cab8 2581.popsection
12387a46
HY
2582
2583/*
2584 * _aesni_inc_init: internal ABI
2585 * setup registers used by _aesni_inc
2586 * input:
2587 * IV
2588 * output:
2589 * CTR: == IV, in little endian
2590 * TCTR_LOW: == lower qword of CTR
2591 * INC: == 1, in little endian
2592 * BSWAP_MASK == endian swapping mask
2593 */
0d258efb 2594.align 4
12387a46
HY
2595_aesni_inc_init:
2596 movaps .Lbswap_mask, BSWAP_MASK
2597 movaps IV, CTR
2598 PSHUFB_XMM BSWAP_MASK CTR
2599 mov $1, TCTR_LOW
32cbd7df
HY
2600 MOVQ_R64_XMM TCTR_LOW INC
2601 MOVQ_R64_XMM CTR TCTR_LOW
12387a46 2602 ret
8309b745 2603ENDPROC(_aesni_inc_init)
12387a46
HY
2604
2605/*
2606 * _aesni_inc: internal ABI
2607 * Increase IV by 1, IV is in big endian
2608 * input:
2609 * IV
2610 * CTR: == IV, in little endian
2611 * TCTR_LOW: == lower qword of CTR
2612 * INC: == 1, in little endian
2613 * BSWAP_MASK == endian swapping mask
2614 * output:
2615 * IV: Increase by 1
2616 * changed:
2617 * CTR: == output IV, in little endian
2618 * TCTR_LOW: == lower qword of CTR
2619 */
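/*
 * In effect (C-style sketch, comment only): the counter is kept little
 * endian so the increment is cheap, and only a byte-swapped copy is
 * exposed as the big-endian IV:
 *
 *	ctr_lo += 1;
 *	if (ctr_lo == 0)		/* carry out of the low qword */
 *		ctr_hi += 1;
 *	iv = bswap128(ctr);
 *
 * TCTR_LOW mirrors the low qword in a GPR so the carry can be detected
 * with a plain ADD/JNC instead of SIMD compares.
 */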
0d258efb 2620.align 4
12387a46
HY
2621_aesni_inc:
2622 paddq INC, CTR
2623 add $1, TCTR_LOW
2624 jnc .Linc_low
2625 pslldq $8, INC
2626 paddq INC, CTR
2627 psrldq $8, INC
2628.Linc_low:
2629 movaps CTR, IV
2630 PSHUFB_XMM BSWAP_MASK IV
2631 ret
8309b745 2632ENDPROC(_aesni_inc)
12387a46
HY
2633
2634/*
2635 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2636 * size_t len, u8 *iv)
2637 */
2638ENTRY(aesni_ctr_enc)
8691ccd7 2639 FRAME_BEGIN
12387a46
HY
2640 cmp $16, LEN
2641 jb .Lctr_enc_just_ret
2642 mov 480(KEYP), KLEN
2643 movups (IVP), IV
2644 call _aesni_inc_init
2645 cmp $64, LEN
2646 jb .Lctr_enc_loop1
2647.align 4
2648.Lctr_enc_loop4:
2649 movaps IV, STATE1
2650 call _aesni_inc
2651 movups (INP), IN1
2652 movaps IV, STATE2
2653 call _aesni_inc
2654 movups 0x10(INP), IN2
2655 movaps IV, STATE3
2656 call _aesni_inc
2657 movups 0x20(INP), IN3
2658 movaps IV, STATE4
2659 call _aesni_inc
2660 movups 0x30(INP), IN4
2661 call _aesni_enc4
2662 pxor IN1, STATE1
2663 movups STATE1, (OUTP)
2664 pxor IN2, STATE2
2665 movups STATE2, 0x10(OUTP)
2666 pxor IN3, STATE3
2667 movups STATE3, 0x20(OUTP)
2668 pxor IN4, STATE4
2669 movups STATE4, 0x30(OUTP)
2670 sub $64, LEN
2671 add $64, INP
2672 add $64, OUTP
2673 cmp $64, LEN
2674 jge .Lctr_enc_loop4
2675 cmp $16, LEN
2676 jb .Lctr_enc_ret
2677.align 4
2678.Lctr_enc_loop1:
2679 movaps IV, STATE
2680 call _aesni_inc
2681 movups (INP), IN
2682 call _aesni_enc1
2683 pxor IN, STATE
2684 movups STATE, (OUTP)
2685 sub $16, LEN
2686 add $16, INP
2687 add $16, OUTP
2688 cmp $16, LEN
2689 jge .Lctr_enc_loop1
2690.Lctr_enc_ret:
2691 movups IV, (IVP)
2692.Lctr_enc_just_ret:
8691ccd7 2693 FRAME_END
12387a46 2694 ret
8309b745 2695ENDPROC(aesni_ctr_enc)
c456a9cd
JK
2696
2697/*
2698 * _aesni_gf128mul_x_ble: internal ABI
2699 * Multiply in GF(2^128) for XTS IVs
2700 * input:
2701 * IV: current IV
2702 * GF128MUL_MASK == mask with 0x87 and 0x01
2703 * output:
2704 * IV: next IV
2705 * changed:
2706 * CTR: == temporary value
2707 */
2708#define _aesni_gf128mul_x_ble() \
2709 pshufd $0x13, IV, CTR; \
2710 paddq IV, IV; \
2711 psrad $31, CTR; \
2712 pand GF128MUL_MASK, CTR; \
2713 pxor CTR, IV;
2714
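/*
 * Equivalent C for the macro above (a sketch; lo/hi are the two
 * little-endian 64-bit halves of the tweak):
 *
 *	carry_lo = lo >> 63;
 *	carry_hi = hi >> 63;
 *	lo = (lo << 1) ^ (carry_hi ? 0x87 : 0);
 *	hi = (hi << 1) | carry_lo;
 *
 * i.e. multiplication by x in GF(2^128) with the XTS ("ble") bit order.
 */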
2715/*
2716 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2717 * bool enc, u8 *iv)
2718 */
2719ENTRY(aesni_xts_crypt8)
8691ccd7 2720 FRAME_BEGIN
c456a9cd
JK
2721 cmpb $0, %cl
2722 movl $0, %ecx
2723 movl $240, %r10d
2724 leaq _aesni_enc4, %r11
2725 leaq _aesni_dec4, %rax
2726 cmovel %r10d, %ecx
2727 cmoveq %rax, %r11
2728
2729 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2730 movups (IVP), IV
2731
2732 mov 480(KEYP), KLEN
2733 addq %rcx, KEYP
2734
2735 movdqa IV, STATE1
fe6510b5
JK
2736 movdqu 0x00(INP), INC
2737 pxor INC, STATE1
c456a9cd
JK
2738 movdqu IV, 0x00(OUTP)
2739
2740 _aesni_gf128mul_x_ble()
2741 movdqa IV, STATE2
fe6510b5
JK
2742 movdqu 0x10(INP), INC
2743 pxor INC, STATE2
c456a9cd
JK
2744 movdqu IV, 0x10(OUTP)
2745
2746 _aesni_gf128mul_x_ble()
2747 movdqa IV, STATE3
fe6510b5
JK
2748 movdqu 0x20(INP), INC
2749 pxor INC, STATE3
c456a9cd
JK
2750 movdqu IV, 0x20(OUTP)
2751
2752 _aesni_gf128mul_x_ble()
2753 movdqa IV, STATE4
fe6510b5
JK
2754 movdqu 0x30(INP), INC
2755 pxor INC, STATE4
c456a9cd
JK
2756 movdqu IV, 0x30(OUTP)
2757
2758 call *%r11
2759
fe6510b5
JK
2760 movdqu 0x00(OUTP), INC
2761 pxor INC, STATE1
c456a9cd
JK
2762 movdqu STATE1, 0x00(OUTP)
2763
2764 _aesni_gf128mul_x_ble()
2765 movdqa IV, STATE1
fe6510b5
JK
2766 movdqu 0x40(INP), INC
2767 pxor INC, STATE1
c456a9cd
JK
2768 movdqu IV, 0x40(OUTP)
2769
fe6510b5
JK
2770 movdqu 0x10(OUTP), INC
2771 pxor INC, STATE2
c456a9cd
JK
2772 movdqu STATE2, 0x10(OUTP)
2773
2774 _aesni_gf128mul_x_ble()
2775 movdqa IV, STATE2
fe6510b5
JK
2776 movdqu 0x50(INP), INC
2777 pxor INC, STATE2
c456a9cd
JK
2778 movdqu IV, 0x50(OUTP)
2779
fe6510b5
JK
2780 movdqu 0x20(OUTP), INC
2781 pxor INC, STATE3
c456a9cd
JK
2782 movdqu STATE3, 0x20(OUTP)
2783
2784 _aesni_gf128mul_x_ble()
2785 movdqa IV, STATE3
fe6510b5
JK
2786 movdqu 0x60(INP), INC
2787 pxor INC, STATE3
c456a9cd
JK
2788 movdqu IV, 0x60(OUTP)
2789
fe6510b5
JK
2790 movdqu 0x30(OUTP), INC
2791 pxor INC, STATE4
c456a9cd
JK
2792 movdqu STATE4, 0x30(OUTP)
2793
2794 _aesni_gf128mul_x_ble()
2795 movdqa IV, STATE4
fe6510b5
JK
2796 movdqu 0x70(INP), INC
2797 pxor INC, STATE4
c456a9cd
JK
2798 movdqu IV, 0x70(OUTP)
2799
2800 _aesni_gf128mul_x_ble()
2801 movups IV, (IVP)
2802
2803 call *%r11
2804
fe6510b5
JK
2805 movdqu 0x40(OUTP), INC
2806 pxor INC, STATE1
c456a9cd
JK
2807 movdqu STATE1, 0x40(OUTP)
2808
fe6510b5
JK
2809 movdqu 0x50(OUTP), INC
2810 pxor INC, STATE2
c456a9cd
JK
2811 movdqu STATE2, 0x50(OUTP)
2812
fe6510b5
JK
2813 movdqu 0x60(OUTP), INC
2814 pxor INC, STATE3
c456a9cd
JK
2815 movdqu STATE3, 0x60(OUTP)
2816
fe6510b5
JK
2817 movdqu 0x70(OUTP), INC
2818 pxor INC, STATE4
c456a9cd
JK
2819 movdqu STATE4, 0x70(OUTP)
2820
8691ccd7 2821 FRAME_END
c456a9cd
JK
2822 ret
2823ENDPROC(aesni_xts_crypt8)
2824
0d258efb 2825#endif