/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned). It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released. However,
 * movaps is a byte shorter, so that is the one we'll use for now. (same for
 * unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

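# Editorial aside: in C intrinsics terms (an illustrative mapping, not part
# of the original file), MOVADQ/MOVUDQ correspond to aligned/unaligned
# 16-byte loads; the FP and integer forms move the same bits, only the
# instruction encoding differs.
#
#   #include <immintrin.h>
#   __m128i load_aligned(const __m128i *p)   { return _mm_load_si128(p); }  /* movdqa */
#   __m128i load_unaligned(const __m128i *p) { return _mm_loadu_si128(p); } /* movdqu */
#   __m128  load_aligned_fp(const float *p)  { return _mm_load_ps(p); }     /* movaps */
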
#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

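# A hedged C model (hypothetical helper, not part of this file) of why ALL_F
# must directly follow SHIFT_MASK and a zero block must follow ALL_F: the
# tail-block code later reads 16 bytes starting at SHIFT_MASK+16-%r13 and at
# ALL_F-(16-%r13), so the shuffle and byte-keep masks for any residue length
# %r13 in 1..15 are sliced out of these three adjacent 16-byte rows.
#
#   #include <stdint.h>
#   #include <string.h>
#   static const uint8_t rows[48] = {
#       0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,   /* SHIFT_MASK */
#       0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
#       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,   /* ALL_F */
#       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
#       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* zero block */
#       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#   };
#   void get_tail_masks(unsigned r13, uint8_t shuf[16], uint8_t keep[16])
#   {
#       memcpy(shuf, rows + (16 - r13), 16);       /* SHIFT_MASK+16-%r13    */
#       memcpy(keep, rows + 16 + (16 - r13), 16);  /* ALL_F-SHIFT_MASK(%r12) */
#   }
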
.section	.rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100

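# The tail-AAD code below indexes this table with (remaining aadLen) * 16 and
# feeds the selected row to PSHUFB to left-align the partial AAD block. A
# hedged C model of the lookup (names are illustrative only):
#
#   /* aad_rem = leftover AAD bytes after the 16-byte loop, 0..16 */
#   const uint8_t *aad_tail_mask(const uint8_t table[17][16], unsigned aad_rem)
#   {
#       return table[aad_rem];   /* movdqu aad_shift_arr(%r11), TMP1 */
#   }
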

.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif

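# The HashKey* offsets above describe an 8 x 16-byte scratch area carved out
# of the 64-byte-aligned stack frame. A hedged C picture of that layout,
# purely illustrative:
#
#   struct gcm_scratch {              /* sits at %rsp after alignment   */
#       uint8_t hash_key[16];         /* HashKey<<1 mod poly            */
#       uint8_t hash_key_2[16];       /* HashKey^2<<1 mod poly          */
#       uint8_t hash_key_3[16];       /* HashKey^3<<1 mod poly          */
#       uint8_t hash_key_4[16];       /* HashKey^4<<1 mod poly          */
#       uint8_t hash_key_k[16];       /* hi64 ^ lo64 of HashKey         */
#       uint8_t hash_key_2_k[16];     /* hi64 ^ lo64 of HashKey^2       */
#       uint8_t hash_key_3_k[16];     /* hi64 ^ lo64 of HashKey^3       */
#       uint8_t hash_key_4_k[16];     /* hi64 ^ lo64 of HashKey^4       */
#   };                                /* sizeof == VARIABLE_OFFSET      */
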
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
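
# Editorial note on the identity behind the three PCLMULQDQ calls above
# (plain GF(2) polynomial arithmetic): split a = a1*x^64 + a0 and
# b = b1*x^64 + b0; then
#
#   a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
#
# so one carry-less multiply of the folded halves replaces the two cross
# products a1*b0 and a0*b1. The 256-bit product left in TMP1:GH is then
# reduced mod g(x) = x^128 + x^127 + x^126 + x^121 + 1 by the two
# shift/xor phases.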

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/
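# A quick worked instance of the block-count arithmetic above (editorial
# example): for a = 100 plaintext bytes, b = floor(100/16) = 6 full blocks,
# so num_initial_blocks = 6 mod 4 = 2; the parallel loop later handles the
# remaining 4 full blocks, and the final 100 mod 16 = 4 bytes go down the
# partial-block path.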

.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11
	salq	   $4, %r11
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   (%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_dec\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i.
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm


/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some PT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11
	salq	   $4, %r11
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc	index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_enc\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i.
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shifting << 31
	pslld	  $30, \TMP3			# packed left shifting << 30
	pslld	  $25, \TMP4			# packed left shifting << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst		# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
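
# The keysize -> round-count mapping used by this macro (and by the earlier
# loops) restated as a C one-liner (editorial note): keysize holds the AES
# key length in bytes (16/24/32), and the loop runs nrounds-1 times before
# the AESENCLAST.
#
#   int aesenc_iterations(int key_length)   /* 16, 24 or 32 bytes */
#   {
#       return (key_length >> 2) + 5;       /* 16->9, 24->11, 32->13 */
#   }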
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
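# A hedged usage sketch (editorial, not from this file) of how glue code
# might call this routine for RFC4106 decryption; buffer names and lengths
# are illustrative, only the argument order matches the prototype above.
#
#   u8 iv[16];             /* 4B salt || 8B ESP IV || 0x00000001 */
#   u8 hash_subkey[16];    /* H = E(K, 0^128), 16-byte aligned   */
#   u8 tag[16];
#   aesni_gcm_dec(aes_ctx, plaintext, ciphertext, ciphertext_len,
#                 iv, hash_subkey, aad, 12 /* aad_len */,
#                 tag, 16 /* auth_tag_len */);
#   /* the caller then compares 'tag' against the received ICV */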
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13	# %xmm13 holds the HashKey<<1 (mod poly)
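
# What the instructions above compute, as a hedged C model (editorial note,
# lane conventions simplified): a 128-bit left shift of H by one bit, then a
# conditional xor with the reduction polynomial when the shifted-out bit was
# set; the pshufd/pcmpeqd pair broadcasts that carry bit into a full-width
# mask.
#
#   /* illustrative only; hi/lo are the two 64-bit halves of H */
#   void hashkey_shl1(uint64_t *hi, uint64_t *lo)
#   {
#       uint64_t carry = *hi >> 63;
#       *hi = (*hi << 1) | (*lo >> 63);
#       *lo <<= 1;
#       if (carry) {     /* reduce mod x^128 + x^127 + x^126 + x^121 + 1 */
#           *lo ^= 0x1;                     /* low half of POLY  */
#           *hi ^= 0xC200000000000000ULL;   /* high half of POLY */
#       }
#   }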


	# Decrypt first few blocks

	movdqa %xmm13, HashKey(%rsp)		# store HashKey<<1 (mod poly)
	mov %arg4, %r13		# save the number of bytes of plaintext/ciphertext
	and $-16, %r13				# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	and $(3<<4), %r12
	jz _initial_num_blocks_is_0_decrypt
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_decrypt
	je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub $48, %r13
	jmp _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub $32, %r13
	jmp _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub $16, %r13
	jmp _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp $0, %r13
	je _zero_cipher_left_decrypt
	sub $64, %r13
	je _four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add $64, %r11
	sub $64, %r13
	jne _decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov %arg4, %r13
	and $15, %r13				# %r13 = arg4 (mod 16)
	je _multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd ONE(%rip), %xmm0			# increment CNT to get Yn
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub $16, %r11
	add %r13, %r11
	movdqu (%arg3,%r11,1), %xmm1		# receive the last <16 byte block
	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu (%r12), %xmm2			# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1			# right shift 16-%r13 bytes

	movdqa %xmm1, %xmm2
	pxor %xmm1, %xmm0			# Ciphertext XOR E(K, Yn)
	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub %r13, %r11
	add $16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_decrypt
	mov %rax, (%arg2 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_decrypt:
	mov %al, (%arg2, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov arg8, %r12		# %r12 = aadLen (number of bytes)
	shl $3, %r12		# convert into number of bits
	movd %r12d, %xmm15	# len(A) in %xmm15
	shl $3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq $8, %xmm15	# %xmm15 = len(A)||0x0000000000000000
	pxor %xmm1, %xmm15	# %xmm15 = len(A)||len(C)
	pxor %xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov %arg5, %rax		# %rax = *Y0
	movdqu (%rax), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor %xmm8, %xmm0
_return_T_decrypt:
	mov arg9, %r10		# %r10 = authTag
	mov arg10, %r11		# %r11 = auth_tag_len
	cmp $16, %r11
	je _T_16_decrypt
	cmp $8, %r11
	jl _T_4_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov %rax, (%r10)
	add $8, %r10
	sub $8, %r11
	psrldq $8, %xmm0
	cmp $0, %r11
	je _return_T_done_decrypt
_T_4_decrypt:
	movd %xmm0, %eax
	mov %eax, (%r10)
	add $4, %r10
	sub $4, %r11
	psrldq $4, %xmm0
	cmp $0, %r11
	je _return_T_done_decrypt
_T_123_decrypt:
	movd %xmm0, %eax
	cmp $2, %r11
	jl _T_1_decrypt
	mov %ax, (%r10)
	cmp $2, %r11
	je _return_T_done_decrypt
	add $2, %r10
	sar $16, %eax
_T_1_decrypt:
	mov %al, (%r10)
	jmp _return_T_done_decrypt
_T_16_decrypt:
	movdqu %xmm0, (%r10)
_return_T_done_decrypt:
	mov %r14, %rsp
	pop %r14
	pop %r13
	pop %r12
	ret
ENDPROC(aesni_gcm_dec)

1593
/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialization Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes.
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           Salt  (From the SA)                 |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A1)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 32-bit Sequence Number (A0)                   |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                               AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A2)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code also supports 16, but for other sizes the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
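/*
 * A minimal C sketch of how a caller might assemble the pre-counter
 * block j0 described above (4-byte salt || 8-byte IV || 0x00000001).
 * The function and buffer names are illustrative, not a kernel API:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *			     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// from the SA
 *		memcpy(j0 + 4, iv, 8);		// per-packet IV (sequence number)
 *		j0[12] = 0x00;			// trailing big-endian 0x00000001
 *		j0[13] = 0x00;
 *		j0[14] = 0x00;
 *		j0[15] = 0x01;
 *	}
 */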
ENTRY(aesni_gcm_enc)
        push    %r12
        push    %r13
        push    %r14
        mov     %rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp
        mov     %arg6, %r12
        movdqu  (%r12), %xmm13
        movdqa  SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa  %xmm13, %xmm2
        psllq   $1, %xmm13
        psrlq   $63, %xmm2
        movdqa  %xmm2, %xmm1
        pslldq  $8, %xmm2
        psrldq  $8, %xmm1
        por     %xmm2, %xmm13

        # reduce HashKey<<1

        pshufd  $0x24, %xmm1, %xmm2
        pcmpeqd TWOONE(%rip), %xmm2
        pand    POLY(%rip), %xmm2
        pxor    %xmm2, %xmm13
        movdqa  %xmm13, HashKey(%rsp)
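/*
 * What the shift-and-reduce sequence above computes, as a C sketch
 * over the two 64-bit halves of H (hi:lo, after the byte swap): a
 * 128-bit left shift by one, XORing in POLY when the shifted-out bit
 * was set.  Illustrative code, not the kernel's implementation:
 *
 *	#include <stdint.h>
 *
 *	static void hashkey_shl1_mod_poly(uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t carry = *hi >> 63;	// bit shifted out of H
 *
 *		*hi = (*hi << 1) | (*lo >> 63);	// 128-bit left shift by 1
 *		*lo = *lo << 1;
 *		if (carry) {			// reduce: XOR with POLY
 *			*hi ^= 0xC200000000000000ULL;
 *			*lo ^= 0x0000000000000001ULL;
 *		}
 *	}
 */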
        mov     %arg4, %r13             # %xmm13 holds HashKey<<1 (mod poly)
        and     $-16, %r13
        mov     %r13, %r12

        # Encrypt first few blocks

        and     $(3<<4), %r12
        jz      _initial_num_blocks_is_0_encrypt
        cmp     $(2<<4), %r12
        jb      _initial_num_blocks_is_1_encrypt
        je      _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
        INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
        sub     $48, %r13
        jmp     _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
        INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
        sub     $32, %r13
        jmp     _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
        INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
        sub     $16, %r13
        jmp     _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
        INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

        cmp     $0, %r13
        je      _zero_cipher_left_encrypt
        sub     $64, %r13
        je      _four_cipher_left_encrypt
_encrypt_by_4_encrypt:
        GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
        add     $64, %r11
        sub     $64, %r13
        jne     _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
        mov     %arg4, %r13
        and     $15, %r13               # %r13 = arg4 (mod 16)
        je      _multiple_of_16_bytes_encrypt

        # Handle the last <16 Byte block separately
        paddd   ONE(%rip), %xmm0        # INCR CNT to get Yn
        movdqa  SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # Encrypt(K, Yn)
        sub     $16, %r11
        add     %r13, %r11
        movdqu  (%arg3,%r11,1), %xmm1   # receive the last <16 byte blocks
        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1         # shift right 16-r13 bytes
        pxor    %xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
        movdqa  SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        pxor    %xmm0, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub     %r13, %r11
        add     $16, %r11

        movdqa  SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        # shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
        MOVQ_R64_XMM %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_encrypt
        mov     %rax, (%arg2, %r11, 1)
        add     $8, %r11
        psrldq  $8, %xmm0
        MOVQ_R64_XMM %xmm0, %rax
        sub     $8, %r13
_less_than_8_bytes_left_encrypt:
        mov     %al, (%arg2, %r11, 1)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
        mov     arg8, %r12              # %r12 = aadLen (number of bytes)
        shl     $3, %r12
        movd    %r12d, %xmm15           # len(A) in %xmm15
        shl     $3, %arg4               # len(C) in bits (*8)
        MOVQ_R64_XMM %arg4, %xmm1
        pslldq  $8, %xmm15              # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15           # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa  SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8        # perform a 16 byte swap

        mov     %arg5, %rax             # %rax = *Y0
        movdqu  (%rax), %xmm0           # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15      # Encrypt(K, Y0)
        pxor    %xmm8, %xmm0
_return_T_encrypt:
        mov     arg9, %r10              # %r10 = authTag
        mov     arg10, %r11             # %r11 = auth_tag_len
        cmp     $16, %r11
        je      _T_16_encrypt
        cmp     $8, %r11
        jl      _T_4_encrypt
_T_8_encrypt:
        MOVQ_R64_XMM %xmm0, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        psrldq  $8, %xmm0
        cmp     $0, %r11
        je      _return_T_done_encrypt
_T_4_encrypt:
        movd    %xmm0, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        psrldq  $4, %xmm0
        cmp     $0, %r11
        je      _return_T_done_encrypt
_T_123_encrypt:
        movd    %xmm0, %eax
        cmp     $2, %r11
        jl      _T_1_encrypt
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done_encrypt
        add     $2, %r10
        sar     $16, %eax
_T_1_encrypt:
        mov     %al, (%r10)
        jmp     _return_T_done_encrypt
_T_16_encrypt:
        movdqu  %xmm0, (%r10)
_return_T_done_encrypt:
        mov     %r14, %rsp
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_enc)

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
        pshufd  $0b11111111, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0
        movaps  %xmm0, (TKEYP)
        add     $0x10, TKEYP
        ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

.align 4
_key_expansion_192a:
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        movaps  %xmm2, %xmm6
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, %xmm1
        shufps  $0b01000100, %xmm0, %xmm6
        movaps  %xmm6, (TKEYP)
        shufps  $0b01001110, %xmm2, %xmm1
        movaps  %xmm1, 0x10(TKEYP)
        add     $0x20, TKEYP
        ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, (TKEYP)
        add     $0x10, TKEYP
        ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
        pshufd  $0b10101010, %xmm1, %xmm1
        shufps  $0b00010000, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        shufps  $0b10001100, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        pxor    %xmm1, %xmm2
        movaps  %xmm2, (TKEYP)
        add     $0x10, TKEYP
        ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   KEYP
        movl    (FRAME_OFFSET+8)(%esp), KEYP    # ctx
        movl    (FRAME_OFFSET+12)(%esp), UKEYP  # in_key
        movl    (FRAME_OFFSET+16)(%esp), %edx   # key_len
#endif
        movups  (UKEYP), %xmm0          # user key (first 16 bytes)
        movaps  %xmm0, (KEYP)
        lea     0x10(KEYP), TKEYP       # key addr
        movl    %edx, 480(KEYP)
        pxor    %xmm4, %xmm4            # xmm4 is assumed 0 in _key_expansion_x
        cmp     $24, %dl
        jb      .Lenc_key128
        je      .Lenc_key192
        movups  0x10(UKEYP), %xmm2      # other user key
        movaps  %xmm2, (TKEYP)
        add     $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call    _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call    _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call    _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call    _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call    _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call    _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call    _key_expansion_256a
        jmp     .Ldec_key
.Lenc_key192:
        movq    0x10(UKEYP), %xmm2      # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call    _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call    _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call    _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call    _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call    _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call    _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call    _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call    _key_expansion_192b
        jmp     .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call    _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call    _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call    _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call    _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call    _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call    _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call    _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call    _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call    _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call    _key_expansion_128
.Ldec_key:
        sub     $0x10, TKEYP
        movaps  (KEYP), %xmm0
        movaps  (TKEYP), %xmm1
        movaps  %xmm0, 240(TKEYP)
        movaps  %xmm1, 240(KEYP)
        add     $0x10, KEYP
        lea     240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps  (KEYP), %xmm0
        AESIMC  %xmm0 %xmm1
        movaps  %xmm1, (UKEYP)
        add     $0x10, KEYP
        sub     $0x10, UKEYP
        cmp     TKEYP, KEYP
        jb      .Ldec_key_loop
        xor     AREG, AREG
#ifndef __x86_64__
        popl    KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_set_key)
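/*
 * Layout of the schedule the routine above fills in, shown as a C
 * sketch.  The field offsets follow the 480(KEYP) store and the
 * "add $240" arithmetic above; the struct itself is a simplified
 * stand-in for crypto_aes_ctx, not a definition from a kernel header:
 *
 *	#include <stdint.h>
 *
 *	struct aes_ctx_layout {
 *		uint8_t  key_enc[240];	// offset 0:   enc round keys, round 0 first
 *		uint8_t  key_dec[240];	// offset 240: AESIMC'd keys, reversed
 *		uint32_t key_length;	// offset 480: 16, 24 or 32 (bytes)
 *	};
 */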

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+12)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+16)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+20)(%esp), INP    # src
#endif
        movl    480(KEYP), KLEN         # key length
        movups  (INP), STATE            # input
        call    _aesni_enc1
        movups  STATE, (OUTP)           # output
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE              # round 0
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .Lenc128
        lea     0x20(TKEYP), TKEYP
        je      .Lenc192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESENC  KEY STATE
        movaps  -0x50(TKEYP), KEY
        AESENC  KEY STATE
.align 4
.Lenc192:
        movaps  -0x40(TKEYP), KEY
        AESENC  KEY STATE
        movaps  -0x30(TKEYP), KEY
        AESENC  KEY STATE
.align 4
.Lenc128:
        movaps  -0x20(TKEYP), KEY
        AESENC  KEY STATE
        movaps  -0x10(TKEYP), KEY
        AESENC  KEY STATE
        movaps  (TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x10(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x20(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x30(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x40(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x50(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x60(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret
ENDPROC(_aesni_enc1)
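/*
 * The branch structure above is the usual 10/12/14-round selection
 * from the key length (16/24/32 bytes).  A C sketch of the same flow
 * using AES-NI intrinsics, as a userspace illustration (compile with
 * -maes); this is not the kernel's code:
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes_encrypt_block(const __m128i *rk, int key_len,
 *					 __m128i state)
 *	{
 *		int rounds = key_len / 4 + 6;	// 16->10, 24->12, 32->14
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);	// round 0
 *		for (i = 1; i < rounds; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[rounds]);
 *	}
 */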

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE1             # round 0
        pxor    KEY, STATE2
        pxor    KEY, STATE3
        pxor    KEY, STATE4
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .L4enc128
        lea     0x20(TKEYP), TKEYP
        je      .L4enc192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  -0x50(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
#.align 4
.L4enc192:
        movaps  -0x40(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  -0x30(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
#.align 4
.L4enc128:
        movaps  -0x20(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  -0x10(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  (TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x10(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x20(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x30(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x40(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x50(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x60(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
        ret
ENDPROC(_aesni_enc4)

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+12)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+16)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+20)(%esp), INP    # src
#endif
        mov     480(KEYP), KLEN         # key length
        add     $240, KEYP
        movups  (INP), STATE            # input
        call    _aesni_dec1
        movups  STATE, (OUTP)           # output
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE              # round 0
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .Ldec128
        lea     0x20(TKEYP), TKEYP
        je      .Ldec192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  -0x50(TKEYP), KEY
        AESDEC  KEY STATE
.align 4
.Ldec192:
        movaps  -0x40(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  -0x30(TKEYP), KEY
        AESDEC  KEY STATE
.align 4
.Ldec128:
        movaps  -0x20(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  -0x10(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  (TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x10(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x20(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x30(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x40(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x50(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x60(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x70(TKEYP), KEY
        AESDECLAST KEY STATE
        ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE1             # round 0
        pxor    KEY, STATE2
        pxor    KEY, STATE3
        pxor    KEY, STATE4
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .L4dec128
        lea     0x20(TKEYP), TKEYP
        je      .L4dec192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  -0x50(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
.align 4
.L4dec192:
        movaps  -0x40(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  -0x30(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
.align 4
.L4dec128:
        movaps  -0x20(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  -0x10(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  (TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x10(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x20(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x30(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x40(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x50(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x60(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
        ret
ENDPROC(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+16)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+20)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+24)(%esp), INP    # src
        movl    (FRAME_OFFSET+28)(%esp), LEN    # len
#endif
        test    LEN, LEN                # check length
        jz      .Lecb_enc_ret
        mov     480(KEYP), KLEN
        cmp     $16, LEN
        jb      .Lecb_enc_ret
        cmp     $64, LEN
        jb      .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
        movups  (INP), STATE1
        movups  0x10(INP), STATE2
        movups  0x20(INP), STATE3
        movups  0x30(INP), STATE4
        call    _aesni_enc4
        movups  STATE1, (OUTP)
        movups  STATE2, 0x10(OUTP)
        movups  STATE3, 0x20(OUTP)
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lecb_enc_loop4
        cmp     $16, LEN
        jb      .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
        movups  (INP), STATE1
        call    _aesni_enc1
        movups  STATE1, (OUTP)
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
#endif
        FRAME_END
        ret
ENDPROC(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 */
ENTRY(aesni_ecb_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+16)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+20)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+24)(%esp), INP    # src
        movl    (FRAME_OFFSET+28)(%esp), LEN    # len
#endif
        test    LEN, LEN
        jz      .Lecb_dec_ret
        mov     480(KEYP), KLEN
        add     $240, KEYP
        cmp     $16, LEN
        jb      .Lecb_dec_ret
        cmp     $64, LEN
        jb      .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
        movups  (INP), STATE1
        movups  0x10(INP), STATE2
        movups  0x20(INP), STATE3
        movups  0x30(INP), STATE4
        call    _aesni_dec4
        movups  STATE1, (OUTP)
        movups  STATE2, 0x10(OUTP)
        movups  STATE3, 0x20(OUTP)
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lecb_dec_loop4
        cmp     $16, LEN
        jb      .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
        movups  (INP), STATE1
        call    _aesni_dec1
        movups  STATE1, (OUTP)
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
#endif
        FRAME_END
        ret
ENDPROC(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
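/*
 * The loop below implements standard CBC chaining: each plaintext
 * block is XORed with the previous ciphertext block (the IV for the
 * first block) before being encrypted.  As a C sketch; block_encrypt
 * is a placeholder for one AES block encryption, not a kernel helper:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void cbc_encrypt(uint8_t *out, const uint8_t *in, size_t len,
 *				uint8_t iv[16],
 *				void (*block_encrypt)(uint8_t block[16]))
 *	{
 *		size_t i, j;
 *
 *		for (i = 0; i + 16 <= len; i += 16) {
 *			for (j = 0; j < 16; j++)
 *				iv[j] ^= in[i + j];	// chain previous block
 *			block_encrypt(iv);		// iv now holds C_i
 *			for (j = 0; j < 16; j++)
 *				out[i + j] = iv[j];
 *		}
 *		// like the asm below, iv exits holding the last ciphertext
 *	}
 */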
ENTRY(aesni_cbc_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   IVP
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+20)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+24)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+28)(%esp), INP    # src
        movl    (FRAME_OFFSET+32)(%esp), LEN    # len
        movl    (FRAME_OFFSET+36)(%esp), IVP    # iv
#endif
        cmp     $16, LEN
        jb      .Lcbc_enc_ret
        mov     480(KEYP), KLEN
        movups  (IVP), STATE    # load iv as initial state
.align 4
.Lcbc_enc_loop:
        movups  (INP), IN       # load input
        pxor    IN, STATE
        call    _aesni_enc1
        movups  STATE, (OUTP)   # store output
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lcbc_enc_loop
        movups  STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
        popl    IVP
#endif
        FRAME_END
        ret
ENDPROC(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl   IVP
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    (FRAME_OFFSET+20)(%esp), KEYP   # ctx
        movl    (FRAME_OFFSET+24)(%esp), OUTP   # dst
        movl    (FRAME_OFFSET+28)(%esp), INP    # src
        movl    (FRAME_OFFSET+32)(%esp), LEN    # len
        movl    (FRAME_OFFSET+36)(%esp), IVP    # iv
#endif
        cmp     $16, LEN
        jb      .Lcbc_dec_just_ret
        mov     480(KEYP), KLEN
        add     $240, KEYP
        movups  (IVP), IV
        cmp     $64, LEN
        jb      .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
        movups  (INP), IN1
        movaps  IN1, STATE1
        movups  0x10(INP), IN2
        movaps  IN2, STATE2
#ifdef __x86_64__
        movups  0x20(INP), IN3
        movaps  IN3, STATE3
        movups  0x30(INP), IN4
        movaps  IN4, STATE4
#else
        movups  0x20(INP), IN1
        movaps  IN1, STATE3
        movups  0x30(INP), IN2
        movaps  IN2, STATE4
#endif
        call    _aesni_dec4
        pxor    IV, STATE1
#ifdef __x86_64__
        pxor    IN1, STATE2
        pxor    IN2, STATE3
        pxor    IN3, STATE4
        movaps  IN4, IV
#else
        pxor    IN1, STATE4
        movaps  IN2, IV
        movups  (INP), IN1
        pxor    IN1, STATE2
        movups  0x10(INP), IN2
        pxor    IN2, STATE3
#endif
        movups  STATE1, (OUTP)
        movups  STATE2, 0x10(OUTP)
        movups  STATE3, 0x20(OUTP)
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lcbc_dec_loop4
        cmp     $16, LEN
        jb      .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
        movups  (INP), IN
        movaps  IN, STATE
        call    _aesni_dec1
        pxor    IV, STATE
        movups  STATE, (OUTP)
        movaps  IN, IV
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lcbc_dec_loop1
.Lcbc_dec_ret:
        movups  IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
        popl    IVP
#endif
        FRAME_END
        ret
ENDPROC(aesni_cbc_dec)

#ifdef __x86_64__
.pushsection .rodata
.align 16
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
        movaps  .Lbswap_mask, BSWAP_MASK
        movaps  IV, CTR
        PSHUFB_XMM BSWAP_MASK CTR
        mov     $1, TCTR_LOW
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
        ret
ENDPROC(_aesni_inc_init)

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
        paddq   INC, CTR
        add     $1, TCTR_LOW
        jnc     .Linc_low
        pslldq  $8, INC
        paddq   INC, CTR
        psrldq  $8, INC
.Linc_low:
        movaps  CTR, IV
        PSHUFB_XMM BSWAP_MASK IV
        ret
ENDPROC(_aesni_inc)
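/*
 * What the carry handling above does, as a C sketch over the two
 * little-endian qwords of the counter: the jnc path is the common
 * case where the low qword did not wrap.  Illustrative only:
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc(uint64_t *lo, uint64_t *hi)
 *	{
 *		*lo += 1;
 *		if (*lo == 0)		// TCTR_LOW wrapped: jnc not taken
 *			*hi += 1;	// propagate carry into high qword
 *	}
 */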

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
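/*
 * The routine below is plain CTR mode: encrypt successive counter
 * blocks and XOR the keystream into the data (decryption is the same
 * operation).  As a C sketch, with block_encrypt and ctr_inc standing
 * in for _aesni_enc1 and _aesni_inc (names are illustrative):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void ctr_crypt(uint8_t *out, const uint8_t *in, size_t len,
 *			      uint8_t ctr[16],
 *			      void (*block_encrypt)(uint8_t block[16]),
 *			      void (*ctr_inc)(uint8_t ctr[16]))
 *	{
 *		uint8_t ks[16];
 *		size_t i, j;
 *
 *		for (i = 0; i + 16 <= len; i += 16) {
 *			memcpy(ks, ctr, 16);	// current counter block
 *			ctr_inc(ctr);		// advance for the next block
 *			block_encrypt(ks);	// keystream block
 *			for (j = 0; j < 16; j++)
 *				out[i + j] = in[i + j] ^ ks[j];
 *		}
 *	}
 */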
ENTRY(aesni_ctr_enc)
        FRAME_BEGIN
        cmp     $16, LEN
        jb      .Lctr_enc_just_ret
        mov     480(KEYP), KLEN
        movups  (IVP), IV
        call    _aesni_inc_init
        cmp     $64, LEN
        jb      .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
        movaps  IV, STATE1
        call    _aesni_inc
        movups  (INP), IN1
        movaps  IV, STATE2
        call    _aesni_inc
        movups  0x10(INP), IN2
        movaps  IV, STATE3
        call    _aesni_inc
        movups  0x20(INP), IN3
        movaps  IV, STATE4
        call    _aesni_inc
        movups  0x30(INP), IN4
        call    _aesni_enc4
        pxor    IN1, STATE1
        movups  STATE1, (OUTP)
        pxor    IN2, STATE2
        movups  STATE2, 0x10(OUTP)
        pxor    IN3, STATE3
        movups  STATE3, 0x20(OUTP)
        pxor    IN4, STATE4
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lctr_enc_loop4
        cmp     $16, LEN
        jb      .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        movaps  IV, STATE
        call    _aesni_inc
        movups  (INP), IN
        call    _aesni_enc1
        pxor    IN, STATE
        movups  STATE, (OUTP)
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lctr_enc_loop1
.Lctr_enc_ret:
        movups  IV, (IVP)
.Lctr_enc_just_ret:
        FRAME_END
        ret
ENDPROC(aesni_ctr_enc)

/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;

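/*
 * A C sketch of the multiply-by-x above, on the two little-endian
 * qwords of the tweak: a 128-bit left shift by one, folding the bit
 * that falls out of the top back in as 0x87 (x^7 + x^2 + x + 1).
 * Illustrative only, not the kernel's implementation:
 *
 *	#include <stdint.h>
 *
 *	static void gf128mul_x_ble(uint64_t *lo, uint64_t *hi)
 *	{
 *		uint64_t carry = *hi >> 63;	// bit shifted out of the tweak
 *
 *		*hi = (*hi << 1) | (*lo >> 63);
 *		*lo = (*lo << 1) ^ (carry * 0x87);	// fold back reduction
 *	}
 */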
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
        FRAME_BEGIN
        cmpb    $0, %cl
        movl    $0, %ecx
        movl    $240, %r10d
        leaq    _aesni_enc4, %r11
        leaq    _aesni_dec4, %rax
        cmovel  %r10d, %ecx
        cmoveq  %rax, %r11

        movdqa  .Lgf128mul_x_ble_mask, GF128MUL_MASK
        movups  (IVP), IV

        mov     480(KEYP), KLEN
        addq    %rcx, KEYP

        movdqa  IV, STATE1
        movdqu  0x00(INP), INC
        pxor    INC, STATE1
        movdqu  IV, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE2
        movdqu  0x10(INP), INC
        pxor    INC, STATE2
        movdqu  IV, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE3
        movdqu  0x20(INP), INC
        pxor    INC, STATE3
        movdqu  IV, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE4
        movdqu  0x30(INP), INC
        pxor    INC, STATE4
        movdqu  IV, 0x30(OUTP)

        CALL_NOSPEC %r11

        movdqu  0x00(OUTP), INC
        pxor    INC, STATE1
        movdqu  STATE1, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE1
        movdqu  0x40(INP), INC
        pxor    INC, STATE1
        movdqu  IV, 0x40(OUTP)

        movdqu  0x10(OUTP), INC
        pxor    INC, STATE2
        movdqu  STATE2, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE2
        movdqu  0x50(INP), INC
        pxor    INC, STATE2
        movdqu  IV, 0x50(OUTP)

        movdqu  0x20(OUTP), INC
        pxor    INC, STATE3
        movdqu  STATE3, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE3
        movdqu  0x60(INP), INC
        pxor    INC, STATE3
        movdqu  IV, 0x60(OUTP)

        movdqu  0x30(OUTP), INC
        pxor    INC, STATE4
        movdqu  STATE4, 0x30(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa  IV, STATE4
        movdqu  0x70(INP), INC
        pxor    INC, STATE4
        movdqu  IV, 0x70(OUTP)

        _aesni_gf128mul_x_ble()
        movups  IV, (IVP)

        CALL_NOSPEC %r11

        movdqu  0x40(OUTP), INC
        pxor    INC, STATE1
        movdqu  STATE1, 0x40(OUTP)

        movdqu  0x50(OUTP), INC
        pxor    INC, STATE2
        movdqu  STATE2, 0x50(OUTP)

        movdqu  0x60(OUTP), INC
        pxor    INC, STATE3
        movdqu  STATE3, 0x60(OUTP)

        movdqu  0x70(OUTP), INC
        pxor    INC, STATE4
        movdqu  STATE4, 0x70(OUTP)

        FRAME_END
        ret
ENDPROC(aesni_xts_crypt8)

#endif