/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for the unaligned variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8
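
# A C-style view of the VARIABLE_OFFSET scratch area that FUNC_SAVE reserves
# on the stack (an illustrative sketch only; no such struct exists in the
# kernel -- the offsets are exactly the HashKey* defines above):
#
#	struct hash_key_table {
#		u8 hash_key[16];	/* HashKey   <<1 mod poly */
#		u8 hash_key_2[16];	/* HashKey^2 <<1 mod poly */
#		u8 hash_key_3[16];	/* HashKey^3 <<1 mod poly */
#		u8 hash_key_4[16];	/* HashKey^4 <<1 mod poly */
#		u8 hash_key_k[16];	/* hi64 ^ lo64 of HashKey   (Karatsuba) */
#		u8 hash_key_2_k[16];	/* hi64 ^ lo64 of HashKey^2 (Karatsuba) */
#		u8 hash_key_3_k[16];	/* hi64 ^ lo64 of HashKey^3 (Karatsuba) */
#		u8 hash_key_4_k[16];	/* hi64 ^ lo64 of HashKey^4 (Karatsuba) */
#	};				/* 16*8 == VARIABLE_OFFSET bytes */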

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
.endm


.macro FUNC_RESTORE
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
.endm
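
# Stack picture after FUNC_SAVE (a sketch; the offsets follow from the three
# pushes above plus the 8-byte return address):
#
#	32(%r14): first stack argument (arg7 == STACK_OFFSET+8(%r14))
#	24(%r14): return address
#	 0(%r14): saved %r14, %r13, %r12 (STACK_OFFSET == 8*3 covers these)
#	   %rsp : VARIABLE_OFFSET bytes of scratch, aligned down to 64 bytes,
#		  which keeps the movdqa accesses to the hash-key table legal
#		  and cache-line friendly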


# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13		# %r13 = plaintext length in bytes
	and	$-16, %r13		# %r13 = length of the full blocks
	mov	%r13, %r12
.endm
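
# What the shift/reduce sequence above computes, as a C sketch (hi/lo are the
# two 64-bit halves of the byte-reflected HashKey; the two constants are the
# halves of POLY):
#
#	carry = hi >> 63;
#	hi    = (hi << 1) | (lo >> 63);
#	lo    =  lo << 1;
#	if (carry) {			/* fold the dropped bit back in */
#		hi ^= 0xc200000000000000ULL;
#		lo ^= 0x0000000000000001ULL;
#	}
#	/* hi:lo is now HashKey<<1 mod x^128 + x^127 + x^126 + x^121 + 1 */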

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm
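
# The multiply above in C-like form (a sketch; clmul() stands in for the
# 64x64 -> 128 bit carry-less multiply done by PCLMULQDQ):
#
#	hi  = clmul(a1, b1);			 /* PCLMULQDQ 0x11 */
#	lo  = clmul(a0, b0);			 /* PCLMULQDQ 0x00 */
#	mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo; /* Karatsuba middle term */
#
#	/* 256-bit product = hi:lo with mid XORed in at bit offset 64, then
#	 * folded back to 128 bits with the two-phase shift/XOR reduction
#	 * above, since the poly gives x^128 == x^127 + x^126 + x^121 + 1. */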

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
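
# Equivalent C for the partial read (a sketch; len stands for \DLEN, 1..15):
#
#	u64 lo = 0, hi = 0;
#	int i;
#
#	if (len >= 8) {
#		memcpy(&lo, src, 8);		/* whole low quadword */
#		for (i = len - 1; i >= 8; i--)
#			hi = (hi << 8) | src[i];
#	} else {
#		for (i = len - 1; i >= 0; i--)
#			lo = (lo << 8) | src[i];
#	}
#	/* XMMDst = hi:lo -- bytes past len are zero and nothing is read
#	 * beyond src[len-1], which is the point of this macro */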

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r11		# %r11 = aadLen
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \XMM2, %xmm\i

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\@:
	xor	   %r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
.ifc \operation, dec
	movdqa	   \XMM0, %xmm\index
.else
	MOVADQ	   \XMM0, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13

aes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

.ifc \operation, dec
	movdqa	   \TMP1, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
.ifc \operation, dec
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
.endif
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
.ifc \operation, dec
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
.endif
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
.ifc \operation, dec
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
.endif
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
.ifc \operation, dec
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
.else
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
.endif

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\@:

.endm
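
# How the callers drive this macro (per the call sites further down): with
# b = floor(len/16) and num_initial_blocks = b mod 4, the i/i_seq pairs are
#
#	3 blocks: i=5, i_seq=678	(counters in %xmm6-%xmm8)
#	2 blocks: i=6, i_seq=78
#	1 block : i=7, i_seq=8
#	0 blocks: i=8, i_seq=0		(AAD hash only)
#
# A per-block C sketch of the CTR+GHASH work done here (helper names are
# illustrative, not kernel functions):
#
#	for (j = 0; j < num_initial_blocks; j++) {
#		ctr = bswap128(++y);		/* INCR Y0 + byte swap  */
#		out[j] = in[j] ^ aes_encrypt(ctr);
#		c = dec ? in[j] : out[j];	/* ciphertext either way */
#		acc = ghash_mul(acc ^ c, hashkey);
#	}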

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		# 128->4, 192->6, 256->8
	sub	  $4,%eax		# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1		# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm
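
# The interleaving above is software pipelining: the AES rounds for counter
# blocks n..n+3 hide the latency of GHASHing ciphertext blocks n-4..n-1.
# With the precomputed powers of H, four GHASH steps collapse into one
# reduction (a C-like sketch; the gf128 products are illustrative):
#
#	acc = X1*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H;	/* one reduction */
#
# which equals the serial ((((acc^C1)*H ^ C2)*H ^ C3)*H ^ C4)*H when
# X1 = acc ^ C1 and X2..X4 = C2..C4.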

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		# 128->4, 192->6, 256->8
	sub	  $4,%eax		# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1		# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm
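
# Identical pipeline to the _ENC variant above; the only difference is the
# writeback, where the saved input (ciphertext) is copied back into
# XMM1..XMM4 so GHASH keeps running over ciphertext, i.e. roughly
#
#	out[j] = in[j] ^ keystream;	/* plaintext out       */
#	xmm[j] = in[j];			/* ciphertext to GHASH */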

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm
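
# As in the parallel macros, the four trailing blocks are folded with one
# shared reduction:
#
#	XMMDst = XMM1*H^4 ^ XMM2*H^3 ^ XMM3*H^2 ^ XMM4*H
#
# with each product split Karatsuba-style across TMP6 (high), XMMDst (low)
# and XMM1 (middle) before the final two-phase reduction.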


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	AESENC	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
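
# C sketch of the round loop above (nr = keysize>>2 + 5, i.e. 9/11/13;
# aesenc()/aesenclast() stand in for the AESENC/AESENCLAST instructions):
#
#	state ^= round_key[0];
#	for (r = 1; r <= nr; r++)
#		state = aesenc(state, round_key[r]);
#	state = aesenclast(state, round_key[nr + 1]);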
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT

	# Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0	# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Yn)

	lea	(%arg3,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12
	movdqa	%xmm1, %xmm2
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

	# output %r13 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$8, %r11
	jl	_T_4_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_4_decrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_123_decrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_decrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_decrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_decrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT
	# Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)

	lea	(%arg3,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$8, %r11
	jl	_T_4_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_4_encrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_123_encrypt:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_encrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_encrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_encrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)
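
/* A hypothetical RFC4106-style caller of the two GCM entry points above
 * (variable names illustrative; lengths and alignment per the comments):
 *
 *	aesni_gcm_enc(aes_ctx, dst, src, len,
 *		      iv,		// salt || 8-byte IV || 0x00000001
 *		      hash_subkey,	// H = E(K, 0^128), 16-byte aligned
 *		      aad, aad_len,	// 8 or 12 bytes for RFC4106
 *		      auth_tag, 16);
 *
 * aesni_gcm_dec() takes the same arguments with ciphertext in and
 * plaintext out; the caller compares the produced tag itself.
 */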

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
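
# One AES-128 key-schedule step in C (FIPS-197 view, a sketch; the shufps
# pair above computes the same prefix-XOR without leaving %xmm0, and
# AESKEYGENASSIST supplies subword(rotword(...)) ^ rcon in %xmm1):
#
#	t    = subword(rotword(w[3])) ^ rcon;
#	w[4] = w[0] ^ t;
#	w[5] = w[1] ^ w[4];
#	w[6] = w[2] ^ w[5];
#	w[7] = w[3] ^ w[6];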
54b6a1bd 1581
0d258efb 1582.align 4
54b6a1bd
HY
1583_key_expansion_192a:
1584 pshufd $0b01010101, %xmm1, %xmm1
1585 shufps $0b00010000, %xmm0, %xmm4
1586 pxor %xmm4, %xmm0
1587 shufps $0b10001100, %xmm0, %xmm4
1588 pxor %xmm4, %xmm0
1589 pxor %xmm1, %xmm0
1590
1591 movaps %xmm2, %xmm5
1592 movaps %xmm2, %xmm6
1593 pslldq $4, %xmm5
1594 pshufd $0b11111111, %xmm0, %xmm3
1595 pxor %xmm3, %xmm2
1596 pxor %xmm5, %xmm2
1597
1598 movaps %xmm0, %xmm1
1599 shufps $0b01000100, %xmm0, %xmm6
0d258efb 1600 movaps %xmm6, (TKEYP)
54b6a1bd 1601 shufps $0b01001110, %xmm2, %xmm1
0d258efb
MK
1602 movaps %xmm1, 0x10(TKEYP)
1603 add $0x20, TKEYP
54b6a1bd 1604 ret
8309b745 1605ENDPROC(_key_expansion_192a)
54b6a1bd 1606
0d258efb 1607.align 4
54b6a1bd
HY
1608_key_expansion_192b:
1609 pshufd $0b01010101, %xmm1, %xmm1
1610 shufps $0b00010000, %xmm0, %xmm4
1611 pxor %xmm4, %xmm0
1612 shufps $0b10001100, %xmm0, %xmm4
1613 pxor %xmm4, %xmm0
1614 pxor %xmm1, %xmm0
1615
1616 movaps %xmm2, %xmm5
1617 pslldq $4, %xmm5
1618 pshufd $0b11111111, %xmm0, %xmm3
1619 pxor %xmm3, %xmm2
1620 pxor %xmm5, %xmm2
1621
0d258efb
MK
1622 movaps %xmm0, (TKEYP)
1623 add $0x10, TKEYP
54b6a1bd 1624 ret
8309b745 1625ENDPROC(_key_expansion_192b)
54b6a1bd 1626
0d258efb 1627.align 4
54b6a1bd
HY
1628_key_expansion_256b:
1629 pshufd $0b10101010, %xmm1, %xmm1
1630 shufps $0b00010000, %xmm2, %xmm4
1631 pxor %xmm4, %xmm2
1632 shufps $0b10001100, %xmm2, %xmm4
1633 pxor %xmm4, %xmm2
1634 pxor %xmm1, %xmm2
0d258efb
MK
1635 movaps %xmm2, (TKEYP)
1636 add $0x10, TKEYP
54b6a1bd 1637 ret
8309b745 1638ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
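
/*
 * The context layout assumed throughout this file matches
 * struct crypto_aes_ctx (see <crypto/aes.h>); the C sketch below is for
 * orientation only:
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// offset 0:   encryption round keys
 *		u32 key_dec[60];	// offset 240: decryption round keys
 *		u32 key_length;		// offset 480: 16, 24 or 32
 *	};
 *
 * which is why key_len is stored at 480(KEYP) above and why the
 * decryption entry points below do "add $240, KEYP".  The .Ldec_key loop
 * builds the decryption schedule for the Equivalent Inverse Cipher:
 * round keys in reverse order, with AESIMC (InvMixColumns) applied to
 * every key except the first and last.
 */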

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length in bytes (16/24/32)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
ENDPROC(_aesni_enc1)
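
/*
 * How the TKEYP offsets above line up (a worked example, not new
 * behaviour): after "add $0x30, TKEYP", plus 0x20 more for 192-bit keys
 * and a further 0x20 for 256-bit keys, TKEYP sits far enough into the
 * schedule that the last round key is always at 0x70(TKEYP):
 *
 *	AES-128: TKEYP = KEYP+0x30, rounds 1..10 at -0x20 .. 0x70
 *	AES-192: TKEYP = KEYP+0x50, rounds 1..12 at -0x40 .. 0x70
 *	AES-256: TKEYP = KEYP+0x70, rounds 1..14 at -0x60 .. 0x70
 *
 * so the three key sizes can share a single fall-through instruction
 * tail instead of three separate round loops.
 */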

/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length in bytes (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1	# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
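
/*
 * Why four states at once: AESENC has a multi-cycle latency but can be
 * issued every cycle, so interleaving four independent blocks per round
 * key keeps the AES unit busy instead of stalling on each result.  The
 * callers below fall back to the one-block helpers for any tail shorter
 * than four blocks.
 */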

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# use the decryption key schedule
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP: key struct pointer (pointing at the decryption schedule)
 *	KLEN: key length in bytes (16/24/32)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP: key struct pointer (pointing at the decryption schedule)
 *	KLEN: key length in bytes (16/24/32)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1	# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
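
/*
 * The loop shape above (4-block main loop, 1-block tail) in rough C,
 * purely as a reading aid; enc1()/enc4() stand in for _aesni_enc1 and
 * _aesni_enc4 and are not real functions:
 *
 *	while (len >= 64) { enc4(out, in); in += 64; out += 64; len -= 64; }
 *	while (len >= 16) { enc1(out, in); in += 16; out += 16; len -= 16; }
 *
 * Any sub-16-byte residue is ignored; the glue code is expected to pass
 * whole blocks only.
 */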

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# check length
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# use the decryption key schedule
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
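
/*
 * CBC encryption is inherently serial: C[i] = E_k(P[i] ^ C[i-1]) with
 * C[-1] = IV, so every block needs the previous ciphertext and there is
 * no 4-wide loop here.  The updated IV is written back through IVP so
 * that chained calls continue correctly.
 */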

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# use the decryption key schedule
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
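
/*
 * Unlike encryption, CBC decryption parallelizes: P[i] = D_k(C[i]) ^
 * C[i-1] depends only on ciphertext, so four blocks can go through
 * _aesni_dec4 at once.  The #else path above re-reads blocks 0 and 1
 * from INP because 32-bit builds have too few XMM registers to keep all
 * four ciphertext copies live across the call.
 */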

#ifdef __x86_64__
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init: internal ABI
 * set up the registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR: == IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC: == 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret
ENDPROC(_aesni_inc_init)

/*
 * _aesni_inc: internal ABI
 * Increment IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR: == IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC: == 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV: incremented by 1
 * changed:
 *	CTR: == output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret
ENDPROC(_aesni_inc)
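
/*
 * What the pair above computes, in rough C (a sketch only; the 128-bit
 * counter is shown as two u64 halves and bswap128 stands in for the
 * PSHUFB byte reversal):
 *
 *	ctr.lo += 1;			// paddq INC, CTR
 *	if (++tctr_low == 0)		// add/jnc carry check in a GPR,
 *		ctr.hi += 1;		// since SSE has no 128-bit add
 *	iv = bswap128(ctr);		// back to a big-endian counter
 *
 * Mirroring the low qword in TCTR_LOW turns the carry test into a plain
 * flags check instead of extra vector compares.
 */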

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
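
/*
 * CTR mode turns the block cipher into a stream cipher:
 * C[i] = P[i] ^ E_k(ctr+i), so encryption and decryption are the same
 * routine and the four E_k computations per iteration are independent.
 * Each STATE register captures the counter before the next _aesni_inc,
 * giving counters ctr, ctr+1, ctr+2, ctr+3 for one 4-block batch, and
 * the advanced counter is stored back through IVP on exit.
 */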

/*
 * _aesni_gf128mul_x_ble: internal ABI
 * Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV: current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV: next IV
 * changed:
 *	CTR: == temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;

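/*
 * The macro is a branch-free doubling of the 128-bit tweak in GF(2^128)
 * with the XTS ("ble") bit order.  In rough C over two little-endian u64
 * halves (a sketch; the names are illustrative):
 *
 *	u64 carry = (s64)hi >> 63;	// psrad sign-fill of the top bit
 *	hi = (hi << 1) | (lo >> 63);	// paddq doubles each qword; the
 *	lo = (lo << 1) ^ (carry & 0x87);// mask adds carry and reduction
 *
 * pshufd $0x13 routes the two sign dwords to where the 0x87 and 0x01
 * words of GF128MUL_MASK sit, so a single pand/pxor applies both the
 * inter-qword carry and the 0x87 reduction.
 */
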
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl			# enc flag (arg 4)
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx		# decrypt: offset to the dec schedule
	cmoveq %rax, %r11		# decrypt: use _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP

	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11

	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	CALL_NOSPEC %r11

	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
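
/*
 * XTS recap: block i uses tweak T[i], with C[i] = E_k1(P[i] ^ T[i]) ^ T[i]
 * and T[i+1] = T[i] * x in GF(2^128); the initial tweak in (IVP) is
 * prepared by the glue code (encrypted with the second XTS key before
 * this routine runs).  Eight blocks are processed as two 4-wide batches,
 * and each tweak is parked in the destination buffer so it can be
 * re-read for the post-whitening XOR, which is why OUTP is written
 * before each CALL_NOSPEC and read back afterwards.
 */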

#endif