/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned). It makes no performance difference which instruction
 * is used on any CPU since Nehalem (the original Core i7) was released.
 * However, movaps is one byte shorter, so that is the one we use for now
 * (same for the unaligned variants).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

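/*
 * The partial-block paths below slide a byte offset across this contiguous
 * SHIFT_MASK / ALL_F / zero region, so a single unaligned 16-byte load
 * yields either a pshufb shuffle mask or a byte-granular AND mask. A
 * minimal C sketch of the same windowing idea (illustrative helper with an
 * assumed table name, not part of this file's interface):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static const uint8_t all_f_zero[32] = {
 *		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *		// remaining 16 bytes default to 0x00, as in .rodata above
 *	};
 *
 *	// Keep the low `keep` bytes of a block and zero the upper 16-keep
 *	// bytes, like the "lea ALL_F+16(%rip); sub %r13" sequence below.
 *	static void tail_mask(uint8_t mask[16], unsigned keep) // keep <= 16
 *	{
 *		memcpy(mask, all_f_zero + 16 - keep, 16);
 *	}
 */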
.text


#define	STACK_OFFSET    8*3

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

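/*
 * These offsets mirror the C-side struct gcm_context_data that %arg2
 * points at. For illustration only, a sketch of a matching declaration
 * (field names here are assumptions; the authoritative definition lives
 * in the C glue code):
 *
 *	struct gcm_context_data {
 *		u8  aad_hash[16];		// AadHash      (16*0)
 *		u64 aad_length;			// AadLen       (16*1)
 *		u64 in_length;			// InLen        (16*1+8)
 *		u8  partial_block_enc_key[16];	// PBlockEncKey (16*2)
 *		u8  orig_IV[16];		// OrigIV       (16*3)
 *		u8  current_counter[16];	// CurCount     (16*4)
 *		u64 partial_block_len;		// PBlockLen    (16*5)
 *		u64 unused;			// pad up to 16*6
 *		u8  hash_keys[16 * 8];		// HashKey .. HashKey_4_k
 *	};
 */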
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm
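/*
 * A hedged C sketch of what PRECOMPUTE leaves behind, assuming the
 * reference gf128_mul() sketched after GHASH_MUL below and the
 * illustrative struct layout shown next to the offset #defines above:
 *
 *	void precompute_hashkeys(struct gcm_context_data *ctx, const u8 h1[16])
 *	{
 *		u8 *hk = ctx->hash_keys;
 *
 *		memcpy(hk + 16 * 0, h1, 16);			// HashKey (<<1 mod poly)
 *		gf128_mul(hk + 16 * 1, hk + 16 * 0, h1);	// HashKey_2 = H^2
 *		gf128_mul(hk + 16 * 2, hk + 16 * 1, h1);	// HashKey_3 = H^3
 *		gf128_mul(hk + 16 * 3, hk + 16 * 2, h1);	// HashKey_4 = H^4
 *		// The HashKey_i_k slots then each hold
 *		// high64(H^i) ^ low64(H^i), the Karatsuba middle-term operand.
 *	}
 */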

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13, xmm0-xmm6 and xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

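/*
 * GCM_INIT consumes a pre-counter block J0 that the caller has already
 * laid out as salt || IV || 0x00000001 (see the function headers further
 * down). A sketch of building that block for the RFC4106 layout
 * (illustrative, not the kernel glue code):
 *
 *	void build_j0(u8 j0[16], const u8 salt[4], const u8 iv[8])
 *	{
 *		memcpy(j0, salt, 4);	// 4-byte salt from the SA
 *		memcpy(j0 + 4, iv, 8);	// 8-byte IV from the ESP payload
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;		// trailing 0x00000001
 *	}
 *
 * The SHUF_MASK/PSHUFB pair above byte-reflects J0 once up front so the
 * counter in CurCount can afterwards be bumped with a plain paddd.
 */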
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)

	xor	%r11d, %r11d		# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes

	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM %xmm2, %xmm1

_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0			# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

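/*
 * Control flow of GCM_ENC_DEC as a C-level sketch. INITIAL_BLOCKS_ENC_DEC
 * peels off (blocks mod 4) blocks and, when at least four full blocks
 * exist, encrypts four more ahead of time; the main loop then encrypts
 * four blocks while GHASHing the previous four, and GHASH_LAST_4 hashes
 * the final four. Helper names are illustrative stand-ins for the macros:
 *
 *	void gcm_enc_dec(unsigned long len)
 *	{
 *		unsigned long full = len & ~15UL;	// and $-16, %r13
 *		unsigned peel = (full >> 4) & 3;	// _initial_num_blocks_is_{0..3}_
 *
 *		initial_blocks(peel);		// + 4 blocks encrypted ahead
 *		if (full - 16 * peel) {		// at least four full blocks left
 *			unsigned long left = full - 16 * peel - 64;
 *			for (; left; left -= 64)
 *				ghash_4_encrypt_4_parallel();	// _crypt_by_4_
 *			ghash_last_4();				// _four_cipher_left_
 *		}
 *		if (len & 15)					// _zero_cipher_left_
 *			encrypt_and_mask_tail(len & 15);
 *	}
 */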
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	MOVQ_R64_XMM	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

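/*
 * GCM_COMPLETE is the tail of the GCM spec: fold the bit lengths of the
 * AAD and the ciphertext into the hash, encrypt Y0 and truncate. A
 * spec-level C sketch, ignoring the bit-reflected representation the asm
 * keeps internally (helper names are assumptions):
 *
 *	void gcm_complete(struct gcm_context_data *ctx, const void *aes_ctx,
 *			  u8 *auth_tag, unsigned long tag_len)
 *	{
 *		u8 lenblk[16], t[16];
 *
 *		put_unaligned_be64(ctx->aad_length * 8, lenblk);	// len(A) in bits
 *		put_unaligned_be64(ctx->in_length * 8, lenblk + 8);	// len(C) in bits
 *		xor16(ctx->aad_hash, lenblk);
 *		gf128_mul(ctx->aad_hash, ctx->aad_hash, ctx->hash_keys); // final GHASH
 *		aes_encrypt_block(aes_ctx, t, ctx->orig_IV);		// E(K, Y0)
 *		xor16(t, ctx->aad_hash);
 *		memcpy(auth_tag, t, tag_len);	// 16, 12 or 8 bytes (or 1..4)
 *	}
 */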
#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a0*b0)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

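/*
 * For reference, a bitwise C model of the same multiplication at the level
 * of the GCM spec. The 0xe1 reduction constant is the bit-reflected form
 * of the poly x^128 + x^127 + x^126 + x^121 + 1 quoted in the function
 * headers below; the asm instead uses PCLMULQDQ with a Karatsuba split and
 * the two-phase shift reduction seen above. Slow reference only:
 *
 *	#include <string.h>
 *	typedef unsigned char u8;
 *
 *	static void gf128_mul(u8 Z[16], const u8 X[16], const u8 Y[16])
 *	{
 *		u8 V[16], acc[16] = { 0 };
 *		int i, j;
 *
 *		memcpy(V, Y, 16);
 *		for (i = 0; i < 128; i++) {
 *			int lsb;
 *
 *			if (X[i / 8] & (0x80 >> (i % 8)))	// bit i of X, MSB first
 *				for (j = 0; j < 16; j++)
 *					acc[j] ^= V[j];
 *			lsb = V[15] & 1;
 *			for (j = 15; j > 0; j--)		// V >>= 1, whole block
 *				V[j] = (V[j] >> 1) | (V[j - 1] << 7);
 *			V[0] >>= 1;
 *			if (lsb)
 *				V[0] ^= 0xe1;	// reduce modulo the field polynomial
 *		}
 *		memcpy(Z, acc, 16);
 *	}
 */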
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

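/*
 * READ_PARTIAL_BLOCK exists so a tail of 1..15 bytes can be loaded without
 * reading past the end of the buffer. The C equivalent is just a bounded
 * copy into a zeroed block (sketch):
 *
 *	static void read_partial_block(const u8 *src, unsigned len, u8 dst[16])
 *	{
 *		memset(dst, 0, 16);
 *		memcpy(dst, src, len);	// 0 < len < 16, never over-reads
 *	}
 *
 * The asm gets the same effect with at most one 8-byte load plus a byte
 * loop, assembling the result in %rax before merging it into XMMDst.
 */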
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	\AAD, %r10		# %r10 = AAD
	mov	\AADLEN, %r11		# %r11 = aadLen
	pxor	\TMP7, \TMP7
	pxor	\TMP6, \TMP6

	cmp	$16, %r11
	jl	_get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	(%r10), \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	\TMP7, \TMP6
	GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	$16, %r10
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\@

	movdqu	\TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	$0, %r11
	je	_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	\TMP6, \TMP7
	GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	\TMP7, \TMP6

_get_AAD_done\@:
	movdqu	\TMP6, AadHash(%arg2)
.endm

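/*
 * The AAD is absorbed with the standard GHASH update, one 16-byte block at
 * a time, zero-padding the final partial block. A hedged sketch using the
 * illustrative helpers named in the other sketches (byte reflection
 * omitted):
 *
 *	void calc_aad_hash(struct gcm_context_data *ctx, const u8 *hashkey,
 *			   const u8 *aad, unsigned long aad_len)
 *	{
 *		u8 acc[16] = { 0 }, blk[16];
 *
 *		while (aad_len >= 16) {			// _get_AAD_blocks
 *			xor16(acc, aad);
 *			gf128_mul(acc, acc, hashkey);
 *			aad += 16;
 *			aad_len -= 16;
 *		}
 *		if (aad_len) {				// _get_AAD_rest
 *			read_partial_block(aad, aad_len, blk);
 *			xor16(acc, blk);
 *			gf128_mul(acc, acc, hashkey);
 *		}
 *		memcpy(ctx->aad_hash, acc, 16);
 *	}
 */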
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm3
	PSHUFB_XMM %xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

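/*
 * PARTIAL_BLOCK stitches a block that straddles two update calls: the
 * previous call left E(K, Yn) in PBlockEncKey and the bytes already used
 * in PBlockLen. A simplified spec-level sketch (the asm also folds the
 * masked block into the running hash; helper names are assumptions):
 *
 *	// Returns how many input bytes this call consumed.
 *	unsigned partial_block(struct gcm_context_data *ctx,
 *			       u8 *out, const u8 *in, unsigned long len)
 *	{
 *		unsigned used = ctx->partial_block_len, take, i;
 *
 *		if (!used)
 *			return 0;		// _partial_block_done_
 *		take = 16 - used;
 *		if (take > len)
 *			take = len;		// may still not fill the block
 *		for (i = 0; i < take; i++)
 *			out[i] = in[i] ^ ctx->partial_block_enc_key[used + i];
 *		ctx->partial_block_len = (used + take) % 16;
 *		if (!ctx->partial_block_len)
 *			ghash_fold_completed_block(ctx);	// GHASH_MUL path above
 *		return take;
 *	}
 */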
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14

	movdqu	AadHash(%arg2), %xmm\i		# load the current hash value

	# start AES for num_initial_blocks blocks

	movdqu	CurCount(%arg2), \XMM0		# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	ONE(%RIP),\TMP1
	MOVADQ	0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	\TMP1, \XMM0			# INCR Y0
.ifc \operation, dec
	movdqa	\XMM0, %xmm\index
.else
	MOVADQ	\XMM0, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index		# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg4 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11

.ifc \operation, dec
	movdqa	\TMP1, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	%xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
	pxor	\TMP1, \XMM1
	pxor	\TMP1, \XMM2
	pxor	\TMP1, \XMM3
	pxor	\TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM1
	AESENC	\TMP1, \XMM2
	AESENC	\TMP1, \XMM3
	AESENC	\TMP1, \XMM4
.endr
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	AESENC	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ	(%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM1
.ifc \operation, dec
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM1
.endif
	movdqu	16*1(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM2
.ifc \operation, dec
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM2
.endif
	movdqu	16*2(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM3
.ifc \operation, dec
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM3
.endif
	movdqu	16*3(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM4
.ifc \operation, dec
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM4
.else
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add	$64, %r11
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

aes_loop_par_enc\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqu	  HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1			# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqu	  HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqu	  HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqu	  HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shifting << 31
	pslld	  $30, \TMP3			# packed left shifting << 30
	pslld	  $25, \TMP4			# packed left shifting << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst		# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	AESENC	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1
	AESENCLAST \TMP1,\XMM0
.endm
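/*
 * The round count is derived from the key schedule rather than passed in:
 * keysize/4 + 5 maps 16/24/32-byte keys to 9/11/13 AESENC rounds plus one
 * AESENCLAST. In C terms, against struct crypto_aes_ctx (the aesenc/
 * aesenclast/xor16 helpers are illustrative wrappers for the instructions):
 *
 *	void encrypt_single_block(const struct crypto_aes_ctx *ctx, u8 blk[16])
 *	{
 *		int nr = ctx->key_length / 4 + 5;	// 9, 11 or 13
 *		int r;
 *
 *		xor16(blk, (const u8 *)ctx->key_enc);	// whitening, round key 0
 *		for (r = 1; r <= nr; r++)
 *			aesenc(blk, (const u8 *)&ctx->key_enc[4 * r]);
 *		aesenclast(blk, (const u8 *)&ctx->key_enc[4 * (nr + 1)]);
 *	}
 */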
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                         // Context data
*                    u8 *out,             // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,        // Ciphertext input
*                    u64 plaintext_len,   // Length of data in bytes for decryption.
*                    u8 *iv,              // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,     // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,       // Additional Authentication Data (AAD)
*                    u64 aad_len,         // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,        // Authenticated Tag output. The driver will compare this to the
*                                         // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len);   // Authenticated Tag Length in bytes. Valid values are 16
*                                         // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                    AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A1)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A2)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |               64-bit Extended Sequence Number {A1,A0}         |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                   AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*****************************************************************************/
ENTRY(aesni_gcm_enc)
        FUNC_SAVE

        GCM_INIT %arg6, arg7, arg8, arg9
        GCM_ENC_DEC enc

        GCM_COMPLETE arg10, arg11
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc)
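
/*
 * Editorial note, not part of the original interface: a minimal C sketch
 * of a one-shot aesni_gcm_enc() call, assuming the prototype documented
 * above. The buffer names, PLEN and the surrounding context are
 * hypothetical; the real callers live in aesni-intel_glue.c, and any use
 * of these routines must be bracketed by kernel_fpu_begin()/_end() since
 * they clobber XMM state:
 *
 *	u8 iv[16] __aligned(16);   // 4-byte salt | 8-byte IV | 0x00000001
 *	u8 aad[12];                // e.g. SPI + 64-bit extended seq number
 *	u8 tag[16];
 *
 *	kernel_fpu_begin();
 *	aesni_gcm_enc(aes_ctx, &gdata, out, in, PLEN, iv, hash_subkey,
 *		      aad, sizeof(aad), tag, sizeof(tag));
 *	kernel_fpu_end();
 */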

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                        // context data
*                     u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,     // Additional Authentication Data (AAD)
*                     u64 aad_len)       // Length of AAD in bytes.
*/
ENTRY(aesni_gcm_init)
        FUNC_SAVE
        GCM_INIT %arg3, %arg4, %arg5, %arg6
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                        // context data
*                     u8 *out,           // Ciphertext output. Encrypt in-place is allowed.
*                     const u8 *in,      // Plaintext input
*                     u64 plaintext_len) // Length of data in bytes for encryption.
*/
ENTRY(aesni_gcm_enc_update)
        FUNC_SAVE
        GCM_ENC_DEC enc
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                        // context data
*                     u8 *out,           // Plaintext output. Decrypt in-place is allowed.
*                     const u8 *in,      // Ciphertext input
*                     u64 plaintext_len) // Length of data in bytes for decryption.
*/
ENTRY(aesni_gcm_dec_update)
        FUNC_SAVE
        GCM_ENC_DEC dec
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,        // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                        // context data
*                     u8 *auth_tag,      // Authenticated Tag output.
*                     u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*/
ENTRY(aesni_gcm_finalize)
        FUNC_SAVE
        GCM_COMPLETE %arg3, %arg4
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_finalize)
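
/*
 * Editorial note: the four entry points above split aesni_gcm_enc/dec so
 * that a scatterlist can be processed in pieces. A hedged C sketch of the
 * expected calling sequence, assuming the prototypes documented above
 * (the chunk-walking details are hypothetical):
 *
 *	kernel_fpu_begin();
 *	aesni_gcm_init(aes_ctx, &gdata, iv, hash_subkey, aad, aad_len);
 *	while (chunks remain)	// one call per contiguous chunk
 *		aesni_gcm_enc_update(aes_ctx, &gdata, dst, src, chunk_len);
 *	aesni_gcm_finalize(aes_ctx, &gdata, tag, tag_len);
 *	kernel_fpu_end();
 *
 * struct gcm_context_data carries the partial GHASH and counter state
 * between calls, which is why every call takes the same data pointer.
 */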

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0
        movaps %xmm0, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

.align 4
_key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        movaps %xmm2, %xmm6
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, %xmm1
        shufps $0b01000100, %xmm0, %xmm6
        movaps %xmm6, (TKEYP)
        shufps $0b01001110, %xmm2, %xmm1
        movaps %xmm1, 0x10(TKEYP)
        add $0x20, TKEYP
        ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl KEYP
        movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
        movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
        movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
#endif
        movups (UKEYP), %xmm0           # user key (first 16 bytes)
        movaps %xmm0, (KEYP)
        lea 0x10(KEYP), TKEYP           # key addr
        movl %edx, 480(KEYP)
        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
        cmp $24, %dl
        jb .Lenc_key128
        je .Lenc_key192
        movups 0x10(UKEYP), %xmm2       # other user key
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_256a
        jmp .Ldec_key
.Lenc_key192:
        movq 0x10(UKEYP), %xmm2         # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call _key_expansion_192b
        jmp .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call _key_expansion_128
.Ldec_key:
        sub $0x10, TKEYP
        movaps (KEYP), %xmm0
        movaps (TKEYP), %xmm1
        movaps %xmm0, 240(TKEYP)
        movaps %xmm1, 240(KEYP)
        add $0x10, KEYP
        lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps (KEYP), %xmm0
        AESIMC %xmm0 %xmm1
        movaps %xmm1, (UKEYP)
        add $0x10, KEYP
        sub $0x10, UKEYP
        cmp TKEYP, KEYP
        jb .Ldec_key_loop
        xor AREG, AREG
#ifndef __x86_64__
        popl KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_set_key)
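
/*
 * Editorial note: the AES-128 branch above is the standard
 * AESKEYGENASSIST expansion. A hedged C intrinsics sketch of one round
 * step (the function name is hypothetical; _mm_aeskeygenassist_si128
 * takes an immediate round constant, hence the unrolled calls above):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i expand128_step(__m128i key, __m128i kga)
 *	{
 *		kga = _mm_shuffle_epi32(kga, 0xff);     // like pshufd $0xff
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 8));
 *		return _mm_xor_si128(key, kga);         // next round key
 *	}
 *	// round 1: key = expand128_step(key,
 *	//		_mm_aeskeygenassist_si128(key, 0x01));
 *
 * The shufps/pxor pairs in _key_expansion_128 compute the same running
 * XOR of shifted copies, using the zeroed %xmm4 as scratch. The tail at
 * .Ldec_key then builds the decryption schedule: the outer round keys
 * are copied into place and AESIMC (InvMixColumns) is applied to the
 * inner ones, stored in reverse order for _aesni_dec1/_aesni_dec4.
 */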

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+20)(%esp), INP       # src
#endif
        movl 480(KEYP), KLEN            # key length
        movups (INP), STATE             # input
        call _aesni_enc1
        movups STATE, (OUTP)            # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length (in bytes)
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE                 # round 0
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .Lenc128
        lea 0x20(TKEYP), TKEYP
        je .Lenc192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x50(TKEYP), KEY
        AESENC KEY STATE
.align 4
.Lenc192:
        movaps -0x40(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x30(TKEYP), KEY
        AESENC KEY STATE
.align 4
.Lenc128:
        movaps -0x20(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x10(TKEYP), KEY
        AESENC KEY STATE
        movaps (TKEYP), KEY
        AESENC KEY STATE
        movaps 0x10(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x20(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x30(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x40(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x50(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x60(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret
ENDPROC(_aesni_enc1)
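
/*
 * Editorial note: _aesni_enc1 is the unrolled form of the usual AES-NI
 * round loop. A hedged C intrinsics equivalent for the AES-128 (ten
 * round) case, with hypothetical names:
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i enc_block_128(__m128i state, const __m128i rk[11])
 *	{
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);	// round 0
 *		for (i = 1; i < 10; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[10]);
 *	}
 *
 * Instead of a loop, the assembly biases TKEYP by the key length so all
 * three key sizes share one tail: AES-256 enters at the top, AES-192 at
 * .Lenc192, and AES-128 at .Lenc128.
 */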

/*
 * _aesni_enc4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length (in bytes)
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE1                # round 0
        pxor KEY, STATE2
        pxor KEY, STATE3
        pxor KEY, STATE4
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .L4enc128
        lea 0x20(TKEYP), TKEYP
        je .L4enc192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x50(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
#.align 4
.L4enc192:
        movaps -0x40(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x30(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
#.align 4
.L4enc128:
        movaps -0x20(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x10(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps (TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x10(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x20(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x30(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x40(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x50(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x60(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
        ret
ENDPROC(_aesni_enc4)

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+20)(%esp), INP       # src
#endif
        mov 480(KEYP), KLEN             # key length
        add $240, KEYP
        movups (INP), STATE             # input
        call _aesni_dec1
        movups STATE, (OUTP)            # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length (in bytes)
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE                 # round 0
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .Ldec128
        lea 0x20(TKEYP), TKEYP
        je .Ldec192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x50(TKEYP), KEY
        AESDEC KEY STATE
.align 4
.Ldec192:
        movaps -0x40(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x30(TKEYP), KEY
        AESDEC KEY STATE
.align 4
.Ldec128:
        movaps -0x20(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x10(TKEYP), KEY
        AESDEC KEY STATE
        movaps (TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x10(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x20(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x30(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x40(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x50(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x60(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE
        ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length (in bytes)
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE1                # round 0
        pxor KEY, STATE2
        pxor KEY, STATE3
        pxor KEY, STATE4
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .L4dec128
        lea 0x20(TKEYP), TKEYP
        je .L4dec192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x50(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
.align 4
.L4dec192:
        movaps -0x40(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x30(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
.align 4
.L4dec128:
        movaps -0x20(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x10(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps (TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x10(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x20(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x30(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x40(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x50(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x60(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
        ret
ENDPROC(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+24)(%esp), INP       # src
        movl (FRAME_OFFSET+28)(%esp), LEN       # len
#endif
        test LEN, LEN                   # check length
        jz .Lecb_enc_ret
        mov 480(KEYP), KLEN
        cmp $16, LEN
        jb .Lecb_enc_ret
        cmp $64, LEN
        jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        call _aesni_enc4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lecb_enc_loop4
        cmp $16, LEN
        jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
        movups (INP), STATE1
        call _aesni_enc1
        movups STATE1, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
#endif
        FRAME_END
        ret
ENDPROC(aesni_ecb_enc)
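
/*
 * Editorial note: the loop structure above is plain strip-mining. A
 * hedged C sketch of the same control flow (enc4/enc1 stand in for the
 * internal helpers):
 *
 *	while (len >= 64) {		// four independent ECB blocks
 *		enc4(ctx, out, in);	// keeps 4 AESENC chains in flight
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {		// whole-block tail
 *		enc1(ctx, out, in);
 *		in += 16; out += 16; len -= 16;
 *	}
 *
 * ECB blocks are independent, so the 4-wide path hides the multi-cycle
 * AESENC latency; a trailing partial block (len < 16) is never written.
 */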

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len);
 */
ENTRY(aesni_ecb_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+24)(%esp), INP       # src
        movl (FRAME_OFFSET+28)(%esp), LEN       # len
#endif
        test LEN, LEN
        jz .Lecb_dec_ret
        mov 480(KEYP), KLEN
        add $240, KEYP
        cmp $16, LEN
        jb .Lecb_dec_ret
        cmp $64, LEN
        jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        call _aesni_dec4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lecb_dec_loop4
        cmp $16, LEN
        jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
        movups (INP), STATE1
        call _aesni_dec1
        movups STATE1, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
#endif
        FRAME_END
        ret
ENDPROC(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
#endif
        cmp $16, LEN
        jb .Lcbc_enc_ret
        mov 480(KEYP), KLEN
        movups (IVP), STATE             # load iv as initial state
.align 4
.Lcbc_enc_loop:
        movups (INP), IN                # load input
        pxor IN, STATE
        call _aesni_enc1
        movups STATE, (OUTP)            # store output
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lcbc_enc_loop
        movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        FRAME_END
        ret
ENDPROC(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
#endif
        cmp $16, LEN
        jb .Lcbc_dec_just_ret
        mov 480(KEYP), KLEN
        add $240, KEYP
        movups (IVP), IV
        cmp $64, LEN
        jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
        movups (INP), IN1
        movaps IN1, STATE1
        movups 0x10(INP), IN2
        movaps IN2, STATE2
#ifdef __x86_64__
        movups 0x20(INP), IN3
        movaps IN3, STATE3
        movups 0x30(INP), IN4
        movaps IN4, STATE4
#else
        movups 0x20(INP), IN1
        movaps IN1, STATE3
        movups 0x30(INP), IN2
        movaps IN2, STATE4
#endif
        call _aesni_dec4
        pxor IV, STATE1
#ifdef __x86_64__
        pxor IN1, STATE2
        pxor IN2, STATE3
        pxor IN3, STATE4
        movaps IN4, IV
#else
        pxor IN1, STATE4
        movaps IN2, IV
        movups (INP), IN1
        pxor IN1, STATE2
        movups 0x10(INP), IN2
        pxor IN2, STATE3
#endif
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lcbc_dec_loop4
        cmp $16, LEN
        jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
        movups (INP), IN
        movaps IN, STATE
        call _aesni_dec1
        pxor IV, STATE
        movups STATE, (OUTP)
        movaps IN, IV
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
        movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        FRAME_END
        ret
ENDPROC(aesni_cbc_dec)
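
/*
 * Editorial note: a hedged C sketch of the chaining above. CBC
 * encryption is inherently serial (each block consumes the previous
 * ciphertext), so aesni_cbc_enc has no 4-wide path; decryption XORs
 * after the block cipher, so four AESDEC chains can run in parallel
 * (xor_block/enc1 are hypothetical one-block helpers):
 *
 *	// encrypt: C[i] = E(P[i] ^ C[i-1]), with C[-1] = IV
 *	const u8 *prev = iv;
 *	for (i = 0; i < nblocks; i++) {
 *		u8 buf[16];
 *		xor_block(buf, p + 16 * i, prev);   // P[i] ^ C[i-1]
 *		enc1(ctx, c + 16 * i, buf);         // E_K(...)
 *		prev = c + 16 * i;                  // chain
 *	}
 *	// decrypt: P[i] = D(C[i]) ^ C[i-1] -- the D() calls are
 *	// independent, only the final XORs depend on neighbours
 *
 * The #else path in the 4-block decrypt loop reloads IN1/IN2 from memory
 * because 32-bit mode has only eight XMM registers, not enough to hold
 * four inputs, four states and the IV at once.
 */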

#ifdef __x86_64__
.pushsection .rodata
.align 16
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init: internal ABI
 *      setup registers used by _aesni_inc
 * input:
 *      IV
 * output:
 *      CTR:            == IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 *      INC:            == 1, in little endian
 *      BSWAP_MASK      == endian swapping mask
 */
.align 4
_aesni_inc_init:
        movaps .Lbswap_mask, BSWAP_MASK
        movaps IV, CTR
        PSHUFB_XMM BSWAP_MASK CTR
        mov $1, TCTR_LOW
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
        ret
ENDPROC(_aesni_inc_init)

/*
 * _aesni_inc: internal ABI
 *      Increase IV by 1, IV is in big endian
 * input:
 *      IV
 *      CTR:            == IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 *      INC:            == 1, in little endian
 *      BSWAP_MASK      == endian swapping mask
 * output:
 *      IV:             incremented by 1
 * changed:
 *      CTR:            == output IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 */
.align 4
_aesni_inc:
        paddq INC, CTR
        add $1, TCTR_LOW
        jnc .Linc_low
        pslldq $8, INC
        paddq INC, CTR
        psrldq $8, INC
.Linc_low:
        movaps CTR, IV
        PSHUFB_XMM BSWAP_MASK IV
        ret
ENDPROC(_aesni_inc)
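
/*
 * Editorial note: a hedged C sketch of the carry handling in _aesni_inc.
 * The counter is kept byte-reversed (little endian) in CTR so the low 64
 * bits can be bumped with a single paddq; SSE has no 128-bit add and no
 * carry flag, so TCTR_LOW mirrors the low qword in a general-purpose
 * register purely to detect the wrap:
 *
 *	ctr_lo += 1;			// paddq INC, CTR   (INC == 1)
 *	if (++tctr_low == 0)		// add $1, TCTR_LOW; jnc .Linc_low
 *		ctr_hi += 1;		// pslldq $8 / paddq / psrldq $8
 *	iv = byteswap128(ctr);		// PSHUFB_XMM BSWAP_MASK IV
 */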

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
        FRAME_BEGIN
        cmp $16, LEN
        jb .Lctr_enc_just_ret
        mov 480(KEYP), KLEN
        movups (IVP), IV
        call _aesni_inc_init
        cmp $64, LEN
        jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
        movaps IV, STATE1
        call _aesni_inc
        movups (INP), IN1
        movaps IV, STATE2
        call _aesni_inc
        movups 0x10(INP), IN2
        movaps IV, STATE3
        call _aesni_inc
        movups 0x20(INP), IN3
        movaps IV, STATE4
        call _aesni_inc
        movups 0x30(INP), IN4
        call _aesni_enc4
        pxor IN1, STATE1
        movups STATE1, (OUTP)
        pxor IN2, STATE2
        movups STATE2, 0x10(OUTP)
        pxor IN3, STATE3
        movups STATE3, 0x20(OUTP)
        pxor IN4, STATE4
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lctr_enc_loop4
        cmp $16, LEN
        jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        movaps IV, STATE
        call _aesni_inc
        movups (INP), IN
        call _aesni_enc1
        pxor IN, STATE
        movups STATE, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lctr_enc_loop1
.Lctr_enc_ret:
        movups IV, (IVP)
.Lctr_enc_just_ret:
        FRAME_END
        ret
ENDPROC(aesni_ctr_enc)

/*
 * _aesni_gf128mul_x_ble: internal ABI
 *      Multiply in GF(2^128) for XTS IVs
 * input:
 *      IV:     current IV
 *      GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *      IV:     next IV
 * changed:
 *      CTR:    == temporary value
 */
#define _aesni_gf128mul_x_ble() \
        pshufd $0x13, IV, CTR; \
        paddq IV, IV; \
        psrad $31, CTR; \
        pand GF128MUL_MASK, CTR; \
        pxor CTR, IV;

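/*
 * Editorial note: a hedged C sketch of the macro above. In the XTS
 * ("ble") convention, multiplying the tweak by x is a 128-bit left
 * shift; if the bit shifted out of the top is set, the result is
 * reduced by XORing the low byte with 0x87 (x^128 = x^7 + x^2 + x + 1):
 *
 *	void gf128mul_x_ble(u64 t[2])	// t[0] is the low qword
 *	{
 *		u64 carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);
 *	}
 *
 * The macro is branch-free: paddq doubles each qword but drops the
 * carries, so pshufd/psrad smear bit 127 and bit 63 into dword masks,
 * pand picks 0x87 (reduction) and 0x01 (low-to-high qword carry) from
 * .Lgf128mul_x_ble_mask, and pxor patches both into the doubled IV.
 */
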
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                       bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
        FRAME_BEGIN
        cmpb $0, %cl
        movl $0, %ecx
        movl $240, %r10d
        leaq _aesni_enc4, %r11
        leaq _aesni_dec4, %rax
        cmovel %r10d, %ecx
        cmoveq %rax, %r11

        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
        movups (IVP), IV

        mov 480(KEYP), KLEN
        addq %rcx, KEYP

        movdqa IV, STATE1
        movdqu 0x00(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x10(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x20(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x30(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x30(OUTP)

        CALL_NOSPEC %r11

        movdqu 0x00(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE1
        movdqu 0x40(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x40(OUTP)

        movdqu 0x10(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x50(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x50(OUTP)

        movdqu 0x20(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x60(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x60(OUTP)

        movdqu 0x30(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x30(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x70(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x70(OUTP)

        _aesni_gf128mul_x_ble()
        movups IV, (IVP)

        CALL_NOSPEC %r11

        movdqu 0x40(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x40(OUTP)

        movdqu 0x50(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x50(OUTP)

        movdqu 0x60(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x60(OUTP)

        movdqu 0x70(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x70(OUTP)

        FRAME_END
        ret
ENDPROC(aesni_xts_crypt8)

#endif