/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned). It hasn't made a performance difference which
 * instruction is used since Nehalem (the original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for the unaligned variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

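/*
 * For reference only (not assembled here): the same aligned/unaligned
 * choice expressed in C intrinsics. _mm_load_si128 requires a 16-byte
 * aligned pointer (like movaps/movdqa); _mm_loadu_si128 does not
 * (like movups/movdqu).
 *
 *	#include <emmintrin.h>
 *
 *	static __m128i load_block(const void *p, int aligned)
 *	{
 *		return aligned ? _mm_load_si128((const __m128i *)p)
 *			       : _mm_loadu_si128((const __m128i *)p);
 *	}
 */
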
#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text

#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define arg11 STACK_OFFSET+40(%r14)
#define keysize 2*15*16(%arg1)
#endif

#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
.endm


.macro FUNC_RESTORE
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
.endm


# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT

	mov	arg9, %r11
	mov	%r11, AadLen(%arg2)	# ctx_data.aad_length = aad_length
	xor	%r11, %r11
	mov	%r11, InLen(%arg2)	# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov	%arg6, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	mov	arg7, %r12
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)

	CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
	%xmm5 %xmm6
.endm

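/*
 * Illustrative C sketch of the HashKey precomputation above (assumes hi/lo
 * hold the byte-reflected hash key as two 64-bit halves). The TWOONE/POLY
 * masking in GCM_INIT is a branch-free form of the "xor in the polynomial
 * if a bit was shifted out" step, written here with a branch:
 *
 *	typedef unsigned long long u64;
 *
 *	static void hashkey_shl1_mod_poly(u64 *hi, u64 *lo)
 *	{
 *		int carry = (int)(*hi >> 63);	// bit shifted out at the top
 *
 *		*hi = (*hi << 1) | (*lo >> 63);
 *		*lo <<= 1;
 *		if (carry) {			// reduce mod the GHASH poly
 *			*hi ^= 0xC200000000000000ULL;
 *			*lo ^= 0x0000000000000001ULL;
 *		}
 *	}
 */
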
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%rsp), %xmm13
	add	%arg5, InLen(%arg2)
	mov	%arg5, %r13		# save the number of bytes
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12
.ifc \operation, dec
	movdqa	%xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0			# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

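/*
 * Sketch of the dispatch arithmetic above: the count of "initial" blocks
 * handled before the 4-way main loop is (number of full blocks) mod 4,
 * which "and $(3<<4), %r12" extracts directly from bits 5:4 of the byte
 * count (illustrative C):
 *
 *	static unsigned int initial_blocks(unsigned long long nbytes)
 *	{
 *		unsigned long long full = nbytes & ~15ULL;   // whole blocks
 *
 *		return (unsigned int)((full >> 4) & 3);	     // (full/16) % 4
 *	}
 */
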
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%rsp), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	MOVQ_R64_XMM	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	arg10, %r10		# %r10 = authTag
	mov	arg11, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

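/*
 * The length-block construction above, as an illustrative C sketch;
 * ghash_mul() is a hypothetical stand-in for the GHASH_MUL macro. The
 * final GHASH input is len(A)||len(C) with both lengths in bits, and
 * the tag is E(K, Y0) XORed with the resulting hash.
 *
 *	typedef unsigned long long u64;
 *	struct be128 { u64 hi, lo; };
 *
 *	extern void ghash_mul(struct be128 *acc, const struct be128 *h);
 *
 *	static void gcm_len_block(struct be128 *acc, const struct be128 *h,
 *				  u64 aad_bytes, u64 text_bytes)
 *	{
 *		acc->hi ^= aad_bytes * 8;	// len(A) in bits
 *		acc->lo ^= text_bytes * 8;	// len(C) in bits
 *		ghash_mul(acc, h);		// final GHASH computation
 *	}
 */
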
#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0) + (a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

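/*
 * For reference (not assembled): the same Karatsuba carry-less multiply,
 * before reduction, in C intrinsics. Requires a compiler with -mpclmul;
 * _mm_clmulepi64_si128 is the PCLMULQDQ instruction used above.
 *
 *	#include <wmmintrin.h>
 *
 *	// 128x128 -> 256-bit carry-less multiply; result in hi:lo
 *	static void clmul_karatsuba(__m128i a, __m128i b,
 *				    __m128i *lo, __m128i *hi)
 *	{
 *		__m128i t1 = _mm_clmulepi64_si128(a, b, 0x11); // a1*b1
 *		__m128i t0 = _mm_clmulepi64_si128(a, b, 0x00); // a0*b0
 *		__m128i am = _mm_xor_si128(a, _mm_shuffle_epi32(a, 78));
 *		__m128i bm = _mm_xor_si128(b, _mm_shuffle_epi32(b, 78));
 *		__m128i tm = _mm_clmulepi64_si128(am, bm, 0x00);
 *
 *		// middle term: (a1+a0)*(b1+b0) + a0*b0 + a1*b1
 *		tm  = _mm_xor_si128(tm, _mm_xor_si128(t0, t1));
 *		*lo = _mm_xor_si128(t0, _mm_slli_si128(tm, 8));
 *		*hi = _mm_xor_si128(t1, _mm_srli_si128(tm, 8));
 *	}
 */
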
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

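/*
 * C sketch of the same partial read (illustrative): bytes are accumulated
 * from the last byte down, so the result equals a little-endian load of
 * len bytes zero-padded to the full width.
 *
 *	typedef unsigned long long u64;
 *
 *	static u64 read_partial_qword(const unsigned char *p, unsigned int len)
 *	{
 *		u64 v = 0;
 *
 *		while (len--)
 *			v = (v << 8) | p[len];	// mirrors the shl/mov loop
 *		return v;
 *	}
 */
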
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg8, %r10		# %r10 = AAD
	mov	   arg9, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm

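/*
 * Equivalent control flow in C (sketch only; ghash_mul() and
 * load_reflected() are hypothetical stand-ins for GHASH_MUL and the
 * byte-reflecting loads above):
 *
 *	struct be128 { unsigned long long hi, lo; };
 *
 *	extern void ghash_mul(struct be128 *acc, const struct be128 *h);
 *	extern struct be128 load_reflected(const unsigned char *p, int len);
 *
 *	static struct be128 calc_aad_hash(const unsigned char *aad,
 *					  unsigned long long len,
 *					  const struct be128 *h)
 *	{
 *		struct be128 acc = { 0, 0 }, blk;
 *
 *		for (; len >= 16; aad += 16, len -= 16) {
 *			blk = load_reflected(aad, 16);
 *			acc.hi ^= blk.hi;
 *			acc.lo ^= blk.lo;
 *			ghash_mul(&acc, h);	// acc = (acc ^ block) * H
 *		}
 *		if (len) {			// zero-padded tail block
 *			blk = load_reflected(aad, (int)len);
 *			acc.hi ^= blk.hi;
 *			acc.lo ^= blk.lo;
 *			ghash_mul(&acc, h);
 *		}
 *		return acc;
 *	}
 */
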
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14

	movdqu	   AadHash(%arg2), %xmm\i	# XMM0 = Y0

	xor	   %r11, %r11	# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks

	movdqu	   CurCount(%arg2), \XMM0	# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0			# INCR Y0
.ifc \operation, dec
	movdqa	   \XMM0, %xmm\index
.else
	MOVADQ	   \XMM0, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index		# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

aes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg4 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

.ifc \operation, dec
	movdqa	   \TMP1, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0			# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap

	paddd	   \TMP1, \XMM0			# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap

	paddd	   \TMP1, \XMM0			# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap

	paddd	   \TMP1, \XMM0			# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
.ifc \operation, dec
	movdqu	   \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
.endif
	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
.ifc \operation, dec
	movdqu	   \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
.endif
	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
.ifc \operation, dec
	movdqu	   \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
.endif
	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
.ifc \operation, dec
	movdqu	   \XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
.else
	movdqu	   \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1		# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4		# perform a 16 byte swap

_initial_blocks_done\@:

.endm

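/*
 * Note on the counter handling above: the counter block is kept
 * byte-reflected so it can be bumped with a plain 32-bit add
 * (paddd ONE(%rip)) and is swapped back to GCM's big-endian layout with
 * PSHUFB before each encryption. Simplified sketch of the low 32 bits
 * only (illustrative; the real SHUF_MASK swap reflects all 16 bytes):
 *
 *	static unsigned int next_counter_word(unsigned int *ctr)
 *	{
 *		*ctr += 1;				// paddd ONE
 *		return __builtin_bswap32(*ctr);		// PSHUFB byte swap
 *	}
 */
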
/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey^4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey^3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey^2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shifting << 31
	pslld	  $30, \TMP3			# packed left shifting << 30
	pslld	  $25, \TMP4			# packed left shifting << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
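
/*
 * The "shr $2 / add $5" idiom derives the AESENC round count from the
 * key length in bytes stored in the context (16/24/32 for
 * AES-128/192/256); the count excludes the final AESENCLAST round.
 * Illustrative C:
 *
 *	static int aesenc_rounds(int key_length_bytes)
 *	{
 *		return key_length_bytes / 4 + 5;   // 16->9, 24->11, 32->13
 *	}
 */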
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Plaintext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len,  // Length of data in bytes for decryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output. The driver will compare this to the
*                                        // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16
*                                        // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT
	GCM_ENC_DEC dec
	GCM_COMPLETE
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)
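
/*
 * Caller-side sketch (illustrative; the real glue code lives in
 * arch/x86/crypto/aesni-intel_glue.c): decrypt, then compare the computed
 * tag against the received one in constant time before accepting the
 * plaintext.
 *
 *	u8 authTag[16];
 *
 *	aesni_gcm_dec(aes_ctx, &data, dst, src, len, iv, hash_subkey,
 *		      aad, aad_len, authTag, auth_tag_len);
 *	if (crypto_memneq(authTag, received_tag, auth_tag_len))
 *		return -EBADMSG;	// tag mismatch: reject plaintext
 */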


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT
	GCM_ENC_DEC enc
	GCM_COMPLETE
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)

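/*
 * The shufps/pxor ladder above xors each key word with all previous words,
 * using %xmm4 (pre-zeroed by the caller) as scratch. Rough intrinsics
 * equivalent of one AES-128 expansion step (illustrative; requires -maes):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i expand_step_128(__m128i key, __m128i kga)
 *	{
 *		kga = _mm_shuffle_epi32(kga, 0xff);   // pshufd $0b11111111
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		return _mm_xor_si128(key, kga);
 *	}
 *
 *	// e.g. round 1: expand_step_128(k0,
 *	//		_mm_aeskeygenassist_si128(k0, 0x01))
 */
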
1592/*
1593 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1594 * unsigned int key_len)
1595 */
1596ENTRY(aesni_set_key)
8691ccd7 1597 FRAME_BEGIN
0d258efb
MK
1598#ifndef __x86_64__
1599 pushl KEYP
8691ccd7
JP
1600 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1601 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1602 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
0d258efb
MK
1603#endif
1604 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1605 movaps %xmm0, (KEYP)
1606 lea 0x10(KEYP), TKEYP # key addr
1607 movl %edx, 480(KEYP)
54b6a1bd
HY
1608 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1609 cmp $24, %dl
1610 jb .Lenc_key128
1611 je .Lenc_key192
0d258efb
MK
1612 movups 0x10(UKEYP), %xmm2 # other user key
1613 movaps %xmm2, (TKEYP)
1614 add $0x10, TKEYP
b369e521 1615 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
54b6a1bd 1616 call _key_expansion_256a
b369e521 1617 AESKEYGENASSIST 0x1 %xmm0 %xmm1
54b6a1bd 1618 call _key_expansion_256b
b369e521 1619 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
54b6a1bd 1620 call _key_expansion_256a
b369e521 1621 AESKEYGENASSIST 0x2 %xmm0 %xmm1
54b6a1bd 1622 call _key_expansion_256b
b369e521 1623 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
54b6a1bd 1624 call _key_expansion_256a
b369e521 1625 AESKEYGENASSIST 0x4 %xmm0 %xmm1
54b6a1bd 1626 call _key_expansion_256b
b369e521 1627 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
54b6a1bd 1628 call _key_expansion_256a
b369e521 1629 AESKEYGENASSIST 0x8 %xmm0 %xmm1
54b6a1bd 1630 call _key_expansion_256b
b369e521 1631 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
54b6a1bd 1632 call _key_expansion_256a
b369e521 1633 AESKEYGENASSIST 0x10 %xmm0 %xmm1
54b6a1bd 1634 call _key_expansion_256b
b369e521 1635 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
54b6a1bd 1636 call _key_expansion_256a
b369e521 1637 AESKEYGENASSIST 0x20 %xmm0 %xmm1
54b6a1bd 1638 call _key_expansion_256b
b369e521 1639 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
54b6a1bd
HY
1640 call _key_expansion_256a
1641 jmp .Ldec_key
1642.Lenc_key192:
0d258efb 1643 movq 0x10(UKEYP), %xmm2 # other user key
b369e521 1644 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
54b6a1bd 1645 call _key_expansion_192a
b369e521 1646 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
54b6a1bd 1647 call _key_expansion_192b
b369e521 1648 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
54b6a1bd 1649 call _key_expansion_192a
b369e521 1650 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
54b6a1bd 1651 call _key_expansion_192b
b369e521 1652 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
54b6a1bd 1653 call _key_expansion_192a
b369e521 1654 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
54b6a1bd 1655 call _key_expansion_192b
b369e521 1656 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
54b6a1bd 1657 call _key_expansion_192a
b369e521 1658 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
54b6a1bd
HY
1659 call _key_expansion_192b
1660 jmp .Ldec_key
1661.Lenc_key128:
b369e521 1662 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
54b6a1bd 1663 call _key_expansion_128
b369e521 1664 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
54b6a1bd 1665 call _key_expansion_128
b369e521 1666 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
54b6a1bd 1667 call _key_expansion_128
b369e521 1668 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
54b6a1bd 1669 call _key_expansion_128
b369e521 1670 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
54b6a1bd 1671 call _key_expansion_128
b369e521 1672 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
54b6a1bd 1673 call _key_expansion_128
b369e521 1674 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
54b6a1bd 1675 call _key_expansion_128
b369e521 1676 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
54b6a1bd 1677 call _key_expansion_128
b369e521 1678 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
54b6a1bd 1679 call _key_expansion_128
b369e521 1680 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
54b6a1bd
HY
1681 call _key_expansion_128
1682.Ldec_key:
0d258efb
MK
1683 sub $0x10, TKEYP
1684 movaps (KEYP), %xmm0
1685 movaps (TKEYP), %xmm1
1686 movaps %xmm0, 240(TKEYP)
1687 movaps %xmm1, 240(KEYP)
1688 add $0x10, KEYP
1689 lea 240-16(TKEYP), UKEYP
54b6a1bd
HY
1690.align 4
1691.Ldec_key_loop:
0d258efb 1692 movaps (KEYP), %xmm0
b369e521 1693 AESIMC %xmm0 %xmm1
0d258efb
MK
1694 movaps %xmm1, (UKEYP)
1695 add $0x10, KEYP
1696 sub $0x10, UKEYP
1697 cmp TKEYP, KEYP
54b6a1bd 1698 jb .Ldec_key_loop
0d258efb
MK
1699 xor AREG, AREG
1700#ifndef __x86_64__
1701 popl KEYP
1702#endif
8691ccd7 1703 FRAME_END
54b6a1bd 1704 ret
8309b745 1705ENDPROC(aesni_set_key)
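
The key-schedule walkthrough above is dense, so here is a minimal C sketch (not the kernel's code) of its two building blocks: one AES-128 expansion step driven by AESKEYGENASSIST, and the inverse schedule that the .Ldec_key loop derives with AESIMC. The names expand128_step, invert_key_schedule, rk, and RCON are illustrative assumptions; the asm's _key_expansion_128 uses a shufps-based fold that is equivalent to the byte shifts below.

        #include <wmmintrin.h>  /* _mm_aeskeygenassist_si128, _mm_aesimc_si128 */

        /* One AES-128 round-key step, used as:
         * rk[i+1] = expand128_step(rk[i],
         *                          _mm_aeskeygenassist_si128(rk[i], RCON));
         * with RCON = 0x01, 0x02, 0x04, ... 0x1b, 0x36 as in the rounds above. */
        static __m128i expand128_step(__m128i key, __m128i kga)
        {
                kga = _mm_shuffle_epi32(kga, 0xff);  /* broadcast SubWord/RotWord dword */
                key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
                key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
                key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
                return _mm_xor_si128(key, kga);
        }

        /* What .Ldec_key computes: the Equivalent Inverse Cipher wants the
         * encryption round keys in reverse order, with InvMixColumns (AESIMC)
         * applied to all but the first and last. */
        static void invert_key_schedule(const __m128i *enc_rk, __m128i *dec_rk,
                                        int nrounds)
        {
                dec_rk[0] = enc_rk[nrounds];            /* last enc key first */
                for (int i = 1; i < nrounds; i++)
                        dec_rk[i] = _mm_aesimc_si128(enc_rk[nrounds - i]);
                dec_rk[nrounds] = enc_rk[0];            /* round-0 key last */
        }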
54b6a1bd
HY
1706
1707/*
1708 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1709 */
1710ENTRY(aesni_enc)
8691ccd7 1711 FRAME_BEGIN
0d258efb
MK
1712#ifndef __x86_64__
1713 pushl KEYP
1714 pushl KLEN
8691ccd7
JP
1715 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1716 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1717 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 1718#endif
54b6a1bd
HY
1719 movl 480(KEYP), KLEN # key length
1720 movups (INP), STATE # input
1721 call _aesni_enc1
1722 movups STATE, (OUTP) # output
0d258efb
MK
1723#ifndef __x86_64__
1724 popl KLEN
1725 popl KEYP
1726#endif
8691ccd7 1727 FRAME_END
54b6a1bd 1728 ret
8309b745 1729ENDPROC(aesni_enc)
54b6a1bd
HY
1730
1731/*
1732 * _aesni_enc1: internal ABI
1733 * input:
1734 * KEYP: key struct pointer
1735 * KLEN: key length
1736 * STATE: initial state (input)
1737 * output:
1738 * STATE: final state (output)
1739 * changed:
1740 * KEY
1741 * TKEYP (T1)
1742 */
0d258efb 1743.align 4
54b6a1bd
HY
1744_aesni_enc1:
1745 movaps (KEYP), KEY # key
1746 mov KEYP, TKEYP
1747 pxor KEY, STATE # round 0
1748 add $0x30, TKEYP
1749 cmp $24, KLEN
1750 jb .Lenc128
1751 lea 0x20(TKEYP), TKEYP
1752 je .Lenc192
1753 add $0x20, TKEYP
1754 movaps -0x60(TKEYP), KEY
b369e521 1755 AESENC KEY STATE
54b6a1bd 1756 movaps -0x50(TKEYP), KEY
b369e521 1757 AESENC KEY STATE
54b6a1bd
HY
1758.align 4
1759.Lenc192:
1760 movaps -0x40(TKEYP), KEY
b369e521 1761 AESENC KEY STATE
54b6a1bd 1762 movaps -0x30(TKEYP), KEY
b369e521 1763 AESENC KEY STATE
54b6a1bd
HY
1764.align 4
1765.Lenc128:
1766 movaps -0x20(TKEYP), KEY
b369e521 1767 AESENC KEY STATE
54b6a1bd 1768 movaps -0x10(TKEYP), KEY
b369e521 1769 AESENC KEY STATE
54b6a1bd 1770 movaps (TKEYP), KEY
b369e521 1771 AESENC KEY STATE
54b6a1bd 1772 movaps 0x10(TKEYP), KEY
b369e521 1773 AESENC KEY STATE
54b6a1bd 1774 movaps 0x20(TKEYP), KEY
b369e521 1775 AESENC KEY STATE
54b6a1bd 1776 movaps 0x30(TKEYP), KEY
b369e521 1777 AESENC KEY STATE
54b6a1bd 1778 movaps 0x40(TKEYP), KEY
b369e521 1779 AESENC KEY STATE
54b6a1bd 1780 movaps 0x50(TKEYP), KEY
b369e521 1781 AESENC KEY STATE
54b6a1bd 1782 movaps 0x60(TKEYP), KEY
b369e521 1783 AESENC KEY STATE
54b6a1bd 1784 movaps 0x70(TKEYP), KEY
b369e521 1785 AESENCLAST KEY STATE
54b6a1bd 1786 ret
8309b745 1787ENDPROC(_aesni_enc1)
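
For readers tracing the jb/je dispatch above: _aesni_enc1 enters the unrolled round sequence at three different depths so that one body serves all key sizes. A minimal C sketch of the same computation, assuming rk[] holds the expanded round keys and key_len is 16, 24, or 32 bytes (aesni_enc_block is an illustrative name, not a kernel function):

        #include <wmmintrin.h>  /* _mm_aesenc_si128, _mm_aesenclast_si128 */

        static __m128i aesni_enc_block(const __m128i *rk, int key_len, __m128i st)
        {
                int nrounds = key_len / 4 + 6;  /* 16 -> 10, 24 -> 12, 32 -> 14 */
                st = _mm_xor_si128(st, rk[0]);  /* round 0 is a plain AddRoundKey */
                for (int i = 1; i < nrounds; i++)
                        st = _mm_aesenc_si128(st, rk[i]);
                return _mm_aesenclast_si128(st, rk[nrounds]);
        }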
54b6a1bd
HY
1788
1789/*
1790 * _aesni_enc4: internal ABI
1791 * input:
1792 * KEYP: key struct pointer
1793 * KLEN: key length
1794 * STATE1: initial state (input)
1795 * STATE2
1796 * STATE3
1797 * STATE4
1798 * output:
1799 * STATE1: final state (output)
1800 * STATE2
1801 * STATE3
1802 * STATE4
1803 * changed:
1804 * KEY
1805 * TKEYP (T1)
1806 */
0d258efb 1807.align 4
54b6a1bd
HY
1808_aesni_enc4:
1809 movaps (KEYP), KEY # key
1810 mov KEYP, TKEYP
1811 pxor KEY, STATE1 # round 0
1812 pxor KEY, STATE2
1813 pxor KEY, STATE3
1814 pxor KEY, STATE4
1815 add $0x30, TKEYP
1816 cmp $24, KLEN
1817 jb .L4enc128
1818 lea 0x20(TKEYP), TKEYP
1819 je .L4enc192
1820 add $0x20, TKEYP
1821 movaps -0x60(TKEYP), KEY
b369e521
HY
1822 AESENC KEY STATE1
1823 AESENC KEY STATE2
1824 AESENC KEY STATE3
1825 AESENC KEY STATE4
54b6a1bd 1826 movaps -0x50(TKEYP), KEY
b369e521
HY
1827 AESENC KEY STATE1
1828 AESENC KEY STATE2
1829 AESENC KEY STATE3
1830 AESENC KEY STATE4
54b6a1bd
HY
1831.align 4
1832.L4enc192:
1833 movaps -0x40(TKEYP), KEY
b369e521
HY
1834 AESENC KEY STATE1
1835 AESENC KEY STATE2
1836 AESENC KEY STATE3
1837 AESENC KEY STATE4
54b6a1bd 1838 movaps -0x30(TKEYP), KEY
b369e521
HY
1839 AESENC KEY STATE1
1840 AESENC KEY STATE2
1841 AESENC KEY STATE3
1842 AESENC KEY STATE4
54b6a1bd
HY
1843.align 4
1844.L4enc128:
1845 movaps -0x20(TKEYP), KEY
b369e521
HY
1846 AESENC KEY STATE1
1847 AESENC KEY STATE2
1848 AESENC KEY STATE3
1849 AESENC KEY STATE4
54b6a1bd 1850 movaps -0x10(TKEYP), KEY
b369e521
HY
1851 AESENC KEY STATE1
1852 AESENC KEY STATE2
1853 AESENC KEY STATE3
1854 AESENC KEY STATE4
54b6a1bd 1855 movaps (TKEYP), KEY
b369e521
HY
1856 AESENC KEY STATE1
1857 AESENC KEY STATE2
1858 AESENC KEY STATE3
1859 AESENC KEY STATE4
54b6a1bd 1860 movaps 0x10(TKEYP), KEY
b369e521
HY
1861 AESENC KEY STATE1
1862 AESENC KEY STATE2
1863 AESENC KEY STATE3
1864 AESENC KEY STATE4
54b6a1bd 1865 movaps 0x20(TKEYP), KEY
b369e521
HY
1866 AESENC KEY STATE1
1867 AESENC KEY STATE2
1868 AESENC KEY STATE3
1869 AESENC KEY STATE4
54b6a1bd 1870 movaps 0x30(TKEYP), KEY
b369e521
HY
1871 AESENC KEY STATE1
1872 AESENC KEY STATE2
1873 AESENC KEY STATE3
1874 AESENC KEY STATE4
54b6a1bd 1875 movaps 0x40(TKEYP), KEY
b369e521
HY
1876 AESENC KEY STATE1
1877 AESENC KEY STATE2
1878 AESENC KEY STATE3
1879 AESENC KEY STATE4
54b6a1bd 1880 movaps 0x50(TKEYP), KEY
b369e521
HY
1881 AESENC KEY STATE1
1882 AESENC KEY STATE2
1883 AESENC KEY STATE3
1884 AESENC KEY STATE4
54b6a1bd 1885 movaps 0x60(TKEYP), KEY
b369e521
HY
1886 AESENC KEY STATE1
1887 AESENC KEY STATE2
1888 AESENC KEY STATE3
1889 AESENC KEY STATE4
54b6a1bd 1890 movaps 0x70(TKEYP), KEY
b369e521
HY
1891 AESENCLAST KEY STATE1 # last round
1892 AESENCLAST KEY STATE2
1893 AESENCLAST KEY STATE3
1894 AESENCLAST KEY STATE4
54b6a1bd 1895 ret
8309b745 1896ENDPROC(_aesni_enc4)
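
The point of the 4-wide variant is latency hiding: AESENC has multi-cycle latency but pipelines well, so applying each round key to four independent states keeps the unit busy, and each key is loaded only once. A hedged sketch of the same schedule (function and parameter names are assumptions):

        #include <wmmintrin.h>

        static void aesni_enc4_blocks(const __m128i *rk, int nrounds, __m128i st[4])
        {
                for (int j = 0; j < 4; j++)
                        st[j] = _mm_xor_si128(st[j], rk[0]);
                for (int i = 1; i < nrounds; i++)
                        for (int j = 0; j < 4; j++)  /* one key, four states */
                                st[j] = _mm_aesenc_si128(st[j], rk[i]);
                for (int j = 0; j < 4; j++)
                        st[j] = _mm_aesenclast_si128(st[j], rk[nrounds]);
        }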
54b6a1bd
HY
1897
1898/*
1899 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1900 */
1901ENTRY(aesni_dec)
8691ccd7 1902 FRAME_BEGIN
0d258efb
MK
1903#ifndef __x86_64__
1904 pushl KEYP
1905 pushl KLEN
8691ccd7
JP
1906 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1907 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1908 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 1909#endif
54b6a1bd
HY
1910 mov 480(KEYP), KLEN # key length
1911 add $240, KEYP
1912 movups (INP), STATE # input
1913 call _aesni_dec1
1914 movups STATE, (OUTP) # output
0d258efb
MK
1915#ifndef __x86_64__
1916 popl KLEN
1917 popl KEYP
1918#endif
8691ccd7 1919 FRAME_END
54b6a1bd 1920 ret
8309b745 1921ENDPROC(aesni_dec)
54b6a1bd
HY
1922
1923/*
1924 * _aesni_dec1: internal ABI
1925 * input:
1926 * KEYP: key struct pointer
1927 * KLEN: key length
1928 * STATE: initial state (input)
1929 * output:
1930 * STATE: final state (output)
1931 * changed:
1932 * KEY
1933 * TKEYP (T1)
1934 */
0d258efb 1935.align 4
54b6a1bd
HY
1936_aesni_dec1:
1937 movaps (KEYP), KEY # key
1938 mov KEYP, TKEYP
1939 pxor KEY, STATE # round 0
1940 add $0x30, TKEYP
1941 cmp $24, KLEN
1942 jb .Ldec128
1943 lea 0x20(TKEYP), TKEYP
1944 je .Ldec192
1945 add $0x20, TKEYP
1946 movaps -0x60(TKEYP), KEY
b369e521 1947 AESDEC KEY STATE
54b6a1bd 1948 movaps -0x50(TKEYP), KEY
b369e521 1949 AESDEC KEY STATE
54b6a1bd
HY
1950.align 4
1951.Ldec192:
1952 movaps -0x40(TKEYP), KEY
b369e521 1953 AESDEC KEY STATE
54b6a1bd 1954 movaps -0x30(TKEYP), KEY
b369e521 1955 AESDEC KEY STATE
54b6a1bd
HY
1956.align 4
1957.Ldec128:
1958 movaps -0x20(TKEYP), KEY
b369e521 1959 AESDEC KEY STATE
54b6a1bd 1960 movaps -0x10(TKEYP), KEY
b369e521 1961 AESDEC KEY STATE
54b6a1bd 1962 movaps (TKEYP), KEY
b369e521 1963 AESDEC KEY STATE
54b6a1bd 1964 movaps 0x10(TKEYP), KEY
b369e521 1965 AESDEC KEY STATE
54b6a1bd 1966 movaps 0x20(TKEYP), KEY
b369e521 1967 AESDEC KEY STATE
54b6a1bd 1968 movaps 0x30(TKEYP), KEY
b369e521 1969 AESDEC KEY STATE
54b6a1bd 1970 movaps 0x40(TKEYP), KEY
b369e521 1971 AESDEC KEY STATE
54b6a1bd 1972 movaps 0x50(TKEYP), KEY
b369e521 1973 AESDEC KEY STATE
54b6a1bd 1974 movaps 0x60(TKEYP), KEY
b369e521 1975 AESDEC KEY STATE
54b6a1bd 1976 movaps 0x70(TKEYP), KEY
b369e521 1977 AESDECLAST KEY STATE
54b6a1bd 1978 ret
8309b745 1979ENDPROC(_aesni_dec1)
54b6a1bd
HY
1980
1981/*
1982 * _aesni_dec4: internal ABI
1983 * input:
1984 * KEYP: key struct pointer
1985 * KLEN: key length
1986 * STATE1: initial state (input)
1987 * STATE2
1988 * STATE3
1989 * STATE4
1990 * output:
1991 * STATE1: final state (output)
1992 * STATE2
1993 * STATE3
1994 * STATE4
1995 * changed:
1996 * KEY
1997 * TKEYP (T1)
1998 */
0d258efb 1999.align 4
54b6a1bd
HY
2000_aesni_dec4:
2001 movaps (KEYP), KEY # key
2002 mov KEYP, TKEYP
2003 pxor KEY, STATE1 # round 0
2004 pxor KEY, STATE2
2005 pxor KEY, STATE3
2006 pxor KEY, STATE4
2007 add $0x30, TKEYP
2008 cmp $24, KLEN
2009 jb .L4dec128
2010 lea 0x20(TKEYP), TKEYP
2011 je .L4dec192
2012 add $0x20, TKEYP
2013 movaps -0x60(TKEYP), KEY
b369e521
HY
2014 AESDEC KEY STATE1
2015 AESDEC KEY STATE2
2016 AESDEC KEY STATE3
2017 AESDEC KEY STATE4
54b6a1bd 2018 movaps -0x50(TKEYP), KEY
b369e521
HY
2019 AESDEC KEY STATE1
2020 AESDEC KEY STATE2
2021 AESDEC KEY STATE3
2022 AESDEC KEY STATE4
54b6a1bd
HY
2023.align 4
2024.L4dec192:
2025 movaps -0x40(TKEYP), KEY
b369e521
HY
2026 AESDEC KEY STATE1
2027 AESDEC KEY STATE2
2028 AESDEC KEY STATE3
2029 AESDEC KEY STATE4
54b6a1bd 2030 movaps -0x30(TKEYP), KEY
b369e521
HY
2031 AESDEC KEY STATE1
2032 AESDEC KEY STATE2
2033 AESDEC KEY STATE3
2034 AESDEC KEY STATE4
54b6a1bd
HY
2035.align 4
2036.L4dec128:
2037 movaps -0x20(TKEYP), KEY
b369e521
HY
2038 AESDEC KEY STATE1
2039 AESDEC KEY STATE2
2040 AESDEC KEY STATE3
2041 AESDEC KEY STATE4
54b6a1bd 2042 movaps -0x10(TKEYP), KEY
b369e521
HY
2043 AESDEC KEY STATE1
2044 AESDEC KEY STATE2
2045 AESDEC KEY STATE3
2046 AESDEC KEY STATE4
54b6a1bd 2047 movaps (TKEYP), KEY
b369e521
HY
2048 AESDEC KEY STATE1
2049 AESDEC KEY STATE2
2050 AESDEC KEY STATE3
2051 AESDEC KEY STATE4
54b6a1bd 2052 movaps 0x10(TKEYP), KEY
b369e521
HY
2053 AESDEC KEY STATE1
2054 AESDEC KEY STATE2
2055 AESDEC KEY STATE3
2056 AESDEC KEY STATE4
54b6a1bd 2057 movaps 0x20(TKEYP), KEY
b369e521
HY
2058 AESDEC KEY STATE1
2059 AESDEC KEY STATE2
2060 AESDEC KEY STATE3
2061 AESDEC KEY STATE4
54b6a1bd 2062 movaps 0x30(TKEYP), KEY
b369e521
HY
2063 AESDEC KEY STATE1
2064 AESDEC KEY STATE2
2065 AESDEC KEY STATE3
2066 AESDEC KEY STATE4
54b6a1bd 2067 movaps 0x40(TKEYP), KEY
b369e521
HY
2068 AESDEC KEY STATE1
2069 AESDEC KEY STATE2
2070 AESDEC KEY STATE3
2071 AESDEC KEY STATE4
54b6a1bd 2072 movaps 0x50(TKEYP), KEY
b369e521
HY
2073 AESDEC KEY STATE1
2074 AESDEC KEY STATE2
2075 AESDEC KEY STATE3
2076 AESDEC KEY STATE4
54b6a1bd 2077 movaps 0x60(TKEYP), KEY
b369e521
HY
2078 AESDEC KEY STATE1
2079 AESDEC KEY STATE2
2080 AESDEC KEY STATE3
2081 AESDEC KEY STATE4
54b6a1bd 2082 movaps 0x70(TKEYP), KEY
b369e521
HY
2083 AESDECLAST KEY STATE1 # last round
2084 AESDECLAST KEY STATE2
2085 AESDECLAST KEY STATE3
2086 AESDECLAST KEY STATE4
54b6a1bd 2087 ret
8309b745 2088ENDPROC(_aesni_dec4)
54b6a1bd
HY
2089
2090/*
2091 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2092 * size_t len)
2093 */
2094ENTRY(aesni_ecb_enc)
8691ccd7 2095 FRAME_BEGIN
0d258efb
MK
2096#ifndef __x86_64__
2097 pushl LEN
2098 pushl KEYP
2099 pushl KLEN
8691ccd7
JP
2100 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2101 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2102 movl (FRAME_OFFSET+24)(%esp), INP # src
2103 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2104#endif
54b6a1bd
HY
2105 test LEN, LEN # check length
2106 jz .Lecb_enc_ret
2107 mov 480(KEYP), KLEN
2108 cmp $16, LEN
2109 jb .Lecb_enc_ret
2110 cmp $64, LEN
2111 jb .Lecb_enc_loop1
2112.align 4
2113.Lecb_enc_loop4:
2114 movups (INP), STATE1
2115 movups 0x10(INP), STATE2
2116 movups 0x20(INP), STATE3
2117 movups 0x30(INP), STATE4
2118 call _aesni_enc4
2119 movups STATE1, (OUTP)
2120 movups STATE2, 0x10(OUTP)
2121 movups STATE3, 0x20(OUTP)
2122 movups STATE4, 0x30(OUTP)
2123 sub $64, LEN
2124 add $64, INP
2125 add $64, OUTP
2126 cmp $64, LEN
2127 jge .Lecb_enc_loop4
2128 cmp $16, LEN
2129 jb .Lecb_enc_ret
2130.align 4
2131.Lecb_enc_loop1:
2132 movups (INP), STATE1
2133 call _aesni_enc1
2134 movups STATE1, (OUTP)
2135 sub $16, LEN
2136 add $16, INP
2137 add $16, OUTP
2138 cmp $16, LEN
2139 jge .Lecb_enc_loop1
2140.Lecb_enc_ret:
0d258efb
MK
2141#ifndef __x86_64__
2142 popl KLEN
2143 popl KEYP
2144 popl LEN
2145#endif
8691ccd7 2146 FRAME_END
54b6a1bd 2147 ret
8309b745 2148ENDPROC(aesni_ecb_enc)
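
The control flow above reduces to two greedy loops: 64-byte chunks through _aesni_enc4, then 16-byte blocks through _aesni_enc1, with any sub-16-byte tail ignored (the early jb exits). A C sketch of that walk, with crypt1/crypt4 as stand-ins for the internal routines:

        #include <stddef.h>

        typedef void block_fn(const void *ctx, unsigned char *dst,
                              const unsigned char *src);

        static void ecb_walk(const void *ctx, unsigned char *dst,
                             const unsigned char *src, size_t len,
                             block_fn *crypt1, block_fn *crypt4)
        {
                while (len >= 64) {             /* .Lecb_enc_loop4 */
                        crypt4(ctx, dst, src);
                        src += 64; dst += 64; len -= 64;
                }
                while (len >= 16) {             /* .Lecb_enc_loop1 */
                        crypt1(ctx, dst, src);
                        src += 16; dst += 16; len -= 16;
                }
                /* a tail shorter than 16 bytes is left untouched, as in the asm */
        }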
54b6a1bd
HY
2149
2150/*
2151 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2152 * size_t len);
2153 */
2154ENTRY(aesni_ecb_dec)
8691ccd7 2155 FRAME_BEGIN
0d258efb
MK
2156#ifndef __x86_64__
2157 pushl LEN
2158 pushl KEYP
2159 pushl KLEN
8691ccd7
JP
2160 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2161 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2162 movl (FRAME_OFFSET+24)(%esp), INP # src
2163 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2164#endif
54b6a1bd
HY
2165 test LEN, LEN
2166 jz .Lecb_dec_ret
2167 mov 480(KEYP), KLEN
2168 add $240, KEYP
2169 cmp $16, LEN
2170 jb .Lecb_dec_ret
2171 cmp $64, LEN
2172 jb .Lecb_dec_loop1
2173.align 4
2174.Lecb_dec_loop4:
2175 movups (INP), STATE1
2176 movups 0x10(INP), STATE2
2177 movups 0x20(INP), STATE3
2178 movups 0x30(INP), STATE4
2179 call _aesni_dec4
2180 movups STATE1, (OUTP)
2181 movups STATE2, 0x10(OUTP)
2182 movups STATE3, 0x20(OUTP)
2183 movups STATE4, 0x30(OUTP)
2184 sub $64, LEN
2185 add $64, INP
2186 add $64, OUTP
2187 cmp $64, LEN
2188 jge .Lecb_dec_loop4
2189 cmp $16, LEN
2190 jb .Lecb_dec_ret
2191.align 4
2192.Lecb_dec_loop1:
2193 movups (INP), STATE1
2194 call _aesni_dec1
2195 movups STATE1, (OUTP)
2196 sub $16, LEN
2197 add $16, INP
2198 add $16, OUTP
2199 cmp $16, LEN
2200 jge .Lecb_dec_loop1
2201.Lecb_dec_ret:
0d258efb
MK
2202#ifndef __x86_64__
2203 popl KLEN
2204 popl KEYP
2205 popl LEN
2206#endif
8691ccd7 2207 FRAME_END
54b6a1bd 2208 ret
8309b745 2209ENDPROC(aesni_ecb_dec)
54b6a1bd
HY
2210
2211/*
2212 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2213 * size_t len, u8 *iv)
2214 */
2215ENTRY(aesni_cbc_enc)
8691ccd7 2216 FRAME_BEGIN
0d258efb
MK
2217#ifndef __x86_64__
2218 pushl IVP
2219 pushl LEN
2220 pushl KEYP
2221 pushl KLEN
8691ccd7
JP
2222 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2223 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2224 movl (FRAME_OFFSET+28)(%esp), INP # src
2225 movl (FRAME_OFFSET+32)(%esp), LEN # len
2226 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2227#endif
54b6a1bd
HY
2228 cmp $16, LEN
2229 jb .Lcbc_enc_ret
2230 mov 480(KEYP), KLEN
2231 movups (IVP), STATE # load iv as initial state
2232.align 4
2233.Lcbc_enc_loop:
2234 movups (INP), IN # load input
2235 pxor IN, STATE
2236 call _aesni_enc1
2237 movups STATE, (OUTP) # store output
2238 sub $16, LEN
2239 add $16, INP
2240 add $16, OUTP
2241 cmp $16, LEN
2242 jge .Lcbc_enc_loop
2243 movups STATE, (IVP)
2244.Lcbc_enc_ret:
0d258efb
MK
2245#ifndef __x86_64__
2246 popl KLEN
2247 popl KEYP
2248 popl LEN
2249 popl IVP
2250#endif
8691ccd7 2251 FRAME_END
54b6a1bd 2252 ret
8309b745 2253ENDPROC(aesni_cbc_enc)
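
CBC encryption is inherently serial: each plaintext block is XORed with the previous ciphertext (initially the IV) before it is encrypted, which is why there is no 4-block path here. A minimal sketch under that observation, with encrypt_block standing in for _aesni_enc1 (cbc_enc and its parameters are illustrative names):

        #include <emmintrin.h>

        typedef __m128i enc1_fn(const void *ctx, __m128i block);

        static void cbc_enc(const void *ctx, unsigned char *dst,
                            const unsigned char *src, size_t len,
                            unsigned char *iv, enc1_fn *encrypt_block)
        {
                __m128i state = _mm_loadu_si128((const __m128i *)iv);
                for (; len >= 16; src += 16, dst += 16, len -= 16) {
                        state = _mm_xor_si128(state,
                                        _mm_loadu_si128((const __m128i *)src));
                        state = encrypt_block(ctx, state);  /* _aesni_enc1 */
                        _mm_storeu_si128((__m128i *)dst, state);
                }
                /* write back the chaining value, as .Lcbc_enc_loop does */
                _mm_storeu_si128((__m128i *)iv, state);
        }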
54b6a1bd
HY
2254
2255/*
2256 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2257 * size_t len, u8 *iv)
2258 */
2259ENTRY(aesni_cbc_dec)
8691ccd7 2260 FRAME_BEGIN
0d258efb
MK
2261#ifndef __x86_64__
2262 pushl IVP
2263 pushl LEN
2264 pushl KEYP
2265 pushl KLEN
8691ccd7
JP
2266 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2267 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2268 movl (FRAME_OFFSET+28)(%esp), INP # src
2269 movl (FRAME_OFFSET+32)(%esp), LEN # len
2270 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2271#endif
54b6a1bd 2272 cmp $16, LEN
e6efaa02 2273 jb .Lcbc_dec_just_ret
54b6a1bd
HY
2274 mov 480(KEYP), KLEN
2275 add $240, KEYP
2276 movups (IVP), IV
2277 cmp $64, LEN
2278 jb .Lcbc_dec_loop1
2279.align 4
2280.Lcbc_dec_loop4:
2281 movups (INP), IN1
2282 movaps IN1, STATE1
2283 movups 0x10(INP), IN2
2284 movaps IN2, STATE2
0d258efb 2285#ifdef __x86_64__
54b6a1bd
HY
2286 movups 0x20(INP), IN3
2287 movaps IN3, STATE3
2288 movups 0x30(INP), IN4
2289 movaps IN4, STATE4
0d258efb
MK
2290#else
2291 movups 0x20(INP), IN1
2292 movaps IN1, STATE3
2293 movups 0x30(INP), IN2
2294 movaps IN2, STATE4
2295#endif
54b6a1bd
HY
2296 call _aesni_dec4
2297 pxor IV, STATE1
0d258efb 2298#ifdef __x86_64__
54b6a1bd
HY
2299 pxor IN1, STATE2
2300 pxor IN2, STATE3
2301 pxor IN3, STATE4
2302 movaps IN4, IV
0d258efb 2303#else
0d258efb
MK
2304 pxor IN1, STATE4
2305 movaps IN2, IV
7c8d5184
MK
2306 movups (INP), IN1
2307 pxor IN1, STATE2
2308 movups 0x10(INP), IN2
2309 pxor IN2, STATE3
0d258efb 2310#endif
54b6a1bd
HY
2311 movups STATE1, (OUTP)
2312 movups STATE2, 0x10(OUTP)
2313 movups STATE3, 0x20(OUTP)
2314 movups STATE4, 0x30(OUTP)
2315 sub $64, LEN
2316 add $64, INP
2317 add $64, OUTP
2318 cmp $64, LEN
2319 jge .Lcbc_dec_loop4
2320 cmp $16, LEN
2321 jb .Lcbc_dec_ret
2322.align 4
2323.Lcbc_dec_loop1:
2324 movups (INP), IN
2325 movaps IN, STATE
2326 call _aesni_dec1
2327 pxor IV, STATE
2328 movups STATE, (OUTP)
2329 movaps IN, IV
2330 sub $16, LEN
2331 add $16, INP
2332 add $16, OUTP
2333 cmp $16, LEN
2334 jge .Lcbc_dec_loop1
54b6a1bd 2335.Lcbc_dec_ret:
e6efaa02
HY
2336 movups IV, (IVP)
2337.Lcbc_dec_just_ret:
0d258efb
MK
2338#ifndef __x86_64__
2339 popl KLEN
2340 popl KEYP
2341 popl LEN
2342 popl IVP
2343#endif
8691ccd7 2344 FRAME_END
54b6a1bd 2345 ret
8309b745 2346ENDPROC(aesni_cbc_dec)
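
Decryption, by contrast, parallelizes: the XOR with the previous ciphertext happens after the block decryption, so four blocks are independent until the final XORs. That is also why the code saves the ciphertext inputs first, and why the register-starved 32-bit path reloads IN1/IN2 from INP: dst may alias src for in-place operation. A sketch of one 4-block iteration, with decrypt4 standing in for _aesni_dec4 (the C names are assumptions):

        #include <emmintrin.h>

        typedef void dec4_fn(const void *ctx, __m128i st[4]);

        static void cbc_dec_4blocks(const void *ctx, unsigned char *dst,
                                    const unsigned char *src, __m128i *iv,
                                    dec4_fn *decrypt4)
        {
                __m128i in[4], st[4];
                for (int i = 0; i < 4; i++)     /* keep ciphertext copies */
                        in[i] = st[i] = _mm_loadu_si128((const __m128i *)src + i);
                decrypt4(ctx, st);              /* four independent decryptions */
                st[0] = _mm_xor_si128(st[0], *iv);
                for (int i = 1; i < 4; i++)     /* serial part is cheap XORs */
                        st[i] = _mm_xor_si128(st[i], in[i - 1]);
                *iv = in[3];                    /* last ciphertext chains onward */
                for (int i = 0; i < 4; i++)
                        _mm_storeu_si128((__m128i *)dst + i, st[i]);
        }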
12387a46 2347
0d258efb 2348#ifdef __x86_64__
1253cab8 2349.pushsection .rodata
12387a46
HY
2350.align 16
2351.Lbswap_mask:
2352 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
1253cab8 2353.popsection
12387a46
HY
2354
2355/*
2356 * _aesni_inc_init: internal ABI
2357 * set up registers used by _aesni_inc
2358 * input:
2359 * IV
2360 * output:
2361 * CTR: == IV, in little endian
2362 * TCTR_LOW: == lower qword of CTR
2363 * INC: == 1, in little endian
2364 * BSWAP_MASK: == endian swapping mask
2365 */
0d258efb 2366.align 4
12387a46
HY
2367_aesni_inc_init:
2368 movaps .Lbswap_mask, BSWAP_MASK
2369 movaps IV, CTR
2370 PSHUFB_XMM BSWAP_MASK CTR
2371 mov $1, TCTR_LOW
32cbd7df
HY
2372 MOVQ_R64_XMM TCTR_LOW INC
2373 MOVQ_R64_XMM CTR TCTR_LOW
12387a46 2374 ret
8309b745 2375ENDPROC(_aesni_inc_init)
12387a46
HY
2376
2377/*
2378 * _aesni_inc: internal ABI
2379 * Increase IV by 1; IV is in big endian
2380 * input:
2381 * IV
2382 * CTR: == IV, in little endian
2383 * TCTR_LOW: == lower qword of CTR
2384 * INC: == 1, in little endian
2385 * BSWAP_MASK: == endian swapping mask
2386 * output:
2387 * IV: increased by 1
2388 * changed:
2389 * CTR: == output IV, in little endian
2390 * TCTR_LOW: == lower qword of CTR
2391 */
0d258efb 2392.align 4
12387a46
HY
2393_aesni_inc:
2394 paddq INC, CTR
2395 add $1, TCTR_LOW
2396 jnc .Linc_low
2397 pslldq $8, INC
2398 paddq INC, CTR
2399 psrldq $8, INC
2400.Linc_low:
2401 movaps CTR, IV
2402 PSHUFB_XMM BSWAP_MASK IV
2403 ret
8309b745 2404ENDPROC(_aesni_inc)
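
_aesni_inc keeps the counter byte-reversed so that paddq can add one, and mirrors the low qword in the scalar TCTR_LOW purely to detect the rare carry (the jnc fast path); only on carry does it shift INC into the high lane and add again. The same arithmetic on two 64-bit halves, as a scalar C sketch:

        #include <stdint.h>

        static void ctr128_inc(uint64_t ctr[2])  /* ctr[0] = low qword */
        {
                if (++ctr[0] == 0)      /* the jnc-guarded slow path */
                        ++ctr[1];       /* propagate carry to high qword */
        }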
12387a46
HY
2405
2406/*
2407 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2408 * size_t len, u8 *iv)
2409 */
2410ENTRY(aesni_ctr_enc)
8691ccd7 2411 FRAME_BEGIN
12387a46
HY
2412 cmp $16, LEN
2413 jb .Lctr_enc_just_ret
2414 mov 480(KEYP), KLEN
2415 movups (IVP), IV
2416 call _aesni_inc_init
2417 cmp $64, LEN
2418 jb .Lctr_enc_loop1
2419.align 4
2420.Lctr_enc_loop4:
2421 movaps IV, STATE1
2422 call _aesni_inc
2423 movups (INP), IN1
2424 movaps IV, STATE2
2425 call _aesni_inc
2426 movups 0x10(INP), IN2
2427 movaps IV, STATE3
2428 call _aesni_inc
2429 movups 0x20(INP), IN3
2430 movaps IV, STATE4
2431 call _aesni_inc
2432 movups 0x30(INP), IN4
2433 call _aesni_enc4
2434 pxor IN1, STATE1
2435 movups STATE1, (OUTP)
2436 pxor IN2, STATE2
2437 movups STATE2, 0x10(OUTP)
2438 pxor IN3, STATE3
2439 movups STATE3, 0x20(OUTP)
2440 pxor IN4, STATE4
2441 movups STATE4, 0x30(OUTP)
2442 sub $64, LEN
2443 add $64, INP
2444 add $64, OUTP
2445 cmp $64, LEN
2446 jge .Lctr_enc_loop4
2447 cmp $16, LEN
2448 jb .Lctr_enc_ret
2449.align 4
2450.Lctr_enc_loop1:
2451 movaps IV, STATE
2452 call _aesni_inc
2453 movups (INP), IN
2454 call _aesni_enc1
2455 pxor IN, STATE
2456 movups STATE, (OUTP)
2457 sub $16, LEN
2458 add $16, INP
2459 add $16, OUTP
2460 cmp $16, LEN
2461 jge .Lctr_enc_loop1
2462.Lctr_enc_ret:
2463 movups IV, (IVP)
2464.Lctr_enc_just_ret:
8691ccd7 2465 FRAME_END
12387a46 2466 ret
8309b745 2467ENDPROC(aesni_ctr_enc)
c456a9cd
JK
2468
2469/*
2470 * _aesni_gf128mul_x_ble: internal ABI
2471 * Multiply in GF(2^128) for XTS IVs
2472 * input:
2473 * IV: current IV
2474 * GF128MUL_MASK: == mask with 0x87 and 0x01
2475 * output:
2476 * IV: next IV
2477 * changed:
2478 * CTR: == temporary value
2479 */
2480#define _aesni_gf128mul_x_ble() \
2481 pshufd $0x13, IV, CTR; \
2482 paddq IV, IV; \
2483 psrad $31, CTR; \
2484 pand GF128MUL_MASK, CTR; \
2485 pxor CTR, IV;
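
The macro computes a multiply-by-x in GF(2^128) on the little-endian tweak: paddq doubles both qwords at once, and the pshufd/psrad/pand sequence builds a mask from the two original sign bits so that a single pxor patches both the inter-qword carry (the 0x01 half of the mask) and the modular reduction (the 0x87 half). A scalar C sketch of the same operation:

        #include <stdint.h>

        static void gf128mul_x_ble(uint64_t t[2])        /* t[0] = low qword */
        {
                uint64_t carry_lo = t[0] >> 63;          /* bit crossing qwords */
                uint64_t carry_hi = t[1] >> 63;          /* bit off the top */
                t[1] = (t[1] << 1) | carry_lo;
                t[0] = (t[0] << 1) ^ (carry_hi * 0x87);  /* x^128 = x^7+x^2+x+1 */
        }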
2486
2487/*
2488 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2489 * bool enc, u8 *iv)
2490 */
2491ENTRY(aesni_xts_crypt8)
8691ccd7 2492 FRAME_BEGIN
c456a9cd
JK
2493 cmpb $0, %cl
2494 movl $0, %ecx
2495 movl $240, %r10d
2496 leaq _aesni_enc4, %r11
2497 leaq _aesni_dec4, %rax
2498 cmovel %r10d, %ecx
2499 cmoveq %rax, %r11
2500
2501 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2502 movups (IVP), IV
2503
2504 mov 480(KEYP), KLEN
2505 addq %rcx, KEYP
2506
2507 movdqa IV, STATE1
fe6510b5
JK
2508 movdqu 0x00(INP), INC
2509 pxor INC, STATE1
c456a9cd
JK
2510 movdqu IV, 0x00(OUTP)
2511
2512 _aesni_gf128mul_x_ble()
2513 movdqa IV, STATE2
fe6510b5
JK
2514 movdqu 0x10(INP), INC
2515 pxor INC, STATE2
c456a9cd
JK
2516 movdqu IV, 0x10(OUTP)
2517
2518 _aesni_gf128mul_x_ble()
2519 movdqa IV, STATE3
fe6510b5
JK
2520 movdqu 0x20(INP), INC
2521 pxor INC, STATE3
c456a9cd
JK
2522 movdqu IV, 0x20(OUTP)
2523
2524 _aesni_gf128mul_x_ble()
2525 movdqa IV, STATE4
fe6510b5
JK
2526 movdqu 0x30(INP), INC
2527 pxor INC, STATE4
c456a9cd
JK
2528 movdqu IV, 0x30(OUTP)
2529
9697fa39 2530 CALL_NOSPEC %r11
c456a9cd 2531
fe6510b5
JK
2532 movdqu 0x00(OUTP), INC
2533 pxor INC, STATE1
c456a9cd
JK
2534 movdqu STATE1, 0x00(OUTP)
2535
2536 _aesni_gf128mul_x_ble()
2537 movdqa IV, STATE1
fe6510b5
JK
2538 movdqu 0x40(INP), INC
2539 pxor INC, STATE1
c456a9cd
JK
2540 movdqu IV, 0x40(OUTP)
2541
fe6510b5
JK
2542 movdqu 0x10(OUTP), INC
2543 pxor INC, STATE2
c456a9cd
JK
2544 movdqu STATE2, 0x10(OUTP)
2545
2546 _aesni_gf128mul_x_ble()
2547 movdqa IV, STATE2
fe6510b5
JK
2548 movdqu 0x50(INP), INC
2549 pxor INC, STATE2
c456a9cd
JK
2550 movdqu IV, 0x50(OUTP)
2551
fe6510b5
JK
2552 movdqu 0x20(OUTP), INC
2553 pxor INC, STATE3
c456a9cd
JK
2554 movdqu STATE3, 0x20(OUTP)
2555
2556 _aesni_gf128mul_x_ble()
2557 movdqa IV, STATE3
fe6510b5
JK
2558 movdqu 0x60(INP), INC
2559 pxor INC, STATE3
c456a9cd
JK
2560 movdqu IV, 0x60(OUTP)
2561
fe6510b5
JK
2562 movdqu 0x30(OUTP), INC
2563 pxor INC, STATE4
c456a9cd
JK
2564 movdqu STATE4, 0x30(OUTP)
2565
2566 _aesni_gf128mul_x_ble()
2567 movdqa IV, STATE4
fe6510b5
JK
2568 movdqu 0x70(INP), INC
2569 pxor INC, STATE4
c456a9cd
JK
2570 movdqu IV, 0x70(OUTP)
2571
2572 _aesni_gf128mul_x_ble()
2573 movups IV, (IVP)
2574
9697fa39 2575 CALL_NOSPEC %r11
c456a9cd 2576
fe6510b5
JK
2577 movdqu 0x40(OUTP), INC
2578 pxor INC, STATE1
c456a9cd
JK
2579 movdqu STATE1, 0x40(OUTP)
2580
fe6510b5
JK
2581 movdqu 0x50(OUTP), INC
2582 pxor INC, STATE2
c456a9cd
JK
2583 movdqu STATE2, 0x50(OUTP)
2584
fe6510b5
JK
2585 movdqu 0x60(OUTP), INC
2586 pxor INC, STATE3
c456a9cd
JK
2587 movdqu STATE3, 0x60(OUTP)
2588
fe6510b5
JK
2589 movdqu 0x70(OUTP), INC
2590 pxor INC, STATE4
c456a9cd
JK
2591 movdqu STATE4, 0x70(OUTP)
2592
8691ccd7 2593 FRAME_END
c456a9cd
JK
2594 ret
2595ENDPROC(aesni_xts_crypt8)
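
The prologue of this routine picks encryption or decryption without a data-dependent branch: one cmov selects the key-schedule offset (the decryption keys start 240 bytes into the context) and another selects the 4-block routine, which is then invoked through the retpoline-safe CALL_NOSPEC. A C sketch of just that selection logic (xts_select and its parameters are illustrative names):

        typedef void crypt4_fn(const void *keys, void *state);

        static crypt4_fn *xts_select(int enc, crypt4_fn *enc4, crypt4_fn *dec4,
                                     const unsigned char *keys,
                                     const unsigned char **keyp)
        {
                *keyp = enc ? keys : keys + 240;  /* cmovel: dec keys at +240 */
                return enc ? enc4 : dec4;         /* cmoveq: pick the routine */
        }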
2596
0d258efb 2597#endif