/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values, for FP
 * use movaps (move aligned packed single) or integer use movdqa (move double
 * quad aligned). It doesn't make a performance difference which instruction
 * is used since Nehalem (original Core i7) was released. However, movaps is
 * a byte shorter, so that is the one we'll use for now. (same for unaligned).
 */
#define MOVADQ  movaps
#define MOVUDQ  movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
        .octa 0x00000000000000010000000000000087
.section        .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section        .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section        .rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section        .rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section        .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section        .rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section        .rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section        .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define arg11 STACK_OFFSET+40(%r14)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

#define BSWAP_MASK %xmm10
#define CTR     %xmm11
#define INC     %xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG    %rax
#define KEYP    %rdi
#define OUTP    %rsi
#define UKEYP   OUTP
#define INP     %rdx
#define LEN     %rcx
#define IVP     %r8
#define KLEN    %r9d
#define T1      %r10
#define TKEYP   T1
#define T2      %r11
#define TCTR_LOW T2
#else
#define AREG    %eax
#define KEYP    %edi
#define OUTP    AREG
#define UKEYP   OUTP
#define INP     %edx
#define LEN     %esi
#define IVP     %ebp
#define KLEN    %ebx
#define T1      %ecx
#define TKEYP   T1
#endif

.macro FUNC_SAVE
        push %r12
        push %r13
        push %r14
        mov %rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
        sub $VARIABLE_OFFSET, %rsp
        and $~63, %rsp
.endm
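
# Note (added for clarity): the 64-byte alignment of %rsp above also
# guarantees that the 16-byte HashKey_* slots carved out of this scratch
# area are 16-byte aligned, which the movdqa accesses below rely on.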


.macro FUNC_RESTORE
        mov %r14, %rsp
        pop %r14
        pop %r13
        pop %r12
.endm


# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT
        mov arg7, %r12
        movdqu (%r12), %xmm13
        movdqa SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13

        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa %xmm13, %xmm2
        psllq $1, %xmm13
        psrlq $63, %xmm2
        movdqa %xmm2, %xmm1
        pslldq $8, %xmm2
        psrldq $8, %xmm1
        por %xmm2, %xmm13

        # reduce HashKey<<1

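        # The reduction below is conditional (comment added for clarity):
        # %xmm1 holds the bit that was shifted out of the top of HashKey.
        # pshufd $0x24 copies that carry into the dword positions where
        # TWOONE has its 1-bits, so pcmpeqd produces an all-ones mask
        # exactly when the carry was set, and pand then selects either
        # POLY or zero to XOR into %xmm13.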
        pshufd $0x24, %xmm1, %xmm2
        pcmpeqd TWOONE(%rip), %xmm2
        pand POLY(%rip), %xmm2
        pxor %xmm2, %xmm13             # %xmm13 holds HashKey<<1 (mod poly)
        movdqa %xmm13, HashKey(%rsp)
        mov %arg5, %r13
        and $-16, %r13
        mov %r13, %r12
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
        # Encrypt/Decrypt first few blocks

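        # On entry %r12 = %r13 = plaintext length rounded down to a multiple
        # of 16 (set up by GCM_INIT).  Masking with $(3<<4) keeps
        # (number of blocks mod 4) * 16, which the compares below turn into
        # a 0/1/2/3-block prologue; e.g. a 100-byte input has 6 full blocks,
        # so 2 are handled here and 4 in the main loop.  (Comment added for
        # clarity.)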
        and $(3<<4), %r12
        jz _initial_num_blocks_is_0_\@
        cmp $(2<<4), %r12
        jb _initial_num_blocks_is_1_\@
        je _initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
        sub $48, %r13
        jmp _initial_blocks_\@
_initial_num_blocks_is_2_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
        sub $32, %r13
        jmp _initial_blocks_\@
_initial_num_blocks_is_1_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
        sub $16, %r13
        jmp _initial_blocks_\@
_initial_num_blocks_is_0_\@:
        INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

        # Main loop - Encrypt/Decrypt remaining blocks

        cmp $0, %r13
        je _zero_cipher_left_\@
        sub $64, %r13
        je _four_cipher_left_\@
_crypt_by_4_\@:
        GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
        %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
        %xmm7, %xmm8, enc
        add $64, %r11
        sub $64, %r13
        jne _crypt_by_4_\@
_four_cipher_left_\@:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
        mov %arg5, %r13
        and $15, %r13                  # %r13 = arg5 (mod 16)
        je _multiple_of_16_bytes_\@

        # Handle the last <16 Byte block separately
        paddd ONE(%rip), %xmm0         # INCR CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)

        lea (%arg4,%r11,1), %r10
        mov %r13, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

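        # Mask-table trick (comment added for clarity): ALL_F sits 16 bytes
        # after SHIFT_MASK and is itself followed by 16 zero bytes, so the
        # 16-byte load from ALL_F+16-%r13 below yields %r13 bytes of 0xff
        # followed by zeroes - exactly the mask that keeps the low %r13
        # bytes of a register.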
        lea ALL_F+16(%rip), %r12
        sub %r13, %r12
.ifc \operation, dec
        movdqa %xmm1, %xmm2
.endif
        pxor %xmm1, %xmm0              # XOR Encrypt(K, Yn)
        movdqu (%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand %xmm1, %xmm0              # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
        pand %xmm1, %xmm2
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm2

        pxor %xmm2, %xmm8
.else
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        pxor %xmm0, %xmm8
.endif

        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
.ifc \operation, enc
        # GHASH computation for the last <16 byte block
        movdqa SHUF_MASK(%rip), %xmm10
        # shuffle xmm0 back to output as ciphertext
        PSHUFB_XMM %xmm10, %xmm0
.endif

        # Output %r13 bytes
        MOVQ_R64_XMM %xmm0, %rax
        cmp $8, %r13
        jle _less_than_8_bytes_left_\@
        mov %rax, (%arg3, %r11, 1)
        add $8, %r11
        psrldq $8, %xmm0
        MOVQ_R64_XMM %xmm0, %rax
        sub $8, %r13
_less_than_8_bytes_left_\@:
        mov %al, (%arg3, %r11, 1)
        add $1, %r11
        shr $8, %rax
        sub $1, %r13
        jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
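# The tag is E(K, Y0) XOR GHASH(H, AAD, C); the final GHASH input block
# encodes len(AAD) || len(C) in bits, which is what the shifts by 3 below
# construct.  (Comment added for clarity.)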
.macro GCM_COMPLETE
        mov arg9, %r12                 # %r12 = aadLen (number of bytes)
        shl $3, %r12                   # convert into number of bits
        movd %r12d, %xmm15             # len(A) in %xmm15
        shl $3, %arg5                  # len(C) in bits (*8)
        MOVQ_R64_XMM %arg5, %xmm1
        pslldq $8, %xmm15              # %xmm15 = len(A)||0x0000000000000000
        pxor %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
        pxor %xmm15, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8

        mov %arg6, %rax                # %rax = *Y0
        movdqu (%rax), %xmm0           # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
        pxor %xmm8, %xmm0
_return_T_\@:
        mov arg10, %r10                # %r10 = authTag
        mov arg11, %r11                # %r11 = auth_tag_len
        cmp $16, %r11
        je _T_16_\@
        cmp $8, %r11
        jl _T_4_\@
_T_8_\@:
        MOVQ_R64_XMM %xmm0, %rax
        mov %rax, (%r10)
        add $8, %r10
        sub $8, %r11
        psrldq $8, %xmm0
        cmp $0, %r11
        je _return_T_done_\@
_T_4_\@:
        movd %xmm0, %eax
        mov %eax, (%r10)
        add $4, %r10
        sub $4, %r11
        psrldq $4, %xmm0
        cmp $0, %r11
        je _return_T_done_\@
_T_123_\@:
        movd %xmm0, %eax
        cmp $2, %r11
        jl _T_1_\@
        mov %ax, (%r10)
        cmp $2, %r11
        je _return_T_done_\@
        add $2, %r10
        sar $16, %eax
_T_1_\@:
        mov %al, (%r10)
        jmp _return_T_done_\@
_T_16_\@:
        movdqu %xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
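/*
* Karatsuba refresher (comment added for clarity): with A = a1:a0 and
* B = b1:b0 split into 64-bit halves, and XOR playing the role of
* addition in GF(2):
*
*   A*B = (a1*b1)<<128 xor (a0*b0)
*         xor [ (a1 xor a0)*(b1 xor b0) xor a1*b1 xor a0*b0 ] << 64
*
* so the 256-bit product needs only three PCLMULQDQ multiplications
* instead of four, which is exactly what the macro below performs.
*/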
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        movdqa \GH, \TMP1
        pshufd $78, \GH, \TMP2
        pshufd $78, \HK, \TMP3
        pxor \GH, \TMP2                # TMP2 = a1+a0
        pxor \HK, \TMP3                # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor \GH, \TMP2
        pxor \TMP1, \TMP2              # TMP2 = (a0*b1)+(a1*b0)
        movdqa \TMP2, \TMP3
        pslldq $8, \TMP3               # left shift TMP3 2 DWs
        psrldq $8, \TMP2               # right shift TMP2 2 DWs
        pxor \TMP3, \GH
        pxor \TMP2, \TMP1              # TMP1:GH holds the result of GH*HK

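        # What follows reduces the 256-bit product TMP1:GH modulo the GCM
        # polynomial x^128 + x^127 + x^126 + x^121 + 1, operating on the
        # bit-reflected representation; the dword shift counts 31/30/25 and
        # 1/2/7 below come from the x^127, x^126 and x^121 terms.
        # (Comment added for clarity.)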
        # first phase of the reduction

        movdqa \GH, \TMP2
        movdqa \GH, \TMP3
        movdqa \GH, \TMP4              # copy GH into TMP2,TMP3 and TMP4
                                       # in order to perform
                                       # independent shifts
        pslld $31, \TMP2               # packed left shift <<31
        pslld $30, \TMP3               # packed left shift <<30
        pslld $25, \TMP4               # packed left shift <<25
        pxor \TMP3, \TMP2              # xor the shifted versions
        pxor \TMP4, \TMP2
        movdqa \TMP2, \TMP5
        psrldq $4, \TMP5               # right shift TMP5 1 DW
        pslldq $12, \TMP2              # left shift TMP2 3 DWs
        pxor \TMP2, \GH

        # second phase of the reduction

        movdqa \GH,\TMP2               # copy GH into TMP2,TMP3 and TMP4
                                       # in order to perform
                                       # independent shifts
        movdqa \GH,\TMP3
        movdqa \GH,\TMP4
        psrld $1,\TMP2                 # packed right shift >>1
        psrld $2,\TMP3                 # packed right shift >>2
        psrld $7,\TMP4                 # packed right shift >>7
        pxor \TMP3,\TMP2               # xor the shifted versions
        pxor \TMP4,\TMP2
        pxor \TMP5, \TMP2
        pxor \TMP2, \GH
        pxor \TMP1, \GH                # result is in GH
.endm


# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
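# Worked example (comment added for clarity): for DLEN = 5 the macro takes
# the _read_lt8 path and reads DPTR[4], DPTR[3], ..., DPTR[0] into %rax,
# shifting left by 8 before each byte, so XMMDst ends up with the 5 input
# bytes in its lowest lanes and zeroes above them.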
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        MOVQ_R64_XMM %rax, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
        xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        MOVQ_R64_XMM %rax, \XMM1
        pslldq $8, \XMM1
        por \XMM1, \XMMDst
        jmp _done_read_partial_block_\@
_read_lt8_\@:
        xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg3, %arg4, %r14 are used as pointers only, not modified
*/

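/*
* Worked example (comment added for clarity): a = 100 bytes gives
* b = floor(100/16) = 6 full blocks, so num_initial_blocks = 6 mod 4 = 2;
* the remaining 4 full blocks then go through the parallel main loop.
*/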

.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ SHUF_MASK(%rip), %xmm14
        mov arg8, %r10                 # %r10 = AAD
        mov arg9, %r11                 # %r11 = aadLen
        pxor %xmm\i, %xmm\i
        pxor \XMM2, \XMM2

        cmp $16, %r11
        jl _get_AAD_rest\@
_get_AAD_blocks\@:
        movdqu (%r10), %xmm\i
        PSHUFB_XMM %xmm14, %xmm\i      # byte-reflect the AAD data
        pxor %xmm\i, \XMM2
        GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        add $16, %r10
        sub $16, %r11
        cmp $16, %r11
        jge _get_AAD_blocks\@

        movdqu \XMM2, %xmm\i

        /* read the last <16B of AAD */
_get_AAD_rest\@:
        cmp $0, %r11
        je _get_AAD_done\@

        READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
        PSHUFB_XMM %xmm14, %xmm\i      # byte-reflect the AAD data
        pxor \XMM2, %xmm\i
        GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\@:
        xor %r11, %r11                 # initialise the data pointer offset as zero
        # start AES for num_initial_blocks blocks

        mov %arg6, %rax                # %rax = *Y0
        movdqu (%rax), \XMM0           # XMM0 = Y0
        PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

        MOVADQ ONE(%RIP),\TMP1
        MOVADQ 0(%arg1),\TMP2
.irpc index, \i_seq
        paddd \TMP1, \XMM0             # INCR Y0
.ifc \operation, dec
        movdqa \XMM0, %xmm\index
.else
        MOVADQ \XMM0, %xmm\index
.endif
        PSHUFB_XMM %xmm14, %xmm\index  # perform a 16 byte swap
        pxor \TMP2, %xmm\index
.endr
        lea 0x10(%arg1),%r10
        mov keysize,%eax
        shr $2,%eax                    # 128->4, 192->6, 256->8
        add $5,%eax                    # 128->9, 192->11, 256->13

aes_loop_initial_\@:
        MOVADQ (%r10),\TMP1
.irpc index, \i_seq
        AESENC \TMP1, %xmm\index
.endr
        add $16,%r10
        sub $1,%eax
        jnz aes_loop_initial_\@

        MOVADQ (%r10), \TMP1
.irpc index, \i_seq
        AESENCLAST \TMP1, %xmm\index   # Last Round
.endr
.irpc index, \i_seq
        movdqu (%arg4 , %r11, 1), \TMP1
        pxor \TMP1, %xmm\index
        movdqu %xmm\index, (%arg3 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add $16, %r11

.ifc \operation, dec
        movdqa \TMP1, %xmm\index
.endif
        PSHUFB_XMM %xmm14, %xmm\index

        # prepare plaintext/ciphertext for GHASH computation
.endr
.endif

        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor %xmm5, %xmm6
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor %xmm6, %xmm7
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor %xmm7, %xmm8
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp $64, %r13
        jl _initial_blocks_done\@
        # no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
        MOVADQ ONE(%RIP),\TMP1
        paddd \TMP1, \XMM0             # INCR Y0
        MOVADQ \XMM0, \XMM1
        PSHUFB_XMM %xmm14, \XMM1       # perform a 16 byte swap

        paddd \TMP1, \XMM0             # INCR Y0
        MOVADQ \XMM0, \XMM2
        PSHUFB_XMM %xmm14, \XMM2       # perform a 16 byte swap

        paddd \TMP1, \XMM0             # INCR Y0
        MOVADQ \XMM0, \XMM3
        PSHUFB_XMM %xmm14, \XMM3       # perform a 16 byte swap

        paddd \TMP1, \XMM0             # INCR Y0
        MOVADQ \XMM0, \XMM4
        PSHUFB_XMM %xmm14, \XMM4       # perform a 16 byte swap

        MOVADQ 0(%arg1),\TMP1
        pxor \TMP1, \XMM1
        pxor \TMP1, \XMM2
        pxor \TMP1, \XMM3
        pxor \TMP1, \XMM4
        movdqa \TMP3, \TMP5
        pshufd $78, \TMP3, \TMP1
        pxor \TMP3, \TMP1
        movdqa \TMP1, HashKey_k(%rsp)
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
        movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd $78, \TMP5, \TMP1
        pxor \TMP5, \TMP1
        movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps 0x10*\index(%arg1), \TMP1
        AESENC \TMP1, \XMM1
        AESENC \TMP1, \XMM2
        AESENC \TMP1, \XMM3
        AESENC \TMP1, \XMM4
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
        movdqa \TMP5, HashKey_3(%rsp)
        pshufd $78, \TMP5, \TMP1
        pxor \TMP5, \TMP1
        movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps 0x10*\index(%arg1), \TMP1
        AESENC \TMP1, \XMM1
        AESENC \TMP1, \XMM2
        AESENC \TMP1, \XMM3
        AESENC \TMP1, \XMM4
.endr
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
        movdqa \TMP5, HashKey_4(%rsp)
        pshufd $78, \TMP5, \TMP1
        pxor \TMP5, \TMP1
        movdqa \TMP1, HashKey_4_k(%rsp)
        lea 0xa0(%arg1),%r10
        mov keysize,%eax
        shr $2,%eax                    # 128->4, 192->6, 256->8
        sub $4,%eax                    # 128->0, 192->2, 256->4
        jz aes_loop_pre_done\@

aes_loop_pre_\@:
        MOVADQ (%r10),\TMP2
.irpc index, 1234
        AESENC \TMP2, %xmm\index
.endr
        add $16,%r10
        sub $1,%eax
        jnz aes_loop_pre_\@

aes_loop_pre_done\@:
        MOVADQ (%r10), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu 16*0(%arg4 , %r11 , 1), \TMP1
        pxor \TMP1, \XMM1
.ifc \operation, dec
        movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
        movdqa \TMP1, \XMM1
.endif
        movdqu 16*1(%arg4 , %r11 , 1), \TMP1
        pxor \TMP1, \XMM2
.ifc \operation, dec
        movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
        movdqa \TMP1, \XMM2
.endif
        movdqu 16*2(%arg4 , %r11 , 1), \TMP1
        pxor \TMP1, \XMM3
.ifc \operation, dec
        movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
        movdqa \TMP1, \XMM3
.endif
        movdqu 16*3(%arg4 , %r11 , 1), \TMP1
        pxor \TMP1, \XMM4
.ifc \operation, dec
        movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
        movdqa \TMP1, \XMM4
.else
        movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
        movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
        movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
        movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
.endif

        add $64, %r11
        PSHUFB_XMM %xmm14, \XMM1       # perform a 16 byte swap
        pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        PSHUFB_XMM %xmm14, \XMM2       # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM3       # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM4       # perform a 16 byte swap

_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa \XMM1, \XMM5
        movdqa \XMM2, \XMM6
        movdqa \XMM3, \XMM7
        movdqa \XMM4, \XMM8

        movdqa SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa \XMM5, \TMP4
        pshufd $78, \XMM5, \TMP6
        pxor \XMM5, \TMP6
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa \XMM0, \XMM1
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa \XMM0, \XMM2
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa \XMM0, \XMM3
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1       # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4       # perform a 16 byte swap

        pxor (%arg1), \XMM1
        pxor (%arg1), \XMM2
        pxor (%arg1), \XMM3
        pxor (%arg1), \XMM4
        movdqa HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC \TMP1, \XMM1            # Round 1
        AESENC \TMP1, \XMM2
        AESENC \TMP1, \XMM3
        AESENC \TMP1, \XMM4
        movaps 0x20(%arg1), \TMP1
        AESENC \TMP1, \XMM1            # Round 2
        AESENC \TMP1, \XMM2
        AESENC \TMP1, \XMM3
        AESENC \TMP1, \XMM4
        movdqa \XMM6, \TMP1
        pshufd $78, \XMM6, \TMP2
        pxor \XMM6, \TMP2
        movdqa HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 3
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps 0x40(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 4
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        movdqa HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 5
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor \XMM6, \XMM5
        pxor \TMP2, \TMP6
        movdqa \XMM7, \TMP1
        pshufd $78, \XMM7, \TMP2
        pxor \XMM7, \TMP2
        movdqa HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps 0x60(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 6
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps 0x70(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 7
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        movdqa HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 8
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor \XMM7, \XMM5
        pxor \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa \XMM8, \TMP1
        pshufd $78, \XMM8, \TMP2
        pxor \XMM8, \TMP2
        movdqa HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 9
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        lea 0xa0(%arg1),%r10
        mov keysize,%eax
        shr $2,%eax                    # 128->4, 192->6, 256->8
        sub $4,%eax                    # 128->0, 192->2, 256->4
        jz aes_loop_par_enc_done

aes_loop_par_enc:
        MOVADQ (%r10),\TMP3
.irpc index, 1234
        AESENC \TMP3, %xmm\index
.endr
        add $16,%r10
        sub $1,%eax
        jnz aes_loop_par_enc

aes_loop_par_enc_done:
        MOVADQ (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1        # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu (%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM1              # Ciphertext/Plaintext XOR EK
        movdqu 16(%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM2              # Ciphertext/Plaintext XOR EK
        movdqu 32(%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM3              # Ciphertext/Plaintext XOR EK
        movdqu 48(%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM4              # Ciphertext/Plaintext XOR EK
        movdqu \XMM1, (%arg3,%r11,1)   # Write to the ciphertext buffer
        movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
        movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
        movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
        PSHUFB_XMM %xmm15, \XMM1       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4       # perform a 16 byte swap

        pxor \TMP4, \TMP1
        pxor \XMM8, \XMM5
        pxor \TMP6, \TMP2
        pxor \TMP1, \TMP2
        pxor \XMM5, \TMP2
        movdqa \TMP2, \TMP3
        pslldq $8, \TMP3               # left shift TMP3 2 DWs
        psrldq $8, \TMP2               # right shift TMP2 2 DWs
        pxor \TMP3, \XMM5
        pxor \TMP2, \TMP1              # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa \XMM5, \TMP2
        movdqa \XMM5, \TMP3
        movdqa \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld $31, \TMP2               # packed left shift << 31
        pslld $30, \TMP3               # packed left shift << 30
        pslld $25, \TMP4               # packed left shift << 25
        pxor \TMP3, \TMP2              # xor the shifted versions
        pxor \TMP4, \TMP2
        movdqa \TMP2, \TMP5
        psrldq $4, \TMP5               # right shift T5 1 DW
        pslldq $12, \TMP2              # left shift T2 3 DWs
        pxor \TMP2, \XMM5

        # second phase of reduction

        movdqa \XMM5,\TMP2             # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa \XMM5,\TMP3
        movdqa \XMM5,\TMP4
        psrld $1, \TMP2                # packed right shift >>1
        psrld $2, \TMP3                # packed right shift >>2
        psrld $7, \TMP4                # packed right shift >>7
        pxor \TMP3,\TMP2               # xor the shifted versions
        pxor \TMP4,\TMP2
        pxor \TMP5, \TMP2
        pxor \TMP2, \XMM5
        pxor \TMP1, \XMM5              # result is in XMM5

        pxor \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa \XMM1, \XMM5
        movdqa \XMM2, \XMM6
        movdqa \XMM3, \XMM7
        movdqa \XMM4, \XMM8

        movdqa SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa \XMM5, \TMP4
        pshufd $78, \XMM5, \TMP6
        pxor \XMM5, \TMP6
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa \XMM0, \XMM1
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa \XMM0, \XMM2
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa \XMM0, \XMM3
        paddd ONE(%rip), \XMM0         # INCR CNT
        movdqa \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1       # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4       # perform a 16 byte swap

        pxor (%arg1), \XMM1
        pxor (%arg1), \XMM2
        pxor (%arg1), \XMM3
        pxor (%arg1), \XMM4
        movdqa HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC \TMP1, \XMM1            # Round 1
        AESENC \TMP1, \XMM2
        AESENC \TMP1, \XMM3
        AESENC \TMP1, \XMM4
        movaps 0x20(%arg1), \TMP1
        AESENC \TMP1, \XMM1            # Round 2
        AESENC \TMP1, \XMM2
        AESENC \TMP1, \XMM3
        AESENC \TMP1, \XMM4
        movdqa \XMM6, \TMP1
        pshufd $78, \XMM6, \TMP2
        pxor \XMM6, \TMP2
        movdqa HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 3
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps 0x40(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 4
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        movdqa HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 5
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor \XMM6, \XMM5
        pxor \TMP2, \TMP6
        movdqa \XMM7, \TMP1
        pshufd $78, \XMM7, \TMP2
        pxor \XMM7, \TMP2
        movdqa HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps 0x60(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 6
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps 0x70(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 7
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        movdqa HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 8
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor \XMM7, \XMM5
        pxor \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa \XMM8, \TMP1
        pshufd $78, \XMM8, \TMP2
        pxor \XMM8, \TMP2
        movdqa HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC \TMP3, \XMM1            # Round 9
        AESENC \TMP3, \XMM2
        AESENC \TMP3, \XMM3
        AESENC \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        lea 0xa0(%arg1),%r10
        mov keysize,%eax
        shr $2,%eax                    # 128->4, 192->6, 256->8
        sub $4,%eax                    # 128->0, 192->2, 256->4
        jz aes_loop_par_dec_done

aes_loop_par_dec:
        MOVADQ (%r10),\TMP3
.irpc index, 1234
        AESENC \TMP3, %xmm\index
.endr
        add $16,%r10
        sub $1,%eax
        jnz aes_loop_par_dec

aes_loop_par_dec_done:
        MOVADQ (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1        # last round
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu (%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM1              # Ciphertext/Plaintext XOR EK
        movdqu \XMM1, (%arg3,%r11,1)   # Write to plaintext buffer
        movdqa \TMP3, \XMM1
        movdqu 16(%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM2              # Ciphertext/Plaintext XOR EK
        movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
        movdqa \TMP3, \XMM2
        movdqu 32(%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM3              # Ciphertext/Plaintext XOR EK
        movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
        movdqa \TMP3, \XMM3
        movdqu 48(%arg4,%r11,1), \TMP3
        pxor \TMP3, \XMM4              # Ciphertext/Plaintext XOR EK
        movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
        movdqa \TMP3, \XMM4
        PSHUFB_XMM %xmm15, \XMM1       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3       # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4       # perform a 16 byte swap

        pxor \TMP4, \TMP1
        pxor \XMM8, \XMM5
        pxor \TMP6, \TMP2
        pxor \TMP1, \TMP2
        pxor \XMM5, \TMP2
        movdqa \TMP2, \TMP3
        pslldq $8, \TMP3               # left shift TMP3 2 DWs
        psrldq $8, \TMP2               # right shift TMP2 2 DWs
        pxor \TMP3, \XMM5
        pxor \TMP2, \TMP1              # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa \XMM5, \TMP2
        movdqa \XMM5, \TMP3
        movdqa \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld $31, \TMP2               # packed left shift << 31
        pslld $30, \TMP3               # packed left shift << 30
        pslld $25, \TMP4               # packed left shift << 25
        pxor \TMP3, \TMP2              # xor the shifted versions
        pxor \TMP4, \TMP2
        movdqa \TMP2, \TMP5
        psrldq $4, \TMP5               # right shift T5 1 DW
        pslldq $12, \TMP2              # left shift T2 3 DWs
        pxor \TMP2, \XMM5

        # second phase of reduction

        movdqa \XMM5,\TMP2             # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa \XMM5,\TMP3
        movdqa \XMM5,\TMP4
        psrld $1, \TMP2                # packed right shift >>1
        psrld $2, \TMP3                # packed right shift >>2
        psrld $7, \TMP4                # packed right shift >>7
        pxor \TMP3,\TMP2               # xor the shifted versions
        pxor \TMP4,\TMP2
        pxor \TMP5, \TMP2
        pxor \TMP2, \XMM5
        pxor \TMP1, \XMM5              # result is in XMM5

        pxor \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)

        movdqa \XMM1, \TMP6
        pshufd $78, \XMM1, \TMP2
        pxor \XMM1, \TMP2
        movdqa HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6   # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1   # XMM1 = a0*b0
        movdqa HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqa \XMM1, \XMMDst
        movdqa \TMP2, \XMM1            # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

        movdqa \XMM2, \TMP1
        pshufd $78, \XMM2, \TMP2
        pxor \XMM2, \TMP2
        movdqa HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2   # XMM2 = a0*b0
        movdqa HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor \TMP1, \TMP6
        pxor \XMM2, \XMMDst
        pxor \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

        movdqa \XMM3, \TMP1
        pshufd $78, \XMM3, \TMP2
        pxor \XMM3, \TMP2
        movdqa HashKey_2(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3   # XMM3 = a0*b0
        movdqa HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor \TMP1, \TMP6
        pxor \XMM3, \XMMDst
        pxor \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
        movdqa \XMM4, \TMP1
        pshufd $78, \XMM4, \TMP2
        pxor \XMM4, \TMP2
        movdqa HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4   # XMM4 = a0*b0
        movdqa HashKey_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor \TMP1, \TMP6
        pxor \XMM4, \XMMDst
        pxor \XMM1, \TMP2
        pxor \TMP6, \TMP2
        pxor \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
        movdqa \TMP2, \TMP4
        pslldq $8, \TMP4               # left shift TMP4 2 DWs
        psrldq $8, \TMP2               # right shift TMP2 2 DWs
        pxor \TMP4, \XMMDst
        pxor \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
        # first phase of the reduction
        movdqa \XMMDst, \TMP2
        movdqa \XMMDst, \TMP3
        movdqa \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld $31, \TMP2               # packed left shifting << 31
        pslld $30, \TMP3               # packed left shifting << 30
        pslld $25, \TMP4               # packed left shifting << 25
        pxor \TMP3, \TMP2              # xor the shifted versions
        pxor \TMP4, \TMP2
        movdqa \TMP2, \TMP7
        psrldq $4, \TMP7               # right shift TMP7 1 DW
        pslldq $12, \TMP2              # left shift TMP2 3 DWs
        pxor \TMP2, \XMMDst

        # second phase of the reduction
        movdqa \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa \XMMDst, \TMP3
        movdqa \XMMDst, \TMP4
        psrld $1, \TMP2                # packed right shift >> 1
        psrld $2, \TMP3                # packed right shift >> 2
        psrld $7, \TMP4                # packed right shift >> 7
        pxor \TMP3, \TMP2              # xor the shifted versions
        pxor \TMP4, \TMP2
        pxor \TMP7, \TMP2
        pxor \TMP2, \XMMDst
        pxor \TMP6, \XMMDst            # reduced result is in XMMDst
.endm



/* Encryption of a single block
* uses eax & r10
*/

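# keysize holds the key length in bytes (16/24/32); shifting right by 2 and
# adding 5 gives 9/11/13, the number of middle AESENC rounds for
# AES-128/192/256, with AESENCLAST supplying the final round.
# (Comment added for clarity.)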
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

        pxor (%arg1), \XMM0
        mov keysize,%eax
        shr $2,%eax                    # 128->4, 192->6, 256->8
        add $5,%eax                    # 128->9, 192->11, 256->13
        lea 16(%arg1), %r10            # get first expanded key address

_esb_loop_\@:
        MOVADQ (%r10),\TMP1
        AESENC \TMP1,\XMM0
        add $16,%r10
        sub $1,%eax
        jnz _esb_loop_\@

        MOVADQ (%r10),\TMP1
        AESENCLAST \TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Plaintext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len,  // Length of data in bytes for decryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output. The driver will compare this to the
*                                        // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16
*                                        // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                          Salt  (From the SA)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[2] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             SPI (A1)                          |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 32-bit Sequence Number (A0)                   |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                            AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             SPI (A2)                          |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                     AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
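/*
* Illustrative C call (added for clarity, not part of the original ABI
* notes; assumes the caller already holds the SIMD context, e.g. via
* kernel_fpu_begin(), and that gdata is a struct gcm_context_data):
*
*      aesni_gcm_dec(aes_ctx, &gdata, out, in, ciphertext_len, iv,
*                    hash_subkey, aad, aad_len, auth_tag, auth_tag_len);
*/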
ENTRY(aesni_gcm_dec)
        FUNC_SAVE

        GCM_INIT
        GCM_ENC_DEC dec
        GCM_COMPLETE
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                          Salt  (From the SA)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[2] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             SPI (A1)                          |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 32-bit Sequence Number (A0)                   |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                            AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             SPI (A2)                          |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                     AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
        FUNC_SAVE

        GCM_INIT
        GCM_ENC_DEC enc
        GCM_COMPLETE
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc)

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0
        movaps %xmm0, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

.align 4
_key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        movaps %xmm2, %xmm6
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, %xmm1
        shufps $0b01000100, %xmm0, %xmm6
        movaps %xmm6, (TKEYP)
        shufps $0b01001110, %xmm2, %xmm1
        movaps %xmm1, 0x10(TKEYP)
        add $0x20, TKEYP
        ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
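/*
 * Illustrative usage sketch from C (added for clarity; assumes SIMD is
 * usable, e.g. between kernel_fpu_begin()/kernel_fpu_end(), and that
 * "key" is a hypothetical 16/24/32-byte buffer):
 *
 *      struct crypto_aes_ctx ctx;
 *      int err = aesni_set_key(&ctx, key, 16);
 */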
ENTRY(aesni_set_key)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl KEYP
        movl (FRAME_OFFSET+8)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+12)(%esp), UKEYP    # in_key
        movl (FRAME_OFFSET+16)(%esp), %edx     # key_len
#endif
        movups (UKEYP), %xmm0          # user key (first 16 bytes)
        movaps %xmm0, (KEYP)
        lea 0x10(KEYP), TKEYP          # key addr
        movl %edx, 480(KEYP)
        pxor %xmm4, %xmm4              # xmm4 is assumed 0 in _key_expansion_x
        cmp $24, %dl
        jb .Lenc_key128
        je .Lenc_key192
        movups 0x10(UKEYP), %xmm2      # other user key
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1        # round 1
        call _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1        # round 2
        call _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1        # round 3
        call _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1        # round 4
        call _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1       # round 5
        call _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1       # round 6
        call _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1       # round 7
        call _key_expansion_256a
        jmp .Ldec_key
.Lenc_key192:
        movq 0x10(UKEYP), %xmm2        # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1        # round 1
        call _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1        # round 2
        call _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1        # round 3
        call _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1        # round 4
        call _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1       # round 5
        call _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1       # round 6
        call _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1       # round 7
        call _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1       # round 8
        call _key_expansion_192b
        jmp .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1        # round 1
        call _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1        # round 2
        call _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1        # round 3
        call _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1        # round 4
        call _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1       # round 5
        call _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1       # round 6
        call _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1       # round 7
        call _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1       # round 8
        call _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1       # round 9
        call _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1       # round 10
        call _key_expansion_128
.Ldec_key:
        sub $0x10, TKEYP
        movaps (KEYP), %xmm0
        movaps (TKEYP), %xmm1
        movaps %xmm0, 240(TKEYP)
        movaps %xmm1, 240(KEYP)
        add $0x10, KEYP
        lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps (KEYP), %xmm0
        AESIMC %xmm0 %xmm1
        movaps %xmm1, (UKEYP)
        add $0x10, KEYP
        sub $0x10, UKEYP
        cmp TKEYP, KEYP
        jb .Ldec_key_loop
        xor AREG, AREG
#ifndef __x86_64__
        popl KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_set_key)

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+12)(%esp), KEYP     # ctx
        movl (FRAME_OFFSET+16)(%esp), OUTP     # dst
        movl (FRAME_OFFSET+20)(%esp), INP      # src
#endif
        movl 480(KEYP), KLEN           # key length
        movups (INP), STATE            # input
        call _aesni_enc1
        movups STATE, (OUTP)           # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        FRAME_END
        ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1:         internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           round count
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
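/*
 * Comment added for clarity: TKEYP is biased so that the shared .Lenc128
 * tail can address every round key at a fixed offset for all key sizes -
 * 192- and 256-bit keys simply execute two or four extra rounds at more
 * negative offsets before falling through.
 */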
0d258efb 1693.align 4
54b6a1bd
HY
1694_aesni_enc1:
1695 movaps (KEYP), KEY # key
1696 mov KEYP, TKEYP
1697 pxor KEY, STATE # round 0
1698 add $0x30, TKEYP
1699 cmp $24, KLEN
1700 jb .Lenc128
1701 lea 0x20(TKEYP), TKEYP
1702 je .Lenc192
1703 add $0x20, TKEYP
1704 movaps -0x60(TKEYP), KEY
b369e521 1705 AESENC KEY STATE
54b6a1bd 1706 movaps -0x50(TKEYP), KEY
b369e521 1707 AESENC KEY STATE
54b6a1bd
HY
1708.align 4
1709.Lenc192:
1710 movaps -0x40(TKEYP), KEY
b369e521 1711 AESENC KEY STATE
54b6a1bd 1712 movaps -0x30(TKEYP), KEY
b369e521 1713 AESENC KEY STATE
54b6a1bd
HY
1714.align 4
1715.Lenc128:
1716 movaps -0x20(TKEYP), KEY
b369e521 1717 AESENC KEY STATE
54b6a1bd 1718 movaps -0x10(TKEYP), KEY
b369e521 1719 AESENC KEY STATE
54b6a1bd 1720 movaps (TKEYP), KEY
b369e521 1721 AESENC KEY STATE
54b6a1bd 1722 movaps 0x10(TKEYP), KEY
b369e521 1723 AESENC KEY STATE
54b6a1bd 1724 movaps 0x20(TKEYP), KEY
b369e521 1725 AESENC KEY STATE
54b6a1bd 1726 movaps 0x30(TKEYP), KEY
b369e521 1727 AESENC KEY STATE
54b6a1bd 1728 movaps 0x40(TKEYP), KEY
b369e521 1729 AESENC KEY STATE
54b6a1bd 1730 movaps 0x50(TKEYP), KEY
b369e521 1731 AESENC KEY STATE
54b6a1bd 1732 movaps 0x60(TKEYP), KEY
b369e521 1733 AESENC KEY STATE
54b6a1bd 1734 movaps 0x70(TKEYP), KEY
b369e521 1735 AESENCLAST KEY STATE
54b6a1bd 1736 ret
8309b745 1737ENDPROC(_aesni_enc1)
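/*
 * Illustrative sketch (not part of the original file): _aesni_enc1 keys
 * the round count off KLEN (16/24/32-byte keys -> 10/12/14 rounds) by
 * jumping into the middle of one straight-line round sequence. A C model
 * of the same computation with AES-NI intrinsics, assuming an expanded
 * schedule rk[0..rounds]:
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes_enc_block(const __m128i *rk, int key_len,
 *				     __m128i state)
 *	{
 *		int rounds = key_len / 4 + 6;	// 16->10, 24->12, 32->14
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);	// round 0
 *		for (i = 1; i < rounds; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[rounds]);
 *	}
 */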
54b6a1bd
HY
1738
1739/*
1740 * _aesni_enc4: internal ABI
1741 * input:
1742 * KEYP: key struct pointer
1743 * KLEN: key length
1744 * STATE1: initial state (input)
1745 * STATE2
1746 * STATE3
1747 * STATE4
1748 * output:
1749 * STATE1: final state (output)
1750 * STATE2
1751 * STATE3
1752 * STATE4
1753 * changed:
1754 * KEY
1755 * TKEYP (T1)
1756 */
0d258efb 1757.align 4
54b6a1bd
HY
1758_aesni_enc4:
1759 movaps (KEYP), KEY # key
1760 mov KEYP, TKEYP
1761 pxor KEY, STATE1 # round 0
1762 pxor KEY, STATE2
1763 pxor KEY, STATE3
1764 pxor KEY, STATE4
1765 add $0x30, TKEYP
1766 cmp $24, KLEN
1767 jb .L4enc128
1768 lea 0x20(TKEYP), TKEYP
1769 je .L4enc192
1770 add $0x20, TKEYP
1771 movaps -0x60(TKEYP), KEY
b369e521
HY
1772 AESENC KEY STATE1
1773 AESENC KEY STATE2
1774 AESENC KEY STATE3
1775 AESENC KEY STATE4
54b6a1bd 1776 movaps -0x50(TKEYP), KEY
b369e521
HY
1777 AESENC KEY STATE1
1778 AESENC KEY STATE2
1779 AESENC KEY STATE3
1780 AESENC KEY STATE4
54b6a1bd
HY
1781#.align 4
1782.L4enc192:
1783 movaps -0x40(TKEYP), KEY
b369e521
HY
1784 AESENC KEY STATE1
1785 AESENC KEY STATE2
1786 AESENC KEY STATE3
1787 AESENC KEY STATE4
54b6a1bd 1788 movaps -0x30(TKEYP), KEY
b369e521
HY
1789 AESENC KEY STATE1
1790 AESENC KEY STATE2
1791 AESENC KEY STATE3
1792 AESENC KEY STATE4
54b6a1bd
HY
1793#.align 4
1794.L4enc128:
1795 movaps -0x20(TKEYP), KEY
b369e521
HY
1796 AESENC KEY STATE1
1797 AESENC KEY STATE2
1798 AESENC KEY STATE3
1799 AESENC KEY STATE4
54b6a1bd 1800 movaps -0x10(TKEYP), KEY
b369e521
HY
1801 AESENC KEY STATE1
1802 AESENC KEY STATE2
1803 AESENC KEY STATE3
1804 AESENC KEY STATE4
54b6a1bd 1805 movaps (TKEYP), KEY
b369e521
HY
1806 AESENC KEY STATE1
1807 AESENC KEY STATE2
1808 AESENC KEY STATE3
1809 AESENC KEY STATE4
54b6a1bd 1810 movaps 0x10(TKEYP), KEY
b369e521
HY
1811 AESENC KEY STATE1
1812 AESENC KEY STATE2
1813 AESENC KEY STATE3
1814 AESENC KEY STATE4
54b6a1bd 1815 movaps 0x20(TKEYP), KEY
b369e521
HY
1816 AESENC KEY STATE1
1817 AESENC KEY STATE2
1818 AESENC KEY STATE3
1819 AESENC KEY STATE4
54b6a1bd 1820 movaps 0x30(TKEYP), KEY
b369e521
HY
1821 AESENC KEY STATE1
1822 AESENC KEY STATE2
1823 AESENC KEY STATE3
1824 AESENC KEY STATE4
54b6a1bd 1825 movaps 0x40(TKEYP), KEY
b369e521
HY
1826 AESENC KEY STATE1
1827 AESENC KEY STATE2
1828 AESENC KEY STATE3
1829 AESENC KEY STATE4
54b6a1bd 1830 movaps 0x50(TKEYP), KEY
b369e521
HY
1831 AESENC KEY STATE1
1832 AESENC KEY STATE2
1833 AESENC KEY STATE3
1834 AESENC KEY STATE4
54b6a1bd 1835 movaps 0x60(TKEYP), KEY
b369e521
HY
1836 AESENC KEY STATE1
1837 AESENC KEY STATE2
1838 AESENC KEY STATE3
1839 AESENC KEY STATE4
54b6a1bd 1840 movaps 0x70(TKEYP), KEY
b369e521
HY
1841 AESENCLAST KEY STATE1 # last round
1842 AESENCLAST KEY STATE2
1843 AESENCLAST KEY STATE3
1844 AESENCLAST KEY STATE4
54b6a1bd 1845 ret
8309b745 1846ENDPROC(_aesni_enc4)
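/*
 * Illustrative note (not part of the original file): _aesni_enc4 applies
 * each round key to four independent states back to back, so the
 * multi-cycle AESENC latencies overlap instead of serializing. In
 * intrinsics terms, each round is simply:
 *
 *	s1 = _mm_aesenc_si128(s1, key);		// four independent
 *	s2 = _mm_aesenc_si128(s2, key);		// dependency chains,
 *	s3 = _mm_aesenc_si128(s3, key);		// so the pipeline
 *	s4 = _mm_aesenc_si128(s4, key);		// stays full
 */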
54b6a1bd
HY
1847
1848/*
1849 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1850 */
1851ENTRY(aesni_dec)
8691ccd7 1852 FRAME_BEGIN
0d258efb
MK
1853#ifndef __x86_64__
1854 pushl KEYP
1855 pushl KLEN
8691ccd7
JP
1856 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1857 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1858 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 1859#endif
54b6a1bd
HY
1860 mov 480(KEYP), KLEN # key length
1861 add $240, KEYP
1862 movups (INP), STATE # input
1863 call _aesni_dec1
1864 movups STATE, (OUTP) # output
0d258efb
MK
1865#ifndef __x86_64__
1866 popl KLEN
1867 popl KEYP
1868#endif
8691ccd7 1869 FRAME_END
54b6a1bd 1870 ret
8309b745 1871ENDPROC(aesni_dec)
54b6a1bd
HY
1872
1873/*
1874 * _aesni_dec1: internal ABI
1875 * input:
1876 * KEYP: key struct pointer
1877 * KLEN: key length
1878 * STATE: initial state (input)
1879 * output:
1880 * STATE: final state (output)
1881 * changed:
1882 * KEY
1883 * TKEYP (T1)
1884 */
0d258efb 1885.align 4
54b6a1bd
HY
1886_aesni_dec1:
1887 movaps (KEYP), KEY # key
1888 mov KEYP, TKEYP
1889 pxor KEY, STATE # round 0
1890 add $0x30, TKEYP
1891 cmp $24, KLEN
1892 jb .Ldec128
1893 lea 0x20(TKEYP), TKEYP
1894 je .Ldec192
1895 add $0x20, TKEYP
1896 movaps -0x60(TKEYP), KEY
b369e521 1897 AESDEC KEY STATE
54b6a1bd 1898 movaps -0x50(TKEYP), KEY
b369e521 1899 AESDEC KEY STATE
54b6a1bd
HY
1900.align 4
1901.Ldec192:
1902 movaps -0x40(TKEYP), KEY
b369e521 1903 AESDEC KEY STATE
54b6a1bd 1904 movaps -0x30(TKEYP), KEY
b369e521 1905 AESDEC KEY STATE
54b6a1bd
HY
1906.align 4
1907.Ldec128:
1908 movaps -0x20(TKEYP), KEY
b369e521 1909 AESDEC KEY STATE
54b6a1bd 1910 movaps -0x10(TKEYP), KEY
b369e521 1911 AESDEC KEY STATE
54b6a1bd 1912 movaps (TKEYP), KEY
b369e521 1913 AESDEC KEY STATE
54b6a1bd 1914 movaps 0x10(TKEYP), KEY
b369e521 1915 AESDEC KEY STATE
54b6a1bd 1916 movaps 0x20(TKEYP), KEY
b369e521 1917 AESDEC KEY STATE
54b6a1bd 1918 movaps 0x30(TKEYP), KEY
b369e521 1919 AESDEC KEY STATE
54b6a1bd 1920 movaps 0x40(TKEYP), KEY
b369e521 1921 AESDEC KEY STATE
54b6a1bd 1922 movaps 0x50(TKEYP), KEY
b369e521 1923 AESDEC KEY STATE
54b6a1bd 1924 movaps 0x60(TKEYP), KEY
b369e521 1925 AESDEC KEY STATE
54b6a1bd 1926 movaps 0x70(TKEYP), KEY
b369e521 1927 AESDECLAST KEY STATE
54b6a1bd 1928 ret
8309b745 1929ENDPROC(_aesni_dec1)
54b6a1bd
HY
1930
1931/*
1932 * _aesni_dec4: internal ABI
1933 * input:
1934 * KEYP: key struct pointer
1935 * KLEN: key length
1936 * STATE1: initial state (input)
1937 * STATE2
1938 * STATE3
1939 * STATE4
1940 * output:
1941 * STATE1: final state (output)
1942 * STATE2
1943 * STATE3
1944 * STATE4
1945 * changed:
1946 * KEY
1947 * TKEYP (T1)
1948 */
0d258efb 1949.align 4
54b6a1bd
HY
1950_aesni_dec4:
1951 movaps (KEYP), KEY # key
1952 mov KEYP, TKEYP
1953 pxor KEY, STATE1 # round 0
1954 pxor KEY, STATE2
1955 pxor KEY, STATE3
1956 pxor KEY, STATE4
1957 add $0x30, TKEYP
1958 cmp $24, KLEN
1959 jb .L4dec128
1960 lea 0x20(TKEYP), TKEYP
1961 je .L4dec192
1962 add $0x20, TKEYP
1963 movaps -0x60(TKEYP), KEY
b369e521
HY
1964 AESDEC KEY STATE1
1965 AESDEC KEY STATE2
1966 AESDEC KEY STATE3
1967 AESDEC KEY STATE4
54b6a1bd 1968 movaps -0x50(TKEYP), KEY
b369e521
HY
1969 AESDEC KEY STATE1
1970 AESDEC KEY STATE2
1971 AESDEC KEY STATE3
1972 AESDEC KEY STATE4
54b6a1bd
HY
1973.align 4
1974.L4dec192:
1975 movaps -0x40(TKEYP), KEY
b369e521
HY
1976 AESDEC KEY STATE1
1977 AESDEC KEY STATE2
1978 AESDEC KEY STATE3
1979 AESDEC KEY STATE4
54b6a1bd 1980 movaps -0x30(TKEYP), KEY
b369e521
HY
1981 AESDEC KEY STATE1
1982 AESDEC KEY STATE2
1983 AESDEC KEY STATE3
1984 AESDEC KEY STATE4
54b6a1bd
HY
1985.align 4
1986.L4dec128:
1987 movaps -0x20(TKEYP), KEY
b369e521
HY
1988 AESDEC KEY STATE1
1989 AESDEC KEY STATE2
1990 AESDEC KEY STATE3
1991 AESDEC KEY STATE4
54b6a1bd 1992 movaps -0x10(TKEYP), KEY
b369e521
HY
1993 AESDEC KEY STATE1
1994 AESDEC KEY STATE2
1995 AESDEC KEY STATE3
1996 AESDEC KEY STATE4
54b6a1bd 1997 movaps (TKEYP), KEY
b369e521
HY
1998 AESDEC KEY STATE1
1999 AESDEC KEY STATE2
2000 AESDEC KEY STATE3
2001 AESDEC KEY STATE4
54b6a1bd 2002 movaps 0x10(TKEYP), KEY
b369e521
HY
2003 AESDEC KEY STATE1
2004 AESDEC KEY STATE2
2005 AESDEC KEY STATE3
2006 AESDEC KEY STATE4
54b6a1bd 2007 movaps 0x20(TKEYP), KEY
b369e521
HY
2008 AESDEC KEY STATE1
2009 AESDEC KEY STATE2
2010 AESDEC KEY STATE3
2011 AESDEC KEY STATE4
54b6a1bd 2012 movaps 0x30(TKEYP), KEY
b369e521
HY
2013 AESDEC KEY STATE1
2014 AESDEC KEY STATE2
2015 AESDEC KEY STATE3
2016 AESDEC KEY STATE4
54b6a1bd 2017 movaps 0x40(TKEYP), KEY
b369e521
HY
2018 AESDEC KEY STATE1
2019 AESDEC KEY STATE2
2020 AESDEC KEY STATE3
2021 AESDEC KEY STATE4
54b6a1bd 2022 movaps 0x50(TKEYP), KEY
b369e521
HY
2023 AESDEC KEY STATE1
2024 AESDEC KEY STATE2
2025 AESDEC KEY STATE3
2026 AESDEC KEY STATE4
54b6a1bd 2027 movaps 0x60(TKEYP), KEY
b369e521
HY
2028 AESDEC KEY STATE1
2029 AESDEC KEY STATE2
2030 AESDEC KEY STATE3
2031 AESDEC KEY STATE4
54b6a1bd 2032 movaps 0x70(TKEYP), KEY
b369e521
HY
2033 AESDECLAST KEY STATE1 # last round
2034 AESDECLAST KEY STATE2
2035 AESDECLAST KEY STATE3
2036 AESDECLAST KEY STATE4
54b6a1bd 2037 ret
8309b745 2038ENDPROC(_aesni_dec4)
54b6a1bd
HY
2039
2040/*
2041 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2042 * size_t len)
2043 */
2044ENTRY(aesni_ecb_enc)
8691ccd7 2045 FRAME_BEGIN
0d258efb
MK
2046#ifndef __x86_64__
2047 pushl LEN
2048 pushl KEYP
2049 pushl KLEN
8691ccd7
JP
2050 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2051 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2052 movl (FRAME_OFFSET+24)(%esp), INP # src
2053 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2054#endif
54b6a1bd
HY
2055 test LEN, LEN # check length
2056 jz .Lecb_enc_ret
2057 mov 480(KEYP), KLEN
2058 cmp $16, LEN
2059 jb .Lecb_enc_ret
2060 cmp $64, LEN
2061 jb .Lecb_enc_loop1
2062.align 4
2063.Lecb_enc_loop4:
2064 movups (INP), STATE1
2065 movups 0x10(INP), STATE2
2066 movups 0x20(INP), STATE3
2067 movups 0x30(INP), STATE4
2068 call _aesni_enc4
2069 movups STATE1, (OUTP)
2070 movups STATE2, 0x10(OUTP)
2071 movups STATE3, 0x20(OUTP)
2072 movups STATE4, 0x30(OUTP)
2073 sub $64, LEN
2074 add $64, INP
2075 add $64, OUTP
2076 cmp $64, LEN
2077 jge .Lecb_enc_loop4
2078 cmp $16, LEN
2079 jb .Lecb_enc_ret
2080.align 4
2081.Lecb_enc_loop1:
2082 movups (INP), STATE1
2083 call _aesni_enc1
2084 movups STATE1, (OUTP)
2085 sub $16, LEN
2086 add $16, INP
2087 add $16, OUTP
2088 cmp $16, LEN
2089 jge .Lecb_enc_loop1
2090.Lecb_enc_ret:
0d258efb
MK
2091#ifndef __x86_64__
2092 popl KLEN
2093 popl KEYP
2094 popl LEN
2095#endif
8691ccd7 2096 FRAME_END
54b6a1bd 2097 ret
8309b745 2098ENDPROC(aesni_ecb_enc)
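/*
 * Illustrative sketch (not part of the original file): the control flow
 * above in C, assuming hypothetical one-block/four-block helpers with the
 * same contracts as _aesni_enc1 and _aesni_enc4:
 *
 *	while (len >= 64) {		// 4 blocks per iteration
 *		enc4(ctx, out, in);
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {		// one-block tail
 *		enc1(ctx, out, in);
 *		in += 16; out += 16; len -= 16;
 *	}
 *	// a sub-16-byte remainder is left untouched, as in the assembly
 */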
54b6a1bd
HY
2099
2100/*
2101 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2102 * size_t len)
2103 */
2104ENTRY(aesni_ecb_dec)
8691ccd7 2105 FRAME_BEGIN
0d258efb
MK
2106#ifndef __x86_64__
2107 pushl LEN
2108 pushl KEYP
2109 pushl KLEN
8691ccd7
JP
2110 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2111 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2112 movl (FRAME_OFFSET+24)(%esp), INP # src
2113 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2114#endif
54b6a1bd
HY
2115 test LEN, LEN
2116 jz .Lecb_dec_ret
2117 mov 480(KEYP), KLEN
2118 add $240, KEYP
2119 cmp $16, LEN
2120 jb .Lecb_dec_ret
2121 cmp $64, LEN
2122 jb .Lecb_dec_loop1
2123.align 4
2124.Lecb_dec_loop4:
2125 movups (INP), STATE1
2126 movups 0x10(INP), STATE2
2127 movups 0x20(INP), STATE3
2128 movups 0x30(INP), STATE4
2129 call _aesni_dec4
2130 movups STATE1, (OUTP)
2131 movups STATE2, 0x10(OUTP)
2132 movups STATE3, 0x20(OUTP)
2133 movups STATE4, 0x30(OUTP)
2134 sub $64, LEN
2135 add $64, INP
2136 add $64, OUTP
2137 cmp $64, LEN
2138 jge .Lecb_dec_loop4
2139 cmp $16, LEN
2140 jb .Lecb_dec_ret
2141.align 4
2142.Lecb_dec_loop1:
2143 movups (INP), STATE1
2144 call _aesni_dec1
2145 movups STATE1, (OUTP)
2146 sub $16, LEN
2147 add $16, INP
2148 add $16, OUTP
2149 cmp $16, LEN
2150 jge .Lecb_dec_loop1
2151.Lecb_dec_ret:
0d258efb
MK
2152#ifndef __x86_64__
2153 popl KLEN
2154 popl KEYP
2155 popl LEN
2156#endif
8691ccd7 2157 FRAME_END
54b6a1bd 2158 ret
8309b745 2159ENDPROC(aesni_ecb_dec)
54b6a1bd
HY
2160
2161/*
2162 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2163 * size_t len, u8 *iv)
2164 */
2165ENTRY(aesni_cbc_enc)
8691ccd7 2166 FRAME_BEGIN
0d258efb
MK
2167#ifndef __x86_64__
2168 pushl IVP
2169 pushl LEN
2170 pushl KEYP
2171 pushl KLEN
8691ccd7
JP
2172 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2173 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2174 movl (FRAME_OFFSET+28)(%esp), INP # src
2175 movl (FRAME_OFFSET+32)(%esp), LEN # len
2176 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2177#endif
54b6a1bd
HY
2178 cmp $16, LEN
2179 jb .Lcbc_enc_ret
2180 mov 480(KEYP), KLEN
2181 movups (IVP), STATE # load iv as initial state
2182.align 4
2183.Lcbc_enc_loop:
2184 movups (INP), IN # load input
2185 pxor IN, STATE
2186 call _aesni_enc1
2187 movups STATE, (OUTP) # store output
2188 sub $16, LEN
2189 add $16, INP
2190 add $16, OUTP
2191 cmp $16, LEN
2192 jge .Lcbc_enc_loop
2193 movups STATE, (IVP)
2194.Lcbc_enc_ret:
0d258efb
MK
2195#ifndef __x86_64__
2196 popl KLEN
2197 popl KEYP
2198 popl LEN
2199 popl IVP
2200#endif
8691ccd7 2201 FRAME_END
54b6a1bd 2202 ret
8309b745 2203ENDPROC(aesni_cbc_enc)
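/*
 * Illustrative sketch (not part of the original file): CBC encryption is
 * inherently serial (each block depends on the previous ciphertext),
 * which is why the loop above handles one block per iteration. In C,
 * reusing the aes_enc_block() sketch from earlier, with load()/store()
 * standing in for movups-style unaligned accesses:
 *
 *	__m128i iv = load(ivp);
 *	while (len >= 16) {
 *		iv = aes_enc_block(rk, key_len,
 *				   _mm_xor_si128(iv, load(in)));
 *		store(out, iv);		// ciphertext is the next IV
 *		in += 16; out += 16; len -= 16;
 *	}
 *	store(ivp, iv);			// hand the chaining value back
 */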
54b6a1bd
HY
2204
2205/*
2206 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2207 * size_t len, u8 *iv)
2208 */
2209ENTRY(aesni_cbc_dec)
8691ccd7 2210 FRAME_BEGIN
0d258efb
MK
2211#ifndef __x86_64__
2212 pushl IVP
2213 pushl LEN
2214 pushl KEYP
2215 pushl KLEN
8691ccd7
JP
2216 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2217 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2218 movl (FRAME_OFFSET+28)(%esp), INP # src
2219 movl (FRAME_OFFSET+32)(%esp), LEN # len
2220 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2221#endif
54b6a1bd 2222 cmp $16, LEN
e6efaa02 2223 jb .Lcbc_dec_just_ret
54b6a1bd
HY
2224 mov 480(KEYP), KLEN
2225 add $240, KEYP
2226 movups (IVP), IV
2227 cmp $64, LEN
2228 jb .Lcbc_dec_loop1
2229.align 4
2230.Lcbc_dec_loop4:
2231 movups (INP), IN1
2232 movaps IN1, STATE1
2233 movups 0x10(INP), IN2
2234 movaps IN2, STATE2
0d258efb 2235#ifdef __x86_64__
54b6a1bd
HY
2236 movups 0x20(INP), IN3
2237 movaps IN3, STATE3
2238 movups 0x30(INP), IN4
2239 movaps IN4, STATE4
0d258efb
MK
2240#else
2241 movups 0x20(INP), IN1
2242 movaps IN1, STATE3
2243 movups 0x30(INP), IN2
2244 movaps IN2, STATE4
2245#endif
54b6a1bd
HY
2246 call _aesni_dec4
2247 pxor IV, STATE1
0d258efb 2248#ifdef __x86_64__
54b6a1bd
HY
2249 pxor IN1, STATE2
2250 pxor IN2, STATE3
2251 pxor IN3, STATE4
2252 movaps IN4, IV
0d258efb 2253#else
0d258efb
MK
2254 pxor IN1, STATE4
2255 movaps IN2, IV
7c8d5184
MK
2256 movups (INP), IN1
2257 pxor IN1, STATE2
2258 movups 0x10(INP), IN2
2259 pxor IN2, STATE3
0d258efb 2260#endif
54b6a1bd
HY
2261 movups STATE1, (OUTP)
2262 movups STATE2, 0x10(OUTP)
2263 movups STATE3, 0x20(OUTP)
2264 movups STATE4, 0x30(OUTP)
2265 sub $64, LEN
2266 add $64, INP
2267 add $64, OUTP
2268 cmp $64, LEN
2269 jge .Lcbc_dec_loop4
2270 cmp $16, LEN
2271 jb .Lcbc_dec_ret
2272.align 4
2273.Lcbc_dec_loop1:
2274 movups (INP), IN
2275 movaps IN, STATE
2276 call _aesni_dec1
2277 pxor IV, STATE
2278 movups STATE, (OUTP)
2279 movaps IN, IV
2280 sub $16, LEN
2281 add $16, INP
2282 add $16, OUTP
2283 cmp $16, LEN
2284 jge .Lcbc_dec_loop1
54b6a1bd 2285.Lcbc_dec_ret:
e6efaa02
HY
2286 movups IV, (IVP)
2287.Lcbc_dec_just_ret:
0d258efb
MK
2288#ifndef __x86_64__
2289 popl KLEN
2290 popl KEYP
2291 popl LEN
2292 popl IVP
2293#endif
8691ccd7 2294 FRAME_END
54b6a1bd 2295 ret
8309b745 2296ENDPROC(aesni_cbc_dec)
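/*
 * Illustrative note (not part of the original file): unlike encryption,
 * CBC decryption has no chain between block-cipher invocations:
 *
 *	P[i] = D(C[i]) ^ C[i-1]		(C[-1] is the IV)
 *
 * Each plaintext needs only the previous ciphertext, which is already at
 * hand, so the loop above decrypts four blocks in parallel and XORs
 * afterwards. On 32-bit, where XMM registers are scarce, the first two
 * input blocks are reloaded from memory instead of being kept live; the
 * last input block becomes the IV for the next call either way.
 */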
12387a46 2297
0d258efb 2298#ifdef __x86_64__
1253cab8 2299.pushsection .rodata
12387a46
HY
2300.align 16
2301.Lbswap_mask:
2302 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
1253cab8 2303.popsection
12387a46
HY
2304
2305/*
2306 * _aesni_inc_init: internal ABI
2307 * setup registers used by _aesni_inc
2308 * input:
2309 * IV
2310 * output:
2311 * CTR: == IV, in little endian
2312 * TCTR_LOW: == lower qword of CTR
2313 * INC: == 1, in little endian
2314 * BSWAP_MASK == endian swapping mask
2315 */
0d258efb 2316.align 4
12387a46
HY
2317_aesni_inc_init:
2318 movaps .Lbswap_mask, BSWAP_MASK
2319 movaps IV, CTR
2320 PSHUFB_XMM BSWAP_MASK CTR
2321 mov $1, TCTR_LOW
32cbd7df
HY
2322 MOVQ_R64_XMM TCTR_LOW INC
2323 MOVQ_R64_XMM CTR TCTR_LOW
12387a46 2324 ret
8309b745 2325ENDPROC(_aesni_inc_init)
12387a46
HY
2326
2327/*
2328 * _aesni_inc: internal ABI
2329 * Increment IV by 1; IV is in big endian
2330 * input:
2331 * IV
2332 * CTR: == IV, in little endian
2333 * TCTR_LOW: == lower qword of CTR
2334 * INC: == 1, in little endian
2335 * BSWAP_MASK == endian swapping mask
2336 * output:
2337 * IV: increased by 1
2338 * changed:
2339 * CTR: == output IV, in little endian
2340 * TCTR_LOW: == lower qword of CTR
2341 */
0d258efb 2342.align 4
12387a46
HY
2343_aesni_inc:
2344 paddq INC, CTR
2345 add $1, TCTR_LOW
2346 jnc .Linc_low
2347 pslldq $8, INC
2348 paddq INC, CTR
2349 psrldq $8, INC
2350.Linc_low:
2351 movaps CTR, IV
2352 PSHUFB_XMM BSWAP_MASK IV
2353 ret
8309b745 2354ENDPROC(_aesni_inc)
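/*
 * Illustrative sketch (not part of the original file): the counter is
 * kept byte-reversed (little endian) so its low 64 bits can be bumped
 * with a single paddq; TCTR_LOW mirrors that qword in a GPR purely to
 * detect carry-out, in which case INC is shifted up to add 1 to the high
 * qword as well. The same logic in C:
 *
 *	static void ctr128_inc(u64 ctr[2])	// ctr[0]=low, ctr[1]=high
 *	{
 *		if (++ctr[0] == 0)		// carry out of low qword
 *			++ctr[1];
 *	}
 *
 * followed by a PSHUFB byte swap to recover the big-endian IV.
 */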
12387a46
HY
2355
2356/*
2357 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2358 * size_t len, u8 *iv)
2359 */
2360ENTRY(aesni_ctr_enc)
8691ccd7 2361 FRAME_BEGIN
12387a46
HY
2362 cmp $16, LEN
2363 jb .Lctr_enc_just_ret
2364 mov 480(KEYP), KLEN
2365 movups (IVP), IV
2366 call _aesni_inc_init
2367 cmp $64, LEN
2368 jb .Lctr_enc_loop1
2369.align 4
2370.Lctr_enc_loop4:
2371 movaps IV, STATE1
2372 call _aesni_inc
2373 movups (INP), IN1
2374 movaps IV, STATE2
2375 call _aesni_inc
2376 movups 0x10(INP), IN2
2377 movaps IV, STATE3
2378 call _aesni_inc
2379 movups 0x20(INP), IN3
2380 movaps IV, STATE4
2381 call _aesni_inc
2382 movups 0x30(INP), IN4
2383 call _aesni_enc4
2384 pxor IN1, STATE1
2385 movups STATE1, (OUTP)
2386 pxor IN2, STATE2
2387 movups STATE2, 0x10(OUTP)
2388 pxor IN3, STATE3
2389 movups STATE3, 0x20(OUTP)
2390 pxor IN4, STATE4
2391 movups STATE4, 0x30(OUTP)
2392 sub $64, LEN
2393 add $64, INP
2394 add $64, OUTP
2395 cmp $64, LEN
2396 jge .Lctr_enc_loop4
2397 cmp $16, LEN
2398 jb .Lctr_enc_ret
2399.align 4
2400.Lctr_enc_loop1:
2401 movaps IV, STATE
2402 call _aesni_inc
2403 movups (INP), IN
2404 call _aesni_enc1
2405 pxor IN, STATE
2406 movups STATE, (OUTP)
2407 sub $16, LEN
2408 add $16, INP
2409 add $16, OUTP
2410 cmp $16, LEN
2411 jge .Lctr_enc_loop1
2412.Lctr_enc_ret:
2413 movups IV, (IVP)
2414.Lctr_enc_just_ret:
8691ccd7 2415 FRAME_END
12387a46 2416 ret
8309b745 2417ENDPROC(aesni_ctr_enc)
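/*
 * Illustrative note (not part of the original file): CTR mode turns the
 * block cipher into a keystream generator,
 *
 *	C[i] = P[i] ^ E(K, IV + i)
 *
 * and successive counters are independent, so the loop above materializes
 * IV+0..IV+3 with _aesni_inc, encrypts all four with _aesni_enc4, and
 * only then XORs the input in. The plaintext itself never passes through
 * the cipher; only counters do.
 */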
c456a9cd
JK
2418
2419/*
2420 * _aesni_gf128mul_x_ble: internal ABI
2421 * Multiply in GF(2^128) for XTS IVs
2422 * input:
2423 * IV: current IV
2424 * GF128MUL_MASK == mask with 0x87 and 0x01
2425 * output:
2426 * IV: next IV
2427 * changed:
2428 * CTR: == temporary value
2429 */
2430#define _aesni_gf128mul_x_ble() \
2431 pshufd $0x13, IV, CTR; \
2432 paddq IV, IV; \
2433 psrad $31, CTR; \
2434 pand GF128MUL_MASK, CTR; \
2435 pxor CTR, IV;
2436
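/*
 * Illustrative sketch (not part of the original file): the macro above
 * doubles the XTS tweak in GF(2^128) using the little-endian ("ble")
 * convention, i.e. shift the 128-bit value left by one and XOR 0x87 into
 * the low byte when a bit falls off the top. A plain C equivalent on two
 * 64-bit halves:
 *
 *	static void gf128mul_x_ble_c(u64 t[2])	// t[0]=low, t[1]=high
 *	{
 *		u64 carry = t[1] >> 63;		// bit shifted off the top
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * The pshufd/psrad pair builds the same carry masks without leaving the
 * SSE domain.
 */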
2437/*
2438 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2439 * bool enc, u8 *iv)
2440 */
2441ENTRY(aesni_xts_crypt8)
8691ccd7 2442 FRAME_BEGIN
c456a9cd
JK
2443 cmpb $0, %cl
2444 movl $0, %ecx
2445 movl $240, %r10d
2446 leaq _aesni_enc4, %r11
2447 leaq _aesni_dec4, %rax
2448 cmovel %r10d, %ecx
2449 cmoveq %rax, %r11
2450
2451 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2452 movups (IVP), IV
2453
2454 mov 480(KEYP), KLEN
2455 addq %rcx, KEYP
2456
2457 movdqa IV, STATE1
fe6510b5
JK
2458 movdqu 0x00(INP), INC
2459 pxor INC, STATE1
c456a9cd
JK
2460 movdqu IV, 0x00(OUTP)
2461
2462 _aesni_gf128mul_x_ble()
2463 movdqa IV, STATE2
fe6510b5
JK
2464 movdqu 0x10(INP), INC
2465 pxor INC, STATE2
c456a9cd
JK
2466 movdqu IV, 0x10(OUTP)
2467
2468 _aesni_gf128mul_x_ble()
2469 movdqa IV, STATE3
fe6510b5
JK
2470 movdqu 0x20(INP), INC
2471 pxor INC, STATE3
c456a9cd
JK
2472 movdqu IV, 0x20(OUTP)
2473
2474 _aesni_gf128mul_x_ble()
2475 movdqa IV, STATE4
fe6510b5
JK
2476 movdqu 0x30(INP), INC
2477 pxor INC, STATE4
c456a9cd
JK
2478 movdqu IV, 0x30(OUTP)
2479
9697fa39 2480 CALL_NOSPEC %r11
c456a9cd 2481
fe6510b5
JK
2482 movdqu 0x00(OUTP), INC
2483 pxor INC, STATE1
c456a9cd
JK
2484 movdqu STATE1, 0x00(OUTP)
2485
2486 _aesni_gf128mul_x_ble()
2487 movdqa IV, STATE1
fe6510b5
JK
2488 movdqu 0x40(INP), INC
2489 pxor INC, STATE1
c456a9cd
JK
2490 movdqu IV, 0x40(OUTP)
2491
fe6510b5
JK
2492 movdqu 0x10(OUTP), INC
2493 pxor INC, STATE2
c456a9cd
JK
2494 movdqu STATE2, 0x10(OUTP)
2495
2496 _aesni_gf128mul_x_ble()
2497 movdqa IV, STATE2
fe6510b5
JK
2498 movdqu 0x50(INP), INC
2499 pxor INC, STATE2
c456a9cd
JK
2500 movdqu IV, 0x50(OUTP)
2501
fe6510b5
JK
2502 movdqu 0x20(OUTP), INC
2503 pxor INC, STATE3
c456a9cd
JK
2504 movdqu STATE3, 0x20(OUTP)
2505
2506 _aesni_gf128mul_x_ble()
2507 movdqa IV, STATE3
fe6510b5
JK
2508 movdqu 0x60(INP), INC
2509 pxor INC, STATE3
c456a9cd
JK
2510 movdqu IV, 0x60(OUTP)
2511
fe6510b5
JK
2512 movdqu 0x30(OUTP), INC
2513 pxor INC, STATE4
c456a9cd
JK
2514 movdqu STATE4, 0x30(OUTP)
2515
2516 _aesni_gf128mul_x_ble()
2517 movdqa IV, STATE4
fe6510b5
JK
2518 movdqu 0x70(INP), INC
2519 pxor INC, STATE4
c456a9cd
JK
2520 movdqu IV, 0x70(OUTP)
2521
2522 _aesni_gf128mul_x_ble()
2523 movups IV, (IVP)
2524
9697fa39 2525 CALL_NOSPEC %r11
c456a9cd 2526
fe6510b5
JK
2527 movdqu 0x40(OUTP), INC
2528 pxor INC, STATE1
c456a9cd
JK
2529 movdqu STATE1, 0x40(OUTP)
2530
fe6510b5
JK
2531 movdqu 0x50(OUTP), INC
2532 pxor INC, STATE2
c456a9cd
JK
2533 movdqu STATE2, 0x50(OUTP)
2534
fe6510b5
JK
2535 movdqu 0x60(OUTP), INC
2536 pxor INC, STATE3
c456a9cd
JK
2537 movdqu STATE3, 0x60(OUTP)
2538
fe6510b5
JK
2539 movdqu 0x70(OUTP), INC
2540 pxor INC, STATE4
c456a9cd
JK
2541 movdqu STATE4, 0x70(OUTP)
2542
8691ccd7 2543 FRAME_END
c456a9cd
JK
2544 ret
2545ENDPROC(aesni_xts_crypt8)
2546
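/*
 * Illustrative note (not part of the original file): each XTS block is
 * processed as
 *
 *	C[i] = E(K1, P[i] ^ T[i]) ^ T[i],	T[i+1] = T[i] * x
 *
 * The code above XORs a distinct tweak into each of eight consecutive
 * blocks, stashing each T[i] in its not-yet-written output slot to save
 * registers, runs the four-block primitive twice via CALL_NOSPEC, and
 * XORs the stashed tweaks back in from the output buffer.
 */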
0d258efb 2547#endif