1 /*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32 #include <linux/linkage.h>
33 #include <asm/inst.h>
34 #include <asm/frame.h>
35
36 /*
37  * The following macros are used to move an (un)aligned 16 byte value to/from
38  * an XMM register. This can be done for either FP or integer values; for FP
39  * use movaps (move aligned packed single) and for integer use movdqa (move
40  * double quad aligned). Since Nehalem (the original Core i7) there has been
41  * no performance difference between the two, but movaps is one byte shorter,
42  * so that is the one we use for now (same for the unaligned variants).
43 */
44 #define MOVADQ movaps
45 #define MOVUDQ movups
46
47 #ifdef __x86_64__
48
49 # constants in mergeable sections, linker can reorder and merge
50 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
51 .align 16
52 .Lgf128mul_x_ble_mask:
53 .octa 0x00000000000000010000000000000087
54 .section .rodata.cst16.POLY, "aM", @progbits, 16
55 .align 16
56 POLY: .octa 0xC2000000000000000000000000000001
57 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
58 .align 16
59 TWOONE: .octa 0x00000001000000000000000000000001
60
61 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
62 .align 16
63 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
64 .section .rodata.cst16.MASK1, "aM", @progbits, 16
65 .align 16
66 MASK1: .octa 0x0000000000000000ffffffffffffffff
67 .section .rodata.cst16.MASK2, "aM", @progbits, 16
68 .align 16
69 MASK2: .octa 0xffffffffffffffff0000000000000000
70 .section .rodata.cst16.ONE, "aM", @progbits, 16
71 .align 16
72 ONE: .octa 0x00000000000000000000000000000001
73 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
74 .align 16
75 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
76 .section .rodata.cst16.dec, "aM", @progbits, 16
77 .align 16
78 dec: .octa 0x1
79 .section .rodata.cst16.enc, "aM", @progbits, 16
80 .align 16
81 enc: .octa 0x2
82
83 # order of these constants should not change.
84 # more specifically, ALL_F should follow SHIFT_MASK,
85 # and zero should follow ALL_F
86 .section .rodata, "a", @progbits
87 .align 16
88 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
89 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
90 .octa 0x00000000000000000000000000000000
91
92 .section .rodata
93 .align 16
94 .type aad_shift_arr, @object
95 .size aad_shift_arr, 272
96 aad_shift_arr:
97 .octa 0xffffffffffffffffffffffffffffffff
98 .octa 0xffffffffffffffffffffffffffffff0C
99 .octa 0xffffffffffffffffffffffffffff0D0C
100 .octa 0xffffffffffffffffffffffffff0E0D0C
101 .octa 0xffffffffffffffffffffffff0F0E0D0C
102 .octa 0xffffffffffffffffffffff0C0B0A0908
103 .octa 0xffffffffffffffffffff0D0C0B0A0908
104 .octa 0xffffffffffffffffff0E0D0C0B0A0908
105 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
106 .octa 0xffffffffffffff0C0B0A090807060504
107 .octa 0xffffffffffff0D0C0B0A090807060504
108 .octa 0xffffffffff0E0D0C0B0A090807060504
109 .octa 0xffffffff0F0E0D0C0B0A090807060504
110 .octa 0xffffff0C0B0A09080706050403020100
111 .octa 0xffff0D0C0B0A09080706050403020100
112 .octa 0xff0E0D0C0B0A09080706050403020100
113 .octa 0x0F0E0D0C0B0A09080706050403020100
114
115
116 .text
117
118
119 #define STACK_OFFSET 8*3
120 #define HashKey 16*0 // store HashKey <<1 mod poly here
121 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
122 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
123 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
124 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
125 // bits of HashKey <<1 mod poly here
126 //(for Karatsuba purposes)
127 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
128 // bits of HashKey^2 <<1 mod poly here
129 // (for Karatsuba purposes)
130 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
131 // bits of HashKey^3 <<1 mod poly here
132 // (for Karatsuba purposes)
133 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
134 // bits of HashKey^4 <<1 mod poly here
135 // (for Karatsuba purposes)
136 #define VARIABLE_OFFSET 16*8
137
138 #define arg1 rdi
139 #define arg2 rsi
140 #define arg3 rdx
141 #define arg4 rcx
142 #define arg5 r8
143 #define arg6 r9
144 #define arg7 STACK_OFFSET+8(%r14)
145 #define arg8 STACK_OFFSET+16(%r14)
146 #define arg9 STACK_OFFSET+24(%r14)
147 #define arg10 STACK_OFFSET+32(%r14)
148 #define keysize 2*15*16(%arg1)
149 #endif
150
151
152 #define STATE1 %xmm0
153 #define STATE2 %xmm4
154 #define STATE3 %xmm5
155 #define STATE4 %xmm6
156 #define STATE STATE1
157 #define IN1 %xmm1
158 #define IN2 %xmm7
159 #define IN3 %xmm8
160 #define IN4 %xmm9
161 #define IN IN1
162 #define KEY %xmm2
163 #define IV %xmm3
164
165 #define BSWAP_MASK %xmm10
166 #define CTR %xmm11
167 #define INC %xmm12
168
169 #define GF128MUL_MASK %xmm10
170
171 #ifdef __x86_64__
172 #define AREG %rax
173 #define KEYP %rdi
174 #define OUTP %rsi
175 #define UKEYP OUTP
176 #define INP %rdx
177 #define LEN %rcx
178 #define IVP %r8
179 #define KLEN %r9d
180 #define T1 %r10
181 #define TKEYP T1
182 #define T2 %r11
183 #define TCTR_LOW T2
184 #else
185 #define AREG %eax
186 #define KEYP %edi
187 #define OUTP AREG
188 #define UKEYP OUTP
189 #define INP %edx
190 #define LEN %esi
191 #define IVP %ebp
192 #define KLEN %ebx
193 #define T1 %ecx
194 #define TKEYP T1
195 #endif
196
197
198 #ifdef __x86_64__
199 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
200 *
201 *
202 * Input: A and B (128-bits each, bit-reflected)
203 * Output: C = A*B*x mod poly, (i.e. >>1 )
204 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
205 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
206 *
207 */
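/*
 * Editor's sketch (illustrative only, not part of the build): a plain C view
 * of the three-multiply Karatsuba split that the PCLMULQDQ sequence in
 * GHASH_MUL performs before the reduction. clmul64() is a bitwise stand-in
 * for a single 64x64 carry-less multiply; the struct and function names are
 * assumptions, not kernel APIs.
 *
 *	typedef struct { unsigned long long lo, hi; } be128x;
 *
 *	// carry-less (GF(2)) 64x64 -> 128-bit multiply, bit by bit
 *	static be128x clmul64(unsigned long long a, unsigned long long b)
 *	{
 *		be128x r = { 0, 0 };
 *		int i;
 *
 *		for (i = 0; i < 64; i++)
 *			if ((b >> i) & 1) {
 *				r.lo ^= a << i;
 *				if (i)
 *					r.hi ^= a >> (64 - i);
 *			}
 *		return r;
 *	}
 *
 *	// A*B as a 256-bit hi:lo pair using three multiplies, as GHASH_MUL does
 *	static void karatsuba128(be128x a, be128x b, be128x *lo, be128x *hi)
 *	{
 *		be128x hh  = clmul64(a.hi, b.hi);               // a1*b1
 *		be128x ll  = clmul64(a.lo, b.lo);               // a0*b0
 *		be128x mid = clmul64(a.hi ^ a.lo, b.hi ^ b.lo); // (a1+a0)*(b1+b0)
 *
 *		mid.lo ^= hh.lo ^ ll.lo;	// middle term = a1*b0 + a0*b1
 *		mid.hi ^= hh.hi ^ ll.hi;
 *
 *		lo->lo = ll.lo;			// fold the middle term in at bit 64
 *		lo->hi = ll.hi ^ mid.lo;
 *		hi->lo = hh.lo ^ mid.hi;
 *		hi->hi = hh.hi;
 *	}
 */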
208 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
209 movdqa \GH, \TMP1
210 pshufd $78, \GH, \TMP2
211 pshufd $78, \HK, \TMP3
212 pxor \GH, \TMP2 # TMP2 = a1+a0
213 pxor \HK, \TMP3 # TMP3 = b1+b0
214 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
215 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
216 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
217 pxor \GH, \TMP2
218 pxor \TMP1, \TMP2 # TMP2 = (a0*b1)+(a1*b0)
219 movdqa \TMP2, \TMP3
220 pslldq $8, \TMP3 # left shift TMP3 2 DWs
221 psrldq $8, \TMP2 # right shift TMP2 2 DWs
222 pxor \TMP3, \GH
223 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
224
225 # first phase of the reduction
226
227 movdqa \GH, \TMP2
228 movdqa \GH, \TMP3
229 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
230 # in order to perform
231 # three independent shifts
232 pslld $31, \TMP2 # packed right shift <<31
233 pslld $30, \TMP3 # packed right shift <<30
234 pslld $25, \TMP4 # packed right shift <<25
235 pxor \TMP3, \TMP2 # xor the shifted versions
236 pxor \TMP4, \TMP2
237 movdqa \TMP2, \TMP5
238 psrldq $4, \TMP5 # right shift TMP5 1 DW
239 pslldq $12, \TMP2 # left shift TMP2 3 DWs
240 pxor \TMP2, \GH
241
242 # second phase of the reduction
243
244 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
245 # in order to perform
246 # three independent shifts
247 movdqa \GH,\TMP3
248 movdqa \GH,\TMP4
249 psrld $1,\TMP2 # packed left shift >>1
250 psrld $2,\TMP3 # packed left shift >>2
251 psrld $7,\TMP4 # packed left shift >>7
252 pxor \TMP3,\TMP2 # xor the shifted versions
253 pxor \TMP4,\TMP2
254 pxor \TMP5, \TMP2
255 pxor \TMP2, \GH
256 pxor \TMP1, \GH # result is in GH
257 .endm
258
259 /*
260 * if a = number of total plaintext bytes
261 * b = floor(a/16)
262 * num_initial_blocks = b mod 4
263 * encrypt the initial num_initial_blocks blocks and apply ghash on
264 * the ciphertext
265 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
266 * are clobbered
267 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
268 */
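/*
 * Editor's sketch (illustrative, not kernel code): how the callers pick
 * num_initial_blocks from the plaintext/ciphertext length; this mirrors the
 * "and $-16, %r13" / "and $(3<<4), %r12" dispatch in aesni_gcm_enc/dec below.
 * The function name is an assumption.
 *
 *	static unsigned int num_initial_blocks(unsigned long long len)
 *	{
 *		unsigned long long whole = len & ~15ULL;   // and $-16, %r13
 *
 *		return (unsigned int)((whole >> 4) & 3);   // b mod 4
 *	}
 *
 * e.g. len = 100 bytes -> 6 whole blocks -> 2 initial blocks here, 4 blocks
 * in the 4-wide main loop, and a 4-byte partial tail handled separately.
 */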
269
270
271 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
272 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
273 MOVADQ SHUF_MASK(%rip), %xmm14
274 mov arg7, %r10 # %r10 = AAD
275 mov arg8, %r12 # %r12 = aadLen
276 mov %r12, %r11
277 pxor %xmm\i, %xmm\i
278 pxor \XMM2, \XMM2
279
280 cmp $16, %r11
281 jl _get_AAD_rest8\num_initial_blocks\operation
282 _get_AAD_blocks\num_initial_blocks\operation:
283 movdqu (%r10), %xmm\i
284 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
285 pxor %xmm\i, \XMM2
286 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
287 add $16, %r10
288 sub $16, %r12
289 sub $16, %r11
290 cmp $16, %r11
291 jge _get_AAD_blocks\num_initial_blocks\operation
292
293 movdqu \XMM2, %xmm\i
294 cmp $0, %r11
295 je _get_AAD_done\num_initial_blocks\operation
296
297 pxor %xmm\i,%xmm\i
298
299 /* read the last <16B of AAD. since we have at least 4B of
300 data right after the AAD (the ICV, and maybe some CT), we can
301 read 4B/8B blocks safely, and then get rid of the extra stuff */
302 _get_AAD_rest8\num_initial_blocks\operation:
303 cmp $4, %r11
304 jle _get_AAD_rest4\num_initial_blocks\operation
305 movq (%r10), \TMP1
306 add $8, %r10
307 sub $8, %r11
308 pslldq $8, \TMP1
309 psrldq $8, %xmm\i
310 pxor \TMP1, %xmm\i
311 jmp _get_AAD_rest8\num_initial_blocks\operation
312 _get_AAD_rest4\num_initial_blocks\operation:
313 cmp $0, %r11
314 jle _get_AAD_rest0\num_initial_blocks\operation
315 mov (%r10), %eax
316 movq %rax, \TMP1
317 add $4, %r10
318 sub $4, %r11
319 pslldq $12, \TMP1
320 psrldq $4, %xmm\i
321 pxor \TMP1, %xmm\i
322 _get_AAD_rest0\num_initial_blocks\operation:
323 /* finalize: shift out the extra bytes we read, and align
324 left. since pslldq can only shift by an immediate, we use
325 vpshufb and an array of shuffle masks */
326 movq %r12, %r11
327 salq $4, %r11
328 movdqu aad_shift_arr(%r11), \TMP1
329 PSHUFB_XMM \TMP1, %xmm\i
330 _get_AAD_rest_final\num_initial_blocks\operation:
331 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
332 pxor \XMM2, %xmm\i
333 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
334
335 _get_AAD_done\num_initial_blocks\operation:
336 xor %r11, %r11 # initialise the data pointer offset as zero
337 # start AES for num_initial_blocks blocks
338
339 mov %arg5, %rax # %rax = *Y0
340 movdqu (%rax), \XMM0 # XMM0 = Y0
341 PSHUFB_XMM %xmm14, \XMM0
342
343 .if (\i == 5) || (\i == 6) || (\i == 7)
344 MOVADQ ONE(%RIP),\TMP1
345 MOVADQ (%arg1),\TMP2
346 .irpc index, \i_seq
347 paddd \TMP1, \XMM0 # INCR Y0
348 movdqa \XMM0, %xmm\index
349 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
350 pxor \TMP2, %xmm\index
351 .endr
352 lea 0x10(%arg1),%r10
353 mov keysize,%eax
354 shr $2,%eax # 128->4, 192->6, 256->8
355 add $5,%eax # 128->9, 192->11, 256->13
356
357 aes_loop_initial_dec\num_initial_blocks:
358 MOVADQ (%r10),\TMP1
359 .irpc index, \i_seq
360 AESENC \TMP1, %xmm\index
361 .endr
362 add $16,%r10
363 sub $1,%eax
364 jnz aes_loop_initial_dec\num_initial_blocks
365
366 MOVADQ (%r10), \TMP1
367 .irpc index, \i_seq
368 AESENCLAST \TMP1, %xmm\index # Last Round
369 .endr
370 .irpc index, \i_seq
371 movdqu (%arg3 , %r11, 1), \TMP1
372 pxor \TMP1, %xmm\index
373 movdqu %xmm\index, (%arg2 , %r11, 1)
374 # write back plaintext/ciphertext for num_initial_blocks
375 add $16, %r11
376
377 movdqa \TMP1, %xmm\index
378 PSHUFB_XMM %xmm14, %xmm\index
379 # prepare plaintext/ciphertext for GHASH computation
380 .endr
381 .endif
382
383 # apply GHASH on num_initial_blocks blocks
384
385 .if \i == 5
386 pxor %xmm5, %xmm6
387 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
388 pxor %xmm6, %xmm7
389 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
390 pxor %xmm7, %xmm8
391 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
392 .elseif \i == 6
393 pxor %xmm6, %xmm7
394 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
395 pxor %xmm7, %xmm8
396 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
397 .elseif \i == 7
398 pxor %xmm7, %xmm8
399 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
400 .endif
401 cmp $64, %r13
402 jl _initial_blocks_done\num_initial_blocks\operation
403 # no need for precomputed values
404 /*
405 *
406 * Precomputations for HashKey parallel with encryption of first 4 blocks.
407 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
408 */
409 MOVADQ ONE(%rip), \TMP1
410 paddd \TMP1, \XMM0 # INCR Y0
411 MOVADQ \XMM0, \XMM1
412 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
413
414 paddd \TMP1, \XMM0 # INCR Y0
415 MOVADQ \XMM0, \XMM2
416 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
417
418 paddd \TMP1, \XMM0 # INCR Y0
419 MOVADQ \XMM0, \XMM3
420 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
421
422 paddd \TMP1, \XMM0 # INCR Y0
423 MOVADQ \XMM0, \XMM4
424 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
425
426 MOVADQ 0(%arg1),\TMP1
427 pxor \TMP1, \XMM1
428 pxor \TMP1, \XMM2
429 pxor \TMP1, \XMM3
430 pxor \TMP1, \XMM4
431 movdqa \TMP3, \TMP5
432 pshufd $78, \TMP3, \TMP1
433 pxor \TMP3, \TMP1
434 movdqa \TMP1, HashKey_k(%rsp)
435 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
436 # TMP5 = HashKey^2<<1 (mod poly)
437 movdqa \TMP5, HashKey_2(%rsp)
438 # HashKey_2 = HashKey^2<<1 (mod poly)
439 pshufd $78, \TMP5, \TMP1
440 pxor \TMP5, \TMP1
441 movdqa \TMP1, HashKey_2_k(%rsp)
442 .irpc index, 1234 # do 4 rounds
443 movaps 0x10*\index(%arg1), \TMP1
444 AESENC \TMP1, \XMM1
445 AESENC \TMP1, \XMM2
446 AESENC \TMP1, \XMM3
447 AESENC \TMP1, \XMM4
448 .endr
449 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
450 # TMP5 = HashKey^3<<1 (mod poly)
451 movdqa \TMP5, HashKey_3(%rsp)
452 pshufd $78, \TMP5, \TMP1
453 pxor \TMP5, \TMP1
454 movdqa \TMP1, HashKey_3_k(%rsp)
455 .irpc index, 56789 # do next 5 rounds
456 movaps 0x10*\index(%arg1), \TMP1
457 AESENC \TMP1, \XMM1
458 AESENC \TMP1, \XMM2
459 AESENC \TMP1, \XMM3
460 AESENC \TMP1, \XMM4
461 .endr
462 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
463 # TMP5 = HashKey^4<<1 (mod poly)
464 movdqa \TMP5, HashKey_4(%rsp)
465 pshufd $78, \TMP5, \TMP1
466 pxor \TMP5, \TMP1
467 movdqa \TMP1, HashKey_4_k(%rsp)
468 lea 0xa0(%arg1),%r10
469 mov keysize,%eax
470 shr $2,%eax # 128->4, 192->6, 256->8
471 sub $4,%eax # 128->0, 192->2, 256->4
472 jz aes_loop_pre_dec_done\num_initial_blocks
473
474 aes_loop_pre_dec\num_initial_blocks:
475 MOVADQ (%r10),\TMP2
476 .irpc index, 1234
477 AESENC \TMP2, %xmm\index
478 .endr
479 add $16,%r10
480 sub $1,%eax
481 jnz aes_loop_pre_dec\num_initial_blocks
482
483 aes_loop_pre_dec_done\num_initial_blocks:
484 MOVADQ (%r10), \TMP2
485 AESENCLAST \TMP2, \XMM1
486 AESENCLAST \TMP2, \XMM2
487 AESENCLAST \TMP2, \XMM3
488 AESENCLAST \TMP2, \XMM4
489 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
490 pxor \TMP1, \XMM1
491 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
492 movdqa \TMP1, \XMM1
493 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
494 pxor \TMP1, \XMM2
495 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
496 movdqa \TMP1, \XMM2
497 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
498 pxor \TMP1, \XMM3
499 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
500 movdqa \TMP1, \XMM3
501 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
502 pxor \TMP1, \XMM4
503 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
504 movdqa \TMP1, \XMM4
505 add $64, %r11
506 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
507 pxor \XMMDst, \XMM1
508 # combine GHASHed value with the corresponding ciphertext
509 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
510 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
511 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
512
513 _initial_blocks_done\num_initial_blocks\operation:
514
515 .endm
516
517
518 /*
519 * if a = number of total plaintext bytes
520 * b = floor(a/16)
521 * num_initial_blocks = b mod 4
522 * encrypt the initial num_initial_blocks blocks and apply ghash on
523 * the ciphertext
524 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
525 * are clobbered
526 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
527 */
528
529
530 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
531 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
532 MOVADQ SHUF_MASK(%rip), %xmm14
533 mov arg7, %r10 # %r10 = AAD
534 mov arg8, %r12 # %r12 = aadLen
535 mov %r12, %r11
536 pxor %xmm\i, %xmm\i
537 pxor \XMM2, \XMM2
538
539 cmp $16, %r11
540 jl _get_AAD_rest8\num_initial_blocks\operation
541 _get_AAD_blocks\num_initial_blocks\operation:
542 movdqu (%r10), %xmm\i
543 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
544 pxor %xmm\i, \XMM2
545 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 add $16, %r10
547 sub $16, %r12
548 sub $16, %r11
549 cmp $16, %r11
550 jge _get_AAD_blocks\num_initial_blocks\operation
551
552 movdqu \XMM2, %xmm\i
553 cmp $0, %r11
554 je _get_AAD_done\num_initial_blocks\operation
555
556 pxor %xmm\i,%xmm\i
557
558 /* read the last <16B of AAD. since we have at least 4B of
559 data right after the AAD (the ICV, and maybe some PT), we can
560 read 4B/8B blocks safely, and then get rid of the extra stuff */
561 _get_AAD_rest8\num_initial_blocks\operation:
562 cmp $4, %r11
563 jle _get_AAD_rest4\num_initial_blocks\operation
564 movq (%r10), \TMP1
565 add $8, %r10
566 sub $8, %r11
567 pslldq $8, \TMP1
568 psrldq $8, %xmm\i
569 pxor \TMP1, %xmm\i
570 jmp _get_AAD_rest8\num_initial_blocks\operation
571 _get_AAD_rest4\num_initial_blocks\operation:
572 cmp $0, %r11
573 jle _get_AAD_rest0\num_initial_blocks\operation
574 mov (%r10), %eax
575 movq %rax, \TMP1
576 add $4, %r10
577 sub $4, %r11
578 pslldq $12, \TMP1
579 psrldq $4, %xmm\i
580 pxor \TMP1, %xmm\i
581 _get_AAD_rest0\num_initial_blocks\operation:
582 /* finalize: shift out the extra bytes we read, and align
583 left. since pslldq can only shift by an immediate, we use
584 vpshufb and an array of shuffle masks */
585 movq %r12, %r11
586 salq $4, %r11
587 movdqu aad_shift_arr(%r11), \TMP1
588 PSHUFB_XMM \TMP1, %xmm\i
589 _get_AAD_rest_final\num_initial_blocks\operation:
590 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
591 pxor \XMM2, %xmm\i
592 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
593
594 _get_AAD_done\num_initial_blocks\operation:
595 xor %r11, %r11 # initialise the data pointer offset as zero
596 # start AES for num_initial_blocks blocks
597
598 mov %arg5, %rax # %rax = *Y0
599 movdqu (%rax), \XMM0 # XMM0 = Y0
600 PSHUFB_XMM %xmm14, \XMM0
601
602 .if (\i == 5) || (\i == 6) || (\i == 7)
603
604 MOVADQ ONE(%RIP),\TMP1
605 MOVADQ 0(%arg1),\TMP2
606 .irpc index, \i_seq
607 paddd \TMP1, \XMM0 # INCR Y0
608 MOVADQ \XMM0, %xmm\index
609 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
610 pxor \TMP2, %xmm\index
611 .endr
612 lea 0x10(%arg1),%r10
613 mov keysize,%eax
614 shr $2,%eax # 128->4, 192->6, 256->8
615 add $5,%eax # 128->9, 192->11, 256->13
616
617 aes_loop_initial_enc\num_initial_blocks:
618 MOVADQ (%r10),\TMP1
619 .irpc index, \i_seq
620 AESENC \TMP1, %xmm\index
621 .endr
622 add $16,%r10
623 sub $1,%eax
624 jnz aes_loop_initial_enc\num_initial_blocks
625
626 MOVADQ (%r10), \TMP1
627 .irpc index, \i_seq
628 AESENCLAST \TMP1, %xmm\index # Last Round
629 .endr
630 .irpc index, \i_seq
631 movdqu (%arg3 , %r11, 1), \TMP1
632 pxor \TMP1, %xmm\index
633 movdqu %xmm\index, (%arg2 , %r11, 1)
634 # write back plaintext/ciphertext for num_initial_blocks
635 add $16, %r11
636 PSHUFB_XMM %xmm14, %xmm\index
637
638 # prepare plaintext/ciphertext for GHASH computation
639 .endr
640 .endif
641
642 # apply GHASH on num_initial_blocks blocks
643
644 .if \i == 5
645 pxor %xmm5, %xmm6
646 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
647 pxor %xmm6, %xmm7
648 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
649 pxor %xmm7, %xmm8
650 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
651 .elseif \i == 6
652 pxor %xmm6, %xmm7
653 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
654 pxor %xmm7, %xmm8
655 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
656 .elseif \i == 7
657 pxor %xmm7, %xmm8
658 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
659 .endif
660 cmp $64, %r13
661 jl _initial_blocks_done\num_initial_blocks\operation
662 # no need for precomputed values
663 /*
664 *
665 * Precomputations for HashKey parallel with encryption of first 4 blocks.
666 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
667 */
668 MOVADQ ONE(%RIP),\TMP1
669 paddd \TMP1, \XMM0 # INCR Y0
670 MOVADQ \XMM0, \XMM1
671 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
672
673 paddd \TMP1, \XMM0 # INCR Y0
674 MOVADQ \XMM0, \XMM2
675 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
676
677 paddd \TMP1, \XMM0 # INCR Y0
678 MOVADQ \XMM0, \XMM3
679 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
680
681 paddd \TMP1, \XMM0 # INCR Y0
682 MOVADQ \XMM0, \XMM4
683 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
684
685 MOVADQ 0(%arg1),\TMP1
686 pxor \TMP1, \XMM1
687 pxor \TMP1, \XMM2
688 pxor \TMP1, \XMM3
689 pxor \TMP1, \XMM4
690 movdqa \TMP3, \TMP5
691 pshufd $78, \TMP3, \TMP1
692 pxor \TMP3, \TMP1
693 movdqa \TMP1, HashKey_k(%rsp)
694 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
695 # TMP5 = HashKey^2<<1 (mod poly)
696 movdqa \TMP5, HashKey_2(%rsp)
697 # HashKey_2 = HashKey^2<<1 (mod poly)
698 pshufd $78, \TMP5, \TMP1
699 pxor \TMP5, \TMP1
700 movdqa \TMP1, HashKey_2_k(%rsp)
701 .irpc index, 1234 # do 4 rounds
702 movaps 0x10*\index(%arg1), \TMP1
703 AESENC \TMP1, \XMM1
704 AESENC \TMP1, \XMM2
705 AESENC \TMP1, \XMM3
706 AESENC \TMP1, \XMM4
707 .endr
708 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
709 # TMP5 = HashKey^3<<1 (mod poly)
710 movdqa \TMP5, HashKey_3(%rsp)
711 pshufd $78, \TMP5, \TMP1
712 pxor \TMP5, \TMP1
713 movdqa \TMP1, HashKey_3_k(%rsp)
714 .irpc index, 56789 # do next 5 rounds
715 movaps 0x10*\index(%arg1), \TMP1
716 AESENC \TMP1, \XMM1
717 AESENC \TMP1, \XMM2
718 AESENC \TMP1, \XMM3
719 AESENC \TMP1, \XMM4
720 .endr
721 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
722 # TMP5 = HashKey^4<<1 (mod poly)
723 movdqa \TMP5, HashKey_4(%rsp)
724 pshufd $78, \TMP5, \TMP1
725 pxor \TMP5, \TMP1
726 movdqa \TMP1, HashKey_4_k(%rsp)
727 lea 0xa0(%arg1),%r10
728 mov keysize,%eax
729 shr $2,%eax # 128->4, 192->6, 256->8
730 sub $4,%eax # 128->0, 192->2, 256->4
731 jz aes_loop_pre_enc_done\num_initial_blocks
732
733 aes_loop_pre_enc\num_initial_blocks:
734 MOVADQ (%r10),\TMP2
735 .irpc index, 1234
736 AESENC \TMP2, %xmm\index
737 .endr
738 add $16,%r10
739 sub $1,%eax
740 jnz aes_loop_pre_enc\num_initial_blocks
741
742 aes_loop_pre_enc_done\num_initial_blocks:
743 MOVADQ (%r10), \TMP2
744 AESENCLAST \TMP2, \XMM1
745 AESENCLAST \TMP2, \XMM2
746 AESENCLAST \TMP2, \XMM3
747 AESENCLAST \TMP2, \XMM4
748 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
749 pxor \TMP1, \XMM1
750 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
751 pxor \TMP1, \XMM2
752 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
753 pxor \TMP1, \XMM3
754 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
755 pxor \TMP1, \XMM4
756 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
757 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
758 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
759 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
760
761 add $64, %r11
762 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
763 pxor \XMMDst, \XMM1
764 # combine GHASHed value with the corresponding ciphertext
765 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
766 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
767 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
768
769 _initial_blocks_done\num_initial_blocks\operation:
770
771 .endm
772
773 /*
774 * encrypt 4 blocks at a time
775 * ghash the 4 previously encrypted ciphertext blocks
776 * arg1, %arg2, %arg3 are used as pointers only, not modified
777 * %r11 is the data offset value
778 */
779 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
780 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
781
782 movdqa \XMM1, \XMM5
783 movdqa \XMM2, \XMM6
784 movdqa \XMM3, \XMM7
785 movdqa \XMM4, \XMM8
786
787 movdqa SHUF_MASK(%rip), %xmm15
788 # multiply TMP5 * HashKey using karatsuba
789
790 movdqa \XMM5, \TMP4
791 pshufd $78, \XMM5, \TMP6
792 pxor \XMM5, \TMP6
793 paddd ONE(%rip), \XMM0 # INCR CNT
794 movdqa HashKey_4(%rsp), \TMP5
795 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
796 movdqa \XMM0, \XMM1
797 paddd ONE(%rip), \XMM0 # INCR CNT
798 movdqa \XMM0, \XMM2
799 paddd ONE(%rip), \XMM0 # INCR CNT
800 movdqa \XMM0, \XMM3
801 paddd ONE(%rip), \XMM0 # INCR CNT
802 movdqa \XMM0, \XMM4
803 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
804 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
805 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
806 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
807 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
808
809 pxor (%arg1), \XMM1
810 pxor (%arg1), \XMM2
811 pxor (%arg1), \XMM3
812 pxor (%arg1), \XMM4
813 movdqa HashKey_4_k(%rsp), \TMP5
814 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
815 movaps 0x10(%arg1), \TMP1
816 AESENC \TMP1, \XMM1 # Round 1
817 AESENC \TMP1, \XMM2
818 AESENC \TMP1, \XMM3
819 AESENC \TMP1, \XMM4
820 movaps 0x20(%arg1), \TMP1
821 AESENC \TMP1, \XMM1 # Round 2
822 AESENC \TMP1, \XMM2
823 AESENC \TMP1, \XMM3
824 AESENC \TMP1, \XMM4
825 movdqa \XMM6, \TMP1
826 pshufd $78, \XMM6, \TMP2
827 pxor \XMM6, \TMP2
828 movdqa HashKey_3(%rsp), \TMP5
829 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
830 movaps 0x30(%arg1), \TMP3
831 AESENC \TMP3, \XMM1 # Round 3
832 AESENC \TMP3, \XMM2
833 AESENC \TMP3, \XMM3
834 AESENC \TMP3, \XMM4
835 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
836 movaps 0x40(%arg1), \TMP3
837 AESENC \TMP3, \XMM1 # Round 4
838 AESENC \TMP3, \XMM2
839 AESENC \TMP3, \XMM3
840 AESENC \TMP3, \XMM4
841 movdqa HashKey_3_k(%rsp), \TMP5
842 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
843 movaps 0x50(%arg1), \TMP3
844 AESENC \TMP3, \XMM1 # Round 5
845 AESENC \TMP3, \XMM2
846 AESENC \TMP3, \XMM3
847 AESENC \TMP3, \XMM4
848 pxor \TMP1, \TMP4
849 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
850 pxor \XMM6, \XMM5
851 pxor \TMP2, \TMP6
852 movdqa \XMM7, \TMP1
853 pshufd $78, \XMM7, \TMP2
854 pxor \XMM7, \TMP2
855 movdqa HashKey_2(%rsp ), \TMP5
856
857 # Multiply TMP5 * HashKey using karatsuba
858
859 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
860 movaps 0x60(%arg1), \TMP3
861 AESENC \TMP3, \XMM1 # Round 6
862 AESENC \TMP3, \XMM2
863 AESENC \TMP3, \XMM3
864 AESENC \TMP3, \XMM4
865 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
866 movaps 0x70(%arg1), \TMP3
867 AESENC \TMP3, \XMM1 # Round 7
868 AESENC \TMP3, \XMM2
869 AESENC \TMP3, \XMM3
870 AESENC \TMP3, \XMM4
871 movdqa HashKey_2_k(%rsp), \TMP5
872 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
873 movaps 0x80(%arg1), \TMP3
874 AESENC \TMP3, \XMM1 # Round 8
875 AESENC \TMP3, \XMM2
876 AESENC \TMP3, \XMM3
877 AESENC \TMP3, \XMM4
878 pxor \TMP1, \TMP4
879 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
880 pxor \XMM7, \XMM5
881 pxor \TMP2, \TMP6
882
883 # Multiply XMM8 * HashKey
884 # XMM8 and TMP5 hold the values for the two operands
885
886 movdqa \XMM8, \TMP1
887 pshufd $78, \XMM8, \TMP2
888 pxor \XMM8, \TMP2
889 movdqa HashKey(%rsp), \TMP5
890 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
891 movaps 0x90(%arg1), \TMP3
892 AESENC \TMP3, \XMM1 # Round 9
893 AESENC \TMP3, \XMM2
894 AESENC \TMP3, \XMM3
895 AESENC \TMP3, \XMM4
896 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
897 lea 0xa0(%arg1),%r10
898 mov keysize,%eax
899 shr $2,%eax # 128->4, 192->6, 256->8
900 sub $4,%eax # 128->0, 192->2, 256->4
901 jz aes_loop_par_enc_done
902
903 aes_loop_par_enc:
904 MOVADQ (%r10),\TMP3
905 .irpc index, 1234
906 AESENC \TMP3, %xmm\index
907 .endr
908 add $16,%r10
909 sub $1,%eax
910 jnz aes_loop_par_enc
911
912 aes_loop_par_enc_done:
913 MOVADQ (%r10), \TMP3
914 AESENCLAST \TMP3, \XMM1 # Round 10
915 AESENCLAST \TMP3, \XMM2
916 AESENCLAST \TMP3, \XMM3
917 AESENCLAST \TMP3, \XMM4
918 movdqa HashKey_k(%rsp), \TMP5
919 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
920 movdqu (%arg3,%r11,1), \TMP3
921 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
922 movdqu 16(%arg3,%r11,1), \TMP3
923 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
924 movdqu 32(%arg3,%r11,1), \TMP3
925 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
926 movdqu 48(%arg3,%r11,1), \TMP3
927 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
928 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
929 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
930 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
931 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
932 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
933 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
934 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
935 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
936
937 pxor \TMP4, \TMP1
938 pxor \XMM8, \XMM5
939 pxor \TMP6, \TMP2
940 pxor \TMP1, \TMP2
941 pxor \XMM5, \TMP2
942 movdqa \TMP2, \TMP3
943 pslldq $8, \TMP3 # left shift TMP3 2 DWs
944 psrldq $8, \TMP2 # right shift TMP2 2 DWs
945 pxor \TMP3, \XMM5
946 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
947
948 # first phase of reduction
949
950 movdqa \XMM5, \TMP2
951 movdqa \XMM5, \TMP3
952 movdqa \XMM5, \TMP4
953 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
954 pslld $31, \TMP2 # packed right shift << 31
955 pslld $30, \TMP3 # packed right shift << 30
956 pslld $25, \TMP4 # packed right shift << 25
957 pxor \TMP3, \TMP2 # xor the shifted versions
958 pxor \TMP4, \TMP2
959 movdqa \TMP2, \TMP5
960 psrldq $4, \TMP5 # right shift T5 1 DW
961 pslldq $12, \TMP2 # left shift T2 3 DWs
962 pxor \TMP2, \XMM5
963
964 # second phase of reduction
965
966 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
967 movdqa \XMM5,\TMP3
968 movdqa \XMM5,\TMP4
969 psrld $1, \TMP2 # packed left shift >>1
970 psrld $2, \TMP3 # packed left shift >>2
971 psrld $7, \TMP4 # packed left shift >>7
972 pxor \TMP3,\TMP2 # xor the shifted versions
973 pxor \TMP4,\TMP2
974 pxor \TMP5, \TMP2
975 pxor \TMP2, \XMM5
976 pxor \TMP1, \XMM5 # result is in XMM5
977
978 pxor \XMM5, \XMM1
979 .endm
980
981 /*
982 * decrypt 4 blocks at a time
983 * ghash the 4 previously decrypted ciphertext blocks
984 * arg1, %arg2, %arg3 are used as pointers only, not modified
985 * %r11 is the data offset value
986 */
987 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
988 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
989
990 movdqa \XMM1, \XMM5
991 movdqa \XMM2, \XMM6
992 movdqa \XMM3, \XMM7
993 movdqa \XMM4, \XMM8
994
995 movdqa SHUF_MASK(%rip), %xmm15
996 # multiply TMP5 * HashKey using karatsuba
997
998 movdqa \XMM5, \TMP4
999 pshufd $78, \XMM5, \TMP6
1000 pxor \XMM5, \TMP6
1001 paddd ONE(%rip), \XMM0 # INCR CNT
1002 movdqa HashKey_4(%rsp), \TMP5
1003 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1004 movdqa \XMM0, \XMM1
1005 paddd ONE(%rip), \XMM0 # INCR CNT
1006 movdqa \XMM0, \XMM2
1007 paddd ONE(%rip), \XMM0 # INCR CNT
1008 movdqa \XMM0, \XMM3
1009 paddd ONE(%rip), \XMM0 # INCR CNT
1010 movdqa \XMM0, \XMM4
1011 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1012 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1013 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1014 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1015 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1016
1017 pxor (%arg1), \XMM1
1018 pxor (%arg1), \XMM2
1019 pxor (%arg1), \XMM3
1020 pxor (%arg1), \XMM4
1021 movdqa HashKey_4_k(%rsp), \TMP5
1022 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1023 movaps 0x10(%arg1), \TMP1
1024 AESENC \TMP1, \XMM1 # Round 1
1025 AESENC \TMP1, \XMM2
1026 AESENC \TMP1, \XMM3
1027 AESENC \TMP1, \XMM4
1028 movaps 0x20(%arg1), \TMP1
1029 AESENC \TMP1, \XMM1 # Round 2
1030 AESENC \TMP1, \XMM2
1031 AESENC \TMP1, \XMM3
1032 AESENC \TMP1, \XMM4
1033 movdqa \XMM6, \TMP1
1034 pshufd $78, \XMM6, \TMP2
1035 pxor \XMM6, \TMP2
1036 movdqa HashKey_3(%rsp), \TMP5
1037 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1038 movaps 0x30(%arg1), \TMP3
1039 AESENC \TMP3, \XMM1 # Round 3
1040 AESENC \TMP3, \XMM2
1041 AESENC \TMP3, \XMM3
1042 AESENC \TMP3, \XMM4
1043 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1044 movaps 0x40(%arg1), \TMP3
1045 AESENC \TMP3, \XMM1 # Round 4
1046 AESENC \TMP3, \XMM2
1047 AESENC \TMP3, \XMM3
1048 AESENC \TMP3, \XMM4
1049 movdqa HashKey_3_k(%rsp), \TMP5
1050 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1051 movaps 0x50(%arg1), \TMP3
1052 AESENC \TMP3, \XMM1 # Round 5
1053 AESENC \TMP3, \XMM2
1054 AESENC \TMP3, \XMM3
1055 AESENC \TMP3, \XMM4
1056 pxor \TMP1, \TMP4
1057 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1058 pxor \XMM6, \XMM5
1059 pxor \TMP2, \TMP6
1060 movdqa \XMM7, \TMP1
1061 pshufd $78, \XMM7, \TMP2
1062 pxor \XMM7, \TMP2
1063 movdqa HashKey_2(%rsp ), \TMP5
1064
1065 # Multiply TMP5 * HashKey using karatsuba
1066
1067 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1068 movaps 0x60(%arg1), \TMP3
1069 AESENC \TMP3, \XMM1 # Round 6
1070 AESENC \TMP3, \XMM2
1071 AESENC \TMP3, \XMM3
1072 AESENC \TMP3, \XMM4
1073 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1074 movaps 0x70(%arg1), \TMP3
1075 AESENC \TMP3, \XMM1 # Round 7
1076 AESENC \TMP3, \XMM2
1077 AESENC \TMP3, \XMM3
1078 AESENC \TMP3, \XMM4
1079 movdqa HashKey_2_k(%rsp), \TMP5
1080 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1081 movaps 0x80(%arg1), \TMP3
1082 AESENC \TMP3, \XMM1 # Round 8
1083 AESENC \TMP3, \XMM2
1084 AESENC \TMP3, \XMM3
1085 AESENC \TMP3, \XMM4
1086 pxor \TMP1, \TMP4
1087 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1088 pxor \XMM7, \XMM5
1089 pxor \TMP2, \TMP6
1090
1091 # Multiply XMM8 * HashKey
1092 # XMM8 and TMP5 hold the values for the two operands
1093
1094 movdqa \XMM8, \TMP1
1095 pshufd $78, \XMM8, \TMP2
1096 pxor \XMM8, \TMP2
1097 movdqa HashKey(%rsp), \TMP5
1098 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1099 movaps 0x90(%arg1), \TMP3
1100 AESENC \TMP3, \XMM1 # Round 9
1101 AESENC \TMP3, \XMM2
1102 AESENC \TMP3, \XMM3
1103 AESENC \TMP3, \XMM4
1104 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1105 lea 0xa0(%arg1),%r10
1106 mov keysize,%eax
1107 shr $2,%eax # 128->4, 192->6, 256->8
1108 sub $4,%eax # 128->0, 192->2, 256->4
1109 jz aes_loop_par_dec_done
1110
1111 aes_loop_par_dec:
1112 MOVADQ (%r10),\TMP3
1113 .irpc index, 1234
1114 AESENC \TMP3, %xmm\index
1115 .endr
1116 add $16,%r10
1117 sub $1,%eax
1118 jnz aes_loop_par_dec
1119
1120 aes_loop_par_dec_done:
1121 MOVADQ (%r10), \TMP3
1122 AESENCLAST \TMP3, \XMM1 # last round
1123 AESENCLAST \TMP3, \XMM2
1124 AESENCLAST \TMP3, \XMM3
1125 AESENCLAST \TMP3, \XMM4
1126 movdqa HashKey_k(%rsp), \TMP5
1127 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1128 movdqu (%arg3,%r11,1), \TMP3
1129 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1130 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1131 movdqa \TMP3, \XMM1
1132 movdqu 16(%arg3,%r11,1), \TMP3
1133 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1134 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1135 movdqa \TMP3, \XMM2
1136 movdqu 32(%arg3,%r11,1), \TMP3
1137 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1138 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1139 movdqa \TMP3, \XMM3
1140 movdqu 48(%arg3,%r11,1), \TMP3
1141 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1142 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1143 movdqa \TMP3, \XMM4
1144 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1145 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1146 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1147 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1148
1149 pxor \TMP4, \TMP1
1150 pxor \XMM8, \XMM5
1151 pxor \TMP6, \TMP2
1152 pxor \TMP1, \TMP2
1153 pxor \XMM5, \TMP2
1154 movdqa \TMP2, \TMP3
1155 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1156 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1157 pxor \TMP3, \XMM5
1158 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1159
1160 # first phase of reduction
1161
1162 movdqa \XMM5, \TMP2
1163 movdqa \XMM5, \TMP3
1164 movdqa \XMM5, \TMP4
1165 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1166 pslld $31, \TMP2 # packed right shift << 31
1167 pslld $30, \TMP3 # packed right shift << 30
1168 pslld $25, \TMP4 # packed right shift << 25
1169 pxor \TMP3, \TMP2 # xor the shifted versions
1170 pxor \TMP4, \TMP2
1171 movdqa \TMP2, \TMP5
1172 psrldq $4, \TMP5 # right shift T5 1 DW
1173 pslldq $12, \TMP2 # left shift T2 3 DWs
1174 pxor \TMP2, \XMM5
1175
1176 # second phase of reduction
1177
1178 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1179 movdqa \XMM5,\TMP3
1180 movdqa \XMM5,\TMP4
1181 psrld $1, \TMP2 # packed left shift >>1
1182 psrld $2, \TMP3 # packed left shift >>2
1183 psrld $7, \TMP4 # packed left shift >>7
1184 pxor \TMP3,\TMP2 # xor the shifted versions
1185 pxor \TMP4,\TMP2
1186 pxor \TMP5, \TMP2
1187 pxor \TMP2, \XMM5
1188 pxor \TMP1, \XMM5 # result is in XMM5
1189
1190 pxor \XMM5, \XMM1
1191 .endm
1192
1193 /* GHASH the last 4 ciphertext blocks. */
1194 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1195 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1196
1197 # Multiply TMP6 * HashKey (using Karatsuba)
1198
1199 movdqa \XMM1, \TMP6
1200 pshufd $78, \XMM1, \TMP2
1201 pxor \XMM1, \TMP2
1202 movdqa HashKey_4(%rsp), \TMP5
1203 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1204 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1205 movdqa HashKey_4_k(%rsp), \TMP4
1206 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1207 movdqa \XMM1, \XMMDst
1208 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1209
1210 # Multiply TMP1 * HashKey (using Karatsuba)
1211
1212 movdqa \XMM2, \TMP1
1213 pshufd $78, \XMM2, \TMP2
1214 pxor \XMM2, \TMP2
1215 movdqa HashKey_3(%rsp), \TMP5
1216 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1217 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1218 movdqa HashKey_3_k(%rsp), \TMP4
1219 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1220 pxor \TMP1, \TMP6
1221 pxor \XMM2, \XMMDst
1222 pxor \TMP2, \XMM1
1223 # results accumulated in TMP6, XMMDst, XMM1
1224
1225 # Multiply TMP1 * HashKey (using Karatsuba)
1226
1227 movdqa \XMM3, \TMP1
1228 pshufd $78, \XMM3, \TMP2
1229 pxor \XMM3, \TMP2
1230 movdqa HashKey_2(%rsp), \TMP5
1231 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1232 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1233 movdqa HashKey_2_k(%rsp), \TMP4
1234 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1235 pxor \TMP1, \TMP6
1236 pxor \XMM3, \XMMDst
1237 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1238
1239 # Multiply TMP1 * HashKey (using Karatsuba)
1240 movdqa \XMM4, \TMP1
1241 pshufd $78, \XMM4, \TMP2
1242 pxor \XMM4, \TMP2
1243 movdqa HashKey(%rsp), \TMP5
1244 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1245 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1246 movdqa HashKey_k(%rsp), \TMP4
1247 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1248 pxor \TMP1, \TMP6
1249 pxor \XMM4, \XMMDst
1250 pxor \XMM1, \TMP2
1251 pxor \TMP6, \TMP2
1252 pxor \XMMDst, \TMP2
1253 # middle section of the temp results combined as in karatsuba algorithm
1254 movdqa \TMP2, \TMP4
1255 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1256 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1257 pxor \TMP4, \XMMDst
1258 pxor \TMP2, \TMP6
1259 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1260 # first phase of the reduction
1261 movdqa \XMMDst, \TMP2
1262 movdqa \XMMDst, \TMP3
1263 movdqa \XMMDst, \TMP4
1264 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1265 pslld $31, \TMP2 # packed right shifting << 31
1266 pslld $30, \TMP3 # packed right shifting << 30
1267 pslld $25, \TMP4 # packed right shifting << 25
1268 pxor \TMP3, \TMP2 # xor the shifted versions
1269 pxor \TMP4, \TMP2
1270 movdqa \TMP2, \TMP7
1271 psrldq $4, \TMP7 # right shift TMP7 1 DW
1272 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1273 pxor \TMP2, \XMMDst
1274
1275 # second phase of the reduction
1276 movdqa \XMMDst, \TMP2
1277 # make 3 copies of XMMDst for doing 3 shift operations
1278 movdqa \XMMDst, \TMP3
1279 movdqa \XMMDst, \TMP4
1280 psrld $1, \TMP2 # packed left shift >> 1
1281 psrld $2, \TMP3 # packed left shift >> 2
1282 psrld $7, \TMP4 # packed left shift >> 7
1283 pxor \TMP3, \TMP2 # xor the shifted versions
1284 pxor \TMP4, \TMP2
1285 pxor \TMP7, \TMP2
1286 pxor \TMP2, \XMMDst
1287 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1288 .endm
1289
1290
1291 /* Encryption of a single block
1292 * uses eax & r10
1293 */
1294
1295 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1296
1297 pxor (%arg1), \XMM0
1298 mov keysize,%eax
1299 shr $2,%eax # 128->4, 192->6, 256->8
1300 add $5,%eax # 128->9, 192->11, 256->13
1301 lea 16(%arg1), %r10 # get first expanded key address
1302
1303 _esb_loop_\@:
1304 MOVADQ (%r10),\TMP1
1305 AESENC \TMP1,\XMM0
1306 add $16,%r10
1307 sub $1,%eax
1308 jnz _esb_loop_\@
1309
1310 MOVADQ (%r10),\TMP1
1311 AESENCLAST \TMP1,\XMM0
1312 .endm
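/*
 * Editor's note (illustrative): the "mov keysize ; shr $2 ; add $5" pattern
 * used above and throughout this file derives the number of AESENC rounds
 * (before the final AESENCLAST) from the key length in bytes kept in the
 * AES context. A hedged C equivalent, with an assumed function name:
 *
 *	static int aes_inner_rounds(unsigned int key_length) // 16, 24 or 32 bytes
 *	{
 *		return (int)(key_length >> 2) + 5;  // -> 9, 11 or 13 rounds
 *	}
 */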
1313 /*****************************************************************************
1314 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1315 * u8 *out, // Plaintext output. Decrypt in-place is allowed.
1316 * const u8 *in, // Ciphertext input
1317 * u64 plaintext_len, // Length of data in bytes for decryption.
1318 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1319 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1320 * // concatenated with 0x00000001. 16-byte aligned pointer.
1321 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1322 * const u8 *aad, // Additional Authentication Data (AAD)
1323 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1324 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1325 * // given authentication tag and only return the plaintext if they match.
1326 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1327 * // (most likely), 12 or 8.
1328 *
1329 * Assumptions:
1330 *
1331 * keys:
1332 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1333 * set of 11 keys in the data structure void *aes_ctx
1334 *
1335 * iv:
1336 * 0 1 2 3
1337 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1338 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1339 * | Salt (From the SA) |
1340 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1341 * | Initialization Vector |
1342 * | (This is the sequence number from IPSec header) |
1343 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1344 * | 0x1 |
1345 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1346 *
1347 *
1348 *
1349 * AAD:
1350 * AAD padded to 128 bits with 0
1351 * for example, assume AAD is a u32 vector
1352 *
1353 * if AAD is 8 bytes:
1354 * AAD[3] = {A0, A1};
1355 * padded AAD in xmm register = {A1 A0 0 0}
1356 *
1357 * 0 1 2 3
1358 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1359 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1360 * | SPI (A1) |
1361 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1362 * | 32-bit Sequence Number (A0) |
1363 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1364 * | 0x0 |
1365 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366 *
1367 * AAD Format with 32-bit Sequence Number
1368 *
1369 * if AAD is 12 bytes:
1370 * AAD[3] = {A0, A1, A2};
1371 * padded AAD in xmm register = {A2 A1 A0 0}
1372 *
1373 * 0 1 2 3
1374 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1375 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1378 * | SPI (A2) |
1379 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1380 * | 64-bit Extended Sequence Number {A1,A0} |
1381 * | |
1382 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1383 * | 0x0 |
1384 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1385 *
1386 * AAD Format with 64-bit Extended Sequence Number
1387 *
1388 * aadLen:
1389 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1390 * The code also supports an aadLen of 16; any other size will fail.
1391 *
1392 * TLen:
1393 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1394 * For other sizes, the code will fail.
1395 *
1396 * poly = x^128 + x^127 + x^126 + x^121 + 1
1397 *
1398 *****************************************************************************/
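/*
 * Editor's usage sketch for the prototype documented above (illustrative
 * only; the real callers live in the C glue code, and rfc4106_recv_one(),
 * its buffers and the use of memcmp() here are assumptions - real kernel
 * code would use a constant-time compare such as crypto_memneq()):
 *
 *	void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 *
 *	static int rfc4106_recv_one(void *aes_ctx, u8 *iv, u8 *hash_subkey,
 *				    const u8 *ct, u64 len,
 *				    const u8 *aad, u64 aad_len,
 *				    u8 *pt, const u8 *received_icv)
 *	{
 *		u8 tag[16];
 *
 *		aesni_gcm_dec(aes_ctx, pt, ct, len, iv, hash_subkey,
 *			      aad, aad_len, tag, sizeof(tag));
 *		// only release the plaintext if the tags match
 *		return memcmp(tag, received_icv, sizeof(tag)) ? -EBADMSG : 0;
 *	}
 */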
1399 ENTRY(aesni_gcm_dec)
1400 push %r12
1401 push %r13
1402 push %r14
1403 mov %rsp, %r14
1404 /*
1405 * states of %xmm registers %xmm6:%xmm15 not saved
1406 * all %xmm registers are clobbered
1407 */
1408 sub $VARIABLE_OFFSET, %rsp
1409 and $~63, %rsp # align rsp to 64 bytes
1410 mov %arg6, %r12
1411 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1412 movdqa SHUF_MASK(%rip), %xmm2
1413 PSHUFB_XMM %xmm2, %xmm13
1414
1415
1416 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1417
1418 movdqa %xmm13, %xmm2
1419 psllq $1, %xmm13
1420 psrlq $63, %xmm2
1421 movdqa %xmm2, %xmm1
1422 pslldq $8, %xmm2
1423 psrldq $8, %xmm1
1424 por %xmm2, %xmm13
1425
1426 # Reduction
1427
1428 pshufd $0x24, %xmm1, %xmm2
1429 pcmpeqd TWOONE(%rip), %xmm2
1430 pand POLY(%rip), %xmm2
1431 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1432
1433
1434 # Decrypt first few blocks
1435
1436 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1437 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1438 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1439 mov %r13, %r12
1440 and $(3<<4), %r12
1441 jz _initial_num_blocks_is_0_decrypt
1442 cmp $(2<<4), %r12
1443 jb _initial_num_blocks_is_1_decrypt
1444 je _initial_num_blocks_is_2_decrypt
1445 _initial_num_blocks_is_3_decrypt:
1446 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1447 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1448 sub $48, %r13
1449 jmp _initial_blocks_decrypted
1450 _initial_num_blocks_is_2_decrypt:
1451 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1452 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1453 sub $32, %r13
1454 jmp _initial_blocks_decrypted
1455 _initial_num_blocks_is_1_decrypt:
1456 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1457 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1458 sub $16, %r13
1459 jmp _initial_blocks_decrypted
1460 _initial_num_blocks_is_0_decrypt:
1461 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1462 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1463 _initial_blocks_decrypted:
1464 cmp $0, %r13
1465 je _zero_cipher_left_decrypt
1466 sub $64, %r13
1467 je _four_cipher_left_decrypt
1468 _decrypt_by_4:
1469 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1470 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1471 add $64, %r11
1472 sub $64, %r13
1473 jne _decrypt_by_4
1474 _four_cipher_left_decrypt:
1475 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1476 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1477 _zero_cipher_left_decrypt:
1478 mov %arg4, %r13
1479 and $15, %r13 # %r13 = arg4 (mod 16)
1480 je _multiple_of_16_bytes_decrypt
1481
1482 # Handle the last <16 byte block separately
1483
1484 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1485 movdqa SHUF_MASK(%rip), %xmm10
1486 PSHUFB_XMM %xmm10, %xmm0
1487
1488 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1489 sub $16, %r11
1490 add %r13, %r11
1491 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1492 lea SHIFT_MASK+16(%rip), %r12
1493 sub %r13, %r12
1494 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1495 # (%r13 is the number of bytes in plaintext mod 16)
1496 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1497 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1498
1499 movdqa %xmm1, %xmm2
1500 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1501 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1502 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1503 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1504 pand %xmm1, %xmm2
1505 movdqa SHUF_MASK(%rip), %xmm10
1506 PSHUFB_XMM %xmm10 ,%xmm2
1507
1508 pxor %xmm2, %xmm8
1509 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1510 # GHASH computation for the last <16 byte block
1511 sub %r13, %r11
1512 add $16, %r11
1513
1514 # output %r13 bytes
1515 MOVQ_R64_XMM %xmm0, %rax
1516 cmp $8, %r13
1517 jle _less_than_8_bytes_left_decrypt
1518 mov %rax, (%arg2 , %r11, 1)
1519 add $8, %r11
1520 psrldq $8, %xmm0
1521 MOVQ_R64_XMM %xmm0, %rax
1522 sub $8, %r13
1523 _less_than_8_bytes_left_decrypt:
1524 mov %al, (%arg2, %r11, 1)
1525 add $1, %r11
1526 shr $8, %rax
1527 sub $1, %r13
1528 jne _less_than_8_bytes_left_decrypt
1529 _multiple_of_16_bytes_decrypt:
1530 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1531 shl $3, %r12 # convert into number of bits
1532 movd %r12d, %xmm15 # len(A) in %xmm15
1533 shl $3, %arg4 # len(C) in bits (*8)
1534 MOVQ_R64_XMM %arg4, %xmm1
1535 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1536 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1537 pxor %xmm15, %xmm8
1538 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1539 # final GHASH computation
1540 movdqa SHUF_MASK(%rip), %xmm10
1541 PSHUFB_XMM %xmm10, %xmm8
1542
1543 mov %arg5, %rax # %rax = *Y0
1544 movdqu (%rax), %xmm0 # %xmm0 = Y0
1545 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1546 pxor %xmm8, %xmm0
1547 _return_T_decrypt:
1548 mov arg9, %r10 # %r10 = authTag
1549 mov arg10, %r11 # %r11 = auth_tag_len
1550 cmp $16, %r11
1551 je _T_16_decrypt
1552 cmp $8, %r11
1553 jl _T_4_decrypt
1554 _T_8_decrypt:
1555 MOVQ_R64_XMM %xmm0, %rax
1556 mov %rax, (%r10)
1557 add $8, %r10
1558 sub $8, %r11
1559 psrldq $8, %xmm0
1560 cmp $0, %r11
1561 je _return_T_done_decrypt
1562 _T_4_decrypt:
1563 movd %xmm0, %eax
1564 mov %eax, (%r10)
1565 add $4, %r10
1566 sub $4, %r11
1567 psrldq $4, %xmm0
1568 cmp $0, %r11
1569 je _return_T_done_decrypt
1570 _T_123_decrypt:
1571 movd %xmm0, %eax
1572 cmp $2, %r11
1573 jl _T_1_decrypt
1574 mov %ax, (%r10)
1575 cmp $2, %r11
1576 je _return_T_done_decrypt
1577 add $2, %r10
1578 sar $16, %eax
1579 _T_1_decrypt:
1580 mov %al, (%r10)
1581 jmp _return_T_done_decrypt
1582 _T_16_decrypt:
1583 movdqu %xmm0, (%r10)
1584 _return_T_done_decrypt:
1585 mov %r14, %rsp
1586 pop %r14
1587 pop %r13
1588 pop %r12
1589 ret
1590 ENDPROC(aesni_gcm_dec)
1591
1592
1593 /*****************************************************************************
1594 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1595 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1596 * const u8 *in, // Plaintext input
1597 * u64 plaintext_len, // Length of data in bytes for encryption.
1598 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1599 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1600 * // concatenated with 0x00000001. 16-byte aligned pointer.
1601 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1602 * const u8 *aad, // Additional Authentication Data (AAD)
1603 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1604 * u8 *auth_tag, // Authenticated Tag output.
1605 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1606 * // 12 or 8.
1607 *
1608 * Assumptions:
1609 *
1610 * keys:
1611 * keys are pre-expanded and aligned to 16 bytes. we are using the
1612 * first set of 11 keys in the data structure void *aes_ctx
1613 *
1614 *
1615 * iv:
1616 * 0 1 2 3
1617 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1618 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1619 * | Salt (From the SA) |
1620 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1621 * | Initialization Vector |
1622 * | (This is the sequence number from IPSec header) |
1623 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1624 * | 0x1 |
1625 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1626 *
1627 *
1628 *
1629 * AAD:
1630 * AAD padded to 128 bits with 0
1631 * for example, assume AAD is a u32 vector
1632 *
1633 * if AAD is 8 bytes:
1634 * AAD[3] = {A0, A1};
1635 * padded AAD in xmm register = {A1 A0 0 0}
1636 *
1637 * 0 1 2 3
1638 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1639 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 * | SPI (A1) |
1641 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1642 * | 32-bit Sequence Number (A0) |
1643 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1644 * | 0x0 |
1645 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1646 *
1647 * AAD Format with 32-bit Sequence Number
1648 *
1649 * if AAD is 12 bytes:
1650 * AAD[3] = {A0, A1, A2};
1651 * padded AAD in xmm register = {A2 A1 A0 0}
1652 *
1653 * 0 1 2 3
1654 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1655 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 * | SPI (A2) |
1657 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658 * | 64-bit Extended Sequence Number {A1,A0} |
1659 * | |
1660 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661 * | 0x0 |
1662 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663 *
1664 * AAD Format with 64-bit Extended Sequence Number
1665 *
1666 * aadLen:
1667 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1668 * The code also supports an aadLen of 16; any other size will fail.
1669 *
1670 * TLen:
1671 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1672 * For other sizes, the code will fail.
1673 *
1674 * poly = x^128 + x^127 + x^126 + x^121 + 1
1675 ***************************************************************************/
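/*
 * Editor's sketch (illustrative) of assembling the pre-counter block j0
 * described above: 4-byte salt || 8-byte IV from the ESP payload || a
 * trailing big-endian 0x00000001. The function name and the caller-provided
 * buffers are assumptions.
 *
 *	static void build_rfc4106_iv(u8 j0[16], const u8 salt[4],
 *				     const u8 esp_iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// Salt (from the SA)
 *		memcpy(j0 + 4, esp_iv, 8);	// IV (ESP sequence number)
 *		j0[12] = 0;			// 32-bit big-endian 0x1
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;
 *	}
 */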
1676 ENTRY(aesni_gcm_enc)
1677 push %r12
1678 push %r13
1679 push %r14
1680 mov %rsp, %r14
1681 #
1682 # states of %xmm registers %xmm6:%xmm15 not saved
1683 # all %xmm registers are clobbered
1684 #
1685 sub $VARIABLE_OFFSET, %rsp
1686 and $~63, %rsp
1687 mov %arg6, %r12
1688 movdqu (%r12), %xmm13
1689 movdqa SHUF_MASK(%rip), %xmm2
1690 PSHUFB_XMM %xmm2, %xmm13
1691
1692
1693 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1694
1695 movdqa %xmm13, %xmm2
1696 psllq $1, %xmm13
1697 psrlq $63, %xmm2
1698 movdqa %xmm2, %xmm1
1699 pslldq $8, %xmm2
1700 psrldq $8, %xmm1
1701 por %xmm2, %xmm13
1702
1703 # reduce HashKey<<1
1704
1705 pshufd $0x24, %xmm1, %xmm2
1706 pcmpeqd TWOONE(%rip), %xmm2
1707 pand POLY(%rip), %xmm2
1708 pxor %xmm2, %xmm13
1709 movdqa %xmm13, HashKey(%rsp)
1710 mov %arg4, %r13 # %r13 = plaintext length; %xmm13 holds HashKey<<1 (mod poly)
1711 and $-16, %r13
1712 mov %r13, %r12
1713
1714 # Encrypt first few blocks
1715
1716 and $(3<<4), %r12
1717 jz _initial_num_blocks_is_0_encrypt
1718 cmp $(2<<4), %r12
1719 jb _initial_num_blocks_is_1_encrypt
1720 je _initial_num_blocks_is_2_encrypt
1721 _initial_num_blocks_is_3_encrypt:
1722 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1723 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1724 sub $48, %r13
1725 jmp _initial_blocks_encrypted
1726 _initial_num_blocks_is_2_encrypt:
1727 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1728 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1729 sub $32, %r13
1730 jmp _initial_blocks_encrypted
1731 _initial_num_blocks_is_1_encrypt:
1732 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1733 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1734 sub $16, %r13
1735 jmp _initial_blocks_encrypted
1736 _initial_num_blocks_is_0_encrypt:
1737 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1738 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1739 _initial_blocks_encrypted:
1740
1741 # Main loop - Encrypt remaining blocks
1742
1743 cmp $0, %r13
1744 je _zero_cipher_left_encrypt
1745 sub $64, %r13
1746 je _four_cipher_left_encrypt
1747 _encrypt_by_4_encrypt:
1748 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1749 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1750 add $64, %r11
1751 sub $64, %r13
1752 jne _encrypt_by_4_encrypt
1753 _four_cipher_left_encrypt:
1754 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1755 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1756 _zero_cipher_left_encrypt:
1757 mov %arg4, %r13
1758 and $15, %r13 # %r13 = arg4 (mod 16)
1759 je _multiple_of_16_bytes_encrypt
1760
1761 # Handle the last <16 Byte block separately
1762 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1763 movdqa SHUF_MASK(%rip), %xmm10
1764 PSHUFB_XMM %xmm10, %xmm0
1765
1766
1767 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1768 sub $16, %r11
1769 add %r13, %r11
1770 movdqu (%arg3,%r11,1), %xmm1 # load the last <16 byte block
1771 lea SHIFT_MASK+16(%rip), %r12
1772 sub %r13, %r12
1773 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1774 # (%r13 is the number of bytes in plaintext mod 16)
1775 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1776 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
1777 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1778 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1779 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1780 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1781 movdqa SHUF_MASK(%rip), %xmm10
1782 PSHUFB_XMM %xmm10,%xmm0
1783
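	# The load/shift/mask sequence above avoids reading past the end of the
	# input by loading the 16 bytes that end at the partial block and
	# shifting them into place.  Its net effect, with r = plaintext length
	# mod 16, is roughly the following illustrative C sketch
	# (last_plaintext_bytes and keystream are placeholder names):
	#
	#	unsigned char block[16];
	#
	#	memset(block, 0, sizeof(block));
	#	memcpy(block, last_plaintext_bytes, r);	// low r bytes valid
	#	for (int i = 0; i < 16; i++)
	#		block[i] ^= keystream[i];	// Encrypt(K, Yn)
	#	memset(block + r, 0, 16 - r);		// drop keystream-only bytes
	#	// block now holds the final r ciphertext bytes, zero padded,
	#	// and (after the byte swap) is folded into the GHASH state below.
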
1784 pxor %xmm0, %xmm8
1785 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1786 # GHASH computation for the last <16 byte block
1787 sub %r13, %r11
1788 add $16, %r11
1789
1790 movdqa SHUF_MASK(%rip), %xmm10
1791 PSHUFB_XMM %xmm10, %xmm0
1792
1793 # shuffle xmm0 back to output as ciphertext
1794
1795 # Output %r13 bytes
1796 MOVQ_R64_XMM %xmm0, %rax
1797 cmp $8, %r13
1798 jle _less_than_8_bytes_left_encrypt
1799 mov %rax, (%arg2 , %r11, 1)
1800 add $8, %r11
1801 psrldq $8, %xmm0
1802 MOVQ_R64_XMM %xmm0, %rax
1803 sub $8, %r13
1804 _less_than_8_bytes_left_encrypt:
1805 mov %al, (%arg2, %r11, 1)
1806 add $1, %r11
1807 shr $8, %rax
1808 sub $1, %r13
1809 jne _less_than_8_bytes_left_encrypt
1810 _multiple_of_16_bytes_encrypt:
1811 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1812 shl $3, %r12
1813 movd %r12d, %xmm15 # len(A) in %xmm15
1814 shl $3, %arg4 # len(C) in bits (*8)
1815 MOVQ_R64_XMM %arg4, %xmm1
1816 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1817 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1818 pxor %xmm15, %xmm8
1819 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1820 # final GHASH computation
1821 movdqa SHUF_MASK(%rip), %xmm10
1822 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1823
1824 mov %arg5, %rax # %rax = *Y0
1825 movdqu (%rax), %xmm0 # %xmm0 = Y0
1826 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1827 pxor %xmm8, %xmm0
1828 _return_T_encrypt:
1829 mov arg9, %r10 # %r10 = authTag
1830 mov arg10, %r11 # %r11 = auth_tag_len
1831 cmp $16, %r11
1832 je _T_16_encrypt
1833 cmp $8, %r11
1834 jl _T_4_encrypt
1835 _T_8_encrypt:
1836 MOVQ_R64_XMM %xmm0, %rax
1837 mov %rax, (%r10)
1838 add $8, %r10
1839 sub $8, %r11
1840 psrldq $8, %xmm0
1841 cmp $0, %r11
1842 je _return_T_done_encrypt
1843 _T_4_encrypt:
1844 movd %xmm0, %eax
1845 mov %eax, (%r10)
1846 add $4, %r10
1847 sub $4, %r11
1848 psrldq $4, %xmm0
1849 cmp $0, %r11
1850 je _return_T_done_encrypt
1851 _T_123_encrypt:
1852 movd %xmm0, %eax
1853 cmp $2, %r11
1854 jl _T_1_encrypt
1855 mov %ax, (%r10)
1856 cmp $2, %r11
1857 je _return_T_done_encrypt
1858 add $2, %r10
1859 sar $16, %eax
1860 _T_1_encrypt:
1861 mov %al, (%r10)
1862 jmp _return_T_done_encrypt
1863 _T_16_encrypt:
1864 movdqu %xmm0, (%r10)
1865 _return_T_done_encrypt:
1866 mov %r14, %rsp
1867 pop %r14
1868 pop %r13
1869 pop %r12
1870 ret
1871 ENDPROC(aesni_gcm_enc)
1872
1873 #endif
1874
1875
1876 .align 4
1877 _key_expansion_128:
1878 _key_expansion_256a:
1879 pshufd $0b11111111, %xmm1, %xmm1
1880 shufps $0b00010000, %xmm0, %xmm4
1881 pxor %xmm4, %xmm0
1882 shufps $0b10001100, %xmm0, %xmm4
1883 pxor %xmm4, %xmm0
1884 pxor %xmm1, %xmm0
1885 movaps %xmm0, (TKEYP)
1886 add $0x10, TKEYP
1887 ret
1888 ENDPROC(_key_expansion_128)
1889 ENDPROC(_key_expansion_256a)
1890
1891 .align 4
1892 _key_expansion_192a:
1893 pshufd $0b01010101, %xmm1, %xmm1
1894 shufps $0b00010000, %xmm0, %xmm4
1895 pxor %xmm4, %xmm0
1896 shufps $0b10001100, %xmm0, %xmm4
1897 pxor %xmm4, %xmm0
1898 pxor %xmm1, %xmm0
1899
1900 movaps %xmm2, %xmm5
1901 movaps %xmm2, %xmm6
1902 pslldq $4, %xmm5
1903 pshufd $0b11111111, %xmm0, %xmm3
1904 pxor %xmm3, %xmm2
1905 pxor %xmm5, %xmm2
1906
1907 movaps %xmm0, %xmm1
1908 shufps $0b01000100, %xmm0, %xmm6
1909 movaps %xmm6, (TKEYP)
1910 shufps $0b01001110, %xmm2, %xmm1
1911 movaps %xmm1, 0x10(TKEYP)
1912 add $0x20, TKEYP
1913 ret
1914 ENDPROC(_key_expansion_192a)
1915
1916 .align 4
1917 _key_expansion_192b:
1918 pshufd $0b01010101, %xmm1, %xmm1
1919 shufps $0b00010000, %xmm0, %xmm4
1920 pxor %xmm4, %xmm0
1921 shufps $0b10001100, %xmm0, %xmm4
1922 pxor %xmm4, %xmm0
1923 pxor %xmm1, %xmm0
1924
1925 movaps %xmm2, %xmm5
1926 pslldq $4, %xmm5
1927 pshufd $0b11111111, %xmm0, %xmm3
1928 pxor %xmm3, %xmm2
1929 pxor %xmm5, %xmm2
1930
1931 movaps %xmm0, (TKEYP)
1932 add $0x10, TKEYP
1933 ret
1934 ENDPROC(_key_expansion_192b)
1935
1936 .align 4
1937 _key_expansion_256b:
1938 pshufd $0b10101010, %xmm1, %xmm1
1939 shufps $0b00010000, %xmm2, %xmm4
1940 pxor %xmm4, %xmm2
1941 shufps $0b10001100, %xmm2, %xmm4
1942 pxor %xmm4, %xmm2
1943 pxor %xmm1, %xmm2
1944 movaps %xmm2, (TKEYP)
1945 add $0x10, TKEYP
1946 ret
1947 ENDPROC(_key_expansion_256b)
1948
1949 /*
1950 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1951 * unsigned int key_len)
1952 */
1953 ENTRY(aesni_set_key)
1954 FRAME_BEGIN
1955 #ifndef __x86_64__
1956 pushl KEYP
1957 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1958 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1959 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1960 #endif
1961 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1962 movaps %xmm0, (KEYP)
1963 lea 0x10(KEYP), TKEYP # key addr
1964 movl %edx, 480(KEYP)
1965 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1966 cmp $24, %dl
1967 jb .Lenc_key128
1968 je .Lenc_key192
1969 movups 0x10(UKEYP), %xmm2 # other user key
1970 movaps %xmm2, (TKEYP)
1971 add $0x10, TKEYP
1972 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1973 call _key_expansion_256a
1974 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1975 call _key_expansion_256b
1976 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1977 call _key_expansion_256a
1978 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1979 call _key_expansion_256b
1980 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1981 call _key_expansion_256a
1982 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1983 call _key_expansion_256b
1984 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1985 call _key_expansion_256a
1986 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1987 call _key_expansion_256b
1988 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1989 call _key_expansion_256a
1990 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1991 call _key_expansion_256b
1992 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1993 call _key_expansion_256a
1994 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1995 call _key_expansion_256b
1996 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1997 call _key_expansion_256a
1998 jmp .Ldec_key
1999 .Lenc_key192:
2000 movq 0x10(UKEYP), %xmm2 # other user key
2001 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
2002 call _key_expansion_192a
2003 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
2004 call _key_expansion_192b
2005 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
2006 call _key_expansion_192a
2007 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
2008 call _key_expansion_192b
2009 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
2010 call _key_expansion_192a
2011 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
2012 call _key_expansion_192b
2013 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
2014 call _key_expansion_192a
2015 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
2016 call _key_expansion_192b
2017 jmp .Ldec_key
2018 .Lenc_key128:
2019 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
2020 call _key_expansion_128
2021 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
2022 call _key_expansion_128
2023 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
2024 call _key_expansion_128
2025 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
2026 call _key_expansion_128
2027 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
2028 call _key_expansion_128
2029 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
2030 call _key_expansion_128
2031 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
2032 call _key_expansion_128
2033 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
2034 call _key_expansion_128
2035 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
2036 call _key_expansion_128
2037 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
2038 call _key_expansion_128
2039 .Ldec_key:
2040 sub $0x10, TKEYP
2041 movaps (KEYP), %xmm0
2042 movaps (TKEYP), %xmm1
2043 movaps %xmm0, 240(TKEYP)
2044 movaps %xmm1, 240(KEYP)
2045 add $0x10, KEYP
2046 lea 240-16(TKEYP), UKEYP
2047 .align 4
2048 .Ldec_key_loop:
2049 movaps (KEYP), %xmm0
2050 AESIMC %xmm0 %xmm1
2051 movaps %xmm1, (UKEYP)
2052 add $0x10, KEYP
2053 sub $0x10, UKEYP
2054 cmp TKEYP, KEYP
2055 jb .Ldec_key_loop
2056 xor AREG, AREG
2057 #ifndef __x86_64__
2058 popl KEYP
2059 #endif
2060 FRAME_END
2061 ret
2062 ENDPROC(aesni_set_key)
2063
2064 /*
2065 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2066 */
2067 ENTRY(aesni_enc)
2068 FRAME_BEGIN
2069 #ifndef __x86_64__
2070 pushl KEYP
2071 pushl KLEN
2072 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2073 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2074 movl (FRAME_OFFSET+20)(%esp), INP # src
2075 #endif
2076 movl 480(KEYP), KLEN # key length
2077 movups (INP), STATE # input
2078 call _aesni_enc1
2079 movups STATE, (OUTP) # output
2080 #ifndef __x86_64__
2081 popl KLEN
2082 popl KEYP
2083 #endif
2084 FRAME_END
2085 ret
2086 ENDPROC(aesni_enc)
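
/*
 * A minimal usage sketch for aesni_set_key() and aesni_enc(), assuming the
 * asmlinkage declarations from the C glue code and a context in which the
 * FPU may be used.  Illustrative only; the real callers live in
 * aesni-intel_glue.c.  Note the 16-byte alignment: the key schedule is
 * accessed with movaps above.
 *
 *	#include <linux/linkage.h>
 *	#include <linux/types.h>
 *	#include <crypto/aes.h>		// struct crypto_aes_ctx
 *	#include <asm/fpu/api.h>	// kernel_fpu_begin/end
 *
 *	asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx,
 *				     const u8 *in_key, unsigned int key_len);
 *	asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst,
 *				  const u8 *src);
 *
 *	static void encrypt_one_block(const u8 key[16], u8 dst[16],
 *				      const u8 src[16])
 *	{
 *		struct crypto_aes_ctx ctx __attribute__((aligned(16)));
 *
 *		kernel_fpu_begin();
 *		aesni_set_key(&ctx, key, 16);	// expand a 128-bit key
 *		aesni_enc(&ctx, dst, src);	// one 16-byte ECB block
 *		kernel_fpu_end();
 *	}
 */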
2087
2088 /*
2089 * _aesni_enc1: internal ABI
2090 * input:
2091 * KEYP: key struct pointer
2092 * KLEN: key length
2093 * STATE: initial state (input)
2094 * output:
2095 * STATE: final state (output)
2096 * changed:
2097 * KEY
2098 * TKEYP (T1)
2099 */
2100 .align 4
2101 _aesni_enc1:
2102 movaps (KEYP), KEY # key
2103 mov KEYP, TKEYP
2104 pxor KEY, STATE # round 0
2105 add $0x30, TKEYP
2106 cmp $24, KLEN
2107 jb .Lenc128
2108 lea 0x20(TKEYP), TKEYP
2109 je .Lenc192
2110 add $0x20, TKEYP
2111 movaps -0x60(TKEYP), KEY
2112 AESENC KEY STATE
2113 movaps -0x50(TKEYP), KEY
2114 AESENC KEY STATE
2115 .align 4
2116 .Lenc192:
2117 movaps -0x40(TKEYP), KEY
2118 AESENC KEY STATE
2119 movaps -0x30(TKEYP), KEY
2120 AESENC KEY STATE
2121 .align 4
2122 .Lenc128:
2123 movaps -0x20(TKEYP), KEY
2124 AESENC KEY STATE
2125 movaps -0x10(TKEYP), KEY
2126 AESENC KEY STATE
2127 movaps (TKEYP), KEY
2128 AESENC KEY STATE
2129 movaps 0x10(TKEYP), KEY
2130 AESENC KEY STATE
2131 movaps 0x20(TKEYP), KEY
2132 AESENC KEY STATE
2133 movaps 0x30(TKEYP), KEY
2134 AESENC KEY STATE
2135 movaps 0x40(TKEYP), KEY
2136 AESENC KEY STATE
2137 movaps 0x50(TKEYP), KEY
2138 AESENC KEY STATE
2139 movaps 0x60(TKEYP), KEY
2140 AESENC KEY STATE
2141 movaps 0x70(TKEYP), KEY
2142 AESENCLAST KEY STATE
2143 ret
2144 ENDPROC(_aesni_enc1)
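
/*
 * The KLEN dispatch above only decides how many AESENC rounds are executed
 * before the final AESENCLAST: 10 for AES-128, 12 for AES-192 and 14 for
 * AES-256.  In C terms (illustrative sketch):
 *
 *	static int aes_rounds(unsigned int key_length)	// KLEN, in bytes
 *	{
 *		switch (key_length) {
 *		case 16: return 10;	// jb .Lenc128
 *		case 24: return 12;	// je .Lenc192
 *		default: return 14;	// fall through for AES-256
 *		}
 *	}
 */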
2145
2146 /*
2147 * _aesni_enc4: internal ABI
2148 * input:
2149 * KEYP: key struct pointer
2150 * KLEN: key length
2151 * STATE1: initial state (input)
2152 * STATE2
2153 * STATE3
2154 * STATE4
2155 * output:
2156 * STATE1: final state (output)
2157 * STATE2
2158 * STATE3
2159 * STATE4
2160 * changed:
2161 * KEY
2162 * TKEYP (T1)
2163 */
2164 .align 4
2165 _aesni_enc4:
2166 movaps (KEYP), KEY # key
2167 mov KEYP, TKEYP
2168 pxor KEY, STATE1 # round 0
2169 pxor KEY, STATE2
2170 pxor KEY, STATE3
2171 pxor KEY, STATE4
2172 add $0x30, TKEYP
2173 cmp $24, KLEN
2174 jb .L4enc128
2175 lea 0x20(TKEYP), TKEYP
2176 je .L4enc192
2177 add $0x20, TKEYP
2178 movaps -0x60(TKEYP), KEY
2179 AESENC KEY STATE1
2180 AESENC KEY STATE2
2181 AESENC KEY STATE3
2182 AESENC KEY STATE4
2183 movaps -0x50(TKEYP), KEY
2184 AESENC KEY STATE1
2185 AESENC KEY STATE2
2186 AESENC KEY STATE3
2187 AESENC KEY STATE4
2188 #.align 4
2189 .L4enc192:
2190 movaps -0x40(TKEYP), KEY
2191 AESENC KEY STATE1
2192 AESENC KEY STATE2
2193 AESENC KEY STATE3
2194 AESENC KEY STATE4
2195 movaps -0x30(TKEYP), KEY
2196 AESENC KEY STATE1
2197 AESENC KEY STATE2
2198 AESENC KEY STATE3
2199 AESENC KEY STATE4
2200 #.align 4
2201 .L4enc128:
2202 movaps -0x20(TKEYP), KEY
2203 AESENC KEY STATE1
2204 AESENC KEY STATE2
2205 AESENC KEY STATE3
2206 AESENC KEY STATE4
2207 movaps -0x10(TKEYP), KEY
2208 AESENC KEY STATE1
2209 AESENC KEY STATE2
2210 AESENC KEY STATE3
2211 AESENC KEY STATE4
2212 movaps (TKEYP), KEY
2213 AESENC KEY STATE1
2214 AESENC KEY STATE2
2215 AESENC KEY STATE3
2216 AESENC KEY STATE4
2217 movaps 0x10(TKEYP), KEY
2218 AESENC KEY STATE1
2219 AESENC KEY STATE2
2220 AESENC KEY STATE3
2221 AESENC KEY STATE4
2222 movaps 0x20(TKEYP), KEY
2223 AESENC KEY STATE1
2224 AESENC KEY STATE2
2225 AESENC KEY STATE3
2226 AESENC KEY STATE4
2227 movaps 0x30(TKEYP), KEY
2228 AESENC KEY STATE1
2229 AESENC KEY STATE2
2230 AESENC KEY STATE3
2231 AESENC KEY STATE4
2232 movaps 0x40(TKEYP), KEY
2233 AESENC KEY STATE1
2234 AESENC KEY STATE2
2235 AESENC KEY STATE3
2236 AESENC KEY STATE4
2237 movaps 0x50(TKEYP), KEY
2238 AESENC KEY STATE1
2239 AESENC KEY STATE2
2240 AESENC KEY STATE3
2241 AESENC KEY STATE4
2242 movaps 0x60(TKEYP), KEY
2243 AESENC KEY STATE1
2244 AESENC KEY STATE2
2245 AESENC KEY STATE3
2246 AESENC KEY STATE4
2247 movaps 0x70(TKEYP), KEY
2248 AESENCLAST KEY STATE1 # last round
2249 AESENCLAST KEY STATE2
2250 AESENCLAST KEY STATE3
2251 AESENCLAST KEY STATE4
2252 ret
2253 ENDPROC(_aesni_enc4)
2254
2255 /*
2256 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2257 */
2258 ENTRY(aesni_dec)
2259 FRAME_BEGIN
2260 #ifndef __x86_64__
2261 pushl KEYP
2262 pushl KLEN
2263 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2264 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2265 movl (FRAME_OFFSET+20)(%esp), INP # src
2266 #endif
2267 mov 480(KEYP), KLEN # key length
2268 add $240, KEYP
2269 movups (INP), STATE # input
2270 call _aesni_dec1
2271 movups STATE, (OUTP) # output
2272 #ifndef __x86_64__
2273 popl KLEN
2274 popl KEYP
2275 #endif
2276 FRAME_END
2277 ret
2278 ENDPROC(aesni_dec)
2279
2280 /*
2281 * _aesni_dec1: internal ABI
2282 * input:
2283 * KEYP: key struct pointer
2284 * KLEN: key length
2285 * STATE: initial state (input)
2286 * output:
2287 * STATE: final state (output)
2288 * changed:
2289 * KEY
2290 * TKEYP (T1)
2291 */
2292 .align 4
2293 _aesni_dec1:
2294 movaps (KEYP), KEY # key
2295 mov KEYP, TKEYP
2296 pxor KEY, STATE # round 0
2297 add $0x30, TKEYP
2298 cmp $24, KLEN
2299 jb .Ldec128
2300 lea 0x20(TKEYP), TKEYP
2301 je .Ldec192
2302 add $0x20, TKEYP
2303 movaps -0x60(TKEYP), KEY
2304 AESDEC KEY STATE
2305 movaps -0x50(TKEYP), KEY
2306 AESDEC KEY STATE
2307 .align 4
2308 .Ldec192:
2309 movaps -0x40(TKEYP), KEY
2310 AESDEC KEY STATE
2311 movaps -0x30(TKEYP), KEY
2312 AESDEC KEY STATE
2313 .align 4
2314 .Ldec128:
2315 movaps -0x20(TKEYP), KEY
2316 AESDEC KEY STATE
2317 movaps -0x10(TKEYP), KEY
2318 AESDEC KEY STATE
2319 movaps (TKEYP), KEY
2320 AESDEC KEY STATE
2321 movaps 0x10(TKEYP), KEY
2322 AESDEC KEY STATE
2323 movaps 0x20(TKEYP), KEY
2324 AESDEC KEY STATE
2325 movaps 0x30(TKEYP), KEY
2326 AESDEC KEY STATE
2327 movaps 0x40(TKEYP), KEY
2328 AESDEC KEY STATE
2329 movaps 0x50(TKEYP), KEY
2330 AESDEC KEY STATE
2331 movaps 0x60(TKEYP), KEY
2332 AESDEC KEY STATE
2333 movaps 0x70(TKEYP), KEY
2334 AESDECLAST KEY STATE
2335 ret
2336 ENDPROC(_aesni_dec1)
2337
2338 /*
2339 * _aesni_dec4: internal ABI
2340 * input:
2341 * KEYP: key struct pointer
2342 * KLEN: key length
2343 * STATE1: initial state (input)
2344 * STATE2
2345 * STATE3
2346 * STATE4
2347 * output:
2348 * STATE1: final state (output)
2349 * STATE2
2350 * STATE3
2351 * STATE4
2352 * changed:
2353 * KEY
2354 * TKEYP (T1)
2355 */
2356 .align 4
2357 _aesni_dec4:
2358 movaps (KEYP), KEY # key
2359 mov KEYP, TKEYP
2360 pxor KEY, STATE1 # round 0
2361 pxor KEY, STATE2
2362 pxor KEY, STATE3
2363 pxor KEY, STATE4
2364 add $0x30, TKEYP
2365 cmp $24, KLEN
2366 jb .L4dec128
2367 lea 0x20(TKEYP), TKEYP
2368 je .L4dec192
2369 add $0x20, TKEYP
2370 movaps -0x60(TKEYP), KEY
2371 AESDEC KEY STATE1
2372 AESDEC KEY STATE2
2373 AESDEC KEY STATE3
2374 AESDEC KEY STATE4
2375 movaps -0x50(TKEYP), KEY
2376 AESDEC KEY STATE1
2377 AESDEC KEY STATE2
2378 AESDEC KEY STATE3
2379 AESDEC KEY STATE4
2380 .align 4
2381 .L4dec192:
2382 movaps -0x40(TKEYP), KEY
2383 AESDEC KEY STATE1
2384 AESDEC KEY STATE2
2385 AESDEC KEY STATE3
2386 AESDEC KEY STATE4
2387 movaps -0x30(TKEYP), KEY
2388 AESDEC KEY STATE1
2389 AESDEC KEY STATE2
2390 AESDEC KEY STATE3
2391 AESDEC KEY STATE4
2392 .align 4
2393 .L4dec128:
2394 movaps -0x20(TKEYP), KEY
2395 AESDEC KEY STATE1
2396 AESDEC KEY STATE2
2397 AESDEC KEY STATE3
2398 AESDEC KEY STATE4
2399 movaps -0x10(TKEYP), KEY
2400 AESDEC KEY STATE1
2401 AESDEC KEY STATE2
2402 AESDEC KEY STATE3
2403 AESDEC KEY STATE4
2404 movaps (TKEYP), KEY
2405 AESDEC KEY STATE1
2406 AESDEC KEY STATE2
2407 AESDEC KEY STATE3
2408 AESDEC KEY STATE4
2409 movaps 0x10(TKEYP), KEY
2410 AESDEC KEY STATE1
2411 AESDEC KEY STATE2
2412 AESDEC KEY STATE3
2413 AESDEC KEY STATE4
2414 movaps 0x20(TKEYP), KEY
2415 AESDEC KEY STATE1
2416 AESDEC KEY STATE2
2417 AESDEC KEY STATE3
2418 AESDEC KEY STATE4
2419 movaps 0x30(TKEYP), KEY
2420 AESDEC KEY STATE1
2421 AESDEC KEY STATE2
2422 AESDEC KEY STATE3
2423 AESDEC KEY STATE4
2424 movaps 0x40(TKEYP), KEY
2425 AESDEC KEY STATE1
2426 AESDEC KEY STATE2
2427 AESDEC KEY STATE3
2428 AESDEC KEY STATE4
2429 movaps 0x50(TKEYP), KEY
2430 AESDEC KEY STATE1
2431 AESDEC KEY STATE2
2432 AESDEC KEY STATE3
2433 AESDEC KEY STATE4
2434 movaps 0x60(TKEYP), KEY
2435 AESDEC KEY STATE1
2436 AESDEC KEY STATE2
2437 AESDEC KEY STATE3
2438 AESDEC KEY STATE4
2439 movaps 0x70(TKEYP), KEY
2440 AESDECLAST KEY STATE1 # last round
2441 AESDECLAST KEY STATE2
2442 AESDECLAST KEY STATE3
2443 AESDECLAST KEY STATE4
2444 ret
2445 ENDPROC(_aesni_dec4)
2446
2447 /*
2448 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2449 * size_t len)
2450 */
2451 ENTRY(aesni_ecb_enc)
2452 FRAME_BEGIN
2453 #ifndef __x86_64__
2454 pushl LEN
2455 pushl KEYP
2456 pushl KLEN
2457 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2458 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2459 movl (FRAME_OFFSET+24)(%esp), INP # src
2460 movl (FRAME_OFFSET+28)(%esp), LEN # len
2461 #endif
2462 test LEN, LEN # check length
2463 jz .Lecb_enc_ret
2464 mov 480(KEYP), KLEN
2465 cmp $16, LEN
2466 jb .Lecb_enc_ret
2467 cmp $64, LEN
2468 jb .Lecb_enc_loop1
2469 .align 4
2470 .Lecb_enc_loop4:
2471 movups (INP), STATE1
2472 movups 0x10(INP), STATE2
2473 movups 0x20(INP), STATE3
2474 movups 0x30(INP), STATE4
2475 call _aesni_enc4
2476 movups STATE1, (OUTP)
2477 movups STATE2, 0x10(OUTP)
2478 movups STATE3, 0x20(OUTP)
2479 movups STATE4, 0x30(OUTP)
2480 sub $64, LEN
2481 add $64, INP
2482 add $64, OUTP
2483 cmp $64, LEN
2484 jge .Lecb_enc_loop4
2485 cmp $16, LEN
2486 jb .Lecb_enc_ret
2487 .align 4
2488 .Lecb_enc_loop1:
2489 movups (INP), STATE1
2490 call _aesni_enc1
2491 movups STATE1, (OUTP)
2492 sub $16, LEN
2493 add $16, INP
2494 add $16, OUTP
2495 cmp $16, LEN
2496 jge .Lecb_enc_loop1
2497 .Lecb_enc_ret:
2498 #ifndef __x86_64__
2499 popl KLEN
2500 popl KEYP
2501 popl LEN
2502 #endif
2503 FRAME_END
2504 ret
2505 ENDPROC(aesni_ecb_enc)
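
/*
 * Control flow of aesni_ecb_enc above, as an illustrative C sketch (types
 * and declarations as in the sketch after aesni_enc; aesni_enc() stands in
 * for the internal _aesni_enc4/_aesni_enc1 calls, which keep four blocks in
 * flight per iteration of .Lecb_enc_loop4):
 *
 *	static void ecb_enc_sketch(struct crypto_aes_ctx *ctx, u8 *dst,
 *				   const u8 *src, unsigned int len)
 *	{
 *		int i;
 *
 *		while (len >= 64) {			// .Lecb_enc_loop4
 *			for (i = 0; i < 4; i++)
 *				aesni_enc(ctx, dst + 16 * i, src + 16 * i);
 *			src += 64; dst += 64; len -= 64;
 *		}
 *		while (len >= 16) {			// .Lecb_enc_loop1
 *			aesni_enc(ctx, dst, src);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		// a trailing partial block (< 16 bytes) is not processed
 *	}
 */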
2506
2507 /*
2508 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2509 * size_t len);
2510 */
2511 ENTRY(aesni_ecb_dec)
2512 FRAME_BEGIN
2513 #ifndef __x86_64__
2514 pushl LEN
2515 pushl KEYP
2516 pushl KLEN
2517 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2518 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2519 movl (FRAME_OFFSET+24)(%esp), INP # src
2520 movl (FRAME_OFFSET+28)(%esp), LEN # len
2521 #endif
2522 test LEN, LEN
2523 jz .Lecb_dec_ret
2524 mov 480(KEYP), KLEN
2525 add $240, KEYP
2526 cmp $16, LEN
2527 jb .Lecb_dec_ret
2528 cmp $64, LEN
2529 jb .Lecb_dec_loop1
2530 .align 4
2531 .Lecb_dec_loop4:
2532 movups (INP), STATE1
2533 movups 0x10(INP), STATE2
2534 movups 0x20(INP), STATE3
2535 movups 0x30(INP), STATE4
2536 call _aesni_dec4
2537 movups STATE1, (OUTP)
2538 movups STATE2, 0x10(OUTP)
2539 movups STATE3, 0x20(OUTP)
2540 movups STATE4, 0x30(OUTP)
2541 sub $64, LEN
2542 add $64, INP
2543 add $64, OUTP
2544 cmp $64, LEN
2545 jge .Lecb_dec_loop4
2546 cmp $16, LEN
2547 jb .Lecb_dec_ret
2548 .align 4
2549 .Lecb_dec_loop1:
2550 movups (INP), STATE1
2551 call _aesni_dec1
2552 movups STATE1, (OUTP)
2553 sub $16, LEN
2554 add $16, INP
2555 add $16, OUTP
2556 cmp $16, LEN
2557 jge .Lecb_dec_loop1
2558 .Lecb_dec_ret:
2559 #ifndef __x86_64__
2560 popl KLEN
2561 popl KEYP
2562 popl LEN
2563 #endif
2564 FRAME_END
2565 ret
2566 ENDPROC(aesni_ecb_dec)
2567
2568 /*
2569 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2570 * size_t len, u8 *iv)
2571 */
2572 ENTRY(aesni_cbc_enc)
2573 FRAME_BEGIN
2574 #ifndef __x86_64__
2575 pushl IVP
2576 pushl LEN
2577 pushl KEYP
2578 pushl KLEN
2579 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2580 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2581 movl (FRAME_OFFSET+28)(%esp), INP # src
2582 movl (FRAME_OFFSET+32)(%esp), LEN # len
2583 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2584 #endif
2585 cmp $16, LEN
2586 jb .Lcbc_enc_ret
2587 mov 480(KEYP), KLEN
2588 movups (IVP), STATE # load iv as initial state
2589 .align 4
2590 .Lcbc_enc_loop:
2591 movups (INP), IN # load input
2592 pxor IN, STATE
2593 call _aesni_enc1
2594 movups STATE, (OUTP) # store output
2595 sub $16, LEN
2596 add $16, INP
2597 add $16, OUTP
2598 cmp $16, LEN
2599 jge .Lcbc_enc_loop
2600 movups STATE, (IVP)
2601 .Lcbc_enc_ret:
2602 #ifndef __x86_64__
2603 popl KLEN
2604 popl KEYP
2605 popl LEN
2606 popl IVP
2607 #endif
2608 FRAME_END
2609 ret
2610 ENDPROC(aesni_cbc_enc)
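
/*
 * CBC encryption as implemented above, as an illustrative C sketch (types
 * and declarations as in the sketch after aesni_enc; aesni_enc() stands in
 * for _aesni_enc1, and the final chaining value is written back through
 * *iv, matching movups STATE, (IVP)):
 *
 *	static void cbc_enc_sketch(struct crypto_aes_ctx *ctx, u8 *dst,
 *				   const u8 *src, unsigned int len, u8 *iv)
 *	{
 *		u8 state[16];
 *		int i;
 *
 *		if (len < 16)
 *			return;				// .Lcbc_enc_ret
 *		memcpy(state, iv, 16);			// load iv as initial state
 *		for (; len >= 16; src += 16, dst += 16, len -= 16) {
 *			for (i = 0; i < 16; i++)
 *				state[i] ^= src[i];	// pxor IN, STATE
 *			aesni_enc(ctx, state, state);	// _aesni_enc1
 *			memcpy(dst, state, 16);		// store output
 *		}
 *		memcpy(iv, state, 16);			// movups STATE, (IVP)
 *	}
 */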
2611
2612 /*
2613 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2614 * size_t len, u8 *iv)
2615 */
2616 ENTRY(aesni_cbc_dec)
2617 FRAME_BEGIN
2618 #ifndef __x86_64__
2619 pushl IVP
2620 pushl LEN
2621 pushl KEYP
2622 pushl KLEN
2623 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2624 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2625 movl (FRAME_OFFSET+28)(%esp), INP # src
2626 movl (FRAME_OFFSET+32)(%esp), LEN # len
2627 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2628 #endif
2629 cmp $16, LEN
2630 jb .Lcbc_dec_just_ret
2631 mov 480(KEYP), KLEN
2632 add $240, KEYP
2633 movups (IVP), IV
2634 cmp $64, LEN
2635 jb .Lcbc_dec_loop1
2636 .align 4
2637 .Lcbc_dec_loop4:
2638 movups (INP), IN1
2639 movaps IN1, STATE1
2640 movups 0x10(INP), IN2
2641 movaps IN2, STATE2
2642 #ifdef __x86_64__
2643 movups 0x20(INP), IN3
2644 movaps IN3, STATE3
2645 movups 0x30(INP), IN4
2646 movaps IN4, STATE4
2647 #else
2648 movups 0x20(INP), IN1
2649 movaps IN1, STATE3
2650 movups 0x30(INP), IN2
2651 movaps IN2, STATE4
2652 #endif
2653 call _aesni_dec4
2654 pxor IV, STATE1
2655 #ifdef __x86_64__
2656 pxor IN1, STATE2
2657 pxor IN2, STATE3
2658 pxor IN3, STATE4
2659 movaps IN4, IV
2660 #else
2661 pxor IN1, STATE4
2662 movaps IN2, IV
2663 movups (INP), IN1
2664 pxor IN1, STATE2
2665 movups 0x10(INP), IN2
2666 pxor IN2, STATE3
2667 #endif
2668 movups STATE1, (OUTP)
2669 movups STATE2, 0x10(OUTP)
2670 movups STATE3, 0x20(OUTP)
2671 movups STATE4, 0x30(OUTP)
2672 sub $64, LEN
2673 add $64, INP
2674 add $64, OUTP
2675 cmp $64, LEN
2676 jge .Lcbc_dec_loop4
2677 cmp $16, LEN
2678 jb .Lcbc_dec_ret
2679 .align 4
2680 .Lcbc_dec_loop1:
2681 movups (INP), IN
2682 movaps IN, STATE
2683 call _aesni_dec1
2684 pxor IV, STATE
2685 movups STATE, (OUTP)
2686 movaps IN, IV
2687 sub $16, LEN
2688 add $16, INP
2689 add $16, OUTP
2690 cmp $16, LEN
2691 jge .Lcbc_dec_loop1
2692 .Lcbc_dec_ret:
2693 movups IV, (IVP)
2694 .Lcbc_dec_just_ret:
2695 #ifndef __x86_64__
2696 popl KLEN
2697 popl KEYP
2698 popl LEN
2699 popl IVP
2700 #endif
2701 FRAME_END
2702 ret
2703 ENDPROC(aesni_cbc_dec)
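
/*
 * CBC decryption above follows the same pattern, but each ciphertext block
 * must be kept around to serve as the next chaining value; the i386 branch
 * of .Lcbc_dec_loop4 reloads IN1/IN2 from memory, presumably because only
 * eight XMM registers are available there.  Single-block sketch
 * (illustrative; types as in the earlier sketches, aesni_dec() stands in
 * for _aesni_dec1):
 *
 *	static void cbc_dec_sketch(struct crypto_aes_ctx *ctx, u8 *dst,
 *				   const u8 *src, unsigned int len, u8 *iv)
 *	{
 *		u8 prev[16], cur[16];
 *		int i;
 *
 *		if (len < 16)
 *			return;				// .Lcbc_dec_just_ret
 *		memcpy(prev, iv, 16);
 *		for (; len >= 16; src += 16, dst += 16, len -= 16) {
 *			memcpy(cur, src, 16);		// keep for next IV
 *			aesni_dec(ctx, dst, cur);	// _aesni_dec1
 *			for (i = 0; i < 16; i++)
 *				dst[i] ^= prev[i];	// pxor IV, STATE
 *			memcpy(prev, cur, 16);
 *		}
 *		memcpy(iv, prev, 16);			// movups IV, (IVP)
 *	}
 */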
2704
2705 #ifdef __x86_64__
2706 .pushsection .rodata
2707 .align 16
2708 .Lbswap_mask:
2709 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2710 .popsection
2711
2712 /*
2713 * _aesni_inc_init: internal ABI
2714 * setup registers used by _aesni_inc
2715 * input:
2716 * IV
2717 * output:
2718 * CTR: == IV, in little endian
2719 * TCTR_LOW: == lower qword of CTR
2720 * INC: == 1, in little endian
2721 * BSWAP_MASK == endian swapping mask
2722 */
2723 .align 4
2724 _aesni_inc_init:
2725 movaps .Lbswap_mask, BSWAP_MASK
2726 movaps IV, CTR
2727 PSHUFB_XMM BSWAP_MASK CTR
2728 mov $1, TCTR_LOW
2729 MOVQ_R64_XMM TCTR_LOW INC
2730 MOVQ_R64_XMM CTR TCTR_LOW
2731 ret
2732 ENDPROC(_aesni_inc_init)
2733
2734 /*
2735 * _aesni_inc: internal ABI
2736 * Increment IV by 1 (IV is in big endian)
2737 * input:
2738 * IV
2739 * CTR: == IV, in little endian
2740 * TCTR_LOW: == lower qword of CTR
2741 * INC: == 1, in little endian
2742 * BSWAP_MASK == endian swapping mask
2743 * output:
2744 * IV: incremented by 1
2745 * changed:
2746 * CTR: == output IV, in little endian
2747 * TCTR_LOW: == lower qword of CTR
2748 */
2749 .align 4
2750 _aesni_inc:
2751 paddq INC, CTR
2752 add $1, TCTR_LOW
2753 jnc .Linc_low
2754 pslldq $8, INC
2755 paddq INC, CTR
2756 psrldq $8, INC
2757 .Linc_low:
2758 movaps CTR, IV
2759 PSHUFB_XMM BSWAP_MASK IV
2760 ret
2761 ENDPROC(_aesni_inc)
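
/*
 * _aesni_inc adds one to the low qword of the byte-swapped counter and
 * mirrors that add in TCTR_LOW so the carry can be detected with the GPR
 * flags; only on carry is the high qword bumped as well.  The net effect is
 * an increment of a 128-bit big-endian counter, roughly (illustrative C):
 *
 *	static void ctr128_inc_sketch(unsigned char iv[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)	// big endian: last byte first
 *			if (++iv[i])		// stop when a byte doesn't wrap
 *				break;
 *	}
 */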
2762
2763 /*
2764 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2765 * size_t len, u8 *iv)
2766 */
2767 ENTRY(aesni_ctr_enc)
2768 FRAME_BEGIN
2769 cmp $16, LEN
2770 jb .Lctr_enc_just_ret
2771 mov 480(KEYP), KLEN
2772 movups (IVP), IV
2773 call _aesni_inc_init
2774 cmp $64, LEN
2775 jb .Lctr_enc_loop1
2776 .align 4
2777 .Lctr_enc_loop4:
2778 movaps IV, STATE1
2779 call _aesni_inc
2780 movups (INP), IN1
2781 movaps IV, STATE2
2782 call _aesni_inc
2783 movups 0x10(INP), IN2
2784 movaps IV, STATE3
2785 call _aesni_inc
2786 movups 0x20(INP), IN3
2787 movaps IV, STATE4
2788 call _aesni_inc
2789 movups 0x30(INP), IN4
2790 call _aesni_enc4
2791 pxor IN1, STATE1
2792 movups STATE1, (OUTP)
2793 pxor IN2, STATE2
2794 movups STATE2, 0x10(OUTP)
2795 pxor IN3, STATE3
2796 movups STATE3, 0x20(OUTP)
2797 pxor IN4, STATE4
2798 movups STATE4, 0x30(OUTP)
2799 sub $64, LEN
2800 add $64, INP
2801 add $64, OUTP
2802 cmp $64, LEN
2803 jge .Lctr_enc_loop4
2804 cmp $16, LEN
2805 jb .Lctr_enc_ret
2806 .align 4
2807 .Lctr_enc_loop1:
2808 movaps IV, STATE
2809 call _aesni_inc
2810 movups (INP), IN
2811 call _aesni_enc1
2812 pxor IN, STATE
2813 movups STATE, (OUTP)
2814 sub $16, LEN
2815 add $16, INP
2816 add $16, OUTP
2817 cmp $16, LEN
2818 jge .Lctr_enc_loop1
2819 .Lctr_enc_ret:
2820 movups IV, (IVP)
2821 .Lctr_enc_just_ret:
2822 FRAME_END
2823 ret
2824 ENDPROC(aesni_ctr_enc)
2825
2826 /*
2827 * _aesni_gf128mul_x_ble: internal ABI
2828 * Multiply in GF(2^128) for XTS IVs
2829 * input:
2830 * IV: current IV
2831 * GF128MUL_MASK == mask with 0x87 and 0x01
2832 * output:
2833 * IV: next IV
2834 * changed:
2835 * CTR: == temporary value
2836 */
2837 #define _aesni_gf128mul_x_ble() \
2838 pshufd $0x13, IV, CTR; \
2839 paddq IV, IV; \
2840 psrad $31, CTR; \
2841 pand GF128MUL_MASK, CTR; \
2842 pxor CTR, IV;
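
/*
 * Scalar sketch of what the macro above computes: the XTS tweak is treated
 * as a 128-bit little-endian value and multiplied by x in GF(2^128) with
 * the reduction polynomial x^128 + x^7 + x^2 + x + 1 (hence the 0x87):
 *
 *	static void gf128mul_x_ble_sketch(unsigned long long t[2])	// t[0] = low qword
 *	{
 *		unsigned long long carry_lo = t[0] >> 63;	// into the high qword
 *		unsigned long long carry_hi = t[1] >> 63;	// out of bit 127
 *
 *		t[1] = (t[1] << 1) | carry_lo;
 *		t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);	// reduce
 *	}
 */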
2843
2844 /*
2845 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2846 * bool enc, u8 *iv)
2847 */
2848 ENTRY(aesni_xts_crypt8)
2849 FRAME_BEGIN
2850 cmpb $0, %cl
2851 movl $0, %ecx
2852 movl $240, %r10d
2853 leaq _aesni_enc4, %r11
2854 leaq _aesni_dec4, %rax
2855 cmovel %r10d, %ecx
2856 cmoveq %rax, %r11
2857
2858 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2859 movups (IVP), IV
2860
2861 mov 480(KEYP), KLEN
2862 addq %rcx, KEYP
2863
2864 movdqa IV, STATE1
2865 movdqu 0x00(INP), INC
2866 pxor INC, STATE1
2867 movdqu IV, 0x00(OUTP)
2868
2869 _aesni_gf128mul_x_ble()
2870 movdqa IV, STATE2
2871 movdqu 0x10(INP), INC
2872 pxor INC, STATE2
2873 movdqu IV, 0x10(OUTP)
2874
2875 _aesni_gf128mul_x_ble()
2876 movdqa IV, STATE3
2877 movdqu 0x20(INP), INC
2878 pxor INC, STATE3
2879 movdqu IV, 0x20(OUTP)
2880
2881 _aesni_gf128mul_x_ble()
2882 movdqa IV, STATE4
2883 movdqu 0x30(INP), INC
2884 pxor INC, STATE4
2885 movdqu IV, 0x30(OUTP)
2886
2887 call *%r11
2888
2889 movdqu 0x00(OUTP), INC
2890 pxor INC, STATE1
2891 movdqu STATE1, 0x00(OUTP)
2892
2893 _aesni_gf128mul_x_ble()
2894 movdqa IV, STATE1
2895 movdqu 0x40(INP), INC
2896 pxor INC, STATE1
2897 movdqu IV, 0x40(OUTP)
2898
2899 movdqu 0x10(OUTP), INC
2900 pxor INC, STATE2
2901 movdqu STATE2, 0x10(OUTP)
2902
2903 _aesni_gf128mul_x_ble()
2904 movdqa IV, STATE2
2905 movdqu 0x50(INP), INC
2906 pxor INC, STATE2
2907 movdqu IV, 0x50(OUTP)
2908
2909 movdqu 0x20(OUTP), INC
2910 pxor INC, STATE3
2911 movdqu STATE3, 0x20(OUTP)
2912
2913 _aesni_gf128mul_x_ble()
2914 movdqa IV, STATE3
2915 movdqu 0x60(INP), INC
2916 pxor INC, STATE3
2917 movdqu IV, 0x60(OUTP)
2918
2919 movdqu 0x30(OUTP), INC
2920 pxor INC, STATE4
2921 movdqu STATE4, 0x30(OUTP)
2922
2923 _aesni_gf128mul_x_ble()
2924 movdqa IV, STATE4
2925 movdqu 0x70(INP), INC
2926 pxor INC, STATE4
2927 movdqu IV, 0x70(OUTP)
2928
2929 _aesni_gf128mul_x_ble()
2930 movups IV, (IVP)
2931
2932 call *%r11
2933
2934 movdqu 0x40(OUTP), INC
2935 pxor INC, STATE1
2936 movdqu STATE1, 0x40(OUTP)
2937
2938 movdqu 0x50(OUTP), INC
2939 pxor INC, STATE2
2940 movdqu STATE2, 0x50(OUTP)
2941
2942 movdqu 0x60(OUTP), INC
2943 pxor INC, STATE3
2944 movdqu STATE3, 0x60(OUTP)
2945
2946 movdqu 0x70(OUTP), INC
2947 pxor INC, STATE4
2948 movdqu STATE4, 0x70(OUTP)
2949
2950 FRAME_END
2951 ret
2952 ENDPROC(aesni_xts_crypt8)
2953
2954 #endif