1 /*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32 #include <linux/linkage.h>
33 #include <asm/inst.h>
34 #include <asm/frame.h>
35 #include <asm/nospec-branch.h>
36
37 /*
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register. This can be done for either FP or integer values, for FP use
40 * movaps (move aligned packed single) or integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
44 */
45 #define MOVADQ movaps
46 #define MOVUDQ movups
47
48 #ifdef __x86_64__
49
50 # constants in mergeable sections, linker can reorder and merge
51 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
52 .align 16
53 .Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
55 .section .rodata.cst16.POLY, "aM", @progbits, 16
56 .align 16
57 POLY: .octa 0xC2000000000000000000000000000001
58 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
59 .align 16
60 TWOONE: .octa 0x00000001000000000000000000000001
61
62 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
63 .align 16
64 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65 .section .rodata.cst16.MASK1, "aM", @progbits, 16
66 .align 16
67 MASK1: .octa 0x0000000000000000ffffffffffffffff
68 .section .rodata.cst16.MASK2, "aM", @progbits, 16
69 .align 16
70 MASK2: .octa 0xffffffffffffffff0000000000000000
71 .section .rodata.cst16.ONE, "aM", @progbits, 16
72 .align 16
73 ONE: .octa 0x00000000000000000000000000000001
74 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
75 .align 16
76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77 .section .rodata.cst16.dec, "aM", @progbits, 16
78 .align 16
79 dec: .octa 0x1
80 .section .rodata.cst16.enc, "aM", @progbits, 16
81 .align 16
82 enc: .octa 0x2
83
84 # order of these constants should not change.
85 # more specifically, ALL_F should follow SHIFT_MASK,
86 # and zero should follow ALL_F
87 .section .rodata, "a", @progbits
88 .align 16
89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
91 .octa 0x00000000000000000000000000000000
92
93 .text
94
95
96 #define STACK_OFFSET 8*3
97 #define HashKey 16*0 // store HashKey <<1 mod poly here
98 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
99 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
100 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
101 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
102 // bits of HashKey <<1 mod poly here
103 //(for Karatsuba purposes)
104 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
105 // bits of HashKey^2 <<1 mod poly here
106 // (for Karatsuba purposes)
107 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^3 <<1 mod poly here
109 // (for Karatsuba purposes)
110 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^4 <<1 mod poly here
112 // (for Karatsuba purposes)
113 #define VARIABLE_OFFSET 16*8
114
115 #define arg1 rdi
116 #define arg2 rsi
117 #define arg3 rdx
118 #define arg4 rcx
119 #define arg5 r8
120 #define arg6 r9
121 #define arg7 STACK_OFFSET+8(%r14)
122 #define arg8 STACK_OFFSET+16(%r14)
123 #define arg9 STACK_OFFSET+24(%r14)
124 #define arg10 STACK_OFFSET+32(%r14)
125 #define keysize 2*15*16(%arg1)
126 #endif
127
128
129 #define STATE1 %xmm0
130 #define STATE2 %xmm4
131 #define STATE3 %xmm5
132 #define STATE4 %xmm6
133 #define STATE STATE1
134 #define IN1 %xmm1
135 #define IN2 %xmm7
136 #define IN3 %xmm8
137 #define IN4 %xmm9
138 #define IN IN1
139 #define KEY %xmm2
140 #define IV %xmm3
141
142 #define BSWAP_MASK %xmm10
143 #define CTR %xmm11
144 #define INC %xmm12
145
146 #define GF128MUL_MASK %xmm10
147
148 #ifdef __x86_64__
149 #define AREG %rax
150 #define KEYP %rdi
151 #define OUTP %rsi
152 #define UKEYP OUTP
153 #define INP %rdx
154 #define LEN %rcx
155 #define IVP %r8
156 #define KLEN %r9d
157 #define T1 %r10
158 #define TKEYP T1
159 #define T2 %r11
160 #define TCTR_LOW T2
161 #else
162 #define AREG %eax
163 #define KEYP %edi
164 #define OUTP AREG
165 #define UKEYP OUTP
166 #define INP %edx
167 #define LEN %esi
168 #define IVP %ebp
169 #define KLEN %ebx
170 #define T1 %ecx
171 #define TKEYP T1
172 #endif
173
174 .macro FUNC_SAVE
175 push %r12
176 push %r13
177 push %r14
178 mov %rsp, %r14
179 #
180 # states of %xmm registers %xmm6:%xmm15 not saved
181 # all %xmm registers are clobbered
182 #
183 sub $VARIABLE_OFFSET, %rsp
184 and $~63, %rsp
185 .endm
186
187
188 .macro FUNC_RESTORE
189 mov %r14, %rsp
190 pop %r14
191 pop %r13
192 pop %r12
193 .endm
194
195
196 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
197 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
198 .macro GCM_INIT
199 mov %arg6, %r12
200 movdqu (%r12), %xmm13
201 movdqa SHUF_MASK(%rip), %xmm2
202 PSHUFB_XMM %xmm2, %xmm13
203
204 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
205
206 movdqa %xmm13, %xmm2
207 psllq $1, %xmm13
208 psrlq $63, %xmm2
209 movdqa %xmm2, %xmm1
210 pslldq $8, %xmm2
211 psrldq $8, %xmm1
212 por %xmm2, %xmm13
213
214 # reduce HashKey<<1
215
216 pshufd $0x24, %xmm1, %xmm2
217 pcmpeqd TWOONE(%rip), %xmm2
218 pand POLY(%rip), %xmm2
219 pxor %xmm2, %xmm13
220 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly); %xmm13 keeps a copy
221 mov %arg4, %r13 # %r13 = plaintext length in bytes (arg4)
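# round the length down to a whole number of 16-byte blocks; the copy kept in
# %r12 is later reduced mod 64 by the callers to choose how many blocks (0-3)
# are handled before the four-block parallel loop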
222 and $-16, %r13
223 mov %r13, %r12
224 .endm
225
226 #ifdef __x86_64__
227 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
228 *
229 *
230 * Input: A and B (128-bits each, bit-reflected)
231 * Output: C = A*B*x mod poly, (i.e. >>1 )
232 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
233 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
234 *
235 */
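/*
 * A minimal sketch of the Karatsuba split used below: writing the 128-bit
 * operands as A = a1*x^64 + a0 and B = b1*x^64 + b0 (additions are XORs in
 * GF(2)), the carry-less product is
 *
 *      A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
 *          = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
 *
 * so only three PCLMULQDQ instructions are needed: a1*b1, a0*b0 and
 * (a1+a0)*(b1+b0).  The pshufd $78 below swaps the 64-bit halves so the
 * third product can be formed, and the 256-bit result is then reduced
 * modulo the GHASH polynomial in the two shift/XOR phases that follow.
 */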
236 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
237 movdqa \GH, \TMP1
238 pshufd $78, \GH, \TMP2
239 pshufd $78, \HK, \TMP3
240 pxor \GH, \TMP2 # TMP2 = a1+a0
241 pxor \HK, \TMP3 # TMP3 = b1+b0
242 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
243 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
244 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
245 pxor \GH, \TMP2
246 pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle Karatsuba term)
247 movdqa \TMP2, \TMP3
248 pslldq $8, \TMP3 # left shift TMP3 2 DWs
249 psrldq $8, \TMP2 # right shift TMP2 2 DWs
250 pxor \TMP3, \GH
251 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
252
253 # first phase of the reduction
254
255 movdqa \GH, \TMP2
256 movdqa \GH, \TMP3
257 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
258 # in order to perform
259 # three independent shifts
260 pslld $31, \TMP2 # packed left shift <<31
261 pslld $30, \TMP3 # packed left shift <<30
262 pslld $25, \TMP4 # packed left shift <<25
263 pxor \TMP3, \TMP2 # xor the shifted versions
264 pxor \TMP4, \TMP2
265 movdqa \TMP2, \TMP5
266 psrldq $4, \TMP5 # right shift TMP5 1 DW
267 pslldq $12, \TMP2 # left shift TMP2 3 DWs
268 pxor \TMP2, \GH
269
270 # second phase of the reduction
271
272 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
273 # in order to perform
274 # three independent shifts
275 movdqa \GH,\TMP3
276 movdqa \GH,\TMP4
277 psrld $1,\TMP2 # packed right shift >>1
278 psrld $2,\TMP3 # packed right shift >>2
279 psrld $7,\TMP4 # packed right shift >>7
280 pxor \TMP3,\TMP2 # xor the shifted versions
281 pxor \TMP4,\TMP2
282 pxor \TMP5, \TMP2
283 pxor \TMP2, \GH
284 pxor \TMP1, \GH # result is in GH
285 .endm
286
287 # Reads DLEN bytes starting at DPTR and stores in XMMDst
288 # where 0 < DLEN < 16
289 # Clobbers %rax, DLEN and XMM1
290 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
291 cmp $8, \DLEN
292 jl _read_lt8_\@
293 mov (\DPTR), %rax
294 MOVQ_R64_XMM %rax, \XMMDst
295 sub $8, \DLEN
296 jz _done_read_partial_block_\@
297 xor %eax, %eax
298 _read_next_byte_\@:
299 shl $8, %rax
300 mov 7(\DPTR, \DLEN, 1), %al
301 dec \DLEN
302 jnz _read_next_byte_\@
303 MOVQ_R64_XMM %rax, \XMM1
304 pslldq $8, \XMM1
305 por \XMM1, \XMMDst
306 jmp _done_read_partial_block_\@
307 _read_lt8_\@:
308 xor %eax, %eax
309 _read_next_byte_lt8_\@:
310 shl $8, %rax
311 mov -1(\DPTR, \DLEN, 1), %al
312 dec \DLEN
313 jnz _read_next_byte_lt8_\@
314 MOVQ_R64_XMM %rax, \XMMDst
315 _done_read_partial_block_\@:
316 .endm
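/*
 * For reference, a rough C equivalent of READ_PARTIAL_BLOCK (an illustrative
 * sketch only, not part of the kernel sources): it behaves like copying DLEN
 * bytes into the low end of a zeroed 16-byte value, without ever touching
 * memory past DPTR+DLEN.
 *
 *	static void read_partial_block(const u8 *dptr, unsigned int dlen,
 *				       u8 dst[16])
 *	{
 *		memset(dst, 0, 16);
 *		memcpy(dst, dptr, dlen);	// 0 < dlen < 16
 *	}
 */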
317
318 /*
319 * if a = number of total plaintext bytes
320 * b = floor(a/16)
321 * num_initial_blocks = b mod 4
322 * encrypt the initial num_initial_blocks blocks and apply ghash on
323 * the ciphertext
324 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
325 * are clobbered
326 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
327 */
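/*
 * Example: for a = 100 plaintext bytes, b = floor(100/16) = 6 full blocks,
 * so num_initial_blocks = 6 mod 4 = 2; the remaining 4 full blocks go
 * through the four-block parallel loop and the last 4 bytes are handled as
 * the partial-block tail.
 */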
328
329
330 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
331 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
332 MOVADQ SHUF_MASK(%rip), %xmm14
333 mov arg7, %r10 # %r10 = AAD
334 mov arg8, %r11 # %r11 = aadLen
335 pxor %xmm\i, %xmm\i
336 pxor \XMM2, \XMM2
337
338 cmp $16, %r11
339 jl _get_AAD_rest\@
340 _get_AAD_blocks\@:
341 movdqu (%r10), %xmm\i
342 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
343 pxor %xmm\i, \XMM2
344 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
345 add $16, %r10
346 sub $16, %r11
347 cmp $16, %r11
348 jge _get_AAD_blocks\@
349
350 movdqu \XMM2, %xmm\i
351
352 /* read the last <16B of AAD */
353 _get_AAD_rest\@:
354 cmp $0, %r11
355 je _get_AAD_done\@
356
357 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
358 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
359 pxor \XMM2, %xmm\i
360 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
361
362 _get_AAD_done\@:
363 xor %r11, %r11 # initialise the data pointer offset as zero
364 # start AES for num_initial_blocks blocks
365
366 mov %arg5, %rax # %rax = *Y0
367 movdqu (%rax), \XMM0 # XMM0 = Y0
368 PSHUFB_XMM %xmm14, \XMM0
369
370 .if (\i == 5) || (\i == 6) || (\i == 7)
371
372 MOVADQ ONE(%RIP),\TMP1
373 MOVADQ 0(%arg1),\TMP2
374 .irpc index, \i_seq
375 paddd \TMP1, \XMM0 # INCR Y0
376 .ifc \operation, dec
377 movdqa \XMM0, %xmm\index
378 .else
379 MOVADQ \XMM0, %xmm\index
380 .endif
381 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
382 pxor \TMP2, %xmm\index
383 .endr
384 lea 0x10(%arg1),%r10
385 mov keysize,%eax
386 shr $2,%eax # 128->4, 192->6, 256->8
387 add $5,%eax # 128->9, 192->11, 256->13
388
389 aes_loop_initial_\@:
390 MOVADQ (%r10),\TMP1
391 .irpc index, \i_seq
392 AESENC \TMP1, %xmm\index
393 .endr
394 add $16,%r10
395 sub $1,%eax
396 jnz aes_loop_initial_\@
397
398 MOVADQ (%r10), \TMP1
399 .irpc index, \i_seq
400 AESENCLAST \TMP1, %xmm\index # Last Round
401 .endr
402 .irpc index, \i_seq
403 movdqu (%arg3 , %r11, 1), \TMP1
404 pxor \TMP1, %xmm\index
405 movdqu %xmm\index, (%arg2 , %r11, 1)
406 # write back plaintext/ciphertext for num_initial_blocks
407 add $16, %r11
408
409 .ifc \operation, dec
410 movdqa \TMP1, %xmm\index
411 .endif
412 PSHUFB_XMM %xmm14, %xmm\index
413
414 # prepare plaintext/ciphertext for GHASH computation
415 .endr
416 .endif
417
418 # apply GHASH on num_initial_blocks blocks
419
420 .if \i == 5
421 pxor %xmm5, %xmm6
422 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
423 pxor %xmm6, %xmm7
424 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
425 pxor %xmm7, %xmm8
426 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
427 .elseif \i == 6
428 pxor %xmm6, %xmm7
429 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
430 pxor %xmm7, %xmm8
431 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
432 .elseif \i == 7
433 pxor %xmm7, %xmm8
434 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
435 .endif
436 cmp $64, %r13
437 jl _initial_blocks_done\@
438 # fewer than 64 bytes remain: no need for the precomputed HashKey powers
439 /*
440 *
441 * Precomputations for HashKey parallel with encryption of first 4 blocks.
442 * HashKey_i_k holds the XOR of the high and low 64-bit halves of HashKey_i
443 */
444 MOVADQ ONE(%RIP),\TMP1
445 paddd \TMP1, \XMM0 # INCR Y0
446 MOVADQ \XMM0, \XMM1
447 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
448
449 paddd \TMP1, \XMM0 # INCR Y0
450 MOVADQ \XMM0, \XMM2
451 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
452
453 paddd \TMP1, \XMM0 # INCR Y0
454 MOVADQ \XMM0, \XMM3
455 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
456
457 paddd \TMP1, \XMM0 # INCR Y0
458 MOVADQ \XMM0, \XMM4
459 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
460
461 MOVADQ 0(%arg1),\TMP1
462 pxor \TMP1, \XMM1
463 pxor \TMP1, \XMM2
464 pxor \TMP1, \XMM3
465 pxor \TMP1, \XMM4
466 movdqa \TMP3, \TMP5
467 pshufd $78, \TMP3, \TMP1
468 pxor \TMP3, \TMP1
469 movdqa \TMP1, HashKey_k(%rsp)
470 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
471 # TMP5 = HashKey^2<<1 (mod poly)
472 movdqa \TMP5, HashKey_2(%rsp)
473 # HashKey_2 = HashKey^2<<1 (mod poly)
474 pshufd $78, \TMP5, \TMP1
475 pxor \TMP5, \TMP1
476 movdqa \TMP1, HashKey_2_k(%rsp)
477 .irpc index, 1234 # do 4 rounds
478 movaps 0x10*\index(%arg1), \TMP1
479 AESENC \TMP1, \XMM1
480 AESENC \TMP1, \XMM2
481 AESENC \TMP1, \XMM3
482 AESENC \TMP1, \XMM4
483 .endr
484 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
485 # TMP5 = HashKey^3<<1 (mod poly)
486 movdqa \TMP5, HashKey_3(%rsp)
487 pshufd $78, \TMP5, \TMP1
488 pxor \TMP5, \TMP1
489 movdqa \TMP1, HashKey_3_k(%rsp)
490 .irpc index, 56789 # do next 5 rounds
491 movaps 0x10*\index(%arg1), \TMP1
492 AESENC \TMP1, \XMM1
493 AESENC \TMP1, \XMM2
494 AESENC \TMP1, \XMM3
495 AESENC \TMP1, \XMM4
496 .endr
497 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
498 # TMP5 = HashKey^4<<1 (mod poly)
499 movdqa \TMP5, HashKey_4(%rsp)
500 pshufd $78, \TMP5, \TMP1
501 pxor \TMP5, \TMP1
502 movdqa \TMP1, HashKey_4_k(%rsp)
503 lea 0xa0(%arg1),%r10
504 mov keysize,%eax
505 shr $2,%eax # 128->4, 192->6, 256->8
506 sub $4,%eax # 128->0, 192->2, 256->4
507 jz aes_loop_pre_done\@
508
509 aes_loop_pre_\@:
510 MOVADQ (%r10),\TMP2
511 .irpc index, 1234
512 AESENC \TMP2, %xmm\index
513 .endr
514 add $16,%r10
515 sub $1,%eax
516 jnz aes_loop_pre_\@
517
518 aes_loop_pre_done\@:
519 MOVADQ (%r10), \TMP2
520 AESENCLAST \TMP2, \XMM1
521 AESENCLAST \TMP2, \XMM2
522 AESENCLAST \TMP2, \XMM3
523 AESENCLAST \TMP2, \XMM4
524 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
525 pxor \TMP1, \XMM1
526 .ifc \operation, dec
527 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
528 movdqa \TMP1, \XMM1
529 .endif
530 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
531 pxor \TMP1, \XMM2
532 .ifc \operation, dec
533 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
534 movdqa \TMP1, \XMM2
535 .endif
536 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
537 pxor \TMP1, \XMM3
538 .ifc \operation, dec
539 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
540 movdqa \TMP1, \XMM3
541 .endif
542 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
543 pxor \TMP1, \XMM4
544 .ifc \operation, dec
545 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
546 movdqa \TMP1, \XMM4
547 .else
548 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
549 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
550 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
551 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
552 .endif
553
554 add $64, %r11
555 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
556 pxor \XMMDst, \XMM1
557 # combine GHASHed value with the corresponding ciphertext
558 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
559 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
560 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
561
562 _initial_blocks_done\@:
563
564 .endm
565
566 /*
567 * encrypt 4 blocks at a time
568 * ghash the 4 previously encrypted ciphertext blocks
569 * arg1, %arg2, %arg3 are used as pointers only, not modified
570 * %r11 is the data offset value
571 */
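/*
 * Note: the Karatsuba multiplications for the four previous blocks are
 * interleaved with the AES rounds for the next four counter blocks, so the
 * PCLMULQDQ and AESENC work can overlap instead of running back to back.
 */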
572 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
573 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
574
575 movdqa \XMM1, \XMM5
576 movdqa \XMM2, \XMM6
577 movdqa \XMM3, \XMM7
578 movdqa \XMM4, \XMM8
579
580 movdqa SHUF_MASK(%rip), %xmm15
581 # multiply XMM5 by HashKey^4 (loaded into TMP5 below) using Karatsuba
582
583 movdqa \XMM5, \TMP4
584 pshufd $78, \XMM5, \TMP6
585 pxor \XMM5, \TMP6
586 paddd ONE(%rip), \XMM0 # INCR CNT
587 movdqa HashKey_4(%rsp), \TMP5
588 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
589 movdqa \XMM0, \XMM1
590 paddd ONE(%rip), \XMM0 # INCR CNT
591 movdqa \XMM0, \XMM2
592 paddd ONE(%rip), \XMM0 # INCR CNT
593 movdqa \XMM0, \XMM3
594 paddd ONE(%rip), \XMM0 # INCR CNT
595 movdqa \XMM0, \XMM4
596 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
597 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
598 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
599 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
600 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
601
602 pxor (%arg1), \XMM1
603 pxor (%arg1), \XMM2
604 pxor (%arg1), \XMM3
605 pxor (%arg1), \XMM4
606 movdqa HashKey_4_k(%rsp), \TMP5
607 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
608 movaps 0x10(%arg1), \TMP1
609 AESENC \TMP1, \XMM1 # Round 1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613 movaps 0x20(%arg1), \TMP1
614 AESENC \TMP1, \XMM1 # Round 2
615 AESENC \TMP1, \XMM2
616 AESENC \TMP1, \XMM3
617 AESENC \TMP1, \XMM4
618 movdqa \XMM6, \TMP1
619 pshufd $78, \XMM6, \TMP2
620 pxor \XMM6, \TMP2
621 movdqa HashKey_3(%rsp), \TMP5
622 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
623 movaps 0x30(%arg1), \TMP3
624 AESENC \TMP3, \XMM1 # Round 3
625 AESENC \TMP3, \XMM2
626 AESENC \TMP3, \XMM3
627 AESENC \TMP3, \XMM4
628 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
629 movaps 0x40(%arg1), \TMP3
630 AESENC \TMP3, \XMM1 # Round 4
631 AESENC \TMP3, \XMM2
632 AESENC \TMP3, \XMM3
633 AESENC \TMP3, \XMM4
634 movdqa HashKey_3_k(%rsp), \TMP5
635 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
636 movaps 0x50(%arg1), \TMP3
637 AESENC \TMP3, \XMM1 # Round 5
638 AESENC \TMP3, \XMM2
639 AESENC \TMP3, \XMM3
640 AESENC \TMP3, \XMM4
641 pxor \TMP1, \TMP4
642 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
643 pxor \XMM6, \XMM5
644 pxor \TMP2, \TMP6
645 movdqa \XMM7, \TMP1
646 pshufd $78, \XMM7, \TMP2
647 pxor \XMM7, \TMP2
648 movdqa HashKey_2(%rsp ), \TMP5
649
650 # Multiply XMM7 by HashKey^2 (held in TMP5) using Karatsuba
651
652 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
653 movaps 0x60(%arg1), \TMP3
654 AESENC \TMP3, \XMM1 # Round 6
655 AESENC \TMP3, \XMM2
656 AESENC \TMP3, \XMM3
657 AESENC \TMP3, \XMM4
658 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
659 movaps 0x70(%arg1), \TMP3
660 AESENC \TMP3, \XMM1 # Round 7
661 AESENC \TMP3, \XMM2
662 AESENC \TMP3, \XMM3
663 AESENC \TMP3, \XMM4
664 movdqa HashKey_2_k(%rsp), \TMP5
665 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
666 movaps 0x80(%arg1), \TMP3
667 AESENC \TMP3, \XMM1 # Round 8
668 AESENC \TMP3, \XMM2
669 AESENC \TMP3, \XMM3
670 AESENC \TMP3, \XMM4
671 pxor \TMP1, \TMP4
672 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
673 pxor \XMM7, \XMM5
674 pxor \TMP2, \TMP6
675
676 # Multiply XMM8 * HashKey
677 # XMM8 and TMP5 hold the values for the two operands
678
679 movdqa \XMM8, \TMP1
680 pshufd $78, \XMM8, \TMP2
681 pxor \XMM8, \TMP2
682 movdqa HashKey(%rsp), \TMP5
683 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
684 movaps 0x90(%arg1), \TMP3
685 AESENC \TMP3, \XMM1 # Round 9
686 AESENC \TMP3, \XMM2
687 AESENC \TMP3, \XMM3
688 AESENC \TMP3, \XMM4
689 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
690 lea 0xa0(%arg1),%r10
691 mov keysize,%eax
692 shr $2,%eax # 128->4, 192->6, 256->8
693 sub $4,%eax # 128->0, 192->2, 256->4
694 jz aes_loop_par_enc_done
695
696 aes_loop_par_enc:
697 MOVADQ (%r10),\TMP3
698 .irpc index, 1234
699 AESENC \TMP3, %xmm\index
700 .endr
701 add $16,%r10
702 sub $1,%eax
703 jnz aes_loop_par_enc
704
705 aes_loop_par_enc_done:
706 MOVADQ (%r10), \TMP3
707 AESENCLAST \TMP3, \XMM1 # Round 10
708 AESENCLAST \TMP3, \XMM2
709 AESENCLAST \TMP3, \XMM3
710 AESENCLAST \TMP3, \XMM4
711 movdqa HashKey_k(%rsp), \TMP5
712 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
713 movdqu (%arg3,%r11,1), \TMP3
714 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
715 movdqu 16(%arg3,%r11,1), \TMP3
716 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
717 movdqu 32(%arg3,%r11,1), \TMP3
718 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
719 movdqu 48(%arg3,%r11,1), \TMP3
720 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
721 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
722 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
723 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
724 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
725 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
726 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
727 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
728 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
729
730 pxor \TMP4, \TMP1
731 pxor \XMM8, \XMM5
732 pxor \TMP6, \TMP2
733 pxor \TMP1, \TMP2
734 pxor \XMM5, \TMP2
735 movdqa \TMP2, \TMP3
736 pslldq $8, \TMP3 # left shift TMP3 2 DWs
737 psrldq $8, \TMP2 # right shift TMP2 2 DWs
738 pxor \TMP3, \XMM5
739 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
740
741 # first phase of reduction
742
743 movdqa \XMM5, \TMP2
744 movdqa \XMM5, \TMP3
745 movdqa \XMM5, \TMP4
746 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
747 pslld $31, \TMP2 # packed left shift << 31
748 pslld $30, \TMP3 # packed left shift << 30
749 pslld $25, \TMP4 # packed left shift << 25
750 pxor \TMP3, \TMP2 # xor the shifted versions
751 pxor \TMP4, \TMP2
752 movdqa \TMP2, \TMP5
753 psrldq $4, \TMP5 # right shift T5 1 DW
754 pslldq $12, \TMP2 # left shift T2 3 DWs
755 pxor \TMP2, \XMM5
756
757 # second phase of reduction
758
759 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
760 movdqa \XMM5,\TMP3
761 movdqa \XMM5,\TMP4
762 psrld $1, \TMP2 # packed right shift >>1
763 psrld $2, \TMP3 # packed right shift >>2
764 psrld $7, \TMP4 # packed right shift >>7
765 pxor \TMP3,\TMP2 # xor the shifted versions
766 pxor \TMP4,\TMP2
767 pxor \TMP5, \TMP2
768 pxor \TMP2, \XMM5
769 pxor \TMP1, \XMM5 # result is in XMM5
770
771 pxor \XMM5, \XMM1
772 .endm
773
774 /*
775 * decrypt 4 blocks at a time
776 * ghash the 4 previously decrypted ciphertext blocks
777 * arg1, %arg2, %arg3 are used as pointers only, not modified
778 * %r11 is the data offset value
779 */
780 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
781 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
782
783 movdqa \XMM1, \XMM5
784 movdqa \XMM2, \XMM6
785 movdqa \XMM3, \XMM7
786 movdqa \XMM4, \XMM8
787
788 movdqa SHUF_MASK(%rip), %xmm15
789 # multiply XMM5 by HashKey^4 (loaded into TMP5 below) using Karatsuba
790
791 movdqa \XMM5, \TMP4
792 pshufd $78, \XMM5, \TMP6
793 pxor \XMM5, \TMP6
794 paddd ONE(%rip), \XMM0 # INCR CNT
795 movdqa HashKey_4(%rsp), \TMP5
796 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
797 movdqa \XMM0, \XMM1
798 paddd ONE(%rip), \XMM0 # INCR CNT
799 movdqa \XMM0, \XMM2
800 paddd ONE(%rip), \XMM0 # INCR CNT
801 movdqa \XMM0, \XMM3
802 paddd ONE(%rip), \XMM0 # INCR CNT
803 movdqa \XMM0, \XMM4
804 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
805 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
806 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
807 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
808 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
809
810 pxor (%arg1), \XMM1
811 pxor (%arg1), \XMM2
812 pxor (%arg1), \XMM3
813 pxor (%arg1), \XMM4
814 movdqa HashKey_4_k(%rsp), \TMP5
815 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
816 movaps 0x10(%arg1), \TMP1
817 AESENC \TMP1, \XMM1 # Round 1
818 AESENC \TMP1, \XMM2
819 AESENC \TMP1, \XMM3
820 AESENC \TMP1, \XMM4
821 movaps 0x20(%arg1), \TMP1
822 AESENC \TMP1, \XMM1 # Round 2
823 AESENC \TMP1, \XMM2
824 AESENC \TMP1, \XMM3
825 AESENC \TMP1, \XMM4
826 movdqa \XMM6, \TMP1
827 pshufd $78, \XMM6, \TMP2
828 pxor \XMM6, \TMP2
829 movdqa HashKey_3(%rsp), \TMP5
830 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
831 movaps 0x30(%arg1), \TMP3
832 AESENC \TMP3, \XMM1 # Round 3
833 AESENC \TMP3, \XMM2
834 AESENC \TMP3, \XMM3
835 AESENC \TMP3, \XMM4
836 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
837 movaps 0x40(%arg1), \TMP3
838 AESENC \TMP3, \XMM1 # Round 4
839 AESENC \TMP3, \XMM2
840 AESENC \TMP3, \XMM3
841 AESENC \TMP3, \XMM4
842 movdqa HashKey_3_k(%rsp), \TMP5
843 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
844 movaps 0x50(%arg1), \TMP3
845 AESENC \TMP3, \XMM1 # Round 5
846 AESENC \TMP3, \XMM2
847 AESENC \TMP3, \XMM3
848 AESENC \TMP3, \XMM4
849 pxor \TMP1, \TMP4
850 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
851 pxor \XMM6, \XMM5
852 pxor \TMP2, \TMP6
853 movdqa \XMM7, \TMP1
854 pshufd $78, \XMM7, \TMP2
855 pxor \XMM7, \TMP2
856 movdqa HashKey_2(%rsp ), \TMP5
857
858 # Multiply XMM7 by HashKey^2 (held in TMP5) using Karatsuba
859
860 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
861 movaps 0x60(%arg1), \TMP3
862 AESENC \TMP3, \XMM1 # Round 6
863 AESENC \TMP3, \XMM2
864 AESENC \TMP3, \XMM3
865 AESENC \TMP3, \XMM4
866 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
867 movaps 0x70(%arg1), \TMP3
868 AESENC \TMP3, \XMM1 # Round 7
869 AESENC \TMP3, \XMM2
870 AESENC \TMP3, \XMM3
871 AESENC \TMP3, \XMM4
872 movdqa HashKey_2_k(%rsp), \TMP5
873 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
874 movaps 0x80(%arg1), \TMP3
875 AESENC \TMP3, \XMM1 # Round 8
876 AESENC \TMP3, \XMM2
877 AESENC \TMP3, \XMM3
878 AESENC \TMP3, \XMM4
879 pxor \TMP1, \TMP4
880 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
881 pxor \XMM7, \XMM5
882 pxor \TMP2, \TMP6
883
884 # Multiply XMM8 * HashKey
885 # XMM8 and TMP5 hold the values for the two operands
886
887 movdqa \XMM8, \TMP1
888 pshufd $78, \XMM8, \TMP2
889 pxor \XMM8, \TMP2
890 movdqa HashKey(%rsp), \TMP5
891 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
892 movaps 0x90(%arg1), \TMP3
893 AESENC \TMP3, \XMM1 # Round 9
894 AESENC \TMP3, \XMM2
895 AESENC \TMP3, \XMM3
896 AESENC \TMP3, \XMM4
897 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
898 lea 0xa0(%arg1),%r10
899 mov keysize,%eax
900 shr $2,%eax # 128->4, 192->6, 256->8
901 sub $4,%eax # 128->0, 192->2, 256->4
902 jz aes_loop_par_dec_done
903
904 aes_loop_par_dec:
905 MOVADQ (%r10),\TMP3
906 .irpc index, 1234
907 AESENC \TMP3, %xmm\index
908 .endr
909 add $16,%r10
910 sub $1,%eax
911 jnz aes_loop_par_dec
912
913 aes_loop_par_dec_done:
914 MOVADQ (%r10), \TMP3
915 AESENCLAST \TMP3, \XMM1 # last round
916 AESENCLAST \TMP3, \XMM2
917 AESENCLAST \TMP3, \XMM3
918 AESENCLAST \TMP3, \XMM4
919 movdqa HashKey_k(%rsp), \TMP5
920 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
921 movdqu (%arg3,%r11,1), \TMP3
922 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
923 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
924 movdqa \TMP3, \XMM1
925 movdqu 16(%arg3,%r11,1), \TMP3
926 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
927 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
928 movdqa \TMP3, \XMM2
929 movdqu 32(%arg3,%r11,1), \TMP3
930 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
931 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
932 movdqa \TMP3, \XMM3
933 movdqu 48(%arg3,%r11,1), \TMP3
934 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
935 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
936 movdqa \TMP3, \XMM4
937 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
938 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
939 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
940 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
941
942 pxor \TMP4, \TMP1
943 pxor \XMM8, \XMM5
944 pxor \TMP6, \TMP2
945 pxor \TMP1, \TMP2
946 pxor \XMM5, \TMP2
947 movdqa \TMP2, \TMP3
948 pslldq $8, \TMP3 # left shift TMP3 2 DWs
949 psrldq $8, \TMP2 # right shift TMP2 2 DWs
950 pxor \TMP3, \XMM5
951 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
952
953 # first phase of reduction
954
955 movdqa \XMM5, \TMP2
956 movdqa \XMM5, \TMP3
957 movdqa \XMM5, \TMP4
958 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
959 pslld $31, \TMP2 # packed left shift << 31
960 pslld $30, \TMP3 # packed left shift << 30
961 pslld $25, \TMP4 # packed left shift << 25
962 pxor \TMP3, \TMP2 # xor the shifted versions
963 pxor \TMP4, \TMP2
964 movdqa \TMP2, \TMP5
965 psrldq $4, \TMP5 # right shift T5 1 DW
966 pslldq $12, \TMP2 # left shift T2 3 DWs
967 pxor \TMP2, \XMM5
968
969 # second phase of reduction
970
971 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
972 movdqa \XMM5,\TMP3
973 movdqa \XMM5,\TMP4
974 psrld $1, \TMP2 # packed right shift >>1
975 psrld $2, \TMP3 # packed right shift >>2
976 psrld $7, \TMP4 # packed right shift >>7
977 pxor \TMP3,\TMP2 # xor the shifted versions
978 pxor \TMP4,\TMP2
979 pxor \TMP5, \TMP2
980 pxor \TMP2, \XMM5
981 pxor \TMP1, \XMM5 # result is in XMM5
982
983 pxor \XMM5, \XMM1
984 .endm
985
986 /* GHASH the last 4 ciphertext blocks. */
987 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
988 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
989
990 # Multiply XMM1 * HashKey^4 (using Karatsuba)
991
992 movdqa \XMM1, \TMP6
993 pshufd $78, \XMM1, \TMP2
994 pxor \XMM1, \TMP2
995 movdqa HashKey_4(%rsp), \TMP5
996 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
997 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
998 movdqa HashKey_4_k(%rsp), \TMP4
999 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1000 movdqa \XMM1, \XMMDst
1001 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1002
1003 # Multiply XMM2 * HashKey^3 (using Karatsuba)
1004
1005 movdqa \XMM2, \TMP1
1006 pshufd $78, \XMM2, \TMP2
1007 pxor \XMM2, \TMP2
1008 movdqa HashKey_3(%rsp), \TMP5
1009 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1010 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1011 movdqa HashKey_3_k(%rsp), \TMP4
1012 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1013 pxor \TMP1, \TMP6
1014 pxor \XMM2, \XMMDst
1015 pxor \TMP2, \XMM1
1016 # results accumulated in TMP6, XMMDst, XMM1
1017
1018 # Multiply XMM3 * HashKey^2 (using Karatsuba)
1019
1020 movdqa \XMM3, \TMP1
1021 pshufd $78, \XMM3, \TMP2
1022 pxor \XMM3, \TMP2
1023 movdqa HashKey_2(%rsp), \TMP5
1024 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1025 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1026 movdqa HashKey_2_k(%rsp), \TMP4
1027 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1028 pxor \TMP1, \TMP6
1029 pxor \XMM3, \XMMDst
1030 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1031
1032 # Multiply XMM4 * HashKey (using Karatsuba)
1033 movdqa \XMM4, \TMP1
1034 pshufd $78, \XMM4, \TMP2
1035 pxor \XMM4, \TMP2
1036 movdqa HashKey(%rsp), \TMP5
1037 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1038 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1039 movdqa HashKey_k(%rsp), \TMP4
1040 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1041 pxor \TMP1, \TMP6
1042 pxor \XMM4, \XMMDst
1043 pxor \XMM1, \TMP2
1044 pxor \TMP6, \TMP2
1045 pxor \XMMDst, \TMP2
1046 # middle section of the temp results combined as in karatsuba algorithm
1047 movdqa \TMP2, \TMP4
1048 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1049 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1050 pxor \TMP4, \XMMDst
1051 pxor \TMP2, \TMP6
1052 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1053 # first phase of the reduction
1054 movdqa \XMMDst, \TMP2
1055 movdqa \XMMDst, \TMP3
1056 movdqa \XMMDst, \TMP4
1057 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1058 pslld $31, \TMP2 # packed left shift << 31
1059 pslld $30, \TMP3 # packed left shift << 30
1060 pslld $25, \TMP4 # packed left shift << 25
1061 pxor \TMP3, \TMP2 # xor the shifted versions
1062 pxor \TMP4, \TMP2
1063 movdqa \TMP2, \TMP7
1064 psrldq $4, \TMP7 # right shift TMP7 1 DW
1065 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1066 pxor \TMP2, \XMMDst
1067
1068 # second phase of the reduction
1069 movdqa \XMMDst, \TMP2
1070 # make 3 copies of XMMDst for doing 3 shift operations
1071 movdqa \XMMDst, \TMP3
1072 movdqa \XMMDst, \TMP4
1073 psrld $1, \TMP2 # packed right shift >> 1
1074 psrld $2, \TMP3 # packed right shift >> 2
1075 psrld $7, \TMP4 # packed right shift >> 7
1076 pxor \TMP3, \TMP2 # xor the shifted versions
1077 pxor \TMP4, \TMP2
1078 pxor \TMP7, \TMP2
1079 pxor \TMP2, \XMMDst
1080 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1081 .endm
1082
1083
1084 /* Encryption of a single block
1085 * uses eax & r10
1086 */
1087
1088 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1089
1090 pxor (%arg1), \XMM0
1091 mov keysize,%eax
1092 shr $2,%eax # 128->4, 192->6, 256->8
1093 add $5,%eax # 128->9, 192->11, 256->13
1094 lea 16(%arg1), %r10 # get first expanded key address
1095
1096 _esb_loop_\@:
1097 MOVADQ (%r10),\TMP1
1098 AESENC \TMP1,\XMM0
1099 add $16,%r10
1100 sub $1,%eax
1101 jnz _esb_loop_\@
1102
1103 MOVADQ (%r10),\TMP1
1104 AESENCLAST \TMP1,\XMM0
1105 .endm
1106 /*****************************************************************************
1107 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1108 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1109 * const u8 *in, // Ciphertext input
1110 * u64 plaintext_len, // Length of data in bytes for decryption.
1111 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1112 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1113 * // concatenated with 0x00000001. 16-byte aligned pointer.
1114 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1115 * const u8 *aad, // Additional Authentication Data (AAD)
1116 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1117 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1118 * // given authentication tag and only return the plaintext if they match.
1119 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1120 * // (most likely), 12 or 8.
1121 *
1122 * Assumptions:
1123 *
1124 * keys:
1125 * keys are pre-expanded and aligned to 16 bytes. The expanded round keys
1126 * (11, 13 or 15 of them, depending on key size) are read from the data structure void *aes_ctx
1127 *
1128 * iv:
1129 * 0 1 2 3
1130 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1131 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1132 * | Salt (From the SA) |
1133 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1134 * | Initialization Vector |
1135 * | (This is the sequence number from IPSec header) |
1136 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1137 * | 0x1 |
1138 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1139 *
1140 *
1141 *
1142 * AAD:
1143 * AAD padded to 128 bits with 0
1144 * for example, assume AAD is a u32 vector
1145 *
1146 * if AAD is 8 bytes:
1147 * AAD[3] = {A0, A1};
1148 * padded AAD in xmm register = {A1 A0 0 0}
1149 *
1150 * 0 1 2 3
1151 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1152 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1153 * | SPI (A1) |
1154 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1155 * | 32-bit Sequence Number (A0) |
1156 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1157 * | 0x0 |
1158 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1159 *
1160 * AAD Format with 32-bit Sequence Number
1161 *
1162 * if AAD is 12 bytes:
1163 * AAD[3] = {A0, A1, A2};
1164 * padded AAD in xmm register = {A2 A1 A0 0}
1165 *
1166 * 0 1 2 3
1167 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1168 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1171 * | SPI (A2) |
1172 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1173 * | 64-bit Extended Sequence Number {A1,A0} |
1174 * | |
1175 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1176 * | 0x0 |
1177 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1178 *
1179 * AAD Format with 64-bit Extended Sequence Number
1180 *
1181 * poly = x^128 + x^127 + x^126 + x^121 + 1
1182 *
1183 *****************************************************************************/
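/*
 * Illustrative call sketch (an assumption about the glue code, not part of
 * this file): GHASH needs hash_subkey = E(K, 0^128), the AES encryption of
 * the all-zero block, which is what the GCM specification defines as H.
 * A hypothetical caller would look roughly like:
 *
 *	u8 iv[16], hash_subkey[16], tag[16];
 *	// ... expand the key into aes_ctx, build iv = salt||IV||0x00000001,
 *	// ... and set hash_subkey = E(K, 0^128) ...
 *	aesni_gcm_dec(aes_ctx, plaintext, ciphertext, ciphertext_len,
 *		      iv, hash_subkey, aad, aad_len, tag, 16);
 *	// the caller then compares 'tag' with the received authentication tag
 */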
1184 ENTRY(aesni_gcm_dec)
1185 FUNC_SAVE
1186
1187 GCM_INIT
1188
1189 # Decrypt first few blocks
1190
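# the and below reduces %r12 (16 * number of full blocks) mod 64, selecting
# how many blocks (0-3) are handled individually before the four-block
# parallel loop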
1191 and $(3<<4), %r12
1192 jz _initial_num_blocks_is_0_decrypt
1193 cmp $(2<<4), %r12
1194 jb _initial_num_blocks_is_1_decrypt
1195 je _initial_num_blocks_is_2_decrypt
1196 _initial_num_blocks_is_3_decrypt:
1197 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1198 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1199 sub $48, %r13
1200 jmp _initial_blocks_decrypted
1201 _initial_num_blocks_is_2_decrypt:
1202 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1203 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1204 sub $32, %r13
1205 jmp _initial_blocks_decrypted
1206 _initial_num_blocks_is_1_decrypt:
1207 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1208 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1209 sub $16, %r13
1210 jmp _initial_blocks_decrypted
1211 _initial_num_blocks_is_0_decrypt:
1212 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1213 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1214 _initial_blocks_decrypted:
1215 cmp $0, %r13
1216 je _zero_cipher_left_decrypt
1217 sub $64, %r13
1218 je _four_cipher_left_decrypt
1219 _decrypt_by_4:
1220 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1221 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1222 add $64, %r11
1223 sub $64, %r13
1224 jne _decrypt_by_4
1225 _four_cipher_left_decrypt:
1226 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1227 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1228 _zero_cipher_left_decrypt:
1229 mov %arg4, %r13
1230 and $15, %r13 # %r13 = arg4 (mod 16)
1231 je _multiple_of_16_bytes_decrypt
1232
1233 # Handle the last <16 byte block separately
1234
1235 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1236 movdqa SHUF_MASK(%rip), %xmm10
1237 PSHUFB_XMM %xmm10, %xmm0
1238
1239 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1240
1241 lea (%arg3,%r11,1), %r10
1242 mov %r13, %r12
1243 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
1244
1245 lea ALL_F+16(%rip), %r12
1246 sub %r13, %r12
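# %r12 now points %r13 bytes before the all-zero block that follows ALL_F,
# so the 16-byte load below yields %r13 bytes of 0xff followed by zeroes,
# exactly the mask needed here (this is why SHIFT_MASK, ALL_F and the zero
# block must stay adjacent in .rodata)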
1247 movdqa %xmm1, %xmm2
1248 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1249 movdqu (%r12), %xmm1
1250 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1251 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1252 pand %xmm1, %xmm2
1253 movdqa SHUF_MASK(%rip), %xmm10
1254 PSHUFB_XMM %xmm10 ,%xmm2
1255
1256 pxor %xmm2, %xmm8
1257 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1258
1259 # output %r13 bytes
1260 MOVQ_R64_XMM %xmm0, %rax
1261 cmp $8, %r13
1262 jle _less_than_8_bytes_left_decrypt
1263 mov %rax, (%arg2 , %r11, 1)
1264 add $8, %r11
1265 psrldq $8, %xmm0
1266 MOVQ_R64_XMM %xmm0, %rax
1267 sub $8, %r13
1268 _less_than_8_bytes_left_decrypt:
1269 mov %al, (%arg2, %r11, 1)
1270 add $1, %r11
1271 shr $8, %rax
1272 sub $1, %r13
1273 jne _less_than_8_bytes_left_decrypt
1274 _multiple_of_16_bytes_decrypt:
1275 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1276 shl $3, %r12 # convert into number of bits
1277 movd %r12d, %xmm15 # len(A) in %xmm15
1278 shl $3, %arg4 # len(C) in bits (*8)
1279 MOVQ_R64_XMM %arg4, %xmm1
1280 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1281 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1282 pxor %xmm15, %xmm8
1283 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1284 # final GHASH computation
1285 movdqa SHUF_MASK(%rip), %xmm10
1286 PSHUFB_XMM %xmm10, %xmm8
1287
1288 mov %arg5, %rax # %rax = *Y0
1289 movdqu (%rax), %xmm0 # %xmm0 = Y0
1290 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1291 pxor %xmm8, %xmm0
1292 _return_T_decrypt:
1293 mov arg9, %r10 # %r10 = authTag
1294 mov arg10, %r11 # %r11 = auth_tag_len
1295 cmp $16, %r11
1296 je _T_16_decrypt
1297 cmp $8, %r11
1298 jl _T_4_decrypt
1299 _T_8_decrypt:
1300 MOVQ_R64_XMM %xmm0, %rax
1301 mov %rax, (%r10)
1302 add $8, %r10
1303 sub $8, %r11
1304 psrldq $8, %xmm0
1305 cmp $0, %r11
1306 je _return_T_done_decrypt
1307 _T_4_decrypt:
1308 movd %xmm0, %eax
1309 mov %eax, (%r10)
1310 add $4, %r10
1311 sub $4, %r11
1312 psrldq $4, %xmm0
1313 cmp $0, %r11
1314 je _return_T_done_decrypt
1315 _T_123_decrypt:
1316 movd %xmm0, %eax
1317 cmp $2, %r11
1318 jl _T_1_decrypt
1319 mov %ax, (%r10)
1320 cmp $2, %r11
1321 je _return_T_done_decrypt
1322 add $2, %r10
1323 sar $16, %eax
1324 _T_1_decrypt:
1325 mov %al, (%r10)
1326 jmp _return_T_done_decrypt
1327 _T_16_decrypt:
1328 movdqu %xmm0, (%r10)
1329 _return_T_done_decrypt:
1330 FUNC_RESTORE
1331 ret
1332 ENDPROC(aesni_gcm_dec)
1333
1334
1335 /*****************************************************************************
1336 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1337 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1338 * const u8 *in, // Plaintext input
1339 * u64 plaintext_len, // Length of data in bytes for encryption.
1340 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1341 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1342 * // concatenated with 0x00000001. 16-byte aligned pointer.
1343 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1344 * const u8 *aad, // Additional Authentication Data (AAD)
1345 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1346 * u8 *auth_tag, // Authenticated Tag output.
1347 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1348 * // 12 or 8.
1349 *
1350 * Assumptions:
1351 *
1352 * keys:
1353 * keys are pre-expanded and aligned to 16 bytes. The expanded round keys
1354 * (11, 13 or 15 of them, depending on key size) are read from the data structure void *aes_ctx
1355 *
1356 *
1357 * iv:
1358 * 0 1 2 3
1359 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1360 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1361 * | Salt (From the SA) |
1362 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1363 * | Initialization Vector |
1364 * | (This is the sequence number from IPSec header) |
1365 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366 * | 0x1 |
1367 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1368 *
1369 *
1370 *
1371 * AAD:
1372 * AAD padded to 128 bits with 0
1373 * for example, assume AAD is a u32 vector
1374 *
1375 * if AAD is 8 bytes:
1376 * AAD[3] = {A0, A1};
1377 * padded AAD in xmm register = {A1 A0 0 0}
1378 *
1379 * 0 1 2 3
1380 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1381 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1382 * | SPI (A1) |
1383 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1384 * | 32-bit Sequence Number (A0) |
1385 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1386 * | 0x0 |
1387 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1388 *
1389 * AAD Format with 32-bit Sequence Number
1390 *
1391 * if AAD is 12 bytes:
1392 * AAD[3] = {A0, A1, A2};
1393 * padded AAD in xmm register = {A2 A1 A0 0}
1394 *
1395 * 0 1 2 3
1396 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1397 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1398 * | SPI (A2) |
1399 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1400 * | 64-bit Extended Sequence Number {A1,A0} |
1401 * | |
1402 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1403 * | 0x0 |
1404 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1405 *
1406 * AAD Format with 64-bit Extended Sequence Number
1407 *
1408 * poly = x^128 + x^127 + x^126 + x^121 + 1
1409 ***************************************************************************/
1410 ENTRY(aesni_gcm_enc)
1411 FUNC_SAVE
1412
1413 GCM_INIT
1414 # Encrypt first few blocks
1415
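# as in the decrypt path: the and below reduces %r12 (16 * number of full
# blocks) mod 64 to select how many blocks (0-3) are encrypted before the
# four-block parallel loop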
1416 and $(3<<4), %r12
1417 jz _initial_num_blocks_is_0_encrypt
1418 cmp $(2<<4), %r12
1419 jb _initial_num_blocks_is_1_encrypt
1420 je _initial_num_blocks_is_2_encrypt
1421 _initial_num_blocks_is_3_encrypt:
1422 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1423 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1424 sub $48, %r13
1425 jmp _initial_blocks_encrypted
1426 _initial_num_blocks_is_2_encrypt:
1427 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1428 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1429 sub $32, %r13
1430 jmp _initial_blocks_encrypted
1431 _initial_num_blocks_is_1_encrypt:
1432 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1433 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1434 sub $16, %r13
1435 jmp _initial_blocks_encrypted
1436 _initial_num_blocks_is_0_encrypt:
1437 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1438 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1439 _initial_blocks_encrypted:
1440
1441 # Main loop - Encrypt remaining blocks
1442
1443 cmp $0, %r13
1444 je _zero_cipher_left_encrypt
1445 sub $64, %r13
1446 je _four_cipher_left_encrypt
1447 _encrypt_by_4_encrypt:
1448 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1449 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1450 add $64, %r11
1451 sub $64, %r13
1452 jne _encrypt_by_4_encrypt
1453 _four_cipher_left_encrypt:
1454 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1455 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1456 _zero_cipher_left_encrypt:
1457 mov %arg4, %r13
1458 and $15, %r13 # %r13 = arg4 (mod 16)
1459 je _multiple_of_16_bytes_encrypt
1460
1461 # Handle the last <16 Byte block separately
1462 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1463 movdqa SHUF_MASK(%rip), %xmm10
1464 PSHUFB_XMM %xmm10, %xmm0
1465
1466 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1467
1468 lea (%arg3,%r11,1), %r10
1469 mov %r13, %r12
1470 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
1471
1472 lea ALL_F+16(%rip), %r12
1473 sub %r13, %r12
1474 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1475 movdqu (%r12), %xmm1
1476 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1477 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1478 movdqa SHUF_MASK(%rip), %xmm10
1479 PSHUFB_XMM %xmm10,%xmm0
1480
1481 pxor %xmm0, %xmm8
1482 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1483 # GHASH computation for the last <16 byte block
1484 movdqa SHUF_MASK(%rip), %xmm10
1485 PSHUFB_XMM %xmm10, %xmm0
1486
1487 # shuffle xmm0 back to output as ciphertext
1488
1489 # Output %r13 bytes
1490 MOVQ_R64_XMM %xmm0, %rax
1491 cmp $8, %r13
1492 jle _less_than_8_bytes_left_encrypt
1493 mov %rax, (%arg2 , %r11, 1)
1494 add $8, %r11
1495 psrldq $8, %xmm0
1496 MOVQ_R64_XMM %xmm0, %rax
1497 sub $8, %r13
1498 _less_than_8_bytes_left_encrypt:
1499 mov %al, (%arg2, %r11, 1)
1500 add $1, %r11
1501 shr $8, %rax
1502 sub $1, %r13
1503 jne _less_than_8_bytes_left_encrypt
1504 _multiple_of_16_bytes_encrypt:
1505 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1506 shl $3, %r12
1507 movd %r12d, %xmm15 # len(A) in %xmm15
1508 shl $3, %arg4 # len(C) in bits (*8)
1509 MOVQ_R64_XMM %arg4, %xmm1
1510 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1511 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1512 pxor %xmm15, %xmm8
1513 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1514 # final GHASH computation
1515 movdqa SHUF_MASK(%rip), %xmm10
1516 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1517
1518 mov %arg5, %rax # %rax = *Y0
1519 movdqu (%rax), %xmm0 # %xmm0 = Y0
1520 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1521 pxor %xmm8, %xmm0
1522 _return_T_encrypt:
1523 mov arg9, %r10 # %r10 = authTag
1524 mov arg10, %r11 # %r11 = auth_tag_len
1525 cmp $16, %r11
1526 je _T_16_encrypt
1527 cmp $8, %r11
1528 jl _T_4_encrypt
1529 _T_8_encrypt:
1530 MOVQ_R64_XMM %xmm0, %rax
1531 mov %rax, (%r10)
1532 add $8, %r10
1533 sub $8, %r11
1534 psrldq $8, %xmm0
1535 cmp $0, %r11
1536 je _return_T_done_encrypt
1537 _T_4_encrypt:
1538 movd %xmm0, %eax
1539 mov %eax, (%r10)
1540 add $4, %r10
1541 sub $4, %r11
1542 psrldq $4, %xmm0
1543 cmp $0, %r11
1544 je _return_T_done_encrypt
1545 _T_123_encrypt:
1546 movd %xmm0, %eax
1547 cmp $2, %r11
1548 jl _T_1_encrypt
1549 mov %ax, (%r10)
1550 cmp $2, %r11
1551 je _return_T_done_encrypt
1552 add $2, %r10
1553 sar $16, %eax
1554 _T_1_encrypt:
1555 mov %al, (%r10)
1556 jmp _return_T_done_encrypt
1557 _T_16_encrypt:
1558 movdqu %xmm0, (%r10)
1559 _return_T_done_encrypt:
1560 FUNC_RESTORE
1561 ret
1562 ENDPROC(aesni_gcm_enc)
1563
1564 #endif
1565
1566
1567 .align 4
1568 _key_expansion_128:
1569 _key_expansion_256a:
1570 pshufd $0b11111111, %xmm1, %xmm1
1571 shufps $0b00010000, %xmm0, %xmm4
1572 pxor %xmm4, %xmm0
1573 shufps $0b10001100, %xmm0, %xmm4
1574 pxor %xmm4, %xmm0
1575 pxor %xmm1, %xmm0
1576 movaps %xmm0, (TKEYP)
1577 add $0x10, TKEYP
1578 ret
1579 ENDPROC(_key_expansion_128)
1580 ENDPROC(_key_expansion_256a)
1581
1582 .align 4
1583 _key_expansion_192a:
1584 pshufd $0b01010101, %xmm1, %xmm1
1585 shufps $0b00010000, %xmm0, %xmm4
1586 pxor %xmm4, %xmm0
1587 shufps $0b10001100, %xmm0, %xmm4
1588 pxor %xmm4, %xmm0
1589 pxor %xmm1, %xmm0
1590
1591 movaps %xmm2, %xmm5
1592 movaps %xmm2, %xmm6
1593 pslldq $4, %xmm5
1594 pshufd $0b11111111, %xmm0, %xmm3
1595 pxor %xmm3, %xmm2
1596 pxor %xmm5, %xmm2
1597
1598 movaps %xmm0, %xmm1
1599 shufps $0b01000100, %xmm0, %xmm6
1600 movaps %xmm6, (TKEYP)
1601 shufps $0b01001110, %xmm2, %xmm1
1602 movaps %xmm1, 0x10(TKEYP)
1603 add $0x20, TKEYP
1604 ret
1605 ENDPROC(_key_expansion_192a)
1606
1607 .align 4
1608 _key_expansion_192b:
1609 pshufd $0b01010101, %xmm1, %xmm1
1610 shufps $0b00010000, %xmm0, %xmm4
1611 pxor %xmm4, %xmm0
1612 shufps $0b10001100, %xmm0, %xmm4
1613 pxor %xmm4, %xmm0
1614 pxor %xmm1, %xmm0
1615
1616 movaps %xmm2, %xmm5
1617 pslldq $4, %xmm5
1618 pshufd $0b11111111, %xmm0, %xmm3
1619 pxor %xmm3, %xmm2
1620 pxor %xmm5, %xmm2
1621
1622 movaps %xmm0, (TKEYP)
1623 add $0x10, TKEYP
1624 ret
1625 ENDPROC(_key_expansion_192b)
1626
1627 .align 4
1628 _key_expansion_256b:
1629 pshufd $0b10101010, %xmm1, %xmm1
1630 shufps $0b00010000, %xmm2, %xmm4
1631 pxor %xmm4, %xmm2
1632 shufps $0b10001100, %xmm2, %xmm4
1633 pxor %xmm4, %xmm2
1634 pxor %xmm1, %xmm2
1635 movaps %xmm2, (TKEYP)
1636 add $0x10, TKEYP
1637 ret
1638 ENDPROC(_key_expansion_256b)
1639
1640 /*
1641 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1642 * unsigned int key_len)
1643 */
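/*
 * Summary of the code below (not an authoritative description of
 * struct crypto_aes_ctx): the encryption round keys are expanded starting
 * at KEYP, the key length in bytes is stored at offset 480, and the
 * decryption round keys are then derived at .Ldec_key by applying AESIMC
 * to the encryption round keys and storing them in reverse order.
 */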
1644 ENTRY(aesni_set_key)
1645 FRAME_BEGIN
1646 #ifndef __x86_64__
1647 pushl KEYP
1648 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1649 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1650 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1651 #endif
1652 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1653 movaps %xmm0, (KEYP)
1654 lea 0x10(KEYP), TKEYP # key addr
1655 movl %edx, 480(KEYP)
1656 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1657 cmp $24, %dl
1658 jb .Lenc_key128
1659 je .Lenc_key192
1660 movups 0x10(UKEYP), %xmm2 # other user key
1661 movaps %xmm2, (TKEYP)
1662 add $0x10, TKEYP
1663 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1664 call _key_expansion_256a
1665 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1666 call _key_expansion_256b
1667 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1668 call _key_expansion_256a
1669 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1670 call _key_expansion_256b
1671 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1672 call _key_expansion_256a
1673 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1674 call _key_expansion_256b
1675 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1676 call _key_expansion_256a
1677 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1678 call _key_expansion_256b
1679 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1680 call _key_expansion_256a
1681 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1682 call _key_expansion_256b
1683 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1684 call _key_expansion_256a
1685 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1686 call _key_expansion_256b
1687 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1688 call _key_expansion_256a
1689 jmp .Ldec_key
1690 .Lenc_key192:
1691 movq 0x10(UKEYP), %xmm2 # other user key
1692 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1693 call _key_expansion_192a
1694 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1695 call _key_expansion_192b
1696 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1697 call _key_expansion_192a
1698 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1699 call _key_expansion_192b
1700 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1701 call _key_expansion_192a
1702 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1703 call _key_expansion_192b
1704 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1705 call _key_expansion_192a
1706 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1707 call _key_expansion_192b
1708 jmp .Ldec_key
1709 .Lenc_key128:
1710 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1711 call _key_expansion_128
1712 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1713 call _key_expansion_128
1714 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1715 call _key_expansion_128
1716 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1717 call _key_expansion_128
1718 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1719 call _key_expansion_128
1720 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1721 call _key_expansion_128
1722 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1723 call _key_expansion_128
1724 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1725 call _key_expansion_128
1726 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1727 call _key_expansion_128
1728 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1729 call _key_expansion_128
1730 .Ldec_key:
1731 sub $0x10, TKEYP
1732 movaps (KEYP), %xmm0
1733 movaps (TKEYP), %xmm1
1734 movaps %xmm0, 240(TKEYP)
1735 movaps %xmm1, 240(KEYP)
1736 add $0x10, KEYP
1737 lea 240-16(TKEYP), UKEYP
1738 .align 4
1739 .Ldec_key_loop:
1740 movaps (KEYP), %xmm0
1741 AESIMC %xmm0 %xmm1
1742 movaps %xmm1, (UKEYP)
1743 add $0x10, KEYP
1744 sub $0x10, UKEYP
1745 cmp TKEYP, KEYP
1746 jb .Ldec_key_loop
1747 xor AREG, AREG
1748 #ifndef __x86_64__
1749 popl KEYP
1750 #endif
1751 FRAME_END
1752 ret
1753 ENDPROC(aesni_set_key)
1754
1755 /*
1756 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1757 */
1758 ENTRY(aesni_enc)
1759 FRAME_BEGIN
1760 #ifndef __x86_64__
1761 pushl KEYP
1762 pushl KLEN
1763 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1764 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1765 movl (FRAME_OFFSET+20)(%esp), INP # src
1766 #endif
1767 movl 480(KEYP), KLEN # key length
1768 movups (INP), STATE # input
1769 call _aesni_enc1
1770 movups STATE, (OUTP) # output
1771 #ifndef __x86_64__
1772 popl KLEN
1773 popl KEYP
1774 #endif
1775 FRAME_END
1776 ret
1777 ENDPROC(aesni_enc)
1778
1779 /*
1780 * _aesni_enc1: internal ABI
1781 * input:
1782 * KEYP: key struct pointer
1783 * KLEN: key length in bytes (16, 24 or 32)
1784 * STATE: initial state (input)
1785 * output:
1786 * STATE: final state (output)
1787 * changed:
1788 * KEY
1789 * TKEYP (T1)
1790 */
1791 .align 4
1792 _aesni_enc1:
1793 movaps (KEYP), KEY # key
1794 mov KEYP, TKEYP
1795 pxor KEY, STATE # round 0
1796 add $0x30, TKEYP
1797 cmp $24, KLEN
1798 jb .Lenc128
1799 lea 0x20(TKEYP), TKEYP
1800 je .Lenc192
1801 add $0x20, TKEYP
1802 movaps -0x60(TKEYP), KEY
1803 AESENC KEY STATE
1804 movaps -0x50(TKEYP), KEY
1805 AESENC KEY STATE
1806 .align 4
1807 .Lenc192:
1808 movaps -0x40(TKEYP), KEY
1809 AESENC KEY STATE
1810 movaps -0x30(TKEYP), KEY
1811 AESENC KEY STATE
1812 .align 4
1813 .Lenc128:
1814 movaps -0x20(TKEYP), KEY
1815 AESENC KEY STATE
1816 movaps -0x10(TKEYP), KEY
1817 AESENC KEY STATE
1818 movaps (TKEYP), KEY
1819 AESENC KEY STATE
1820 movaps 0x10(TKEYP), KEY
1821 AESENC KEY STATE
1822 movaps 0x20(TKEYP), KEY
1823 AESENC KEY STATE
1824 movaps 0x30(TKEYP), KEY
1825 AESENC KEY STATE
1826 movaps 0x40(TKEYP), KEY
1827 AESENC KEY STATE
1828 movaps 0x50(TKEYP), KEY
1829 AESENC KEY STATE
1830 movaps 0x60(TKEYP), KEY
1831 AESENC KEY STATE
1832 movaps 0x70(TKEYP), KEY
1833 AESENCLAST KEY STATE
1834 ret
1835 ENDPROC(_aesni_enc1)
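
/*
 * For reference, the round sequence above corresponds to the following
 * userspace C sketch using the AES-NI intrinsics (aes_enc_block is a
 * hypothetical helper, not a kernel API; nrounds is 10, 12 or 14 for
 * 16-, 24- and 32-byte keys, matching the KLEN compares above):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes_enc_block(const __m128i *rk, int nrounds,
 *				     __m128i state)
 *	{
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);		// round 0
 *		for (i = 1; i < nrounds; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[nrounds]);
 *	}
 */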
1836
1837 /*
1838 * _aesni_enc4: internal ABI
1839 * input:
1840 * KEYP: key struct pointer
1841 * KLEN: key length
1842 * STATE1: initial state (input)
1843 * STATE2
1844 * STATE3
1845 * STATE4
1846 * output:
1847 * STATE1: final state (output)
1848 * STATE2
1849 * STATE3
1850 * STATE4
1851 * changed:
1852 * KEY
1853 * TKEYP (T1)
1854 */
1855 .align 4
1856 _aesni_enc4:
1857 movaps (KEYP), KEY # key
1858 mov KEYP, TKEYP
1859 pxor KEY, STATE1 # round 0
1860 pxor KEY, STATE2
1861 pxor KEY, STATE3
1862 pxor KEY, STATE4
1863 add $0x30, TKEYP
1864 cmp $24, KLEN
1865 jb .L4enc128
1866 lea 0x20(TKEYP), TKEYP
1867 je .L4enc192
1868 add $0x20, TKEYP
1869 movaps -0x60(TKEYP), KEY
1870 AESENC KEY STATE1
1871 AESENC KEY STATE2
1872 AESENC KEY STATE3
1873 AESENC KEY STATE4
1874 movaps -0x50(TKEYP), KEY
1875 AESENC KEY STATE1
1876 AESENC KEY STATE2
1877 AESENC KEY STATE3
1878 AESENC KEY STATE4
1879 #.align 4
1880 .L4enc192:
1881 movaps -0x40(TKEYP), KEY
1882 AESENC KEY STATE1
1883 AESENC KEY STATE2
1884 AESENC KEY STATE3
1885 AESENC KEY STATE4
1886 movaps -0x30(TKEYP), KEY
1887 AESENC KEY STATE1
1888 AESENC KEY STATE2
1889 AESENC KEY STATE3
1890 AESENC KEY STATE4
1891 #.align 4
1892 .L4enc128:
1893 movaps -0x20(TKEYP), KEY
1894 AESENC KEY STATE1
1895 AESENC KEY STATE2
1896 AESENC KEY STATE3
1897 AESENC KEY STATE4
1898 movaps -0x10(TKEYP), KEY
1899 AESENC KEY STATE1
1900 AESENC KEY STATE2
1901 AESENC KEY STATE3
1902 AESENC KEY STATE4
1903 movaps (TKEYP), KEY
1904 AESENC KEY STATE1
1905 AESENC KEY STATE2
1906 AESENC KEY STATE3
1907 AESENC KEY STATE4
1908 movaps 0x10(TKEYP), KEY
1909 AESENC KEY STATE1
1910 AESENC KEY STATE2
1911 AESENC KEY STATE3
1912 AESENC KEY STATE4
1913 movaps 0x20(TKEYP), KEY
1914 AESENC KEY STATE1
1915 AESENC KEY STATE2
1916 AESENC KEY STATE3
1917 AESENC KEY STATE4
1918 movaps 0x30(TKEYP), KEY
1919 AESENC KEY STATE1
1920 AESENC KEY STATE2
1921 AESENC KEY STATE3
1922 AESENC KEY STATE4
1923 movaps 0x40(TKEYP), KEY
1924 AESENC KEY STATE1
1925 AESENC KEY STATE2
1926 AESENC KEY STATE3
1927 AESENC KEY STATE4
1928 movaps 0x50(TKEYP), KEY
1929 AESENC KEY STATE1
1930 AESENC KEY STATE2
1931 AESENC KEY STATE3
1932 AESENC KEY STATE4
1933 movaps 0x60(TKEYP), KEY
1934 AESENC KEY STATE1
1935 AESENC KEY STATE2
1936 AESENC KEY STATE3
1937 AESENC KEY STATE4
1938 movaps 0x70(TKEYP), KEY
1939 AESENCLAST KEY STATE1 # last round
1940 AESENCLAST KEY STATE2
1941 AESENCLAST KEY STATE3
1942 AESENCLAST KEY STATE4
1943 ret
1944 ENDPROC(_aesni_enc4)
1945
1946 /*
1947 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1948 */
1949 ENTRY(aesni_dec)
1950 FRAME_BEGIN
1951 #ifndef __x86_64__
1952 pushl KEYP
1953 pushl KLEN
1954 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1955 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1956 movl (FRAME_OFFSET+20)(%esp), INP # src
1957 #endif
1958 mov 480(KEYP), KLEN # key length
1959 add $240, KEYP
1960 movups (INP), STATE # input
1961 call _aesni_dec1
1962 movups STATE, (OUTP) # output
1963 #ifndef __x86_64__
1964 popl KLEN
1965 popl KEYP
1966 #endif
1967 FRAME_END
1968 ret
1969 ENDPROC(aesni_dec)
1970
1971 /*
1972 * _aesni_dec1: internal ABI
1973 * input:
1974 * KEYP: key struct pointer
1975 * KLEN: key length
1976 * STATE: initial state (input)
1977 * output:
1978 * STATE: final state (output)
1979 * changed:
1980 * KEY
1981 * TKEYP (T1)
1982 */
1983 .align 4
1984 _aesni_dec1:
1985 movaps (KEYP), KEY # key
1986 mov KEYP, TKEYP
1987 pxor KEY, STATE # round 0
1988 add $0x30, TKEYP
1989 cmp $24, KLEN
1990 jb .Ldec128
1991 lea 0x20(TKEYP), TKEYP
1992 je .Ldec192
1993 add $0x20, TKEYP
1994 movaps -0x60(TKEYP), KEY
1995 AESDEC KEY STATE
1996 movaps -0x50(TKEYP), KEY
1997 AESDEC KEY STATE
1998 .align 4
1999 .Ldec192:
2000 movaps -0x40(TKEYP), KEY
2001 AESDEC KEY STATE
2002 movaps -0x30(TKEYP), KEY
2003 AESDEC KEY STATE
2004 .align 4
2005 .Ldec128:
2006 movaps -0x20(TKEYP), KEY
2007 AESDEC KEY STATE
2008 movaps -0x10(TKEYP), KEY
2009 AESDEC KEY STATE
2010 movaps (TKEYP), KEY
2011 AESDEC KEY STATE
2012 movaps 0x10(TKEYP), KEY
2013 AESDEC KEY STATE
2014 movaps 0x20(TKEYP), KEY
2015 AESDEC KEY STATE
2016 movaps 0x30(TKEYP), KEY
2017 AESDEC KEY STATE
2018 movaps 0x40(TKEYP), KEY
2019 AESDEC KEY STATE
2020 movaps 0x50(TKEYP), KEY
2021 AESDEC KEY STATE
2022 movaps 0x60(TKEYP), KEY
2023 AESDEC KEY STATE
2024 movaps 0x70(TKEYP), KEY
2025 AESDECLAST KEY STATE
2026 ret
2027 ENDPROC(_aesni_dec1)
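
/*
 * The decryption path mirrors _aesni_enc1, but AESDEC implements the
 * "equivalent inverse cipher": it expects round keys that have already
 * been run through InvMixColumns, which is what the AESIMC loop in
 * aesni_set_key() prepares in key_dec.  An illustrative intrinsics
 * sketch (aes_dec_block is a hypothetical helper, like aes_enc_block
 * above):
 *
 *	static __m128i aes_dec_block(const __m128i *inv_rk, int nrounds,
 *				     __m128i state)
 *	{
 *		int i;
 *
 *		state = _mm_xor_si128(state, inv_rk[0]);	// round 0
 *		for (i = 1; i < nrounds; i++)
 *			state = _mm_aesdec_si128(state, inv_rk[i]);
 *		return _mm_aesdeclast_si128(state, inv_rk[nrounds]);
 *	}
 */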
2028
2029 /*
2030 * _aesni_dec4: internal ABI
2031 * input:
2032 * KEYP: key struct pointer
2033 * KLEN: key length
2034 * STATE1: initial state (input)
2035 * STATE2
2036 * STATE3
2037 * STATE4
2038 * output:
2039 * STATE1: final state (output)
2040 * STATE2
2041 * STATE3
2042 * STATE4
2043 * changed:
2044 * KEY
2045 * TKEYP (T1)
2046 */
2047 .align 4
2048 _aesni_dec4:
2049 movaps (KEYP), KEY # key
2050 mov KEYP, TKEYP
2051 pxor KEY, STATE1 # round 0
2052 pxor KEY, STATE2
2053 pxor KEY, STATE3
2054 pxor KEY, STATE4
2055 add $0x30, TKEYP
2056 cmp $24, KLEN
2057 jb .L4dec128
2058 lea 0x20(TKEYP), TKEYP
2059 je .L4dec192
2060 add $0x20, TKEYP
2061 movaps -0x60(TKEYP), KEY
2062 AESDEC KEY STATE1
2063 AESDEC KEY STATE2
2064 AESDEC KEY STATE3
2065 AESDEC KEY STATE4
2066 movaps -0x50(TKEYP), KEY
2067 AESDEC KEY STATE1
2068 AESDEC KEY STATE2
2069 AESDEC KEY STATE3
2070 AESDEC KEY STATE4
2071 .align 4
2072 .L4dec192:
2073 movaps -0x40(TKEYP), KEY
2074 AESDEC KEY STATE1
2075 AESDEC KEY STATE2
2076 AESDEC KEY STATE3
2077 AESDEC KEY STATE4
2078 movaps -0x30(TKEYP), KEY
2079 AESDEC KEY STATE1
2080 AESDEC KEY STATE2
2081 AESDEC KEY STATE3
2082 AESDEC KEY STATE4
2083 .align 4
2084 .L4dec128:
2085 movaps -0x20(TKEYP), KEY
2086 AESDEC KEY STATE1
2087 AESDEC KEY STATE2
2088 AESDEC KEY STATE3
2089 AESDEC KEY STATE4
2090 movaps -0x10(TKEYP), KEY
2091 AESDEC KEY STATE1
2092 AESDEC KEY STATE2
2093 AESDEC KEY STATE3
2094 AESDEC KEY STATE4
2095 movaps (TKEYP), KEY
2096 AESDEC KEY STATE1
2097 AESDEC KEY STATE2
2098 AESDEC KEY STATE3
2099 AESDEC KEY STATE4
2100 movaps 0x10(TKEYP), KEY
2101 AESDEC KEY STATE1
2102 AESDEC KEY STATE2
2103 AESDEC KEY STATE3
2104 AESDEC KEY STATE4
2105 movaps 0x20(TKEYP), KEY
2106 AESDEC KEY STATE1
2107 AESDEC KEY STATE2
2108 AESDEC KEY STATE3
2109 AESDEC KEY STATE4
2110 movaps 0x30(TKEYP), KEY
2111 AESDEC KEY STATE1
2112 AESDEC KEY STATE2
2113 AESDEC KEY STATE3
2114 AESDEC KEY STATE4
2115 movaps 0x40(TKEYP), KEY
2116 AESDEC KEY STATE1
2117 AESDEC KEY STATE2
2118 AESDEC KEY STATE3
2119 AESDEC KEY STATE4
2120 movaps 0x50(TKEYP), KEY
2121 AESDEC KEY STATE1
2122 AESDEC KEY STATE2
2123 AESDEC KEY STATE3
2124 AESDEC KEY STATE4
2125 movaps 0x60(TKEYP), KEY
2126 AESDEC KEY STATE1
2127 AESDEC KEY STATE2
2128 AESDEC KEY STATE3
2129 AESDEC KEY STATE4
2130 movaps 0x70(TKEYP), KEY
2131 AESDECLAST KEY STATE1 # last round
2132 AESDECLAST KEY STATE2
2133 AESDECLAST KEY STATE3
2134 AESDECLAST KEY STATE4
2135 ret
2136 ENDPROC(_aesni_dec4)
2137
2138 /*
2139 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2140 * size_t len)
2141 */
2142 ENTRY(aesni_ecb_enc)
2143 FRAME_BEGIN
2144 #ifndef __x86_64__
2145 pushl LEN
2146 pushl KEYP
2147 pushl KLEN
2148 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2149 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2150 movl (FRAME_OFFSET+24)(%esp), INP # src
2151 movl (FRAME_OFFSET+28)(%esp), LEN # len
2152 #endif
2153 test LEN, LEN # check length
2154 jz .Lecb_enc_ret
2155 mov 480(KEYP), KLEN
2156 cmp $16, LEN
2157 jb .Lecb_enc_ret
2158 cmp $64, LEN
2159 jb .Lecb_enc_loop1
2160 .align 4
2161 .Lecb_enc_loop4:
2162 movups (INP), STATE1
2163 movups 0x10(INP), STATE2
2164 movups 0x20(INP), STATE3
2165 movups 0x30(INP), STATE4
2166 call _aesni_enc4
2167 movups STATE1, (OUTP)
2168 movups STATE2, 0x10(OUTP)
2169 movups STATE3, 0x20(OUTP)
2170 movups STATE4, 0x30(OUTP)
2171 sub $64, LEN
2172 add $64, INP
2173 add $64, OUTP
2174 cmp $64, LEN
2175 jge .Lecb_enc_loop4
2176 cmp $16, LEN
2177 jb .Lecb_enc_ret
2178 .align 4
2179 .Lecb_enc_loop1:
2180 movups (INP), STATE1
2181 call _aesni_enc1
2182 movups STATE1, (OUTP)
2183 sub $16, LEN
2184 add $16, INP
2185 add $16, OUTP
2186 cmp $16, LEN
2187 jge .Lecb_enc_loop1
2188 .Lecb_enc_ret:
2189 #ifndef __x86_64__
2190 popl KLEN
2191 popl KEYP
2192 popl LEN
2193 #endif
2194 FRAME_END
2195 ret
2196 ENDPROC(aesni_ecb_enc)
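
/*
 * The length handling above, restated as a C sketch (enc4() and enc1()
 * are placeholders for _aesni_enc4/_aesni_enc1; aesni_ecb_dec below is
 * structured the same way):
 *
 *	while (len >= 64) {			// four blocks per pass
 *		enc4(ctx, out, in);
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {			// one block at a time
 *		enc1(ctx, out, in);
 *		in += 16; out += 16; len -= 16;
 *	}
 *	// any trailing partial block (< 16 bytes) is left untouched
 */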
2197
2198 /*
2199 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2200 * size_t len);
2201 */
2202 ENTRY(aesni_ecb_dec)
2203 FRAME_BEGIN
2204 #ifndef __x86_64__
2205 pushl LEN
2206 pushl KEYP
2207 pushl KLEN
2208 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2209 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2210 movl (FRAME_OFFSET+24)(%esp), INP # src
2211 movl (FRAME_OFFSET+28)(%esp), LEN # len
2212 #endif
2213 test LEN, LEN
2214 jz .Lecb_dec_ret
2215 mov 480(KEYP), KLEN
2216 add $240, KEYP
2217 cmp $16, LEN
2218 jb .Lecb_dec_ret
2219 cmp $64, LEN
2220 jb .Lecb_dec_loop1
2221 .align 4
2222 .Lecb_dec_loop4:
2223 movups (INP), STATE1
2224 movups 0x10(INP), STATE2
2225 movups 0x20(INP), STATE3
2226 movups 0x30(INP), STATE4
2227 call _aesni_dec4
2228 movups STATE1, (OUTP)
2229 movups STATE2, 0x10(OUTP)
2230 movups STATE3, 0x20(OUTP)
2231 movups STATE4, 0x30(OUTP)
2232 sub $64, LEN
2233 add $64, INP
2234 add $64, OUTP
2235 cmp $64, LEN
2236 jge .Lecb_dec_loop4
2237 cmp $16, LEN
2238 jb .Lecb_dec_ret
2239 .align 4
2240 .Lecb_dec_loop1:
2241 movups (INP), STATE1
2242 call _aesni_dec1
2243 movups STATE1, (OUTP)
2244 sub $16, LEN
2245 add $16, INP
2246 add $16, OUTP
2247 cmp $16, LEN
2248 jge .Lecb_dec_loop1
2249 .Lecb_dec_ret:
2250 #ifndef __x86_64__
2251 popl KLEN
2252 popl KEYP
2253 popl LEN
2254 #endif
2255 FRAME_END
2256 ret
2257 ENDPROC(aesni_ecb_dec)
2258
2259 /*
2260 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 * size_t len, u8 *iv)
2262 */
2263 ENTRY(aesni_cbc_enc)
2264 FRAME_BEGIN
2265 #ifndef __x86_64__
2266 pushl IVP
2267 pushl LEN
2268 pushl KEYP
2269 pushl KLEN
2270 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2271 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2272 movl (FRAME_OFFSET+28)(%esp), INP # src
2273 movl (FRAME_OFFSET+32)(%esp), LEN # len
2274 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2275 #endif
2276 cmp $16, LEN
2277 jb .Lcbc_enc_ret
2278 mov 480(KEYP), KLEN
2279 movups (IVP), STATE # load iv as initial state
2280 .align 4
2281 .Lcbc_enc_loop:
2282 movups (INP), IN # load input
2283 pxor IN, STATE
2284 call _aesni_enc1
2285 movups STATE, (OUTP) # store output
2286 sub $16, LEN
2287 add $16, INP
2288 add $16, OUTP
2289 cmp $16, LEN
2290 jge .Lcbc_enc_loop
2291 movups STATE, (IVP)
2292 .Lcbc_enc_ret:
2293 #ifndef __x86_64__
2294 popl KLEN
2295 popl KEYP
2296 popl LEN
2297 popl IVP
2298 #endif
2299 FRAME_END
2300 ret
2301 ENDPROC(aesni_cbc_enc)
2302
2303 /*
2304 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2305 * size_t len, u8 *iv)
2306 */
2307 ENTRY(aesni_cbc_dec)
2308 FRAME_BEGIN
2309 #ifndef __x86_64__
2310 pushl IVP
2311 pushl LEN
2312 pushl KEYP
2313 pushl KLEN
2314 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2315 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2316 movl (FRAME_OFFSET+28)(%esp), INP # src
2317 movl (FRAME_OFFSET+32)(%esp), LEN # len
2318 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2319 #endif
2320 cmp $16, LEN
2321 jb .Lcbc_dec_just_ret
2322 mov 480(KEYP), KLEN
2323 add $240, KEYP
2324 movups (IVP), IV
2325 cmp $64, LEN
2326 jb .Lcbc_dec_loop1
2327 .align 4
2328 .Lcbc_dec_loop4:
2329 movups (INP), IN1
2330 movaps IN1, STATE1
2331 movups 0x10(INP), IN2
2332 movaps IN2, STATE2
2333 #ifdef __x86_64__
2334 movups 0x20(INP), IN3
2335 movaps IN3, STATE3
2336 movups 0x30(INP), IN4
2337 movaps IN4, STATE4
2338 #else
2339 movups 0x20(INP), IN1
2340 movaps IN1, STATE3
2341 movups 0x30(INP), IN2
2342 movaps IN2, STATE4
2343 #endif
2344 call _aesni_dec4
2345 pxor IV, STATE1
2346 #ifdef __x86_64__
2347 pxor IN1, STATE2
2348 pxor IN2, STATE3
2349 pxor IN3, STATE4
2350 movaps IN4, IV
2351 #else
2352 pxor IN1, STATE4
2353 movaps IN2, IV
2354 movups (INP), IN1
2355 pxor IN1, STATE2
2356 movups 0x10(INP), IN2
2357 pxor IN2, STATE3
2358 #endif
2359 movups STATE1, (OUTP)
2360 movups STATE2, 0x10(OUTP)
2361 movups STATE3, 0x20(OUTP)
2362 movups STATE4, 0x30(OUTP)
2363 sub $64, LEN
2364 add $64, INP
2365 add $64, OUTP
2366 cmp $64, LEN
2367 jge .Lcbc_dec_loop4
2368 cmp $16, LEN
2369 jb .Lcbc_dec_ret
2370 .align 4
2371 .Lcbc_dec_loop1:
2372 movups (INP), IN
2373 movaps IN, STATE
2374 call _aesni_dec1
2375 pxor IV, STATE
2376 movups STATE, (OUTP)
2377 movaps IN, IV
2378 sub $16, LEN
2379 add $16, INP
2380 add $16, OUTP
2381 cmp $16, LEN
2382 jge .Lcbc_dec_loop1
2383 .Lcbc_dec_ret:
2384 movups IV, (IVP)
2385 .Lcbc_dec_just_ret:
2386 #ifndef __x86_64__
2387 popl KLEN
2388 popl KEYP
2389 popl LEN
2390 popl IVP
2391 #endif
2392 FRAME_END
2393 ret
2394 ENDPROC(aesni_cbc_dec)
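
/*
 * CBC decryption chains on the ciphertext, P[i] = D(C[i]) ^ C[i-1] with
 * C[-1] = IV, so consecutive blocks can be decrypted independently;
 * that is what the four-way interleave in .Lcbc_dec_loop4 exploits.
 * A C sketch of the single-block path (aes_dec_block is the
 * hypothetical helper sketched after _aesni_dec1):
 *
 *	__m128i prev = _mm_loadu_si128((const __m128i *)iv);
 *	for (size_t i = 0; i + 16 <= len; i += 16) {
 *		__m128i c = _mm_loadu_si128((const __m128i *)(in + i));
 *		__m128i p = _mm_xor_si128(aes_dec_block(inv_rk, nr, c), prev);
 *		_mm_storeu_si128((__m128i *)(out + i), p);
 *		prev = c;			// chain on the ciphertext
 *	}
 *	_mm_storeu_si128((__m128i *)iv, prev);	// written back for the next call
 */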
2395
2396 #ifdef __x86_64__
2397 .pushsection .rodata
2398 .align 16
2399 .Lbswap_mask:
2400 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2401 .popsection
2402
2403 /*
2404 * _aesni_inc_init: internal ABI
2405 * setup registers used by _aesni_inc
2406 * input:
2407 * IV
2408 * output:
2409 * CTR: == IV, in little endian
2410 * TCTR_LOW: == lower qword of CTR
2411 * INC: == 1, in little endian
2412 * BSWAP_MASK == endian swapping mask
2413 */
2414 .align 4
2415 _aesni_inc_init:
2416 movaps .Lbswap_mask, BSWAP_MASK
2417 movaps IV, CTR
2418 PSHUFB_XMM BSWAP_MASK CTR
2419 mov $1, TCTR_LOW
2420 MOVQ_R64_XMM TCTR_LOW INC
2421 MOVQ_R64_XMM CTR TCTR_LOW
2422 ret
2423 ENDPROC(_aesni_inc_init)
2424
2425 /*
2426 * _aesni_inc: internal ABI
2427 * Increase IV by 1, IV is in big endian
2428 * input:
2429 * IV
2430 * CTR: == IV, in little endian
2431 * TCTR_LOW: == lower qword of CTR
2432 * INC: == 1, in little endian
2433 * BSWAP_MASK == endian swapping mask
2434 * output:
2435 * IV: increased by 1
2436 * changed:
2437 * CTR: == output IV, in little endian
2438 * TCTR_LOW: == lower qword of CTR
2439 */
2440 .align 4
2441 _aesni_inc:
2442 paddq INC, CTR
2443 add $1, TCTR_LOW
2444 jnc .Linc_low
2445 pslldq $8, INC
2446 paddq INC, CTR
2447 psrldq $8, INC
2448 .Linc_low:
2449 movaps CTR, IV
2450 PSHUFB_XMM BSWAP_MASK IV
2451 ret
2452 ENDPROC(_aesni_inc)
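
/*
 * _aesni_inc_init/_aesni_inc implement a 128-bit big-endian counter
 * increment without leaving the SSE registers.  In plain C the same
 * operation would be (ctr is the 16-byte big-endian counter block):
 *
 *	static void ctr128_inc(u8 ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i])		// stop once there is no carry
 *				break;
 *	}
 *
 * The assembly instead keeps a byte-swapped (little endian) copy in
 * CTR, so the common case is one paddq on the low qword; only when
 * TCTR_LOW wraps does it propagate the carry into the high qword.
 */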
2453
2454 /*
2455 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2456 * size_t len, u8 *iv)
2457 */
2458 ENTRY(aesni_ctr_enc)
2459 FRAME_BEGIN
2460 cmp $16, LEN
2461 jb .Lctr_enc_just_ret
2462 mov 480(KEYP), KLEN
2463 movups (IVP), IV
2464 call _aesni_inc_init
2465 cmp $64, LEN
2466 jb .Lctr_enc_loop1
2467 .align 4
2468 .Lctr_enc_loop4:
2469 movaps IV, STATE1
2470 call _aesni_inc
2471 movups (INP), IN1
2472 movaps IV, STATE2
2473 call _aesni_inc
2474 movups 0x10(INP), IN2
2475 movaps IV, STATE3
2476 call _aesni_inc
2477 movups 0x20(INP), IN3
2478 movaps IV, STATE4
2479 call _aesni_inc
2480 movups 0x30(INP), IN4
2481 call _aesni_enc4
2482 pxor IN1, STATE1
2483 movups STATE1, (OUTP)
2484 pxor IN2, STATE2
2485 movups STATE2, 0x10(OUTP)
2486 pxor IN3, STATE3
2487 movups STATE3, 0x20(OUTP)
2488 pxor IN4, STATE4
2489 movups STATE4, 0x30(OUTP)
2490 sub $64, LEN
2491 add $64, INP
2492 add $64, OUTP
2493 cmp $64, LEN
2494 jge .Lctr_enc_loop4
2495 cmp $16, LEN
2496 jb .Lctr_enc_ret
2497 .align 4
2498 .Lctr_enc_loop1:
2499 movaps IV, STATE
2500 call _aesni_inc
2501 movups (INP), IN
2502 call _aesni_enc1
2503 pxor IN, STATE
2504 movups STATE, (OUTP)
2505 sub $16, LEN
2506 add $16, INP
2507 add $16, OUTP
2508 cmp $16, LEN
2509 jge .Lctr_enc_loop1
2510 .Lctr_enc_ret:
2511 movups IV, (IVP)
2512 .Lctr_enc_just_ret:
2513 FRAME_END
2514 ret
2515 ENDPROC(aesni_ctr_enc)
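
/*
 * CTR mode turns the block cipher into a stream cipher: each counter
 * block is encrypted and XORed into the data, so encryption and
 * decryption are the same routine.  Per block, the loops above compute
 * roughly (aes_enc_block/ctr128_inc are the hypothetical helpers
 * sketched earlier):
 *
 *	keystream = aes_enc_block(rk, nrounds, counter);
 *	out_block = in_block ^ keystream;
 *	ctr128_inc(counter);			// big-endian increment
 */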
2516
2517 /*
2518 * _aesni_gf128mul_x_ble: internal ABI
2519 * Multiply in GF(2^128) for XTS IVs
2520 * input:
2521 * IV: current IV
2522 * GF128MUL_MASK == mask with 0x87 and 0x01
2523 * output:
2524 * IV: next IV
2525 * changed:
2526 * CTR: == temporary value
2527 */
2528 #define _aesni_gf128mul_x_ble() \
2529 pshufd $0x13, IV, CTR; \
2530 paddq IV, IV; \
2531 psrad $31, CTR; \
2532 pand GF128MUL_MASK, CTR; \
2533 pxor CTR, IV;
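
/*
 * The macro above is the usual XTS tweak update: multiply the 128-bit
 * tweak by x in GF(2^128) using the low-bit-first ("ble") convention.
 * Over two little-endian 64-bit halves, the same step reads (as a C
 * sketch):
 *
 *	static void gf128mul_x_ble(u64 t[2])
 *	{
 *		u64 carry = t[1] >> 63;		// bit 127 of the tweak
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * The pshufd/psrad/pand sequence computes the conditional 0x87 and the
 * low-to-high carry branchlessly, via the GF128MUL_MASK constant.
 */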
2534
2535 /*
2536 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2537 * bool enc, u8 *iv)
2538 */
2539 ENTRY(aesni_xts_crypt8)
2540 FRAME_BEGIN
2541 cmpb $0, %cl
2542 movl $0, %ecx
2543 movl $240, %r10d
2544 leaq _aesni_enc4, %r11
2545 leaq _aesni_dec4, %rax
2546 cmovel %r10d, %ecx
2547 cmoveq %rax, %r11
2548
2549 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2550 movups (IVP), IV
2551
2552 mov 480(KEYP), KLEN
2553 addq %rcx, KEYP
2554
2555 movdqa IV, STATE1
2556 movdqu 0x00(INP), INC
2557 pxor INC, STATE1
2558 movdqu IV, 0x00(OUTP)
2559
2560 _aesni_gf128mul_x_ble()
2561 movdqa IV, STATE2
2562 movdqu 0x10(INP), INC
2563 pxor INC, STATE2
2564 movdqu IV, 0x10(OUTP)
2565
2566 _aesni_gf128mul_x_ble()
2567 movdqa IV, STATE3
2568 movdqu 0x20(INP), INC
2569 pxor INC, STATE3
2570 movdqu IV, 0x20(OUTP)
2571
2572 _aesni_gf128mul_x_ble()
2573 movdqa IV, STATE4
2574 movdqu 0x30(INP), INC
2575 pxor INC, STATE4
2576 movdqu IV, 0x30(OUTP)
2577
2578 CALL_NOSPEC %r11
2579
2580 movdqu 0x00(OUTP), INC
2581 pxor INC, STATE1
2582 movdqu STATE1, 0x00(OUTP)
2583
2584 _aesni_gf128mul_x_ble()
2585 movdqa IV, STATE1
2586 movdqu 0x40(INP), INC
2587 pxor INC, STATE1
2588 movdqu IV, 0x40(OUTP)
2589
2590 movdqu 0x10(OUTP), INC
2591 pxor INC, STATE2
2592 movdqu STATE2, 0x10(OUTP)
2593
2594 _aesni_gf128mul_x_ble()
2595 movdqa IV, STATE2
2596 movdqu 0x50(INP), INC
2597 pxor INC, STATE2
2598 movdqu IV, 0x50(OUTP)
2599
2600 movdqu 0x20(OUTP), INC
2601 pxor INC, STATE3
2602 movdqu STATE3, 0x20(OUTP)
2603
2604 _aesni_gf128mul_x_ble()
2605 movdqa IV, STATE3
2606 movdqu 0x60(INP), INC
2607 pxor INC, STATE3
2608 movdqu IV, 0x60(OUTP)
2609
2610 movdqu 0x30(OUTP), INC
2611 pxor INC, STATE4
2612 movdqu STATE4, 0x30(OUTP)
2613
2614 _aesni_gf128mul_x_ble()
2615 movdqa IV, STATE4
2616 movdqu 0x70(INP), INC
2617 pxor INC, STATE4
2618 movdqu IV, 0x70(OUTP)
2619
2620 _aesni_gf128mul_x_ble()
2621 movups IV, (IVP)
2622
2623 CALL_NOSPEC %r11
2624
2625 movdqu 0x40(OUTP), INC
2626 pxor INC, STATE1
2627 movdqu STATE1, 0x40(OUTP)
2628
2629 movdqu 0x50(OUTP), INC
2630 pxor INC, STATE2
2631 movdqu STATE2, 0x50(OUTP)
2632
2633 movdqu 0x60(OUTP), INC
2634 pxor INC, STATE3
2635 movdqu STATE3, 0x60(OUTP)
2636
2637 movdqu 0x70(OUTP), INC
2638 pxor INC, STATE4
2639 movdqu STATE4, 0x70(OUTP)
2640
2641 FRAME_END
2642 ret
2643 ENDPROC(aesni_xts_crypt8)
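
/*
 * Per call, the above performs the standard XTS step on eight
 * consecutive blocks, advancing the tweak with _aesni_gf128mul_x_ble
 * between blocks and writing the final value back through iv so the
 * next call can continue.  As a sketch (crypt_block stands for the
 * four-way enc/dec helpers selected via CALL_NOSPEC, gf128mul_x_ble_m128
 * is a hypothetical __m128i variant of the tweak update, and the
 * initial tweak is typically the sector IV already encrypted with the
 * second XTS key by the caller):
 *
 *	__m128i t = _mm_loadu_si128((const __m128i *)iv);
 *	for (int i = 0; i < 8; i++) {
 *		__m128i b = _mm_loadu_si128((const __m128i *)(in + 16 * i));
 *		b = _mm_xor_si128(crypt_block(ctx, _mm_xor_si128(b, t)), t);
 *		_mm_storeu_si128((__m128i *)(out + 16 * i), b);
 *		t = gf128mul_x_ble_m128(t);	// next tweak
 *	}
 *	_mm_storeu_si128((__m128i *)iv, t);
 */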
2644
2645 #endif