1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
3 #
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
9 #
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
12 # met:
13 #
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
16 #
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
20 # distribution.
21 #
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
25 #
26 #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
39 ##
40 ## Authors:
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
45 ##
46 ## References:
## This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##               on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##               on Intel Architecture Processors. October, 2012.
53 ##
54 ## Assumptions:
55 ##
56 ##
57 ##
58 ## iv:
59 ## 0 1 2 3
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ## | 0x1 |
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69 ##
70 ##
71 ##
72 ## AAD:
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
75 ##
76 ## if AAD is 8 bytes:
##               AAD[3] = {A0, A1};
78 ## padded AAD in xmm register = {A1 A0 0 0}
79 ##
80 ## 0 1 2 3
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ## | SPI (A1) |
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ## | 0x0 |
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 ##
90 ## AAD Format with 32-bit Sequence Number
91 ##
92 ## if AAD is 12 bytes:
##               AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
95 ##
96 ## 0 1 2 3
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ## | SPI (A2) |
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
102 ## | |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ## | 0x0 |
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 ##
107 ## AAD Format with 64-bit Extended Sequence Number
108 ##
109 ##
## aadLen:
##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports an aadLen of 16 bytes.
113 ##
114 ## TLen:
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116 ##
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used. one tab is
## for the GHASH part, two tabs are for the AES part.
120 ##
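##
## The layouts above can be illustrated with a rough C sketch (illustration
## only, not part of the build; helper names are hypothetical; requires
## <stdint.h>, <stddef.h> and <string.h>):
##
##      /* build the pre-counter block: salt || IV || 0x00000001 (big endian) */
##      void build_iv_block(uint8_t j0[16], const uint8_t salt[4],
##                          const uint8_t iv[8])
##      {
##              memcpy(j0, salt, 4);
##              memcpy(j0 + 4, iv, 8);
##              j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;
##      }
##
##      /* zero-pad the 8-, 12- or 16-byte AAD to a full 16-byte block */
##      void pad_aad(uint8_t block[16], const uint8_t *aad, size_t aad_len)
##      {
##              memset(block, 0, 16);
##              memcpy(block, aad, aad_len);
##      }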
121
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
124
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 .align 16
128 POLY: .octa 0xC2000000000000000000000000000001
129
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 .align 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
133
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 .align 16
136 TWOONE: .octa 0x00000001000000000000000000000001
137
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 .align 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 .align 16
144 ONE: .octa 0x00000000000000000000000000000001
145
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 .align 16
148 ONEf: .octa 0x01000000000000000000000000000000
149
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
153 .align 16
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
157
158 .text
159
160
161 ##define the fields of the gcm aes context
162 #{
163 # u8 expanded_keys[16*11] store expanded keys
164 # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
165 # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
166 # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
167 # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
168 # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
169 # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
170 # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
171 # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
172 # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
173 # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
174 # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
175 # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
176 # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
177 # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
178 # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
179 # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#} gcm_ctx;
181
182 HashKey = 16*11 # store HashKey <<1 mod poly here
183 HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
184 HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
185 HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
186 HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
187 HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
188 HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
189 HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
190 HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
191 HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
192 HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
193 HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
194 HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
195 HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
196 HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
197 HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
198
199 #define arg1 %rdi
200 #define arg2 %rsi
201 #define arg3 %rdx
202 #define arg4 %rcx
203 #define arg5 %r8
204 #define arg6 %r9
205 #define arg7 STACK_OFFSET+8*1(%r14)
206 #define arg8 STACK_OFFSET+8*2(%r14)
207 #define arg9 STACK_OFFSET+8*3(%r14)
208
209 i = 0
210 j = 0
211
212 out_order = 0
213 in_order = 1
214 DEC = 0
215 ENC = 1
216
217 .macro define_reg r n
218 reg_\r = %xmm\n
219 .endm
220
221 .macro setreg
222 .altmacro
223 define_reg i %i
224 define_reg j %j
225 .noaltmacro
226 .endm
227
# 4 registers are pushed onto the stack before %rsp is saved in %r14, so the
# stack-passed arguments (arg7..arg9) are addressed past this 8*4-byte offset
229 STACK_OFFSET = 8*4
230
231 TMP1 = 16*0 # Temporary storage for AAD
232 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
233 TMP3 = 16*2 # Temporary storage for AES State 3
234 TMP4 = 16*3 # Temporary storage for AES State 4
235 TMP5 = 16*4 # Temporary storage for AES State 5
236 TMP6 = 16*5 # Temporary storage for AES State 6
237 TMP7 = 16*6 # Temporary storage for AES State 7
238 TMP8 = 16*7 # Temporary storage for AES State 8
239
240 VARIABLE_OFFSET = 16*8
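
# stack frame layout, relative to the 64-byte-aligned %rsp:
#   TMP1..TMP8 are eight 16-byte scratch slots at %rsp+16*0 .. %rsp+16*7
#   VARIABLE_OFFSET (16*8 = 128 bytes) is the total scratch space reserved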
241
242 ################################
243 # Utility Macros
244 ################################
245
246 # Encryption of a single block
247 .macro ENCRYPT_SINGLE_BLOCK XMM0
248 vpxor (arg1), \XMM0, \XMM0
249 i = 1
250 setreg
251 .rep 9
252 vaesenc 16*i(arg1), \XMM0, \XMM0
253 i = (i+1)
254 setreg
255 .endr
256 vaesenclast 16*10(arg1), \XMM0, \XMM0
257 .endm
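
# For reference, a rough C-intrinsics equivalent of the macro above
# (illustration only, not part of the build; assumes AES-128 with the 11
# round keys stored at arg1; requires <wmmintrin.h>, compile with -maes):
#
#       __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
#       {
#               int i;
#               block = _mm_xor_si128(block, rk[0]);        /* whitening   */
#               for (i = 1; i < 10; i++)
#                       block = _mm_aesenc_si128(block, rk[i]);
#               return _mm_aesenclast_si128(block, rk[10]); /* final round */
#       }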
258
259 #ifdef CONFIG_AS_AVX
260 ###############################################################################
261 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
262 # Input: A and B (128-bits each, bit-reflected)
263 # Output: C = A*B*x mod poly, (i.e. >>1 )
264 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
265 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
266 ###############################################################################
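#
# A short sketch of the Karatsuba split used below (illustration only):
# with A = a1*x^64 + a0 and B = b1*x^64 + b0 (carry-less arithmetic),
#       A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
# and the middle term is obtained from three products as
#       a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0
# (addition is XOR in GF(2)), so only three vpclmulqdq are needed instead of
# four; the 256-bit result is then reduced back to 128 bits mod poly.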
267 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
268
269 vpshufd $0b01001110, \GH, \T2
270 vpshufd $0b01001110, \HK, \T3
271 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
272 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
273
274 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
275 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
276 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
277 vpxor \GH, \T2,\T2
278 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
279
280 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
281 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
282 vpxor \T3, \GH, \GH
283 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
284
285 #first phase of the reduction
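        # (the two reduction phases below bring the 256-bit carry-less
        #  product <T1:GH> back to 128 bits modulo the bit-reflected GHASH
        #  polynomial, using only shifts and XORs)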
        vpslld  $31, \GH, \T2                   # packed left shift << 31
        vpslld  $30, \GH, \T3                   # packed left shift << 30
        vpslld  $25, \GH, \T4                   # packed left shift << 25
289
290 vpxor \T3, \T2, \T2 # xor the shifted versions
291 vpxor \T4, \T2, \T2
292
293 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
294
295 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
296 vpxor \T2, \GH, \GH # first phase of the reduction complete
297
298 #second phase of the reduction
299
        vpsrld  $1, \GH, \T2                    # packed right shift >> 1
        vpsrld  $2, \GH, \T3                    # packed right shift >> 2
        vpsrld  $7, \GH, \T4                    # packed right shift >> 7
303 vpxor \T3, \T2, \T2 # xor the shifted versions
304 vpxor \T4, \T2, \T2
305
306 vpxor \T5, \T2, \T2
307 vpxor \T2, \GH, \GH
308 vpxor \T1, \GH, \GH # the result is in GH
309
310
311 .endm
312
313 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
314
        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
316 vmovdqa \HK, \T5
317
318 vpshufd $0b01001110, \T5, \T1
319 vpxor \T5, \T1, \T1
320 vmovdqa \T1, HashKey_k(arg1)
321
322 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
323 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
324 vpshufd $0b01001110, \T5, \T1
325 vpxor \T5, \T1, \T1
326 vmovdqa \T1, HashKey_2_k(arg1)
327
328 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
329 vmovdqa \T5, HashKey_3(arg1)
330 vpshufd $0b01001110, \T5, \T1
331 vpxor \T5, \T1, \T1
332 vmovdqa \T1, HashKey_3_k(arg1)
333
334 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
335 vmovdqa \T5, HashKey_4(arg1)
336 vpshufd $0b01001110, \T5, \T1
337 vpxor \T5, \T1, \T1
338 vmovdqa \T1, HashKey_4_k(arg1)
339
340 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
341 vmovdqa \T5, HashKey_5(arg1)
342 vpshufd $0b01001110, \T5, \T1
343 vpxor \T5, \T1, \T1
344 vmovdqa \T1, HashKey_5_k(arg1)
345
346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
347 vmovdqa \T5, HashKey_6(arg1)
348 vpshufd $0b01001110, \T5, \T1
349 vpxor \T5, \T1, \T1
350 vmovdqa \T1, HashKey_6_k(arg1)
351
352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
353 vmovdqa \T5, HashKey_7(arg1)
354 vpshufd $0b01001110, \T5, \T1
355 vpxor \T5, \T1, \T1
356 vmovdqa \T1, HashKey_7_k(arg1)
357
358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
359 vmovdqa \T5, HashKey_8(arg1)
360 vpshufd $0b01001110, \T5, \T1
361 vpxor \T5, \T1, \T1
362 vmovdqa \T1, HashKey_8_k(arg1)
363
364 .endm
365
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8;
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
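##
## a rough C sketch of the partitioning (illustration only, not part of the build):
##
##      size_t blocks = plaintext_len / 16;        /* b = floor(a/16)          */
##      size_t num_initial_blocks = blocks % 8;    /* handled by this macro    */
##      size_t main_loop_blocks = blocks - num_initial_blocks; /* 8 at a time  */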
372
373 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
374 i = (8-\num_initial_blocks)
375 setreg
376
377 mov arg6, %r10 # r10 = AAD
378 mov arg7, %r12 # r12 = aadLen
379
380
381 mov %r12, %r11
382
383 vpxor reg_i, reg_i, reg_i
384 _get_AAD_loop\@:
385 vmovd (%r10), \T1
386 vpslldq $12, \T1, \T1
387 vpsrldq $4, reg_i, reg_i
388 vpxor \T1, reg_i, reg_i
389
390 add $4, %r10
391 sub $4, %r12
392 jg _get_AAD_loop\@
393
394
395 cmp $16, %r11
396 je _get_AAD_loop2_done\@
397 mov $16, %r12
398
399 _get_AAD_loop2\@:
400 vpsrldq $4, reg_i, reg_i
401 sub $4, %r12
402 cmp %r11, %r12
403 jg _get_AAD_loop2\@
404
405 _get_AAD_loop2_done\@:
406
407 #byte-reflect the AAD data
408 vpshufb SHUF_MASK(%rip), reg_i, reg_i
409
410 # initialize the data pointer offset as zero
411 xor %r11, %r11
412
413 # start AES for num_initial_blocks blocks
414 mov arg5, %rax # rax = *Y0
415 vmovdqu (%rax), \CTR # CTR = Y0
416 vpshufb SHUF_MASK(%rip), \CTR, \CTR
417
418
419 i = (9-\num_initial_blocks)
420 setreg
421 .rep \num_initial_blocks
422 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
423 vmovdqa \CTR, reg_i
424 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
425 i = (i+1)
426 setreg
427 .endr
428
429 vmovdqa (arg1), \T_key
430 i = (9-\num_initial_blocks)
431 setreg
432 .rep \num_initial_blocks
433 vpxor \T_key, reg_i, reg_i
434 i = (i+1)
435 setreg
436 .endr
437
438 j = 1
439 setreg
440 .rep 9
441 vmovdqa 16*j(arg1), \T_key
442 i = (9-\num_initial_blocks)
443 setreg
444 .rep \num_initial_blocks
445 vaesenc \T_key, reg_i, reg_i
446 i = (i+1)
447 setreg
448 .endr
449
450 j = (j+1)
451 setreg
452 .endr
453
454
455 vmovdqa 16*10(arg1), \T_key
456 i = (9-\num_initial_blocks)
457 setreg
458 .rep \num_initial_blocks
459 vaesenclast \T_key, reg_i, reg_i
460 i = (i+1)
461 setreg
462 .endr
463
464 i = (9-\num_initial_blocks)
465 setreg
466 .rep \num_initial_blocks
467 vmovdqu (arg3, %r11), \T1
468 vpxor \T1, reg_i, reg_i
469 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
470 add $16, %r11
471 .if \ENC_DEC == DEC
472 vmovdqa \T1, reg_i
473 .endif
474 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
475 i = (i+1)
476 setreg
477 .endr
478
479
480 i = (8-\num_initial_blocks)
481 j = (9-\num_initial_blocks)
482 setreg
483 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
484
485 .rep \num_initial_blocks
486 vpxor reg_i, reg_j, reg_j
487 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
488 i = (i+1)
489 j = (j+1)
490 setreg
491 .endr
492 # XMM8 has the combined result here
493
494 vmovdqa \XMM8, TMP1(%rsp)
495 vmovdqa \XMM8, \T3
496
497 cmp $128, %r13
498 jl _initial_blocks_done\@ # no need for precomputed constants
499
500 ###############################################################################
# prepare 8 counter blocks and encrypt them, for the next 8 blocks of data
502 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
503 vmovdqa \CTR, \XMM1
504 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
505
506 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
507 vmovdqa \CTR, \XMM2
508 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
509
510 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
511 vmovdqa \CTR, \XMM3
512 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
513
514 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
515 vmovdqa \CTR, \XMM4
516 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
517
518 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
519 vmovdqa \CTR, \XMM5
520 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
521
522 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
523 vmovdqa \CTR, \XMM6
524 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
525
526 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
527 vmovdqa \CTR, \XMM7
528 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
529
530 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
531 vmovdqa \CTR, \XMM8
532 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
533
534 vmovdqa (arg1), \T_key
535 vpxor \T_key, \XMM1, \XMM1
536 vpxor \T_key, \XMM2, \XMM2
537 vpxor \T_key, \XMM3, \XMM3
538 vpxor \T_key, \XMM4, \XMM4
539 vpxor \T_key, \XMM5, \XMM5
540 vpxor \T_key, \XMM6, \XMM6
541 vpxor \T_key, \XMM7, \XMM7
542 vpxor \T_key, \XMM8, \XMM8
543
544 i = 1
545 setreg
546 .rep 9 # do 9 rounds
547 vmovdqa 16*i(arg1), \T_key
548 vaesenc \T_key, \XMM1, \XMM1
549 vaesenc \T_key, \XMM2, \XMM2
550 vaesenc \T_key, \XMM3, \XMM3
551 vaesenc \T_key, \XMM4, \XMM4
552 vaesenc \T_key, \XMM5, \XMM5
553 vaesenc \T_key, \XMM6, \XMM6
554 vaesenc \T_key, \XMM7, \XMM7
555 vaesenc \T_key, \XMM8, \XMM8
556 i = (i+1)
557 setreg
558 .endr
559
560
561 vmovdqa 16*i(arg1), \T_key
562 vaesenclast \T_key, \XMM1, \XMM1
563 vaesenclast \T_key, \XMM2, \XMM2
564 vaesenclast \T_key, \XMM3, \XMM3
565 vaesenclast \T_key, \XMM4, \XMM4
566 vaesenclast \T_key, \XMM5, \XMM5
567 vaesenclast \T_key, \XMM6, \XMM6
568 vaesenclast \T_key, \XMM7, \XMM7
569 vaesenclast \T_key, \XMM8, \XMM8
570
571 vmovdqu (arg3, %r11), \T1
572 vpxor \T1, \XMM1, \XMM1
573 vmovdqu \XMM1, (arg2 , %r11)
574 .if \ENC_DEC == DEC
575 vmovdqa \T1, \XMM1
576 .endif
577
578 vmovdqu 16*1(arg3, %r11), \T1
579 vpxor \T1, \XMM2, \XMM2
580 vmovdqu \XMM2, 16*1(arg2 , %r11)
581 .if \ENC_DEC == DEC
582 vmovdqa \T1, \XMM2
583 .endif
584
585 vmovdqu 16*2(arg3, %r11), \T1
586 vpxor \T1, \XMM3, \XMM3
587 vmovdqu \XMM3, 16*2(arg2 , %r11)
588 .if \ENC_DEC == DEC
589 vmovdqa \T1, \XMM3
590 .endif
591
592 vmovdqu 16*3(arg3, %r11), \T1
593 vpxor \T1, \XMM4, \XMM4
594 vmovdqu \XMM4, 16*3(arg2 , %r11)
595 .if \ENC_DEC == DEC
596 vmovdqa \T1, \XMM4
597 .endif
598
599 vmovdqu 16*4(arg3, %r11), \T1
600 vpxor \T1, \XMM5, \XMM5
601 vmovdqu \XMM5, 16*4(arg2 , %r11)
602 .if \ENC_DEC == DEC
603 vmovdqa \T1, \XMM5
604 .endif
605
606 vmovdqu 16*5(arg3, %r11), \T1
607 vpxor \T1, \XMM6, \XMM6
608 vmovdqu \XMM6, 16*5(arg2 , %r11)
609 .if \ENC_DEC == DEC
610 vmovdqa \T1, \XMM6
611 .endif
612
613 vmovdqu 16*6(arg3, %r11), \T1
614 vpxor \T1, \XMM7, \XMM7
615 vmovdqu \XMM7, 16*6(arg2 , %r11)
616 .if \ENC_DEC == DEC
617 vmovdqa \T1, \XMM7
618 .endif
619
620 vmovdqu 16*7(arg3, %r11), \T1
621 vpxor \T1, \XMM8, \XMM8
622 vmovdqu \XMM8, 16*7(arg2 , %r11)
623 .if \ENC_DEC == DEC
624 vmovdqa \T1, \XMM8
625 .endif
626
627 add $128, %r11
628
629 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
630 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
631 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
632 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
633 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
634 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
635 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
636 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
637 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
638
639 ###############################################################################
640
641 _initial_blocks_done\@:
642
643 .endm
644
645 # encrypt 8 blocks at a time
646 # ghash the 8 previously encrypted ciphertext blocks
647 # arg1, arg2, arg3 are used as pointers only, not modified
648 # r11 is the data offset value
649 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
650
651 vmovdqa \XMM1, \T2
652 vmovdqa \XMM2, TMP2(%rsp)
653 vmovdqa \XMM3, TMP3(%rsp)
654 vmovdqa \XMM4, TMP4(%rsp)
655 vmovdqa \XMM5, TMP5(%rsp)
656 vmovdqa \XMM6, TMP6(%rsp)
657 vmovdqa \XMM7, TMP7(%rsp)
658 vmovdqa \XMM8, TMP8(%rsp)
659
660 .if \loop_idx == in_order
661 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
662 vpaddd ONE(%rip), \XMM1, \XMM2
663 vpaddd ONE(%rip), \XMM2, \XMM3
664 vpaddd ONE(%rip), \XMM3, \XMM4
665 vpaddd ONE(%rip), \XMM4, \XMM5
666 vpaddd ONE(%rip), \XMM5, \XMM6
667 vpaddd ONE(%rip), \XMM6, \XMM7
668 vpaddd ONE(%rip), \XMM7, \XMM8
669 vmovdqa \XMM8, \CTR
670
671 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
672 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
673 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
674 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
675 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
676 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
677 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
678 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
679 .else
680 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
681 vpaddd ONEf(%rip), \XMM1, \XMM2
682 vpaddd ONEf(%rip), \XMM2, \XMM3
683 vpaddd ONEf(%rip), \XMM3, \XMM4
684 vpaddd ONEf(%rip), \XMM4, \XMM5
685 vpaddd ONEf(%rip), \XMM5, \XMM6
686 vpaddd ONEf(%rip), \XMM6, \XMM7
687 vpaddd ONEf(%rip), \XMM7, \XMM8
688 vmovdqa \XMM8, \CTR
689 .endif
690
691
692 #######################################################################
693
694 vmovdqu (arg1), \T1
695 vpxor \T1, \XMM1, \XMM1
696 vpxor \T1, \XMM2, \XMM2
697 vpxor \T1, \XMM3, \XMM3
698 vpxor \T1, \XMM4, \XMM4
699 vpxor \T1, \XMM5, \XMM5
700 vpxor \T1, \XMM6, \XMM6
701 vpxor \T1, \XMM7, \XMM7
702 vpxor \T1, \XMM8, \XMM8
703
704 #######################################################################
705
706
707
708
709
710 vmovdqu 16*1(arg1), \T1
711 vaesenc \T1, \XMM1, \XMM1
712 vaesenc \T1, \XMM2, \XMM2
713 vaesenc \T1, \XMM3, \XMM3
714 vaesenc \T1, \XMM4, \XMM4
715 vaesenc \T1, \XMM5, \XMM5
716 vaesenc \T1, \XMM6, \XMM6
717 vaesenc \T1, \XMM7, \XMM7
718 vaesenc \T1, \XMM8, \XMM8
719
720 vmovdqu 16*2(arg1), \T1
721 vaesenc \T1, \XMM1, \XMM1
722 vaesenc \T1, \XMM2, \XMM2
723 vaesenc \T1, \XMM3, \XMM3
724 vaesenc \T1, \XMM4, \XMM4
725 vaesenc \T1, \XMM5, \XMM5
726 vaesenc \T1, \XMM6, \XMM6
727 vaesenc \T1, \XMM7, \XMM7
728 vaesenc \T1, \XMM8, \XMM8
729
730
731 #######################################################################
732
733 vmovdqa HashKey_8(arg1), \T5
734 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
735 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
736
737 vpshufd $0b01001110, \T2, \T6
738 vpxor \T2, \T6, \T6
739
740 vmovdqa HashKey_8_k(arg1), \T5
741 vpclmulqdq $0x00, \T5, \T6, \T6
742
743 vmovdqu 16*3(arg1), \T1
744 vaesenc \T1, \XMM1, \XMM1
745 vaesenc \T1, \XMM2, \XMM2
746 vaesenc \T1, \XMM3, \XMM3
747 vaesenc \T1, \XMM4, \XMM4
748 vaesenc \T1, \XMM5, \XMM5
749 vaesenc \T1, \XMM6, \XMM6
750 vaesenc \T1, \XMM7, \XMM7
751 vaesenc \T1, \XMM8, \XMM8
752
753 vmovdqa TMP2(%rsp), \T1
754 vmovdqa HashKey_7(arg1), \T5
755 vpclmulqdq $0x11, \T5, \T1, \T3
756 vpxor \T3, \T4, \T4
757 vpclmulqdq $0x00, \T5, \T1, \T3
758 vpxor \T3, \T7, \T7
759
760 vpshufd $0b01001110, \T1, \T3
761 vpxor \T1, \T3, \T3
762 vmovdqa HashKey_7_k(arg1), \T5
763 vpclmulqdq $0x10, \T5, \T3, \T3
764 vpxor \T3, \T6, \T6
765
766 vmovdqu 16*4(arg1), \T1
767 vaesenc \T1, \XMM1, \XMM1
768 vaesenc \T1, \XMM2, \XMM2
769 vaesenc \T1, \XMM3, \XMM3
770 vaesenc \T1, \XMM4, \XMM4
771 vaesenc \T1, \XMM5, \XMM5
772 vaesenc \T1, \XMM6, \XMM6
773 vaesenc \T1, \XMM7, \XMM7
774 vaesenc \T1, \XMM8, \XMM8
775
776 #######################################################################
777
778 vmovdqa TMP3(%rsp), \T1
779 vmovdqa HashKey_6(arg1), \T5
780 vpclmulqdq $0x11, \T5, \T1, \T3
781 vpxor \T3, \T4, \T4
782 vpclmulqdq $0x00, \T5, \T1, \T3
783 vpxor \T3, \T7, \T7
784
785 vpshufd $0b01001110, \T1, \T3
786 vpxor \T1, \T3, \T3
787 vmovdqa HashKey_6_k(arg1), \T5
788 vpclmulqdq $0x10, \T5, \T3, \T3
789 vpxor \T3, \T6, \T6
790
791 vmovdqu 16*5(arg1), \T1
792 vaesenc \T1, \XMM1, \XMM1
793 vaesenc \T1, \XMM2, \XMM2
794 vaesenc \T1, \XMM3, \XMM3
795 vaesenc \T1, \XMM4, \XMM4
796 vaesenc \T1, \XMM5, \XMM5
797 vaesenc \T1, \XMM6, \XMM6
798 vaesenc \T1, \XMM7, \XMM7
799 vaesenc \T1, \XMM8, \XMM8
800
801 vmovdqa TMP4(%rsp), \T1
802 vmovdqa HashKey_5(arg1), \T5
803 vpclmulqdq $0x11, \T5, \T1, \T3
804 vpxor \T3, \T4, \T4
805 vpclmulqdq $0x00, \T5, \T1, \T3
806 vpxor \T3, \T7, \T7
807
808 vpshufd $0b01001110, \T1, \T3
809 vpxor \T1, \T3, \T3
810 vmovdqa HashKey_5_k(arg1), \T5
811 vpclmulqdq $0x10, \T5, \T3, \T3
812 vpxor \T3, \T6, \T6
813
814 vmovdqu 16*6(arg1), \T1
815 vaesenc \T1, \XMM1, \XMM1
816 vaesenc \T1, \XMM2, \XMM2
817 vaesenc \T1, \XMM3, \XMM3
818 vaesenc \T1, \XMM4, \XMM4
819 vaesenc \T1, \XMM5, \XMM5
820 vaesenc \T1, \XMM6, \XMM6
821 vaesenc \T1, \XMM7, \XMM7
822 vaesenc \T1, \XMM8, \XMM8
823
824
825 vmovdqa TMP5(%rsp), \T1
826 vmovdqa HashKey_4(arg1), \T5
827 vpclmulqdq $0x11, \T5, \T1, \T3
828 vpxor \T3, \T4, \T4
829 vpclmulqdq $0x00, \T5, \T1, \T3
830 vpxor \T3, \T7, \T7
831
832 vpshufd $0b01001110, \T1, \T3
833 vpxor \T1, \T3, \T3
834 vmovdqa HashKey_4_k(arg1), \T5
835 vpclmulqdq $0x10, \T5, \T3, \T3
836 vpxor \T3, \T6, \T6
837
838 vmovdqu 16*7(arg1), \T1
839 vaesenc \T1, \XMM1, \XMM1
840 vaesenc \T1, \XMM2, \XMM2
841 vaesenc \T1, \XMM3, \XMM3
842 vaesenc \T1, \XMM4, \XMM4
843 vaesenc \T1, \XMM5, \XMM5
844 vaesenc \T1, \XMM6, \XMM6
845 vaesenc \T1, \XMM7, \XMM7
846 vaesenc \T1, \XMM8, \XMM8
847
848 vmovdqa TMP6(%rsp), \T1
849 vmovdqa HashKey_3(arg1), \T5
850 vpclmulqdq $0x11, \T5, \T1, \T3
851 vpxor \T3, \T4, \T4
852 vpclmulqdq $0x00, \T5, \T1, \T3
853 vpxor \T3, \T7, \T7
854
855 vpshufd $0b01001110, \T1, \T3
856 vpxor \T1, \T3, \T3
857 vmovdqa HashKey_3_k(arg1), \T5
858 vpclmulqdq $0x10, \T5, \T3, \T3
859 vpxor \T3, \T6, \T6
860
861
862 vmovdqu 16*8(arg1), \T1
863 vaesenc \T1, \XMM1, \XMM1
864 vaesenc \T1, \XMM2, \XMM2
865 vaesenc \T1, \XMM3, \XMM3
866 vaesenc \T1, \XMM4, \XMM4
867 vaesenc \T1, \XMM5, \XMM5
868 vaesenc \T1, \XMM6, \XMM6
869 vaesenc \T1, \XMM7, \XMM7
870 vaesenc \T1, \XMM8, \XMM8
871
872 vmovdqa TMP7(%rsp), \T1
873 vmovdqa HashKey_2(arg1), \T5
874 vpclmulqdq $0x11, \T5, \T1, \T3
875 vpxor \T3, \T4, \T4
876 vpclmulqdq $0x00, \T5, \T1, \T3
877 vpxor \T3, \T7, \T7
878
879 vpshufd $0b01001110, \T1, \T3
880 vpxor \T1, \T3, \T3
881 vmovdqa HashKey_2_k(arg1), \T5
882 vpclmulqdq $0x10, \T5, \T3, \T3
883 vpxor \T3, \T6, \T6
884
885 #######################################################################
886
887 vmovdqu 16*9(arg1), \T5
888 vaesenc \T5, \XMM1, \XMM1
889 vaesenc \T5, \XMM2, \XMM2
890 vaesenc \T5, \XMM3, \XMM3
891 vaesenc \T5, \XMM4, \XMM4
892 vaesenc \T5, \XMM5, \XMM5
893 vaesenc \T5, \XMM6, \XMM6
894 vaesenc \T5, \XMM7, \XMM7
895 vaesenc \T5, \XMM8, \XMM8
896
897 vmovdqa TMP8(%rsp), \T1
898 vmovdqa HashKey(arg1), \T5
899 vpclmulqdq $0x11, \T5, \T1, \T3
900 vpxor \T3, \T4, \T4
901 vpclmulqdq $0x00, \T5, \T1, \T3
902 vpxor \T3, \T7, \T7
903
904 vpshufd $0b01001110, \T1, \T3
905 vpxor \T1, \T3, \T3
906 vmovdqa HashKey_k(arg1), \T5
907 vpclmulqdq $0x10, \T5, \T3, \T3
908 vpxor \T3, \T6, \T6
909
910 vpxor \T4, \T6, \T6
911 vpxor \T7, \T6, \T6
912
913 vmovdqu 16*10(arg1), \T5
914
915 i = 0
916 j = 1
917 setreg
918 .rep 8
919 vpxor 16*i(arg3, %r11), \T5, \T2
920 .if \ENC_DEC == ENC
921 vaesenclast \T2, reg_j, reg_j
922 .else
923 vaesenclast \T2, reg_j, \T3
924 vmovdqu 16*i(arg3, %r11), reg_j
925 vmovdqu \T3, 16*i(arg2, %r11)
926 .endif
927 i = (i+1)
928 j = (j+1)
929 setreg
930 .endr
931 #######################################################################
932
933
934 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
936 vpxor \T3, \T7, \T7
937 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
938
939
940
941 #######################################################################
942 #first phase of the reduction
943 #######################################################################
        vpslld  $31, \T7, \T2                   # packed left shift << 31
        vpslld  $30, \T7, \T3                   # packed left shift << 30
        vpslld  $25, \T7, \T4                   # packed left shift << 25
947
948 vpxor \T3, \T2, \T2 # xor the shifted versions
949 vpxor \T4, \T2, \T2
950
951 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
952
953 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
954 vpxor \T2, \T7, \T7 # first phase of the reduction complete
955 #######################################################################
956 .if \ENC_DEC == ENC
957 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
958 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
959 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
960 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
961 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
962 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
963 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
964 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
965 .endif
966
967 #######################################################################
968 #second phase of the reduction
        vpsrld  $1, \T7, \T2                    # packed right shift >> 1
        vpsrld  $2, \T7, \T3                    # packed right shift >> 2
        vpsrld  $7, \T7, \T4                    # packed right shift >> 7
972 vpxor \T3, \T2, \T2 # xor the shifted versions
973 vpxor \T4, \T2, \T2
974
975 vpxor \T1, \T2, \T2
976 vpxor \T2, \T7, \T7
977 vpxor \T7, \T6, \T6 # the result is in T6
978 #######################################################################
979
980 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
981 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
982 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
983 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
984 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
985 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
986 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
987 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
988
989
990 vpxor \T6, \XMM1, \XMM1
991
992
993
994 .endm
995
996
# GHASH the last 8 ciphertext blocks.
998 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
999
1000 ## Karatsuba Method
1001
1002
1003 vpshufd $0b01001110, \XMM1, \T2
1004 vpxor \XMM1, \T2, \T2
1005 vmovdqa HashKey_8(arg1), \T5
1006 vpclmulqdq $0x11, \T5, \XMM1, \T6
1007 vpclmulqdq $0x00, \T5, \XMM1, \T7
1008
1009 vmovdqa HashKey_8_k(arg1), \T3
1010 vpclmulqdq $0x00, \T3, \T2, \XMM1
1011
1012 ######################
1013
1014 vpshufd $0b01001110, \XMM2, \T2
1015 vpxor \XMM2, \T2, \T2
1016 vmovdqa HashKey_7(arg1), \T5
1017 vpclmulqdq $0x11, \T5, \XMM2, \T4
1018 vpxor \T4, \T6, \T6
1019
1020 vpclmulqdq $0x00, \T5, \XMM2, \T4
1021 vpxor \T4, \T7, \T7
1022
1023 vmovdqa HashKey_7_k(arg1), \T3
1024 vpclmulqdq $0x00, \T3, \T2, \T2
1025 vpxor \T2, \XMM1, \XMM1
1026
1027 ######################
1028
1029 vpshufd $0b01001110, \XMM3, \T2
1030 vpxor \XMM3, \T2, \T2
1031 vmovdqa HashKey_6(arg1), \T5
1032 vpclmulqdq $0x11, \T5, \XMM3, \T4
1033 vpxor \T4, \T6, \T6
1034
1035 vpclmulqdq $0x00, \T5, \XMM3, \T4
1036 vpxor \T4, \T7, \T7
1037
1038 vmovdqa HashKey_6_k(arg1), \T3
1039 vpclmulqdq $0x00, \T3, \T2, \T2
1040 vpxor \T2, \XMM1, \XMM1
1041
1042 ######################
1043
1044 vpshufd $0b01001110, \XMM4, \T2
1045 vpxor \XMM4, \T2, \T2
1046 vmovdqa HashKey_5(arg1), \T5
1047 vpclmulqdq $0x11, \T5, \XMM4, \T4
1048 vpxor \T4, \T6, \T6
1049
1050 vpclmulqdq $0x00, \T5, \XMM4, \T4
1051 vpxor \T4, \T7, \T7
1052
1053 vmovdqa HashKey_5_k(arg1), \T3
1054 vpclmulqdq $0x00, \T3, \T2, \T2
1055 vpxor \T2, \XMM1, \XMM1
1056
1057 ######################
1058
1059 vpshufd $0b01001110, \XMM5, \T2
1060 vpxor \XMM5, \T2, \T2
1061 vmovdqa HashKey_4(arg1), \T5
1062 vpclmulqdq $0x11, \T5, \XMM5, \T4
1063 vpxor \T4, \T6, \T6
1064
1065 vpclmulqdq $0x00, \T5, \XMM5, \T4
1066 vpxor \T4, \T7, \T7
1067
1068 vmovdqa HashKey_4_k(arg1), \T3
1069 vpclmulqdq $0x00, \T3, \T2, \T2
1070 vpxor \T2, \XMM1, \XMM1
1071
1072 ######################
1073
1074 vpshufd $0b01001110, \XMM6, \T2
1075 vpxor \XMM6, \T2, \T2
1076 vmovdqa HashKey_3(arg1), \T5
1077 vpclmulqdq $0x11, \T5, \XMM6, \T4
1078 vpxor \T4, \T6, \T6
1079
1080 vpclmulqdq $0x00, \T5, \XMM6, \T4
1081 vpxor \T4, \T7, \T7
1082
1083 vmovdqa HashKey_3_k(arg1), \T3
1084 vpclmulqdq $0x00, \T3, \T2, \T2
1085 vpxor \T2, \XMM1, \XMM1
1086
1087 ######################
1088
1089 vpshufd $0b01001110, \XMM7, \T2
1090 vpxor \XMM7, \T2, \T2
1091 vmovdqa HashKey_2(arg1), \T5
1092 vpclmulqdq $0x11, \T5, \XMM7, \T4
1093 vpxor \T4, \T6, \T6
1094
1095 vpclmulqdq $0x00, \T5, \XMM7, \T4
1096 vpxor \T4, \T7, \T7
1097
1098 vmovdqa HashKey_2_k(arg1), \T3
1099 vpclmulqdq $0x00, \T3, \T2, \T2
1100 vpxor \T2, \XMM1, \XMM1
1101
1102 ######################
1103
1104 vpshufd $0b01001110, \XMM8, \T2
1105 vpxor \XMM8, \T2, \T2
1106 vmovdqa HashKey(arg1), \T5
1107 vpclmulqdq $0x11, \T5, \XMM8, \T4
1108 vpxor \T4, \T6, \T6
1109
1110 vpclmulqdq $0x00, \T5, \XMM8, \T4
1111 vpxor \T4, \T7, \T7
1112
1113 vmovdqa HashKey_k(arg1), \T3
1114 vpclmulqdq $0x00, \T3, \T2, \T2
1115
1116 vpxor \T2, \XMM1, \XMM1
1117 vpxor \T6, \XMM1, \XMM1
1118 vpxor \T7, \XMM1, \T2
1119
1120
1121
1122
1123 vpslldq $8, \T2, \T4
1124 vpsrldq $8, \T2, \T2
1125
1126 vpxor \T4, \T7, \T7
1127 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1128 # the accumulated carry-less multiplications
1129
1130 #######################################################################
1131 #first phase of the reduction
        vpslld  $31, \T7, \T2                   # packed left shift << 31
        vpslld  $30, \T7, \T3                   # packed left shift << 30
        vpslld  $25, \T7, \T4                   # packed left shift << 25
1135
1136 vpxor \T3, \T2, \T2 # xor the shifted versions
1137 vpxor \T4, \T2, \T2
1138
1139 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1140
1141 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1142 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1143 #######################################################################
1144
1145
1146 #second phase of the reduction
        vpsrld  $1, \T7, \T2                    # packed right shift >> 1
        vpsrld  $2, \T7, \T3                    # packed right shift >> 2
        vpsrld  $7, \T7, \T4                    # packed right shift >> 7
1150 vpxor \T3, \T2, \T2 # xor the shifted versions
1151 vpxor \T4, \T2, \T2
1152
1153 vpxor \T1, \T2, \T2
1154 vpxor \T2, \T7, \T7
1155 vpxor \T7, \T6, \T6 # the result is in T6
1156
1157 .endm
1158
1159
1160 # combined for GCM encrypt and decrypt functions
1161 # clobbering all xmm registers
1162 # clobbering r10, r11, r12, r13, r14, r15
1163 .macro GCM_ENC_DEC_AVX ENC_DEC
1164
        # the number of pushes (8 bytes each) must equal STACK_OFFSET
1166 push %r12
1167 push %r13
1168 push %r14
1169 push %r15
1170
1171 mov %rsp, %r14
1172
1173
1174
1175
1176 sub $VARIABLE_OFFSET, %rsp
1177 and $~63, %rsp # align rsp to 64 bytes
1178
1179
1180 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1181
1182 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1183 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1184
1185 mov %r13, %r12
1186 shr $4, %r12
1187 and $7, %r12
1188 jz _initial_num_blocks_is_0\@
1189
1190 cmp $7, %r12
1191 je _initial_num_blocks_is_7\@
1192 cmp $6, %r12
1193 je _initial_num_blocks_is_6\@
1194 cmp $5, %r12
1195 je _initial_num_blocks_is_5\@
1196 cmp $4, %r12
1197 je _initial_num_blocks_is_4\@
1198 cmp $3, %r12
1199 je _initial_num_blocks_is_3\@
1200 cmp $2, %r12
1201 je _initial_num_blocks_is_2\@
1202
1203 jmp _initial_num_blocks_is_1\@
1204
1205 _initial_num_blocks_is_7\@:
1206 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1207 sub $16*7, %r13
1208 jmp _initial_blocks_encrypted\@
1209
1210 _initial_num_blocks_is_6\@:
1211 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1212 sub $16*6, %r13
1213 jmp _initial_blocks_encrypted\@
1214
1215 _initial_num_blocks_is_5\@:
1216 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1217 sub $16*5, %r13
1218 jmp _initial_blocks_encrypted\@
1219
1220 _initial_num_blocks_is_4\@:
1221 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1222 sub $16*4, %r13
1223 jmp _initial_blocks_encrypted\@
1224
1225 _initial_num_blocks_is_3\@:
1226 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1227 sub $16*3, %r13
1228 jmp _initial_blocks_encrypted\@
1229
1230 _initial_num_blocks_is_2\@:
1231 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1232 sub $16*2, %r13
1233 jmp _initial_blocks_encrypted\@
1234
1235 _initial_num_blocks_is_1\@:
1236 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1237 sub $16*1, %r13
1238 jmp _initial_blocks_encrypted\@
1239
1240 _initial_num_blocks_is_0\@:
1241 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1242
1243
1244 _initial_blocks_encrypted\@:
1245 cmp $0, %r13
1246 je _zero_cipher_left\@
1247
1248 sub $128, %r13
1249 je _eight_cipher_left\@
1250
1251
1252
1253
1254 vmovd %xmm9, %r15d
1255 and $255, %r15d
1256 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1257
1258
1259 _encrypt_by_8_new\@:
1260 cmp $(255-8), %r15d
1261 jg _encrypt_by_8\@
1262
1263
1264
1265 add $8, %r15b
1266 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1267 add $128, %r11
1268 sub $128, %r13
1269 jne _encrypt_by_8_new\@
1270
1271 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1272 jmp _eight_cipher_left\@
1273
1274 _encrypt_by_8\@:
1275 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1276 add $8, %r15b
1277 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1278 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1279 add $128, %r11
1280 sub $128, %r13
1281 jne _encrypt_by_8_new\@
1282
1283 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1284
1285
1286
1287
1288 _eight_cipher_left\@:
1289 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1290
1291
1292 _zero_cipher_left\@:
1293 cmp $16, arg4
1294 jl _only_less_than_16\@
1295
1296 mov arg4, %r13
1297 and $15, %r13 # r13 = (arg4 mod 16)
1298
1299 je _multiple_of_16_bytes\@
1300
        # handle the last <16 Byte block separately
1302
1303
1304 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1305 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1306 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1307
1308 sub $16, %r11
1309 add %r13, %r11
1310 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1311
1312 lea SHIFT_MASK+16(%rip), %r12
1313 sub %r13, %r12 # adjust the shuffle mask pointer to be
1314 # able to shift 16-r13 bytes (r13 is the
1315 # number of bytes in plaintext mod 16)
1316 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1317 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1318 jmp _final_ghash_mul\@
1319
1320 _only_less_than_16\@:
1321 # check for 0 length
1322 mov arg4, %r13
1323 and $15, %r13 # r13 = (arg4 mod 16)
1324
1325 je _multiple_of_16_bytes\@
1326
        # handle the last <16 Byte block separately
1328
1329
1330 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1331 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1332 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1333
1334
1335 lea SHIFT_MASK+16(%rip), %r12
1336 sub %r13, %r12 # adjust the shuffle mask pointer to be
1337 # able to shift 16-r13 bytes (r13 is the
1338 # number of bytes in plaintext mod 16)
1339
1340 _get_last_16_byte_loop\@:
1341 movb (arg3, %r11), %al
1342 movb %al, TMP1 (%rsp , %r11)
1343 add $1, %r11
1344 cmp %r13, %r11
1345 jne _get_last_16_byte_loop\@
1346
1347 vmovdqu TMP1(%rsp), %xmm1
1348
1349 sub $16, %r11
1350
1351 _final_ghash_mul\@:
1352 .if \ENC_DEC == DEC
1353 vmovdqa %xmm1, %xmm2
1354 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1355 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1356 # mask out top 16-r13 bytes of xmm9
1357 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1358 vpand %xmm1, %xmm2, %xmm2
1359 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1360 vpxor %xmm2, %xmm14, %xmm14
1361 #GHASH computation for the last <16 Byte block
1362 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1363 sub %r13, %r11
1364 add $16, %r11
1365 .else
1366 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1367 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1368 # mask out top 16-r13 bytes of xmm9
1369 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1370 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1371 vpxor %xmm9, %xmm14, %xmm14
1372 #GHASH computation for the last <16 Byte block
1373 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1374 sub %r13, %r11
1375 add $16, %r11
1376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1377 .endif
1378
1379
1380 #############################
1381 # output r13 Bytes
1382 vmovq %xmm9, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left\@
1385
1386 mov %rax, (arg2 , %r11)
1387 add $8, %r11
1388 vpsrldq $8, %xmm9, %xmm9
1389 vmovq %xmm9, %rax
1390 sub $8, %r13
1391
1392 _less_than_8_bytes_left\@:
1393 movb %al, (arg2 , %r11)
1394 add $1, %r11
1395 shr $8, %rax
1396 sub $1, %r13
1397 jne _less_than_8_bytes_left\@
1398 #############################
1399
1400 _multiple_of_16_bytes\@:
1401 mov arg7, %r12 # r12 = aadLen (number of bytes)
1402 shl $3, %r12 # convert into number of bits
1403 vmovd %r12d, %xmm15 # len(A) in xmm15
1404
        shl     $3, arg4                        # len(C) in bits (*8)
1406 vmovq arg4, %xmm1
1407 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1408 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1409
1410 vpxor %xmm15, %xmm14, %xmm14
1411 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1412 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1413
1414 mov arg5, %rax # rax = *Y0
1415 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1416
1417 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1418
1419 vpxor %xmm14, %xmm9, %xmm9
1420
1421
1422
1423 _return_T\@:
1424 mov arg8, %r10 # r10 = authTag
1425 mov arg9, %r11 # r11 = auth_tag_len
1426
1427 cmp $16, %r11
1428 je _T_16\@
1429
1430 cmp $12, %r11
1431 je _T_12\@
1432
1433 _T_8\@:
1434 vmovq %xmm9, %rax
1435 mov %rax, (%r10)
1436 jmp _return_T_done\@
1437 _T_12\@:
1438 vmovq %xmm9, %rax
1439 mov %rax, (%r10)
1440 vpsrldq $8, %xmm9, %xmm9
1441 vmovd %xmm9, %eax
1442 mov %eax, 8(%r10)
1443 jmp _return_T_done\@
1444
1445 _T_16\@:
1446 vmovdqu %xmm9, (%r10)
1447
1448 _return_T_done\@:
1449 mov %r14, %rsp
1450
1451 pop %r15
1452 pop %r14
1453 pop %r13
1454 pop %r12
1455 .endm
1456
1457
1458 #############################################################
1459 #void aesni_gcm_precomp_avx_gen2
1460 # (gcm_data *my_ctx_data,
#        u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1462 #############################################################
1463 ENTRY(aesni_gcm_precomp_avx_gen2)
        # the number of pushes (8 bytes each) must equal STACK_OFFSET
1465 push %r12
1466 push %r13
1467 push %r14
1468 push %r15
1469
1470 mov %rsp, %r14
1471
1472
1473
1474 sub $VARIABLE_OFFSET, %rsp
1475 and $~63, %rsp # align rsp to 64 bytes
1476
1477 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1478
1479 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1480 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1481 vmovdqa %xmm6, %xmm2
1482 vpsllq $1, %xmm6, %xmm6
1483 vpsrlq $63, %xmm2, %xmm2
1484 vmovdqa %xmm2, %xmm1
1485 vpslldq $8, %xmm2, %xmm2
1486 vpsrldq $8, %xmm1, %xmm1
1487 vpor %xmm2, %xmm6, %xmm6
1488 #reduction
1489 vpshufd $0b00100100, %xmm1, %xmm2
1490 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1491 vpand POLY(%rip), %xmm2, %xmm2
1492 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1493 #######################################################################
1494 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1495
1496
1497 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1498
1499 mov %r14, %rsp
1500
1501 pop %r15
1502 pop %r14
1503 pop %r13
1504 pop %r12
1505 ret
1506 ENDPROC(aesni_gcm_precomp_avx_gen2)
1507
1508 ###############################################################################
1509 #void aesni_gcm_enc_avx_gen2(
1510 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1511 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1512 # const u8 *in, /* Plaintext input */
1513 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1514 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1515 # (from Security Association) concatenated with 8 byte
1516 # Initialisation Vector (from IPSec ESP Payload)
1517 # concatenated with 0x00000001. 16-byte aligned pointer. */
1518 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1519 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1520 # u8 *auth_tag, /* Authenticated Tag output. */
#       u64     auth_tag_len); /* Authenticated Tag Length in bytes.
1522 # Valid values are 16 (most likely), 12 or 8. */
1523 ###############################################################################
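#
# a hedged usage sketch from C (illustration only; the in-kernel callers live
# in aesni-intel_glue.c and may set things up differently):
#
#       aesni_gcm_precomp_avx_gen2(my_ctx_data, hash_subkey);
#       aesni_gcm_enc_avx_gen2(my_ctx_data, out, in, plaintext_len,
#                              iv, aad, aad_len, auth_tag, auth_tag_len);
#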
1524 ENTRY(aesni_gcm_enc_avx_gen2)
1525 GCM_ENC_DEC_AVX ENC
1526 ret
1527 ENDPROC(aesni_gcm_enc_avx_gen2)
1528
1529 ###############################################################################
1530 #void aesni_gcm_dec_avx_gen2(
1531 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1532 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1533 # const u8 *in, /* Ciphertext input */
#       u64     plaintext_len, /* Length of data in Bytes for decryption. */
1535 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1536 # (from Security Association) concatenated with 8 byte
1537 # Initialisation Vector (from IPSec ESP Payload)
1538 # concatenated with 0x00000001. 16-byte aligned pointer. */
1539 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1540 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1541 # u8 *auth_tag, /* Authenticated Tag output. */
#       u64     auth_tag_len); /* Authenticated Tag Length in bytes.
1543 # Valid values are 16 (most likely), 12 or 8. */
1544 ###############################################################################
1545 ENTRY(aesni_gcm_dec_avx_gen2)
1546 GCM_ENC_DEC_AVX DEC
1547 ret
1548 ENDPROC(aesni_gcm_dec_avx_gen2)
1549 #endif /* CONFIG_AS_AVX */
1550
1551 #ifdef CONFIG_AS_AVX2
1552 ###############################################################################
1553 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1554 # Input: A and B (128-bits each, bit-reflected)
1555 # Output: C = A*B*x mod poly, (i.e. >>1 )
1556 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1557 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1558 ###############################################################################
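#
# Note: unlike GHASH_MUL_AVX above, this variant computes all four 64x64-bit
# partial products directly (no Karatsuba split),
#       A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0   (four vpclmulqdq)
# and performs the reduction with carry-less multiplications by the POLY2
# constant rather than with the shift/XOR sequence used in the gen2 path.
#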
1559 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1560
1561 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1562 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1563 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1564 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1565 vpxor \T3, \GH, \GH
1566
1567
1568 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1569 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1570
1571 vpxor \T3, \T1, \T1
1572 vpxor \T2, \GH, \GH
1573
1574 #######################################################################
1575 #first phase of the reduction
1576 vmovdqa POLY2(%rip), \T3
1577
1578 vpclmulqdq $0x01, \GH, \T3, \T2
1579 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1580
1581 vpxor \T2, \GH, \GH # first phase of the reduction complete
1582 #######################################################################
1583 #second phase of the reduction
1584 vpclmulqdq $0x00, \GH, \T3, \T2
1585 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1586
1587 vpclmulqdq $0x10, \GH, \T3, \GH
1588 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1589
1590 vpxor \T2, \GH, \GH # second phase of the reduction complete
1591 #######################################################################
1592 vpxor \T1, \GH, \GH # the result is in GH
1593
1594
1595 .endm
1596
1597 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1598
        # precompute HashKey^2 through HashKey^8, each <<1 mod poly
1600 vmovdqa \HK, \T5
1601 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1602 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1603
1604 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1605 vmovdqa \T5, HashKey_3(arg1)
1606
1607 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1608 vmovdqa \T5, HashKey_4(arg1)
1609
1610 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1611 vmovdqa \T5, HashKey_5(arg1)
1612
1613 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1614 vmovdqa \T5, HashKey_6(arg1)
1615
1616 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1617 vmovdqa \T5, HashKey_7(arg1)
1618
1619 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1620 vmovdqa \T5, HashKey_8(arg1)
1621
1622 .endm
1623
1624
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8;
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
1631
1632 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1633 i = (8-\num_initial_blocks)
1634 setreg
1635
1636 mov arg6, %r10 # r10 = AAD
1637 mov arg7, %r12 # r12 = aadLen
1638
1639
1640 mov %r12, %r11
1641
1642 vpxor reg_i, reg_i, reg_i
1643 _get_AAD_loop\@:
1644 vmovd (%r10), \T1
1645 vpslldq $12, \T1, \T1
1646 vpsrldq $4, reg_i, reg_i
1647 vpxor \T1, reg_i, reg_i
1648
1649 add $4, %r10
1650 sub $4, %r12
1651 jg _get_AAD_loop\@
1652
1653
1654 cmp $16, %r11
1655 je _get_AAD_loop2_done\@
1656 mov $16, %r12
1657
1658 _get_AAD_loop2\@:
1659 vpsrldq $4, reg_i, reg_i
1660 sub $4, %r12
1661 cmp %r11, %r12
1662 jg _get_AAD_loop2\@
1663
1664 _get_AAD_loop2_done\@:
1665
1666 #byte-reflect the AAD data
1667 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1668
1669 # initialize the data pointer offset as zero
1670 xor %r11, %r11
1671
1672 # start AES for num_initial_blocks blocks
1673 mov arg5, %rax # rax = *Y0
1674 vmovdqu (%rax), \CTR # CTR = Y0
1675 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1676
1677
1678 i = (9-\num_initial_blocks)
1679 setreg
1680 .rep \num_initial_blocks
1681 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1682 vmovdqa \CTR, reg_i
1683 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1684 i = (i+1)
1685 setreg
1686 .endr
1687
1688 vmovdqa (arg1), \T_key
1689 i = (9-\num_initial_blocks)
1690 setreg
1691 .rep \num_initial_blocks
1692 vpxor \T_key, reg_i, reg_i
1693 i = (i+1)
1694 setreg
1695 .endr
1696
1697 j = 1
1698 setreg
1699 .rep 9
1700 vmovdqa 16*j(arg1), \T_key
1701 i = (9-\num_initial_blocks)
1702 setreg
1703 .rep \num_initial_blocks
1704 vaesenc \T_key, reg_i, reg_i
1705 i = (i+1)
1706 setreg
1707 .endr
1708
1709 j = (j+1)
1710 setreg
1711 .endr
1712
1713
1714 vmovdqa 16*10(arg1), \T_key
1715 i = (9-\num_initial_blocks)
1716 setreg
1717 .rep \num_initial_blocks
1718 vaesenclast \T_key, reg_i, reg_i
1719 i = (i+1)
1720 setreg
1721 .endr
1722
1723 i = (9-\num_initial_blocks)
1724 setreg
1725 .rep \num_initial_blocks
1726 vmovdqu (arg3, %r11), \T1
1727 vpxor \T1, reg_i, reg_i
1728 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1729 # num_initial_blocks blocks
1730 add $16, %r11
1731 .if \ENC_DEC == DEC
1732 vmovdqa \T1, reg_i
1733 .endif
1734 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1735 i = (i+1)
1736 setreg
1737 .endr
1738
1739
1740 i = (8-\num_initial_blocks)
1741 j = (9-\num_initial_blocks)
1742 setreg
1743 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1744
1745 .rep \num_initial_blocks
1746 vpxor reg_i, reg_j, reg_j
1747 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1748 i = (i+1)
1749 j = (j+1)
1750 setreg
1751 .endr
1752 # XMM8 has the combined result here
1753
1754 vmovdqa \XMM8, TMP1(%rsp)
1755 vmovdqa \XMM8, \T3
1756
1757 cmp $128, %r13
1758 jl _initial_blocks_done\@ # no need for precomputed constants
1759
1760 ###############################################################################
# prepare 8 counter blocks and encrypt them, for the next 8 blocks of data
1762 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1763 vmovdqa \CTR, \XMM1
1764 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1765
1766 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1767 vmovdqa \CTR, \XMM2
1768 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1769
1770 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1771 vmovdqa \CTR, \XMM3
1772 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1773
1774 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1775 vmovdqa \CTR, \XMM4
1776 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1777
1778 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1779 vmovdqa \CTR, \XMM5
1780 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1781
1782 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1783 vmovdqa \CTR, \XMM6
1784 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1785
1786 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1787 vmovdqa \CTR, \XMM7
1788 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1789
1790 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1791 vmovdqa \CTR, \XMM8
1792 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1793
1794 vmovdqa (arg1), \T_key
1795 vpxor \T_key, \XMM1, \XMM1
1796 vpxor \T_key, \XMM2, \XMM2
1797 vpxor \T_key, \XMM3, \XMM3
1798 vpxor \T_key, \XMM4, \XMM4
1799 vpxor \T_key, \XMM5, \XMM5
1800 vpxor \T_key, \XMM6, \XMM6
1801 vpxor \T_key, \XMM7, \XMM7
1802 vpxor \T_key, \XMM8, \XMM8
1803
1804 i = 1
1805 setreg
1806 .rep 9 # do 9 rounds
1807 vmovdqa 16*i(arg1), \T_key
1808 vaesenc \T_key, \XMM1, \XMM1
1809 vaesenc \T_key, \XMM2, \XMM2
1810 vaesenc \T_key, \XMM3, \XMM3
1811 vaesenc \T_key, \XMM4, \XMM4
1812 vaesenc \T_key, \XMM5, \XMM5
1813 vaesenc \T_key, \XMM6, \XMM6
1814 vaesenc \T_key, \XMM7, \XMM7
1815 vaesenc \T_key, \XMM8, \XMM8
1816 i = (i+1)
1817 setreg
1818 .endr
1819
1820
1821 vmovdqa 16*i(arg1), \T_key
1822 vaesenclast \T_key, \XMM1, \XMM1
1823 vaesenclast \T_key, \XMM2, \XMM2
1824 vaesenclast \T_key, \XMM3, \XMM3
1825 vaesenclast \T_key, \XMM4, \XMM4
1826 vaesenclast \T_key, \XMM5, \XMM5
1827 vaesenclast \T_key, \XMM6, \XMM6
1828 vaesenclast \T_key, \XMM7, \XMM7
1829 vaesenclast \T_key, \XMM8, \XMM8
1830
1831 vmovdqu (arg3, %r11), \T1
1832 vpxor \T1, \XMM1, \XMM1
1833 vmovdqu \XMM1, (arg2 , %r11)
1834 .if \ENC_DEC == DEC
1835 vmovdqa \T1, \XMM1
1836 .endif
1837
1838 vmovdqu 16*1(arg3, %r11), \T1
1839 vpxor \T1, \XMM2, \XMM2
1840 vmovdqu \XMM2, 16*1(arg2 , %r11)
1841 .if \ENC_DEC == DEC
1842 vmovdqa \T1, \XMM2
1843 .endif
1844
1845 vmovdqu 16*2(arg3, %r11), \T1
1846 vpxor \T1, \XMM3, \XMM3
1847 vmovdqu \XMM3, 16*2(arg2 , %r11)
1848 .if \ENC_DEC == DEC
1849 vmovdqa \T1, \XMM3
1850 .endif
1851
1852 vmovdqu 16*3(arg3, %r11), \T1
1853 vpxor \T1, \XMM4, \XMM4
1854 vmovdqu \XMM4, 16*3(arg2 , %r11)
1855 .if \ENC_DEC == DEC
1856 vmovdqa \T1, \XMM4
1857 .endif
1858
1859 vmovdqu 16*4(arg3, %r11), \T1
1860 vpxor \T1, \XMM5, \XMM5
1861 vmovdqu \XMM5, 16*4(arg2 , %r11)
1862 .if \ENC_DEC == DEC
1863 vmovdqa \T1, \XMM5
1864 .endif
1865
1866 vmovdqu 16*5(arg3, %r11), \T1
1867 vpxor \T1, \XMM6, \XMM6
1868 vmovdqu \XMM6, 16*5(arg2 , %r11)
1869 .if \ENC_DEC == DEC
1870 vmovdqa \T1, \XMM6
1871 .endif
1872
1873 vmovdqu 16*6(arg3, %r11), \T1
1874 vpxor \T1, \XMM7, \XMM7
1875 vmovdqu \XMM7, 16*6(arg2 , %r11)
1876 .if \ENC_DEC == DEC
1877 vmovdqa \T1, \XMM7
1878 .endif
1879
1880 vmovdqu 16*7(arg3, %r11), \T1
1881 vpxor \T1, \XMM8, \XMM8
1882 vmovdqu \XMM8, 16*7(arg2 , %r11)
1883 .if \ENC_DEC == DEC
1884 vmovdqa \T1, \XMM8
1885 .endif
1886
1887 add $128, %r11
1888
1889 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1890 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1891 # the corresponding ciphertext
1892 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1893 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1894 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1895 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1896 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1897 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1898 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1899
1900 ###############################################################################
1901
1902 _initial_blocks_done\@:
1903
1904
1905 .endm
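#######################################################################
# For reference, the counter/CTR handling that INITIAL_BLOCKS_AVX2
# applies to the first num_initial_blocks blocks corresponds to the C
# sketch below. Illustrative only: aes_encrypt_block() is a
# hypothetical stand-in for the vaesenc/vaesenclast round sequence.
#
#   #include <stdint.h>
#   #include <string.h>
#
#   /* hypothetical helper standing in for the AES round sequence */
#   void aes_encrypt_block(const uint8_t in[16], uint8_t out[16]);
#
#   /* generate and apply the CTR keystream for the first n (1..7) blocks */
#   void initial_blocks_sketch(const uint8_t y0[16], const uint8_t *in,
#                              uint8_t *out, unsigned n)
#   {
#           uint8_t ctr[16], ks[16];
#
#           memcpy(ctr, y0, 16);
#           for (unsigned b = 0; b < n; b++) {
#                   /* INCR Y0: 32-bit big-endian increment of the last word */
#                   for (int i = 15; i >= 12; i--)
#                           if (++ctr[i] != 0)
#                                   break;
#                   aes_encrypt_block(ctr, ks);
#                   for (int i = 0; i < 16; i++)
#                           out[16 * b + i] = in[16 * b + i] ^ ks[i];
#           }
#   }
#
# The assembly keeps all of the counter blocks in xmm registers and runs
# the AES rounds on them in parallel rather than one block at a time.
#######################################################################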
1906
1907
1908
1909 # encrypt 8 blocks at a time
1910 # ghash the 8 previously encrypted ciphertext blocks
1911 # arg1, arg2, arg3 are used as pointers only, not modified
1912 # r11 is the data offset value
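#
# The GHASH half of this macro builds each 256-bit carry-less product
# from the same four 64x64 products that the vpclmulqdq immediates
# below select (a1*b1, a0*b0, a1*b0, a0*b1). A minimal C sketch with
# PCLMUL intrinsics (compile with -mpclmul; names are illustrative):
#
#   #include <wmmintrin.h>
#
#   static void clmul_full(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
#   {
#           __m128i hh  = _mm_clmulepi64_si128(a, b, 0x11); /* a1*b1 */
#           __m128i ll  = _mm_clmulepi64_si128(a, b, 0x00); /* a0*b0 */
#           __m128i hl  = _mm_clmulepi64_si128(a, b, 0x01); /* a1*b0 */
#           __m128i lh  = _mm_clmulepi64_si128(a, b, 0x10); /* a0*b1 */
#           __m128i mid = _mm_xor_si128(hl, lh);
#           /* fold the middle 128 bits into the low and high halves */
#           *lo = _mm_xor_si128(ll, _mm_slli_si128(mid, 8));
#           *hi = _mm_xor_si128(hh, _mm_srli_si128(mid, 8));
#   }
#
# The macro accumulates these partial products across all eight blocks
# (one HashKey_i power per block) before performing a single reduction.
#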
1913 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1914
1915 vmovdqa \XMM1, \T2
1916 vmovdqa \XMM2, TMP2(%rsp)
1917 vmovdqa \XMM3, TMP3(%rsp)
1918 vmovdqa \XMM4, TMP4(%rsp)
1919 vmovdqa \XMM5, TMP5(%rsp)
1920 vmovdqa \XMM6, TMP6(%rsp)
1921 vmovdqa \XMM7, TMP7(%rsp)
1922 vmovdqa \XMM8, TMP8(%rsp)
1923
1924 .if \loop_idx == in_order
1925 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1926 vpaddd ONE(%rip), \XMM1, \XMM2
1927 vpaddd ONE(%rip), \XMM2, \XMM3
1928 vpaddd ONE(%rip), \XMM3, \XMM4
1929 vpaddd ONE(%rip), \XMM4, \XMM5
1930 vpaddd ONE(%rip), \XMM5, \XMM6
1931 vpaddd ONE(%rip), \XMM6, \XMM7
1932 vpaddd ONE(%rip), \XMM7, \XMM8
1933 vmovdqa \XMM8, \CTR
1934
1935 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1936 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1937 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1938 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1939 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1940 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1941 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1942 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1943 .else
1944 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1945 vpaddd ONEf(%rip), \XMM1, \XMM2
1946 vpaddd ONEf(%rip), \XMM2, \XMM3
1947 vpaddd ONEf(%rip), \XMM3, \XMM4
1948 vpaddd ONEf(%rip), \XMM4, \XMM5
1949 vpaddd ONEf(%rip), \XMM5, \XMM6
1950 vpaddd ONEf(%rip), \XMM6, \XMM7
1951 vpaddd ONEf(%rip), \XMM7, \XMM8
1952 vmovdqa \XMM8, \CTR
1953 .endif
1954
1955
1956 #######################################################################
1957
1958 vmovdqu (arg1), \T1
1959 vpxor \T1, \XMM1, \XMM1
1960 vpxor \T1, \XMM2, \XMM2
1961 vpxor \T1, \XMM3, \XMM3
1962 vpxor \T1, \XMM4, \XMM4
1963 vpxor \T1, \XMM5, \XMM5
1964 vpxor \T1, \XMM6, \XMM6
1965 vpxor \T1, \XMM7, \XMM7
1966 vpxor \T1, \XMM8, \XMM8
1967
1968 #######################################################################
1969
1970
1971
1972
1973
1974 vmovdqu 16*1(arg1), \T1
1975 vaesenc \T1, \XMM1, \XMM1
1976 vaesenc \T1, \XMM2, \XMM2
1977 vaesenc \T1, \XMM3, \XMM3
1978 vaesenc \T1, \XMM4, \XMM4
1979 vaesenc \T1, \XMM5, \XMM5
1980 vaesenc \T1, \XMM6, \XMM6
1981 vaesenc \T1, \XMM7, \XMM7
1982 vaesenc \T1, \XMM8, \XMM8
1983
1984 vmovdqu 16*2(arg1), \T1
1985 vaesenc \T1, \XMM1, \XMM1
1986 vaesenc \T1, \XMM2, \XMM2
1987 vaesenc \T1, \XMM3, \XMM3
1988 vaesenc \T1, \XMM4, \XMM4
1989 vaesenc \T1, \XMM5, \XMM5
1990 vaesenc \T1, \XMM6, \XMM6
1991 vaesenc \T1, \XMM7, \XMM7
1992 vaesenc \T1, \XMM8, \XMM8
1993
1994
1995 #######################################################################
1996
1997 vmovdqa HashKey_8(arg1), \T5
1998 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1999 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2000 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2001 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2002 vpxor \T5, \T6, \T6
2003
2004 vmovdqu 16*3(arg1), \T1
2005 vaesenc \T1, \XMM1, \XMM1
2006 vaesenc \T1, \XMM2, \XMM2
2007 vaesenc \T1, \XMM3, \XMM3
2008 vaesenc \T1, \XMM4, \XMM4
2009 vaesenc \T1, \XMM5, \XMM5
2010 vaesenc \T1, \XMM6, \XMM6
2011 vaesenc \T1, \XMM7, \XMM7
2012 vaesenc \T1, \XMM8, \XMM8
2013
2014 vmovdqa TMP2(%rsp), \T1
2015 vmovdqa HashKey_7(arg1), \T5
2016 vpclmulqdq $0x11, \T5, \T1, \T3
2017 vpxor \T3, \T4, \T4
2018
2019 vpclmulqdq $0x00, \T5, \T1, \T3
2020 vpxor \T3, \T7, \T7
2021
2022 vpclmulqdq $0x01, \T5, \T1, \T3
2023 vpxor \T3, \T6, \T6
2024
2025 vpclmulqdq $0x10, \T5, \T1, \T3
2026 vpxor \T3, \T6, \T6
2027
2028 vmovdqu 16*4(arg1), \T1
2029 vaesenc \T1, \XMM1, \XMM1
2030 vaesenc \T1, \XMM2, \XMM2
2031 vaesenc \T1, \XMM3, \XMM3
2032 vaesenc \T1, \XMM4, \XMM4
2033 vaesenc \T1, \XMM5, \XMM5
2034 vaesenc \T1, \XMM6, \XMM6
2035 vaesenc \T1, \XMM7, \XMM7
2036 vaesenc \T1, \XMM8, \XMM8
2037
2038 #######################################################################
2039
2040 vmovdqa TMP3(%rsp), \T1
2041 vmovdqa HashKey_6(arg1), \T5
2042 vpclmulqdq $0x11, \T5, \T1, \T3
2043 vpxor \T3, \T4, \T4
2044
2045 vpclmulqdq $0x00, \T5, \T1, \T3
2046 vpxor \T3, \T7, \T7
2047
2048 vpclmulqdq $0x01, \T5, \T1, \T3
2049 vpxor \T3, \T6, \T6
2050
2051 vpclmulqdq $0x10, \T5, \T1, \T3
2052 vpxor \T3, \T6, \T6
2053
2054 vmovdqu 16*5(arg1), \T1
2055 vaesenc \T1, \XMM1, \XMM1
2056 vaesenc \T1, \XMM2, \XMM2
2057 vaesenc \T1, \XMM3, \XMM3
2058 vaesenc \T1, \XMM4, \XMM4
2059 vaesenc \T1, \XMM5, \XMM5
2060 vaesenc \T1, \XMM6, \XMM6
2061 vaesenc \T1, \XMM7, \XMM7
2062 vaesenc \T1, \XMM8, \XMM8
2063
2064 vmovdqa TMP4(%rsp), \T1
2065 vmovdqa HashKey_5(arg1), \T5
2066 vpclmulqdq $0x11, \T5, \T1, \T3
2067 vpxor \T3, \T4, \T4
2068
2069 vpclmulqdq $0x00, \T5, \T1, \T3
2070 vpxor \T3, \T7, \T7
2071
2072 vpclmulqdq $0x01, \T5, \T1, \T3
2073 vpxor \T3, \T6, \T6
2074
2075 vpclmulqdq $0x10, \T5, \T1, \T3
2076 vpxor \T3, \T6, \T6
2077
2078 vmovdqu 16*6(arg1), \T1
2079 vaesenc \T1, \XMM1, \XMM1
2080 vaesenc \T1, \XMM2, \XMM2
2081 vaesenc \T1, \XMM3, \XMM3
2082 vaesenc \T1, \XMM4, \XMM4
2083 vaesenc \T1, \XMM5, \XMM5
2084 vaesenc \T1, \XMM6, \XMM6
2085 vaesenc \T1, \XMM7, \XMM7
2086 vaesenc \T1, \XMM8, \XMM8
2087
2088
2089 vmovdqa TMP5(%rsp), \T1
2090 vmovdqa HashKey_4(arg1), \T5
2091 vpclmulqdq $0x11, \T5, \T1, \T3
2092 vpxor \T3, \T4, \T4
2093
2094 vpclmulqdq $0x00, \T5, \T1, \T3
2095 vpxor \T3, \T7, \T7
2096
2097 vpclmulqdq $0x01, \T5, \T1, \T3
2098 vpxor \T3, \T6, \T6
2099
2100 vpclmulqdq $0x10, \T5, \T1, \T3
2101 vpxor \T3, \T6, \T6
2102
2103 vmovdqu 16*7(arg1), \T1
2104 vaesenc \T1, \XMM1, \XMM1
2105 vaesenc \T1, \XMM2, \XMM2
2106 vaesenc \T1, \XMM3, \XMM3
2107 vaesenc \T1, \XMM4, \XMM4
2108 vaesenc \T1, \XMM5, \XMM5
2109 vaesenc \T1, \XMM6, \XMM6
2110 vaesenc \T1, \XMM7, \XMM7
2111 vaesenc \T1, \XMM8, \XMM8
2112
2113 vmovdqa TMP6(%rsp), \T1
2114 vmovdqa HashKey_3(arg1), \T5
2115 vpclmulqdq $0x11, \T5, \T1, \T3
2116 vpxor \T3, \T4, \T4
2117
2118 vpclmulqdq $0x00, \T5, \T1, \T3
2119 vpxor \T3, \T7, \T7
2120
2121 vpclmulqdq $0x01, \T5, \T1, \T3
2122 vpxor \T3, \T6, \T6
2123
2124 vpclmulqdq $0x10, \T5, \T1, \T3
2125 vpxor \T3, \T6, \T6
2126
2127 vmovdqu 16*8(arg1), \T1
2128 vaesenc \T1, \XMM1, \XMM1
2129 vaesenc \T1, \XMM2, \XMM2
2130 vaesenc \T1, \XMM3, \XMM3
2131 vaesenc \T1, \XMM4, \XMM4
2132 vaesenc \T1, \XMM5, \XMM5
2133 vaesenc \T1, \XMM6, \XMM6
2134 vaesenc \T1, \XMM7, \XMM7
2135 vaesenc \T1, \XMM8, \XMM8
2136
2137 vmovdqa TMP7(%rsp), \T1
2138 vmovdqa HashKey_2(arg1), \T5
2139 vpclmulqdq $0x11, \T5, \T1, \T3
2140 vpxor \T3, \T4, \T4
2141
2142 vpclmulqdq $0x00, \T5, \T1, \T3
2143 vpxor \T3, \T7, \T7
2144
2145 vpclmulqdq $0x01, \T5, \T1, \T3
2146 vpxor \T3, \T6, \T6
2147
2148 vpclmulqdq $0x10, \T5, \T1, \T3
2149 vpxor \T3, \T6, \T6
2150
2151
2152 #######################################################################
2153
2154 vmovdqu 16*9(arg1), \T5
2155 vaesenc \T5, \XMM1, \XMM1
2156 vaesenc \T5, \XMM2, \XMM2
2157 vaesenc \T5, \XMM3, \XMM3
2158 vaesenc \T5, \XMM4, \XMM4
2159 vaesenc \T5, \XMM5, \XMM5
2160 vaesenc \T5, \XMM6, \XMM6
2161 vaesenc \T5, \XMM7, \XMM7
2162 vaesenc \T5, \XMM8, \XMM8
2163
2164 vmovdqa TMP8(%rsp), \T1
2165 vmovdqa HashKey(arg1), \T5
2166
2167 vpclmulqdq $0x00, \T5, \T1, \T3
2168 vpxor \T3, \T7, \T7
2169
2170 vpclmulqdq $0x01, \T5, \T1, \T3
2171 vpxor \T3, \T6, \T6
2172
2173 vpclmulqdq $0x10, \T5, \T1, \T3
2174 vpxor \T3, \T6, \T6
2175
2176 vpclmulqdq $0x11, \T5, \T1, \T3
2177 vpxor \T3, \T4, \T1
2178
2179
2180 vmovdqu 16*10(arg1), \T5
2181
2182 i = 0
2183 j = 1
2184 setreg
2185 .rep 8
2186 vpxor 16*i(arg3, %r11), \T5, \T2
2187 .if \ENC_DEC == ENC
2188 vaesenclast \T2, reg_j, reg_j
2189 .else
2190 vaesenclast \T2, reg_j, \T3
2191 vmovdqu 16*i(arg3, %r11), reg_j
2192 vmovdqu \T3, 16*i(arg2, %r11)
2193 .endif
2194 i = (i+1)
2195 j = (j+1)
2196 setreg
2197 .endr
2198 #######################################################################
2199
2200
2201 vpslldq $8, \T6, \T3 # shift-L T6 2 DWs (into T3)
2202 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2203 vpxor \T3, \T7, \T7
2204 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2205
2206
2207
2208 #######################################################################
2209 #first phase of the reduction
2210 vmovdqa POLY2(%rip), \T3
2211
2212 vpclmulqdq $0x01, \T7, \T3, \T2
2213 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2214
2215 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2216 #######################################################################
2217 .if \ENC_DEC == ENC
2218 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2219 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2220 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2221 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2222 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2223 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2224 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2225 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2226 .endif
2227
2228 #######################################################################
2229 #second phase of the reduction
2230 vpclmulqdq $0x00, \T7, \T3, \T2
2231 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2232
2233 vpclmulqdq $0x10, \T7, \T3, \T4
2234 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2235
2236 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2237 #######################################################################
2238 vpxor \T4, \T1, \T1 # the result is in T1
2239
2240 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2241 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2242 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2243 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2244 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2245 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2246 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2247 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2248
2249
2250 vpxor \T1, \XMM1, \XMM1
2251
2252
2253
2254 .endm
2255
2256
2257 # GHASH the last 8 ciphertext blocks.
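#
# This macro uses the Karatsuba trick: three carry-less multiplies per
# block instead of four, recovering the middle term from
# (a1^a0)*(b1^b0). A minimal C sketch with PCLMUL intrinsics
# (illustrative only; compile with -mpclmul):
#
#   #include <wmmintrin.h>
#
#   static void clmul_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
#   {
#           /* vpshufd $0b01001110 swaps the two 64-bit halves */
#           __m128i af  = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
#           __m128i bf  = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
#           __m128i hh  = _mm_clmulepi64_si128(a,  b,  0x11);   /* a1*b1 */
#           __m128i ll  = _mm_clmulepi64_si128(a,  b,  0x00);   /* a0*b0 */
#           __m128i mid = _mm_clmulepi64_si128(af, bf, 0x00);   /* (a1^a0)*(b1^b0) */
#           mid = _mm_xor_si128(mid, hh);
#           mid = _mm_xor_si128(mid, ll);                       /* = a1*b0 ^ a0*b1 */
#           *lo = _mm_xor_si128(ll, _mm_slli_si128(mid, 8));
#           *hi = _mm_xor_si128(hh, _mm_srli_si128(mid, 8));
#   }
#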
2258 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2259
2260 ## Karatsuba Method
2261
2262 vmovdqa HashKey_8(arg1), \T5
2263
2264 vpshufd $0b01001110, \XMM1, \T2
2265 vpshufd $0b01001110, \T5, \T3
2266 vpxor \XMM1, \T2, \T2
2267 vpxor \T5, \T3, \T3
2268
2269 vpclmulqdq $0x11, \T5, \XMM1, \T6
2270 vpclmulqdq $0x00, \T5, \XMM1, \T7
2271
2272 vpclmulqdq $0x00, \T3, \T2, \XMM1
2273
2274 ######################
2275
2276 vmovdqa HashKey_7(arg1), \T5
2277 vpshufd $0b01001110, \XMM2, \T2
2278 vpshufd $0b01001110, \T5, \T3
2279 vpxor \XMM2, \T2, \T2
2280 vpxor \T5, \T3, \T3
2281
2282 vpclmulqdq $0x11, \T5, \XMM2, \T4
2283 vpxor \T4, \T6, \T6
2284
2285 vpclmulqdq $0x00, \T5, \XMM2, \T4
2286 vpxor \T4, \T7, \T7
2287
2288 vpclmulqdq $0x00, \T3, \T2, \T2
2289
2290 vpxor \T2, \XMM1, \XMM1
2291
2292 ######################
2293
2294 vmovdqa HashKey_6(arg1), \T5
2295 vpshufd $0b01001110, \XMM3, \T2
2296 vpshufd $0b01001110, \T5, \T3
2297 vpxor \XMM3, \T2, \T2
2298 vpxor \T5, \T3, \T3
2299
2300 vpclmulqdq $0x11, \T5, \XMM3, \T4
2301 vpxor \T4, \T6, \T6
2302
2303 vpclmulqdq $0x00, \T5, \XMM3, \T4
2304 vpxor \T4, \T7, \T7
2305
2306 vpclmulqdq $0x00, \T3, \T2, \T2
2307
2308 vpxor \T2, \XMM1, \XMM1
2309
2310 ######################
2311
2312 vmovdqa HashKey_5(arg1), \T5
2313 vpshufd $0b01001110, \XMM4, \T2
2314 vpshufd $0b01001110, \T5, \T3
2315 vpxor \XMM4, \T2, \T2
2316 vpxor \T5, \T3, \T3
2317
2318 vpclmulqdq $0x11, \T5, \XMM4, \T4
2319 vpxor \T4, \T6, \T6
2320
2321 vpclmulqdq $0x00, \T5, \XMM4, \T4
2322 vpxor \T4, \T7, \T7
2323
2324 vpclmulqdq $0x00, \T3, \T2, \T2
2325
2326 vpxor \T2, \XMM1, \XMM1
2327
2328 ######################
2329
2330 vmovdqa HashKey_4(arg1), \T5
2331 vpshufd $0b01001110, \XMM5, \T2
2332 vpshufd $0b01001110, \T5, \T3
2333 vpxor \XMM5, \T2, \T2
2334 vpxor \T5, \T3, \T3
2335
2336 vpclmulqdq $0x11, \T5, \XMM5, \T4
2337 vpxor \T4, \T6, \T6
2338
2339 vpclmulqdq $0x00, \T5, \XMM5, \T4
2340 vpxor \T4, \T7, \T7
2341
2342 vpclmulqdq $0x00, \T3, \T2, \T2
2343
2344 vpxor \T2, \XMM1, \XMM1
2345
2346 ######################
2347
2348 vmovdqa HashKey_3(arg1), \T5
2349 vpshufd $0b01001110, \XMM6, \T2
2350 vpshufd $0b01001110, \T5, \T3
2351 vpxor \XMM6, \T2, \T2
2352 vpxor \T5, \T3, \T3
2353
2354 vpclmulqdq $0x11, \T5, \XMM6, \T4
2355 vpxor \T4, \T6, \T6
2356
2357 vpclmulqdq $0x00, \T5, \XMM6, \T4
2358 vpxor \T4, \T7, \T7
2359
2360 vpclmulqdq $0x00, \T3, \T2, \T2
2361
2362 vpxor \T2, \XMM1, \XMM1
2363
2364 ######################
2365
2366 vmovdqa HashKey_2(arg1), \T5
2367 vpshufd $0b01001110, \XMM7, \T2
2368 vpshufd $0b01001110, \T5, \T3
2369 vpxor \XMM7, \T2, \T2
2370 vpxor \T5, \T3, \T3
2371
2372 vpclmulqdq $0x11, \T5, \XMM7, \T4
2373 vpxor \T4, \T6, \T6
2374
2375 vpclmulqdq $0x00, \T5, \XMM7, \T4
2376 vpxor \T4, \T7, \T7
2377
2378 vpclmulqdq $0x00, \T3, \T2, \T2
2379
2380 vpxor \T2, \XMM1, \XMM1
2381
2382 ######################
2383
2384 vmovdqa HashKey(arg1), \T5
2385 vpshufd $0b01001110, \XMM8, \T2
2386 vpshufd $0b01001110, \T5, \T3
2387 vpxor \XMM8, \T2, \T2
2388 vpxor \T5, \T3, \T3
2389
2390 vpclmulqdq $0x11, \T5, \XMM8, \T4
2391 vpxor \T4, \T6, \T6
2392
2393 vpclmulqdq $0x00, \T5, \XMM8, \T4
2394 vpxor \T4, \T7, \T7
2395
2396 vpclmulqdq $0x00, \T3, \T2, \T2
2397
2398 vpxor \T2, \XMM1, \XMM1
2399 vpxor \T6, \XMM1, \XMM1
2400 vpxor \T7, \XMM1, \T2
2401
2402
2403
2404
2405 vpslldq $8, \T2, \T4
2406 vpsrldq $8, \T2, \T2
2407
2408 vpxor \T4, \T7, \T7
2409 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2410 # accumulated carry-less multiplications
2411
2412 #######################################################################
2413 #first phase of the reduction
2414 vmovdqa POLY2(%rip), \T3
2415
2416 vpclmulqdq $0x01, \T7, \T3, \T2
2417 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2418
2419 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2420 #######################################################################
2421
2422
2423 #second phase of the reduction
2424 vpclmulqdq $0x00, \T7, \T3, \T2
2425 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2426
2427 vpclmulqdq $0x10, \T7, \T3, \T4
2428 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2429
2430 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2431 #######################################################################
2432 vpxor \T4, \T6, \T6 # the result is in T6
2433 .endm
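#######################################################################
# All of the PCLMUL and reduction code above implements multiplication
# in GF(2^128) as defined for GHASH. A plain-C, bit-by-bit reference
# (per NIST SP 800-38D; big-endian 64-bit halves, not the shifted
# HashKey representation this file stores) that can be used to check
# results:
#
#   #include <stdint.h>
#
#   typedef struct { uint64_t hi, lo; } be128;   /* hi = bytes 0..7 */
#
#   static be128 gf128_mul(be128 x, be128 y)
#   {
#           be128 z = { 0, 0 }, v = y;
#
#           for (int i = 0; i < 128; i++) {
#                   uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
#                                           : (x.lo >> (127 - i)) & 1;
#                   if (bit) { z.hi ^= v.hi; z.lo ^= v.lo; }
#                   uint64_t lsb = v.lo & 1;
#                   v.lo = (v.lo >> 1) | (v.hi << 63);
#                   v.hi >>= 1;
#                   if (lsb)
#                           v.hi ^= 0xe100000000000000ULL;  /* R from SP 800-38D */
#           }
#           return z;
#   }
#######################################################################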
2434
2435
2436
2437 # combined macro for the GCM encrypt and decrypt functions
2438 # clobbering all xmm registers
2439 # clobbering r10, r11, r12, r13, r14, r15
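#
# Rough control flow of this macro in C-like form (helper names are
# illustrative stand-ins for the macros used below, not real functions):
#
#   void gcm_enc_dec_sketch(uint64_t len /* = arg4, in bytes */)
#   {
#           uint64_t full = len & ~15ULL;        /* whole 16-byte blocks */
#           uint64_t lead = (full >> 4) & 7;     /* 0..7 blocks for INITIAL_BLOCKS_AVX2 */
#           uint64_t left = full - lead * 16;    /* always a multiple of 128 bytes */
#
#           initial_blocks(lead);                /* also consumes the first 128 bytes
#                                                   of 'left' when left != 0 */
#           if (left) {
#                   for (left -= 128; left; left -= 128)
#                           encrypt_8_and_ghash_previous_8();
#                   ghash_last_8();              /* GHASH_LAST_8_AVX2 */
#           }
#           if (len & 15)
#                   handle_partial_block();      /* the <16-byte tail below */
#           finalize_tag();                      /* len(A)||len(C), E(K,Y0) ^ GHASH */
#   }
#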
2440 .macro GCM_ENC_DEC_AVX2 ENC_DEC
2441
2442 #the number of pushes must equal STACK_OFFSET
2443 push %r12
2444 push %r13
2445 push %r14
2446 push %r15
2447
2448 mov %rsp, %r14
2449
2450
2451
2452
2453 sub $VARIABLE_OFFSET, %rsp
2454 and $~63, %rsp # align rsp to 64 bytes
2455
2456
2457 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
2458
2459 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
2460 and $-16, %r13 # r13 = r13 - (r13 mod 16)
2461
2462 mov %r13, %r12
2463 shr $4, %r12
2464 and $7, %r12
2465 jz _initial_num_blocks_is_0\@
2466
2467 cmp $7, %r12
2468 je _initial_num_blocks_is_7\@
2469 cmp $6, %r12
2470 je _initial_num_blocks_is_6\@
2471 cmp $5, %r12
2472 je _initial_num_blocks_is_5\@
2473 cmp $4, %r12
2474 je _initial_num_blocks_is_4\@
2475 cmp $3, %r12
2476 je _initial_num_blocks_is_3\@
2477 cmp $2, %r12
2478 je _initial_num_blocks_is_2\@
2479
2480 jmp _initial_num_blocks_is_1\@
2481
2482 _initial_num_blocks_is_7\@:
2483 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2484 sub $16*7, %r13
2485 jmp _initial_blocks_encrypted\@
2486
2487 _initial_num_blocks_is_6\@:
2488 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2489 sub $16*6, %r13
2490 jmp _initial_blocks_encrypted\@
2491
2492 _initial_num_blocks_is_5\@:
2493 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2494 sub $16*5, %r13
2495 jmp _initial_blocks_encrypted\@
2496
2497 _initial_num_blocks_is_4\@:
2498 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2499 sub $16*4, %r13
2500 jmp _initial_blocks_encrypted\@
2501
2502 _initial_num_blocks_is_3\@:
2503 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2504 sub $16*3, %r13
2505 jmp _initial_blocks_encrypted\@
2506
2507 _initial_num_blocks_is_2\@:
2508 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2509 sub $16*2, %r13
2510 jmp _initial_blocks_encrypted\@
2511
2512 _initial_num_blocks_is_1\@:
2513 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2514 sub $16*1, %r13
2515 jmp _initial_blocks_encrypted\@
2516
2517 _initial_num_blocks_is_0\@:
2518 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2519
2520
2521 _initial_blocks_encrypted\@:
2522 cmp $0, %r13
2523 je _zero_cipher_left\@
2524
2525 sub $128, %r13
2526 je _eight_cipher_left\@
2527
2528
2529
2530
2531 vmovd %xmm9, %r15d # low dword of the (byte-swapped) counter
2532 and $255, %r15d # r15d = low counter byte, used to detect an 8-bit carry
2533 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2534
2535
2536 _encrypt_by_8_new\@:
2537 cmp $(255-8), %r15d # would adding 8 carry out of the low counter byte?
2538 jg _encrypt_by_8\@
2539
2540
2541
2542 add $8, %r15b
2543 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2544 add $128, %r11
2545 sub $128, %r13
2546 jne _encrypt_by_8_new\@
2547
2548 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2549 jmp _eight_cipher_left\@
2550
2551 _encrypt_by_8\@:
2552 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2553 add $8, %r15b
2554 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2555 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2556 add $128, %r11
2557 sub $128, %r13
2558 jne _encrypt_by_8_new\@
2559
2560 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2561
2562
2563
2564
2565 _eight_cipher_left\@:
2566 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2567
2568
2569 _zero_cipher_left\@:
2570 cmp $16, arg4
2571 jl _only_less_than_16\@
2572
2573 mov arg4, %r13
2574 and $15, %r13 # r13 = (arg4 mod 16)
2575
2576 je _multiple_of_16_bytes\@
2577
2578 # handle the last <16 Byte block separately
2579
2580
2581 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2582 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2583 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2584
2585 sub $16, %r11
2586 add %r13, %r11
2587 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
2588
2589 lea SHIFT_MASK+16(%rip), %r12
2590 sub %r13, %r12 # adjust the shuffle mask pointer
2591 # to be able to shift 16-r13 bytes
2592 # (r13 is the number of bytes in plaintext mod 16)
2593 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
2594 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
2595 jmp _final_ghash_mul\@
2596
2597 _only_less_than_16\@:
2598 # check for 0 length
2599 mov arg4, %r13
2600 and $15, %r13 # r13 = (arg4 mod 16)
2601
2602 je _multiple_of_16_bytes\@
2603
2604 # handle the last <16 Byte block separately
2605
2606
2607 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2608 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2609 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2610
2611
2612 lea SHIFT_MASK+16(%rip), %r12
2613 sub %r13, %r12 # adjust the shuffle mask pointer to be
2614 # able to shift 16-r13 bytes (r13 is the
2615 # number of bytes in plaintext mod 16)
2616
2617 _get_last_16_byte_loop\@:
2618 movb (arg3, %r11), %al
2619 movb %al, TMP1 (%rsp , %r11)
2620 add $1, %r11
2621 cmp %r13, %r11
2622 jne _get_last_16_byte_loop\@
2623
2624 vmovdqu TMP1(%rsp), %xmm1
2625
2626 sub $16, %r11
2627
2628 _final_ghash_mul\@:
2629 .if \ENC_DEC == DEC
2630 vmovdqa %xmm1, %xmm2
2631 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2632 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2633 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2634 vpand %xmm1, %xmm2, %xmm2
2635 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2636 vpxor %xmm2, %xmm14, %xmm14
2637 #GHASH computation for the last <16 Byte block
2638 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2639 sub %r13, %r11
2640 add $16, %r11
2641 .else
2642 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2643 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2644 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2645 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2646 vpxor %xmm9, %xmm14, %xmm14
2647 #GHASH computation for the last <16 Byte block
2648 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2649 sub %r13, %r11
2650 add $16, %r11
2651 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
2652 .endif
2653
2654
2655 #############################
2656 # output r13 Bytes
2657 vmovq %xmm9, %rax
2658 cmp $8, %r13
2659 jle _less_than_8_bytes_left\@
2660
2661 mov %rax, (arg2 , %r11)
2662 add $8, %r11
2663 vpsrldq $8, %xmm9, %xmm9
2664 vmovq %xmm9, %rax
2665 sub $8, %r13
2666
2667 _less_than_8_bytes_left\@:
2668 movb %al, (arg2 , %r11)
2669 add $1, %r11
2670 shr $8, %rax
2671 sub $1, %r13
2672 jne _less_than_8_bytes_left\@
2673 #############################
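#############################
# The <16-byte tail handled above pads the last r13 input bytes to a
# full block, masks the unused keystream bytes, folds the padded block
# into GHASH and stores exactly r13 output bytes. A C sketch of the net
# effect on the encrypt side (helper names are illustrative;
# ghash_update() stands for GHASH_MUL_AVX2 on the running hash):
#
#   #include <stdint.h>
#   #include <string.h>
#
#   void ghash_update(const uint8_t block[16]);  /* hypothetical */
#
#   void tail_encrypt_sketch(const uint8_t *in, uint8_t *out, size_t r13,
#                            const uint8_t ks[16] /* = E(K, Yn) */)
#   {
#           uint8_t block[16] = { 0 };
#
#           memcpy(block, in, r13);              /* pad with zeroes */
#           for (size_t i = 0; i < r13; i++)
#                   block[i] ^= ks[i];           /* keep only r13 keystream bytes */
#           ghash_update(block);                 /* fold padded ciphertext into the hash */
#           memcpy(out, block, r13);             /* write r13 bytes of ciphertext */
#   }
#############################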
2674
2675 _multiple_of_16_bytes\@:
2676 mov arg7, %r12 # r12 = aadLen (number of bytes)
2677 shl $3, %r12 # convert into number of bits
2678 vmovd %r12d, %xmm15 # len(A) in xmm15
2679
2680 shl $3, arg4 # len(C) in bits (*8)
2681 vmovq arg4, %xmm1
2682 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
2683 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
2684
2685 vpxor %xmm15, %xmm14, %xmm14
2686 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
2687 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
2688
2689 mov arg5, %rax # rax = *Y0
2690 vmovdqu (%rax), %xmm9 # xmm9 = Y0
2691
2692 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
2693
2694 vpxor %xmm14, %xmm9, %xmm9
2695
2696
2697
2698 _return_T\@:
2699 mov arg8, %r10 # r10 = authTag
2700 mov arg9, %r11 # r11 = auth_tag_len
2701
2702 cmp $16, %r11
2703 je _T_16\@
2704
2705 cmp $12, %r11
2706 je _T_12\@
2707
2708 _T_8\@:
2709 vmovq %xmm9, %rax
2710 mov %rax, (%r10)
2711 jmp _return_T_done\@
2712 _T_12\@:
2713 vmovq %xmm9, %rax
2714 mov %rax, (%r10)
2715 vpsrldq $8, %xmm9, %xmm9
2716 vmovd %xmm9, %eax
2717 mov %eax, 8(%r10)
2718 jmp _return_T_done\@
2719
2720 _T_16\@:
2721 vmovdqu %xmm9, (%r10)
2722
2723 _return_T_done\@:
2724 mov %r14, %rsp
2725
2726 pop %r15
2727 pop %r14
2728 pop %r13
2729 pop %r12
2730 .endm
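#######################################################################
# Tag finalization, as performed at the end of the macro above, in C
# form. Illustrative only: gf128_mul() and be128 are the reference
# routine sketched earlier, aes_encrypt_block() stands in for
# ENCRYPT_SINGLE_BLOCK, and load_be64() is a hypothetical big-endian
# 64-bit load.
#
#   /* hash: running GHASH value; returns the untruncated 16-byte tag */
#   be128 finalize_tag_sketch(be128 hash, be128 hashkey,
#                             uint64_t aad_len, uint64_t text_len,
#                             const uint8_t y0[16])
#   {
#           be128 lenblk = { aad_len * 8, text_len * 8 };   /* len(A) || len(C), bits */
#           uint8_t ek_y0[16];
#
#           hash.hi ^= lenblk.hi;
#           hash.lo ^= lenblk.lo;
#           hash = gf128_mul(hash, hashkey);                /* final GHASH step */
#
#           aes_encrypt_block(y0, ek_y0);                   /* E(K, Y0) */
#           hash.hi ^= load_be64(ek_y0);
#           hash.lo ^= load_be64(ek_y0 + 8);
#           return hash;        /* caller stores 16, 12 or 8 bytes of this */
#   }
#######################################################################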
2731
2732
2733 #############################################################
2734 #void aesni_gcm_precomp_avx_gen4
2735 # (gcm_data *my_ctx_data,
2736 # u8 *hash_subkey)# /* H, the Hash sub key input.
2737 # Data starts on a 16-byte boundary. */
2738 #############################################################
2739 ENTRY(aesni_gcm_precomp_avx_gen4)
2740 #the number of pushes must equal STACK_OFFSET
2741 push %r12
2742 push %r13
2743 push %r14
2744 push %r15
2745
2746 mov %rsp, %r14
2747
2748
2749
2750 sub $VARIABLE_OFFSET, %rsp
2751 and $~63, %rsp # align rsp to 64 bytes
2752
2753 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
2754
2755 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2756 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2757 vmovdqa %xmm6, %xmm2
2758 vpsllq $1, %xmm6, %xmm6
2759 vpsrlq $63, %xmm2, %xmm2
2760 vmovdqa %xmm2, %xmm1
2761 vpslldq $8, %xmm2, %xmm2
2762 vpsrldq $8, %xmm1, %xmm1
2763 vpor %xmm2, %xmm6, %xmm6
2764 #reduction
2765 vpshufd $0b00100100, %xmm1, %xmm2
2766 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2767 vpand POLY(%rip), %xmm2, %xmm2
2768 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
2769 #######################################################################
2770 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
2771
2772
2773 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2774
2775 mov %r14, %rsp
2776
2777 pop %r15
2778 pop %r14
2779 pop %r13
2780 pop %r12
2781 ret
2782 ENDPROC(aesni_gcm_precomp_avx_gen4)
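#######################################################################
# The precompute above stores HashKey<<1 mod poly rather than H itself,
# so that the POLY2-based reductions in the multiply macros can avoid an
# extra one-bit shift of every product. In C form (hi/lo are the
# byte-swapped halves of H; POLY_HI/POLY_LO are stand-ins for the POLY
# constant defined earlier in this file):
#
#   #include <stdint.h>
#
#   extern const uint64_t POLY_HI, POLY_LO;      /* stand-ins for POLY */
#
#   static void hashkey_shl1_mod_poly(uint64_t *hi, uint64_t *lo)
#   {
#           uint64_t carry = *hi >> 63;          /* bit 127 of H */
#
#           *hi = (*hi << 1) | (*lo >> 63);      /* 128-bit shift left by one */
#           *lo = *lo << 1;
#           if (carry) {                         /* conditional reduction */
#                   *hi ^= POLY_HI;
#                   *lo ^= POLY_LO;
#           }
#   }
#######################################################################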
2783
2784
2785 ###############################################################################
2786 #void aesni_gcm_enc_avx_gen4(
2787 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2788 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2789 # const u8 *in, /* Plaintext input */
2790 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
2791 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2792 # (from Security Association) concatenated with 8 byte
2793 # Initialisation Vector (from IPSec ESP Payload)
2794 # concatenated with 0x00000001. 16-byte aligned pointer. */
2795 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2796 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2797 # u8 *auth_tag, /* Authenticated Tag output. */
2798 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2799 # Valid values are 16 (most likely), 12 or 8. */
2800 ###############################################################################
2801 ENTRY(aesni_gcm_enc_avx_gen4)
2802 GCM_ENC_DEC_AVX2 ENC
2803 ret
2804 ENDPROC(aesni_gcm_enc_avx_gen4)
2805
2806 ###############################################################################
2807 #void aesni_gcm_dec_avx_gen4(
2808 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2809 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2810 # const u8 *in, /* Ciphertext input */
2811 # u64 plaintext_len, /* Length of data in Bytes for decryption. */
2812 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2813 # (from Security Association) concatenated with 8 byte
2814 # Initialisation Vector (from IPSec ESP Payload)
2815 # concatenated with 0x00000001. 16-byte aligned pointer. */
2816 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2817 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2818 # u8 *auth_tag, /* Authenticated Tag output. */
2819 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2820 # Valid values are 16 (most likely), 12 or 8. */
2821 ###############################################################################
2822 ENTRY(aesni_gcm_dec_avx_gen4)
2823 GCM_ENC_DEC_AVX2 DEC
2824 ret
2825 ENDPROC(aesni_gcm_dec_avx_gen4)
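###############################################################################
# A minimal C caller sketch for the entry points above, following the
# prototypes given in their comments (u8/u64 and the aligned gcm_data
# context come from the in-kernel caller; AES key expansion into the
# context and computation of the hash subkey H = E(K, 0^128) are done by
# the caller beforehand and omitted here):
#
#   extern void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
#   extern void aesni_gcm_enc_avx_gen4(void *my_ctx_data, u8 *out, const u8 *in,
#                                      u64 plaintext_len, u8 *iv, const u8 *aad,
#                                      u64 aad_len, u8 *auth_tag, u64 auth_tag_len);
#
#   void gcm_encrypt_sketch(void *ctx, u8 *hash_subkey, u8 *iv,
#                           const u8 *aad, u64 aad_len,
#                           const u8 *pt, u8 *ct, u64 len, u8 tag[16])
#   {
#           aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
#           aesni_gcm_enc_avx_gen4(ctx, ct, pt, len, iv, aad, aad_len, tag, 16);
#   }
###############################################################################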
2826
2827 #endif /* CONFIG_AS_AVX2 */