1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
3 #
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
9 #
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
12 # met:
13 #
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
16 #
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
20 # distribution.
21 #
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
25 #
26 #
27 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
39 ##
40 ## Authors:
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
45 ##
46 ## References:
47 ## This code was derived and highly optimized from the code described in the paper:
48 ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation are explained in:
51 ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
53 ##
54 ## Assumptions:
55 ##
56 ##
57 ##
58 ## iv:
59 ## 0 1 2 3
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ## | 0x1 |
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69 ##
70 ##
71 ##
72 ## AAD:
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
75 ##
76 ## if AAD is 8 bytes:
77 ## AAD[3] = {A0, A1};
78 ## padded AAD in xmm register = {A1 A0 0 0}
79 ##
80 ## 0 1 2 3
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ## | SPI (A1) |
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ## | 0x0 |
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 ##
90 ## AAD Format with 32-bit Sequence Number
91 ##
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
95 ##
96 ## 0 1 2 3
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ## | SPI (A2) |
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
102 ## | |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ## | 0x0 |
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 ##
107 ## AAD Format with 64-bit Extended Sequence Number
108 ##
109 ##
110 ## aadLen:
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
113 ##
114 ## TLen:
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116 ##
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one-tab and two-tab indentations are used. one tab is
119 ## for the GHASH part, two tabs are for the AES part.
120 ##
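##
## Illustrative only (not used by the assembly below): a minimal C sketch, under
## the assumptions above, of how the pre-counter block (salt || IV || 0x00000001)
## and the zero-padded AAD block can be laid out.  The helper names are
## hypothetical.
##
##	#include <stdint.h>
##	#include <string.h>
##
##	static void build_pre_counter(uint8_t j0[16], const uint8_t salt[4],
##				      const uint8_t iv[8])
##	{
##		memcpy(j0, salt, 4);		/* Salt (from the SA)           */
##		memcpy(j0 + 4, iv, 8);		/* IV/sequence number from ESP  */
##		j0[12] = 0; j0[13] = 0;		/* trailing 32-bit 0x00000001   */
##		j0[14] = 0; j0[15] = 1;
##	}
##
##	static void pad_aad_block(uint8_t block[16], const uint8_t *aad, size_t len)
##	{
##		memset(block, 0, 16);		/* AAD padded to 128 bits with 0 */
##		memcpy(block, aad, len);	/* len is 8, 12 or 16 here       */
##	}
##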
121
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
124
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 .align 16
128 POLY: .octa 0xC2000000000000000000000000000001
129
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 .align 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
133
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 .align 16
136 TWOONE: .octa 0x00000001000000000000000000000001
137
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 .align 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 .align 16
144 ONE: .octa 0x00000000000000000000000000000001
145
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 .align 16
148 ONEf: .octa 0x01000000000000000000000000000000
149
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
153 .align 16
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
157
158 .section .rodata
159 .align 16
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
162 aad_shift_arr:
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
180
181
182 .text
183
184
185 ## define the fields of the gcm aes context
186 #{
187 # u8 expanded_keys[16*11] store expanded keys
188 # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
189 # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
190 # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
191 # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
192 # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
193 # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
194 # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
195 # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
196 # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
197 # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
198 # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
199 # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
200 # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
201 # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
202 # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
203 # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
204 #} gcm_ctx;
205
206 HashKey = 16*11 # store HashKey <<1 mod poly here
207 HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
208 HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
209 HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
210 HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
211 HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
212 HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
213 HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
214 HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
215 HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
216 HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
217 HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
218 HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
219 HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
220 HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
221 HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
222
223 #define arg1 %rdi
224 #define arg2 %rsi
225 #define arg3 %rdx
226 #define arg4 %rcx
227 #define arg5 %r8
228 #define arg6 %r9
229 #define arg7 STACK_OFFSET+8*1(%r14)
230 #define arg8 STACK_OFFSET+8*2(%r14)
231 #define arg9 STACK_OFFSET+8*3(%r14)
232
233 i = 0
234 j = 0
235
236 out_order = 0
237 in_order = 1
238 DEC = 0
239 ENC = 1
240
241 .macro define_reg r n
242 reg_\r = %xmm\n
243 .endm
244
245 .macro setreg
246 .altmacro
247 define_reg i %i
248 define_reg j %j
249 .noaltmacro
250 .endm
251
252 # 4 registers are pushed onto the stack by the callers below; STACK_OFFSET accounts for them when arg7..arg9 are read via %r14
253 STACK_OFFSET = 8*4
254
255 TMP1 = 16*0 # Temporary storage for AAD
256 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
257 TMP3 = 16*2 # Temporary storage for AES State 3
258 TMP4 = 16*3 # Temporary storage for AES State 4
259 TMP5 = 16*4 # Temporary storage for AES State 5
260 TMP6 = 16*5 # Temporary storage for AES State 6
261 TMP7 = 16*6 # Temporary storage for AES State 7
262 TMP8 = 16*7 # Temporary storage for AES State 8
263
264 VARIABLE_OFFSET = 16*8
265
266 ################################
267 # Utility Macros
268 ################################
269
270 # Encryption of a single block (AES-128: 10 rounds, key schedule at arg1)
271 .macro ENCRYPT_SINGLE_BLOCK XMM0
272 vpxor (arg1), \XMM0, \XMM0
273 i = 1
274 setreg
275 .rep 9
276 vaesenc 16*i(arg1), \XMM0, \XMM0
277 i = (i+1)
278 setreg
279 .endr
280 vaesenclast 16*10(arg1), \XMM0, \XMM0
281 .endm
282
283 #ifdef CONFIG_AS_AVX
284 ###############################################################################
285 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
286 # Input: A and B (128-bits each, bit-reflected)
287 # Output: C = A*B*x mod poly, (i.e. >>1 )
288 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
289 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
290 ###############################################################################
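###############################################################################
# For reference only: a bit-at-a-time GHASH multiply in C, following the
# NIST SP 800-38D formulation (bit 0 = most significant bit of the block).
# It is a sketch to check results against, not the bit-reflected
# PCLMULQDQ/Karatsuba formulation implemented by the macro below.
#
#	#include <stdint.h>
#
#	struct be128 { uint64_t hi, lo; };  /* hi = bytes 0..7, lo = bytes 8..15 */
#
#	static struct be128 gf128_mul(struct be128 x, struct be128 y)
#	{
#		/* R = 0xe1 || 0^120, i.e. x^128 + x^7 + x^2 + x + 1 */
#		const uint64_t R = 0xe100000000000000ULL;
#		struct be128 z = { 0, 0 }, v = y;
#		int i;
#
#		for (i = 0; i < 128; i++) {
#			uint64_t xi = (i < 64) ? (x.hi >> (63 - i)) & 1
#					       : (x.lo >> (127 - i)) & 1;
#			if (xi) {
#				z.hi ^= v.hi;
#				z.lo ^= v.lo;
#			}
#			if (v.lo & 1) {		/* reduce when a bit falls off */
#				v.lo = (v.lo >> 1) | (v.hi << 63);
#				v.hi = (v.hi >> 1) ^ R;
#			} else {
#				v.lo = (v.lo >> 1) | (v.hi << 63);
#				v.hi >>= 1;
#			}
#		}
#		return z;
#	}
###############################################################################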
291 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
292
293 vpshufd $0b01001110, \GH, \T2
294 vpshufd $0b01001110, \HK, \T3
295 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
296 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
297
298 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
299 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
300 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
301 vpxor \GH, \T2,\T2
302 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
303
304 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
305 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
306 vpxor \T3, \GH, \GH
307 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
308
309 #first phase of the reduction
310 vpslld $31, \GH, \T2 # packed right shifting << 31
311 vpslld $30, \GH, \T3 # packed right shifting shift << 30
312 vpslld $25, \GH, \T4 # packed right shifting shift << 25
313
314 vpxor \T3, \T2, \T2 # xor the shifted versions
315 vpxor \T4, \T2, \T2
316
317 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
318
319 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
320 vpxor \T2, \GH, \GH # first phase of the reduction complete
321
322 #second phase of the reduction
323
324 vpsrld $1,\GH, \T2 # packed left shifting >> 1
325 vpsrld $2,\GH, \T3 # packed left shifting >> 2
326 vpsrld $7,\GH, \T4 # packed left shifting >> 7
327 vpxor \T3, \T2, \T2 # xor the shifted versions
328 vpxor \T4, \T2, \T2
329
330 vpxor \T5, \T2, \T2
331 vpxor \T2, \GH, \GH
332 vpxor \T1, \GH, \GH # the result is in GH
333
334
335 .endm
336
337 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
338
339 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
340 vmovdqa \HK, \T5
341
342 vpshufd $0b01001110, \T5, \T1
343 vpxor \T5, \T1, \T1
344 vmovdqa \T1, HashKey_k(arg1)
345
346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
347 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
348 vpshufd $0b01001110, \T5, \T1
349 vpxor \T5, \T1, \T1
350 vmovdqa \T1, HashKey_2_k(arg1)
351
352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
353 vmovdqa \T5, HashKey_3(arg1)
354 vpshufd $0b01001110, \T5, \T1
355 vpxor \T5, \T1, \T1
356 vmovdqa \T1, HashKey_3_k(arg1)
357
358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
359 vmovdqa \T5, HashKey_4(arg1)
360 vpshufd $0b01001110, \T5, \T1
361 vpxor \T5, \T1, \T1
362 vmovdqa \T1, HashKey_4_k(arg1)
363
364 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
365 vmovdqa \T5, HashKey_5(arg1)
366 vpshufd $0b01001110, \T5, \T1
367 vpxor \T5, \T1, \T1
368 vmovdqa \T1, HashKey_5_k(arg1)
369
370 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
371 vmovdqa \T5, HashKey_6(arg1)
372 vpshufd $0b01001110, \T5, \T1
373 vpxor \T5, \T1, \T1
374 vmovdqa \T1, HashKey_6_k(arg1)
375
376 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
377 vmovdqa \T5, HashKey_7(arg1)
378 vpshufd $0b01001110, \T5, \T1
379 vpxor \T5, \T1, \T1
380 vmovdqa \T1, HashKey_7_k(arg1)
381
382 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
383 vmovdqa \T5, HashKey_8(arg1)
384 vpshufd $0b01001110, \T5, \T1
385 vpxor \T5, \T1, \T1
386 vmovdqa \T1, HashKey_8_k(arg1)
387
388 .endm
389
390 ## if a = number of total plaintext bytes
391 ## b = floor(a/16)
392 ## num_initial_blocks = b mod 8
393 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
394 ## r10, r11, r12, rax are clobbered
395 ## arg1, arg2, arg3, r14 are used as pointers only, not modified
396
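## In C terms (illustrative only, hypothetical names), the split performed by
## GCM_ENC_DEC_AVX and this macro is:
##
##	size_t blocks  = plaintext_len / 16;
##	size_t initial = blocks % 8;		/* handled by this macro       */
##	size_t by_8    = blocks - initial;	/* handled 8 blocks at a time  */
##	size_t partial = plaintext_len % 16;	/* final partial block, if any */
##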
397 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
398 i = (8-\num_initial_blocks)
399 j = 0
400 setreg
401
402 mov arg6, %r10 # r10 = AAD
403 mov arg7, %r12 # r12 = aadLen
404
405
406 mov %r12, %r11
407
408 vpxor reg_j, reg_j, reg_j
409 vpxor reg_i, reg_i, reg_i
410 cmp $16, %r11
411 jl _get_AAD_rest8\@
412 _get_AAD_blocks\@:
413 vmovdqu (%r10), reg_i
414 vpshufb SHUF_MASK(%rip), reg_i, reg_i
415 vpxor reg_i, reg_j, reg_j
416 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
417 add $16, %r10
418 sub $16, %r12
419 sub $16, %r11
420 cmp $16, %r11
421 jge _get_AAD_blocks\@
422 vmovdqu reg_j, reg_i
423 cmp $0, %r11
424 je _get_AAD_done\@
425
426 vpxor reg_i, reg_i, reg_i
427
428 /* read the last <16B of AAD. since we have at least 4B of
429 data right after the AAD (the ICV, and maybe some CT), we can
430 read 4B/8B blocks safely, and then get rid of the extra stuff */
431 _get_AAD_rest8\@:
432 cmp $4, %r11
433 jle _get_AAD_rest4\@
434 movq (%r10), \T1
435 add $8, %r10
436 sub $8, %r11
437 vpslldq $8, \T1, \T1
438 vpsrldq $8, reg_i, reg_i
439 vpxor \T1, reg_i, reg_i
440 jmp _get_AAD_rest8\@
441 _get_AAD_rest4\@:
442 cmp $0, %r11
443 jle _get_AAD_rest0\@
444 mov (%r10), %eax
445 movq %rax, \T1
446 add $4, %r10
447 sub $4, %r11
448 vpslldq $12, \T1, \T1
449 vpsrldq $4, reg_i, reg_i
450 vpxor \T1, reg_i, reg_i
451 _get_AAD_rest0\@:
452 /* finalize: shift out the extra bytes we read, and align
453 left. since pslldq can only shift by an immediate, we use
454 vpshufb and an array of shuffle masks */
455 movq %r12, %r11
456 salq $4, %r11
457 movdqu aad_shift_arr(%r11), \T1
458 vpshufb \T1, reg_i, reg_i
459 _get_AAD_rest_final\@:
460 vpshufb SHUF_MASK(%rip), reg_i, reg_i
461 vpxor reg_j, reg_i, reg_i
462 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
463
464 _get_AAD_done\@:
465 # initialize the data pointer offset as zero
466 xor %r11, %r11
467
468 # start AES for num_initial_blocks blocks
469 mov arg5, %rax # rax = *Y0
470 vmovdqu (%rax), \CTR # CTR = Y0
471 vpshufb SHUF_MASK(%rip), \CTR, \CTR
472
473
474 i = (9-\num_initial_blocks)
475 setreg
476 .rep \num_initial_blocks
477 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
478 vmovdqa \CTR, reg_i
479 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
480 i = (i+1)
481 setreg
482 .endr
483
484 vmovdqa (arg1), \T_key
485 i = (9-\num_initial_blocks)
486 setreg
487 .rep \num_initial_blocks
488 vpxor \T_key, reg_i, reg_i
489 i = (i+1)
490 setreg
491 .endr
492
493 j = 1
494 setreg
495 .rep 9
496 vmovdqa 16*j(arg1), \T_key
497 i = (9-\num_initial_blocks)
498 setreg
499 .rep \num_initial_blocks
500 vaesenc \T_key, reg_i, reg_i
501 i = (i+1)
502 setreg
503 .endr
504
505 j = (j+1)
506 setreg
507 .endr
508
509
510 vmovdqa 16*10(arg1), \T_key
511 i = (9-\num_initial_blocks)
512 setreg
513 .rep \num_initial_blocks
514 vaesenclast \T_key, reg_i, reg_i
515 i = (i+1)
516 setreg
517 .endr
518
519 i = (9-\num_initial_blocks)
520 setreg
521 .rep \num_initial_blocks
522 vmovdqu (arg3, %r11), \T1
523 vpxor \T1, reg_i, reg_i
524 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
525 add $16, %r11
526 .if \ENC_DEC == DEC
527 vmovdqa \T1, reg_i
528 .endif
529 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
530 i = (i+1)
531 setreg
532 .endr
533
534
535 i = (8-\num_initial_blocks)
536 j = (9-\num_initial_blocks)
537 setreg
538
539 .rep \num_initial_blocks
540 vpxor reg_i, reg_j, reg_j
541 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
542 i = (i+1)
543 j = (j+1)
544 setreg
545 .endr
546 # XMM8 has the combined result here
547
548 vmovdqa \XMM8, TMP1(%rsp)
549 vmovdqa \XMM8, \T3
550
551 cmp $128, %r13
552 jl _initial_blocks_done\@ # no need for precomputed constants
553
554 ###############################################################################
555 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
556 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
557 vmovdqa \CTR, \XMM1
558 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
559
560 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
561 vmovdqa \CTR, \XMM2
562 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
563
564 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
565 vmovdqa \CTR, \XMM3
566 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
567
568 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
569 vmovdqa \CTR, \XMM4
570 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
571
572 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
573 vmovdqa \CTR, \XMM5
574 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
575
576 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
577 vmovdqa \CTR, \XMM6
578 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
579
580 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
581 vmovdqa \CTR, \XMM7
582 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
583
584 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
585 vmovdqa \CTR, \XMM8
586 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
587
588 vmovdqa (arg1), \T_key
589 vpxor \T_key, \XMM1, \XMM1
590 vpxor \T_key, \XMM2, \XMM2
591 vpxor \T_key, \XMM3, \XMM3
592 vpxor \T_key, \XMM4, \XMM4
593 vpxor \T_key, \XMM5, \XMM5
594 vpxor \T_key, \XMM6, \XMM6
595 vpxor \T_key, \XMM7, \XMM7
596 vpxor \T_key, \XMM8, \XMM8
597
598 i = 1
599 setreg
600 .rep 9 # do 9 rounds
601 vmovdqa 16*i(arg1), \T_key
602 vaesenc \T_key, \XMM1, \XMM1
603 vaesenc \T_key, \XMM2, \XMM2
604 vaesenc \T_key, \XMM3, \XMM3
605 vaesenc \T_key, \XMM4, \XMM4
606 vaesenc \T_key, \XMM5, \XMM5
607 vaesenc \T_key, \XMM6, \XMM6
608 vaesenc \T_key, \XMM7, \XMM7
609 vaesenc \T_key, \XMM8, \XMM8
610 i = (i+1)
611 setreg
612 .endr
613
614
615 vmovdqa 16*i(arg1), \T_key
616 vaesenclast \T_key, \XMM1, \XMM1
617 vaesenclast \T_key, \XMM2, \XMM2
618 vaesenclast \T_key, \XMM3, \XMM3
619 vaesenclast \T_key, \XMM4, \XMM4
620 vaesenclast \T_key, \XMM5, \XMM5
621 vaesenclast \T_key, \XMM6, \XMM6
622 vaesenclast \T_key, \XMM7, \XMM7
623 vaesenclast \T_key, \XMM8, \XMM8
624
625 vmovdqu (arg3, %r11), \T1
626 vpxor \T1, \XMM1, \XMM1
627 vmovdqu \XMM1, (arg2 , %r11)
628 .if \ENC_DEC == DEC
629 vmovdqa \T1, \XMM1
630 .endif
631
632 vmovdqu 16*1(arg3, %r11), \T1
633 vpxor \T1, \XMM2, \XMM2
634 vmovdqu \XMM2, 16*1(arg2 , %r11)
635 .if \ENC_DEC == DEC
636 vmovdqa \T1, \XMM2
637 .endif
638
639 vmovdqu 16*2(arg3, %r11), \T1
640 vpxor \T1, \XMM3, \XMM3
641 vmovdqu \XMM3, 16*2(arg2 , %r11)
642 .if \ENC_DEC == DEC
643 vmovdqa \T1, \XMM3
644 .endif
645
646 vmovdqu 16*3(arg3, %r11), \T1
647 vpxor \T1, \XMM4, \XMM4
648 vmovdqu \XMM4, 16*3(arg2 , %r11)
649 .if \ENC_DEC == DEC
650 vmovdqa \T1, \XMM4
651 .endif
652
653 vmovdqu 16*4(arg3, %r11), \T1
654 vpxor \T1, \XMM5, \XMM5
655 vmovdqu \XMM5, 16*4(arg2 , %r11)
656 .if \ENC_DEC == DEC
657 vmovdqa \T1, \XMM5
658 .endif
659
660 vmovdqu 16*5(arg3, %r11), \T1
661 vpxor \T1, \XMM6, \XMM6
662 vmovdqu \XMM6, 16*5(arg2 , %r11)
663 .if \ENC_DEC == DEC
664 vmovdqa \T1, \XMM6
665 .endif
666
667 vmovdqu 16*6(arg3, %r11), \T1
668 vpxor \T1, \XMM7, \XMM7
669 vmovdqu \XMM7, 16*6(arg2 , %r11)
670 .if \ENC_DEC == DEC
671 vmovdqa \T1, \XMM7
672 .endif
673
674 vmovdqu 16*7(arg3, %r11), \T1
675 vpxor \T1, \XMM8, \XMM8
676 vmovdqu \XMM8, 16*7(arg2 , %r11)
677 .if \ENC_DEC == DEC
678 vmovdqa \T1, \XMM8
679 .endif
680
681 add $128, %r11
682
683 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
684 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
685 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
686 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
687 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
688 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
689 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
690 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
691 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
692
693 ###############################################################################
694
695 _initial_blocks_done\@:
696
697 .endm
698
699 # encrypt 8 blocks at a time
700 # ghash the 8 previously encrypted ciphertext blocks
701 # arg1, arg2, arg3 are used as pointers only, not modified
702 # r11 is the data offset value
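# The HashKey_1..HashKey_8 powers let the standard aggregated GHASH identity be
# used, so the eight carry-less multiplications are summed before a single
# reduction.  With C1..C8 = the 8 previously produced ciphertext blocks (the
# running hash is already folded into C1 by the previous iteration):
#
#	Y_new = (C1*H^8) xor (C2*H^7) xor (C3*H^6) xor (C4*H^5) xor
#		(C5*H^4) xor (C6*H^3) xor (C7*H^2) xor (C8*H^1)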
703 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
704
705 vmovdqa \XMM1, \T2
706 vmovdqa \XMM2, TMP2(%rsp)
707 vmovdqa \XMM3, TMP3(%rsp)
708 vmovdqa \XMM4, TMP4(%rsp)
709 vmovdqa \XMM5, TMP5(%rsp)
710 vmovdqa \XMM6, TMP6(%rsp)
711 vmovdqa \XMM7, TMP7(%rsp)
712 vmovdqa \XMM8, TMP8(%rsp)
713
714 .if \loop_idx == in_order
715 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
716 vpaddd ONE(%rip), \XMM1, \XMM2
717 vpaddd ONE(%rip), \XMM2, \XMM3
718 vpaddd ONE(%rip), \XMM3, \XMM4
719 vpaddd ONE(%rip), \XMM4, \XMM5
720 vpaddd ONE(%rip), \XMM5, \XMM6
721 vpaddd ONE(%rip), \XMM6, \XMM7
722 vpaddd ONE(%rip), \XMM7, \XMM8
723 vmovdqa \XMM8, \CTR
724
725 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
726 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
727 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
728 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
729 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
730 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
731 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
732 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
733 .else
734 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
735 vpaddd ONEf(%rip), \XMM1, \XMM2
736 vpaddd ONEf(%rip), \XMM2, \XMM3
737 vpaddd ONEf(%rip), \XMM3, \XMM4
738 vpaddd ONEf(%rip), \XMM4, \XMM5
739 vpaddd ONEf(%rip), \XMM5, \XMM6
740 vpaddd ONEf(%rip), \XMM6, \XMM7
741 vpaddd ONEf(%rip), \XMM7, \XMM8
742 vmovdqa \XMM8, \CTR
743 .endif
744
745
746 #######################################################################
747
748 vmovdqu (arg1), \T1
749 vpxor \T1, \XMM1, \XMM1
750 vpxor \T1, \XMM2, \XMM2
751 vpxor \T1, \XMM3, \XMM3
752 vpxor \T1, \XMM4, \XMM4
753 vpxor \T1, \XMM5, \XMM5
754 vpxor \T1, \XMM6, \XMM6
755 vpxor \T1, \XMM7, \XMM7
756 vpxor \T1, \XMM8, \XMM8
757
758 #######################################################################
759
760
761
762
763
764 vmovdqu 16*1(arg1), \T1
765 vaesenc \T1, \XMM1, \XMM1
766 vaesenc \T1, \XMM2, \XMM2
767 vaesenc \T1, \XMM3, \XMM3
768 vaesenc \T1, \XMM4, \XMM4
769 vaesenc \T1, \XMM5, \XMM5
770 vaesenc \T1, \XMM6, \XMM6
771 vaesenc \T1, \XMM7, \XMM7
772 vaesenc \T1, \XMM8, \XMM8
773
774 vmovdqu 16*2(arg1), \T1
775 vaesenc \T1, \XMM1, \XMM1
776 vaesenc \T1, \XMM2, \XMM2
777 vaesenc \T1, \XMM3, \XMM3
778 vaesenc \T1, \XMM4, \XMM4
779 vaesenc \T1, \XMM5, \XMM5
780 vaesenc \T1, \XMM6, \XMM6
781 vaesenc \T1, \XMM7, \XMM7
782 vaesenc \T1, \XMM8, \XMM8
783
784
785 #######################################################################
786
787 vmovdqa HashKey_8(arg1), \T5
788 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
789 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
790
791 vpshufd $0b01001110, \T2, \T6
792 vpxor \T2, \T6, \T6
793
794 vmovdqa HashKey_8_k(arg1), \T5
795 vpclmulqdq $0x00, \T5, \T6, \T6
796
797 vmovdqu 16*3(arg1), \T1
798 vaesenc \T1, \XMM1, \XMM1
799 vaesenc \T1, \XMM2, \XMM2
800 vaesenc \T1, \XMM3, \XMM3
801 vaesenc \T1, \XMM4, \XMM4
802 vaesenc \T1, \XMM5, \XMM5
803 vaesenc \T1, \XMM6, \XMM6
804 vaesenc \T1, \XMM7, \XMM7
805 vaesenc \T1, \XMM8, \XMM8
806
807 vmovdqa TMP2(%rsp), \T1
808 vmovdqa HashKey_7(arg1), \T5
809 vpclmulqdq $0x11, \T5, \T1, \T3
810 vpxor \T3, \T4, \T4
811 vpclmulqdq $0x00, \T5, \T1, \T3
812 vpxor \T3, \T7, \T7
813
814 vpshufd $0b01001110, \T1, \T3
815 vpxor \T1, \T3, \T3
816 vmovdqa HashKey_7_k(arg1), \T5
817 vpclmulqdq $0x10, \T5, \T3, \T3
818 vpxor \T3, \T6, \T6
819
820 vmovdqu 16*4(arg1), \T1
821 vaesenc \T1, \XMM1, \XMM1
822 vaesenc \T1, \XMM2, \XMM2
823 vaesenc \T1, \XMM3, \XMM3
824 vaesenc \T1, \XMM4, \XMM4
825 vaesenc \T1, \XMM5, \XMM5
826 vaesenc \T1, \XMM6, \XMM6
827 vaesenc \T1, \XMM7, \XMM7
828 vaesenc \T1, \XMM8, \XMM8
829
830 #######################################################################
831
832 vmovdqa TMP3(%rsp), \T1
833 vmovdqa HashKey_6(arg1), \T5
834 vpclmulqdq $0x11, \T5, \T1, \T3
835 vpxor \T3, \T4, \T4
836 vpclmulqdq $0x00, \T5, \T1, \T3
837 vpxor \T3, \T7, \T7
838
839 vpshufd $0b01001110, \T1, \T3
840 vpxor \T1, \T3, \T3
841 vmovdqa HashKey_6_k(arg1), \T5
842 vpclmulqdq $0x10, \T5, \T3, \T3
843 vpxor \T3, \T6, \T6
844
845 vmovdqu 16*5(arg1), \T1
846 vaesenc \T1, \XMM1, \XMM1
847 vaesenc \T1, \XMM2, \XMM2
848 vaesenc \T1, \XMM3, \XMM3
849 vaesenc \T1, \XMM4, \XMM4
850 vaesenc \T1, \XMM5, \XMM5
851 vaesenc \T1, \XMM6, \XMM6
852 vaesenc \T1, \XMM7, \XMM7
853 vaesenc \T1, \XMM8, \XMM8
854
855 vmovdqa TMP4(%rsp), \T1
856 vmovdqa HashKey_5(arg1), \T5
857 vpclmulqdq $0x11, \T5, \T1, \T3
858 vpxor \T3, \T4, \T4
859 vpclmulqdq $0x00, \T5, \T1, \T3
860 vpxor \T3, \T7, \T7
861
862 vpshufd $0b01001110, \T1, \T3
863 vpxor \T1, \T3, \T3
864 vmovdqa HashKey_5_k(arg1), \T5
865 vpclmulqdq $0x10, \T5, \T3, \T3
866 vpxor \T3, \T6, \T6
867
868 vmovdqu 16*6(arg1), \T1
869 vaesenc \T1, \XMM1, \XMM1
870 vaesenc \T1, \XMM2, \XMM2
871 vaesenc \T1, \XMM3, \XMM3
872 vaesenc \T1, \XMM4, \XMM4
873 vaesenc \T1, \XMM5, \XMM5
874 vaesenc \T1, \XMM6, \XMM6
875 vaesenc \T1, \XMM7, \XMM7
876 vaesenc \T1, \XMM8, \XMM8
877
878
879 vmovdqa TMP5(%rsp), \T1
880 vmovdqa HashKey_4(arg1), \T5
881 vpclmulqdq $0x11, \T5, \T1, \T3
882 vpxor \T3, \T4, \T4
883 vpclmulqdq $0x00, \T5, \T1, \T3
884 vpxor \T3, \T7, \T7
885
886 vpshufd $0b01001110, \T1, \T3
887 vpxor \T1, \T3, \T3
888 vmovdqa HashKey_4_k(arg1), \T5
889 vpclmulqdq $0x10, \T5, \T3, \T3
890 vpxor \T3, \T6, \T6
891
892 vmovdqu 16*7(arg1), \T1
893 vaesenc \T1, \XMM1, \XMM1
894 vaesenc \T1, \XMM2, \XMM2
895 vaesenc \T1, \XMM3, \XMM3
896 vaesenc \T1, \XMM4, \XMM4
897 vaesenc \T1, \XMM5, \XMM5
898 vaesenc \T1, \XMM6, \XMM6
899 vaesenc \T1, \XMM7, \XMM7
900 vaesenc \T1, \XMM8, \XMM8
901
902 vmovdqa TMP6(%rsp), \T1
903 vmovdqa HashKey_3(arg1), \T5
904 vpclmulqdq $0x11, \T5, \T1, \T3
905 vpxor \T3, \T4, \T4
906 vpclmulqdq $0x00, \T5, \T1, \T3
907 vpxor \T3, \T7, \T7
908
909 vpshufd $0b01001110, \T1, \T3
910 vpxor \T1, \T3, \T3
911 vmovdqa HashKey_3_k(arg1), \T5
912 vpclmulqdq $0x10, \T5, \T3, \T3
913 vpxor \T3, \T6, \T6
914
915
916 vmovdqu 16*8(arg1), \T1
917 vaesenc \T1, \XMM1, \XMM1
918 vaesenc \T1, \XMM2, \XMM2
919 vaesenc \T1, \XMM3, \XMM3
920 vaesenc \T1, \XMM4, \XMM4
921 vaesenc \T1, \XMM5, \XMM5
922 vaesenc \T1, \XMM6, \XMM6
923 vaesenc \T1, \XMM7, \XMM7
924 vaesenc \T1, \XMM8, \XMM8
925
926 vmovdqa TMP7(%rsp), \T1
927 vmovdqa HashKey_2(arg1), \T5
928 vpclmulqdq $0x11, \T5, \T1, \T3
929 vpxor \T3, \T4, \T4
930 vpclmulqdq $0x00, \T5, \T1, \T3
931 vpxor \T3, \T7, \T7
932
933 vpshufd $0b01001110, \T1, \T3
934 vpxor \T1, \T3, \T3
935 vmovdqa HashKey_2_k(arg1), \T5
936 vpclmulqdq $0x10, \T5, \T3, \T3
937 vpxor \T3, \T6, \T6
938
939 #######################################################################
940
941 vmovdqu 16*9(arg1), \T5
942 vaesenc \T5, \XMM1, \XMM1
943 vaesenc \T5, \XMM2, \XMM2
944 vaesenc \T5, \XMM3, \XMM3
945 vaesenc \T5, \XMM4, \XMM4
946 vaesenc \T5, \XMM5, \XMM5
947 vaesenc \T5, \XMM6, \XMM6
948 vaesenc \T5, \XMM7, \XMM7
949 vaesenc \T5, \XMM8, \XMM8
950
951 vmovdqa TMP8(%rsp), \T1
952 vmovdqa HashKey(arg1), \T5
953 vpclmulqdq $0x11, \T5, \T1, \T3
954 vpxor \T3, \T4, \T4
955 vpclmulqdq $0x00, \T5, \T1, \T3
956 vpxor \T3, \T7, \T7
957
958 vpshufd $0b01001110, \T1, \T3
959 vpxor \T1, \T3, \T3
960 vmovdqa HashKey_k(arg1), \T5
961 vpclmulqdq $0x10, \T5, \T3, \T3
962 vpxor \T3, \T6, \T6
963
964 vpxor \T4, \T6, \T6
965 vpxor \T7, \T6, \T6
966
967 vmovdqu 16*10(arg1), \T5
968
969 i = 0
970 j = 1
971 setreg
972 .rep 8
973 vpxor 16*i(arg3, %r11), \T5, \T2
974 .if \ENC_DEC == ENC
975 vaesenclast \T2, reg_j, reg_j
976 .else
977 vaesenclast \T2, reg_j, \T3
978 vmovdqu 16*i(arg3, %r11), reg_j
979 vmovdqu \T3, 16*i(arg2, %r11)
980 .endif
981 i = (i+1)
982 j = (j+1)
983 setreg
984 .endr
985 #######################################################################
986
987
988 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
989 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
990 vpxor \T3, \T7, \T7
991 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
992
993
994
995 #######################################################################
996 #first phase of the reduction
997 #######################################################################
998 vpslld $31, \T7, \T2 # packed right shifting << 31
999 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1000 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1001
1002 vpxor \T3, \T2, \T2 # xor the shifted versions
1003 vpxor \T4, \T2, \T2
1004
1005 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1006
1007 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1008 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1009 #######################################################################
1010 .if \ENC_DEC == ENC
1011 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
1012 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
1013 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
1014 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
1015 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
1016 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
1017 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
1018 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
1019 .endif
1020
1021 #######################################################################
1022 #second phase of the reduction
1023 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1024 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1025 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1026 vpxor \T3, \T2, \T2 # xor the shifted versions
1027 vpxor \T4, \T2, \T2
1028
1029 vpxor \T1, \T2, \T2
1030 vpxor \T2, \T7, \T7
1031 vpxor \T7, \T6, \T6 # the result is in T6
1032 #######################################################################
1033
1034 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1035 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1036 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1037 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1038 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1039 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1040 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1041 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1042
1043
1044 vpxor \T6, \XMM1, \XMM1
1045
1046
1047
1048 .endm
1049
1050
1052 # GHASH the last 8 ciphertext blocks.
1052 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1053
1054 ## Karatsuba Method
1055
1056
1057 vpshufd $0b01001110, \XMM1, \T2
1058 vpxor \XMM1, \T2, \T2
1059 vmovdqa HashKey_8(arg1), \T5
1060 vpclmulqdq $0x11, \T5, \XMM1, \T6
1061 vpclmulqdq $0x00, \T5, \XMM1, \T7
1062
1063 vmovdqa HashKey_8_k(arg1), \T3
1064 vpclmulqdq $0x00, \T3, \T2, \XMM1
1065
1066 ######################
1067
1068 vpshufd $0b01001110, \XMM2, \T2
1069 vpxor \XMM2, \T2, \T2
1070 vmovdqa HashKey_7(arg1), \T5
1071 vpclmulqdq $0x11, \T5, \XMM2, \T4
1072 vpxor \T4, \T6, \T6
1073
1074 vpclmulqdq $0x00, \T5, \XMM2, \T4
1075 vpxor \T4, \T7, \T7
1076
1077 vmovdqa HashKey_7_k(arg1), \T3
1078 vpclmulqdq $0x00, \T3, \T2, \T2
1079 vpxor \T2, \XMM1, \XMM1
1080
1081 ######################
1082
1083 vpshufd $0b01001110, \XMM3, \T2
1084 vpxor \XMM3, \T2, \T2
1085 vmovdqa HashKey_6(arg1), \T5
1086 vpclmulqdq $0x11, \T5, \XMM3, \T4
1087 vpxor \T4, \T6, \T6
1088
1089 vpclmulqdq $0x00, \T5, \XMM3, \T4
1090 vpxor \T4, \T7, \T7
1091
1092 vmovdqa HashKey_6_k(arg1), \T3
1093 vpclmulqdq $0x00, \T3, \T2, \T2
1094 vpxor \T2, \XMM1, \XMM1
1095
1096 ######################
1097
1098 vpshufd $0b01001110, \XMM4, \T2
1099 vpxor \XMM4, \T2, \T2
1100 vmovdqa HashKey_5(arg1), \T5
1101 vpclmulqdq $0x11, \T5, \XMM4, \T4
1102 vpxor \T4, \T6, \T6
1103
1104 vpclmulqdq $0x00, \T5, \XMM4, \T4
1105 vpxor \T4, \T7, \T7
1106
1107 vmovdqa HashKey_5_k(arg1), \T3
1108 vpclmulqdq $0x00, \T3, \T2, \T2
1109 vpxor \T2, \XMM1, \XMM1
1110
1111 ######################
1112
1113 vpshufd $0b01001110, \XMM5, \T2
1114 vpxor \XMM5, \T2, \T2
1115 vmovdqa HashKey_4(arg1), \T5
1116 vpclmulqdq $0x11, \T5, \XMM5, \T4
1117 vpxor \T4, \T6, \T6
1118
1119 vpclmulqdq $0x00, \T5, \XMM5, \T4
1120 vpxor \T4, \T7, \T7
1121
1122 vmovdqa HashKey_4_k(arg1), \T3
1123 vpclmulqdq $0x00, \T3, \T2, \T2
1124 vpxor \T2, \XMM1, \XMM1
1125
1126 ######################
1127
1128 vpshufd $0b01001110, \XMM6, \T2
1129 vpxor \XMM6, \T2, \T2
1130 vmovdqa HashKey_3(arg1), \T5
1131 vpclmulqdq $0x11, \T5, \XMM6, \T4
1132 vpxor \T4, \T6, \T6
1133
1134 vpclmulqdq $0x00, \T5, \XMM6, \T4
1135 vpxor \T4, \T7, \T7
1136
1137 vmovdqa HashKey_3_k(arg1), \T3
1138 vpclmulqdq $0x00, \T3, \T2, \T2
1139 vpxor \T2, \XMM1, \XMM1
1140
1141 ######################
1142
1143 vpshufd $0b01001110, \XMM7, \T2
1144 vpxor \XMM7, \T2, \T2
1145 vmovdqa HashKey_2(arg1), \T5
1146 vpclmulqdq $0x11, \T5, \XMM7, \T4
1147 vpxor \T4, \T6, \T6
1148
1149 vpclmulqdq $0x00, \T5, \XMM7, \T4
1150 vpxor \T4, \T7, \T7
1151
1152 vmovdqa HashKey_2_k(arg1), \T3
1153 vpclmulqdq $0x00, \T3, \T2, \T2
1154 vpxor \T2, \XMM1, \XMM1
1155
1156 ######################
1157
1158 vpshufd $0b01001110, \XMM8, \T2
1159 vpxor \XMM8, \T2, \T2
1160 vmovdqa HashKey(arg1), \T5
1161 vpclmulqdq $0x11, \T5, \XMM8, \T4
1162 vpxor \T4, \T6, \T6
1163
1164 vpclmulqdq $0x00, \T5, \XMM8, \T4
1165 vpxor \T4, \T7, \T7
1166
1167 vmovdqa HashKey_k(arg1), \T3
1168 vpclmulqdq $0x00, \T3, \T2, \T2
1169
1170 vpxor \T2, \XMM1, \XMM1
1171 vpxor \T6, \XMM1, \XMM1
1172 vpxor \T7, \XMM1, \T2
1173
1174
1175
1176
1177 vpslldq $8, \T2, \T4
1178 vpsrldq $8, \T2, \T2
1179
1180 vpxor \T4, \T7, \T7
1181 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1182 # the accumulated carry-less multiplications
1183
1184 #######################################################################
1185 #first phase of the reduction
1186 vpslld $31, \T7, \T2 # packed right shifting << 31
1187 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1188 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1189
1190 vpxor \T3, \T2, \T2 # xor the shifted versions
1191 vpxor \T4, \T2, \T2
1192
1193 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1194
1195 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1196 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1197 #######################################################################
1198
1199
1200 #second phase of the reduction
1201 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1202 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1203 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1204 vpxor \T3, \T2, \T2 # xor the shifted versions
1205 vpxor \T4, \T2, \T2
1206
1207 vpxor \T1, \T2, \T2
1208 vpxor \T2, \T7, \T7
1209 vpxor \T7, \T6, \T6 # the result is in T6
1210
1211 .endm
1212
1213
1214 # combined for GCM encrypt and decrypt functions
1215 # clobbering all xmm registers
1216 # clobbering r10, r11, r12, r13, r14, r15
1217 .macro GCM_ENC_DEC_AVX ENC_DEC
1218
1219 # the number of pushes (8 bytes each) must correspond to STACK_OFFSET
1220 push %r12
1221 push %r13
1222 push %r14
1223 push %r15
1224
1225 mov %rsp, %r14
1226
1227
1228
1229
1230 sub $VARIABLE_OFFSET, %rsp
1231 and $~63, %rsp # align rsp to 64 bytes
1232
1233
1234 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1235
1236 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1237 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1238
1239 mov %r13, %r12
1240 shr $4, %r12
1241 and $7, %r12
1242 jz _initial_num_blocks_is_0\@
1243
1244 cmp $7, %r12
1245 je _initial_num_blocks_is_7\@
1246 cmp $6, %r12
1247 je _initial_num_blocks_is_6\@
1248 cmp $5, %r12
1249 je _initial_num_blocks_is_5\@
1250 cmp $4, %r12
1251 je _initial_num_blocks_is_4\@
1252 cmp $3, %r12
1253 je _initial_num_blocks_is_3\@
1254 cmp $2, %r12
1255 je _initial_num_blocks_is_2\@
1256
1257 jmp _initial_num_blocks_is_1\@
1258
1259 _initial_num_blocks_is_7\@:
1260 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1261 sub $16*7, %r13
1262 jmp _initial_blocks_encrypted\@
1263
1264 _initial_num_blocks_is_6\@:
1265 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1266 sub $16*6, %r13
1267 jmp _initial_blocks_encrypted\@
1268
1269 _initial_num_blocks_is_5\@:
1270 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1271 sub $16*5, %r13
1272 jmp _initial_blocks_encrypted\@
1273
1274 _initial_num_blocks_is_4\@:
1275 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1276 sub $16*4, %r13
1277 jmp _initial_blocks_encrypted\@
1278
1279 _initial_num_blocks_is_3\@:
1280 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1281 sub $16*3, %r13
1282 jmp _initial_blocks_encrypted\@
1283
1284 _initial_num_blocks_is_2\@:
1285 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1286 sub $16*2, %r13
1287 jmp _initial_blocks_encrypted\@
1288
1289 _initial_num_blocks_is_1\@:
1290 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1291 sub $16*1, %r13
1292 jmp _initial_blocks_encrypted\@
1293
1294 _initial_num_blocks_is_0\@:
1295 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1296
1297
1298 _initial_blocks_encrypted\@:
1299 cmp $0, %r13
1300 je _zero_cipher_left\@
1301
1302 sub $128, %r13
1303 je _eight_cipher_left\@
1304
1305
1306
1307
1308 vmovd %xmm9, %r15d
1309 and $255, %r15d
1310 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1311
1312
1313 _encrypt_by_8_new\@:
1314 cmp $(255-8), %r15d
1315 jg _encrypt_by_8\@
1316
1317
1318
1319 add $8, %r15b
1320 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1321 add $128, %r11
1322 sub $128, %r13
1323 jne _encrypt_by_8_new\@
1324
1325 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1326 jmp _eight_cipher_left\@
1327
1328 _encrypt_by_8\@:
1329 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1330 add $8, %r15b
1331 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1332 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1333 add $128, %r11
1334 sub $128, %r13
1335 jne _encrypt_by_8_new\@
1336
1337 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1338
1339
1340
1341
1342 _eight_cipher_left\@:
1343 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1344
1345
1346 _zero_cipher_left\@:
1347 cmp $16, arg4
1348 jl _only_less_than_16\@
1349
1350 mov arg4, %r13
1351 and $15, %r13 # r13 = (arg4 mod 16)
1352
1353 je _multiple_of_16_bytes\@
1354
1355 # handle the last <16 Byte block separately
1356
1357
1358 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1359 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1360 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1361
1362 sub $16, %r11
1363 add %r13, %r11
1364 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1365
1366 lea SHIFT_MASK+16(%rip), %r12
1367 sub %r13, %r12 # adjust the shuffle mask pointer to be
1368 # able to shift 16-r13 bytes (r13 is the
1369 # number of bytes in plaintext mod 16)
1370 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1371 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1372 jmp _final_ghash_mul\@
1373
1374 _only_less_than_16\@:
1375 # check for 0 length
1376 mov arg4, %r13
1377 and $15, %r13 # r13 = (arg4 mod 16)
1378
1379 je _multiple_of_16_bytes\@
1380
1381 # handle the last <16 Byte block separately
1382
1383
1384 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1385 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1386 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1387
1388
1389 lea SHIFT_MASK+16(%rip), %r12
1390 sub %r13, %r12 # adjust the shuffle mask pointer to be
1391 # able to shift 16-r13 bytes (r13 is the
1392 # number of bytes in plaintext mod 16)
1393
1394 _get_last_16_byte_loop\@:
1395 movb (arg3, %r11), %al
1396 movb %al, TMP1 (%rsp , %r11)
1397 add $1, %r11
1398 cmp %r13, %r11
1399 jne _get_last_16_byte_loop\@
1400
1401 vmovdqu TMP1(%rsp), %xmm1
1402
1403 sub $16, %r11
1404
1405 _final_ghash_mul\@:
1406 .if \ENC_DEC == DEC
1407 vmovdqa %xmm1, %xmm2
1408 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1409 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1410 # mask out top 16-r13 bytes of xmm9
1411 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1412 vpand %xmm1, %xmm2, %xmm2
1413 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1414 vpxor %xmm2, %xmm14, %xmm14
1415 #GHASH computation for the last <16 Byte block
1416 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1417 sub %r13, %r11
1418 add $16, %r11
1419 .else
1420 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1421 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1422 # mask out top 16-r13 bytes of xmm9
1423 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1424 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1425 vpxor %xmm9, %xmm14, %xmm14
1426 #GHASH computation for the last <16 Byte block
1427 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1428 sub %r13, %r11
1429 add $16, %r11
1430 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1431 .endif
1432
1433
1434 #############################
1435 # output r13 Bytes
1436 vmovq %xmm9, %rax
1437 cmp $8, %r13
1438 jle _less_than_8_bytes_left\@
1439
1440 mov %rax, (arg2 , %r11)
1441 add $8, %r11
1442 vpsrldq $8, %xmm9, %xmm9
1443 vmovq %xmm9, %rax
1444 sub $8, %r13
1445
1446 _less_than_8_bytes_left\@:
1447 movb %al, (arg2 , %r11)
1448 add $1, %r11
1449 shr $8, %rax
1450 sub $1, %r13
1451 jne _less_than_8_bytes_left\@
1452 #############################
1453
1454 _multiple_of_16_bytes\@:
1455 mov arg7, %r12 # r12 = aadLen (number of bytes)
1456 shl $3, %r12 # convert into number of bits
1457 vmovd %r12d, %xmm15 # len(A) in xmm15
1458
1459 shl $3, arg4 # len(C) in bits (*8)
1460 vmovq arg4, %xmm1
1461 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1462 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1463
1464 vpxor %xmm15, %xmm14, %xmm14
1465 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1466 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1467
1468 mov arg5, %rax # rax = *Y0
1469 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1470
1471 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1472
1473 vpxor %xmm14, %xmm9, %xmm9
1474
1475
1476
1477 _return_T\@:
1478 mov arg8, %r10 # r10 = authTag
1479 mov arg9, %r11 # r11 = auth_tag_len
1480
1481 cmp $16, %r11
1482 je _T_16\@
1483
1484 cmp $8, %r11
1485 jl _T_4\@
1486
1487 _T_8\@:
1488 vmovq %xmm9, %rax
1489 mov %rax, (%r10)
1490 add $8, %r10
1491 sub $8, %r11
1492 vpsrldq $8, %xmm9, %xmm9
1493 cmp $0, %r11
1494 je _return_T_done\@
1495 _T_4\@:
1496 vmovd %xmm9, %eax
1497 mov %eax, (%r10)
1498 add $4, %r10
1499 sub $4, %r11
1500 vpsrldq $4, %xmm9, %xmm9
1501 cmp $0, %r11
1502 je _return_T_done\@
1503 _T_123\@:
1504 vmovd %xmm9, %eax
1505 cmp $2, %r11
1506 jl _T_1\@
1507 mov %ax, (%r10)
1508 cmp $2, %r11
1509 je _return_T_done\@
1510 add $2, %r10
1511 sar $16, %eax
1512 _T_1\@:
1513 mov %al, (%r10)
1514 jmp _return_T_done\@
1515
1516 _T_16\@:
1517 vmovdqu %xmm9, (%r10)
1518
1519 _return_T_done\@:
1520 mov %r14, %rsp
1521
1522 pop %r15
1523 pop %r14
1524 pop %r13
1525 pop %r12
1526 .endm
1527
1528
1529 #############################################################
1530 #void aesni_gcm_precomp_avx_gen2
1531 # (gcm_data *my_ctx_data,
1532 # u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1533 #############################################################
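# Note: per the GCM spec, the hash subkey passed in here is H = AES-ENC(K, 0^128),
# typically computed by the C glue code.  The routine converts it to
# HashKey = H<<1 mod poly and precomputes HashKey^2..HashKey^8 (plus the
# Karatsuba _k values) into the context for the 8-block parallel GHASH path.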
1534 ENTRY(aesni_gcm_precomp_avx_gen2)
1535 # the number of pushes (8 bytes each) must correspond to STACK_OFFSET
1536 push %r12
1537 push %r13
1538 push %r14
1539 push %r15
1540
1541 mov %rsp, %r14
1542
1543
1544
1545 sub $VARIABLE_OFFSET, %rsp
1546 and $~63, %rsp # align rsp to 64 bytes
1547
1548 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1549
1550 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1551 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1552 vmovdqa %xmm6, %xmm2
1553 vpsllq $1, %xmm6, %xmm6
1554 vpsrlq $63, %xmm2, %xmm2
1555 vmovdqa %xmm2, %xmm1
1556 vpslldq $8, %xmm2, %xmm2
1557 vpsrldq $8, %xmm1, %xmm1
1558 vpor %xmm2, %xmm6, %xmm6
1559 #reduction
1560 vpshufd $0b00100100, %xmm1, %xmm2
1561 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1562 vpand POLY(%rip), %xmm2, %xmm2
1563 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1564 #######################################################################
1565 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1566
1567
1568 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1569
1570 mov %r14, %rsp
1571
1572 pop %r15
1573 pop %r14
1574 pop %r13
1575 pop %r12
1576 ret
1577 ENDPROC(aesni_gcm_precomp_avx_gen2)
1578
1579 ###############################################################################
1580 #void aesni_gcm_enc_avx_gen2(
1581 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1582 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1583 # const u8 *in, /* Plaintext input */
1584 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1585 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1586 # (from Security Association) concatenated with 8 byte
1587 # Initialisation Vector (from IPSec ESP Payload)
1588 # concatenated with 0x00000001. 16-byte aligned pointer. */
1589 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1590 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1591 # u8 *auth_tag, /* Authenticated Tag output. */
1592 # u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1593 # Valid values are 16 (most likely), 12 or 8. */
1594 ###############################################################################
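# Illustrative call sequence from C (hypothetical buffer names; the real
# callers live in arch/x86/crypto/aesni-intel_glue.c):
#
#	aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);	/* derive HashKey powers */
#	aesni_gcm_enc_avx_gen2(ctx, dst, src, src_len,
#			       iv, aad, aad_len, tag, 16);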
1595 ENTRY(aesni_gcm_enc_avx_gen2)
1596 GCM_ENC_DEC_AVX ENC
1597 ret
1598 ENDPROC(aesni_gcm_enc_avx_gen2)
1599
1600 ###############################################################################
1601 #void aesni_gcm_dec_avx_gen2(
1602 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1603 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1604 # const u8 *in, /* Ciphertext input */
1605 # u64 plaintext_len, /* Length of data in Bytes for decryption. */
1606 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1607 # (from Security Association) concatenated with 8 byte
1608 # Initialisation Vector (from IPSec ESP Payload)
1609 # concatenated with 0x00000001. 16-byte aligned pointer. */
1610 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1611 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1612 # u8 *auth_tag, /* Authenticated Tag output. */
1613 # u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1614 # Valid values are 16 (most likely), 12 or 8. */
1615 ###############################################################################
1616 ENTRY(aesni_gcm_dec_avx_gen2)
1617 GCM_ENC_DEC_AVX DEC
1618 ret
1619 ENDPROC(aesni_gcm_dec_avx_gen2)
1620 #endif /* CONFIG_AS_AVX */
1621
1622 #ifdef CONFIG_AS_AVX2
1623 ###############################################################################
1624 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1625 # Input: A and B (128-bits each, bit-reflected)
1626 # Output: C = A*B*x mod poly, (i.e. >>1 )
1627 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1628 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1629 ###############################################################################
1630 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1631
1632 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1633 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1634 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1635 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1636 vpxor \T3, \GH, \GH
1637
1638
1639 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1640 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1641
1642 vpxor \T3, \T1, \T1
1643 vpxor \T2, \GH, \GH
1644
1645 #######################################################################
1646 #first phase of the reduction
1647 vmovdqa POLY2(%rip), \T3
1648
1649 vpclmulqdq $0x01, \GH, \T3, \T2
1650 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1651
1652 vpxor \T2, \GH, \GH # first phase of the reduction complete
1653 #######################################################################
1654 #second phase of the reduction
1655 vpclmulqdq $0x00, \GH, \T3, \T2
1656 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1657
1658 vpclmulqdq $0x10, \GH, \T3, \GH
1659 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1660
1661 vpxor \T2, \GH, \GH # second phase of the reduction complete
1662 #######################################################################
1663 vpxor \T1, \GH, \GH # the result is in GH
1664
1665
1666 .endm
1667
1668 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1669
1670 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1671 vmovdqa \HK, \T5
1672 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1673 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1674
1675 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1676 vmovdqa \T5, HashKey_3(arg1)
1677
1678 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1679 vmovdqa \T5, HashKey_4(arg1)
1680
1681 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1682 vmovdqa \T5, HashKey_5(arg1)
1683
1684 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1685 vmovdqa \T5, HashKey_6(arg1)
1686
1687 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1688 vmovdqa \T5, HashKey_7(arg1)
1689
1690 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1691 vmovdqa \T5, HashKey_8(arg1)
1692
1693 .endm
1694
1695
1696 ## if a = number of total plaintext bytes
1697 ## b = floor(a/16)
1698 ## num_initial_blocks = b mod 8
1699 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1700 ## r10, r11, r12, rax are clobbered
1701 ## arg1, arg2, arg3, r14 are used as pointers only, not modified
1702
1703 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1704 i = (8-\num_initial_blocks)
1705 j = 0
1706 setreg
1707
1708 mov arg6, %r10 # r10 = AAD
1709 mov arg7, %r12 # r12 = aadLen
1710
1711
1712 mov %r12, %r11
1713
1714 vpxor reg_j, reg_j, reg_j
1715 vpxor reg_i, reg_i, reg_i
1716
1717 cmp $16, %r11
1718 jl _get_AAD_rest8\@
1719 _get_AAD_blocks\@:
1720 vmovdqu (%r10), reg_i
1721 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1722 vpxor reg_i, reg_j, reg_j
1723 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1724 add $16, %r10
1725 sub $16, %r12
1726 sub $16, %r11
1727 cmp $16, %r11
1728 jge _get_AAD_blocks\@
1729 vmovdqu reg_j, reg_i
1730 cmp $0, %r11
1731 je _get_AAD_done\@
1732
1733 vpxor reg_i, reg_i, reg_i
1734
1735 /* read the last <16B of AAD. since we have at least 4B of
1736 data right after the AAD (the ICV, and maybe some CT), we can
1737 read 4B/8B blocks safely, and then get rid of the extra stuff */
1738 _get_AAD_rest8\@:
1739 cmp $4, %r11
1740 jle _get_AAD_rest4\@
1741 movq (%r10), \T1
1742 add $8, %r10
1743 sub $8, %r11
1744 vpslldq $8, \T1, \T1
1745 vpsrldq $8, reg_i, reg_i
1746 vpxor \T1, reg_i, reg_i
1747 jmp _get_AAD_rest8\@
1748 _get_AAD_rest4\@:
1749 cmp $0, %r11
1750 jle _get_AAD_rest0\@
1751 mov (%r10), %eax
1752 movq %rax, \T1
1753 add $4, %r10
1754 sub $4, %r11
1755 vpslldq $12, \T1, \T1
1756 vpsrldq $4, reg_i, reg_i
1757 vpxor \T1, reg_i, reg_i
1758 _get_AAD_rest0\@:
1759 /* finalize: shift out the extra bytes we read, and align
1760 left. since pslldq can only shift by an immediate, we use
1761 vpshufb and an array of shuffle masks */
1762 movq %r12, %r11
1763 salq $4, %r11
1764 movdqu aad_shift_arr(%r11), \T1
1765 vpshufb \T1, reg_i, reg_i
1766 _get_AAD_rest_final\@:
1767 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1768 vpxor reg_j, reg_i, reg_i
1769 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1770
1771 _get_AAD_done\@:
1772 # initialize the data pointer offset as zero
1773 xor %r11, %r11
1774
1775 # start AES for num_initial_blocks blocks
1776 mov arg5, %rax # rax = *Y0
1777 vmovdqu (%rax), \CTR # CTR = Y0
1778 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1779
1780
1781 i = (9-\num_initial_blocks)
1782 setreg
1783 .rep \num_initial_blocks
1784 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1785 vmovdqa \CTR, reg_i
1786 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1787 i = (i+1)
1788 setreg
1789 .endr
1790
1791 vmovdqa (arg1), \T_key
1792 i = (9-\num_initial_blocks)
1793 setreg
1794 .rep \num_initial_blocks
1795 vpxor \T_key, reg_i, reg_i
1796 i = (i+1)
1797 setreg
1798 .endr
1799
1800 j = 1
1801 setreg
1802 .rep 9
1803 vmovdqa 16*j(arg1), \T_key
1804 i = (9-\num_initial_blocks)
1805 setreg
1806 .rep \num_initial_blocks
1807 vaesenc \T_key, reg_i, reg_i
1808 i = (i+1)
1809 setreg
1810 .endr
1811
1812 j = (j+1)
1813 setreg
1814 .endr
1815
1816
1817 vmovdqa 16*10(arg1), \T_key
1818 i = (9-\num_initial_blocks)
1819 setreg
1820 .rep \num_initial_blocks
1821 vaesenclast \T_key, reg_i, reg_i
1822 i = (i+1)
1823 setreg
1824 .endr
1825
1826 i = (9-\num_initial_blocks)
1827 setreg
1828 .rep \num_initial_blocks
1829 vmovdqu (arg3, %r11), \T1
1830 vpxor \T1, reg_i, reg_i
1831 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1832 # num_initial_blocks blocks
1833 add $16, %r11
1834 .if \ENC_DEC == DEC
1835 vmovdqa \T1, reg_i
1836 .endif
1837 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1838 i = (i+1)
1839 setreg
1840 .endr
1841
1842
1843 i = (8-\num_initial_blocks)
1844 j = (9-\num_initial_blocks)
1845 setreg
1846
1847 .rep \num_initial_blocks
1848 vpxor reg_i, reg_j, reg_j
1849 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1850 i = (i+1)
1851 j = (j+1)
1852 setreg
1853 .endr
1854 # XMM8 has the combined result here
1855
1856 vmovdqa \XMM8, TMP1(%rsp)
1857 vmovdqa \XMM8, \T3
1858
1859 cmp $128, %r13
1860 jl _initial_blocks_done\@ # no need for precomputed constants
1861
1862 ###############################################################################
1863 # prepare and encrypt 8 counter blocks; their ciphertext feeds the first pass of the 8-block parallel loop
1864 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1865 vmovdqa \CTR, \XMM1
1866 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1867
1868 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1869 vmovdqa \CTR, \XMM2
1870 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1871
1872 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1873 vmovdqa \CTR, \XMM3
1874 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1875
1876 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1877 vmovdqa \CTR, \XMM4
1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1879
1880 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1881 vmovdqa \CTR, \XMM5
1882 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1883
1884 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1885 vmovdqa \CTR, \XMM6
1886 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1887
1888 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1889 vmovdqa \CTR, \XMM7
1890 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1891
1892 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1893 vmovdqa \CTR, \XMM8
1894 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1895
1896 vmovdqa (arg1), \T_key
1897 vpxor \T_key, \XMM1, \XMM1
1898 vpxor \T_key, \XMM2, \XMM2
1899 vpxor \T_key, \XMM3, \XMM3
1900 vpxor \T_key, \XMM4, \XMM4
1901 vpxor \T_key, \XMM5, \XMM5
1902 vpxor \T_key, \XMM6, \XMM6
1903 vpxor \T_key, \XMM7, \XMM7
1904 vpxor \T_key, \XMM8, \XMM8
1905
1906 i = 1
1907 setreg
1908 .rep 9 # do 9 rounds
1909 vmovdqa 16*i(arg1), \T_key
1910 vaesenc \T_key, \XMM1, \XMM1
1911 vaesenc \T_key, \XMM2, \XMM2
1912 vaesenc \T_key, \XMM3, \XMM3
1913 vaesenc \T_key, \XMM4, \XMM4
1914 vaesenc \T_key, \XMM5, \XMM5
1915 vaesenc \T_key, \XMM6, \XMM6
1916 vaesenc \T_key, \XMM7, \XMM7
1917 vaesenc \T_key, \XMM8, \XMM8
1918 i = (i+1)
1919 setreg
1920 .endr
1921
1922
1923 vmovdqa 16*i(arg1), \T_key
1924 vaesenclast \T_key, \XMM1, \XMM1
1925 vaesenclast \T_key, \XMM2, \XMM2
1926 vaesenclast \T_key, \XMM3, \XMM3
1927 vaesenclast \T_key, \XMM4, \XMM4
1928 vaesenclast \T_key, \XMM5, \XMM5
1929 vaesenclast \T_key, \XMM6, \XMM6
1930 vaesenclast \T_key, \XMM7, \XMM7
1931 vaesenclast \T_key, \XMM8, \XMM8
1932
1933 vmovdqu (arg3, %r11), \T1
1934 vpxor \T1, \XMM1, \XMM1
1935 vmovdqu \XMM1, (arg2 , %r11)
1936 .if \ENC_DEC == DEC
1937 vmovdqa \T1, \XMM1
1938 .endif
1939
1940 vmovdqu 16*1(arg3, %r11), \T1
1941 vpxor \T1, \XMM2, \XMM2
1942 vmovdqu \XMM2, 16*1(arg2 , %r11)
1943 .if \ENC_DEC == DEC
1944 vmovdqa \T1, \XMM2
1945 .endif
1946
1947 vmovdqu 16*2(arg3, %r11), \T1
1948 vpxor \T1, \XMM3, \XMM3
1949 vmovdqu \XMM3, 16*2(arg2 , %r11)
1950 .if \ENC_DEC == DEC
1951 vmovdqa \T1, \XMM3
1952 .endif
1953
1954 vmovdqu 16*3(arg3, %r11), \T1
1955 vpxor \T1, \XMM4, \XMM4
1956 vmovdqu \XMM4, 16*3(arg2 , %r11)
1957 .if \ENC_DEC == DEC
1958 vmovdqa \T1, \XMM4
1959 .endif
1960
1961 vmovdqu 16*4(arg3, %r11), \T1
1962 vpxor \T1, \XMM5, \XMM5
1963 vmovdqu \XMM5, 16*4(arg2 , %r11)
1964 .if \ENC_DEC == DEC
1965 vmovdqa \T1, \XMM5
1966 .endif
1967
1968 vmovdqu 16*5(arg3, %r11), \T1
1969 vpxor \T1, \XMM6, \XMM6
1970 vmovdqu \XMM6, 16*5(arg2 , %r11)
1971 .if \ENC_DEC == DEC
1972 vmovdqa \T1, \XMM6
1973 .endif
1974
1975 vmovdqu 16*6(arg3, %r11), \T1
1976 vpxor \T1, \XMM7, \XMM7
1977 vmovdqu \XMM7, 16*6(arg2 , %r11)
1978 .if \ENC_DEC == DEC
1979 vmovdqa \T1, \XMM7
1980 .endif
1981
1982 vmovdqu 16*7(arg3, %r11), \T1
1983 vpxor \T1, \XMM8, \XMM8
1984 vmovdqu \XMM8, 16*7(arg2 , %r11)
1985 .if \ENC_DEC == DEC
1986 vmovdqa \T1, \XMM8
1987 .endif
1988
1989 add $128, %r11
1990
1991 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1992 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1993 # the corresponding ciphertext
1994 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1995 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1996 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1997 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1998 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1999 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2000 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2001
2002 ###############################################################################
2003
2004 _initial_blocks_done\@:
2005
2006
2007 .endm
2008
2009
2010
2011 # encrypt 8 blocks at a time
2012 # ghash the 8 previously encrypted ciphertext blocks
2013 # arg1, arg2, arg3 are used as pointers only, not modified
2014 # r11 is the data offset value
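#
# Rough shape of the interleaving below (illustrative): the eight counter
# blocks for this iteration run through the AES rounds while the eight
# ciphertext blocks saved from the previous iteration (T2 and TMP2..TMP8)
# are multiplied by HashKey_8..HashKey_1 with vpclmulqdq, accumulating the
# high products in T4, the low products in T7 and the middle products in
# T6, so the AES and GHASH work hide each other's latencies.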
2015 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2016
2017 vmovdqa \XMM1, \T2
2018 vmovdqa \XMM2, TMP2(%rsp)
2019 vmovdqa \XMM3, TMP3(%rsp)
2020 vmovdqa \XMM4, TMP4(%rsp)
2021 vmovdqa \XMM5, TMP5(%rsp)
2022 vmovdqa \XMM6, TMP6(%rsp)
2023 vmovdqa \XMM7, TMP7(%rsp)
2024 vmovdqa \XMM8, TMP8(%rsp)
2025
2026 .if \loop_idx == in_order
2027 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2028 vpaddd ONE(%rip), \XMM1, \XMM2
2029 vpaddd ONE(%rip), \XMM2, \XMM3
2030 vpaddd ONE(%rip), \XMM3, \XMM4
2031 vpaddd ONE(%rip), \XMM4, \XMM5
2032 vpaddd ONE(%rip), \XMM5, \XMM6
2033 vpaddd ONE(%rip), \XMM6, \XMM7
2034 vpaddd ONE(%rip), \XMM7, \XMM8
2035 vmovdqa \XMM8, \CTR
2036
2037 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2038 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2039 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2040 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2041 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2042 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2043 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2044 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2045 .else
2046 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2047 vpaddd ONEf(%rip), \XMM1, \XMM2
2048 vpaddd ONEf(%rip), \XMM2, \XMM3
2049 vpaddd ONEf(%rip), \XMM3, \XMM4
2050 vpaddd ONEf(%rip), \XMM4, \XMM5
2051 vpaddd ONEf(%rip), \XMM5, \XMM6
2052 vpaddd ONEf(%rip), \XMM6, \XMM7
2053 vpaddd ONEf(%rip), \XMM7, \XMM8
2054 vmovdqa \XMM8, \CTR
2055 .endif
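# (illustrative reading of the two branches above: the in_order path adds
# ONE and then byte-swaps each counter block, while the other path adds the
# ONEf constant directly to the already byte-swapped counters and so skips
# the eight vpshufb instructions)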
2056
2057
2058 #######################################################################
2059
2060 vmovdqu (arg1), \T1
2061 vpxor \T1, \XMM1, \XMM1
2062 vpxor \T1, \XMM2, \XMM2
2063 vpxor \T1, \XMM3, \XMM3
2064 vpxor \T1, \XMM4, \XMM4
2065 vpxor \T1, \XMM5, \XMM5
2066 vpxor \T1, \XMM6, \XMM6
2067 vpxor \T1, \XMM7, \XMM7
2068 vpxor \T1, \XMM8, \XMM8
2069
2070 #######################################################################
2071
2072
2073
2074
2075
2076 vmovdqu 16*1(arg1), \T1
2077 vaesenc \T1, \XMM1, \XMM1
2078 vaesenc \T1, \XMM2, \XMM2
2079 vaesenc \T1, \XMM3, \XMM3
2080 vaesenc \T1, \XMM4, \XMM4
2081 vaesenc \T1, \XMM5, \XMM5
2082 vaesenc \T1, \XMM6, \XMM6
2083 vaesenc \T1, \XMM7, \XMM7
2084 vaesenc \T1, \XMM8, \XMM8
2085
2086 vmovdqu 16*2(arg1), \T1
2087 vaesenc \T1, \XMM1, \XMM1
2088 vaesenc \T1, \XMM2, \XMM2
2089 vaesenc \T1, \XMM3, \XMM3
2090 vaesenc \T1, \XMM4, \XMM4
2091 vaesenc \T1, \XMM5, \XMM5
2092 vaesenc \T1, \XMM6, \XMM6
2093 vaesenc \T1, \XMM7, \XMM7
2094 vaesenc \T1, \XMM8, \XMM8
2095
2096
2097 #######################################################################
2098
2099 vmovdqa HashKey_8(arg1), \T5
2100 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2101 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2102 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2103 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2104 vpxor \T5, \T6, \T6
2105
2106 vmovdqu 16*3(arg1), \T1
2107 vaesenc \T1, \XMM1, \XMM1
2108 vaesenc \T1, \XMM2, \XMM2
2109 vaesenc \T1, \XMM3, \XMM3
2110 vaesenc \T1, \XMM4, \XMM4
2111 vaesenc \T1, \XMM5, \XMM5
2112 vaesenc \T1, \XMM6, \XMM6
2113 vaesenc \T1, \XMM7, \XMM7
2114 vaesenc \T1, \XMM8, \XMM8
2115
2116 vmovdqa TMP2(%rsp), \T1
2117 vmovdqa HashKey_7(arg1), \T5
2118 vpclmulqdq $0x11, \T5, \T1, \T3
2119 vpxor \T3, \T4, \T4
2120
2121 vpclmulqdq $0x00, \T5, \T1, \T3
2122 vpxor \T3, \T7, \T7
2123
2124 vpclmulqdq $0x01, \T5, \T1, \T3
2125 vpxor \T3, \T6, \T6
2126
2127 vpclmulqdq $0x10, \T5, \T1, \T3
2128 vpxor \T3, \T6, \T6
2129
2130 vmovdqu 16*4(arg1), \T1
2131 vaesenc \T1, \XMM1, \XMM1
2132 vaesenc \T1, \XMM2, \XMM2
2133 vaesenc \T1, \XMM3, \XMM3
2134 vaesenc \T1, \XMM4, \XMM4
2135 vaesenc \T1, \XMM5, \XMM5
2136 vaesenc \T1, \XMM6, \XMM6
2137 vaesenc \T1, \XMM7, \XMM7
2138 vaesenc \T1, \XMM8, \XMM8
2139
2140 #######################################################################
2141
2142 vmovdqa TMP3(%rsp), \T1
2143 vmovdqa HashKey_6(arg1), \T5
2144 vpclmulqdq $0x11, \T5, \T1, \T3
2145 vpxor \T3, \T4, \T4
2146
2147 vpclmulqdq $0x00, \T5, \T1, \T3
2148 vpxor \T3, \T7, \T7
2149
2150 vpclmulqdq $0x01, \T5, \T1, \T3
2151 vpxor \T3, \T6, \T6
2152
2153 vpclmulqdq $0x10, \T5, \T1, \T3
2154 vpxor \T3, \T6, \T6
2155
2156 vmovdqu 16*5(arg1), \T1
2157 vaesenc \T1, \XMM1, \XMM1
2158 vaesenc \T1, \XMM2, \XMM2
2159 vaesenc \T1, \XMM3, \XMM3
2160 vaesenc \T1, \XMM4, \XMM4
2161 vaesenc \T1, \XMM5, \XMM5
2162 vaesenc \T1, \XMM6, \XMM6
2163 vaesenc \T1, \XMM7, \XMM7
2164 vaesenc \T1, \XMM8, \XMM8
2165
2166 vmovdqa TMP4(%rsp), \T1
2167 vmovdqa HashKey_5(arg1), \T5
2168 vpclmulqdq $0x11, \T5, \T1, \T3
2169 vpxor \T3, \T4, \T4
2170
2171 vpclmulqdq $0x00, \T5, \T1, \T3
2172 vpxor \T3, \T7, \T7
2173
2174 vpclmulqdq $0x01, \T5, \T1, \T3
2175 vpxor \T3, \T6, \T6
2176
2177 vpclmulqdq $0x10, \T5, \T1, \T3
2178 vpxor \T3, \T6, \T6
2179
2180 vmovdqu 16*6(arg1), \T1
2181 vaesenc \T1, \XMM1, \XMM1
2182 vaesenc \T1, \XMM2, \XMM2
2183 vaesenc \T1, \XMM3, \XMM3
2184 vaesenc \T1, \XMM4, \XMM4
2185 vaesenc \T1, \XMM5, \XMM5
2186 vaesenc \T1, \XMM6, \XMM6
2187 vaesenc \T1, \XMM7, \XMM7
2188 vaesenc \T1, \XMM8, \XMM8
2189
2190
2191 vmovdqa TMP5(%rsp), \T1
2192 vmovdqa HashKey_4(arg1), \T5
2193 vpclmulqdq $0x11, \T5, \T1, \T3
2194 vpxor \T3, \T4, \T4
2195
2196 vpclmulqdq $0x00, \T5, \T1, \T3
2197 vpxor \T3, \T7, \T7
2198
2199 vpclmulqdq $0x01, \T5, \T1, \T3
2200 vpxor \T3, \T6, \T6
2201
2202 vpclmulqdq $0x10, \T5, \T1, \T3
2203 vpxor \T3, \T6, \T6
2204
2205 vmovdqu 16*7(arg1), \T1
2206 vaesenc \T1, \XMM1, \XMM1
2207 vaesenc \T1, \XMM2, \XMM2
2208 vaesenc \T1, \XMM3, \XMM3
2209 vaesenc \T1, \XMM4, \XMM4
2210 vaesenc \T1, \XMM5, \XMM5
2211 vaesenc \T1, \XMM6, \XMM6
2212 vaesenc \T1, \XMM7, \XMM7
2213 vaesenc \T1, \XMM8, \XMM8
2214
2215 vmovdqa TMP6(%rsp), \T1
2216 vmovdqa HashKey_3(arg1), \T5
2217 vpclmulqdq $0x11, \T5, \T1, \T3
2218 vpxor \T3, \T4, \T4
2219
2220 vpclmulqdq $0x00, \T5, \T1, \T3
2221 vpxor \T3, \T7, \T7
2222
2223 vpclmulqdq $0x01, \T5, \T1, \T3
2224 vpxor \T3, \T6, \T6
2225
2226 vpclmulqdq $0x10, \T5, \T1, \T3
2227 vpxor \T3, \T6, \T6
2228
2229 vmovdqu 16*8(arg1), \T1
2230 vaesenc \T1, \XMM1, \XMM1
2231 vaesenc \T1, \XMM2, \XMM2
2232 vaesenc \T1, \XMM3, \XMM3
2233 vaesenc \T1, \XMM4, \XMM4
2234 vaesenc \T1, \XMM5, \XMM5
2235 vaesenc \T1, \XMM6, \XMM6
2236 vaesenc \T1, \XMM7, \XMM7
2237 vaesenc \T1, \XMM8, \XMM8
2238
2239 vmovdqa TMP7(%rsp), \T1
2240 vmovdqa HashKey_2(arg1), \T5
2241 vpclmulqdq $0x11, \T5, \T1, \T3
2242 vpxor \T3, \T4, \T4
2243
2244 vpclmulqdq $0x00, \T5, \T1, \T3
2245 vpxor \T3, \T7, \T7
2246
2247 vpclmulqdq $0x01, \T5, \T1, \T3
2248 vpxor \T3, \T6, \T6
2249
2250 vpclmulqdq $0x10, \T5, \T1, \T3
2251 vpxor \T3, \T6, \T6
2252
2253
2254 #######################################################################
2255
2256 vmovdqu 16*9(arg1), \T5
2257 vaesenc \T5, \XMM1, \XMM1
2258 vaesenc \T5, \XMM2, \XMM2
2259 vaesenc \T5, \XMM3, \XMM3
2260 vaesenc \T5, \XMM4, \XMM4
2261 vaesenc \T5, \XMM5, \XMM5
2262 vaesenc \T5, \XMM6, \XMM6
2263 vaesenc \T5, \XMM7, \XMM7
2264 vaesenc \T5, \XMM8, \XMM8
2265
2266 vmovdqa TMP8(%rsp), \T1
2267 vmovdqa HashKey(arg1), \T5
2268
2269 vpclmulqdq $0x00, \T5, \T1, \T3
2270 vpxor \T3, \T7, \T7
2271
2272 vpclmulqdq $0x01, \T5, \T1, \T3
2273 vpxor \T3, \T6, \T6
2274
2275 vpclmulqdq $0x10, \T5, \T1, \T3
2276 vpxor \T3, \T6, \T6
2277
2278 vpclmulqdq $0x11, \T5, \T1, \T3
2279 vpxor \T3, \T4, \T1
2280
2281
2282 vmovdqu 16*10(arg1), \T5
2283
2284 i = 0
2285 j = 1
2286 setreg
2287 .rep 8
2288 vpxor 16*i(arg3, %r11), \T5, \T2
2289 .if \ENC_DEC == ENC
2290 vaesenclast \T2, reg_j, reg_j
2291 .else
2292 vaesenclast \T2, reg_j, \T3
2293 vmovdqu 16*i(arg3, %r11), reg_j
2294 vmovdqu \T3, 16*i(arg2, %r11)
2295 .endif
2296 i = (i+1)
2297 j = (j+1)
2298 setreg
2299 .endr
2300 #######################################################################
2301
2302
2303 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2304 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2305 vpxor \T3, \T7, \T7
2306 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2307
2308
2309
2310 #######################################################################
2311 #first phase of the reduction
2312 vmovdqa POLY2(%rip), \T3
2313
2314 vpclmulqdq $0x01, \T7, \T3, \T2
2315 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2316
2317 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2318 #######################################################################
2319 .if \ENC_DEC == ENC
2320 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2321 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2322 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2323 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2324 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2325 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2326 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2327 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2328 .endif
2329
2330 #######################################################################
2331 #second phase of the reduction
2332 vpclmulqdq $0x00, \T7, \T3, \T2
2333 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2334
2335 vpclmulqdq $0x10, \T7, \T3, \T4
2336 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2337
2338 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2339 #######################################################################
2340 vpxor \T4, \T1, \T1 # the result is in T1
2341
2342 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2343 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2344 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2345 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2346 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2347 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2348 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2349 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2350
2351
2352 vpxor \T1, \XMM1, \XMM1
2353
2354
2355
2356 .endm
2357
2358
2359 # GHASH the last 8 ciphertext blocks.
2360 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2361
2362 ## Karatsuba Method
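##
## Illustrative identity (one-level Karatsuba over GF(2), as used below):
## split each 128-bit operand as A = A1*x^64 + A0 and B = B1*x^64 + B0, then
##
##      A*B = A1*B1*x^128
##          + [ (A1 xor A0)*(B1 xor B0) xor A1*B1 xor A0*B0 ]*x^64
##          + A0*B0
##
## so each block needs only three vpclmulqdq instructions; the vpshufd/vpxor
## pairs below form the (A1 xor A0) and (B1 xor B0) terms.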
2363
2364 vmovdqa HashKey_8(arg1), \T5
2365
2366 vpshufd $0b01001110, \XMM1, \T2
2367 vpshufd $0b01001110, \T5, \T3
2368 vpxor \XMM1, \T2, \T2
2369 vpxor \T5, \T3, \T3
2370
2371 vpclmulqdq $0x11, \T5, \XMM1, \T6
2372 vpclmulqdq $0x00, \T5, \XMM1, \T7
2373
2374 vpclmulqdq $0x00, \T3, \T2, \XMM1
2375
2376 ######################
2377
2378 vmovdqa HashKey_7(arg1), \T5
2379 vpshufd $0b01001110, \XMM2, \T2
2380 vpshufd $0b01001110, \T5, \T3
2381 vpxor \XMM2, \T2, \T2
2382 vpxor \T5, \T3, \T3
2383
2384 vpclmulqdq $0x11, \T5, \XMM2, \T4
2385 vpxor \T4, \T6, \T6
2386
2387 vpclmulqdq $0x00, \T5, \XMM2, \T4
2388 vpxor \T4, \T7, \T7
2389
2390 vpclmulqdq $0x00, \T3, \T2, \T2
2391
2392 vpxor \T2, \XMM1, \XMM1
2393
2394 ######################
2395
2396 vmovdqa HashKey_6(arg1), \T5
2397 vpshufd $0b01001110, \XMM3, \T2
2398 vpshufd $0b01001110, \T5, \T3
2399 vpxor \XMM3, \T2, \T2
2400 vpxor \T5, \T3, \T3
2401
2402 vpclmulqdq $0x11, \T5, \XMM3, \T4
2403 vpxor \T4, \T6, \T6
2404
2405 vpclmulqdq $0x00, \T5, \XMM3, \T4
2406 vpxor \T4, \T7, \T7
2407
2408 vpclmulqdq $0x00, \T3, \T2, \T2
2409
2410 vpxor \T2, \XMM1, \XMM1
2411
2412 ######################
2413
2414 vmovdqa HashKey_5(arg1), \T5
2415 vpshufd $0b01001110, \XMM4, \T2
2416 vpshufd $0b01001110, \T5, \T3
2417 vpxor \XMM4, \T2, \T2
2418 vpxor \T5, \T3, \T3
2419
2420 vpclmulqdq $0x11, \T5, \XMM4, \T4
2421 vpxor \T4, \T6, \T6
2422
2423 vpclmulqdq $0x00, \T5, \XMM4, \T4
2424 vpxor \T4, \T7, \T7
2425
2426 vpclmulqdq $0x00, \T3, \T2, \T2
2427
2428 vpxor \T2, \XMM1, \XMM1
2429
2430 ######################
2431
2432 vmovdqa HashKey_4(arg1), \T5
2433 vpshufd $0b01001110, \XMM5, \T2
2434 vpshufd $0b01001110, \T5, \T3
2435 vpxor \XMM5, \T2, \T2
2436 vpxor \T5, \T3, \T3
2437
2438 vpclmulqdq $0x11, \T5, \XMM5, \T4
2439 vpxor \T4, \T6, \T6
2440
2441 vpclmulqdq $0x00, \T5, \XMM5, \T4
2442 vpxor \T4, \T7, \T7
2443
2444 vpclmulqdq $0x00, \T3, \T2, \T2
2445
2446 vpxor \T2, \XMM1, \XMM1
2447
2448 ######################
2449
2450 vmovdqa HashKey_3(arg1), \T5
2451 vpshufd $0b01001110, \XMM6, \T2
2452 vpshufd $0b01001110, \T5, \T3
2453 vpxor \XMM6, \T2, \T2
2454 vpxor \T5, \T3, \T3
2455
2456 vpclmulqdq $0x11, \T5, \XMM6, \T4
2457 vpxor \T4, \T6, \T6
2458
2459 vpclmulqdq $0x00, \T5, \XMM6, \T4
2460 vpxor \T4, \T7, \T7
2461
2462 vpclmulqdq $0x00, \T3, \T2, \T2
2463
2464 vpxor \T2, \XMM1, \XMM1
2465
2466 ######################
2467
2468 vmovdqa HashKey_2(arg1), \T5
2469 vpshufd $0b01001110, \XMM7, \T2
2470 vpshufd $0b01001110, \T5, \T3
2471 vpxor \XMM7, \T2, \T2
2472 vpxor \T5, \T3, \T3
2473
2474 vpclmulqdq $0x11, \T5, \XMM7, \T4
2475 vpxor \T4, \T6, \T6
2476
2477 vpclmulqdq $0x00, \T5, \XMM7, \T4
2478 vpxor \T4, \T7, \T7
2479
2480 vpclmulqdq $0x00, \T3, \T2, \T2
2481
2482 vpxor \T2, \XMM1, \XMM1
2483
2484 ######################
2485
2486 vmovdqa HashKey(arg1), \T5
2487 vpshufd $0b01001110, \XMM8, \T2
2488 vpshufd $0b01001110, \T5, \T3
2489 vpxor \XMM8, \T2, \T2
2490 vpxor \T5, \T3, \T3
2491
2492 vpclmulqdq $0x11, \T5, \XMM8, \T4
2493 vpxor \T4, \T6, \T6
2494
2495 vpclmulqdq $0x00, \T5, \XMM8, \T4
2496 vpxor \T4, \T7, \T7
2497
2498 vpclmulqdq $0x00, \T3, \T2, \T2
2499
2500 vpxor \T2, \XMM1, \XMM1
2501 vpxor \T6, \XMM1, \XMM1
2502 vpxor \T7, \XMM1, \T2
2503
2504
2505
2506
2507 vpslldq $8, \T2, \T4
2508 vpsrldq $8, \T2, \T2
2509
2510 vpxor \T4, \T7, \T7
2511 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2512 # accumulated carry-less multiplications
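
# What follows is the modular reduction (a sketch in words, for orientation):
# the 256-bit carry-less product in <T6:T7> is reduced modulo the GHASH
# polynomial x^128 + x^7 + x^2 + x + 1, kept here in the shifted form that
# matches the "<<1 mod poly" HashKey representation (hence the POLY2
# constant). Two vpclmulqdq/shift phases fold T7 into T6, leaving the
# reduced 128-bit GHASH value in T6.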
2513
2514 #######################################################################
2515 #first phase of the reduction
2516 vmovdqa POLY2(%rip), \T3
2517
2518 vpclmulqdq $0x01, \T7, \T3, \T2
2519 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2520
2521 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2522 #######################################################################
2523
2524
2525 #second phase of the reduction
2526 vpclmulqdq $0x00, \T7, \T3, \T2
2527 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2528
2529 vpclmulqdq $0x10, \T7, \T3, \T4
2530 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2531
2532 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2533 #######################################################################
2534 vpxor \T4, \T6, \T6 # the result is in T6
2535 .endm
2536
2537
2538
2539 # combined for GCM encrypt and decrypt functions
2540 # clobbering all xmm registers
2541 # clobbering r10, r11, r12, r13, r14, r15
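#
# High-level flow (illustrative summary of the code below):
#   1. INITIAL_BLOCKS_AVX2             - hash the AAD and handle the first
#                                        (full-block count mod 8) blocks
#   2. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 - main loop, 8 blocks per iteration
#   3. GHASH_LAST_8_AVX2               - fold the last 8 ciphertext blocks
#                                        into the hash
#   4. partial-block path              - encrypt/decrypt and hash the final
#                                        (len mod 16) bytes
#   5. hash len(A)||len(C), encrypt Y0 and XOR to form the tag, then copy
#      auth_tag_len bytes of it to the caller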
2542 .macro GCM_ENC_DEC_AVX2 ENC_DEC
2543
2544 #the number of pushes must equal STACK_OFFSET
2545 push %r12
2546 push %r13
2547 push %r14
2548 push %r15
2549
2550 mov %rsp, %r14
2551
2552
2553
2554
2555 sub $VARIABLE_OFFSET, %rsp
2556 and $~63, %rsp # align rsp to 64 bytes
2557
2558
2559 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
2560
2561 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
2562 and $-16, %r13 # r13 = r13 - (r13 mod 16)
2563
2564 mov %r13, %r12
2565 shr $4, %r12
2566 and $7, %r12
2567 jz _initial_num_blocks_is_0\@
2568
2569 cmp $7, %r12
2570 je _initial_num_blocks_is_7\@
2571 cmp $6, %r12
2572 je _initial_num_blocks_is_6\@
2573 cmp $5, %r12
2574 je _initial_num_blocks_is_5\@
2575 cmp $4, %r12
2576 je _initial_num_blocks_is_4\@
2577 cmp $3, %r12
2578 je _initial_num_blocks_is_3\@
2579 cmp $2, %r12
2580 je _initial_num_blocks_is_2\@
2581
2582 jmp _initial_num_blocks_is_1\@
2583
2584 _initial_num_blocks_is_7\@:
2585 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2586 sub $16*7, %r13
2587 jmp _initial_blocks_encrypted\@
2588
2589 _initial_num_blocks_is_6\@:
2590 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2591 sub $16*6, %r13
2592 jmp _initial_blocks_encrypted\@
2593
2594 _initial_num_blocks_is_5\@:
2595 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2596 sub $16*5, %r13
2597 jmp _initial_blocks_encrypted\@
2598
2599 _initial_num_blocks_is_4\@:
2600 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2601 sub $16*4, %r13
2602 jmp _initial_blocks_encrypted\@
2603
2604 _initial_num_blocks_is_3\@:
2605 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2606 sub $16*3, %r13
2607 jmp _initial_blocks_encrypted\@
2608
2609 _initial_num_blocks_is_2\@:
2610 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2611 sub $16*2, %r13
2612 jmp _initial_blocks_encrypted\@
2613
2614 _initial_num_blocks_is_1\@:
2615 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2616 sub $16*1, %r13
2617 jmp _initial_blocks_encrypted\@
2618
2619 _initial_num_blocks_is_0\@:
2620 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2621
2622
2623 _initial_blocks_encrypted\@:
2624 cmp $0, %r13
2625 je _zero_cipher_left\@
2626
2627 sub $128, %r13
2628 je _eight_cipher_left\@
2629
2630
2631
2632
2633 vmovd %xmm9, %r15d
2634 and $255, %r15d
2635 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2636
2637
2638 _encrypt_by_8_new\@:
2639 cmp $(255-8), %r15d
2640 jg _encrypt_by_8\@
2641
2642
2643
2644 add $8, %r15b
2645 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2646 add $128, %r11
2647 sub $128, %r13
2648 jne _encrypt_by_8_new\@
2649
2650 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2651 jmp _eight_cipher_left\@
2652
2653 _encrypt_by_8\@:
2654 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2655 add $8, %r15b
2656 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2657 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2658 add $128, %r11
2659 sub $128, %r13
2660 jne _encrypt_by_8_new\@
2661
2662 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2663
2664
2665
2666
2667 _eight_cipher_left\@:
2668 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2669
2670
2671 _zero_cipher_left\@:
2672 cmp $16, arg4
2673 jl _only_less_than_16\@
2674
2675 mov arg4, %r13
2676 and $15, %r13 # r13 = (arg4 mod 16)
2677
2678 je _multiple_of_16_bytes\@
2679
2680 # handle the last <16 Byte block separately
2681
2682
2683 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2684 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2685 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2686
2687 sub $16, %r11
2688 add %r13, %r11
2689 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
2690
2691 lea SHIFT_MASK+16(%rip), %r12
2692 sub %r13, %r12 # adjust the shuffle mask pointer
2693 # to be able to shift 16-r13 bytes
2694 # (r13 is the number of bytes in plaintext mod 16)
2695 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
2696 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
2697 jmp _final_ghash_mul\@
2698
2699 _only_less_than_16\@:
2700 # check for 0 length
2701 mov arg4, %r13
2702 and $15, %r13 # r13 = (arg4 mod 16)
2703
2704 je _multiple_of_16_bytes\@
2705
2706 # handle the last <16 Byte block separately
2707
2708
2709 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2710 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2711 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2712
2713
2714 lea SHIFT_MASK+16(%rip), %r12
2715 sub %r13, %r12 # adjust the shuffle mask pointer to be
2716 # able to shift 16-r13 bytes (r13 is the
2717 # number of bytes in plaintext mod 16)
2718
2719 _get_last_16_byte_loop\@:
2720 movb (arg3, %r11), %al
2721 movb %al, TMP1 (%rsp , %r11)
2722 add $1, %r11
2723 cmp %r13, %r11
2724 jne _get_last_16_byte_loop\@
2725
2726 vmovdqu TMP1(%rsp), %xmm1
2727
2728 sub $16, %r11
2729
2730 _final_ghash_mul\@:
2731 .if \ENC_DEC == DEC
2732 vmovdqa %xmm1, %xmm2
2733 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2734 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2735 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2736 vpand %xmm1, %xmm2, %xmm2
2737 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2738 vpxor %xmm2, %xmm14, %xmm14
2739 #GHASH computation for the last <16 Byte block
2740 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2741 sub %r13, %r11
2742 add $16, %r11
2743 .else
2744 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2745 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2746 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2747 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2748 vpxor %xmm9, %xmm14, %xmm14
2749 #GHASH computation for the last <16 Byte block
2750 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2751 sub %r13, %r11
2752 add $16, %r11
2753 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
2754 .endif
2755
2756
2757 #############################
2758 # output r13 Bytes
2759 vmovq %xmm9, %rax
2760 cmp $8, %r13
2761 jle _less_than_8_bytes_left\@
2762
2763 mov %rax, (arg2 , %r11)
2764 add $8, %r11
2765 vpsrldq $8, %xmm9, %xmm9
2766 vmovq %xmm9, %rax
2767 sub $8, %r13
2768
2769 _less_than_8_bytes_left\@:
2770 movb %al, (arg2 , %r11)
2771 add $1, %r11
2772 shr $8, %rax
2773 sub $1, %r13
2774 jne _less_than_8_bytes_left\@
2775 #############################
2776
2777 _multiple_of_16_bytes\@:
2778 mov arg7, %r12 # r12 = aadLen (number of bytes)
2779 shl $3, %r12 # convert into number of bits
2780 vmovd %r12d, %xmm15 # len(A) in xmm15
2781
2782 shl $3, arg4 # len(C) in bits (*8)
2783 vmovq arg4, %xmm1
2784 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
2785 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
2786
2787 vpxor %xmm15, %xmm14, %xmm14
2788 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
2789 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
2790
2791 mov arg5, %rax # rax = *Y0
2792 vmovdqu (%rax), %xmm9 # xmm9 = Y0
2793
2794 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
2795
2796 vpxor %xmm14, %xmm9, %xmm9
2797
2798
2799
2800 _return_T\@:
2801 mov arg8, %r10 # r10 = authTag
2802 mov arg9, %r11 # r11 = auth_tag_len
2803
2804 cmp $16, %r11
2805 je _T_16\@
2806
2807 cmp $8, %r11
2808 jl _T_4\@
2809
2810 _T_8\@:
2811 vmovq %xmm9, %rax
2812 mov %rax, (%r10)
2813 add $8, %r10
2814 sub $8, %r11
2815 vpsrldq $8, %xmm9, %xmm9
2816 cmp $0, %r11
2817 je _return_T_done\@
2818 _T_4\@:
2819 vmovd %xmm9, %eax
2820 mov %eax, (%r10)
2821 add $4, %r10
2822 sub $4, %r11
2823 vpsrldq $4, %xmm9, %xmm9
2824 cmp $0, %r11
2825 je _return_T_done\@
2826 _T_123\@:
2827 vmovd %xmm9, %eax
2828 cmp $2, %r11
2829 jl _T_1\@
2830 mov %ax, (%r10)
2831 cmp $2, %r11
2832 je _return_T_done\@
2833 add $2, %r10
2834 sar $16, %eax
2835 _T_1\@:
2836 mov %al, (%r10)
2837 jmp _return_T_done\@
2838
2839 _T_16\@:
2840 vmovdqu %xmm9, (%r10)
2841
2842 _return_T_done\@:
2843 mov %r14, %rsp
2844
2845 pop %r15
2846 pop %r14
2847 pop %r13
2848 pop %r12
2849 .endm
2850
2851
2852 #############################################################
2853 #void aesni_gcm_precomp_avx_gen4
2854 # (gcm_data *my_ctx_data,
2855 # u8 *hash_subkey)# /* H, the Hash sub key input.
2856 # Data starts on a 16-byte boundary. */
2857 #############################################################
2858 ENTRY(aesni_gcm_precomp_avx_gen4)
2859 #the number of pushes must equal STACK_OFFSET
2860 push %r12
2861 push %r13
2862 push %r14
2863 push %r15
2864
2865 mov %rsp, %r14
2866
2867
2868
2869 sub $VARIABLE_OFFSET, %rsp
2870 and $~63, %rsp # align rsp to 64 bytes
2871
2872 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
2873
2874 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2875 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2876 vmovdqa %xmm6, %xmm2
2877 vpsllq $1, %xmm6, %xmm6
2878 vpsrlq $63, %xmm2, %xmm2
2879 vmovdqa %xmm2, %xmm1
2880 vpslldq $8, %xmm2, %xmm2
2881 vpsrldq $8, %xmm1, %xmm1
2882 vpor %xmm2, %xmm6, %xmm6
2883 #reduction
2884 vpshufd $0b00100100, %xmm1, %xmm2
2885 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2886 vpand POLY(%rip), %xmm2, %xmm2
2887 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
2888 #######################################################################
2889 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
2890
2891
2892 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2893
2894 mov %r14, %rsp
2895
2896 pop %r15
2897 pop %r14
2898 pop %r13
2899 pop %r12
2900 ret
2901 ENDPROC(aesni_gcm_precomp_avx_gen4)
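
###############################################################################
# Usage sketch (illustrative, mirroring the prototypes in this file): the
# caller derives the hash subkey H = AES-K(0^128) with its own AES key
# schedule, calls aesni_gcm_precomp_avx_gen4 once per key to fill the
# HashKey power table in gcm_data, and only then calls
# aesni_gcm_enc_avx_gen4 / aesni_gcm_dec_avx_gen4 for each message.
###############################################################################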
2902
2903
2904 ###############################################################################
2905 #void aesni_gcm_enc_avx_gen4(
2906 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2907 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2908 # const u8 *in, /* Plaintext input */
2909 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
2910 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2911 # (from Security Association) concatenated with 8 byte
2912 # Initialisation Vector (from IPSec ESP Payload)
2913 # concatenated with 0x00000001. 16-byte aligned pointer. */
2914 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2915 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2916 # u8 *auth_tag, /* Authenticated Tag output. */
2917 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2918 # Valid values are 16 (most likely), 12 or 8. */
2919 ###############################################################################
2920 ENTRY(aesni_gcm_enc_avx_gen4)
2921 GCM_ENC_DEC_AVX2 ENC
2922 ret
2923 ENDPROC(aesni_gcm_enc_avx_gen4)
2924
2925 ###############################################################################
2926 #void aesni_gcm_dec_avx_gen4(
2927 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2928 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2929 # const u8 *in, /* Ciphertext input */
2930 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
2931 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2932 # (from Security Association) concatenated with 8 byte
2933 # Initialisation Vector (from IPSec ESP Payload)
2934 # concatenated with 0x00000001. 16-byte aligned pointer. */
2935 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2936 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2937 # u8 *auth_tag, /* Authenticated Tag output. */
2938 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2939 # Valid values are 16 (most likely), 12 or 8. */
2940 ###############################################################################
2941 ENTRY(aesni_gcm_dec_avx_gen4)
2942 GCM_ENC_DEC_AVX2 DEC
2943 ret
2944 ENDPROC(aesni_gcm_dec_avx_gen4)
2945
2946 #endif /* CONFIG_AS_AVX2 */