# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease later
# upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
	defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu 32(%r11),%xmm2
	subq $6,%rdx
	vpxor %xmm4,%xmm4,%xmm4
	vmovdqu 0-128(%rcx),%xmm15
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpaddb %xmm2,%xmm11,%xmm12
	vpaddb %xmm2,%xmm12,%xmm13
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm15,%xmm1,%xmm9
	vmovdqu %xmm4,16+8(%rsp)
	jmp .Loop6x

.align 32
.Loop6x:
	addl $100663296,%ebx
	jc .Lhandle_ctr32
	vmovdqu 0-32(%r9),%xmm3
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm15,%xmm10,%xmm10
	vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu %xmm1,(%r8)
	vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
	vpxor %xmm15,%xmm12,%xmm12
	vmovups 16-128(%rcx),%xmm2
	vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
	xorq %r12,%r12
	cmpq %r14,%r15

	vaesenc %xmm2,%xmm9,%xmm9
	vmovdqu 48+8(%rsp),%xmm0
	vpxor %xmm15,%xmm13,%xmm13
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
	vaesenc %xmm2,%xmm10,%xmm10
	vpxor %xmm15,%xmm14,%xmm14
	setnc %r12b
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vaesenc %xmm2,%xmm11,%xmm11
	vmovdqu 16-32(%r9),%xmm3
	negq %r12
	vaesenc %xmm2,%xmm12,%xmm12
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
	vpxor %xmm4,%xmm8,%xmm8
	vaesenc %xmm2,%xmm13,%xmm13
	vpxor %xmm5,%xmm1,%xmm4
	andq $0x60,%r12
	vmovups 32-128(%rcx),%xmm15
	vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
	vaesenc %xmm2,%xmm14,%xmm14

	vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
	leaq (%r14,%r12,1),%r14
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
	vmovdqu 64+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 88(%r14),%r13
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 80(%r14),%r12
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,32+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,40+8(%rsp)
	vmovdqu 48-32(%r9),%xmm5
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 48-128(%rcx),%xmm15
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm3,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
	vaesenc %xmm15,%xmm11,%xmm11
	vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
	vmovdqu 80+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor %xmm1,%xmm4,%xmm4
	vmovdqu 64-32(%r9),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 64-128(%rcx),%xmm15
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 72(%r14),%r13
	vpxor %xmm5,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 64(%r14),%r12
	vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
	vmovdqu 96+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,48+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,56+8(%rsp)
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 96-32(%r9),%xmm2
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 80-128(%rcx),%xmm15
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 56(%r14),%r13
	vpxor %xmm1,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
	vpxor 112+8(%rsp),%xmm8,%xmm8
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 48(%r14),%r12
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,64+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,72+8(%rsp)
	vpxor %xmm3,%xmm4,%xmm4
	vmovdqu 112-32(%r9),%xmm3
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 96-128(%rcx),%xmm15
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
	vaesenc %xmm15,%xmm10,%xmm10
	movbeq 40(%r14),%r13
	vpxor %xmm2,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 32(%r14),%r12
	vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,80+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,88+8(%rsp)
	vpxor %xmm5,%xmm6,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor %xmm1,%xmm6,%xmm6

	vmovups 112-128(%rcx),%xmm15
	vpslldq $8,%xmm6,%xmm5
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 16(%r11),%xmm3

	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm8,%xmm7,%xmm7
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm5,%xmm4,%xmm4
	movbeq 24(%r14),%r13
	vaesenc %xmm15,%xmm11,%xmm11
	movbeq 16(%r14),%r12
	vpalignr $8,%xmm4,%xmm4,%xmm0
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	movq %r13,96+8(%rsp)
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r12,104+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	vmovups 128-128(%rcx),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vmovups 144-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm10,%xmm10
	vpsrldq $8,%xmm6,%xmm6
	vaesenc %xmm1,%xmm11,%xmm11
	vpxor %xmm6,%xmm7,%xmm7
	vaesenc %xmm1,%xmm12,%xmm12
	vpxor %xmm0,%xmm4,%xmm4
	movbeq 8(%r14),%r13
	vaesenc %xmm1,%xmm13,%xmm13
	movbeq 0(%r14),%r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	cmpl $14,%ebp // ICP does not zero key schedule.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 208-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 224-128(%rcx),%xmm1
	jmp .Lenc_tail

.align 32
.Lhandle_ctr32:
	vmovdqu (%r11),%xmm0
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vmovdqu 0-32(%r9),%xmm3
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm15,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm15,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpshufb %xmm0,%xmm14,%xmm14
	vpshufb %xmm0,%xmm1,%xmm1
	jmp .Lresume_ctr32

.align 32
.Lenc_tail:
	vaesenc %xmm15,%xmm9,%xmm9
	vmovdqu %xmm7,16+8(%rsp)
	vpalignr $8,%xmm4,%xmm4,%xmm8
	vaesenc %xmm15,%xmm10,%xmm10
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	vpxor 0(%rdi),%xmm1,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 16(%rdi),%xmm1,%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 32(%rdi),%xmm1,%xmm5
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 48(%rdi),%xmm1,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 64(%rdi),%xmm1,%xmm7
	vpxor 80(%rdi),%xmm1,%xmm3
	vmovdqu (%r8),%xmm1

	vaesenclast %xmm2,%xmm9,%xmm9
	vmovdqu 32(%r11),%xmm2
	vaesenclast %xmm0,%xmm10,%xmm10
	vpaddb %xmm2,%xmm1,%xmm0
	movq %r13,112+8(%rsp)
	leaq 96(%rdi),%rdi
	vaesenclast %xmm5,%xmm11,%xmm11
	vpaddb %xmm2,%xmm0,%xmm5
	movq %r12,120+8(%rsp)
	leaq 96(%rsi),%rsi
	vmovdqu 0-128(%rcx),%xmm15
	vaesenclast %xmm6,%xmm12,%xmm12
	vpaddb %xmm2,%xmm5,%xmm6
	vaesenclast %xmm7,%xmm13,%xmm13
	vpaddb %xmm2,%xmm6,%xmm7
	vaesenclast %xmm3,%xmm14,%xmm14
	vpaddb %xmm2,%xmm7,%xmm3

	addq $0x60,%r10
	subq $0x6,%rdx
	jc .L6x_done

	vmovups %xmm9,-96(%rsi)
	vpxor %xmm15,%xmm1,%xmm9
	vmovups %xmm10,-80(%rsi)
	vmovdqa %xmm0,%xmm10
	vmovups %xmm11,-64(%rsi)
	vmovdqa %xmm5,%xmm11
	vmovups %xmm12,-48(%rsi)
	vmovdqa %xmm6,%xmm12
	vmovups %xmm13,-32(%rsi)
	vmovdqa %xmm7,%xmm13
	vmovups %xmm14,-16(%rsi)
	vmovdqa %xmm3,%xmm14
	vmovdqu 32+8(%rsp),%xmm7
	jmp .Loop6x

.L6x_done:
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpxor %xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
#endif /* ifdef HAVE_MOVBE */

.type _aesni_ctr32_ghash_no_movbe_6x,@function
.align 32
_aesni_ctr32_ghash_no_movbe_6x:
.cfi_startproc
	vmovdqu 32(%r11),%xmm2
	subq $6,%rdx
	vpxor %xmm4,%xmm4,%xmm4
	vmovdqu 0-128(%rcx),%xmm15
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpaddb %xmm2,%xmm11,%xmm12
	vpaddb %xmm2,%xmm12,%xmm13
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm15,%xmm1,%xmm9
	vmovdqu %xmm4,16+8(%rsp)
	jmp .Loop6x_nmb

.align 32
.Loop6x_nmb:
	addl $100663296,%ebx
	jc .Lhandle_ctr32_nmb
	vmovdqu 0-32(%r9),%xmm3
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm15,%xmm10,%xmm10
	vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu %xmm1,(%r8)
	vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
	vpxor %xmm15,%xmm12,%xmm12
	vmovups 16-128(%rcx),%xmm2
	vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
	xorq %r12,%r12
	cmpq %r14,%r15

	vaesenc %xmm2,%xmm9,%xmm9
	vmovdqu 48+8(%rsp),%xmm0
	vpxor %xmm15,%xmm13,%xmm13
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
	vaesenc %xmm2,%xmm10,%xmm10
	vpxor %xmm15,%xmm14,%xmm14
	setnc %r12b
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vaesenc %xmm2,%xmm11,%xmm11
	vmovdqu 16-32(%r9),%xmm3
	negq %r12
	vaesenc %xmm2,%xmm12,%xmm12
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
	vpxor %xmm4,%xmm8,%xmm8
	vaesenc %xmm2,%xmm13,%xmm13
	vpxor %xmm5,%xmm1,%xmm4
	andq $0x60,%r12
	vmovups 32-128(%rcx),%xmm15
	vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
	vaesenc %xmm2,%xmm14,%xmm14

	vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
	leaq (%r14,%r12,1),%r14
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
	vmovdqu 64+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm10,%xmm10
	movq 88(%r14),%r13
	bswapq %r13
	vaesenc %xmm15,%xmm11,%xmm11
	movq 80(%r14),%r12
	bswapq %r12
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,32+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,40+8(%rsp)
	vmovdqu 48-32(%r9),%xmm5
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 48-128(%rcx),%xmm15
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm3,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
	vaesenc %xmm15,%xmm11,%xmm11
	vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
	vmovdqu 80+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor %xmm1,%xmm4,%xmm4
	vmovdqu 64-32(%r9),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 64-128(%rcx),%xmm15
	vpxor %xmm2,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
	vaesenc %xmm15,%xmm10,%xmm10
	movq 72(%r14),%r13
	bswapq %r13
	vpxor %xmm5,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	movq 64(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
	vmovdqu 96+8(%rsp),%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,48+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,56+8(%rsp)
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 96-32(%r9),%xmm2
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 80-128(%rcx),%xmm15
	vpxor %xmm3,%xmm6,%xmm6
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
	vaesenc %xmm15,%xmm10,%xmm10
	movq 56(%r14),%r13
	bswapq %r13
	vpxor %xmm1,%xmm7,%xmm7
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
	vpxor 112+8(%rsp),%xmm8,%xmm8
	vaesenc %xmm15,%xmm11,%xmm11
	movq 48(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,64+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,72+8(%rsp)
	vpxor %xmm3,%xmm4,%xmm4
	vmovdqu 112-32(%r9),%xmm3
	vaesenc %xmm15,%xmm14,%xmm14

	vmovups 96-128(%rcx),%xmm15
	vpxor %xmm5,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm1,%xmm6,%xmm6
	vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
	vaesenc %xmm15,%xmm10,%xmm10
	movq 40(%r14),%r13
	bswapq %r13
	vpxor %xmm2,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	movq 32(%r14),%r12
	bswapq %r12
	vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r13,80+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	movq %r12,88+8(%rsp)
	vpxor %xmm5,%xmm6,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor %xmm1,%xmm6,%xmm6

	vmovups 112-128(%rcx),%xmm15
	vpslldq $8,%xmm6,%xmm5
	vpxor %xmm2,%xmm4,%xmm4
	vmovdqu 16(%r11),%xmm3

	vaesenc %xmm15,%xmm9,%xmm9
	vpxor %xmm8,%xmm7,%xmm7
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor %xmm5,%xmm4,%xmm4
	movq 24(%r14),%r13
	bswapq %r13
	vaesenc %xmm15,%xmm11,%xmm11
	movq 16(%r14),%r12
	bswapq %r12
	vpalignr $8,%xmm4,%xmm4,%xmm0
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	movq %r13,96+8(%rsp)
	vaesenc %xmm15,%xmm12,%xmm12
	movq %r12,104+8(%rsp)
	vaesenc %xmm15,%xmm13,%xmm13
	vmovups 128-128(%rcx),%xmm1
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vmovups 144-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm10,%xmm10
	vpsrldq $8,%xmm6,%xmm6
	vaesenc %xmm1,%xmm11,%xmm11
	vpxor %xmm6,%xmm7,%xmm7
	vaesenc %xmm1,%xmm12,%xmm12
	vpxor %xmm0,%xmm4,%xmm4
	movq 8(%r14),%r13
	bswapq %r13
	vaesenc %xmm1,%xmm13,%xmm13
	movq 0(%r14),%r12
	bswapq %r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail_nmb

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	cmpl $14,%ebp // ICP does not zero key schedule.
	jb .Lenc_tail_nmb

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14

	vaesenc %xmm1,%xmm9,%xmm9
	vaesenc %xmm1,%xmm10,%xmm10
	vaesenc %xmm1,%xmm11,%xmm11
	vaesenc %xmm1,%xmm12,%xmm12
	vaesenc %xmm1,%xmm13,%xmm13
	vmovups 208-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 224-128(%rcx),%xmm1
	jmp .Lenc_tail_nmb

.align 32
.Lhandle_ctr32_nmb:
	vmovdqu (%r11),%xmm0
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vmovdqu 0-32(%r9),%xmm3
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm15,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm15,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpshufb %xmm0,%xmm14,%xmm14
	vpshufb %xmm0,%xmm1,%xmm1
	jmp .Lresume_ctr32_nmb

.align 32
.Lenc_tail_nmb:
	vaesenc %xmm15,%xmm9,%xmm9
	vmovdqu %xmm7,16+8(%rsp)
	vpalignr $8,%xmm4,%xmm4,%xmm8
	vaesenc %xmm15,%xmm10,%xmm10
	vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
	vpxor 0(%rdi),%xmm1,%xmm2
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 16(%rdi),%xmm1,%xmm0
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 32(%rdi),%xmm1,%xmm5
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 48(%rdi),%xmm1,%xmm6
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 64(%rdi),%xmm1,%xmm7
	vpxor 80(%rdi),%xmm1,%xmm3
	vmovdqu (%r8),%xmm1

	vaesenclast %xmm2,%xmm9,%xmm9
	vmovdqu 32(%r11),%xmm2
	vaesenclast %xmm0,%xmm10,%xmm10
	vpaddb %xmm2,%xmm1,%xmm0
	movq %r13,112+8(%rsp)
	leaq 96(%rdi),%rdi
	vaesenclast %xmm5,%xmm11,%xmm11
	vpaddb %xmm2,%xmm0,%xmm5
	movq %r12,120+8(%rsp)
	leaq 96(%rsi),%rsi
	vmovdqu 0-128(%rcx),%xmm15
	vaesenclast %xmm6,%xmm12,%xmm12
	vpaddb %xmm2,%xmm5,%xmm6
	vaesenclast %xmm7,%xmm13,%xmm13
	vpaddb %xmm2,%xmm6,%xmm7
	vaesenclast %xmm3,%xmm14,%xmm14
	vpaddb %xmm2,%xmm7,%xmm3

	addq $0x60,%r10
	subq $0x6,%rdx
	jc .L6x_done_nmb

	vmovups %xmm9,-96(%rsi)
	vpxor %xmm15,%xmm1,%xmm9
	vmovups %xmm10,-80(%rsi)
	vmovdqa %xmm0,%xmm10
	vmovups %xmm11,-64(%rsi)
	vmovdqa %xmm5,%xmm11
	vmovups %xmm12,-48(%rsi)
	vmovdqa %xmm6,%xmm12
	vmovups %xmm13,-32(%rsi)
	vmovdqa %xmm7,%xmm13
	vmovups %xmm14,-16(%rsi)
	vmovdqa %xmm3,%xmm14
	vmovdqu 32+8(%rsp),%xmm7
	jmp .Loop6x_nmb

.L6x_done_nmb:
	vpxor 16+8(%rsp),%xmm8,%xmm8
	vpxor %xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x

.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
.align 32
aesni_gcm_decrypt:
.cfi_startproc
	xorq %r10,%r10
	cmpq $0x60,%rdx
	jb .Lgcm_dec_abort

	leaq (%rsp),%rax
.cfi_def_cfa_register %rax
	pushq %rbx
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_offset %rbp,-24
	pushq %r12
.cfi_offset %r12,-32
	pushq %r13
.cfi_offset %r13,-40
	pushq %r14
.cfi_offset %r14,-48
	pushq %r15
.cfi_offset %r15,-56
	pushq %r9
.cfi_offset %r9,-64
	vzeroupper

	vmovdqu (%r8),%xmm1
	addq $-128,%rsp
	movl 12(%r8),%ebx
	leaq .Lbswap_mask(%rip),%r11
	leaq -128(%rcx),%r14
	movq $0xf80,%r15
	vmovdqu (%r9),%xmm8
	andq $-128,%rsp
	vmovdqu (%r11),%xmm0
	leaq 128(%rcx),%rcx
	movq 32(%r9),%r9
	leaq 32(%r9),%r9
	movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
	vpshufb %xmm0,%xmm8,%xmm8

	andq %r15,%r14
	andq %rsp,%r15
	subq %r14,%r15
	jc .Ldec_no_key_aliasing
	cmpq $768,%r15
	jnc .Ldec_no_key_aliasing
	subq %r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu 80(%rdi),%xmm7
	leaq (%rdi),%r14
	vmovdqu 64(%rdi),%xmm4
	leaq -192(%rdi,%rdx,1),%r15
	vmovdqu 48(%rdi),%xmm5
	shrq $4,%rdx
	xorq %r10,%r10
	vmovdqu 32(%rdi),%xmm6
	vpshufb %xmm0,%xmm7,%xmm7
	vmovdqu 16(%rdi),%xmm2
	vpshufb %xmm0,%xmm4,%xmm4
	vmovdqu (%rdi),%xmm3
	vpshufb %xmm0,%xmm5,%xmm5
	vmovdqu %xmm4,48(%rsp)
	vpshufb %xmm0,%xmm6,%xmm6
	vmovdqu %xmm5,64(%rsp)
	vpshufb %xmm0,%xmm2,%xmm2
	vmovdqu %xmm6,80(%rsp)
	vpshufb %xmm0,%xmm3,%xmm3
	vmovdqu %xmm2,96(%rsp)
	vmovdqu %xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl $1,gcm_avx_can_use_movbe(%rip)
#else
	testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz 1f
	call _aesni_ctr32_ghash_6x
	jmp 2f
1:
#endif
	call _aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups %xmm9,-96(%rsi)
	vmovups %xmm10,-80(%rsi)
	vmovups %xmm11,-64(%rsi)
	vmovups %xmm12,-48(%rsi)
	vmovups %xmm13,-32(%rsi)
	vmovups %xmm14,-16(%rsi)

	vpshufb (%r11),%xmm8,%xmm8
	movq -56(%rax),%r9
.cfi_restore %r9
	vmovdqu %xmm8,(%r9)

	vzeroupper
	movq -48(%rax),%r15
.cfi_restore %r15
	movq -40(%rax),%r14
.cfi_restore %r14
	movq -32(%rax),%r13
.cfi_restore %r13
	movq -24(%rax),%r12
.cfi_restore %r12
	movq -16(%rax),%rbp
.cfi_restore %rbp
	movq -8(%rax),%rbx
.cfi_restore %rbx
	leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
	movq %r10,%rax
	RET
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
.cfi_startproc
	vmovdqu 0-128(%rcx),%xmm4
	vmovdqu 32(%r11),%xmm2
	leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups 16-128(%rcx),%xmm15
	leaq 32-128(%rcx),%r12
	vpxor %xmm4,%xmm1,%xmm9
	addl $100663296,%ebx
	jc .Lhandle_ctr32_2
	vpaddb %xmm2,%xmm1,%xmm10
	vpaddb %xmm2,%xmm10,%xmm11
	vpxor %xmm4,%xmm10,%xmm10
	vpaddb %xmm2,%xmm11,%xmm12
	vpxor %xmm4,%xmm11,%xmm11
	vpaddb %xmm2,%xmm12,%xmm13
	vpxor %xmm4,%xmm12,%xmm12
	vpaddb %xmm2,%xmm13,%xmm14
	vpxor %xmm4,%xmm13,%xmm13
	vpaddb %xmm2,%xmm14,%xmm1
	vpxor %xmm4,%xmm14,%xmm14
	jmp .Loop_ctr32

.align 16
.Loop_ctr32:
	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	vaesenc %xmm15,%xmm11,%xmm11
	vaesenc %xmm15,%xmm12,%xmm12
	vaesenc %xmm15,%xmm13,%xmm13
	vaesenc %xmm15,%xmm14,%xmm14
	vmovups (%r12),%xmm15
	leaq 16(%r12),%r12
	decl %r13d
	jnz .Loop_ctr32

	vmovdqu (%r12),%xmm3
	vaesenc %xmm15,%xmm9,%xmm9
	vpxor 0(%rdi),%xmm3,%xmm4
	vaesenc %xmm15,%xmm10,%xmm10
	vpxor 16(%rdi),%xmm3,%xmm5
	vaesenc %xmm15,%xmm11,%xmm11
	vpxor 32(%rdi),%xmm3,%xmm6
	vaesenc %xmm15,%xmm12,%xmm12
	vpxor 48(%rdi),%xmm3,%xmm8
	vaesenc %xmm15,%xmm13,%xmm13
	vpxor 64(%rdi),%xmm3,%xmm2
	vaesenc %xmm15,%xmm14,%xmm14
	vpxor 80(%rdi),%xmm3,%xmm3
	leaq 96(%rdi),%rdi

	vaesenclast %xmm4,%xmm9,%xmm9
	vaesenclast %xmm5,%xmm10,%xmm10
	vaesenclast %xmm6,%xmm11,%xmm11
	vaesenclast %xmm8,%xmm12,%xmm12
	vaesenclast %xmm2,%xmm13,%xmm13
	vaesenclast %xmm3,%xmm14,%xmm14
	vmovups %xmm9,0(%rsi)
	vmovups %xmm10,16(%rsi)
	vmovups %xmm11,32(%rsi)
	vmovups %xmm12,48(%rsi)
	vmovups %xmm13,64(%rsi)
	vmovups %xmm14,80(%rsi)
	leaq 96(%rsi),%rsi

	RET
.align 32
.Lhandle_ctr32_2:
	vpshufb %xmm0,%xmm1,%xmm6
	vmovdqu 48(%r11),%xmm5
	vpaddd 64(%r11),%xmm6,%xmm10
	vpaddd %xmm5,%xmm6,%xmm11
	vpaddd %xmm5,%xmm10,%xmm12
	vpshufb %xmm0,%xmm10,%xmm10
	vpaddd %xmm5,%xmm11,%xmm13
	vpshufb %xmm0,%xmm11,%xmm11
	vpxor %xmm4,%xmm10,%xmm10
	vpaddd %xmm5,%xmm12,%xmm14
	vpshufb %xmm0,%xmm12,%xmm12
	vpxor %xmm4,%xmm11,%xmm11
	vpaddd %xmm5,%xmm13,%xmm1
	vpshufb %xmm0,%xmm13,%xmm13
	vpxor %xmm4,%xmm12,%xmm12
	vpshufb %xmm0,%xmm14,%xmm14
	vpxor %xmm4,%xmm13,%xmm13
	vpshufb %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm14,%xmm14
	jmp .Loop_ctr32
.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
.align 32
aesni_gcm_encrypt:
.cfi_startproc
	xorq %r10,%r10
	cmpq $288,%rdx
	jb .Lgcm_enc_abort

	leaq (%rsp),%rax
.cfi_def_cfa_register %rax
	pushq %rbx
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_offset %rbp,-24
	pushq %r12
.cfi_offset %r12,-32
	pushq %r13
.cfi_offset %r13,-40
	pushq %r14
.cfi_offset %r14,-48
	pushq %r15
.cfi_offset %r15,-56
	pushq %r9
.cfi_offset %r9,-64
	vzeroupper

	vmovdqu (%r8),%xmm1
	addq $-128,%rsp
	movl 12(%r8),%ebx
	leaq .Lbswap_mask(%rip),%r11
	leaq -128(%rcx),%r14
	movq $0xf80,%r15
	leaq 128(%rcx),%rcx
	vmovdqu (%r11),%xmm0
	andq $-128,%rsp
	movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.

	andq %r15,%r14
	andq %rsp,%r15
	subq %r14,%r15
	jc .Lenc_no_key_aliasing
	cmpq $768,%r15
	jnc .Lenc_no_key_aliasing
	subq %r15,%rsp
.Lenc_no_key_aliasing:

	leaq (%rsi),%r14
	leaq -192(%rsi,%rdx,1),%r15
	shrq $4,%rdx

	call _aesni_ctr32_6x
	vpshufb %xmm0,%xmm9,%xmm8
	vpshufb %xmm0,%xmm10,%xmm2
	vmovdqu %xmm8,112(%rsp)
	vpshufb %xmm0,%xmm11,%xmm4
	vmovdqu %xmm2,96(%rsp)
	vpshufb %xmm0,%xmm12,%xmm5
	vmovdqu %xmm4,80(%rsp)
	vpshufb %xmm0,%xmm13,%xmm6
	vmovdqu %xmm5,64(%rsp)
	vpshufb %xmm0,%xmm14,%xmm7
	vmovdqu %xmm6,48(%rsp)

	call _aesni_ctr32_6x

	vmovdqu (%r9),%xmm8
	movq 32(%r9),%r9
	leaq 32(%r9),%r9
	subq $12,%rdx
	movq $192,%r10
	vpshufb %xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl $1,gcm_avx_can_use_movbe(%rip)
#else
	testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz 1f
	call _aesni_ctr32_ghash_6x
	jmp 2f
1:
#endif
	call _aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu 32(%rsp),%xmm7
	vmovdqu (%r11),%xmm0
	vmovdqu 0-32(%r9),%xmm3
	vpunpckhqdq %xmm7,%xmm7,%xmm1
	vmovdqu 32-32(%r9),%xmm15
	vmovups %xmm9,-96(%rsi)
	vpshufb %xmm0,%xmm9,%xmm9
	vpxor %xmm7,%xmm1,%xmm1
	vmovups %xmm10,-80(%rsi)
	vpshufb %xmm0,%xmm10,%xmm10
	vmovups %xmm11,-64(%rsi)
	vpshufb %xmm0,%xmm11,%xmm11
	vmovups %xmm12,-48(%rsi)
	vpshufb %xmm0,%xmm12,%xmm12
	vmovups %xmm13,-32(%rsi)
	vpshufb %xmm0,%xmm13,%xmm13
	vmovups %xmm14,-16(%rsi)
	vpshufb %xmm0,%xmm14,%xmm14
	vmovdqu %xmm9,16(%rsp)
	vmovdqu 48(%rsp),%xmm6
	vmovdqu 16-32(%r9),%xmm0
	vpunpckhqdq %xmm6,%xmm6,%xmm2
	vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
	vpxor %xmm6,%xmm2,%xmm2
	vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1

	vmovdqu 64(%rsp),%xmm9
	vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
	vmovdqu 48-32(%r9),%xmm3
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm9,%xmm9,%xmm5
	vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
	vpxor %xmm9,%xmm5,%xmm5
	vpxor %xmm7,%xmm6,%xmm6
	vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
	vmovdqu 80-32(%r9),%xmm15
	vpxor %xmm1,%xmm2,%xmm2

	vmovdqu 80(%rsp),%xmm1
	vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
	vmovdqu 64-32(%r9),%xmm0
	vpxor %xmm4,%xmm7,%xmm7
	vpunpckhqdq %xmm1,%xmm1,%xmm4
	vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpxor %xmm6,%xmm9,%xmm9
	vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 96(%rsp),%xmm2
	vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
	vmovdqu 96-32(%r9),%xmm3
	vpxor %xmm7,%xmm6,%xmm6
	vpunpckhqdq %xmm2,%xmm2,%xmm7
	vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
	vpxor %xmm2,%xmm7,%xmm7
	vpxor %xmm9,%xmm1,%xmm1
	vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
	vmovdqu 128-32(%r9),%xmm15
	vpxor %xmm5,%xmm4,%xmm4

	vpxor 112(%rsp),%xmm8,%xmm8
	vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
	vmovdqu 112-32(%r9),%xmm0
	vpunpckhqdq %xmm8,%xmm8,%xmm9
	vpxor %xmm6,%xmm5,%xmm5
	vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
	vpxor %xmm8,%xmm9,%xmm9
	vpxor %xmm1,%xmm2,%xmm2
	vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
	vpxor %xmm4,%xmm7,%xmm4

	vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
	vmovdqu 0-32(%r9),%xmm3
	vpunpckhqdq %xmm14,%xmm14,%xmm1
	vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
	vpxor %xmm14,%xmm1,%xmm1
	vpxor %xmm5,%xmm6,%xmm5
	vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
	vmovdqu 32-32(%r9),%xmm15
	vpxor %xmm2,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm6

	vmovdqu 16-32(%r9),%xmm0
	vpxor %xmm5,%xmm7,%xmm9
	vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
	vpxor %xmm9,%xmm6,%xmm6
	vpunpckhqdq %xmm13,%xmm13,%xmm2
	vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
	vpxor %xmm13,%xmm2,%xmm2
	vpslldq $8,%xmm6,%xmm9
	vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
	vpxor %xmm9,%xmm5,%xmm8
	vpsrldq $8,%xmm6,%xmm6
	vpxor %xmm6,%xmm7,%xmm7

	vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
	vmovdqu 48-32(%r9),%xmm3
	vpxor %xmm4,%xmm5,%xmm5
	vpunpckhqdq %xmm12,%xmm12,%xmm9
	vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
	vpxor %xmm12,%xmm9,%xmm9
	vpxor %xmm14,%xmm13,%xmm13
	vpalignr $8,%xmm8,%xmm8,%xmm14
	vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
	vmovdqu 80-32(%r9),%xmm15
	vpxor %xmm1,%xmm2,%xmm2

	vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
	vmovdqu 64-32(%r9),%xmm0
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm11,%xmm11,%xmm1
	vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm13,%xmm12,%xmm12
	vxorps 16(%rsp),%xmm7,%xmm7
	vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
	vpxor %xmm2,%xmm9,%xmm9

	vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
	vxorps %xmm14,%xmm8,%xmm8

	vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
	vmovdqu 96-32(%r9),%xmm3
	vpxor %xmm4,%xmm5,%xmm5
	vpunpckhqdq %xmm10,%xmm10,%xmm2
	vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
	vpxor %xmm10,%xmm2,%xmm2
	vpalignr $8,%xmm8,%xmm8,%xmm14
	vpxor %xmm12,%xmm11,%xmm11
	vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
	vmovdqu 128-32(%r9),%xmm15
	vpxor %xmm9,%xmm1,%xmm1

	vxorps %xmm7,%xmm14,%xmm14
	vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
	vxorps %xmm14,%xmm8,%xmm8

	vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
	vmovdqu 112-32(%r9),%xmm0
	vpxor %xmm5,%xmm4,%xmm4
	vpunpckhqdq %xmm8,%xmm8,%xmm9
	vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
	vpxor %xmm8,%xmm9,%xmm9
	vpxor %xmm11,%xmm10,%xmm10
	vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
	vpxor %xmm1,%xmm2,%xmm2

	vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
	vpxor %xmm4,%xmm5,%xmm5
	vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
	vpxor %xmm10,%xmm7,%xmm7
	vpxor %xmm2,%xmm6,%xmm6

	vpxor %xmm5,%xmm7,%xmm4
	vpxor %xmm4,%xmm6,%xmm6
	vpslldq $8,%xmm6,%xmm1
	vmovdqu 16(%r11),%xmm3
	vpsrldq $8,%xmm6,%xmm6
	vpxor %xmm1,%xmm5,%xmm8
	vpxor %xmm6,%xmm7,%xmm7

	vpalignr $8,%xmm8,%xmm8,%xmm2
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
	vpxor %xmm2,%xmm8,%xmm8

	vpalignr $8,%xmm8,%xmm8,%xmm2
	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
	vpxor %xmm7,%xmm2,%xmm2
	vpxor %xmm2,%xmm8,%xmm8
	vpshufb (%r11),%xmm8,%xmm8
	movq -56(%rax),%r9
.cfi_restore %r9
	vmovdqu %xmm8,(%r9)

	vzeroupper
	movq -48(%rax),%r15
.cfi_restore %r15
	movq -40(%rax),%r14
.cfi_restore %r14
	movq -32(%rax),%r13
.cfi_restore %r13
	movq -24(%rax),%r12
.cfi_restore %r12
	movq -16(%rax),%rbp
.cfi_restore %rbp
	movq -8(%rax),%rbx
.cfi_restore %rbx
	leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
	movq %r10,%rax
	RET
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
.globl clear_fpu_regs_avx
.type clear_fpu_regs_avx,@function
.align 32
clear_fpu_regs_avx:
	vzeroall
	RET
.size clear_fpu_regs_avx,.-clear_fpu_regs_avx

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
.globl gcm_xor_avx
.type gcm_xor_avx,@function
.align 32
gcm_xor_avx:
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	pxor %xmm1, %xmm0
	movdqu %xmm0, (%rsi)
	RET
.size gcm_xor_avx,.-gcm_xor_avx
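/*
 * Caller-side sketch (not part of the generated code): one way gcm_xor_avx
 * could be invoked from C. The kfpu_begin()/kfpu_end() guards are assumed to
 * come from the OpenZFS <sys/simd.h> abstraction; the wrapper name is
 * illustrative only.
 *
 *	extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 *	static void
 *	xor_block_example(const uint8_t src[16], uint8_t dst[16])
 *	{
 *		kfpu_begin();		// save FPU state before touching %xmm
 *		gcm_xor_avx(src, dst);	// dst ^= src, one 128-bit block
 *		kfpu_end();		// restore FPU state
 *	}
 */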

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
.globl atomic_toggle_boolean_nv
.type atomic_toggle_boolean_nv,@function
.align 32
atomic_toggle_boolean_nv:
	xorl %eax, %eax
	lock
	xorl $1, (%rdi)
	jz 1f
	movl $1, %eax
1:
	RET
.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
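/*
 * Caller-side sketch (not part of the generated code): the routine above
 * atomically flips a boolean_t between 0 and 1 and returns the new value.
 * A hypothetical use, assuming the usual boolean_t/B_FALSE/B_TRUE types;
 * the flag and wrapper names are illustrative only.
 *
 *	extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 *
 *	static volatile boolean_t example_flag = B_FALSE;
 *
 *	static boolean_t
 *	toggle_example(void)
 *	{
 *		// B_TRUE after the first call, B_FALSE after the second, ...
 *		return (atomic_toggle_boolean_nv(&example_flag));
 *	}
 */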

.pushsection .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
.popsection

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */