]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm
import 15.2.0 Octopus source
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / aes256_cntr_by4_sse.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28%include "os.asm"
29%include "memcpy.asm"
30
31; routine to do AES256 CNTR enc/decrypt "by4"
32; XMM registers are clobbered. Saving/restoring must be done at a higher level
33
9f95a23c
TL
; Name of the exported entry point. It defaults to aes_cntr_256_sse, but is
; left untouched if AES_CNTR_256 was already defined before this point
; (for example by a file that %includes this one, or by an assembler -D flag).
34%ifndef AES_CNTR_256
35%define AES_CNTR_256 aes_cntr_256_sse
36%endif
37
11fdf7f2
TL
; Constants defined in another object file:
;   byteswap_const - pshufb mask applied to the counter block
;                    (presumably a full 16-byte reversal; value not visible here)
;   ddq_add_N      - 128-bit addends used with paddd to step the counter
;                    (presumably the value N; values not visible here)
38extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
39
; CONCAT pastes two tokens together, e.g. CONCAT(xdata,2) -> xdata2 -> xmm2.
; MOVDQ is the load/store used for caller buffers (p_in / p_out); it is the
; unaligned form, so those buffers need no particular alignment.
40%define CONCAT(a,b) a %+ b
41%define MOVDQ movdqu
42
; Block registers: xdata<i> holds counter block i, then its keystream, then
; the final output. The do_aes invocations visible in this file use at most
; four of them (xdata0..xdata3).
43%define xdata0 xmm0
44%define xdata1 xmm1
45%define xdata2 xmm2
46%define xdata3 xmm3
47%define xdata4 xmm4
48%define xdata5 xmm5
49%define xdata6 xmm6
50%define xdata7 xmm7
; xcounter  - running counter block, kept in shuffled (pshufb xbyteswap) form
;             so that paddd can step it
; xbyteswap - the byteswap_const shuffle mask
51%define xcounter xmm8
52%define xbyteswap xmm9
; Round keys 0, 4, 8 and 12 are cached in registers across do_aes calls;
; xkeyA/xkeyB are scratch registers that receive the remaining round keys
; and, at the end of do_aes, the input blocks.
53%define xkey0 xmm10
54%define xkey4 xmm11
55%define xkey8 xmm12
56%define xkey12 xmm13
57%define xkeyA xmm14
58%define xkeyB xmm15
59
; Argument mapping for
;   AES_CNTR_256(in, IV, keys, out, num_bytes, iv_len)
; LINUX branch: the six integer argument registers of the System V AMD64 ABI.
; Otherwise (Microsoft x64): four register arguments; num_bytes is loaded into
; r10 by the function prologue and iv_len is read directly from the caller's
; stack. Because p_ivlen is rsp-relative there, it is only valid while rsp
; still has its function-entry value.
60%ifdef LINUX
61%define p_in rdi
62%define p_IV rsi
63%define p_keys rdx
64%define p_out rcx
65%define num_bytes r8
66%define p_ivlen r9
67%else
68%define p_in rcx
69%define p_IV rdx
70%define p_keys r8
71%define p_out r9
72%define num_bytes r10
73%define p_ivlen qword [rsp + 8*6]
74%endif
75
; tmp   - general-purpose scratch register
; p_tmp - address expression of the 16-byte scratch buffer inside the STACK
;         frame; only meaningful after that frame has been set up (see `last`).
;         It expands to a bare "rsp + _buffer", so use it inside [] or as a
;         macro argument.
76%define tmp r11
77%define p_tmp rsp + _buffer
78
; do_aes_load <n>
; Process <n> blocks with do_aes, asking it to (re)load the cached round keys
; xkey0/xkey4/xkey8/xkey12 from p_keys as part of the call.
79%macro do_aes_load 1
80 do_aes %1, 1
81%endmacro
82
; do_aes_noload <n>
; Process <n> blocks with do_aes without touching the cached round keys.
; The caller must already have loaded xkey0/xkey4/xkey8/xkey12 from p_keys.
83%macro do_aes_noload 1
84 do_aes %1, 0
85%endmacro
86
87
88; do_aes num_in_par load_keys
89; This increments p_in, but not p_out
;
; Encrypts/decrypts num_in_par consecutive 16-byte blocks in counter mode.
;   %1 (num_in_par) - number of blocks handled in parallel; this file passes 1..4
;   %2 (load_keys)  - non-zero: load xkey0/xkey4/xkey8/xkey12 from p_keys here
;                     zero:     assume the caller has already loaded them
; In:      xcounter (shuffled form), xbyteswap, p_keys, p_in, p_out
; Out:     num_in_par blocks written at [p_out]; p_in advanced by 16*num_in_par;
;          xcounter stepped by ddq_add_<num_in_par>
; Clobbers: xdata0..xdata<num_in_par-1>, xkeyA, xkeyB
;          (plus xkey0/4/8/12 when load_keys is set)
; Notes:   - Round keys are read with movdqa, so p_keys must be 16-byte aligned.
;          - Input and output use MOVDQ (movdqu) and may be unaligned.
;          - The assembler variables i and j are plain %assign symbols (not
;            macro-local) and are overwritten by every expansion.
;          - Key loads are issued one round ahead of their use, alternating
;            between xkeyA and xkeyB; keep that order when editing.
90%macro do_aes 2
91%define %%by %1
92%define %%load_keys %2
93
94%if (%%load_keys)
95 movdqa xkey0, [p_keys + 0*16]
96%endif
97
; Build the counter blocks: block 0 is the current counter, block i is the
; counter plus ddq_add_i; each is shuffled with xbyteswap before encryption.
98 movdqa xdata0, xcounter
99 pshufb xdata0, xbyteswap
100%assign i 1
101%rep (%%by - 1)
102 movdqa CONCAT(xdata,i), xcounter
103 paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
104 pshufb CONCAT(xdata,i), xbyteswap
105%assign i (i + 1)
106%endrep
107
108 movdqa xkeyA, [p_keys + 1*16]
109
; Round 0 (initial AddRoundKey) on every block, and step the running counter
; past the blocks consumed by this expansion.
110 pxor xdata0, xkey0
111 paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
112%assign i 1
113%rep (%%by - 1)
114 pxor CONCAT(xdata,i), xkey0
115%assign i (i + 1)
116%endrep
117
; Rounds 1..13 use aesenc and round 14 uses aesenclast (15 round keys in total).
118 movdqa xkeyB, [p_keys + 2*16]
119%assign i 0
120%rep %%by
121 aesenc CONCAT(xdata,i), xkeyA ; key 1
122%assign i (i+1)
123%endrep
124
125 movdqa xkeyA, [p_keys + 3*16]
126%assign i 0
127%rep %%by
128 aesenc CONCAT(xdata,i), xkeyB ; key 2
129%assign i (i+1)
130%endrep
131
; p_in is advanced here, in the middle of the rounds; the input loads at the
; end of the macro compensate with a "- 16*%%by" displacement.
132 add p_in, 16*%%by
133
134%if (%%load_keys)
135 movdqa xkey4, [p_keys + 4*16]
136%endif
137%assign i 0
138%rep %%by
139 aesenc CONCAT(xdata,i), xkeyA ; key 3
140%assign i (i+1)
141%endrep
142
143 movdqa xkeyA, [p_keys + 5*16]
144%assign i 0
145%rep %%by
146 aesenc CONCAT(xdata,i), xkey4 ; key 4
147%assign i (i+1)
148%endrep
149
150 movdqa xkeyB, [p_keys + 6*16]
151%assign i 0
152%rep %%by
153 aesenc CONCAT(xdata,i), xkeyA ; key 5
154%assign i (i+1)
155%endrep
156
157 movdqa xkeyA, [p_keys + 7*16]
158%assign i 0
159%rep %%by
160 aesenc CONCAT(xdata,i), xkeyB ; key 6
161%assign i (i+1)
162%endrep
163
164%if (%%load_keys)
165 movdqa xkey8, [p_keys + 8*16]
166%endif
167%assign i 0
168%rep %%by
169 aesenc CONCAT(xdata,i), xkeyA ; key 7
170%assign i (i+1)
171%endrep
172
173 movdqa xkeyA, [p_keys + 9*16]
174%assign i 0
175%rep %%by
176 aesenc CONCAT(xdata,i), xkey8 ; key 8
177%assign i (i+1)
178%endrep
179
180 movdqa xkeyB, [p_keys + 10*16]
181%assign i 0
182%rep %%by
183 aesenc CONCAT(xdata,i), xkeyA ; key 9
184%assign i (i+1)
185%endrep
186
187 movdqa xkeyA, [p_keys + 11*16]
188%assign i 0
189%rep %%by
190 aesenc CONCAT(xdata,i), xkeyB ; key 10
191%assign i (i+1)
192%endrep
193
194%if (%%load_keys)
195 movdqa xkey12, [p_keys + 12*16]
196%endif
197%assign i 0
198%rep %%by
199 aesenc CONCAT(xdata,i), xkeyA ; key 11
200%assign i (i+1)
201%endrep
202
203 movdqa xkeyA, [p_keys + 13*16]
204%assign i 0
205%rep %%by
206 aesenc CONCAT(xdata,i), xkey12 ; key 12
207%assign i (i+1)
208%endrep
209
210 movdqa xkeyB, [p_keys + 14*16]
211%assign i 0
212%rep %%by
213 aesenc CONCAT(xdata,i), xkeyA ; key 13
214%assign i (i+1)
215%endrep
216
217%assign i 0
218%rep %%by
219 aesenclast CONCAT(xdata,i), xkeyB ; key 14
220%assign i (i+1)
221%endrep
222
; XOR the keystream with the input, two blocks per iteration. xkeyA/xkeyB are
; free again at this point and are reused as input staging registers. p_in
; already points past these blocks, hence the negative displacement.
223%assign i 0
224%rep (%%by / 2)
225%assign j (i+1)
226 MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
227 MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
228 pxor CONCAT(xdata,i), xkeyA
229 pxor CONCAT(xdata,j), xkeyB
230%assign i (i+2)
231%endrep
; Odd block count: one block is left over after the pairwise loop.
232%if (i < %%by)
233 MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
234 pxor CONCAT(xdata,i), xkeyA
235%endif
236
; Store the results; p_out is not advanced here, the caller does that.
237%assign i 0
238%rep %%by
239 MOVDQ [p_out + i*16], CONCAT(xdata,i)
240%assign i (i+1)
241%endrep
242%endmacro
243
; Stack frame used only by the partial-block path (label `last`).
; The function reserves STACK_size bytes and then rounds rsp down to a
; 16-byte boundary, so _buffer (offset 0) is 16-byte aligned.
244struc STACK
245_buffer: resq 2 ; 16-byte scratch block for the trailing partial block
246_rsp_save: resq 1 ; caller's rsp, restored before returning
247endstruc
248
249;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
250;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
251;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
252section .text
253
;; ---------------------------------------------------------------------------
;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
;;
;; AES-256 counter-mode encrypt/decrypt of num_bytes bytes from `in` to `out`
;; (the operation is its own inverse: keystream XOR data).
;;
;; Arguments (see the p_* / num_bytes defines for the register mapping):
;;   in        - input buffer, any alignment
;;   IV        - initial counter material. Only bit 4 of iv_len is examined:
;;                 set   -> 16 bytes are read and used as the full counter block
;;                 clear -> 12 bytes are read and a big-endian block counter
;;                          of 1 is appended
;;   keys      - expanded key schedule, 15 round keys of 16 bytes each;
;;               must be 16-byte aligned (read with movdqa and as aligned
;;               memory operands of pxor/aesenc)
;;   out       - output buffer, any alignment
;;   num_bytes - number of bytes to process; need not be a multiple of 16
;;   iv_len    - see IV above
;;
;; Processing order: the "block count mod 4" leftover blocks first (1..3
;; blocks), then groups of 4 blocks, then a trailing partial block (1..15
;; bytes) through a stack scratch buffer.
;;
;; No return value is produced and the IV at [IV] is not updated.
;; Clobbers: rax (partial-block path only), r11, the argument registers used
;;           for in/out/num_bytes, xmm0-xmm3, xmm8-xmm15, flags, plus whatever
;;           memcpy_sse_16_1 touches through the temporaries passed to it.
;;           XMM registers are not preserved here even where the ABI treats
;;           them as callee-saved; callers must save/restore them.
;; ---------------------------------------------------------------------------
align 32
MKGLOBAL(AES_CNTR_256,function,internal)
AES_CNTR_256:

%ifndef LINUX
        ; 5th argument lives on the caller's stack in this ABI
        mov     num_bytes, [rsp + 8*5]
%endif

        movdqa  xbyteswap, [rel byteswap_const]
        test    p_ivlen, 16
        jnz     iv_is_16_bytes
        ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
        ; (0x01000000 placed in the top dword is 00 00 00 01 in memory order)
        mov     DWORD(tmp), 0x01000000
        pinsrq  xcounter, [p_IV], 0
        pinsrd  xcounter, [p_IV + 8], 2
        pinsrd  xcounter, DWORD(tmp), 3
bswap_iv:
        ; keep the counter in shuffled form so do_aes can step it with paddd
        pshufb  xcounter, xbyteswap

        ; tmp = 16 * (number of whole blocks mod 4)
        mov     tmp, num_bytes
        and     tmp, 3*16
        jz      chk                     ; no leftover 1..3 whole blocks

        ; tmp is 16, 32 or 48 here: dispatch on 1, 2 or 3 leftover blocks
        cmp     tmp, 2*16
        ja      eq3                     ; unsigned compare: tmp is a byte count
        je      eq2
eq1:
        do_aes_load     1
        add     p_out, 1*16
        jmp     chk

eq2:
        do_aes_load     2
        add     p_out, 2*16
        jmp     chk

eq3:
        do_aes_load     3
        add     p_out, 3*16
        ; fall through to chk
chk:
        ; Drop the leftover-block bits that were just handled. What remains is
        ; 64*k + r with r = num_bytes mod 16 (bits 4 and 5 are now clear).
        and     num_bytes, ~(3*16)
        jz      do_return2              ; nothing left
        cmp     num_bytes, 16
        jb      last                    ; only a partial block (1..15 bytes) left

        ; process multiples of 4 blocks; cache round keys 0/4/8/12 once
        movdqa  xkey0, [p_keys + 0*16]
        movdqa  xkey4, [p_keys + 4*16]
        movdqa  xkey8, [p_keys + 8*16]
        movdqa  xkey12, [p_keys + 12*16]
        jmp     main_loop2              ; skip the alignment padding

align 32
main_loop2:
        ; at least 4 whole blocks (64 bytes) remain here
        do_aes_noload   4
        add     p_out, 4*16
        sub     num_bytes, 4*16
        cmp     num_bytes, 4*16
        jae     main_loop2

        test    num_bytes, 15           ; partial bytes to be processed?
        jnz     last

do_return2:
        ; don't return updated IV
;       pshufb  xcounter, xbyteswap
;       movdqu  [p_IV], xcounter
        ret

last:
        ;; Code dealing with the partial block cases (num_bytes is 1..15 here)
        ; reserve 16 byte aligned buffer on stack
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16
        mov     [rsp + _rsp_save], rax  ; save SP

        ; copy input bytes into scratch buffer
        memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
        ; Encryption of a single partial block (p_tmp)
        ; put the counter back into block byte order (same shuffle as do_aes)
        pshufb  xcounter, xbyteswap
        movdqa  xdata0, xcounter
        pxor    xdata0, [p_keys + 16*0]
%assign i 1
%rep 13
        aesenc  xdata0, [p_keys + 16*i]
%assign i (i+1)
%endrep
        ; created keystream
        aesenclast xdata0, [p_keys + 16*i]
        ; xor keystream with the message (scratch)
        pxor    xdata0, [p_tmp]
        movdqa  [p_tmp], xdata0
        ; copy result into the output buffer
        memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
        ; Scrub the scratch block before giving the stack back: bytes beyond
        ; num_bytes hold keystream XORed with whatever was on the stack and
        ; must not be left behind for later stack users.
        pxor    xdata0, xdata0
        movdqa  [p_tmp], xdata0
        ; remove the stack frame
        mov     rsp, [rsp + _rsp_save]  ; original SP
        jmp     do_return2

iv_is_16_bytes:
        ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
        movdqu  xcounter, [p_IV]
        jmp     bswap_iv
361
; On LINUX builds, emit an empty, non-executable .note.GNU-stack section.
; This is the conventional ELF marker telling GNU toolchains that this object
; does not require an executable stack.
362%ifdef LINUX
363section .note.GNU-stack noalloc noexec nowrite progbits
364%endif