]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
import 15.2.0 Octopus source
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / aes128_cntr_by4_sse.asm
1 ;;
2 ;; Copyright (c) 2012-2018, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28 %include "os.asm"
29 %include "memcpy.asm"
30
31 ; routine to do AES128 CNTR enc/decrypt "by4"
32 ; XMM registers are clobbered. Saving/restoring must be done at a higher level
33
; Symbol name under which the routine in this file is emitted. A build may
; pre-define AES_CNTR_128 to give the same code a different name; when it is
; not defined the name defaults to aes_cntr_128_sse.
34 %ifndef AES_CNTR_128
35 %define AES_CNTR_128 aes_cntr_128_sse
36 %endif
37
; Constants defined outside this file (their values are not visible here):
;   byteswap_const - loaded into xbyteswap and used as the pshufb mask applied
;                    to the counter block when it is read from the IV and
;                    again before the counter is encrypted
;   ddq_add_N      - paddd addends; this file uses them to derive the counter
;                    for the N-th following block and to step the counter by
;                    N blocks (presumably N in the low dword - confirm at the
;                    definition)
38 extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
39
; CONCAT(a,b) pastes two tokens together; it is used to build indexed names
; such as xdata<i> and ddq_add_<i> inside %rep loops.
40 %define CONCAT(a,b) a %+ b
; MOVDQ is the move used for the in/out data buffers. It is the unaligned
; form, so those buffers are not required to be 16-byte aligned.
41 %define MOVDQ movdqu
42
; xdata0..xdata7: per-block working registers (counter block -> keystream ->
; output). The routine in this file processes at most 4 blocks at a time, so
; only xdata0..xdata3 are actually touched here.
43 %define xdata0 xmm0
44 %define xdata1 xmm1
45 %define xdata2 xmm2
46 %define xdata3 xmm3
47 %define xdata4 xmm4
48 %define xdata5 xmm5
49 %define xdata6 xmm6
50 %define xdata7 xmm7
; xcounter: running counter block, kept in byte-swapped form so that paddd
; can increment it. xbyteswap: the pshufb mask loaded from byteswap_const.
51 %define xcounter xmm8
52 %define xbyteswap xmm9
; xkey0/3/6/9: round keys 0, 3, 6 and 9, kept resident across iterations of
; the 4-block loop.
53 %define xkey0 xmm10
54 %define xkey3 xmm11
55 %define xkey6 xmm12
56 %define xkey9 xmm13
; xkeyA/xkeyB: rotating temporaries for the remaining round keys; do_aes also
; reuses them to hold input data for the final XOR.
57 %define xkeyA xmm14
58 %define xkeyB xmm15
59
; Argument mapping for
;   AES_CNTR_128(in, IV, keys, out, num_bytes, iv_len)
;
; LINUX build: the six arguments are taken from rdi, rsi, rdx, rcx, r8, r9
; (System V AMD64 integer-argument order).
60 %ifdef LINUX
61 %define p_in rdi
62 %define p_IV rsi
63 %define p_keys rdx
64 %define p_out rcx
65 %define num_bytes r8
66 %define p_ivlen r9
; Non-LINUX build (Windows x64 register order): the first four arguments are
; in rcx, rdx, r8, r9. num_bytes lives in r10 and is loaded from the stack at
; function entry. p_ivlen is NOT a register: it is a stack memory operand
; relative to rsp, so it may only be used while rsp still holds its value at
; function entry.
67 %else
68 %define p_in rcx
69 %define p_IV rdx
70 %define p_keys r8
71 %define p_out r9
72 %define num_bytes r10
73 %define p_ivlen qword [rsp + 8*6]
74 %endif
75
; p_tmp: address expression of the 16-byte scratch block inside the STACK
; frame; only meaningful after that frame has been set up (partial-block path).
76 %define p_tmp rsp + _buffer
; tmp: general-purpose scratch register.
77 %define tmp r11
78
; do_aes_load num_in_par
; Expands do_aes for %1 parallel blocks with load_keys = 1, i.e. the expansion
; itself loads xkey0/xkey3/xkey6/xkey9 from p_keys.
79 %macro do_aes_load 1
80 do_aes %1, 1
81 %endmacro
82
; do_aes_noload num_in_par
; Expands do_aes for %1 parallel blocks with load_keys = 0: xkey0/xkey3/xkey6/
; xkey9 must already hold round keys 0, 3, 6 and 9 when the expansion runs.
83 %macro do_aes_noload 1
84 do_aes %1, 0
85 %endmacro
86
87 ; do_aes num_in_par load_keys
88 ; This increments p_in, but not p_out
;
; Encrypts/decrypts num_in_par consecutive 16-byte blocks in counter mode.
;   %1 (num_in_par) - number of blocks handled by one expansion; this file
;                     uses 1, 2, 3 and 4
;   %2 (load_keys)  - non-zero: load xkey0/xkey3/xkey6/xkey9 from p_keys here;
;                     zero: the caller has already loaded them
; Reads:    xcounter (byte-swapped counter of the first block), xbyteswap,
;           p_keys (11 round keys at p_keys + 16*r, r = 0..10, read with
;           movdqa and therefore required to be 16-byte aligned),
;           p_in, p_out
; Updates:  xcounter += ddq_add_<num_in_par>; p_in += 16*num_in_par
; Clobbers: xdata0..xdata<num_in_par-1>, xkeyA, xkeyB, flags; also
;           xkey0/xkey3/xkey6/xkey9 when load_keys is set
; Round-key loads are deliberately interleaved with the AES rounds that use
; the previously loaded key; keep the statement order when editing.
89 %macro do_aes 2
; macro-local names for the two parameters
90 %define %%by %1
91 %define %%load_keys %2
92
93 %if (%%load_keys)
94 movdqa xkey0, [p_keys + 0*16]
95 %endif
96
; Build the counter blocks: block 0 uses xcounter as is, block i uses
; xcounter + ddq_add_<i>; each is then shuffled with xbyteswap before being
; encrypted.
97 movdqa xdata0, xcounter
98 pshufb xdata0, xbyteswap
99 %assign i 1
100 %rep (%%by - 1)
101 movdqa CONCAT(xdata,i), xcounter
102 paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
103 pshufb CONCAT(xdata,i), xbyteswap
104 %assign i (i + 1)
105 %endrep
106
107 movdqa xkeyA, [p_keys + 1*16]
108
; Round 0: XOR every counter block with round key 0. In between, xcounter is
; advanced past the blocks consumed by this expansion.
109 pxor xdata0, xkey0
110 paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
111 %assign i 1
112 %rep (%%by - 1)
113 pxor CONCAT(xdata,i), xkey0
114 %assign i (i + 1)
115 %endrep
116
; Rounds 1..9 (aesenc) and round 10 (aesenclast). The key for round r+1 is
; loaded just before the aesenc group of round r.
117 movdqa xkeyB, [p_keys + 2*16]
118 %assign i 0
119 %rep %%by
120 aesenc CONCAT(xdata,i), xkeyA ; key 1
121 %assign i (i+1)
122 %endrep
123
124 %if (%%load_keys)
125 movdqa xkey3, [p_keys + 3*16]
126 %endif
127 %assign i 0
128 %rep %%by
129 aesenc CONCAT(xdata,i), xkeyB ; key 2
130 %assign i (i+1)
131 %endrep
132
; p_in is advanced here, in the middle of the rounds; the input loads further
; down compensate with a "- 16*%%by" displacement.
133 add p_in, 16*%%by
134
135 movdqa xkeyB, [p_keys + 4*16]
136 %assign i 0
137 %rep %%by
138 aesenc CONCAT(xdata,i), xkey3 ; key 3
139 %assign i (i+1)
140 %endrep
141
142 movdqa xkeyA, [p_keys + 5*16]
143 %assign i 0
144 %rep %%by
145 aesenc CONCAT(xdata,i), xkeyB ; key 4
146 %assign i (i+1)
147 %endrep
148
149 %if (%%load_keys)
150 movdqa xkey6, [p_keys + 6*16]
151 %endif
152 %assign i 0
153 %rep %%by
154 aesenc CONCAT(xdata,i), xkeyA ; key 5
155 %assign i (i+1)
156 %endrep
157
158 movdqa xkeyA, [p_keys + 7*16]
159 %assign i 0
160 %rep %%by
161 aesenc CONCAT(xdata,i), xkey6 ; key 6
162 %assign i (i+1)
163 %endrep
164
165 movdqa xkeyB, [p_keys + 8*16]
166 %assign i 0
167 %rep %%by
168 aesenc CONCAT(xdata,i), xkeyA ; key 7
169 %assign i (i+1)
170 %endrep
171
172 %if (%%load_keys)
173 movdqa xkey9, [p_keys + 9*16]
174 %endif
175 %assign i 0
176 %rep %%by
177 aesenc CONCAT(xdata,i), xkeyB ; key 8
178 %assign i (i+1)
179 %endrep
180
181 movdqa xkeyB, [p_keys + 10*16]
182 %assign i 0
183 %rep %%by
184 aesenc CONCAT(xdata,i), xkey9 ; key 9
185 %assign i (i+1)
186 %endrep
187
188 %assign i 0
189 %rep %%by
190 aesenclast CONCAT(xdata,i), xkeyB ; key 10
191 %assign i (i+1)
192 %endrep
193
; XOR the keystream with the input, two blocks per step. xkeyA/xkeyB are free
; now and serve as input temporaries. Input is read with the unaligned MOVDQ.
194 %assign i 0
195 %rep (%%by / 2)
196 %assign j (i+1)
197 MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
198 MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
199 pxor CONCAT(xdata,i), xkeyA
200 pxor CONCAT(xdata,j), xkeyB
201 %assign i (i+2)
202 %endrep
; odd block count: one block is left over after the pairs
203 %if (i < %%by)
204 MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
205 pxor CONCAT(xdata,i), xkeyA
206 %endif
207
; Store the results; p_out itself is left for the caller to advance.
208 %assign i 0
209 %rep %%by
210 MOVDQ [p_out + i*16], CONCAT(xdata,i)
211 %assign i (i+1)
212 %endrep
213 %endmacro
214
; Stack frame used only by the partial-block path of the routine below.
; STACK_size is 24 bytes (three qwords).
215 struc STACK
; 16-byte scratch block; the routine aligns rsp to 16 before using the frame,
; so this field (offset 0) is accessed with movdqa
216 _buffer: resq 2
; value of rsp before the frame was carved out, restored on exit
217 _rsp_save: resq 1
218 endstruc
219
220 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
221 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
223 section .text
224
225 ;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
;;
;; AES-128 counter-mode encrypt/decrypt of num_bytes bytes from in to out.
;;
;; Arguments (registers per the p_in/p_IV/... defines: rdi, rsi, rdx, rcx,
;; r8, r9 when LINUX is defined; otherwise rcx, rdx, r8, r9 plus two stack
;; arguments):
;;   in        - input data; full blocks are read with unaligned loads
;;   IV        - if bit 4 of iv_len is set, 16 bytes are read and used as the
;;               initial counter block; otherwise 12 bytes are read and the
;;               32-bit big-endian block counter 0x00000001 is appended.
;;               The IV in memory is never written back.
;;   keys      - 11 expanded round keys, 16 bytes each, read with movdqa and
;;               as aesenc memory operands: must be 16-byte aligned
;;   out       - output data; full blocks are written with unaligned stores
;;   num_bytes - number of bytes to process; any value, 0 returns immediately
;;   iv_len    - expected to be 12 or 16 (only bit 4 is tested)
;;
;; Processing order: first the 1..3 blocks given by (num_bytes / 16) mod 4,
;; then groups of 4 blocks, then a trailing partial block of 1..15 bytes.
;;
;; Clobbers: xmm0-xmm3, xmm8-xmm15, r11, flags, the registers holding in, out
;; and num_bytes, and rax on the partial-block path (plus anything
;; memcpy_sse_16_1 uses beyond the two temporaries passed to it). No XMM
;; register is saved here, including xmm8-xmm15 on non-LINUX builds; the
;; caller is responsible for that.
226 align 32
227 MKGLOBAL(AES_CNTR_128,function,internal)
228 AES_CNTR_128:
229
230 %ifndef LINUX
231 mov num_bytes, [rsp + 8*5] ; arg5
232 %endif
233
234 movdqa xbyteswap, [rel byteswap_const]
; p_ivlen is a stack operand on non-LINUX builds: test it before rsp changes
235 test p_ivlen, 16
236 jnz iv_is_16_bytes
237 ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
238 mov DWORD(tmp), 0x01000000
239 pinsrq xcounter, [p_IV], 0
240 pinsrd xcounter, [p_IV + 8], 2
241 pinsrd xcounter, DWORD(tmp), 3
242 bswap_iv:
; keep the counter byte-swapped so paddd can increment it
243 pshufb xcounter, xbyteswap
244
; tmp = byte count of the 0..3 leading blocks: (num_bytes / 16 mod 4) * 16
245 mov tmp, num_bytes
246 and tmp, 3*16
247 jz chk ; no 1..3-block remainder to handle first
248
249 ; tmp is 16, 32 or 48 here (1, 2 or 3 blocks)
250 cmp tmp, 2*16
251 ja eq3 ; byte count: unsigned compare
252 je eq2
253 eq1:
254 do_aes_load 1 ; 1 block
255 add p_out, 1*16
256 jmp chk
257
258 eq2:
259 do_aes_load 2 ; 2 blocks
260 add p_out, 2*16
261 jmp chk
262
263 eq3:
264 do_aes_load 3 ; 3 blocks
265 add p_out, 3*16
266 ; fall through to chk
267 chk:
; drop the 1..3 blocks handled above: what is left is 64*k + (0..15) bytes
268 and num_bytes, ~(3*16)
269 jz do_return2
; below 16 (and non-zero): only a partial block remains
270 cmp num_bytes, 16
271 jb last
272
273 ; process multiples of 4 blocks
; round keys 0/3/6/9 stay resident for the whole loop (do_aes_noload)
274 movdqa xkey0, [p_keys + 0*16]
275 movdqa xkey3, [p_keys + 3*16]
276 movdqa xkey6, [p_keys + 6*16]
277 movdqa xkey9, [p_keys + 9*16]
; jump over the alignment padding in front of the loop head
278 jmp main_loop2
279
280 align 32
281 main_loop2:
282 ; num_bytes is a multiple of 4 blocks + partial bytes
283 do_aes_noload 4
284 add p_out, 4*16
285 sub num_bytes, 4*16
286 cmp num_bytes, 4*16
287 jae main_loop2
288
289 test num_bytes, 15 ; partial bytes to be processed?
290 jnz last
291
292 do_return2:
293 ; don't return updated IV
294 ; pshufb xcounter, xbyteswap
295 ; movdqu [p_IV], xcounter
296 ret
297
298 last:
299 ;; Code dealing with the partial block cases
;; here 1 <= num_bytes <= 15
300 ; reserve 16 byte aligned buffer on the stack
301 mov rax, rsp
302 sub rsp, STACK_size
303 and rsp, -16
304 mov [rsp + _rsp_save], rax ; save SP
305
306 ; copy input bytes into scratch buffer
; (rax is free again: the original SP is already stored in the frame)
307 memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
308 ; Encryption of a single partial block (p_tmp)
; undo the byte swap so the counter block is encrypted in its IV byte order
309 pshufb xcounter, xbyteswap
310 movdqa xdata0, xcounter
311 pxor xdata0, [p_keys + 16*0]
312 %assign i 1
313 %rep 9
314 aesenc xdata0, [p_keys + 16*i]
315 %assign i (i+1)
316 %endrep
317 ; created keystream
318 aesenclast xdata0, [p_keys + 16*i]
319 ; xor keystream with the message (scratch)
320 pxor xdata0, [p_tmp]
321 movdqa [p_tmp], xdata0
322 ; copy result into the output buffer
323 memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
; wipe the scratch block so that no message or keystream bytes are left
; behind on the stack once the frame is released
pxor xdata0, xdata0
movdqa [p_tmp], xdata0
324 ; remove the stack frame
325 mov rsp, [rsp + _rsp_save] ; original SP
326 jmp do_return2
327
328 iv_is_16_bytes:
329 ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
330 movdqu xcounter, [p_IV]
331 jmp bswap_iv
332
; LINUX builds only: emit an empty .note.GNU-stack section flagged noexec.
; ELF toolchains conventionally read this as "this object does not need an
; executable stack".
333 %ifdef LINUX
334 section .note.GNU-stack noalloc noexec nowrite progbits
335 %endif