]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / aes256_cntr_by4_sse.asm
CommitLineData
;;
;; Copyright (c) 2012-2019, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
27
f67539c2
TL
28%include "include/os.asm"
29%include "include/memcpy.asm"
30%include "include/const.inc"
11fdf7f2
TL
31
; Routines performing AES-256 counter-mode (CNTR) encryption/decryption,
; processing up to four blocks in parallel ("by4").
; XMM registers are clobbered; saving/restoring must be done at a higher level.
34
9f95a23c
TL
; Default exported symbol names. A file that %includes this one may
; pre-define AES_CNTR_256 to substitute its own names; note that both
; names are only defaulted here when AES_CNTR_256 is not already defined.
%ifndef AES_CNTR_256
%define AES_CNTR_256		aes_cntr_256_sse
%define AES_CNTR_BIT_256	aes_cntr_bit_256_sse
%endif
39
11fdf7f2
TL
; Constants defined outside this file: the byte-shuffle mask applied to the
; counter block and the increments added to the counter (+1 .. +4).
extern byteswap_const
extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4

; Token pasting helper: CONCAT(xdata,2) -> xdata2
%define CONCAT(a,b)	a %+ b

; Move instruction used for input/output data (unaligned access)
%define MOVDQ		movdqu
44
; ---- XMM register roles ----
; Data blocks being encrypted (only xdata0..xdata3 are used by "by4" code)
%define xdata0		xmm0
%define xdata1		xmm1
%define xpart		xmm1	; partial input block; shares xmm1 with xdata1
%define xdata2		xmm2
%define xdata3		xmm3
%define xdata4		xmm4
%define xdata5		xmm5
%define xdata6		xmm6
%define xdata7		xmm7

; Counter state and shuffle mask. The xtmp* names alias these registers
; (and xkey0) and are used by the partial-byte handling of the bit-length
; variant.
%define xcounter	xmm8
%define xtmp		xmm8	; shares xmm8 with xcounter
%define xbyteswap	xmm9
%define xtmp2		xmm9	; shares xmm9 with xbyteswap
%define xkey0		xmm10
%define xtmp3		xmm10	; shares xmm10 with xkey0

; Round keys kept resident across main-loop iterations
%define xkey4		xmm11
%define xkey8		xmm12
%define xkey12		xmm13

; Scratch registers for the remaining round keys / input data
%define xkeyA		xmm14
%define xkeyB		xmm15
65
; ---- Function arguments ----
%ifdef LINUX
; All six arguments arrive in registers
%define p_in		rdi
%define p_IV		rsi
%define p_keys		rdx
%define p_out		rcx
%define num_bytes	r8
%define num_bits	r8	; same register as num_bytes
%define p_ivlen		r9
%else
; First four arguments arrive in registers; DO_CNTR loads the 5th from the
; stack into r10 and the 6th is read directly from its stack slot (the
; [rsp + 8*6] offset is only valid while nothing has been pushed).
%define p_in		rcx
%define p_IV		rdx
%define p_keys		r8
%define p_out		r9
%define num_bytes	r10
%define num_bits	r10	; same register as num_bytes
%define p_ivlen		qword [rsp + 8*6]
%endif

; ---- Scratch general-purpose registers ----
%define tmp		r11

; The following are pushed/popped by DO_CNTR in CNTR_BIT mode
%define r_bits		r12	; number of bits in the final partial byte (0-7)
%define tmp2		r13
%define mask		r14
89
; do_aes_load <num_blocks>, <cntr_type>
; Expands do_aes with load_keys = 1, i.e. xkey0/xkey4/xkey8/xkey12 are
; loaded from p_keys as part of the expansion.
%macro do_aes_load 2
	do_aes	%1, %2, 1
%endmacro
93
f67539c2
TL
; do_aes_noload <num_blocks>, <cntr_type>
; Expands do_aes with load_keys = 0, i.e. xkey0/xkey4/xkey8/xkey12 are
; expected to already hold round keys 0, 4, 8 and 12.
%macro do_aes_noload 2
	do_aes	%1, %2, 0
%endmacro
97
98
; aes_round_all <instr>, <num_blocks>, <key_reg>
; Private helper for do_aes: applies the AES round instruction <instr>
; (aesenc / aesenclast) with round key register <key_reg> to
; xdata0 .. xdata<num_blocks-1>.
; Uses the global preprocessor counter "i" (left equal to <num_blocks>).
%macro aes_round_all 3
%assign i 0
%rep %2
	%1	CONCAT(xdata,i), %3
%assign i (i+1)
%endrep
%endmacro

; do_aes <num_blocks>, <cntr_type>, <load_keys>
;   num_blocks : number of 16-byte blocks processed in parallel
;   cntr_type  : CNTR or CNTR_BIT
;   load_keys  : non-zero to load xkey0/xkey4/xkey8/xkey12 from p_keys,
;                zero when they already hold round keys 0/4/8/12
;
; Encrypts <num_blocks> consecutive counter values, XORs the result with the
; input at p_in and stores it at p_out.
; This increments p_in (by 16*num_blocks), but not p_out.
; xcounter is advanced by num_blocks; xkeyA/xkeyB are used as scratch.
; In CNTR_BIT mode, when these are the last blocks of the message and the
; message ends on a partial byte, tmp, mask, xtmp (= xcounter) and
; xtmp2 (= xbyteswap) are overwritten as well.
%macro do_aes 3
%define %%nblocks	%1
%define %%mode		%2
%define %%reload	%3

%if (%%reload)
	movdqa	xkey0, [p_keys + 0*16]
%endif

	;; Build the counter blocks: block i = shuffle(counter + i)
	movdqa	xdata0, xcounter
	pshufb	xdata0, xbyteswap
%assign i 1
%rep (%%nblocks - 1)
	movdqa	CONCAT(xdata,i), xcounter
	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
	pshufb	CONCAT(xdata,i), xbyteswap
%assign i (i + 1)
%endrep

	movdqa	xkeyA, [p_keys + 1*16]

	;; Round 0 (whitening) for block 0, interleaved with the counter update
	pxor	xdata0, xkey0
%ifidn %%mode, CNTR_BIT
	paddq	xcounter, [rel CONCAT(ddq_add_,%%nblocks)]
%else
	paddd	xcounter, [rel CONCAT(ddq_add_,%%nblocks)]
%endif

	;; Round 0 for the remaining blocks
%assign i 1
%rep (%%nblocks - 1)
	pxor	CONCAT(xdata,i), xkey0
%assign i (i + 1)
%endrep

	;; Rounds 1-14. The next round key is fetched into xkeyA/xkeyB before
	;; the current round is issued; keys 4, 8 and 12 live in dedicated
	;; registers.
	movdqa	xkeyB, [p_keys + 2*16]
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 1

	movdqa	xkeyA, [p_keys + 3*16]
	aes_round_all	aesenc, %%nblocks, xkeyB	; key 2

	add	p_in, 16*%%nblocks

%if (%%reload)
	movdqa	xkey4, [p_keys + 4*16]
%endif
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 3

	movdqa	xkeyA, [p_keys + 5*16]
	aes_round_all	aesenc, %%nblocks, xkey4	; key 4

	movdqa	xkeyB, [p_keys + 6*16]
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 5

	movdqa	xkeyA, [p_keys + 7*16]
	aes_round_all	aesenc, %%nblocks, xkeyB	; key 6

%if (%%reload)
	movdqa	xkey8, [p_keys + 8*16]
%endif
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 7

	movdqa	xkeyA, [p_keys + 9*16]
	aes_round_all	aesenc, %%nblocks, xkey8	; key 8

	movdqa	xkeyB, [p_keys + 10*16]
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 9

	movdqa	xkeyA, [p_keys + 11*16]
	aes_round_all	aesenc, %%nblocks, xkeyB	; key 10

%if (%%reload)
	movdqa	xkey12, [p_keys + 12*16]
%endif
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 11

	movdqa	xkeyA, [p_keys + 13*16]
	aes_round_all	aesenc, %%nblocks, xkey12	; key 12

	movdqa	xkeyB, [p_keys + 14*16]
	aes_round_all	aesenc, %%nblocks, xkeyA	; key 13

	aes_round_all	aesenclast, %%nblocks, xkeyB	; key 14

	;; XOR the keystream with the input, two blocks at a time.
	;; p_in was already advanced above, hence the negative displacement.
%assign i 0
%rep (%%nblocks / 2)
%assign j (i+1)
	MOVDQ	xkeyA, [p_in + i*16 - 16*%%nblocks]
	MOVDQ	xkeyB, [p_in + j*16 - 16*%%nblocks]
	pxor	CONCAT(xdata,i), xkeyA
	pxor	CONCAT(xdata,j), xkeyB
%assign i (i+2)
%endrep
	;; Odd block count: handle the block left over
%if (i < %%nblocks)
	MOVDQ	xkeyA, [p_in + i*16 - 16*%%nblocks]
	pxor	CONCAT(xdata,i), xkeyA
%endif

%ifidn %%mode, CNTR_BIT
	;; Nothing to preserve unless these are the final blocks of the
	;; message (no length bits set other than those of this group) ...
	mov	tmp, num_bytes
	and	tmp, ~(%%nblocks*16)
	jnz	%%store_blocks
	;; ... and the message ends on a partial byte
	test	r_bits, r_bits
	jz	%%store_blocks

%assign idx (%%nblocks - 1)
	;; Fetch the current contents of the last output block
	movdqu	xtmp, [p_out + idx * 16]

	;; Build the byte mask; CL is needed for the shift, so RCX is parked
	;; in tmp for the duration
	mov	tmp, rcx
	mov	mask, 0xff
	mov	cl, BYTE(r_bits)
	shr	mask, cl	;; e.g. 3 remaining bits -> mask = 00011111
	mov	rcx, tmp

	;; Move the mask byte to the last byte of the 16-byte block
	movq	xtmp2, mask
	pslldq	xtmp2, 15

	;; Keep only the masked bits of the existing output ...
	pand	xtmp, xtmp2

	;; ... and only the unmasked bits of the new result, then merge both
	pandn	xtmp2, CONCAT(xdata, idx)
	por	xtmp2, xtmp
	movdqa	CONCAT(xdata, idx), xtmp2

%%store_blocks:
%endif

	;; Write the processed blocks out
%assign i 0
%rep %%nblocks
	MOVDQ	[p_out + i*16], CONCAT(xdata,i)
%assign i (i+1)
%endrep
%endmacro
296
11fdf7f2
TL
;;----------------------------------------------------------------------
;; Code
;;----------------------------------------------------------------------
section .text
301
f67539c2
TL
;; DO_CNTR <type>
;; Body of an AES-256 counter-mode function, including its return.
;;   type = CNTR     : length argument is in bytes; IV length is checked
;;                     (bit 4 of p_ivlen set -> 16-byte IV, otherwise
;;                     12 bytes padded with block counter 1)
;;   type = CNTR_BIT : length argument is in bits; IV is always 16 bytes;
;;                     r12-r14 are saved and restored
;; Reads arguments through the p_in/p_IV/p_keys/p_out/num_bytes/p_ivlen
;; aliases. Clobbers tmp, rax and the XMM registers.
%macro DO_CNTR 1
%define %%MODE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)

%ifndef LINUX
	;; Length argument is passed on the stack
	mov	num_bytes, [rsp + 8*5]
%endif

%ifidn %%MODE, CNTR_BIT
	;; r_bits, tmp2 and mask live in these registers
	push	r12
	push	r13
	push	r14
%endif

	movdqa	xbyteswap, [rel byteswap_const]
%ifidn %%MODE, CNTR
	test	p_ivlen, 16
	jnz	%%full_iv
	; 12-byte IV (Nonce + ESP IV): append block counter 0x00000001
	; (the dword below reads 00 00 00 01 in memory order)
	mov	DWORD(tmp), 0x01000000
	pinsrq	xcounter, [p_IV], 0
	pinsrd	xcounter, [p_IV + 8], 2
	pinsrd	xcounter, DWORD(tmp), 3

%else ;; CNTR_BIT
	; 16-byte IV: Nonce + 8-byte block counter (BE)
	movdqu	xcounter, [p_IV]
%endif

%%iv_ready:
	;; Shuffle the counter block so it can be incremented with paddd/paddq
	pshufb	xcounter, xbyteswap

%ifidn %%MODE, CNTR_BIT
	;; Length was given in bits: round up to whole bytes and remember
	;; how many bits (0-7) spill into the final byte.
	;; "num_bits" and "num_bytes" are the same register.
	mov	r_bits, num_bits
	add	num_bits, 7
	shr	num_bits, 3
	and	r_bits, 7
%endif

	;; First process the 1-3 blocks that do not fit a group of four
	mov	tmp, num_bytes
	and	tmp, 3*16
	jz	%%groups_of_4		; no such blocks

	; tmp is 1, 2 or 3 blocks worth of bytes
	cmp	tmp, 2*16
	jg	%%three_blocks
	je	%%two_blocks
%%one_block:
	do_aes_load	1, %%MODE
	add	p_out, 1*16
	jmp	%%groups_of_4

%%two_blocks:
	do_aes_load	2, %%MODE
	add	p_out, 2*16
	jmp	%%groups_of_4

%%three_blocks:
	do_aes_load	3, %%MODE
	add	p_out, 3*16
	; fall through

%%groups_of_4:
	;; Drop the blocks handled above from the remaining length
	and	num_bytes, ~(3*16)
	jz	%%exit

	;; Fewer than 16 bytes left: only a partial block remains
	cmp	num_bytes, 16
	jb	%%partial_block

	; Keys kept in registers for the whole loop (do_aes_noload)
	movdqa	xkey0, [p_keys + 0*16]
	movdqa	xkey4, [p_keys + 4*16]
	movdqa	xkey8, [p_keys + 8*16]
	movdqa	xkey12, [p_keys + 12*16]

align 32
%%by4_loop:
	; num_bytes = multiple of 4 blocks, plus possibly a partial block
	do_aes_noload	4, %%MODE
	add	p_out, 4*16
	sub	num_bytes, 4*16
	cmp	num_bytes, 4*16
	jae	%%by4_loop

	; Anything left is a partial block
	test	num_bytes, num_bytes
	jnz	%%partial_block

%%exit:

%ifidn %%MODE, CNTR_BIT
	pop	r14
	pop	r13
	pop	r12
%endif

	ret

%%partial_block:

	; Fetch the remaining num_bytes input bytes into xpart
	; NOTE(review): simd_load_sse_15_1 is defined outside this file
	; (presumably include/memcpy.asm) - confirm its exact contract
	simd_load_sse_15_1 xpart, p_in, num_bytes

	; Generate one block of keystream from the current counter
	pshufb	xcounter, xbyteswap
	movdqa	xdata0, xcounter
	pxor	xdata0, [p_keys + 16*0]
%assign i 1
%rep 13
	aesenc	xdata0, [p_keys + 16*i]
%assign i (i+1)
%endrep
	; final round (i = 14 here)
	aesenclast	xdata0, [p_keys + 16*i]

	; keystream XOR message
	pxor	xdata0, xpart

%ifidn %%MODE, CNTR_BIT
	;; Message ending on a byte boundary needs no merging
	test	r_bits, r_bits
	jz	%%write_partial

	;; Fetch the existing output bytes, the last of which is only
	;; partially overwritten
	simd_load_sse_15_1 xtmp, p_out, num_bytes

	;; Build the byte mask; CL is needed for the shift, so RCX is parked
	;; in tmp for the duration
	mov	tmp, rcx
	mov	mask, 0xff
%ifidn r_bits, rcx
%error "r_bits cannot be mapped to rcx!"
%endif
	mov	cl, BYTE(r_bits)
	shr	mask, cl	;; e.g. 3 remaining bits -> mask = 00011111
	mov	rcx, tmp

	movq	xtmp2, mask

	;; Shift the mask byte up by (num_bytes - 1) bytes, i.e. onto the
	;; last byte of the partial block
	;; NOTE(review): XPSLLB is defined outside this file - confirm its
	;; operand order against its definition
	mov	tmp, num_bytes
	dec	tmp
	XPSLLB	xtmp2, tmp, xtmp3, tmp2

	;; Keep only the masked bits of the existing output ...
	pand	xtmp, xtmp2

	;; ... and only the unmasked bits of the new result, then merge both
	pandn	xtmp2, xdata0
	por	xtmp2, xtmp
	movdqa	xdata0, xtmp2
%endif

%%write_partial:
	; Store num_bytes bytes of the result (uses tmp and rax as scratch)
	simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax

	jmp	%%exit

%%full_iv:
	; 16-byte IV: Nonce + ESP IV + block counter (BE)
	movdqu	xcounter, [p_IV]
	jmp	%%iv_ready
%endmacro
469
align 32
;; void aes_cntr_256_sse(void *in, void *IV, void *keys, void *out,
;;                       UINT64 num_bytes, UINT64 iv_len)
;; Byte-length AES-256 counter mode; the whole body, including the return,
;; comes from DO_CNTR.
MKGLOBAL(AES_CNTR_256,function,internal)
AES_CNTR_256:
	DO_CNTR	CNTR
475
;; void aes_cntr_bit_256_sse(void *in, void *IV, void *keys, void *out,
;;                           UINT64 num_bits, UINT64 iv_len)
;; Bit-length AES-256 counter mode; the whole body, including the return,
;; comes from DO_CNTR.
MKGLOBAL(AES_CNTR_BIT_256,function,internal)
AES_CNTR_BIT_256:
	DO_CNTR	CNTR_BIT
11fdf7f2
TL
480
; Emit the GNU-stack note section (declared noexec) on Linux builds
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif