]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx / aes256_cntr_by8_avx.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
f67539c2
TL
28%include "include/os.asm"
29%include "include/memcpy.asm"
30%include "include/const.inc"
31%include "include/reg_sizes.asm"
11fdf7f2
TL
32
; routine to do AES256 CNTR enc/decrypt "by8"
; XMM registers are clobbered. Saving/restoring must be done at a higher level

; byte-swap shuffle mask and little-endian counter-increment constants
; (+1 .. +8), resolved from the library's shared constant tables
extern byteswap_const
extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8

; token-pasting helper: CONCAT(xdata,3) expands to xdata3
%define CONCAT(a,b) a %+ b
; unaligned load/store for message data (input/output alignment not guaranteed)
%define VMOVDQ vmovdqu

;; XMM register aliases.
;; NOTE: several names intentionally share one physical register
;; (xpart/xdata1, xtmp/xcounter, xtmp2/xbyteswap, xtmp3/xkey0); each pair is
;; only live in non-overlapping phases of the code, so the aliasing is safe.
%define xdata0 xmm0
%define xdata1 xmm1
%define xpart xmm1
%define xdata2 xmm2
%define xdata3 xmm3
%define xdata4 xmm4
%define xdata5 xmm5
%define xdata6 xmm6
%define xdata7 xmm7
%define xcounter xmm8
%define xtmp xmm8
%define xbyteswap xmm9
%define xtmp2 xmm9
%define xkey0 xmm10
%define xtmp3 xmm10
%define xkey4 xmm11
%define xkey8 xmm12
%define xkey12 xmm13
%define xkeyA xmm14
%define xkeyB xmm15

;; Argument registers per calling convention (SysV AMD64 vs Microsoft x64).
;; num_bytes/num_bits alias the same register: the length argument is a byte
;; count for CNTR/CCM and a bit count for CNTR_BIT.
%ifdef LINUX
%define p_in rdi
%define p_IV rsi
%define p_keys rdx
%define p_out rcx
%define num_bytes r8
%define num_bits r8
%define p_ivlen r9
%else
%define p_in rcx
%define p_IV rdx
%define p_keys r8
%define p_out r9
%define num_bytes r10
%define num_bits r10
;; 6th argument is on the stack under the Win64 convention
%define p_ivlen qword [rsp + 8*6]
%endif

%define tmp r11

;; callee-saved GP registers; only used (and pushed/popped) on CNTR_BIT paths
%define r_bits r12
%define tmp2 r13
%define mask r14

;; do_aes_load NUM_BLOCKS, CNTR_TYPE
;; Expands do_aes with load_keys=1: round keys 0/4/8/12 are (re)loaded from
;; p_keys inside the expansion. Used for the sub-8-block remainder paths.
%macro do_aes_load 2
	do_aes %1, %2, 1
%endmacro
91
f67539c2
TL
;; do_aes_noload NUM_BLOCKS, CNTR_TYPE
;; Expands do_aes with load_keys=0: caller must have preloaded
;; xkey0/xkey4/xkey8/xkey12. Used in the 8-blocks-at-a-time main loop.
%macro do_aes_noload 2
	do_aes %1, %2, 0
%endmacro
95
; do_aes num_in_par, cntr_type, load_keys
; This increments p_in, but not p_out
;
; Encrypts/decrypts %1 (1-8) AES256-CTR blocks in parallel:
;  - builds the counter blocks in xdata0..xdata<N-1> (byte-swapped to BE)
;  - runs all 14 AES rounds across every lane, ping-ponging the round keys
;    through xkeyA/xkeyB while keys 0/4/8/12 stay resident in xkey0/4/8/12
;  - XORs the resulting keystream with the input and stores to [p_out]
; For CNTR_BIT, the last partial byte of the final block may be merged with
; the bytes already present in the output buffer (bit-length messages).
%macro do_aes 3
%define %%by %1         ; [in] number of blocks processed in parallel (1-8)
%define %%cntr_type %2  ; [in] CNTR, CNTR_BIT or CCM
%define %%load_keys %3  ; [in] 1 = load round keys 0/4/8/12 from p_keys here

%if (%%load_keys)
	vmovdqa	xkey0, [p_keys + 0*16]
%endif

	; lane 0 uses the current counter; lanes 1..N-1 add 1..N-1 to it,
	; then every lane is byte-swapped to big-endian block form
	vpshufb	xdata0, xcounter, xbyteswap
%assign i 1
%rep (%%by - 1)
	vpaddd	CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)]
	vpshufb	CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
%assign i (i + 1)
%endrep

	vmovdqa	xkeyA, [p_keys + 1*16]

	vpxor	xdata0, xkey0			; round 0 (AddRoundKey), lane 0
	; advance the running counter by the number of blocks consumed
	; (32-bit add for CNTR_BIT, 64-bit add otherwise)
%ifidn %%cntr_type, CNTR_BIT
	vpaddd	xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
%else
	vpaddq	xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
%endif

	; round 0 for the remaining lanes
%assign i 1
%rep (%%by - 1)
	vpxor	CONCAT(xdata,i), xkey0
%assign i (i + 1)
%endrep

	; rounds 1..13: each round key load is hoisted ahead of the previous
	; round's vaesenc sweep to hide the load latency
	vmovdqa	xkeyB, [p_keys + 2*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 1
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 3*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB		; key 2
%assign i (i+1)
%endrep

	; consume the input pointer now; loads below compensate with -16*%%by
	add	p_in, 16*%%by

%if (%%load_keys)
	vmovdqa	xkey4, [p_keys + 4*16]
%endif
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 3
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 5*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkey4		; key 4
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 6*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 5
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 7*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB		; key 6
%assign i (i+1)
%endrep

%if (%%load_keys)
	vmovdqa	xkey8, [p_keys + 8*16]
%endif
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 7
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 9*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkey8		; key 8
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 10*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 9
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 11*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB		; key 10
%assign i (i+1)
%endrep

%if (%%load_keys)
	vmovdqa	xkey12, [p_keys + 12*16]
%endif
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 11
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 13*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkey12	; key 12
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 14*16]
%assign i 0
%rep %%by
	vaesenc	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA		; key 13
%assign i (i+1)
%endrep

%assign i 0
%rep %%by
	vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB	; key 14
%assign i (i+1)
%endrep

	; XOR keystream with the message, two blocks per iteration
	; (xkeyA/xkeyB are dead now and reused as data scratch;
	;  p_in was already advanced above, hence the -16*%%by bias)
%assign i 0
%rep (%%by / 2)
%assign j (i+1)
	VMOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
	VMOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
	vpxor	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
	vpxor	CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
%assign i (i+2)
%endrep
%if (i < %%by)
	; odd %%by: handle the leftover block
	VMOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
	vpxor	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
%endif

%ifidn %%cntr_type, CNTR_BIT
	;; check if this is the end of the message
	mov	tmp, num_bytes
	and	tmp, ~(%%by*16)
	jnz	%%skip_preserve
	;; Check if there is a partial byte
	or	r_bits, r_bits
	jz	%%skip_preserve

%assign idx (%%by - 1)
	;; Load output to get last partial byte
	vmovdqu	xtmp, [p_out + idx * 16]

	;; Save RCX in temporary GP register (cl is needed for the shift)
	mov	tmp, rcx
	mov	mask, 0xff
	mov	cl, BYTE(r_bits)
	shr	mask, cl	;; e.g. 3 remaining bits -> mask = 00011111
	mov	rcx, tmp

	vmovq	xtmp2, mask
	vpslldq	xtmp2, 15	;; move the mask into the last (top) byte
	;; At this point, xtmp2 contains a mask with all 0s, but with some ones
	;; in the partial byte

	;; Clear all the bits that do not need to be preserved from the output
	vpand	xtmp, xtmp, xtmp2

	;; Clear all bits from the input that are not to be ciphered
	vpandn	CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
	vpor	CONCAT(xdata,idx), xtmp

%%skip_preserve:
%endif

	; store all %%by ciphered blocks (p_out is NOT advanced here)
%assign i 0
%rep %%by
	VMOVDQ	[p_out + i*16], CONCAT(xdata,i)
%assign i (i+1)
%endrep
%endmacro
290
11fdf7f2
TL
291;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
292;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
293;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
294
295section .text
f67539c2
TL
;; Macro performing AES-CTR.
;;
;; Shared body for all three entry points. Flow:
;;   1) load/format the IV into xcounter (12-byte nonce+IV padded with a
;;      0x00000001 block counter, or a full 16-byte IV),
;;   2) for CNTR_BIT convert the bit length to bytes, keeping the 0-7
;;      remainder bits in r_bits,
;;   3) dispatch the 1-7 block remainder to the matching do_aes_load size,
;;   4) loop over full groups of 8 blocks with do_aes_noload,
;;   5) handle a final partial (<16 byte) block via a single-block encrypt.
%macro DO_CNTR 1
%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)

%ifndef LINUX
	mov	num_bytes, [rsp + 8*5]	; 5th argument from stack on Win64
%endif

%ifidn %%CNTR_TYPE, CNTR_BIT
	;; r12-r14 (r_bits/tmp2/mask) are callee-saved -> preserve them
	push r12
	push r13
	push r14
%endif

	vmovdqa	xbyteswap, [rel byteswap_const]
%ifidn %%CNTR_TYPE, CNTR
	test p_ivlen, 16
	jnz	%%iv_is_16_bytes
	; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
	; (0x01000000 here because the whole block is byte-swapped below)
	mov	DWORD(tmp), 0x01000000
	vpinsrq	xcounter, [p_IV], 0
	vpinsrd	xcounter, [p_IV + 8], 2
	vpinsrd	xcounter, DWORD(tmp), 3

%else ;; CNTR_BIT
	; Read 16 byte IV: Nonce + 8-byte block counter (BE)
	vmovdqu	xcounter, [p_IV]
%endif
%%bswap_iv:
	; keep the counter in LE form internally; do_aes swaps back to BE
	vpshufb	xcounter, xbyteswap

	;; calculate len
	;; convert bits to bytes (message length in bits for CNTR_BIT)
%ifidn %%CNTR_TYPE, CNTR_BIT
	mov	r_bits, num_bits
	add	num_bits, 7
	shr	num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
	and	r_bits, 7   ; Check if there are remainder bits (0-7)
%endif

	; tmp = number of whole blocks modulo 8 (the "by8" remainder)
	mov	tmp, num_bytes
	and	tmp, 7*16
	jz	%%chk	; no remainder blocks -> straight to the 8-block loop

	; 1 <= tmp/16 <= 7: binary dispatch on the remainder block count
	cmp	tmp, 4*16
	jg	%%gt4
	je	%%eq4

%%lt4:
	cmp	tmp, 2*16
	jg	%%eq3
	je	%%eq2
%%eq1:
	do_aes_load	1, %%CNTR_TYPE
	add	p_out, 1*16
	jmp	%%chk

%%eq2:
	do_aes_load	2, %%CNTR_TYPE
	add	p_out, 2*16
	jmp	%%chk

%%eq3:
	do_aes_load	3, %%CNTR_TYPE
	add	p_out, 3*16
	jmp	%%chk

%%eq4:
	do_aes_load	4, %%CNTR_TYPE
	add	p_out, 4*16
	jmp	%%chk

%%gt4:
	cmp	tmp, 6*16
	jg	%%eq7
	je	%%eq6

%%eq5:
	do_aes_load	5, %%CNTR_TYPE
	add	p_out, 5*16
	jmp	%%chk

%%eq6:
	do_aes_load	6, %%CNTR_TYPE
	add	p_out, 6*16
	jmp	%%chk

%%eq7:
	do_aes_load	7, %%CNTR_TYPE
	add	p_out, 7*16
	; fall through to chk
%%chk:
	; drop the already-processed remainder blocks from the count
	and	num_bytes, ~(7*16)
	jz	%%do_return2

	cmp	num_bytes, 16
	jb	%%last

	; process multiples of 8 blocks
	vmovdqa	xkey0, [p_keys + 0*16]
	vmovdqa	xkey4, [p_keys + 4*16]
	vmovdqa	xkey8, [p_keys + 8*16]
	vmovdqa	xkey12, [p_keys + 12*16]
	jmp	%%main_loop2

align 32
%%main_loop2:
	; num_bytes is a multiple of 8 blocks + partial bytes
	do_aes_noload	8, %%CNTR_TYPE
	add	p_out, 8*16
	sub	num_bytes, 8*16
	cmp	num_bytes, 8*16
	jae	%%main_loop2

	; Check if there is a partial block
	or	num_bytes, num_bytes
	jnz	%%last

%%do_return2:
%ifidn %%CNTR_TYPE, CNTR_BIT
	pop r14
	pop r13
	pop r12
%endif

	ret

%%last:

	; load partial block into XMM register
	; (simd_load_avx_15_1 comes from the included memcpy.asm -- presumably
	; a safe 1-15 byte load; TODO confirm against that file)
	simd_load_avx_15_1 xpart, p_in, num_bytes

%%final_ctr_enc:
	; Encryption of a single partial block:
	; swap the counter back to BE and run the full 14-round key schedule
	vpshufb	xcounter, xbyteswap
	vmovdqa	xdata0, xcounter
	vpxor	xdata0, [p_keys + 16*0]
%assign i 1
%rep 13
	vaesenc	xdata0, [p_keys + 16*i]
%assign i (i+1)
%endrep
	; created keystream
	vaesenclast	xdata0, [p_keys + 16*i]

	; xor keystream with the message (scratch)
	vpxor	xdata0, xpart

%ifidn %%CNTR_TYPE, CNTR_BIT
	;; Check if there is a partial byte
	or	r_bits, r_bits
	jz	%%store_output

	;; Load output to get last partial byte
	simd_load_avx_15_1 xtmp, p_out, num_bytes

	;; Save RCX in temporary GP register (cl is needed for the shift)
	mov	tmp, rcx
	mov	mask, 0xff
%ifidn r_bits, rcx
%error "r_bits cannot be mapped to rcx!"
%endif
	mov	cl, BYTE(r_bits)
	shr	mask, cl	;; e.g. 3 remaining bits -> mask = 00011111
	mov	rcx, tmp

	vmovq	xtmp2, mask

	;; Get number of full bytes in last block of 16 bytes
	mov	tmp, num_bytes
	dec	tmp
	;; XVPSLLB: variable byte-shift helper (from included const.inc --
	;; TODO confirm exact semantics there); positions the bit mask at
	;; the last partial byte of the block
	XVPSLLB	xtmp2, tmp, xtmp3, tmp2
	;; At this point, xtmp2 contains a mask with all 0s, but with some ones
	;; in the partial byte

	;; Clear all the bits that do not need to be preserved from the output
	vpand	xtmp, xtmp, xtmp2

	;; Clear the bits from the input that are not to be ciphered
	vpandn	xdata0, xtmp2, xdata0
	vpor	xdata0, xtmp
%endif

%%store_output:
	; copy result into the output buffer (1-15 byte store helper)
	simd_store_avx_15	p_out, xdata0, num_bytes, tmp, rax

	jmp	%%do_return2

%%iv_is_16_bytes:
	; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
	vmovdqu	xcounter, [p_IV]
	jmp	%%bswap_iv
%endmacro
492
align 32
%ifdef CNTR_CCM_AVX
;; CCM build: single job-based entry point.
; JOB_AES_HMAC * aes_cntr_ccm_256_avx(JOB_AES_HMAC *job)
; arg 1 : job
MKGLOBAL(aes_cntr_ccm_256_avx,function,internal)
aes_cntr_ccm_256_avx:
	DO_CNTR CCM
%else
;; Plain CTR entry point; length given in bytes.
;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
;;	UINT64 iv_len)
MKGLOBAL(aes_cntr_256_avx,function,internal)
aes_cntr_256_avx:
	DO_CNTR CNTR

;; Bit-length CTR entry point (e.g. for bit-oriented protocols);
;; length given in bits.
;; aes_cntr_bit_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
;;	UINT64 iv_len)
MKGLOBAL(aes_cntr_bit_256_avx,function,internal)
aes_cntr_bit_256_avx:
	DO_CNTR CNTR_BIT
%endif ;; CNTR_CCM_AVX
11fdf7f2
TL
513
514%ifdef LINUX
515section .note.GNU-stack noalloc noexec nowrite progbits
516%endif