X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fspdk%2Fintel-ipsec-mb%2Favx%2Faes256_cntr_by8_avx.asm;fp=ceph%2Fsrc%2Fspdk%2Fintel-ipsec-mb%2Favx%2Faes256_cntr_by8_avx.asm;h=e201339da37564a514a8dea59338f035dc1e2367;hb=f67539c23b11f3b8a2ecaeeddf7a403ae1c442a8;hp=80c30f4b95480632b2f40418b66a94114272ba03;hpb=64a4c04e6850c6d9086e4c37f57c4eada541b05e;p=ceph.git diff --git a/ceph/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm b/ceph/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm index 80c30f4b9..e201339da 100644 --- a/ceph/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm +++ b/ceph/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm @@ -25,8 +25,10 @@ ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; -%include "os.asm" -%include "memcpy.asm" +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +%include "include/reg_sizes.asm" ; routine to do AES256 CNTR enc/decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level @@ -40,6 +42,7 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %define xdata0 xmm0 %define xdata1 xmm1 +%define xpart xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 @@ -47,8 +50,11 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 +%define xtmp xmm8 %define xbyteswap xmm9 +%define xtmp2 xmm9 %define xkey0 xmm10 +%define xtmp3 xmm10 %define xkey4 xmm11 %define xkey8 xmm12 %define xkey12 xmm13 @@ -61,6 +67,7 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %define p_keys rdx %define p_out rcx %define num_bytes r8 +%define num_bits r8 %define p_ivlen r9 %else %define p_in rcx @@ -68,25 +75,30 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %define p_keys r8 %define p_out r9 %define num_bytes r10 +%define num_bits r10 %define p_ivlen qword [rsp + 8*6] %endif %define tmp r11 -%define p_tmp rsp + _buffer -%macro do_aes_load 1 - do_aes %1, 1 +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 %endmacro -%macro do_aes_noload 1 - do_aes %1, 0 +%macro do_aes_noload 2 + do_aes %1, %2, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out -%macro do_aes 2 +%macro do_aes 3 %define %%by %1 -%define %%load_keys %2 +%define %%cntr_type %2 +%define %%load_keys %3 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] @@ -103,7 +115,12 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 vmovdqa xkeyA, [p_keys + 1*16] vpxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%endif + %assign i 1 %rep (%%by - 1) vpxor CONCAT(xdata,i), xkey0 @@ -229,6 +246,41 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %endif +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + vmovdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + vpslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx) + vpor CONCAT(xdata,idx), xtmp + +%%skip_preserve: +%endif + %assign i 0 %rep %%by VMOVDQ [p_out + i*16], CONCAT(xdata,i) @@ -236,132 +288,147 @@ extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %endrep %endmacro -struc STACK -_buffer: resq 2 -_rsp_save: resq 1 -endstruc - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text - -;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) -align 32 -MKGLOBAL(aes_cntr_256_avx,function,internal) -aes_cntr_256_avx: +;; Macro performing AES-CTR. +;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM) %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + vmovdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR test p_ivlen, 16 - jnz iv_is_16_bytes + jnz %%iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 vpinsrq xcounter, [p_IV], 0 vpinsrd xcounter, [p_IV + 8], 2 vpinsrd xcounter, DWORD(tmp), 3 -bswap_iv: + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + vmovdqu xcounter, [p_IV] +%endif +%%bswap_iv: vpshufb xcounter, xbyteswap + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + mov tmp, num_bytes and tmp, 7*16 - jz chk ; x8 > or < 15 (not 7 lines) + jz %%chk ; x8 > or < 15 (not 7 lines) ; 1 <= tmp <= 7 cmp tmp, 4*16 - jg gt4 - je eq4 + jg %%gt4 + je %%eq4 -lt4: +%%lt4: cmp tmp, 2*16 - jg eq3 - je eq2 -eq1: - do_aes_load 1 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE add p_out, 1*16 - jmp chk + jmp %%chk -eq2: - do_aes_load 2 +%%eq2: + do_aes_load 2, %%CNTR_TYPE add p_out, 2*16 - jmp chk + jmp %%chk -eq3: - do_aes_load 3 +%%eq3: + do_aes_load 3, %%CNTR_TYPE add p_out, 3*16 - jmp chk + jmp %%chk -eq4: - do_aes_load 4 +%%eq4: + do_aes_load 4, %%CNTR_TYPE add p_out, 4*16 - jmp chk + jmp %%chk -gt4: +%%gt4: cmp tmp, 6*16 - jg eq7 - je eq6 + jg %%eq7 + je %%eq6 -eq5: - do_aes_load 5 +%%eq5: + do_aes_load 5, %%CNTR_TYPE add p_out, 5*16 - jmp chk + jmp %%chk -eq6: - do_aes_load 6 +%%eq6: + do_aes_load 6, %%CNTR_TYPE add p_out, 6*16 - jmp chk + jmp %%chk -eq7: - do_aes_load 7 +%%eq7: + do_aes_load 7, %%CNTR_TYPE add p_out, 7*16 ; fall through to chk -chk: +%%chk: and num_bytes, ~(7*16) - jz do_return2 + jz %%do_return2 cmp num_bytes, 16 - jb last + jb %%last ; process multiples of 8 blocks vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey4, [p_keys + 4*16] vmovdqa xkey8, [p_keys + 8*16] vmovdqa xkey12, [p_keys + 12*16] - jmp main_loop2 + jmp %%main_loop2 align 32 -main_loop2: - ; num_bytes is a multiple of 8 and >0 - do_aes_noload 8 +%%main_loop2: + ; num_bytes is a multiple of 8 
blocks + partial bytes + do_aes_noload 8, %%CNTR_TYPE add p_out, 8*16 sub num_bytes, 8*16 cmp num_bytes, 8*16 - jae main_loop2 + jae %%main_loop2 - test num_bytes, 15 ; partial bytes to be processed? - jnz last + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif -do_return2: -; don't return updated IV -; vpshufb xcounter, xcounter, xbyteswap -; vmovdqu [p_IV], xcounter ret -last: - ;; Code dealing with the partial block cases - ; reserve 16 byte aligned buffer on stack - mov rax, rsp - sub rsp, STACK_size - and rsp, -16 - mov [rsp + _rsp_save], rax ; save SP - - ; copy input bytes into scratch buffer - memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax - ; Encryption of a single partial block (p_tmp) +%%last: + + ; load partial block into XMM register + simd_load_avx_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block vpshufb xcounter, xbyteswap vmovdqa xdata0, xcounter vpxor xdata0, [p_keys + 16*0] @@ -372,19 +439,77 @@ last: %endrep ; created keystream vaesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) - vpxor xdata0, [p_tmp] - vmovdqa [p_tmp], xdata0 - ; copy result into the output buffer - memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax - ; remove the stack frame - mov rsp, [rsp + _rsp_save] ; original SP - jmp do_return2 - -iv_is_16_bytes: + vpxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_avx_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XVPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + vpandn xdata0, xtmp2, xdata0 + vpor xdata0, xtmp +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) vmovdqu xcounter, [p_IV] - jmp bswap_iv + jmp %%bswap_iv +%endmacro + +align 32 +%ifdef CNTR_CCM_AVX +; JOB_AES_HMAC * aes_cntr_ccm_256_avx(JOB_AES_HMAC *job) +; arg 1 : job +MKGLOBAL(aes_cntr_ccm_256_avx,function,internal) +aes_cntr_ccm_256_avx: + DO_CNTR CCM +%else +;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_256_avx,function,internal) +aes_cntr_256_avx: + DO_CNTR CNTR + +;; aes_cntr_bit_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_bit_256_avx,function,internal) +aes_cntr_bit_256_avx: + DO_CNTR CNTR_BIT +%endif ;; CNTR_CCM_AVX %ifdef LINUX section .note.GNU-stack noalloc noexec nowrite progbits
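
Note on the CNTR_BIT additions above: the new bit-length path rounds the bit count up to whole bytes (add num_bits, 7 / shr num_bits, 3), keeps the 0-7 leftover bits in r_bits, and when storing the final block it only overwrites the top r_bits bits of the last byte, preserving the low-order bits already present in the output buffer (the mov mask, 0xff / shr mask, cl / vpand / vpandn / vpor sequence in the patch). The C sketch below is illustrative only and not part of the intel-ipsec-mb API; the function name ctr_bit_store_output and the assumption that ct[] already holds input XOR keystream for every byte of the message, including the final partial byte, are made up for this example.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the CNTR_BIT last-byte masking.
 * "ct" is assumed to already hold input XOR keystream for the
 * whole message, including the final partial byte. */
static void ctr_bit_store_output(uint8_t *out, const uint8_t *ct,
                                 uint64_t num_bits)
{
        uint64_t num_bytes = (num_bits + 7) / 8;    /* bits rounded up to bytes */
        unsigned r_bits = (unsigned)(num_bits & 7); /* leftover bits, 0..7      */

        if (num_bytes == 0)
                return;

        if (r_bits == 0) {                          /* whole-byte message */
                memcpy(out, ct, (size_t)num_bytes);
                return;
        }

        /* e.g. 3 remaining bits -> mask = 00011111 */
        uint8_t mask = (uint8_t)(0xff >> r_bits);
        size_t last = (size_t)(num_bytes - 1);

        memcpy(out, ct, last);
        /* top r_bits bits come from the ciphered data; the low bits already
         * present in the output buffer are preserved, mirroring the
         * vpand/vpandn/vpor sequence in the assembly */
        out[last] = (uint8_t)((ct[last] & (uint8_t)~mask) | (out[last] & mask));
}

Alongside this masking, the CNTR_BIT path in the patch also reads a full 16-byte IV and takes the message length in bits rather than bytes, which is why the do_aes macro now receives the counter type as an extra parameter and why aes_cntr_bit_256_avx is exported as a separate entry point.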