]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / mh_sha256 / mh_sha256_block_avx512.asm
diff --git a/ceph/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm b/ceph/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
new file mode 100644 (file)
index 0000000..1ee76dd
--- /dev/null
@@ -0,0 +1,682 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+
+ %define arg4  r8
+ %define arg5  r9
+
+ %define tmp1  r10
+ %define tmp2  r11
+ %define tmp3  r12             ; must be saved and restored
+ %define tmp4  r13             ; must be saved and restored
+ %define tmp5  r14             ; must be saved and restored
+ %define tmp6  r15             ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+       push    r12
+       push    r13
+       push    r14
+       push    r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+       pop     r15
+       pop     r14
+       pop     r13
+       pop     r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+
+ %define arg4   r10
+ %define arg5   r11
+ %define tmp1   r12            ; must be saved and restored
+ %define tmp2   r13            ; must be saved and restored
+ %define tmp3   r14            ; must be saved and restored
+ %define tmp4   r15            ; must be saved and restored
+ %define tmp5   rdi            ; must be saved and restored
+ %define tmp6   rsi            ; must be saved and restored
+ %define return rax
+
+ %define stack_size  10*16 + 7*8               ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+       alloc_stack     stack_size
+       save_xmm128     xmm6, 0*16
+       save_xmm128     xmm7, 1*16
+       save_xmm128     xmm8, 2*16
+       save_xmm128     xmm9, 3*16
+       save_xmm128     xmm10, 4*16
+       save_xmm128     xmm11, 5*16
+       save_xmm128     xmm12, 6*16
+       save_xmm128     xmm13, 7*16
+       save_xmm128     xmm14, 8*16
+       save_xmm128     xmm15, 9*16
+       save_reg        r12,  10*16 + 0*8
+       save_reg        r13,  10*16 + 1*8
+       save_reg        r14,  10*16 + 2*8
+       save_reg        r15,  10*16 + 3*8
+       save_reg        rdi,  10*16 + 4*8
+       save_reg        rsi,  10*16 + 5*8
+       end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+       movdqa  xmm6, [rsp + 0*16]
+       movdqa  xmm7, [rsp + 1*16]
+       movdqa  xmm8, [rsp + 2*16]
+       movdqa  xmm9, [rsp + 3*16]
+       movdqa  xmm10, [rsp + 4*16]
+       movdqa  xmm11, [rsp + 5*16]
+       movdqa  xmm12, [rsp + 6*16]
+       movdqa  xmm13, [rsp + 7*16]
+       movdqa  xmm14, [rsp + 8*16]
+       movdqa  xmm15, [rsp + 9*16]
+       mov     r12,  [rsp + 10*16 + 0*8]
+       mov     r13,  [rsp + 10*16 + 1*8]
+       mov     r14,  [rsp + 10*16 + 2*8]
+       mov     r15,  [rsp + 10*16 + 3*8]
+       mov     rdi,  [rsp + 10*16 + 4*8]
+       mov     rsi,  [rsp + 10*16 + 5*8]
+       add     rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops          arg3
+;variables of mh_sha256
+%define mh_in_p        arg0
+%define mh_digests_p   arg1
+%define mh_data_p      arg2
+;variables used by storing segs_digests on stack
+%define RSP_SAVE       tmp2
+%define FRAMESZ        4*8*16          ;BYTES*DWORDS*SEGS
+; Common definitions
+%define ROUND  tmp4
+%define TBL    tmp5
+
+%define pref   tmp3
+%macro PREFETCH_X 1
+%define %%mem  %1
+       prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS  vmovups
+
+%define A      zmm0
+%define B      zmm1
+%define C      zmm2
+%define D      zmm3
+%define E      zmm4
+%define F      zmm5
+%define G      zmm6
+%define H      zmm7
+%define T1     zmm8
+%define TMP0   zmm9
+%define TMP1   zmm10
+%define TMP2   zmm11
+%define TMP3   zmm12
+%define TMP4   zmm13
+%define TMP5   zmm14
+%define TMP6   zmm15
+
+%define W0     zmm16
+%define W1     zmm17
+%define W2     zmm18
+%define W3     zmm19
+%define W4     zmm20
+%define W5     zmm21
+%define W6     zmm22
+%define W7     zmm23
+%define W8     zmm24
+%define W9     zmm25
+%define W10    zmm26
+%define W11    zmm27
+%define W12    zmm28
+%define W13    zmm29
+%define W14    zmm30
+%define W15    zmm31
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define APPEND(a,b) a %+ b
+;;  CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_2  ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6  ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7  ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT   %1
+%define %%ROUND        %2
+       ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+       ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+       ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+       ;; H becomes T2, then add T1 for A
+       ;; D becomes D + T1 for E
+
+       vpaddd          T1, H, TMP3             ; T1 = H + Kt
+       vmovdqa32       TMP0, E
+       vprord          TMP1, E, 6              ; ROR_6(E)
+       vprord          TMP2, E, 11             ; ROR_11(E)
+       vprord          TMP3, E, 25             ; ROR_25(E)
+       vpternlogd      TMP0, F, G, 0xCA        ; TMP0 = CH(E,F,G)
+       vpaddd          T1, T1, %%WT            ; T1 = T1 + Wt
+       vpternlogd      TMP1, TMP2, TMP3, 0x96  ; TMP1 = SIGMA1(E)
+       vpaddd          T1, T1, TMP0            ; T1 = T1 + CH(E,F,G)
+       vpaddd          T1, T1, TMP1            ; T1 = T1 + SIGMA1(E)
+       vpaddd          D, D, T1                ; D = D + T1
+
+       vprord          H, A, 2                 ; ROR_2(A)
+       vprord          TMP2, A, 13             ; ROR_13(A)
+       vprord          TMP3, A, 22             ; ROR_22(A)
+       vmovdqa32       TMP0, A
+       vpternlogd      TMP0, B, C, 0xE8        ; TMP0 = MAJ(A,B,C)
+       vpternlogd      H, TMP2, TMP3, 0x96     ; H(T2) = SIGMA0(A)
+       vpaddd          H, H, TMP0              ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+       vpaddd          H, H, T1                ; H(A) = H(T2) + T1
+
+       vmovdqa32       TMP3, [TBL + ((%%ROUND+1)*64)]  ; Next Kt
+
+       ;; Rotate the args A-H (rotation of names associated with regs)
+       ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT   %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14        %4
+       vprord          TMP4, %%WTp14, 17       ; ROR_17(Wt-2)
+       vprord          TMP5, %%WTp14, 19       ; ROR_19(Wt-2)
+       vpsrld          TMP6, %%WTp14, 10       ; SHR_10(Wt-2)
+       vpternlogd      TMP4, TMP5, TMP6, 0x96  ; TMP4 = sigma1(Wt-2)
+
+       vpaddd          %%WT, %%WT, TMP4        ; Wt = Wt-16 + sigma1(Wt-2)
+       vpaddd          %%WT, %%WT, %%WTp9      ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+       vprord          TMP4, %%WTp1, 7         ; ROR_7(Wt-15)
+       vprord          TMP5, %%WTp1, 18        ; ROR_18(Wt-15)
+       vpsrld          TMP6, %%WTp1, 3         ; SHR_3(Wt-15)
+       vpternlogd      TMP4, TMP5, TMP6, 0x96  ; TMP4 = sigma0(Wt-15)
+
+       vpaddd          %%WT, %%WT, TMP4        ; Wt = Wt-16 + sigma1(Wt-2) +
+                                               ;      Wt-7 + sigma0(Wt-15) +
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT    %1
+%define %%OFFSET %2
+       mov             inp0, [IN + (%%OFFSET*8)]
+       vmovups         %%WT, [inp0+IDX]
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a  | b  |  c | ...|  p | (16)
+; h0 | h0 | h0 | ...| h0 |    | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 |    | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 |    | Ha| Hb | Hc |...| Hp |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+;              uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][8])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 number  of 1KB blocks
+;
+global mh_sha256_block_avx512
+func(mh_sha256_block_avx512)
+       endbranch
+       FUNC_SAVE
+       ; save rsp
+       mov     RSP_SAVE, rsp
+
+       cmp     loops, 0
+       jle     .return
+
+       ; leave enough space to store segs_digests
+       sub     rsp, FRAMESZ
+       ; align rsp to 64 Bytes needed by avx512
+       and     rsp, ~0x3F
+       lea     TBL,[TABLE]
+
+       ; copy segs_digests into stack and ZMM
+       VMOVPS  A, [mh_digests_p + 64*0]
+       VMOVPS  B, [mh_digests_p + 64*1]
+       VMOVPS  C, [mh_digests_p + 64*2]
+       VMOVPS  D, [mh_digests_p + 64*3]
+       VMOVPS  E, [mh_digests_p + 64*4]
+       VMOVPS  F, [mh_digests_p + 64*5]
+       VMOVPS  G, [mh_digests_p + 64*6]
+       VMOVPS  H, [mh_digests_p + 64*7]
+
+.block_loop:
+       ; Save digests for later addition
+       vmovdqa32 [rsp + 64*0], A
+       vmovdqa32 [rsp + 64*1], B
+       vmovdqa32 [rsp + 64*2], C
+       vmovdqa32 [rsp + 64*3], D
+       vmovdqa32 [rsp + 64*4], E
+       vmovdqa32 [rsp + 64*5], F
+       vmovdqa32 [rsp + 64*6], G
+       vmovdqa32 [rsp + 64*7], H
+
+       vmovdqa32       TMP3, [TBL]     ; First K
+       ;transform to big-endian data and store on aligned_frame
+       vmovdqa32       TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+       ;using extra 16 ZMM registers instead of heap
+%assign I 0
+%rep 8
+%assign J (I+1)
+       VMOVPS  APPEND(W,I),[mh_in_p + I*64+0*64]
+       VMOVPS  APPEND(W,J),[mh_in_p + I*64+1*64]
+
+       vpshufb APPEND(W,I), APPEND(W,I), TMP2
+       vpshufb APPEND(W,J), APPEND(W,J), TMP2
+%assign I (I+2)
+%endrep
+
+       ; MSG Schedule for W0-W15 is now complete in registers
+       ; Process first 48 rounds
+       ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+       ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 64
+       PROCESS_LOOP  APPEND(W,J),  I
+       %if I < 48
+       MSG_SCHED_ROUND_16_63  APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+       %endif
+       %if I % 8 = 4
+               PREFETCH_X [mh_in_p + 1024+128*(I / 8)]
+       %endif
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+       ;; add old digest
+       vpaddd  A, A, [rsp + 0*64]
+       vpaddd  B, B, [rsp + 1*64]
+       vpaddd  C, C, [rsp + 2*64]
+       vpaddd  D, D, [rsp + 3*64]
+       vpaddd  E, E, [rsp + 4*64]
+       vpaddd  F, F, [rsp + 5*64]
+       vpaddd  G, G, [rsp + 6*64]
+       vpaddd  H, H, [rsp + 7*64]
+
+       add     mh_in_p,   1024
+       sub     loops, 1
+       jne     .block_loop
+
+       ; copy segs_digests back to mh_digests_p
+
+       VMOVPS  [mh_digests_p + 64*0], A
+       VMOVPS  [mh_digests_p + 64*1], B
+       VMOVPS  [mh_digests_p + 64*2], C
+       VMOVPS  [mh_digests_p + 64*3], D
+       VMOVPS  [mh_digests_p + 64*4], E
+       VMOVPS  [mh_digests_p + 64*5], F
+       VMOVPS  [mh_digests_p + 64*6], G
+       VMOVPS  [mh_digests_p + 64*7], H
+
+       mov     rsp, RSP_SAVE                   ; restore rsp
+
+.return:
+       FUNC_RESTORE
+       ret
+
+endproc_frame
+
+section .data
+align 64
+TABLE:
+       dq      0x428a2f98428a2f98, 0x428a2f98428a2f98
+       dq      0x428a2f98428a2f98, 0x428a2f98428a2f98
+       dq      0x428a2f98428a2f98, 0x428a2f98428a2f98
+       dq      0x428a2f98428a2f98, 0x428a2f98428a2f98
+       dq      0x7137449171374491, 0x7137449171374491
+       dq      0x7137449171374491, 0x7137449171374491
+       dq      0x7137449171374491, 0x7137449171374491
+       dq      0x7137449171374491, 0x7137449171374491
+       dq      0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+       dq      0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+       dq      0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+       dq      0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+       dq      0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+       dq      0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+       dq      0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+       dq      0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+       dq      0x3956c25b3956c25b, 0x3956c25b3956c25b
+       dq      0x3956c25b3956c25b, 0x3956c25b3956c25b
+       dq      0x3956c25b3956c25b, 0x3956c25b3956c25b
+       dq      0x3956c25b3956c25b, 0x3956c25b3956c25b
+       dq      0x59f111f159f111f1, 0x59f111f159f111f1
+       dq      0x59f111f159f111f1, 0x59f111f159f111f1
+       dq      0x59f111f159f111f1, 0x59f111f159f111f1
+       dq      0x59f111f159f111f1, 0x59f111f159f111f1
+       dq      0x923f82a4923f82a4, 0x923f82a4923f82a4
+       dq      0x923f82a4923f82a4, 0x923f82a4923f82a4
+       dq      0x923f82a4923f82a4, 0x923f82a4923f82a4
+       dq      0x923f82a4923f82a4, 0x923f82a4923f82a4
+       dq      0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+       dq      0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+       dq      0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+       dq      0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+       dq      0xd807aa98d807aa98, 0xd807aa98d807aa98
+       dq      0xd807aa98d807aa98, 0xd807aa98d807aa98
+       dq      0xd807aa98d807aa98, 0xd807aa98d807aa98
+       dq      0xd807aa98d807aa98, 0xd807aa98d807aa98
+       dq      0x12835b0112835b01, 0x12835b0112835b01
+       dq      0x12835b0112835b01, 0x12835b0112835b01
+       dq      0x12835b0112835b01, 0x12835b0112835b01
+       dq      0x12835b0112835b01, 0x12835b0112835b01
+       dq      0x243185be243185be, 0x243185be243185be
+       dq      0x243185be243185be, 0x243185be243185be
+       dq      0x243185be243185be, 0x243185be243185be
+       dq      0x243185be243185be, 0x243185be243185be
+       dq      0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+       dq      0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+       dq      0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+       dq      0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+       dq      0x72be5d7472be5d74, 0x72be5d7472be5d74
+       dq      0x72be5d7472be5d74, 0x72be5d7472be5d74
+       dq      0x72be5d7472be5d74, 0x72be5d7472be5d74
+       dq      0x72be5d7472be5d74, 0x72be5d7472be5d74
+       dq      0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+       dq      0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+       dq      0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+       dq      0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+       dq      0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+       dq      0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+       dq      0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+       dq      0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+       dq      0xc19bf174c19bf174, 0xc19bf174c19bf174
+       dq      0xc19bf174c19bf174, 0xc19bf174c19bf174
+       dq      0xc19bf174c19bf174, 0xc19bf174c19bf174
+       dq      0xc19bf174c19bf174, 0xc19bf174c19bf174
+       dq      0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+       dq      0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+       dq      0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+       dq      0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+       dq      0xefbe4786efbe4786, 0xefbe4786efbe4786
+       dq      0xefbe4786efbe4786, 0xefbe4786efbe4786
+       dq      0xefbe4786efbe4786, 0xefbe4786efbe4786
+       dq      0xefbe4786efbe4786, 0xefbe4786efbe4786
+       dq      0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+       dq      0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+       dq      0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+       dq      0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+       dq      0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+       dq      0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+       dq      0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+       dq      0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+       dq      0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+       dq      0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+       dq      0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+       dq      0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+       dq      0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+       dq      0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+       dq      0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+       dq      0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+       dq      0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+       dq      0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+       dq      0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+       dq      0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+       dq      0x76f988da76f988da, 0x76f988da76f988da
+       dq      0x76f988da76f988da, 0x76f988da76f988da
+       dq      0x76f988da76f988da, 0x76f988da76f988da
+       dq      0x76f988da76f988da, 0x76f988da76f988da
+       dq      0x983e5152983e5152, 0x983e5152983e5152
+       dq      0x983e5152983e5152, 0x983e5152983e5152
+       dq      0x983e5152983e5152, 0x983e5152983e5152
+       dq      0x983e5152983e5152, 0x983e5152983e5152
+       dq      0xa831c66da831c66d, 0xa831c66da831c66d
+       dq      0xa831c66da831c66d, 0xa831c66da831c66d
+       dq      0xa831c66da831c66d, 0xa831c66da831c66d
+       dq      0xa831c66da831c66d, 0xa831c66da831c66d
+       dq      0xb00327c8b00327c8, 0xb00327c8b00327c8
+       dq      0xb00327c8b00327c8, 0xb00327c8b00327c8
+       dq      0xb00327c8b00327c8, 0xb00327c8b00327c8
+       dq      0xb00327c8b00327c8, 0xb00327c8b00327c8
+       dq      0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+       dq      0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+       dq      0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+       dq      0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+       dq      0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+       dq      0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+       dq      0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+       dq      0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+       dq      0xd5a79147d5a79147, 0xd5a79147d5a79147
+       dq      0xd5a79147d5a79147, 0xd5a79147d5a79147
+       dq      0xd5a79147d5a79147, 0xd5a79147d5a79147
+       dq      0xd5a79147d5a79147, 0xd5a79147d5a79147
+       dq      0x06ca635106ca6351, 0x06ca635106ca6351
+       dq      0x06ca635106ca6351, 0x06ca635106ca6351
+       dq      0x06ca635106ca6351, 0x06ca635106ca6351
+       dq      0x06ca635106ca6351, 0x06ca635106ca6351
+       dq      0x1429296714292967, 0x1429296714292967
+       dq      0x1429296714292967, 0x1429296714292967
+       dq      0x1429296714292967, 0x1429296714292967
+       dq      0x1429296714292967, 0x1429296714292967
+       dq      0x27b70a8527b70a85, 0x27b70a8527b70a85
+       dq      0x27b70a8527b70a85, 0x27b70a8527b70a85
+       dq      0x27b70a8527b70a85, 0x27b70a8527b70a85
+       dq      0x27b70a8527b70a85, 0x27b70a8527b70a85
+       dq      0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+       dq      0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+       dq      0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+       dq      0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+       dq      0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+       dq      0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+       dq      0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+       dq      0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+       dq      0x53380d1353380d13, 0x53380d1353380d13
+       dq      0x53380d1353380d13, 0x53380d1353380d13
+       dq      0x53380d1353380d13, 0x53380d1353380d13
+       dq      0x53380d1353380d13, 0x53380d1353380d13
+       dq      0x650a7354650a7354, 0x650a7354650a7354
+       dq      0x650a7354650a7354, 0x650a7354650a7354
+       dq      0x650a7354650a7354, 0x650a7354650a7354
+       dq      0x650a7354650a7354, 0x650a7354650a7354
+       dq      0x766a0abb766a0abb, 0x766a0abb766a0abb
+       dq      0x766a0abb766a0abb, 0x766a0abb766a0abb
+       dq      0x766a0abb766a0abb, 0x766a0abb766a0abb
+       dq      0x766a0abb766a0abb, 0x766a0abb766a0abb
+       dq      0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+       dq      0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+       dq      0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+       dq      0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+       dq      0x92722c8592722c85, 0x92722c8592722c85
+       dq      0x92722c8592722c85, 0x92722c8592722c85
+       dq      0x92722c8592722c85, 0x92722c8592722c85
+       dq      0x92722c8592722c85, 0x92722c8592722c85
+       dq      0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+       dq      0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+       dq      0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+       dq      0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+       dq      0xa81a664ba81a664b, 0xa81a664ba81a664b
+       dq      0xa81a664ba81a664b, 0xa81a664ba81a664b
+       dq      0xa81a664ba81a664b, 0xa81a664ba81a664b
+       dq      0xa81a664ba81a664b, 0xa81a664ba81a664b
+       dq      0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+       dq      0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+       dq      0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+       dq      0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+       dq      0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+       dq      0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+       dq      0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+       dq      0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+       dq      0xd192e819d192e819, 0xd192e819d192e819
+       dq      0xd192e819d192e819, 0xd192e819d192e819
+       dq      0xd192e819d192e819, 0xd192e819d192e819
+       dq      0xd192e819d192e819, 0xd192e819d192e819
+       dq      0xd6990624d6990624, 0xd6990624d6990624
+       dq      0xd6990624d6990624, 0xd6990624d6990624
+       dq      0xd6990624d6990624, 0xd6990624d6990624
+       dq      0xd6990624d6990624, 0xd6990624d6990624
+       dq      0xf40e3585f40e3585, 0xf40e3585f40e3585
+       dq      0xf40e3585f40e3585, 0xf40e3585f40e3585
+       dq      0xf40e3585f40e3585, 0xf40e3585f40e3585
+       dq      0xf40e3585f40e3585, 0xf40e3585f40e3585
+       dq      0x106aa070106aa070, 0x106aa070106aa070
+       dq      0x106aa070106aa070, 0x106aa070106aa070
+       dq      0x106aa070106aa070, 0x106aa070106aa070
+       dq      0x106aa070106aa070, 0x106aa070106aa070
+       dq      0x19a4c11619a4c116, 0x19a4c11619a4c116
+       dq      0x19a4c11619a4c116, 0x19a4c11619a4c116
+       dq      0x19a4c11619a4c116, 0x19a4c11619a4c116
+       dq      0x19a4c11619a4c116, 0x19a4c11619a4c116
+       dq      0x1e376c081e376c08, 0x1e376c081e376c08
+       dq      0x1e376c081e376c08, 0x1e376c081e376c08
+       dq      0x1e376c081e376c08, 0x1e376c081e376c08
+       dq      0x1e376c081e376c08, 0x1e376c081e376c08
+       dq      0x2748774c2748774c, 0x2748774c2748774c
+       dq      0x2748774c2748774c, 0x2748774c2748774c
+       dq      0x2748774c2748774c, 0x2748774c2748774c
+       dq      0x2748774c2748774c, 0x2748774c2748774c
+       dq      0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+       dq      0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+       dq      0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+       dq      0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+       dq      0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+       dq      0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+       dq      0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+       dq      0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+       dq      0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+       dq      0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+       dq      0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+       dq      0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+       dq      0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+       dq      0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+       dq      0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+       dq      0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+       dq      0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+       dq      0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+       dq      0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+       dq      0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+       dq      0x748f82ee748f82ee, 0x748f82ee748f82ee
+       dq      0x748f82ee748f82ee, 0x748f82ee748f82ee
+       dq      0x748f82ee748f82ee, 0x748f82ee748f82ee
+       dq      0x748f82ee748f82ee, 0x748f82ee748f82ee
+       dq      0x78a5636f78a5636f, 0x78a5636f78a5636f
+       dq      0x78a5636f78a5636f, 0x78a5636f78a5636f
+       dq      0x78a5636f78a5636f, 0x78a5636f78a5636f
+       dq      0x78a5636f78a5636f, 0x78a5636f78a5636f
+       dq      0x84c8781484c87814, 0x84c8781484c87814
+       dq      0x84c8781484c87814, 0x84c8781484c87814
+       dq      0x84c8781484c87814, 0x84c8781484c87814
+       dq      0x84c8781484c87814, 0x84c8781484c87814
+       dq      0x8cc702088cc70208, 0x8cc702088cc70208
+       dq      0x8cc702088cc70208, 0x8cc702088cc70208
+       dq      0x8cc702088cc70208, 0x8cc702088cc70208
+       dq      0x8cc702088cc70208, 0x8cc702088cc70208
+       dq      0x90befffa90befffa, 0x90befffa90befffa
+       dq      0x90befffa90befffa, 0x90befffa90befffa
+       dq      0x90befffa90befffa, 0x90befffa90befffa
+       dq      0x90befffa90befffa, 0x90befffa90befffa
+       dq      0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+       dq      0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+       dq      0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+       dq      0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+       dq      0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+       dq      0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+       dq      0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+       dq      0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+       dq      0xc67178f2c67178f2, 0xc67178f2c67178f2
+       dq      0xc67178f2c67178f2, 0xc67178f2c67178f2
+       dq      0xc67178f2c67178f2, 0xc67178f2c67178f2
+       dq      0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+                        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+                        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+                        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha256_block_avx512
+no_mh_sha256_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
+