1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "sm3_mb_mgr_datastruct.asm"
31 %include "reg_sizes.asm"
37 ;; code to compute oct SM3 using SSE-256 / AVX2
38 ;; outer calling routine takes care of save and restore of XMM registers
39 ;; Logic designed/laid out by JDG
41 ;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax;ymm0-15
42 ;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
43 ;; Windows preserves: rcx rbp r8
45 ;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
46 ;; Linux preserves: rdi rbp r8
50 %ifidn __OUTPUT_FORMAT__, elf64
67 %define SIZE INP_SIZE ; rsi
81 %define APPEND(a,b) a %+ b
119 ; W(j) = WB(j) + WB(j+4)
120 ; Keep WB(j) - W(j+4) to reduce memory read
128 %define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register
; NOTE: SM3_DIGEST_WORD_SIZE is defined after SZ8; this is fine because
; NASM %define is expanded lazily, at the point of use.
130 %define SM3_DIGEST_WORD_SIZE 4 ; digest word size in bytes (32-bit words)
131 %define MAX_SM3_LANES 8 ; 8 independent message lanes per ymm register
132 %define NUM_SM3_DIGEST_WORDS 8 ; SM3 state words a..h
133 %define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE) ; 32 bytes: one digest word across all 8 lanes
137 ;; Assume stack aligned to 32 bytes before call
138 ;; Therefore FRAMESZ mod 32 must be 32-8 = 24
145 %define FRAMESZ stack_frame_size
146 %define _DIGEST stack_frame.digest ; digest area within the stack frame
147 %define _WBTMP stack_frame.wbtmp ; scratch area for the WB message schedule
148 %define _RSP_SAVE stack_frame.rsp ; caller's rsp, restored on exit
; Addresses of the fixed 32-byte WB slots in the stack scratch area.
150 %define YTMP0 rsp + _WBTMP + 0*SZ8
151 %define YTMP1 rsp + _WBTMP + 1*SZ8
152 %define YTMP2 rsp + _WBTMP + 2*SZ8
153 %define YTMP3 rsp + _WBTMP + 3*SZ8
154 %define YTMP4 rsp + _WBTMP + 4*SZ8
; Parameterized WB slots: I must be set with %assign before use; the
; offset forms (I-1, I-2, I-4, I+5) address neighbouring schedule words.
156 %define YTMPI rsp + _WBTMP + I*SZ8
157 %define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8
158 %define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8
159 %define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8
160 %define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8
163 %define VMOVPS vmovups ; unaligned load/store; lane data pointers carry no alignment guarantee
179 ; process top half (r0..r3) {a...d}
180 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
181 vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
182 vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
183 vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
184 vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
185 vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
186 vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
187 vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
189 ; use r2 in place of t0
190 ; process bottom half (r4..r7) {e...h}
191 vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
192 vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
193 vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
194 vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
195 vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
196 vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
197 vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
198 vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
200 vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
201 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
202 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
203 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
204 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
205 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
206 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
207 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
223 %macro ROTATE_ARGS_AD 0
234 %macro ROTATE_ARGS_EH 0
250 vpslld %%tmp, %%reg, %%imm
251 vpsrld %%reg, %%reg, (32-(%%imm))
252 vpor %%reg, %%reg, %%tmp
261 vpslld %%tmp, %%src, %%imm
262 vpsrld %%reg, %%src, (32-(%%imm))
263 vpor %%reg, %%reg, %%tmp
266 ;; void sm3_mb_x8_avx2(SM3_ARGS *args, uint64_t bytes);
267 ;; arg 1 : STATE : pointer to args structure (lane data pointers and digests)
268 ;; arg 2 : INP_SIZE : size of input in blocks
269 mk_global sm3_mb_x8_avx2,function,internal
273 ; general registers preserved in outer calling routine
274 ; outer calling routine saves all the YMM registers
276 ; save rsp, allocate 32-byte aligned for local variables
280 mov [rsp + _RSP_SAVE], IDX
284 ;; load the address of each of the 8 message lanes
285 ;; getting ready to transpose input onto stack
286 mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
287 mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
288 mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
289 mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
290 mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
291 mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
292 mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
293 mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
301 ; Pre calculate the WB 0..68 and W 0..64
302 ; This is better than calculating WB/W in the round method
304 ; ps : SHA256(AVX2) calculates WB/W in the round method
306 ; Pre calculation memory io time:
307 ; read : 68 + 3 * 52(read WB)
308 ; write : 52(write WB17..68)
309 ; Round method calculation memory io time:
310 ; read : 48 * 6(read 6 number of WB each round)
311 ; write : 52 + 64(same as upper)
313 VMOVPS WB0,[inp0+IDX]
314 VMOVPS WB1,[inp1+IDX]
315 VMOVPS WB2,[inp2+IDX]
316 VMOVPS WB3,[inp3+IDX]
317 VMOVPS WB4,[inp4+IDX]
318 VMOVPS WB5,[inp5+IDX]
319 VMOVPS WB6,[inp6+IDX]
320 VMOVPS WB7,[inp7+IDX]
322 TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1
323 vmovdqa WBTMP0, [SHUF_MASK]
336 VMOVPS WB8,[inp0+IDX + 32]
337 VMOVPS WB9,[inp1+IDX + 32]
338 VMOVPS WB10,[inp2+IDX + 32]
339 VMOVPS WB11,[inp3+IDX + 32]
340 VMOVPS WB12,[inp4+IDX + 32]
341 VMOVPS WB13,[inp5+IDX + 32]
342 VMOVPS WB14,[inp6+IDX + 32]
343 VMOVPS WB15,[inp7+IDX + 32]
345 TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3
346 vmovdqa WBTMP2, [SHUF_MASK]
356 ; WB0 WB1 already saved
359 vmovdqa [YTMPI], APPEND(WB,I)
363 vmovdqa WB0 , [YTMP0]
364 vmovdqa WB1 , [YTMP1]
366 ; Calculate WB 16...67
369 %assign J_1 ((I-1) % 16) ;tmp to use
370 %assign J_2 ((I-2) % 16) ;tmp to use
371 %assign J_3 ((I-3) % 16)
372 %assign J_4 ((I-4) % 16) ;tmp to use
373 %assign J_9 ((I-9) % 16)
374 %assign J_13 ((I-13) % 16)
375 %assign J_6 ((I-6) % 16)
377 ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3)
378 vpxor APPEND(WB,J),APPEND(WB,J_2)
379 vpxor APPEND(WB,J),APPEND(WB,J_9)
381 ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J)
382 ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J)
383 vpxor APPEND(WB,J),APPEND(WB,J_2)
384 vpxor APPEND(WB,J),APPEND(WB,J_1)
386 ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13)
387 vpxor APPEND(WB,J),APPEND(WB,J_2)
388 vpxor APPEND(WB,J),APPEND(WB,J_6)
390 vmovdqa [YTMPI], APPEND(WB,J)
392 vmovdqa APPEND(WB,J_1), [YTMPI_1]
393 vmovdqa APPEND(WB,J_2), [YTMPI_2]
394 vmovdqa APPEND(WB,J_4), [YTMPI_4]
401 ; Every round need load A-H
402 ; Because we pre calculate the WB
403 vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE]
404 vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE]
405 vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE]
406 vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE]
407 vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE]
408 vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE]
409 vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE]
410 vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE]
423 ROLD_nd TMP0,12,TMP1,A
424 vmovdqa TMP1, [TBL + (I*32)]
442 ; Rotate a,b,c,d first
443 ; after P0(TT2), Wj0 will be released
453 ROLD_nd TMP1,9,TMP2,TMP0
454 ROLD_nd Wj0,17,TMP2,TMP0
465 vmovdqa Wj4, [YTMPI5]
471 ROLD_nd TMP0,12,TMP1,A
472 vmovdqa TMP1, [TBL + (I*32)]
482 ; FF16/GG16 differ from FF64/GG64
483 ; So the register holding D should be released before calculating TT1
510 ROLD_nd TMP1,9,TMP2,TMP0
511 ROLD_nd Wj0,17,TMP2,TMP0
521 vmovdqa Wj4, [YTMPI5]
525 vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE]
526 vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE]
527 vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE]
528 vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE]
529 vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE]
530 vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE]
531 vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE]
532 vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE]
534 ; Write back to memory (state object) the transposed digest
535 vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A
536 vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B
537 vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C
538 vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D
539 vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E
540 vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F
541 vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G
542 vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H
551 ; update input pointers
553 mov [STATE + _args_data_ptr + 0*8], inp0
555 mov [STATE + _args_data_ptr + 1*8], inp1
557 mov [STATE + _args_data_ptr + 2*8], inp2
559 mov [STATE + _args_data_ptr + 3*8], inp3
561 mov [STATE + _args_data_ptr + 4*8], inp4
563 mov [STATE + _args_data_ptr + 5*8], inp5
565 mov [STATE + _args_data_ptr + 6*8], inp6
567 mov [STATE + _args_data_ptr + 7*8], inp7
571 mov rsp, [rsp + _RSP_SAVE]
; vpshufb mask that byte-reverses each 32-bit word (endianness swap),
; replicated in both 128-bit halves of a ymm register
575 PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
576 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
581 dq 0x79cc451979cc4519,0x79cc451979cc4519
582 dq 0x79cc451979cc4519,0x79cc451979cc4519
583 dq 0xf3988a32f3988a32,0xf3988a32f3988a32
584 dq 0xf3988a32f3988a32,0xf3988a32f3988a32
585 dq 0xe7311465e7311465,0xe7311465e7311465
586 dq 0xe7311465e7311465,0xe7311465e7311465
587 dq 0xce6228cbce6228cb,0xce6228cbce6228cb
588 dq 0xce6228cbce6228cb,0xce6228cbce6228cb
589 dq 0x9cc451979cc45197,0x9cc451979cc45197
590 dq 0x9cc451979cc45197,0x9cc451979cc45197
591 dq 0x3988a32f3988a32f,0x3988a32f3988a32f
592 dq 0x3988a32f3988a32f,0x3988a32f3988a32f
593 dq 0x7311465e7311465e,0x7311465e7311465e
594 dq 0x7311465e7311465e,0x7311465e7311465e
595 dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
596 dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
597 dq 0xcc451979cc451979,0xcc451979cc451979
598 dq 0xcc451979cc451979,0xcc451979cc451979
599 dq 0x988a32f3988a32f3,0x988a32f3988a32f3
600 dq 0x988a32f3988a32f3,0x988a32f3988a32f3
601 dq 0x311465e7311465e7,0x311465e7311465e7
602 dq 0x311465e7311465e7,0x311465e7311465e7
603 dq 0x6228cbce6228cbce,0x6228cbce6228cbce
604 dq 0x6228cbce6228cbce,0x6228cbce6228cbce
605 dq 0xc451979cc451979c,0xc451979cc451979c
606 dq 0xc451979cc451979c,0xc451979cc451979c
607 dq 0x88a32f3988a32f39,0x88a32f3988a32f39
608 dq 0x88a32f3988a32f39,0x88a32f3988a32f39
609 dq 0x11465e7311465e73,0x11465e7311465e73
610 dq 0x11465e7311465e73,0x11465e7311465e73
611 dq 0x228cbce6228cbce6,0x228cbce6228cbce6
612 dq 0x228cbce6228cbce6,0x228cbce6228cbce6
613 dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
614 dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
615 dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
616 dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
617 dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
618 dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
619 dq 0xec53d43cec53d43c,0xec53d43cec53d43c
620 dq 0xec53d43cec53d43c,0xec53d43cec53d43c
621 dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
622 dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
623 dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
624 dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
625 dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
626 dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
627 dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
628 dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
629 dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
630 dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
631 dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
632 dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
633 dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
634 dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
635 dq 0x53d43cec53d43cec,0x53d43cec53d43cec
636 dq 0x53d43cec53d43cec,0x53d43cec53d43cec
637 dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
638 dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
639 dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
640 dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
641 dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
642 dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
643 dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
644 dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
645 dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
646 dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
647 dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
648 dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
649 dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
650 dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
651 dq 0xd43cec53d43cec53,0xd43cec53d43cec53
652 dq 0xd43cec53d43cec53,0xd43cec53d43cec53
653 dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
654 dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
655 dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
656 dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
657 dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
658 dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
659 dq 0x43cec53d43cec53d,0x43cec53d43cec53d
660 dq 0x43cec53d43cec53d,0x43cec53d43cec53d
661 dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
662 dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
663 dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
664 dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
665 dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
666 dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
667 dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
668 dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
669 dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
670 dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
671 dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
672 dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
673 dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
674 dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
675 dq 0xcec53d43cec53d43,0xcec53d43cec53d43
676 dq 0xcec53d43cec53d43,0xcec53d43cec53d43
677 dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
678 dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
679 dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
680 dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
681 dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
682 dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
683 dq 0xec53d43cec53d43c,0xec53d43cec53d43c
684 dq 0xec53d43cec53d43c,0xec53d43cec53d43c
685 dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
686 dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
687 dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
688 dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
689 dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
690 dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
691 dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
692 dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
693 dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
694 dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
695 dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
696 dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
697 dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
698 dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
699 dq 0x53d43cec53d43cec,0x53d43cec53d43cec
700 dq 0x53d43cec53d43cec,0x53d43cec53d43cec
701 dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
702 dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
703 dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
704 dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
705 dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
706 dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
707 dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
708 dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
; vpshufb mask: byte-reverses each 32-bit word; used right after the
; TRANSPOSE8 of the loaded message blocks (big-endian message words ->
; native lane words); identical pattern in both 128-bit halves
710 SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
711 dq 0x0405060700010203,0x0c0d0e0f08090a0b