;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sm3_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;; code to compute 8-lane (oct) SM3 using AVX2
;; the outer calling routine takes care of saving and restoring the XMM registers
;; Logic designed/laid out by JDG

;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax; ymm0-15
;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
;; Windows preserves: rcx rbp r8
;;
;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
;; Linux preserves: rdi rbp r8
;;
;; clobbers ymm0-15

%ifidn __OUTPUT_FORMAT__, elf64
; Linux definitions
%define arg1 rdi
%define arg2 rsi
%define reg3 rcx
%define reg4 rdx
%else
; Windows definitions
%define arg1 rcx
%define arg2 rdx
%define reg3 rsi
%define reg4 rdi
%endif

; Common definitions
%define STATE arg1
%define INP_SIZE arg2
%define SIZE INP_SIZE ; rsi

%define IDX rax
%define TBL reg3

%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 r15
%define inp7 reg4

%define APPEND(a,b) a %+ b

%define WB0 ymm0
%define WB1 ymm1
%define WB2 ymm2
%define WB3 ymm3
%define WB4 ymm4
%define WB5 ymm5
%define WB6 ymm6
%define WB7 ymm7
%define WB8 ymm8
%define WB9 ymm9
%define WB10 ymm10
%define WB11 ymm11
%define WB12 ymm12
%define WB13 ymm13
%define WB14 ymm14
%define WB15 ymm15

%define WBTMP0 ymm8
%define WBTMP1 ymm9

%define WBTMP2 ymm0
%define WBTMP3 ymm1

%define A ymm0
%define B ymm1
%define C ymm2
%define D ymm3
%define E ymm4
%define F ymm5
%define G ymm6
%define H ymm7

%define TMP0 ymm8
%define TMP1 ymm9
%define TMP2 ymm10

; W'(j) = WB(j) xor WB(j+4)
; Keep WB(j)..WB(j+4) in registers to reduce memory reads
%define Wj0 ymm11
%define Wj1 ymm12
%define Wj2 ymm13
%define Wj3 ymm14
%define Wj4 ymm15
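
; For reference, an illustrative mapping to the notation of the SM3 spec
; (GB/T 32905-2016): WB(j) corresponds to the expanded message word W[j],
; and the value folded into TT1 each round is
;     W'[j] = W[j] ^ W[j+4]
; so Wj0..Wj4 hold W[j]..W[j+4] for the round currently being processed.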


%define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register
%define PTR_SZ 8
%define SM3_DIGEST_WORD_SIZE 4
%define MAX_SM3_LANES 8
%define NUM_SM3_DIGEST_WORDS 8
%define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE)

; Define stack usage

;; rsp is saved below, the frame is allocated, and rsp is then explicitly
;; re-aligned to 32 bytes, so the caller's stack alignment does not matter
struc stack_frame
.data resb 16*SZ8
.digest resb 8*SZ8
.wbtmp resb 69*SZ8
.rsp resb 8
endstruc
%define FRAMESZ stack_frame_size
%define _DIGEST stack_frame.digest
%define _WBTMP stack_frame.wbtmp
%define _RSP_SAVE stack_frame.rsp

%define YTMP0 rsp + _WBTMP + 0*SZ8
%define YTMP1 rsp + _WBTMP + 1*SZ8
%define YTMP2 rsp + _WBTMP + 2*SZ8
%define YTMP3 rsp + _WBTMP + 3*SZ8
%define YTMP4 rsp + _WBTMP + 4*SZ8

%define YTMPI rsp + _WBTMP + I*SZ8
%define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8
%define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8
%define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8
%define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8


%define VMOVPS vmovups

;;;;;;;;
; same as sha256
;;;;;;;;
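; The transpose below is used as an 8x8 dword matrix transpose: on input,
; r0..r7 each hold 8 consecutive dwords loaded from one lane; on output,
; r0..r7 each hold the same dword index gathered from all 8 lanes, so a
; single ymm register then carries one message word across every lane.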
%macro TRANSPOSE8 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; process top half (r0..r3) {a...d}
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

; use r2 in place of t0
; process bottom half (r4..r7) {e...h}
vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}

vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
%endmacro

%macro ROTATE_W 0

%xdefine TMP_ Wj0
%xdefine Wj0 Wj1
%xdefine Wj1 Wj2
%xdefine Wj2 Wj3
%xdefine Wj3 Wj4

%xdefine Wj4 TMP_

%endmacro

; ROTATE A,B,C,D
%macro ROTATE_ARGS_AD 0

%xdefine TMP_ D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP2
%xdefine TMP2 TMP_

%endmacro

%macro ROTATE_ARGS_EH 0

%xdefine TMP_ H
%xdefine H G
%xdefine G F
%xdefine F E
%xdefine E TMP0
%xdefine TMP0 TMP_

%endmacro

%macro ROLD 3

%define %%reg %1
%define %%imm %2
%define %%tmp %3
vpslld %%tmp, %%reg, %%imm
vpsrld %%reg, %%reg, (32-(%%imm))
vpor %%reg, %%reg, %%tmp

%endmacro

%macro ROLD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
vpslld %%tmp, %%src, %%imm
vpsrld %%reg, %%src, (32-(%%imm))
vpor %%reg, %%reg, %%tmp
%endmacro
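
; ROLD rotates a 32-bit lane left in place and ROLD_nd is the non-destructive
; variant (reads %%src, writes %%reg); both expand to the usual identity
;     rol32(x, n) = (x << n) | (x >> (32 - n))
; e.g. "ROLD_nd TMP0,12,TMP1,A" computes TMP0 = rol32(A, 12) with A unchanged.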

;; void sm3_mb_x8_avx2(SM3_ARGS *args, uint64_t num_blocks);
;; arg 1 : STATE : pointer to the args structure (digest rows and input data pointers)
;; arg 2 : INP_SIZE : size of the input in blocks
mk_global sm3_mb_x8_avx2,function,internal
align 16
sm3_mb_x8_avx2:
endbranch
; general registers preserved in outer calling routine
; outer calling routine saves all the YMM registers

; save rsp, allocate 32-byte aligned for local variables
mov IDX, rsp
sub rsp, FRAMESZ
and rsp, ~31
mov [rsp + _RSP_SAVE], IDX

lea TBL,[TABLE]

;; load the address of each of the 8 message lanes
;; getting ready to transpose input onto stack
mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]

xor IDX, IDX

%assign cur_loop 0
lloop:

;
; Pre-calculate all of WB 0..68 and W 0..64 up front
; This is better than computing WB/W round by round
;
; (note: the SHA256 AVX2 code computes WB/W inside each round)
;
; Pre-calculation memory I/O count:
; reads  : 68 + 3 * 52 (reading WB)
; writes : 52 (writing WB17..68)
; Round-by-round memory I/O count:
; reads  : 48 * 6 (6 WB values are read each round)
; writes : 52 + 64 (same as above)
;
VMOVPS WB0,[inp0+IDX]
VMOVPS WB1,[inp1+IDX]
VMOVPS WB2,[inp2+IDX]
VMOVPS WB3,[inp3+IDX]
VMOVPS WB4,[inp4+IDX]
VMOVPS WB5,[inp5+IDX]
VMOVPS WB6,[inp6+IDX]
VMOVPS WB7,[inp7+IDX]

TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1
vmovdqa WBTMP0, [SHUF_MASK]
vpshufb WB0,WBTMP0
vpshufb WB1,WBTMP0
vpshufb WB2,WBTMP0
vpshufb WB3,WBTMP0
vpshufb WB4,WBTMP0
vpshufb WB5,WBTMP0
vpshufb WB6,WBTMP0
vpshufb WB7,WBTMP0

vmovdqa [YTMP0], WB0
vmovdqa [YTMP1], WB1

VMOVPS WB8,[inp0+IDX + 32]
VMOVPS WB9,[inp1+IDX + 32]
VMOVPS WB10,[inp2+IDX + 32]
VMOVPS WB11,[inp3+IDX + 32]
VMOVPS WB12,[inp4+IDX + 32]
VMOVPS WB13,[inp5+IDX + 32]
VMOVPS WB14,[inp6+IDX + 32]
VMOVPS WB15,[inp7+IDX + 32]

TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3
vmovdqa WBTMP2, [SHUF_MASK]
vpshufb WB8,WBTMP2
vpshufb WB9,WBTMP2
vpshufb WB10,WBTMP2
vpshufb WB11,WBTMP2
vpshufb WB12,WBTMP2
vpshufb WB13,WBTMP2
vpshufb WB14,WBTMP2
vpshufb WB15,WBTMP2

; WB0 WB1 already saved
%assign I 2
%rep 14
vmovdqa [YTMPI], APPEND(WB,I)
%assign I (I+1)
%endrep

vmovdqa WB0, [YTMP0]
vmovdqa WB1, [YTMP1]

; Calculate WB 16...67
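; Each %rep iteration below is the 8-lane vector form of the scalar SM3
; message expansion (shown here as an illustrative sketch, with rol32 being
; a 32-bit rotate-left):
;     P1(x) = x ^ rol32(x, 15) ^ rol32(x, 23)
;     WB[j] = P1(WB[j-16] ^ WB[j-9] ^ rol32(WB[j-3], 15))
;             ^ rol32(WB[j-13], 7) ^ WB[j-6]
; The indices are taken mod 16 into the rotating WB register set, and each
; new WB[j] is spilled to its YTMPI stack slot for the rounds further down.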
%rep 52
%assign J (I % 16)
%assign J_1 ((I-1) % 16) ;tmp to use
%assign J_2 ((I-2) % 16) ;tmp to use
%assign J_3 ((I-3) % 16)
%assign J_4 ((I-4) % 16) ;tmp to use
%assign J_9 ((I-9) % 16)
%assign J_13 ((I-13) % 16)
%assign J_6 ((I-6) % 16)

ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3)
vpxor APPEND(WB,J),APPEND(WB,J_2)
vpxor APPEND(WB,J),APPEND(WB,J_9)

ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J)
ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J)
vpxor APPEND(WB,J),APPEND(WB,J_2)
vpxor APPEND(WB,J),APPEND(WB,J_1)

ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13)
vpxor APPEND(WB,J),APPEND(WB,J_2)
vpxor APPEND(WB,J),APPEND(WB,J_6)

vmovdqa [YTMPI], APPEND(WB,J)

vmovdqa APPEND(WB,J_1), [YTMPI_1]
vmovdqa APPEND(WB,J_2), [YTMPI_2]
vmovdqa APPEND(WB,J_4), [YTMPI_4]

%assign I (I+1)
%endrep

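; advance the data offset by 4*4*4 = 64 bytes: one SM3 block consumed per lane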
add IDX, 4*4*4

; Reload A-H for every block, because the WB pre-calculation above
; reuses those ymm registers
vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE]
vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE]
vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE]
vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE]
vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE]
vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE]
vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE]
vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE]

vmovdqa Wj0, [YTMP0]
vmovdqa Wj1, [YTMP1]
vmovdqa Wj2, [YTMP2]
vmovdqa Wj3, [YTMP3]
vmovdqa Wj4, [YTMP4]


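; Rounds 0..15 below follow the scalar SM3 round function (illustrative
; sketch; FF and GG are plain xor in these rounds):
;     SS1 = rol32(rol32(A,12) + E + rol32(T_j, j), 7)
;     SS2 = SS1 ^ rol32(A,12)
;     TT1 = (A ^ B ^ C) + D + SS2 + (W[j] ^ W[j+4])
;     TT2 = (E ^ F ^ G) + H + SS1 + W[j]
;     D = C; C = rol32(B,9); B = A; A = TT1
;     H = G; G = rol32(F,19); F = E
;     E = P0(TT2) = TT2 ^ rol32(TT2,9) ^ rol32(TT2,17)
; The rol32(T_j, j) constants come pre-broadcast from TABLE via TBL.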
%assign I 0
%rep 16

; SS1 - TMP1
ROLD_nd TMP0,12,TMP1,A
vmovdqa TMP1, [TBL + (I*32)]
vpaddd TMP1,E
vpaddd TMP1,TMP0
ROLD TMP1,7,TMP2

; SS2 - TMP2
vpxor TMP2,TMP1,TMP0

; TT1
vpxor TMP0,A,B
vpxor TMP0,C
vpaddd TMP2,TMP0
vpaddd TMP2,D
vpxor TMP0,Wj0,Wj4
vpaddd TMP2,TMP0

ROLD B,9,TMP0

; Rotate a,b,c,d first
; after P0(TT2), Wj0 will be released
ROTATE_ARGS_AD

; P0(TT2)
vpxor TMP0,E,F
vpxor TMP0,G
vpaddd TMP0,H
vpaddd TMP0,TMP1
vpaddd TMP0,Wj0

ROLD_nd TMP1,9,TMP2,TMP0
ROLD_nd Wj0,17,TMP2,TMP0

vpxor TMP0,TMP1
vpxor TMP0,Wj0

ROLD F,19,TMP2

ROTATE_ARGS_EH

ROTATE_W

vmovdqa Wj4, [YTMPI5]
%assign I (I+1)
%endrep

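; Rounds 16..63 use the same structure, but with the boolean functions
;     FF(A,B,C) = (A & B) | (A & C) | (B & C), computed as ((A | B) & C) | (A & B)
;     GG(E,F,G) = (E & F) | (~E & G)
; in place of the plain xor forms of rounds 0..15 (illustrative sketch).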
%rep 48
; SS1 - TMP1
ROLD_nd TMP0,12,TMP1,A
vmovdqa TMP1, [TBL + (I*32)]
vpaddd TMP1,E
vpaddd TMP1,TMP0
ROLD TMP1,7,TMP2

; SS2 - TMP2
vpxor TMP2,TMP1,TMP0

; Add D into SS2 first so the register holding D is freed;
; FF/GG for rounds 16..63 differ from those for rounds 0..15 and need an
; extra scratch register, so D must be released before TT1 is computed
vpaddd TMP2,D

; TT1
vpor TMP0,A,B
vpand TMP0,C
vpand D,A,B
vpor TMP0,D

vpaddd TMP2,TMP0
vpxor TMP0,Wj0,Wj4
vpaddd TMP2,TMP0

ROLD B,9,TMP0

ROTATE_ARGS_AD

; P0(TT2)
vpaddd TMP1,H
vpaddd TMP1,Wj0

vpand TMP0,E,F
vpandn Wj0,E,G
vpor TMP0,Wj0

vpaddd TMP0,TMP1

ROLD_nd TMP1,9,TMP2,TMP0
ROLD_nd Wj0,17,TMP2,TMP0

vpxor TMP0,TMP1
vpxor TMP0,Wj0

ROLD F,19,TMP2

ROTATE_ARGS_EH

ROTATE_W
vmovdqa Wj4, [YTMPI5]
%assign I (I+1)
%endrep

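; SM3 chains blocks by xor-ing the new working variables into the previous
; digest (V_{i+1} = ABCDEFGH ^ V_i), unlike SHA-2, which adds them.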
vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE]
vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE]
vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE]
vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE]
vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE]
vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE]
vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE]
vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE]

; Write back to memory (state object) the transposed digest
vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A
vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B
vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C
vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D
vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E
vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F
vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G
vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H

sub SIZE, 1
je last_loop
jmp lloop

last_loop:


; update input pointers
add inp0, IDX
mov [STATE + _args_data_ptr + 0*8], inp0
add inp1, IDX
mov [STATE + _args_data_ptr + 1*8], inp1
add inp2, IDX
mov [STATE + _args_data_ptr + 2*8], inp2
add inp3, IDX
mov [STATE + _args_data_ptr + 3*8], inp3
add inp4, IDX
mov [STATE + _args_data_ptr + 4*8], inp4
add inp5, IDX
mov [STATE + _args_data_ptr + 5*8], inp5
add inp6, IDX
mov [STATE + _args_data_ptr + 6*8], inp6
add inp7, IDX
mov [STATE + _args_data_ptr + 7*8], inp7

;;;;;;;;;;;;;;;;
;; Postamble
mov rsp, [rsp + _RSP_SAVE]
ret


PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b

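; Each 32-byte TABLE entry broadcasts the SM3 round constant rol32(T_j, j mod 32)
; to all 8 dword lanes (indexed as TBL + j*32), with T_j = 0x79cc4519 for
; rounds 0..15 and T_j = 0x7a879d8a for rounds 16..63.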
align 64
global TABLE
TABLE:
dq 0x79cc451979cc4519,0x79cc451979cc4519
dq 0x79cc451979cc4519,0x79cc451979cc4519
dq 0xf3988a32f3988a32,0xf3988a32f3988a32
dq 0xf3988a32f3988a32,0xf3988a32f3988a32
dq 0xe7311465e7311465,0xe7311465e7311465
dq 0xe7311465e7311465,0xe7311465e7311465
dq 0xce6228cbce6228cb,0xce6228cbce6228cb
dq 0xce6228cbce6228cb,0xce6228cbce6228cb
dq 0x9cc451979cc45197,0x9cc451979cc45197
dq 0x9cc451979cc45197,0x9cc451979cc45197
dq 0x3988a32f3988a32f,0x3988a32f3988a32f
dq 0x3988a32f3988a32f,0x3988a32f3988a32f
dq 0x7311465e7311465e,0x7311465e7311465e
dq 0x7311465e7311465e,0x7311465e7311465e
dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
dq 0xcc451979cc451979,0xcc451979cc451979
dq 0xcc451979cc451979,0xcc451979cc451979
dq 0x988a32f3988a32f3,0x988a32f3988a32f3
dq 0x988a32f3988a32f3,0x988a32f3988a32f3
dq 0x311465e7311465e7,0x311465e7311465e7
dq 0x311465e7311465e7,0x311465e7311465e7
dq 0x6228cbce6228cbce,0x6228cbce6228cbce
dq 0x6228cbce6228cbce,0x6228cbce6228cbce
dq 0xc451979cc451979c,0xc451979cc451979c
dq 0xc451979cc451979c,0xc451979cc451979c
dq 0x88a32f3988a32f39,0x88a32f3988a32f39
dq 0x88a32f3988a32f39,0x88a32f3988a32f39
dq 0x11465e7311465e73,0x11465e7311465e73
dq 0x11465e7311465e73,0x11465e7311465e73
dq 0x228cbce6228cbce6,0x228cbce6228cbce6
dq 0x228cbce6228cbce6,0x228cbce6228cbce6
dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
dq 0xec53d43cec53d43c,0xec53d43cec53d43c
dq 0xec53d43cec53d43c,0xec53d43cec53d43c
dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
dq 0x53d43cec53d43cec,0x53d43cec53d43cec
dq 0x53d43cec53d43cec,0x53d43cec53d43cec
dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
dq 0xd43cec53d43cec53,0xd43cec53d43cec53
dq 0xd43cec53d43cec53,0xd43cec53d43cec53
dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
dq 0x43cec53d43cec53d,0x43cec53d43cec53d
dq 0x43cec53d43cec53d,0x43cec53d43cec53d
dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
dq 0xcec53d43cec53d43,0xcec53d43cec53d43
dq 0xcec53d43cec53d43,0xcec53d43cec53d43
dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
dq 0xec53d43cec53d43c,0xec53d43cec53d43c
dq 0xec53d43cec53d43c,0xec53d43cec53d43c
dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
dq 0x53d43cec53d43cec,0x53d43cec53d43cec
dq 0x53d43cec53d43cec,0x53d43cec53d43cec
dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5

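; SHUF_MASK reverses the byte order within each dword lane (via vpshufb above),
; converting little-endian loads into the big-endian message words SM3 expects.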
SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
dq 0x0405060700010203,0x0c0d0e0f08090a0b