;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;       Function API:
;       UINT16 crc16_t10dif_02(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
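;
;       Example caller (illustrative sketch only; the C prototype below simply
;       mirrors the API comment above, and the data/data_len names and zero
;       seed are assumptions, not requirements):
;
;               #include <stdint.h>
;
;               uint16_t crc16_t10dif_02(uint16_t init_crc,
;                                        const unsigned char *buf,
;                                        uint64_t len);
;
;               uint16_t crc = crc16_t10dif_02(0, data, data_len);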

%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global crc16_t10dif_02, function
crc16_t10dif_02:
        endbranch

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it right 16 bits, to scale back to 16 bits.
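        ; For example (illustrative value): init_crc = 0x1234 is scaled to
        ; 0x12340000 here, and _cleanup later performs "shr eax, 16" to scale
        ; the 32-bit result back down to the 16-bit CRC.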

        sub     rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack so they can be restored on return
        ; (xmm6-xmm15 are callee-saved in the Windows x64 calling convention)
        vmovdqa [rsp+16*2], xmm6
        vmovdqa [rsp+16*3], xmm7
        vmovdqa [rsp+16*4], xmm8
        vmovdqa [rsp+16*5], xmm9
        vmovdqa [rsp+16*6], xmm10
        vmovdqa [rsp+16*7], xmm11
        vmovdqa [rsp+16*8], xmm12
        vmovdqa [rsp+16*9], xmm13
%endif

        ; check if smaller than 256
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align with the initial crc in the correct place.
        vpslldq xmm10, 12

        vmovdqa xmm11, [SHUF_MASK]
        ; receive the initial 128B data, xor the initial crc value
        vmovdqu xmm0, [arg2+16*0]
        vmovdqu xmm1, [arg2+16*1]
        vmovdqu xmm2, [arg2+16*2]
        vmovdqu xmm3, [arg2+16*3]
        vmovdqu xmm4, [arg2+16*4]
        vmovdqu xmm5, [arg2+16*5]
        vmovdqu xmm6, [arg2+16*6]
        vmovdqu xmm7, [arg2+16*7]

        vpshufb xmm0, xmm11
        ; XOR the initial_crc value
        vpxor   xmm0, xmm10
        vpshufb xmm1, xmm11
        vpshufb xmm2, xmm11
        vpshufb xmm3, xmm11
        vpshufb xmm4, xmm11
        vpshufb xmm5, xmm11
        vpshufb xmm6, xmm11
        vpshufb xmm7, xmm11

        vmovdqa xmm10, [rk3]    ; xmm10 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; determines which constant is used
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point in the code there are 128*x+y (0 <= y < 128) bytes of buffer left.
        ; The _fold_128_B_loop will fold 128B at a time until only 128+y bytes of buffer remain


        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
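        ; Each xmm register holds 128 bits of partial remainder. Folding it over
        ; the next 128 bytes of input uses the standard pclmulqdq folding identity
        ; (see the referenced Intel paper); in the loop below, for each register:
        ;       acc' = (lo64(acc) clmul rk3) xor (hi64(acc) clmul rk4) xor new_data
        ; where rk3/rk4 sit in the low/high qwords of xmm10 and are selected by the
        ; pclmulqdq immediate (0x00 = low qwords, 0x11 = high qwords).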
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128               ; buf += 128;

        prefetchnta [arg2+fetch_dist+0]
        vmovdqu xmm9, [arg2+16*0]
        vmovdqu xmm12, [arg2+16*1]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm0
        vmovdqa xmm13, xmm1
        vpclmulqdq      xmm0, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm1, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm0, xmm9
        vxorps  xmm0, xmm8
        vpxor   xmm1, xmm12
        vxorps  xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        vmovdqu xmm9, [arg2+16*2]
        vmovdqu xmm12, [arg2+16*3]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm2
        vmovdqa xmm13, xmm3
        vpclmulqdq      xmm2, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm3, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm2, xmm9
        vxorps  xmm2, xmm8
        vpxor   xmm3, xmm12
        vxorps  xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        vmovdqu xmm9, [arg2+16*4]
        vmovdqu xmm12, [arg2+16*5]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm4
        vmovdqa xmm13, xmm5
        vpclmulqdq      xmm4, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm5, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm4, xmm9
        vxorps  xmm4, xmm8
        vpxor   xmm5, xmm12
        vxorps  xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        vmovdqu xmm9, [arg2+16*6]
        vmovdqu xmm12, [arg2+16*7]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm6
        vmovdqa xmm13, xmm7
        vpclmulqdq      xmm6, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm7, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm6, xmm9
        vxorps  xmm6, xmm8
        vpxor   xmm7, xmm12
        vxorps  xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; fold the 8 xmm registers to 1 xmm register with different constants
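        ; xmm0..xmm6 each lie a different multiple of 16 bytes ahead of xmm7,
        ; so each one is folded into xmm7 with its own pair of constants:
        ; rk9/rk10 for xmm0, rk11/rk12 for xmm1, ..., rk19/rk20 for xmm5, and
        ; finally rk1/rk2 (the 16-byte fold constants) for xmm6.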

        vmovdqa xmm10, [rk9]
        vmovdqa xmm8, xmm0
        vpclmulqdq      xmm0, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm0

        vmovdqa xmm10, [rk11]
        vmovdqa xmm8, xmm1
        vpclmulqdq      xmm1, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm1

        vmovdqa xmm10, [rk13]
        vmovdqa xmm8, xmm2
        vpclmulqdq      xmm2, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

        vmovdqa xmm10, [rk15]
        vmovdqa xmm8, xmm3
        vpclmulqdq      xmm3, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm3

        vmovdqa xmm10, [rk17]
        vmovdqa xmm8, xmm4
        vpclmulqdq      xmm4, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm4

        vmovdqa xmm10, [rk19]
        vmovdqa xmm8, xmm5
        vpclmulqdq      xmm5, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm5

        vmovdqa xmm10, [rk1]    ; xmm10 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction
                                ; determines which constant is used
        vmovdqa xmm8, xmm6
        vpclmulqdq      xmm6, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm6


        ; instead of 128, we add 112 (128-16) to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time
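        ; (same fold identity as the 128B loop, but over a 16-byte distance:
        ;  xmm10 still holds rk1/rk2, so each iteration computes
        ;  xmm7 = (lo64(xmm7) clmul rk1) xor (hi64(xmm7) clmul rk2) xor next_16_bytes)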

_16B_reduction_loop:
        vmovdqa xmm8, xmm7
        vpclmulqdq      xmm7, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm11
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        ; check if there is any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we are handling data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can move the input pointer back so that exactly 16 bytes are loaded.
        ; after that, the registers need to be adjusted.
_get_last_two_xmms:
        vmovdqa xmm2, xmm7

        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm11

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        vpshufb xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb       xmm1, xmm1, xmm2, xmm0

        ; fold 16 bytes
        vmovdqa xmm2, xmm1
        vmovdqa xmm8, xmm7
        vpclmulqdq      xmm7, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]    ; rk5 and rk6 in xmm10
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq      xmm7, xmm10, 0x1
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ; 32b fold
        vmovdqa xmm0, xmm7

        vpand   xmm0, [mask2]

        vpsrldq xmm7, 12
        vpclmulqdq      xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

        ; barrett reduction
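        ; the quotient of the remaining value by the polynomial is estimated
        ; with a carry-less multiply by rk7 = floor(2^64/Q); that quotient is
        ; then multiplied by rk8 = Q and xored back in, leaving the 32-bit
        ; (scaled) CRC, which is extracted from xmm7 below
        ; (see the referenced Intel paper for the derivation)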
_barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq      xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq      xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp+16*2]
        vmovdqa xmm7, [rsp+16*3]
        vmovdqa xmm8, [rsp+16*4]
        vmovdqa xmm9, [rsp+16*5]
        vmovdqa xmm10, [rsp+16*6]
        vmovdqa xmm11, [rsp+16*7]
        vmovdqa xmm12, [rsp+16*8]
        vmovdqa xmm13, [rsp+16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        vmovdqa xmm11, [SHUF_MASK]

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm11             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; move the initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        vmovdqa xmm11, [SHUF_MASK]

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm11             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data shorter than 16 bytes; zero out the 16B of memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
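; (all of the mod, floor-division and multiplication above are carry-less,
;  i.e. polynomial arithmetic over GF(2), matching what pclmulqdq computes,
;  not ordinary integer arithmetic)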
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x9d9d000000000000
rk4:
DQ 0x7cf5000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000

rk9:
DQ 0xceae000000000000
rk10:
DQ 0xbfd6000000000000
rk11:
DQ 0x1e16000000000000
rk12:
DQ 0x713c000000000000
rk13:
DQ 0xf7f9000000000000
rk14:
DQ 0x80a6000000000000
rk15:
DQ 0x044c000000000000
rk16:
DQ 0xe658000000000000
rk17:
DQ 0xad18000000000000
rk18:
DQ 0xa497000000000000
rk19:
DQ 0x6ee3000000000000
rk20:
DQ 0xe7b5000000000000

mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
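; mask1: every byte 0x80; xoring a pshufb_shf_table entry with it flips between
;        the complementary left-shift and right-shift pshufb control masks
; mask2: keeps the low 96 bits of a register and clears the top 32 bits
;        (used in the 32-bit fold in _128_done)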

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
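; SHUF_MASK reverses the byte order of a 16-byte register; the T10-DIF CRC is
; not bit-reflected, so input bytes are processed most-significant-byte first.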

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
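; (a control byte with its top bit set, 0x80-0x8f above, makes pshufb write a
;  zero into that lane, which is how these masks implement the byte shifts)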
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908