;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc16_t10dif_by16_10(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;

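; A minimal usage sketch (illustrative only, not assembled with this file):
; calling the routine from NASM on a SysV x86-64 target. The seed value,
; buffer symbol and length below are hypothetical placeholders.
;
;        xor     edi, edi                 ; init_crc (caller-chosen 16-bit seed)
;        lea     rsi, [rel my_buffer]     ; buf (hypothetical data symbol)
;        mov     rdx, MY_BUFFER_LEN       ; len in bytes (hypothetical constant)
;        call    crc16_t10dif_by16_10     ; 16-bit CRC is returned in eax/ax
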
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc16_t10dif_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text

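; Argument registers follow the platform calling convention: Windows x64
; passes the first three integer arguments in rcx, rdx, r8, while the
; SysV AMD64 ABI (Linux) uses rdi, rsi, rdx.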
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine arg1 rcx
        %xdefine arg2 rdx
        %xdefine arg3 r8

        %xdefine arg1_low32 ecx
%else
        %xdefine arg1 rdi
        %xdefine arg2 rsi
        %xdefine arg3 rdx

        %xdefine arg1_low32 edi
%endif

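; Stack frame layout: TMP is a scratch slot at [rsp]; on win64 the XMM_SAVE
; area preserves the non-volatile registers xmm6-xmm15, and the extra 8 bytes
; in VARIABLE_OFFSET keep rsp 16-byte aligned after the call's return address.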
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
        endbranch

        ; adjust the 16-bit initial_crc value: scale it up to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that eax is shifted right by 16 bits before returning, to scale the result back to 16 bits.

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the non-volatile xmm registers on the stack (win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; The crc value does not need to be byte-reflected, but it does need to be
        ; moved to the high part of the register, because the data will be
        ; byte-reflected and will then align with the initial crc at the correct place.
        vpslldq xmm10, 12

        ; load the initial 128 bytes of data and xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ;xmm10 has rk3 and rk4
                                        ;imm value of pclmulqdq instruction will determine which constant to use
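        ; For every fold step below: vpclmulqdq with imm8 0x00 multiplies the low
        ; 64 bits of each 128-bit lane by the low qword of the constant register,
        ; and imm8 0x11 multiplies the high 64 bits by the high qword. XORing the
        ; two products with the incoming data folds each lane forward by the
        ; distance encoded in the constant pair.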

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ;zmm16 has rk_1 and rk_2
        sub     arg3, 256

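        ; Main loop: fold 256 bytes per iteration. Each of the four zmm
        ; accumulators (zmm0, zmm4, zmm7, zmm8) holds 64 bytes; every lane is
        ; multiplied by the fold-by-256 constants in zmm16 (rk_1/rk_2) and
        ; xored with the next 64 bytes of byte-reflected input.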
.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC
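        ; imm8 0x96 is the truth table for A xor B xor C, so each vpternlogq above
        ; xors both carry-less products into the accumulator in a single instruction.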

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register



        ; At this point there are 128*x+y (0 <= y < 128) bytes of buffer left.
        ; The .fold_128_B_loop below folds 128 bytes at a time until only 128+y bytes of buffer remain.

        ; fold 128 bytes at a time; this section folds two zmm registers (eight 128-bit lanes) in parallel
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; At this point, the buffer pointer points at the last y bytes of the buffer, where 0 <= y < 128.
        ; The 128 bytes of folded data live in zmm0 and zmm4 (eight 128-bit lanes).

.fold_128_B_register:
        ; fold the eight 128-bit lanes down to a single xmm register, each lane with its own constants
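        ; Each lane sits a different distance from the end of the message, so each
        ; is multiplied by its own constant pair (rk9..rk16 for zmm0, rk17..rk20
        ; and rk1/rk2 for zmm4), and the products, together with the final
        ; unmultiplied lane, are xored down to one 128-bit remainder.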
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save the last lane, which has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; needed later in the reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8

        ; instead of 128, add 128-16 to the loop counter to save one instruction in the loop
        ; instead of a cmp instruction, use the sign flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; Now there are 16+y bytes left to reduce: 16 bytes are in register xmm7 and the rest are in memory.
        ; We can fold 16 bytes at a time if y >= 16;
        ; continue folding 16 bytes at a time.

.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, use the flags set by the sub with the jge instruction
        ; (equivalent of: cmp arg3, 16-16)
        ; check whether there are any more full 16-byte blocks in the buffer to fold
        jge     .16B_reduction_loop

        ; now there are 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; Here the remaining data is less than 16 bytes. Since we know there was
        ; data before the current pointer, we can back the input pointer up so that
        ; exactly 16 bytes are loaded; the registers are then adjusted to discard
        ; the bytes that were already processed.
.get_last_two_xmms:

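        ; The pshufb_shf_table lookup below produces a byte-shift mask based on the
        ; number of leftover bytes: xmm2 (the previous folded value) is shifted so
        ; that its bytes line up with the freshly loaded tail, and vpblendvb merges
        ; the two using the mask's top bits before the final 16-byte fold.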
        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
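        ; The 128-bit remainder is reduced in two folds: first the upper 64 bits
        ; are multiplied by rk5 and folded down (64-bit fold), then the top 32 bits
        ; of that result are multiplied by rk6 and folded onto the rest (32-bit
        ; fold), leaving a value small enough for Barrett reduction.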
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ;64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ;32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

        ;barrett reduction
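        ; Barrett reduction produces the final 32-bit CRC without a division: the
        ; quotient is estimated with a carry-less multiply by the precomputed
        ; constant in rk7, the estimate is multiplied back by the polynomial
        ; constant in rk8, and the result is xored with the remainder so that
        ; only the CRC bits survive.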
.barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
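; The rk* values are precomputed folding constants for the T10-DIF polynomial
; (0x18bb7, here scaled to 32 bits): each entry corresponds to x^N mod P(x) for
; the bit distance N spanned by its fold step, and rk7/rk8 hold the Barrett
; reduction constants (quotient estimate and polynomial).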
rk_1:  dq 0xdccf000000000000
rk_2:  dq 0x4b0b000000000000
rk1:   dq 0x2d56000000000000
rk2:   dq 0x06df000000000000
rk3:   dq 0x9d9d000000000000
rk4:   dq 0x7cf5000000000000
rk5:   dq 0x2d56000000000000
rk6:   dq 0x1368000000000000
rk7:   dq 0x00000001f65a57f8
rk8:   dq 0x000000018bb70000
rk9:   dq 0xceae000000000000
rk10:  dq 0xbfd6000000000000
rk11:  dq 0x1e16000000000000
rk12:  dq 0x713c000000000000
rk13:  dq 0xf7f9000000000000
rk14:  dq 0x80a6000000000000
rk15:  dq 0x044c000000000000
rk16:  dq 0xe658000000000000
rk17:  dq 0xad18000000000000
rk18:  dq 0xa497000000000000
rk19:  dq 0x6ee3000000000000
rk20:  dq 0xe7b5000000000000

rk_1b: dq 0x2d56000000000000
rk_2b: dq 0x06df000000000000
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

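; SHUF_MASK byte-reverses each 128-bit lane with vpshufb, so the input is
; processed most-significant byte first, as the non-reflected CRC requires.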
SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89       ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a       ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b       ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c       ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d       ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e       ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f       ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100       ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201       ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302       ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403       ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504       ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605       ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706       ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807       ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10