;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_ieee_by16_10(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;
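;       Illustrative call sequence (not part of this file; "buffer" and "buffer_len" are
;       placeholder names, register mapping assumes the SysV AMD64 path defined below,
;       result returned in eax):
;
;               xor     edi, edi                ; init_crc = 0
;               lea     rsi, [buffer]           ; buf - pointer to the data
;               mov     rdx, buffer_len         ; len - length in bytes
;               call    crc32_ieee_by16_10
;               ; eax now holds the CRC; it can be passed back as init_crc to
;               ; continue a running CRC over a following chunk of data
;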

%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc32_ieee_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:

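        ; Note: the seed is bit-inverted here and the result is inverted again in .cleanup;
        ; this is the usual CRC pre/post complement, so a caller can feed the returned value
        ; straight back in as init_crc to continue the CRC over the next chunk.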
        not     arg1_low32
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack to preserve them (callee-saved under win64)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
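        ; SHUF_MASK reverses the byte order within each 16-byte lane; crc32_ieee uses the
        ; non-reflected bit ordering, so every block of input is byte-swapped before it is
        ; folded with vpclmulqdq.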
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32      ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to be moved
        ; to the high part of the register, because the data will be byte-reflected and
        ; will then line up with the initial crc in the correct place.
        vpslldq xmm10, 12

        ; receive the initial 128B of data, xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]   ; zmm10 has rk3 and rk4 in every 128-bit lane
                                       ; imm value of pclmulqdq instruction will determine which constant to use
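        ; Each vpclmulqdq below picks its 64-bit inputs via the immediate: 0x00 multiplies
        ; the low qwords of both source lanes, 0x11 multiplies the high qwords, so one
        ; broadcast constant register serves both halves of every folded lane.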

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]  ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

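        ; The 256-byte main loop keeps the running remainder in zmm0, zmm4, zmm7 and zmm8.
        ; Each iteration folds every 128-bit lane forward over the next 256 bytes:
        ; lane' = (lane.lo x rk_1) xor (lane.hi x rk_2) xor new_data, where rk_1/rk_2 are
        ; precomputed x^n mod P constants (see the white paper referenced above); a
        ; carry-less multiply by x^n is how a CRC remainder is advanced across skipped bytes.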
.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96      ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96      ; xor ABC

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register


        ; At this point there are 128*x+y (0 <= y < 128) bytes of buffer left. The
        ; .fold_128_B_loop folds 128 bytes at a time until only 128+y bytes remain.

        ; fold 128 bytes at a time; this section folds 8 128-bit lanes (zmm0 and zmm4) in parallel
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128 bytes of folded data are held in zmm0 and zmm4 (8 lanes of 128 bits each)

.fold_128_B_register:
        ; fold the 8 128b parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]          ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]         ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3    ; save last that has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]           ; Needed later in reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96      ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96      ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e      ; Swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     .16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; here the remaining data is less than 16 bytes.
        ; since we know there was data before the pointer, we can offset the input pointer
        ; back so that exactly 16 bytes are loaded; the registers then need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
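        ; (the entry at [pshufb_shf_table + 16 - len] is a byte-shuffle mask; applied directly
        ;  and with its [mask1] complement it splits the previous remainder between xmm7 and
        ;  xmm2, and the xmm2 part is blended over the reloaded tail block with vpblendvb below)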
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
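        ; (the 128-bit remainder is folded to 64 bits with rk5, then to a 32-bit-aligned value
        ;  with rk6, before the Barrett reduction below produces the final 32-bit CRC; this is
        ;  the reduction sequence described in the referenced paper)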
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ;64b fold
        vpclmulqdq xmm7, xmm10, 0x01   ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ;32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

        ;barrett reduction
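        ; (Barrett reduction produces the final remainder without a division: rk7 holds the
        ;  precomputed quotient constant for the polynomial and rk8 the polynomial itself, so
        ;  two carry-less multiplies and an xor leave the 32-bit CRC, extracted below)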
.barrett:
        vmovdqa xmm10, [rk7]           ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        not     eax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]           ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32       ; get the initial crc value
        vpslldq xmm0, 12               ; align it to its correct place
        vmovdqu xmm7, [arg2]           ; load the plaintext
        vpshufb xmm7, xmm18            ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32       ; get the initial crc value
        vpslldq xmm0, 12               ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]           ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]           ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
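; (these are the folding/reduction constants for the IEEE CRC-32 polynomial 0x104C11DB7,
;  which appears literally as rk8; the remaining rk values are x^n mod P terms precomputed
;  for the fold distances used above, as described in the referenced white paper)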
rk_1:  dq 0x1851689900000000
rk_2:  dq 0xa3dc855100000000
rk1:   dq 0xf200aa6600000000
rk2:   dq 0x17d3315d00000000
rk3:   dq 0x022ffca500000000
rk4:   dq 0x9d9ee22f00000000
rk5:   dq 0xf200aa6600000000
rk6:   dq 0x490d678d00000000
rk7:   dq 0x0000000104d101df
rk8:   dq 0x0000000104c11db7
rk9:   dq 0x6ac7e7d700000000
rk10:  dq 0xfcd922af00000000
rk11:  dq 0x34e45a6300000000
rk12:  dq 0x8762c1f600000000
rk13:  dq 0x5395a0ea00000000
rk14:  dq 0x54f2d5c700000000
rk15:  dq 0xd3504ec700000000
rk16:  dq 0x57a8445500000000
rk17:  dq 0xc053585d00000000
rk18:  dq 0x766f1b7800000000
rk19:  dq 0xcd8c54b500000000
rk20:  dq 0xab40b71e00000000

rk_1b: dq 0xf200aa6600000000
rk_2b: dq 0x17d3315d00000000
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10