;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_ieee_by16_10(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;

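; Note: consistent with the common convention of tracking the inverted CRC internally,
; this routine bit-complements init_crc on entry and the returned CRC on exit
; (see the `not arg1_low32` / `not eax` instructions below).
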
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc32_ieee_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif
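
; Stack layout (relative to rsp after the `sub rsp, VARIABLE_OFFSET` below):
;   TMP      - 16 bytes of scratch, used when staging tails shorter than 16 bytes
;   XMM_SAVE - win64 only: save area for the non-volatile registers xmm6-xmm15
; The trailing +8 in VARIABLE_OFFSET keeps the frame 16-byte aligned (the return
; address leaves rsp misaligned by 8 at entry), which the aligned vmovdqa
; accesses to [rsp] rely on.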

align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
        endbranch

        not     arg1_low32
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the non-volatile xmm registers on the stack (win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to be moved
        ; to the high part of the register, because the data will be byte-reflected and
        ; will then line up with the initial crc in the correct place
        vpslldq xmm10, 12

        ; receive the initial 128B of data, xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ; zmm10 has rk3 and rk4
                                        ; the imm value of the pclmulqdq instruction
                                        ; selects which constant to use

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

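        ; Folding step (see the referenced Intel paper): a 128-bit remainder R that sits
        ; D bits ahead of newly loaded data contributes R*x^D mod P, which is formed from
        ; two carry-less multiplies of R's 64-bit halves by precomputed constants of the
        ; form x^k mod P, followed by an xor with the new data. The 256-byte loop below
        ; keeps 256 bytes of state in zmm0/zmm4/zmm7/zmm8 (16 x 128-bit lanes) and folds
        ; every lane forward by 256 bytes per iteration using rk_1/rk_2; the 128-byte
        ; loop further down keeps its state in zmm0/zmm4 and uses rk3/rk4. The exact
        ; exponents follow the paper's scheme; the assembled values live in .data.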
.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop
        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC
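        ; note: vpternlogq with imm8 0x96 is a three-input xor (dst = dst ^ src1 ^ src2),
        ; merging the two carry-less products into the accumulator in one instruction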

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register



        ; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left. The
        ; .fold_128_B_loop below folds 128B at a time until only 128+y bytes remain.

        ; fold 128B at a time. The 128 bytes of accumulated state live in zmm0 and zmm4
        ; (8 x 128-bit lanes), which are folded forward in parallel.
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer,
        ; where 0 <= y < 128, and the 128 bytes of folded data are held in zmm0 and zmm4
        ; (8 x 128-bit lanes)

.fold_128_B_register:
        ; fold the 8 128-bit parts into 1 xmm register using different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1, rk2, 0, 0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save the last lane, which has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; needed later in the reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8
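        ; the shuffle/extract/xor sequence above collapses the four 128-bit lanes of
        ; zmm1 into a single 128-bit remainder in xmm7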

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time

.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     .16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ; 32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

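        ; Barrett reduction: compute the final 32-bit remainder without a division.
        ; rk7 holds the precomputed quotient mu = floor(x^64 / P) and rk8 holds the
        ; polynomial P itself (0x104c11db7, i.e. CRC-32 IEEE including its x^32 term);
        ; two carry-less multiplies plus an xor recover the remainder, which is read out
        ; of dword 1 of xmm7. Constant roles as described in the referenced Intel paper.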
        ; barrett reduction
.barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        not     eax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first.

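        ; (For tails of 4-15 bytes the copied block is later realigned with a mask from
        ; pshufb_shf_table so the regular 128-bit reduction can finish the CRC; tails of
        ; 1-3 bytes take the .only_less_than_4 paths below, which shift with vpsrldq and
        ; jump straight to the Barrett reduction.)
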
        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
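; The rk* values below are the folding and reduction constants for the non-reflected
; CRC-32 (IEEE) polynomial 0x04C11DB7. Following the referenced paper's scheme they are
; remainders of the form x^k mod P, stored pre-shifted into the upper 32 bits of each
; qword so they line up as pclmulqdq operands; rk7 (the Barrett constant) and rk8
; (the polynomial itself, including its x^32 term) are stored unshifted.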
rk_1: dq 0x1851689900000000
rk_2: dq 0xa3dc855100000000
rk1:  dq 0xf200aa6600000000
rk2:  dq 0x17d3315d00000000
rk3:  dq 0x022ffca500000000
rk4:  dq 0x9d9ee22f00000000
rk5:  dq 0xf200aa6600000000
rk6:  dq 0x490d678d00000000
rk7:  dq 0x0000000104d101df
rk8:  dq 0x0000000104c11db7
rk9:  dq 0x6ac7e7d700000000
rk10: dq 0xfcd922af00000000
rk11: dq 0x34e45a6300000000
rk12: dq 0x8762c1f600000000
rk13: dq 0x5395a0ea00000000
rk14: dq 0x54f2d5c700000000
rk15: dq 0xd3504ec700000000
rk16: dq 0x57a8445500000000
rk17: dq 0xc053585d00000000
rk18: dq 0x766f1b7800000000
rk19: dq 0xcd8c54b500000000
rk20: dq 0xab40b71e00000000

rk_1b: dq 0xf200aa6600000000
rk_2b: dq 0x17d3315d00000000
        dq 0x0000000000000000
        dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
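; SHUF_MASK reverses the byte order within each 128-bit lane (byte i comes from byte
; 15-i), turning the little-endian byte stream into the byte ordering this
; non-reflected CRC variant folds on.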

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10