;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc16_t10dif_by16_10(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;

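;       Editorial usage sketch (not part of the original source): this is the
;       VPCLMULQDQ/AVX-512 build of the T10-DIF CRC; in isa-l it is normally reached
;       through the dispatched crc16_t10dif() entry point rather than called directly
;       (an assumption based on the library's usual multibinary layout). Calling the
;       symbol directly would look roughly like the hedged C fragment below, whose
;       prototype mirrors the API comment above; checksum_block is an illustrative
;       helper, not an isa-l function.
;
;           #include <stdint.h>
;
;           uint32_t crc16_t10dif_by16_10(uint16_t init_crc,
;                                         const unsigned char *buf,
;                                         uint64_t len);
;
;           uint16_t checksum_block(const unsigned char *buf, uint64_t len)
;           {
;                   /* 0 is the customary starting seed; a previous CRC may be chained in */
;                   return (uint16_t)crc16_t10dif_by16_10(0, buf, len);
;           }
;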
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc16_t10dif_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine arg1 rcx
        %xdefine arg2 rdx
        %xdefine arg3 r8

        %xdefine arg1_low32 ecx
%else
        %xdefine arg1 rdi
        %xdefine arg2 rsi
        %xdefine arg3 rdx

        %xdefine arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:

        ; adjust the 16-bit initial_crc value, scale it up to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it right by 16 bits, to scale back to 16 bits.
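        ;
        ; Editorial worked example (not in the original source): if init_crc = 0xABCD,
        ; the shl above turns it into 0xABCD0000, so it lines up with the top 16 bits
        ; of the 32-bit folding math that follows; the final "shr eax, 16" in .cleanup
        ; undoes this scaling and returns the 16-bit CRC.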

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; push the xmm registers onto the stack to preserve them (xmm6-xmm15 are callee-saved in the win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align with the initial crc in the correct place
        vpslldq xmm10, 12

        ; load the initial 128B of data (two 64B vectors) and xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ; zmm10 has rk3 and rk4 in every 128-bit lane
                                        ; the imm value of the vpclmulqdq instruction determines which constant is used
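        ;
        ; Editorial note on the folding math (a sketch following the referenced Intel
        ; paper, not part of the original source): each 128-bit lane holds a polynomial
        ; over GF(2). With a constant pair K_lo:K_hi broadcast into a register (rk_1/rk_2
        ; or rk3/rk4 below), one fold step computes, per lane,
        ;       new_state = data XOR clmul(state.low64, K_lo) XOR clmul(state.high64, K_hi)
        ; where clmul is carry-less multiplication: vpclmulqdq imm 0x00 selects the low
        ; 64-bit halves of both sources and imm 0x11 the high halves, which is how the
        ; two products are formed in the loops below.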

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC
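
        ; Editorial note (not in the original source): vpternlogq with imm8 0x96
        ; implements a three-way XOR (0x96 is the truth table of A xor B xor C), so each
        ; of the two instructions above xors the two carry-less products of one state
        ; register into another state register in a single step, collapsing the 256B of
        ; folded state down to 128B.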

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register



        ; at this point, there are 128*x+y (0 <= y < 128) bytes of buffer remaining. The fold_128_B_loop
        ; will fold 128B at a time until only 128+y bytes of buffer remain

        ; fold 128B at a time. This section of the code folds 2 zmm registers (8 128-bit lanes) in parallel
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 2 zmm registers: zmm0 and zmm4 (8 xmm-sized lanes)

.fold_128_B_register:
        ; fold the 8 128b parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save last that has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; needed later in reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; Swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8
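
        ; Editorial note (not in the original source): after the per-lane folds above,
        ; zmm1 holds the partially reduced 128-bit lanes. The vshufi64x2/vpxorq/
        ; vextracti64x2 sequence is a binary XOR-tree reduction: the 512-bit value is
        ; xored down to 256 bits, then to 128 bits, leaving the single 128-bit
        ; remainder in xmm7.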

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

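        ; Editorial note (not in the original source): each iteration below folds the
        ; current 128-bit remainder in xmm7 over the next 16 data bytes (byte-reflected
        ; with SHUF_MASK), i.e. roughly
        ;       xmm7 = [arg2] XOR clmul(xmm7.low64, rk1) XOR clmul(xmm7.high64, rk2)
        ; where imm 0x00 picks the low halves and imm 0x11 the high halves of the
        ; rk1:rk2 pair held in xmm10.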
.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     .16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; here we are dealing with a leftover of less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer backwards so that exactly 16 bytes are loaded.
        ; after that, the registers need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0
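
        ; Editorial note (not in the original source): the pshufb_shf_table entry picked
        ; above, together with its mask1-inverted control, splits the current remainder
        ; into two complementary byte-shifted copies (xmm2 and xmm7). The blend then
        ; merges the displaced remainder bytes with the last 16 loaded data bytes, so
        ; every input byte is accounted for exactly once; one more clmul fold of the
        ; shifted remainder plus the xor with xmm1 below absorbs the final z bytes.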

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ; 32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

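        ; Editorial note on the reduction below (a sketch following the referenced Intel
        ; paper, not part of the original source): Barrett reduction estimates the
        ; quotient of the remaining value by the CRC polynomial with one carry-less
        ; multiply by rk7 (an approximation of x^64 / P(x)), multiplies that quotient
        ; back by rk8 (the scaled T10-DIF polynomial, 0x18bb7 << 16), and xors the
        ; result with the original value so only the remainder of degree < 32 survives;
        ; vpextrd then extracts that 32-bit remainder (the CRC scaled by x^16).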
        ; barrett reduction
.barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data of less than 16 bytes; zero out those 16B on the stack first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants

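; Editorial note (not in the original source): following the referenced Intel paper,
; the rk* values are precomputed folding constants for the T10-DIF polynomial
; (0x18bb7, scaled up to 32 bits). Each folding constant has the form x^n mod P(x)
; for the bit distance n that the corresponding fold step shifts the running
; remainder; rk7 is the Barrett constant (a quotient approximation of x^64 / P(x)),
; and rk8 is the scaled polynomial itself (0x18bb7 << 16).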
rk_1:  dq 0xdccf000000000000
rk_2:  dq 0x4b0b000000000000
rk1:   dq 0x2d56000000000000
rk2:   dq 0x06df000000000000
rk3:   dq 0x9d9d000000000000
rk4:   dq 0x7cf5000000000000
rk5:   dq 0x2d56000000000000
rk6:   dq 0x1368000000000000
rk7:   dq 0x00000001f65a57f8
rk8:   dq 0x000000018bb70000
rk9:   dq 0xceae000000000000
rk10:  dq 0xbfd6000000000000
rk11:  dq 0x1e16000000000000
rk12:  dq 0x713c000000000000
rk13:  dq 0xf7f9000000000000
rk14:  dq 0x80a6000000000000
rk15:  dq 0x044c000000000000
rk16:  dq 0xe658000000000000
rk17:  dq 0xad18000000000000
rk18:  dq 0xa497000000000000
rk19:  dq 0x6ee3000000000000
rk20:  dq 0xe7b5000000000000

rk_1b: dq 0x2d56000000000000
rk_2b: dq 0x06df000000000000
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1:     dq 0x8080808080808080, 0x8080808080808080
mask2:     dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
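
; Editorial note (not in the original source): SHUF_MASK is the vpshufb control that
; reverses the byte order within each 16-byte lane, so the input data (and the initial
; CRC placed in the high bytes) is processed in the byte order the folding constants
; above were generated for.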

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89       ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a       ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b       ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c       ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d       ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e       ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f       ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100       ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201       ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302       ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403       ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504       ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605       ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706       ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807       ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10