ceph/src/isa-l/crc/crc32_iscsi_by16_10.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  31 ;       Function API:
  32 ;       UINT32 crc32_iscsi_by16_10(
  33 ;               UINT32 init_crc, //initial CRC value, 32 bits
  34 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  35 ;               UINT64 len //buffer length in bytes (64-bit data)
  36 ;       );
  37 ;
  38 ;       Authors:
  39 ;               Erdinc Ozturk
  40 ;               Vinodh Gopal
  41 ;               James Guilford
  42 ;
  43 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  44 ;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  45 ;
  46 ;
  47
  48 %include "reg_sizes.asm"
  49
  50 %ifndef FUNCTION_NAME
  51 %define FUNCTION_NAME crc32_iscsi_by16_10
  52 %endif
  53
  54 %if (AS_FEATURE_LEVEL) >= 10
  55
  56 [bits 64]
  57 default rel
  58
  59 section .text
  60
  61
  62 %ifidn __OUTPUT_FORMAT__, win64
  63         %xdefine        arg1 r8
  64         %xdefine        arg2 rcx
  65         %xdefine        arg3 rdx
  66
  67         %xdefine        arg1_low32 r8d
  68 %else
  69         %xdefine        arg1 rdx
  70         %xdefine        arg2 rdi
  71         %xdefine        arg3 rsi
  72
  73         %xdefine        arg1_low32 edx
  74 %endif
  75
  76 %define TMP 16*0
  77 %ifidn __OUTPUT_FORMAT__, win64
  78         %define XMM_SAVE 16*2
  79         %define VARIABLE_OFFSET 16*12+8
  80 %else
  81         %define VARIABLE_OFFSET 16*2+8
  82 %endif
  83
  84 align 16
  85 mk_global FUNCTION_NAME, function
  86 FUNCTION_NAME:
  87         endbranch
  88         sub             rsp, VARIABLE_OFFSET
  89
  90 %ifidn __OUTPUT_FORMAT__, win64
  91         ; push the xmm registers into the stack to maintain
  92         vmovdqa         [rsp + XMM_SAVE + 16*0], xmm6
  93         vmovdqa         [rsp + XMM_SAVE + 16*1], xmm7
  94         vmovdqa         [rsp + XMM_SAVE + 16*2], xmm8
  95         vmovdqa         [rsp + XMM_SAVE + 16*3], xmm9
  96         vmovdqa         [rsp + XMM_SAVE + 16*4], xmm10
  97         vmovdqa         [rsp + XMM_SAVE + 16*5], xmm11
  98         vmovdqa         [rsp + XMM_SAVE + 16*6], xmm12
  99         vmovdqa         [rsp + XMM_SAVE + 16*7], xmm13
 100         vmovdqa         [rsp + XMM_SAVE + 16*8], xmm14
 101         vmovdqa         [rsp + XMM_SAVE + 16*9], xmm15
 102 %endif
 103
 104         ; check if smaller than 256B
 105         cmp             arg3, 256
 106         jl              .less_than_256
 107
 108         ; load the initial crc value
 109         vmovd           xmm10, arg1_low32      ; initial crc
 110
 111         ; receive the initial 64B data, xor the initial crc value
 112         vmovdqu8        zmm0, [arg2+16*0]
 113         vmovdqu8        zmm4, [arg2+16*4]
 114         vpxorq          zmm0, zmm10
 115         vbroadcasti32x4 zmm10, [rk3]    ;xmm10 has rk3 and rk4
 116                                         ;imm value of pclmulqdq instruction will determine which constant to use
 117
 118         sub             arg3, 256
 119         cmp             arg3, 256
 120         jl              .fold_128_B_loop
 121
 122         vmovdqu8        zmm7, [arg2+16*8]
 123         vmovdqu8        zmm8, [arg2+16*12]
 124         vbroadcasti32x4 zmm16, [rk_1]   ;zmm16 has rk-1 and rk-2
 125         sub             arg3, 256
 126
 127 .fold_256_B_loop:
 128         add             arg2, 256
 129         vmovdqu8        zmm3, [arg2+16*0]
 130         vpclmulqdq      zmm1, zmm0, zmm16, 0x10
 131         vpclmulqdq      zmm2, zmm0, zmm16, 0x01
 132         vpxorq          zmm0, zmm1, zmm2
 133         vpxorq          zmm0, zmm0, zmm3
 134
 135         vmovdqu8        zmm9, [arg2+16*4]
 136         vpclmulqdq      zmm5, zmm4, zmm16, 0x10
 137         vpclmulqdq      zmm6, zmm4, zmm16, 0x01
 138         vpxorq          zmm4, zmm5, zmm6
 139         vpxorq          zmm4, zmm4, zmm9
 140
 141         vmovdqu8        zmm11, [arg2+16*8]
 142         vpclmulqdq      zmm12, zmm7, zmm16, 0x10
 143         vpclmulqdq      zmm13, zmm7, zmm16, 0x01
 144         vpxorq          zmm7, zmm12, zmm13
 145         vpxorq          zmm7, zmm7, zmm11
 146
 147         vmovdqu8        zmm17, [arg2+16*12]
 148         vpclmulqdq      zmm14, zmm8, zmm16, 0x10
 149         vpclmulqdq      zmm15, zmm8, zmm16, 0x01
 150         vpxorq          zmm8, zmm14, zmm15
 151         vpxorq          zmm8, zmm8, zmm17
 152
 153         sub             arg3, 256
 154         jge             .fold_256_B_loop
 155
 156         ;; Fold 256 into 128
 157         add             arg2, 256
 158         vpclmulqdq      zmm1, zmm0, zmm10, 0x01
 159         vpclmulqdq      zmm2, zmm0, zmm10, 0x10
 160         vpternlogq      zmm7, zmm1, zmm2, 0x96  ; xor ABC
 161
 162         vpclmulqdq      zmm5, zmm4, zmm10, 0x01
 163         vpclmulqdq      zmm6, zmm4, zmm10, 0x10
 164         vpternlogq      zmm8, zmm5, zmm6, 0x96  ; xor ABC
 165
 166         vmovdqa32       zmm0, zmm7
 167         vmovdqa32       zmm4, zmm8
 168
 169         add             arg3, 128
 170         jmp             .fold_128_B_register
 171
 172
 173
 174         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
 175         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 176
 177         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 178 .fold_128_B_loop:
 179         add             arg2, 128
 180         vmovdqu8        zmm8, [arg2+16*0]
 181         vpclmulqdq      zmm2, zmm0, zmm10, 0x10
 182         vpclmulqdq      zmm1, zmm0, zmm10, 0x01
 183         vpxorq          zmm0, zmm2, zmm1
 184         vpxorq          zmm0, zmm0, zmm8
 185
 186         vmovdqu8        zmm9, [arg2+16*4]
 187         vpclmulqdq      zmm5, zmm4, zmm10, 0x10
 188         vpclmulqdq      zmm6, zmm4, zmm10, 0x01
 189         vpxorq          zmm4, zmm5, zmm6
 190         vpxorq          zmm4, zmm4, zmm9
 191
 192         sub             arg3, 128
 193         jge             .fold_128_B_loop
 194         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 195
 196         add             arg2, 128
 197         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 198         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 199
 200 .fold_128_B_register:
 201         ; fold the 8 128b parts into 1 xmm register with different constants
 202         vmovdqu8        zmm16, [rk9]            ; multiply by rk9-rk16
 203         vmovdqu8        zmm11, [rk17]           ; multiply by rk17-rk20, rk1,rk2, 0,0
 204         vpclmulqdq      zmm1, zmm0, zmm16, 0x01
 205         vpclmulqdq      zmm2, zmm0, zmm16, 0x10
 206         vextracti64x2   xmm7, zmm4, 3           ; save last that has no multiplicand
 207
 208         vpclmulqdq      zmm5, zmm4, zmm11, 0x01
 209         vpclmulqdq      zmm6, zmm4, zmm11, 0x10
 210         vmovdqa         xmm10, [rk1]            ; Needed later in reduction loop
 211         vpternlogq      zmm1, zmm2, zmm5, 0x96  ; xor ABC
 212         vpternlogq      zmm1, zmm6, zmm7, 0x96  ; xor ABC
 213
 214         vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
 215         vpxorq          ymm8, ymm8, ymm1
 216         vextracti64x2   xmm5, ymm8, 1
 217         vpxorq          xmm7, xmm5, xmm8
 218
 219         ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
 220         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 221         add             arg3, 128-16
 222         jl              .final_reduction_for_128
 223
 224         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 225         ; we can fold 16 bytes at a time if y>=16
 226         ; continue folding 16B at a time
 227
 228 .16B_reduction_loop:
 229         vpclmulqdq      xmm8, xmm7, xmm10, 0x1
 230         vpclmulqdq      xmm7, xmm7, xmm10, 0x10
 231         vpxor           xmm7, xmm8
 232         vmovdqu         xmm0, [arg2]
 233         vpxor           xmm7, xmm0
 234         add             arg2, 16
 235         sub             arg3, 16
 236         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 237         ; equivalent of: cmp arg3, 16-16
 238         ; check if there is any more 16B in the buffer to be able to fold
 239         jge             .16B_reduction_loop
 240
 241         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 242         ;first, we reduce the data in the xmm7 register
 243
 244
 245 .final_reduction_for_128:
 246         add             arg3, 16
 247         je              .128_done
 248
 249         ; here we are getting data that is less than 16 bytes.
 250         ; since we know that there was data before the pointer, we can offset
 251         ; the input pointer before the actual point, to receive exactly 16 bytes.
 252         ; after that the registers need to be adjusted.
 253 .get_last_two_xmms:
 254
 255         vmovdqa         xmm2, xmm7
 256         vmovdqu         xmm1, [arg2 - 16 + arg3]
 257
 258         ; get rid of the extra data that was loaded before
 259         ; load the shift constant
 260         lea             rax, [pshufb_shf_table]
 261         add             rax, arg3
 262         vmovdqu         xmm0, [rax]
 263
 264         vpshufb         xmm7, xmm0
 265         vpxor           xmm0, [mask3]
 266         vpshufb         xmm2, xmm0
 267
 268         vpblendvb       xmm2, xmm2, xmm1, xmm0
 269         ;;;;;;;;;;
 270         vpclmulqdq      xmm8, xmm7, xmm10, 0x1
 271         vpclmulqdq      xmm7, xmm7, xmm10, 0x10
 272         vpxor           xmm7, xmm8
 273         vpxor           xmm7, xmm2
 274
 275 .128_done:
 276         ; compute crc of a 128-bit value
 277         vmovdqa         xmm10, [rk5]
 278         vmovdqa         xmm0, xmm7
 279
 280         ;64b fold
 281         vpclmulqdq      xmm7, xmm10, 0
 282         vpsrldq         xmm0, 8
 283         vpxor           xmm7, xmm0
 284
 285         ;32b fold
 286         vmovdqa         xmm0, xmm7
 287         vpslldq         xmm7, 4
 288         vpclmulqdq      xmm7, xmm10, 0x10
 289         vpxor           xmm7, xmm0
 290
 291
 292         ;barrett reduction
 293 .barrett:
 294         vpand           xmm7, [mask2]
 295         vmovdqa         xmm1, xmm7
 296         vmovdqa         xmm2, xmm7
 297         vmovdqa         xmm10, [rk7]
 298
 299         vpclmulqdq      xmm7, xmm10, 0
 300         vpxor           xmm7, xmm2
 301         vpand           xmm7, [mask]
 302         vmovdqa         xmm2, xmm7
 303         vpclmulqdq      xmm7, xmm10, 0x10
 304         vpxor           xmm7, xmm2
 305         vpxor           xmm7, xmm1
 306         vpextrd         eax, xmm7, 2
 307
 308 .cleanup:
 309
 310 %ifidn __OUTPUT_FORMAT__, win64
 311         vmovdqa         xmm6, [rsp + XMM_SAVE + 16*0]
 312         vmovdqa         xmm7, [rsp + XMM_SAVE + 16*1]
 313         vmovdqa         xmm8, [rsp + XMM_SAVE + 16*2]
 314         vmovdqa         xmm9, [rsp + XMM_SAVE + 16*3]
 315         vmovdqa         xmm10, [rsp + XMM_SAVE + 16*4]
 316         vmovdqa         xmm11, [rsp + XMM_SAVE + 16*5]
 317         vmovdqa         xmm12, [rsp + XMM_SAVE + 16*6]
 318         vmovdqa         xmm13, [rsp + XMM_SAVE + 16*7]
 319         vmovdqa         xmm14, [rsp + XMM_SAVE + 16*8]
 320         vmovdqa         xmm15, [rsp + XMM_SAVE + 16*9]
 321 %endif
 322         add             rsp, VARIABLE_OFFSET
 323         ret
 324
 325
 326 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 327 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 328 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 329 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 330
 331 align 16
 332 .less_than_256:
 333
 334         ; check if there is enough buffer to be able to fold 16B at a time
 335         cmp     arg3, 32
 336         jl      .less_than_32
 337
 338         ; if there is, load the constants
 339         vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10
 340
 341         vmovd   xmm0, arg1_low32        ; get the initial crc value
 342         vmovdqu xmm7, [arg2]            ; load the plaintext
 343         vpxor   xmm7, xmm0
 344
 345         ; update the buffer pointer
 346         add     arg2, 16
 347
 348         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 349         sub     arg3, 32
 350
 351         jmp     .16B_reduction_loop
 352
 353
 354 align 16
 355 .less_than_32:
 356         ; mov initial crc to the return value. this is necessary for zero-length buffers.
 357         mov     eax, arg1_low32
 358         test    arg3, arg3
 359         je      .cleanup
 360
 361         vmovd   xmm0, arg1_low32        ; get the initial crc value
 362
 363         cmp     arg3, 16
 364         je      .exact_16_left
 365         jl      .less_than_16_left
 366
 367         vmovdqu xmm7, [arg2]            ; load the plaintext
 368         vpxor   xmm7, xmm0              ; xor the initial crc value
 369         add     arg2, 16
 370         sub     arg3, 16
 371         vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
 372         jmp     .get_last_two_xmms
 373
 374 align 16
 375 .less_than_16_left:
 376         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 377
 378         vpxor   xmm1, xmm1
 379         mov     r11, rsp
 380         vmovdqa [r11], xmm1
 381
 382         cmp     arg3, 4
 383         jl      .only_less_than_4
 384
 385         ; backup the counter value
 386         mov     r9, arg3
 387         cmp     arg3, 8
 388         jl      .less_than_8_left
 389
 390         ; load 8 Bytes
 391         mov     rax, [arg2]
 392         mov     [r11], rax
 393         add     r11, 8
 394         sub     arg3, 8
 395         add     arg2, 8
 396 .less_than_8_left:
 397
 398         cmp     arg3, 4
 399         jl      .less_than_4_left
 400
 401         ; load 4 Bytes
 402         mov     eax, [arg2]
 403         mov     [r11], eax
 404         add     r11, 4
 405         sub     arg3, 4
 406         add     arg2, 4
 407 .less_than_4_left:
 408
 409         cmp     arg3, 2
 410         jl      .less_than_2_left
 411
 412         ; load 2 Bytes
 413         mov     ax, [arg2]
 414         mov     [r11], ax
 415         add     r11, 2
 416         sub     arg3, 2
 417         add     arg2, 2
 418 .less_than_2_left:
 419         cmp     arg3, 1
 420         jl      .zero_left
 421
 422         ; load 1 Byte
 423         mov     al, [arg2]
 424         mov     [r11], al
 425
 426 .zero_left:
 427         vmovdqa xmm7, [rsp]
 428         vpxor   xmm7, xmm0      ; xor the initial crc value
 429
 430         lea     rax,[pshufb_shf_table]
 431         vmovdqu xmm0, [rax + r9]
 432         vpshufb xmm7,xmm0
 433         jmp     .128_done
 434
 435 align 16
 436 .exact_16_left:
 437         vmovdqu xmm7, [arg2]
 438         vpxor   xmm7, xmm0      ; xor the initial crc value
 439         jmp     .128_done
 440
 441 .only_less_than_4:
 442         cmp     arg3, 3
 443         jl      .only_less_than_3
 444
 445         ; load 3 Bytes
 446         mov     al, [arg2]
 447         mov     [r11], al
 448
 449         mov     al, [arg2+1]
 450         mov     [r11+1], al
 451
 452         mov     al, [arg2+2]
 453         mov     [r11+2], al
 454
 455         vmovdqa xmm7, [rsp]
 456         vpxor   xmm7, xmm0      ; xor the initial crc value
 457
 458         vpslldq xmm7, 5
 459         jmp     .barrett
 460
 461 .only_less_than_3:
 462         cmp     arg3, 2
 463         jl      .only_less_than_2
 464
 465         ; load 2 Bytes
 466         mov     al, [arg2]
 467         mov     [r11], al
 468
 469         mov     al, [arg2+1]
 470         mov     [r11+1], al
 471
 472         vmovdqa xmm7, [rsp]
 473         vpxor   xmm7, xmm0      ; xor the initial crc value
 474
 475         vpslldq xmm7, 6
 476         jmp     .barrett
 477
 478 .only_less_than_2:
 479         ; load 1 Byte
 480         mov     al, [arg2]
 481         mov     [r11], al
 482
 483         vmovdqa xmm7, [rsp]
 484         vpxor   xmm7, xmm0      ; xor the initial crc value
 485
 486         vpslldq xmm7, 7
 487         jmp     .barrett
 488
 489 section .data
 490 align 32
 491
 492 %ifndef USE_CONSTS
 493 ; precomputed constants
 494 rk_1: dq 0x00000000b9e02b86
 495 rk_2: dq 0x00000000dcb17aa4
 496 rk1: dq 0x00000000493c7d27
 497 rk2: dq 0x0000000ec1068c50
 498 rk3: dq 0x0000000206e38d70
 499 rk4: dq 0x000000006992cea2
 500 rk5: dq 0x00000000493c7d27
 501 rk6: dq 0x00000000dd45aab8
 502 rk7: dq 0x00000000dea713f0
 503 rk8: dq 0x0000000105ec76f0
 504 rk9: dq 0x0000000047db8317
 505 rk10: dq 0x000000002ad91c30
 506 rk11: dq 0x000000000715ce53
 507 rk12: dq 0x00000000c49f4f67
 508 rk13: dq 0x0000000039d3b296
 509 rk14: dq 0x00000000083a6eec
 510 rk15: dq 0x000000009e4addf8
 511 rk16: dq 0x00000000740eef02
 512 rk17: dq 0x00000000ddc0152b
 513 rk18: dq 0x000000001c291d04
 514 rk19: dq 0x00000000ba4fc28e
 515 rk20: dq 0x000000003da6d0cb
 516
 517 rk_1b: dq 0x00000000493c7d27
 518 rk_2b: dq 0x0000000ec1068c50
 519         dq 0x0000000000000000
 520         dq 0x0000000000000000
 521
 522 %else
 523 INCLUDE_CONSTS
 524 %endif
 525
 526 pshufb_shf_table:
 527 ; use these values for shift constants for the pshufb instruction
 528 ; different alignments result in values as shown:
 529 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 530 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
 531 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
 532 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 533 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 534 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 535 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 536 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 537 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 538 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 539 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 540 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 541 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 542 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 543 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 544 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 545 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 546
 547 mask:  dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 548 mask2: dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 549 mask3: dq     0x8080808080808080, 0x8080808080808080
 550
 551 %else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
 552 %ifidn __OUTPUT_FORMAT__, win64
 553 global no_ %+ FUNCTION_NAME
 554 no_ %+ FUNCTION_NAME %+ :
 555 %endif
 556 %endif ; (AS_FEATURE_LEVEL) >= 10