;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_iso_norm_by16_10(
;               uint64_t init_crc,        //initial CRC value, 64 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               uint64_t len              //buffer length in bytes (64-bit data)
;       );
;
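;       Method (summary of the code below): the buffer is folded with
;       carry-less multiplications (VPCLMULQDQ), 256 bytes, then 128 bytes,
;       then 16 bytes at a time, and the final 128-bit remainder is reduced
;       to the 64-bit CRC with a Barrett reduction. The CRC64-ISO polynomial
;       in normal (non-reflected) form is x^64 + x^4 + x^3 + x + 1
;       (low 64 bits 0x000000000000001B).
;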
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc64_iso_norm_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

%define fetch_dist 1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:
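        ; the incoming CRC in arg1 is bit-inverted here and the result is
        ; inverted again at _cleanup, i.e. the CRC is kept complemented
        ; while it is being computed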
        not     arg1
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack (xmm6-xmm15 are non-volatile in the win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif
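        ; zmm18 = byte-reflection shuffle mask, broadcast to all four
        ; 128-bit lanes (reverses the byte order within each 16-byte lane)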
        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      _less_than_256

        ; load the initial crc value
        vmovq   xmm10, arg1      ; initial crc

        ; the crc value does not need to be byte-reflected, but it needs to be
        ; moved to the high part of the register, because the data will be
        ; byte-reflected and will then align with the initial crc in the right place
        vpslldq xmm10, 8

        ; receive the initial 128B data, xor the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ;zmm10 has rk3 and rk4
                                        ;imm value of pclmulqdq instruction will determine which constant to use
        sub     arg3, 256
        cmp     arg3, 256
        jl      _fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ;zmm16 has rk-1 and rk-2
        sub     arg3, 256

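        ; fold 256 bytes per iteration: each 128-bit lane of the four
        ; accumulators (zmm0, zmm4, zmm7, zmm8) is multiplied by the
        ; 256-byte-distance fold constants in zmm16 (rk_1:rk_2) and
        ; xor-ed with the next 256 bytes of byte-reflected input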
_fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     _fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     _fold_128_B_register

        ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
_fold_128_B_loop:
        add     arg2, 128               ; update the buffer pointer
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9
        sub     arg3, 128
        jge     _fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4

_fold_128_B_register:
        ; fold the 8 128b parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save last that has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; Needed later in reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; Swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8
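        ; xmm7 now holds the single 128-bit folded remainder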

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

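        ; each iteration folds xmm7 forward by 16 bytes: the high qword is
        ; multiplied by rk2 (x^192 mod P) and the low qword by rk1
        ; (x^128 mod P), then the next 16 byte-reflected input bytes are xor-ed in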
_16B_reduction_loop:
        vmovdqa xmm8, xmm7
        vpclmulqdq xmm7, xmm10, 0x11
        vpclmulqdq xmm8, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        add     arg3, 16
        je      _128_done
        ; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
_get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        vpshufb xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        ; fold 16 Bytes
        vmovdqa xmm2, xmm1
        vmovdqa xmm8, xmm7
        vpclmulqdq xmm7, xmm10, 0x11
        vpclmulqdq xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ;64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ;barrett reduction
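        ; rk7 = floor(x^128 / P) (the Barrett constant) and rk8 = P; for the
        ; ISO polynomial both happen to have low 64 bits 0x1B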
_barrett:
        vmovdqa xmm10, [rk7]            ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7

        vmovdqa xmm1, xmm7
        vpand   xmm1, [mask3]
        vpclmulqdq xmm7, xmm10, 0x01
        vpxor   xmm7, xmm1

        vpclmulqdq xmm7, xmm10, 0x11
        vpxor   xmm7, xmm0
        vpextrq rax, xmm7, 0

_cleanup:
        not     rax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10

        vmovq   xmm0, arg1              ; get the initial crc value
        vpslldq xmm0, 8                 ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        vmovq   xmm0, arg1              ; get the initial crc value
        vpslldq xmm0, 8                 ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; Right shift (8-length) bytes in XMM
        add     rax, 8
        vmovdqu xmm0, [rax]
        vpshufb xmm7, xmm0

        jmp     _barrett

align 16
_exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value

        jmp     _128_done

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
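; rk1 = x^128 mod P and rk2 = x^192 mod P are the 16-byte fold constants;
; rk_1/rk_2 and rk3/rk4 are the corresponding constants for the 256-byte
; and 128-byte fold distances; rk9..rk20 collapse the eight folded 128-bit
; lanes; rk5 folds 128 bits down to 64; rk7 = floor(x^128 / P) and rk8 = P
; are the Barrett-reduction constants, with P = x^64 + x^4 + x^3 + x + 1
; (CRC64-ISO, normal form).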
rk_1:  dq 0x0000001a00000144
rk_2:  dq 0x0000015e00001dac
rk1:   dq 0x0000000000000145
rk2:   dq 0x0000000000001db7
rk3:   dq 0x000100000001001a
rk4:   dq 0x001b0000001b015e
rk5:   dq 0x0000000000000145
rk6:   dq 0x0000000000000000
rk7:   dq 0x000000000000001b
rk8:   dq 0x000000000000001b
rk9:   dq 0x0150145145145015
rk10:  dq 0x1c71db6db6db71c7
rk11:  dq 0x0001110110110111
rk12:  dq 0x001aab1ab1ab1aab
rk13:  dq 0x0000014445014445
rk14:  dq 0x00001daab71daab7
rk15:  dq 0x0000000101000101
rk16:  dq 0x0000001b1b001b1b
rk17:  dq 0x0000000001514515
rk18:  dq 0x000000001c6db6c7
rk19:  dq 0x0000000000011011
rk20:  dq 0x00000000001ab1ab

rk_1b: dq 0x0000000000000145
rk_2b: dq 0x0000000000001db7
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
mask3: dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
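; mask1 is xor-ed into a pshufb_shf_table entry to turn the left-shift
; shuffle mask into the complementary right-shift mask (pshufb zeroes any
; byte whose mask index has the top bit set); mask3 keeps only the upper
; 64 bits and is used in the Barrett reduction step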

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
        dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
        dq 0x8080808080808080, 0x8080808080808080


%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10