1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; UINT32 crc32_ieee_by16_10(
33 ; UINT32 init_crc, //initial CRC value, 32 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; UINT64 len //buffer length in bytes (64-bit data)
43 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
48 %include "reg_sizes.asm"
; Build the IEEE (non-reflected, polynomial 0x04C11DB7 — see rk8 below) CRC32
; variant of the generic by16_10 VPCLMULQDQ/AVX-512 folding kernel.
51 %define FUNCTION_NAME crc32_ieee_by16_10
; Assembler must support AVX-512 + VPCLMULQDQ encodings (feature level 10);
; otherwise the %else branch at the bottom of the file emits a stub symbol.
54 %if (AS_FEATURE_LEVEL) >= 10
; arg1 (init_crc, 32 bits) arrives in ecx on Win64, edi on SysV.
; NOTE(review): the %else/%endif lines between the two %xdefines are elided
; in this excerpt — each definition belongs to one ABI branch.
62 %ifidn __OUTPUT_FORMAT__, win64
67 %xdefine arg1_low32 ecx
73 %xdefine arg1_low32 edi
; Stack frame size: Win64 must preserve xmm6-xmm15, so it reserves ten extra
; 16B save slots (16*12+8) vs. SysV's minimal frame (16*2+8). The +8 keeps
; rsp 16-byte aligned given the return address already on the stack.
77 %ifidn __OUTPUT_FORMAT__, win64
79 %define VARIABLE_OFFSET 16*12+8
81 %define VARIABLE_OFFSET 16*2+8
; Export the function symbol (mk_global handles per-platform name decoration).
85 mk_global FUNCTION_NAME, function
; --- prologue: carve the stack frame sized above -------------------------
90 sub rsp, VARIABLE_OFFSET
92 %ifidn __OUTPUT_FORMAT__, win64
93 ; push the xmm registers into the stack to maintain the Win64 ABI contract
;   (xmm6-xmm15 are callee-saved on Windows; restored in the epilogue)
94 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
95 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
96 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
97 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
98 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
99 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
100 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
101 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
102 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
103 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
; zmm18 = byte-reflection shuffle mask, replicated to all four 128b lanes;
; every 64B load below is passed through vpshufb zmmX, zmm18.
106 vbroadcasti32x4 zmm18, [SHUF_MASK]
110 ; load the initial crc value
111 vmovd xmm10, arg1_low32 ; initial crc
113 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
114 ; because data will be byte-reflected and will align with initial crc at correct place.
; NOTE(review): the vpslldq/xor of the initial crc into zmm0 is elided in
; this excerpt, as is the length check that routes short buffers elsewhere.
117 ; receive the initial 64B data, xor the initial crc value
118 vmovdqu8 zmm0, [arg2+16*0]
119 vmovdqu8 zmm4, [arg2+16*4]
120 vpshufb zmm0, zmm0, zmm18
121 vpshufb zmm4, zmm4, zmm18
123 vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
124 ;imm value of pclmulqdq instruction will determine which constant to use
; zmm0/zmm4/zmm7/zmm8 now hold the first 256B of (reflected) input.
130 vmovdqu8 zmm7, [arg2+16*8]
131 vmovdqu8 zmm8, [arg2+16*12]
132 vpshufb zmm7, zmm7, zmm18
133 vpshufb zmm8, zmm8, zmm18
134 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; --- fold 512B -> 256B using rk_1/rk_2 -----------------------------------
; Each 64B lane group is multiplied by the distance constants and xored with
; the next 256B of input.  NOTE(review): the pointer/counter advance between
; the load groups above and these loads is elided in this excerpt — arg2 has
; presumably been advanced by 256 before these re-reads of [arg2+16*0..12].
139 vmovdqu8 zmm3, [arg2+16*0]
140 vpshufb zmm3, zmm3, zmm18
141 vpclmulqdq zmm1, zmm0, zmm16, 0x00
142 vpclmulqdq zmm2, zmm0, zmm16, 0x11
143 vpxorq zmm0, zmm1, zmm2
144 vpxorq zmm0, zmm0, zmm3
146 vmovdqu8 zmm9, [arg2+16*4]
147 vpshufb zmm9, zmm9, zmm18
148 vpclmulqdq zmm5, zmm4, zmm16, 0x00
149 vpclmulqdq zmm6, zmm4, zmm16, 0x11
150 vpxorq zmm4, zmm5, zmm6
151 vpxorq zmm4, zmm4, zmm9
153 vmovdqu8 zmm11, [arg2+16*8]
154 vpshufb zmm11, zmm11, zmm18
155 vpclmulqdq zmm12, zmm7, zmm16, 0x00
156 vpclmulqdq zmm13, zmm7, zmm16, 0x11
157 vpxorq zmm7, zmm12, zmm13
158 vpxorq zmm7, zmm7, zmm11
160 vmovdqu8 zmm17, [arg2+16*12]
161 vpshufb zmm17, zmm17, zmm18
162 vpclmulqdq zmm14, zmm8, zmm16, 0x00
163 vpclmulqdq zmm15, zmm8, zmm16, 0x11
164 vpxorq zmm8, zmm14, zmm15
165 vpxorq zmm8, zmm8, zmm17
; --- fold 256B -> 128B using rk3/rk4 (zmm10) -----------------------------
; vpternlogq imm 0x96 is a three-way xor: dst = dst ^ src1 ^ src2.
172 vpclmulqdq zmm1, zmm0, zmm10, 0x00
173 vpclmulqdq zmm2, zmm0, zmm10, 0x11
174 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
176 vpclmulqdq zmm5, zmm4, zmm10, 0x00
177 vpclmulqdq zmm6, zmm4, zmm10, 0x11
178 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; Working state is now 128B in zmm7/zmm8; skip the loop's reload and go
; straight to the 8-way collapse.  NOTE(review): intervening register
; shuffling / counter updates are elided in this excerpt.
184 jmp .fold_128_B_register
188 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
189 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
191 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; NOTE(review): the .fold_128_B_loop label, the arg2 advance, and the
; counter decrement/branch that close this loop are elided in this excerpt.
; Loop body: fold the running 128B state (zmm0, zmm4) forward by 128 bytes
; using rk3/rk4 (in zmm10) and xor in the next 128B of reflected input.
194 vmovdqu8 zmm8, [arg2+16*0]
195 vpshufb zmm8, zmm8, zmm18
196 vpclmulqdq zmm2, zmm0, zmm10, 0x00
197 vpclmulqdq zmm1, zmm0, zmm10, 0x11
198 vpxorq zmm0, zmm2, zmm1
199 vpxorq zmm0, zmm0, zmm8
201 vmovdqu8 zmm9, [arg2+16*4]
202 vpshufb zmm9, zmm9, zmm18
203 vpclmulqdq zmm5, zmm4, zmm10, 0x00
204 vpclmulqdq zmm6, zmm4, zmm10, 0x11
205 vpxorq zmm4, zmm5, zmm6
206 vpxorq zmm4, zmm4, zmm9
210 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
213 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
214 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
216 .fold_128_B_register:
217 ; fold the 8 128b parts into 1 xmm register with different constants
; Each 128b lane needs a different distance constant, so full-width 64B
; constant vectors (rk9..rk16 and rk17..rk20,rk1,rk2,0,0) are loaded here
; instead of broadcasts.
218 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
219 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
220 vpclmulqdq zmm1, zmm0, zmm16, 0x00
221 vpclmulqdq zmm2, zmm0, zmm16, 0x11
222 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
224 vpclmulqdq zmm5, zmm4, zmm11, 0x00
225 vpclmulqdq zmm6, zmm4, zmm11, 0x11
226 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
227 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
228 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal xor of zmm1's four 128b lanes down to a single xmm:
; swap-halves + xor (512->256), then extract + xor (256->128) into xmm7.
230 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
231 vpxorq ymm8, ymm8, ymm1
232 vextracti64x2 xmm5, ymm8, 1
233 vpxorq xmm7, xmm5, xmm8
235 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
236 ; instead of a cmp instruction, we use the negative flag with the jl instruction
; NOTE(review): the `add arg3, 128-16` that sets these flags is elided in
; this excerpt.
238 jl .final_reduction_for_128
240 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
241 ; we can fold 16 bytes at a time if y>=16
242 ; continue folding 16B at a time
; 16B reduction loop body: fold xmm7 forward by 16 bytes with rk1/rk2
; (xmm10) and xor in the next reflected 16B of input.
; NOTE(review): the .16B_reduction_loop label, the xmm0 load from [arg2],
; the xor into xmm7, and the arg2/arg3 updates are elided in this excerpt.
245 vpclmulqdq xmm8, xmm7, xmm10, 0x11
246 vpclmulqdq xmm7, xmm7, xmm10, 0x00
249 vpshufb xmm0, xmm0, xmm18
253 ; instead of a cmp instruction, we utilize the flags with the jge instruction
254 ; equivalent of: cmp arg3, 16-16
255 ; check if there is any more 16B in the buffer to be able to fold
256 jge .16B_reduction_loop
258 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
259 ;first, we reduce the data in the xmm7 register
262 .final_reduction_for_128:
266 ; here we are getting data that is less than 16 bytes.
267 ; since we know that there was data before the pointer, we can offset
268 ; the input pointer before the actual point, to receive exactly 16 bytes.
269 ; after that the registers need to be adjusted.
; Overlapping load: the last 16B of the buffer (may re-read bytes already
; folded; the pshufb_shf_table shift below discards the duplicates).
273 vmovdqu xmm1, [arg2 - 16 + arg3]
276 ; get rid of the extra data that was loaded before
277 ; load the shift constant
; rax = &pshufb_shf_table[16]; the remaining-byte count (arg3) indexes
; backwards into the table to select the shl/shr shuffle pair.
278 lea rax, [pshufb_shf_table + 16]
; Blend old folded bytes with the fresh tail under the mask built above,
; then fold the combined value once more with rk1/rk2.
285 vpblendvb xmm1, xmm1, xmm2, xmm0
287 vpclmulqdq xmm8, xmm7, xmm10, 0x11
288 vpclmulqdq xmm7, xmm7, xmm10, 0x00
293 ; compute crc of a 128-bit value
; 128 -> 64 -> 32 bit reduction using rk5/rk6, then Barrett reduction with
; rk7 (floor(x^64/P)) and rk8 (the polynomial) to produce the final CRC.
; NOTE(review): the rk5 load, the vpslldq/vpsrldq shifts and xors between
; these multiplies, and the final vpextrd/return are elided in this excerpt.
; These vpclmulqdq use the NASM 3-operand form: dst is also the first source.
298 vpclmulqdq xmm7, xmm10, 0x01 ; H*L
306 vpclmulqdq xmm7, xmm10, 0x10
311 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
313 vpclmulqdq xmm7, xmm10, 0x01
315 vpclmulqdq xmm7, xmm10, 0x11
; --- epilogue: restore Win64 callee-saved xmm regs and the stack ---------
; (mirrors the prologue saves; SysV has nothing to restore here)
325 %ifidn __OUTPUT_FORMAT__, win64
326 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
327 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
328 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
329 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
330 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
331 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
332 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
333 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
334 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
335 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
; NOTE(review): the %endif and the ret are elided in this excerpt.
337 add rsp, VARIABLE_OFFSET
341 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
343 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
344 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; --- buffers too small for the 256B bulk path ----------------------------
; NOTE(review): this region is heavily elided in the excerpt; the labels
; these fragments belong to (.less_than_256 / .less_than_32 /
; .less_than_16_left etc.) and most of their bodies are not visible.
349 ; check if there is enough buffer to be able to fold 16B at a time
353 ; if there is, load the constants
354 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
; Seed xmm7 with the first 16B of input xored with the crc placed in the
; top dword (vpslldq 12 = shift left by 12 bytes).
356 vmovd xmm0, arg1_low32 ; get the initial crc value
357 vpslldq xmm0, 12 ; align it to its correct place
358 vmovdqu xmm7, [arg2] ; load the plaintext
359 vpshufb xmm7, xmm18 ; byte-reflect the plaintext
362 ; update the buffer pointer
365 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
368 jmp .16B_reduction_loop
373 ; mov initial crc to the return value. this is necessary for zero-length buffers.
; <32B path: same crc seeding, then either one 16B fold or the sub-16B path.
378 vmovd xmm0, arg1_low32 ; get the initial crc value
379 vpslldq xmm0, 12 ; align it to its correct place
383 jl .less_than_16_left
385 vmovdqu xmm7, [arg2] ; load the plaintext
387 vpxor xmm7, xmm0 ; xor the initial crc value
390 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
391 jmp .get_last_two_xmms
395 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
404 ; backup the counter value
; The fragments below are the per-length tails (byte-at-a-time copies to the
; stack, then one shifted fold); only the crc-xor lines survive the excerpt.
448 vpxor xmm7, xmm0 ; xor the initial crc value
450 lea rax, [pshufb_shf_table + 16]
462 vpxor xmm7, xmm0 ; xor the initial crc value
481 vpxor xmm7, xmm0 ; xor the initial crc value
499 vpxor xmm7, xmm0 ; xor the initial crc value
511 vpxor xmm7, xmm0 ; xor the initial crc value
520 ; precomputed constants
; Fold/reduction constants for the non-reflected IEEE CRC-32
; (rk8 below is 0x104C11DB7, the CRC-32 polynomial with the implicit x^32).
; rk_1/rk_2 drive the widest fold (loaded into zmm16 above); rk3/rk4 drive
; the 128B loop (zmm10); rk1/rk2 the 16B reduction loop; rk5/rk6 the final
; 128->64->32 reduction; rk7/rk8 the Barrett reduction; rk9..rk20 are the
; per-lane distances used to collapse 8 x 128b into one xmm.
521 rk_1: dq 0x1851689900000000
522 rk_2: dq 0xa3dc855100000000
523 rk1: dq 0xf200aa6600000000
524 rk2: dq 0x17d3315d00000000
525 rk3: dq 0x022ffca500000000
526 rk4: dq 0x9d9ee22f00000000
527 rk5: dq 0xf200aa6600000000
528 rk6: dq 0x490d678d00000000
529 rk7: dq 0x0000000104d101df
530 rk8: dq 0x0000000104c11db7
531 rk9: dq 0x6ac7e7d700000000
532 rk10: dq 0xfcd922af00000000
533 rk11: dq 0x34e45a6300000000
534 rk12: dq 0x8762c1f600000000
535 rk13: dq 0x5395a0ea00000000
536 rk14: dq 0x54f2d5c700000000
537 rk15: dq 0xd3504ec700000000
538 rk16: dq 0x57a8445500000000
539 rk17: dq 0xc053585d00000000
540 rk18: dq 0x766f1b7800000000
541 rk19: dq 0xcd8c54b500000000
542 rk20: dq 0xab40b71e00000000
; rk_1b/rk_2b duplicate rk1/rk2 (plus zero padding) so the [rk17] 64B load
; above picks up rk17-rk20, rk1, rk2, 0, 0 contiguously.
544 rk_1b: dq 0xf200aa6600000000
545 rk_2b: dq 0x17d3315d00000000
546 dq 0x0000000000000000
547 dq 0x0000000000000000
; mask1: pshufb "zero this byte" pattern; mask2: keep low 96 bits.
552 mask1: dq 0x8080808080808080, 0x8080808080808080
553 mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; Byte-reverse shuffle for vpshufb (reflects each 16B lane end-to-end).
555 SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
558 ; use these values for shift constants for the pshufb instruction
559 ; different alignments result in values as shown:
560 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
561 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
562 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
563 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
564 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
565 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
566 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
567 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
568 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
569 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
570 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
571 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
572 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
573 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
574 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; Table data proper: indexed via [pshufb_shf_table + 16 - remaining] style
; addressing (see the `lea rax, [pshufb_shf_table + 16]` uses above).
; High bit set (0x80) in a selector byte makes vpshufb emit a zero byte.
; NOTE(review): the pshufb_shf_table: label line itself is elided in this
; excerpt; it immediately precedes these dq lines in the full file.
575 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
576 dq 0x0706050403020100, 0x000e0d0c0b0a0908
577 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
578 dq 0x8080808080808080, 0x8080808080808080
; Fallback when the assembler lacks AVX-512/VPCLMULQDQ support: emit a
; dummy `no_<name>` symbol so Win64 builds still link (some toolchains
; reject object files with no symbols at all).
580 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
581 %ifidn __OUTPUT_FORMAT__, win64
582 global no_ %+ FUNCTION_NAME
583 no_ %+ FUNCTION_NAME %+ :
; NOTE(review): the %endif closing the win64 check is elided in this excerpt.
585 %endif ; (AS_FEATURE_LEVEL) >= 10