1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; UINT32 crc16_t10dif_by16_10(
33 ; UINT16 init_crc, //initial CRC value, 16 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; UINT64 len //buffer length in bytes (64-bit data)
43 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
48 %include "reg_sizes.asm"
; Exported symbol name for this CRC implementation.
51 %define FUNCTION_NAME crc16_t10dif_by16_10
; Only assemble the AVX-512 body when the assembler feature level supports
; the required opcodes (zmm vpclmulqdq / vpternlogq); see the %else stub
; at the end of the file for older assemblers.
54 %if (AS_FEATURE_LEVEL) >= 10
; First integer argument register differs per ABI:
; win64 passes arg1 in rcx, SysV in rdi. Only the low 32 bits are used
; because the initial CRC is at most 16 bits wide.
62 %ifidn __OUTPUT_FORMAT__, win64
67 %xdefine arg1_low32 ecx
73 %xdefine arg1_low32 edi
; Stack frame size: win64 needs 16*10 bytes to spill callee-saved
; xmm6-xmm15 (plus scratch); SysV needs only scratch space.
77 %ifidn __OUTPUT_FORMAT__, win64
79 %define VARIABLE_OFFSET 16*12+8
81 %define VARIABLE_OFFSET 16*2+8
85 mk_global FUNCTION_NAME, function
89 ; adjust the 16-bit initial_crc value, scale it to 32 bits
92 ; After this point, code flow is exactly same as a 32-bit CRC.
93 ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
; Allocate the stack frame (on win64 this includes the xmm spill area).
95 sub rsp, VARIABLE_OFFSET
97 %ifidn __OUTPUT_FORMAT__, win64
98 ; push the xmm registers into the stack to maintain
; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI, so they must be
; preserved here and restored in the epilogue (XMM_SAVE = spill offset).
99 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
100 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
101 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
102 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
103 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
104 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
105 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
106 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
107 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
108 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
; Replicate the byte-reflection shuffle mask into all four 128-bit lanes
; of zmm18; it is applied to every loaded data block below.
111 vbroadcasti32x4 zmm18, [SHUF_MASK]
115 ; load the initial crc value
116 vmovd xmm10, arg1_low32 ; initial crc
118 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
119 ; because data will be byte-reflected and will align with initial crc at correct place.
122 ; receive the initial 64B data, xor the initial crc value
; Load the first 256 bytes of input in four 64B chunks (zmm0/zmm4/zmm7/zmm8)
; and byte-reflect each through zmm18.
123 vmovdqu8 zmm0, [arg2+16*0]
124 vmovdqu8 zmm4, [arg2+16*4]
125 vpshufb zmm0, zmm0, zmm18
126 vpshufb zmm4, zmm4, zmm18
128 vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
129 ;imm value of pclmulqdq instruction will determine which constant to use
135 vmovdqu8 zmm7, [arg2+16*8]
136 vmovdqu8 zmm8, [arg2+16*12]
137 vpshufb zmm7, zmm7, zmm18
138 vpshufb zmm8, zmm8, zmm18
; rk_1/rk_2 are the 256-byte fold constants; broadcast to all lanes so
; four 128-bit folds happen per instruction.
139 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; Standard carry-less fold step, repeated for each 64B accumulator:
; multiply accumulator low/high halves by the fold constants (imm 0x00 and
; 0x11 select the two 64-bit constant halves), xor the products together,
; then xor in the next 64B of byte-reflected input data.
144 vmovdqu8 zmm3, [arg2+16*0]
145 vpshufb zmm3, zmm3, zmm18
146 vpclmulqdq zmm1, zmm0, zmm16, 0x00
147 vpclmulqdq zmm2, zmm0, zmm16, 0x11
148 vpxorq zmm0, zmm1, zmm2
149 vpxorq zmm0, zmm0, zmm3
151 vmovdqu8 zmm9, [arg2+16*4]
152 vpshufb zmm9, zmm9, zmm18
153 vpclmulqdq zmm5, zmm4, zmm16, 0x00
154 vpclmulqdq zmm6, zmm4, zmm16, 0x11
155 vpxorq zmm4, zmm5, zmm6
156 vpxorq zmm4, zmm4, zmm9
158 vmovdqu8 zmm11, [arg2+16*8]
159 vpshufb zmm11, zmm11, zmm18
160 vpclmulqdq zmm12, zmm7, zmm16, 0x00
161 vpclmulqdq zmm13, zmm7, zmm16, 0x11
162 vpxorq zmm7, zmm12, zmm13
163 vpxorq zmm7, zmm7, zmm11
165 vmovdqu8 zmm17, [arg2+16*12]
166 vpshufb zmm17, zmm17, zmm18
167 vpclmulqdq zmm14, zmm8, zmm16, 0x00
168 vpclmulqdq zmm15, zmm8, zmm16, 0x11
169 vpxorq zmm8, zmm14, zmm15
170 vpxorq zmm8, zmm8, zmm17
; Collapse four 64B accumulators down to two using rk3/rk4 (in zmm10);
; vpternlogq imm 0x96 computes the three-way xor dst = A ^ B ^ C.
177 vpclmulqdq zmm1, zmm0, zmm10, 0x00
178 vpclmulqdq zmm2, zmm0, zmm10, 0x11
179 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
181 vpclmulqdq zmm5, zmm4, zmm10, 0x00
182 vpclmulqdq zmm6, zmm4, zmm10, 0x11
183 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; Skip straight to the 8-lane combine; NOTE(review): intervening source
; lines are elided in this excerpt.
189 jmp .fold_128_B_register
193 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
194 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
196 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; Main fold loop body: same fold step as above (clmul low/high by the
; constants in zmm10, xor products, xor in fresh data) applied to the two
; 64B accumulators zmm0 and zmm4.
199 vmovdqu8 zmm8, [arg2+16*0]
200 vpshufb zmm8, zmm8, zmm18
201 vpclmulqdq zmm2, zmm0, zmm10, 0x00
202 vpclmulqdq zmm1, zmm0, zmm10, 0x11
203 vpxorq zmm0, zmm2, zmm1
204 vpxorq zmm0, zmm0, zmm8
206 vmovdqu8 zmm9, [arg2+16*4]
207 vpshufb zmm9, zmm9, zmm18
208 vpclmulqdq zmm5, zmm4, zmm10, 0x00
209 vpclmulqdq zmm6, zmm4, zmm10, 0x11
210 vpxorq zmm4, zmm5, zmm6
211 vpxorq zmm4, zmm4, zmm9
215 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
218 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
219 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
221 .fold_128_B_register:
222 ; fold the 8 128b parts into 1 xmm register with different constants
; Each 128-bit lane gets its own pair of fold constants (rk9..rk20 plus
; rk1/rk2), so all eight lanes can be collapsed toward a single xmm value.
223 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
224 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
225 vpclmulqdq zmm1, zmm0, zmm16, 0x00
226 vpclmulqdq zmm2, zmm0, zmm16, 0x11
227 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
229 vpclmulqdq zmm5, zmm4, zmm11, 0x00
230 vpclmulqdq zmm6, zmm4, zmm11, 0x11
231 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
232 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
233 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal xor of the four 128-bit lanes of zmm1 down to xmm7:
; swap the 256-bit halves, xor, then xor the two remaining 128-bit lanes.
235 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
236 vpxorq ymm8, ymm8, ymm1
237 vextracti64x2 xmm5, ymm8, 1
238 vpxorq xmm7, xmm5, xmm8
240 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
241 ; instead of a cmp instruction, we use the negative flag with the jl instruction
243 jl .final_reduction_for_128
245 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
246 ; we can fold 16 bytes at a time if y>=16
247 ; continue folding 16B at a time
; 16B reduction step: fold xmm7 by rk1/rk2 (held in xmm10) and absorb the
; next byte-reflected 16B of input (loaded/shuffled via xmm0).
250 vpclmulqdq xmm8, xmm7, xmm10, 0x11
251 vpclmulqdq xmm7, xmm7, xmm10, 0x00
254 vpshufb xmm0, xmm0, xmm18
258 ; instead of a cmp instruction, we utilize the flags with the jge instruction
259 ; equivalent of: cmp arg3, 16-16
260 ; check if there is any more 16B in the buffer to be able to fold
261 jge .16B_reduction_loop
263 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
264 ;first, we reduce the data in the xmm7 register
267 .final_reduction_for_128:
271 ; here we are getting data that is less than 16 bytes.
272 ; since we know that there was data before the pointer, we can offset
273 ; the input pointer before the actual point, to receive exactly 16 bytes.
274 ; after that the registers need to be adjusted.
; Backward-overlapping load: grab the last 16 bytes of the buffer even
; though fewer than 16 remain unprocessed (the overlap is discarded below).
278 vmovdqu xmm1, [arg2 - 16 + arg3]
281 ; get rid of the extra data that was loaded before
282 ; load the shift constant
; pshufb_shf_table is indexed by the remainder so the shuffle masks shift
; out exactly the bytes that were already folded.
283 lea rax, [pshufb_shf_table + 16]
; Blend the kept bytes of the tail with the shifted fold state
; (xmm0 holds the shuffle/blend mask at this point).
290 vpblendvb xmm1, xmm1, xmm2, xmm0
; One more rk1/rk2 fold to merge the tail into the 128-bit CRC state.
292 vpclmulqdq xmm8, xmm7, xmm10, 0x11
293 vpclmulqdq xmm7, xmm7, xmm10, 0x00
298 ; compute crc of a 128-bit value
; Final reduction: fold 128 bits -> 64 bits -> 32 bits using rk5..rk8,
; then a Barrett-style reduction by the polynomial constants in rk7/rk8.
; (Three-operand vpclmulqdq: destination also serves as first source.)
303 vpclmulqdq xmm7, xmm10, 0x01 ; H*L
311 vpclmulqdq xmm7, xmm10, 0x10
316 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
318 vpclmulqdq xmm7, xmm10, 0x01
320 vpclmulqdq xmm7, xmm10, 0x11
327 ; scale the result back to 16 bits
; Epilogue: restore win64 callee-saved xmm6-xmm15 and release the frame.
330 %ifidn __OUTPUT_FORMAT__, win64
331 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
332 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
333 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
334 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
335 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
336 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
337 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
338 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
339 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
340 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
342 add rsp, VARIABLE_OFFSET
346 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
347 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
348 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
349 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Short-buffer entry: buffers under 256 bytes skip the zmm fold machinery
; and go straight to the 16B-at-a-time reduction loop.
354 ; check if there is enough buffer to be able to fold 16B at a time
358 ; if there is, load the constants
359 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
361 vmovd xmm0, arg1_low32 ; get the initial crc value
362 vpslldq xmm0, 12 ; align it to its correct place
363 vmovdqu xmm7, [arg2] ; load the plaintext
364 vpshufb xmm7, xmm18 ; byte-reflect the plaintext
367 ; update the buffer pointer
370 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
373 jmp .16B_reduction_loop
378 ; mov initial crc to the return value. this is necessary for zero-length buffers.
; Path for buffers of 16..31 bytes: seed the CRC, fold the one full 16B
; block, then handle the remainder via .get_last_two_xmms.
383 vmovd xmm0, arg1_low32 ; get the initial crc value
384 vpslldq xmm0, 12 ; align it to its correct place
388 jl .less_than_16_left
390 vmovdqu xmm7, [arg2] ; load the plaintext
392 vpxor xmm7, xmm0 ; xor the initial crc value
395 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
396 jmp .get_last_two_xmms
400 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
409 ; backup the counter value
; NOTE(review): the per-size tail handlers below are mostly elided in this
; excerpt; each surviving line seeds its partial block with the initial CRC.
453 vpxor xmm7, xmm0 ; xor the initial crc value
455 lea rax, [pshufb_shf_table + 16]
467 vpxor xmm7, xmm0 ; xor the initial crc value
486 vpxor xmm7, xmm0 ; xor the initial crc value
504 vpxor xmm7, xmm0 ; xor the initial crc value
516 vpxor xmm7, xmm0 ; xor the initial crc value
525 ; precomputed constants
; Fold/reduction constants for the carry-less-multiply CRC algorithm.
; rk8 contains 0x18bb7, i.e. the T10-DIF polynomial 0x8bb7 scaled to
; 32 bits; the remaining rk values are the corresponding precomputed
; x^N mod P folding multipliers (see the referenced Intel white paper).
; rk_1/rk_2: 256B-distance fold; rk1/rk2: 16B fold (also used in the
; reduction loop); rk3/rk4: 64B fold; rk5-rk8: final 128->32 bit
; reduction and Barrett constants; rk9-rk20: per-lane combine constants.
527 rk_1: dq 0xdccf000000000000
528 rk_2: dq 0x4b0b000000000000
529 rk1: dq 0x2d56000000000000
530 rk2: dq 0x06df000000000000
531 rk3: dq 0x9d9d000000000000
532 rk4: dq 0x7cf5000000000000
533 rk5: dq 0x2d56000000000000
534 rk6: dq 0x1368000000000000
535 rk7: dq 0x00000001f65a57f8
536 rk8: dq 0x000000018bb70000
537 rk9: dq 0xceae000000000000
538 rk10: dq 0xbfd6000000000000
539 rk11: dq 0x1e16000000000000
540 rk12: dq 0x713c000000000000
541 rk13: dq 0xf7f9000000000000
542 rk14: dq 0x80a6000000000000
543 rk15: dq 0x044c000000000000
544 rk16: dq 0xe658000000000000
545 rk17: dq 0xad18000000000000
546 rk18: dq 0xa497000000000000
547 rk19: dq 0x6ee3000000000000
548 rk20: dq 0xe7b5000000000000
; Duplicate of rk1/rk2 padded with zeros (tail of the zmm11 combine vector).
550 rk_1b: dq 0x2d56000000000000
551 rk_2b: dq 0x06df000000000000
552 dq 0x0000000000000000
553 dq 0x0000000000000000
; mask1: pshufb indices with the high bit set produce zero bytes.
558 mask1: dq 0x8080808080808080, 0x8080808080808080
559 mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; Byte-reversal mask used with vpshufb to byte-reflect each 16B lane.
561 SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
564 ; use these values for shift constants for the pshufb instruction
565 ; different alignments result in values as shown:
566 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
567 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
568 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
569 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
570 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
571 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
572 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
573 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
574 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
575 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
576 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
577 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
578 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
579 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
580 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; NOTE(review): table data below is partially elided in this excerpt;
; the full table in the original source covers every 1..15 byte shift.
581 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
582 dq 0x0706050403020100, 0x000e0d0c0b0a0908
583 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
584 dq 0x8080808080808080, 0x8080808080808080
586 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
; Fallback when AS_FEATURE_LEVEL < 10: emit a dummy "no_<name>" symbol on
; win64 so the object file is not empty and the build can detect the
; missing implementation.
587 %ifidn __OUTPUT_FORMAT__, win64
588 global no_ %+ FUNCTION_NAME
589 no_ %+ FUNCTION_NAME %+ :
591 %endif ; (AS_FEATURE_LEVEL) >= 10