ceph/src/spdk/isa-l/crc/crc32_gzip_refl_by8.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  31 ;       Function API:
  32 ;       UINT32 crc32_gzip_refl_by8(
  33 ;               UINT32 init_crc, //initial CRC value, 32 bits
  34 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  35 ;               UINT64 len //buffer length in bytes (64-bit data)
  36 ;       );
  37 ;
  38 ;       Authors:
  39 ;               Erdinc Ozturk
  40 ;               Vinodh Gopal
  41 ;               James Guilford
  42 ;
  43 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  44 ;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
  45 ;
  46 ;
  47 ;       sample yasm command line:
  48 ;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8
  49 ;
  50 ;       As explained here:
  51 ;       http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
  52 ;       CRC-32 checksum is described in RFC 1952
  53 ;       Implementing RFC 1952 CRC:
  54 ;       http://www.ietf.org/rfc/rfc1952.txt
  55
  56 %include "reg_sizes.asm"
  57 ; fetch_dist: byte offset past the current buffer pointer targeted by the prefetchnta hints in _fold_128_B_loop
  58 %define fetch_dist      1024
  59
  60 [bits 64]
  61 default rel
  62
  63 section .text
  64
  65 ; argument registers: rcx/rdx/r8 when assembling for win64, rdi/rsi/rdx for every other output format
  66 %ifidn __OUTPUT_FORMAT__, win64
  67         %xdefine        arg1 rcx
  68         %xdefine        arg2 rdx
  69         %xdefine        arg3 r8
  70         ; 32-bit view of arg1 (init_crc is a 32-bit value)
  71         %xdefine        arg1_low32 ecx
  72 %else
  73         %xdefine        arg1 rdi
  74         %xdefine        arg2 rsi
  75         %xdefine        arg3 rdx
  76         ; 32-bit view of arg1 (init_crc is a 32-bit value)
  77         %xdefine        arg1_low32 edi
  78 %endif
  79 ; stack frame offsets from rsp: TMP = offset 0 (not referenced by name below; the short-buffer path uses [rsp] directly as its 16B scratch slot), XMM_SAVE = start of the win64 save area for xmm6-xmm13, VARIABLE_OFFSET = total bytes reserved by the prologue (16*k+8, presumably so rsp becomes 16-byte aligned for the movdqa accesses -- confirm)
  80 %define TMP 16*0
  81 %ifidn __OUTPUT_FORMAT__, win64
  82         %define XMM_SAVE 16*2
  83         %define VARIABLE_OFFSET 16*10+8
  84 %else
  85         %define VARIABLE_OFFSET 16*2+8
  86 %endif
  87
  88 align 16
  89 global  crc32_gzip_refl_by8:ISAL_SYM_TYPE_FUNCTION
  90 crc32_gzip_refl_by8:
  91         ; entry: arg1_low32 = init_crc, arg2 = buf, arg3 = len (see "Function API" in the file header); result is returned in eax
  92         ; unsigned long c = crc ^ 0xffffffffL;
  93         not     arg1_low32      ; invert the incoming crc; _cleanup inverts the result again before returning
  94
  95         ; reserve the stack frame: 16B scratch slot at [rsp], plus the xmm save area on win64
  96         sub     rsp, VARIABLE_OFFSET
  97 %ifidn __OUTPUT_FORMAT__, win64
  98         ; save xmm6-xmm13 on the stack so that _cleanup can restore them before returning
  99         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
 100         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
 101         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
 102         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
 103         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
 104         movdqa  [rsp + XMM_SAVE + 16*5], xmm11
 105         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
 106         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
 107 %endif
 108
 109         ; check if smaller than 256B
 110         cmp     arg3, 256
 111         ; NOTE(review): jl is a signed comparison while len is documented as UINT64 -- a len with bit 63 set would be routed to the short-buffer path
 112         ; for sizes less than 256, we can't fold 128B at a time...
 113         jl      _less_than_256
 114
 115
 116         ; load the initial crc value
 117         movd    xmm10, arg1_low32      ; inverted initial crc in the low 32 bits, upper 96 bits zeroed
 118
 119         ; load the first 128B of data (8 x 16B) into xmm0-xmm7
 120         movdqu  xmm0, [arg2+16*0]
 121         movdqu  xmm1, [arg2+16*1]
 122         movdqu  xmm2, [arg2+16*2]
 123         movdqu  xmm3, [arg2+16*3]
 124         movdqu  xmm4, [arg2+16*4]
 125         movdqu  xmm5, [arg2+16*5]
 126         movdqu  xmm6, [arg2+16*6]
 127         movdqu  xmm7, [arg2+16*7]
 128
 129         ; XOR the initial crc value into the first 16B of data
 130         pxor    xmm0, xmm10
 131         movdqa  xmm10, [rk3]    ;xmm10 has rk3 (low 64 bits) and rk4 (high 64 bits)
 132                                         ;imm value of pclmulqdq instruction will determine which constant to use
 133         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 134         ; we subtract 256 instead of 128 to save one instruction from the loop
 135         sub     arg3, 256
 136
 137         ; at this point the buffer holds 128*x+y bytes (x >= 2, 0 <= y < 128) and its first 128B are in xmm0-xmm7.
 138         ; _fold_128_B_loop folds 128B at a time until only the 128B held in registers plus the y unread bytes remain
 139
 140
 141         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 142 _fold_128_B_loop:
 143         ; for each of xmm0-xmm7: acc = clmul(acc.low64, rk4) ^ clmul(acc.high64, rk3) ^ next 16B of data
 144         ; update the buffer pointer
 145         add     arg2, 128
 146
 147         prefetchnta [arg2+fetch_dist+0]
 148         movdqu  xmm9, [arg2+16*0]       ; next data for xmm0
 149         movdqu  xmm12, [arg2+16*1]      ; next data for xmm1
 150         movdqa  xmm8, xmm0              ; copy, because pclmulqdq overwrites its destination
 151         movdqa  xmm13, xmm1
 152         pclmulqdq       xmm0, xmm10, 0x10       ; xmm0.low64 * xmm10.high64 (rk4)
 153         pclmulqdq       xmm8, xmm10 , 0x1       ; xmm8.high64 * xmm10.low64 (rk3)
 154         pclmulqdq       xmm1, xmm10, 0x10
 155         pclmulqdq       xmm13, xmm10 , 0x1
 156         pxor    xmm0, xmm9
 157         xorps   xmm0, xmm8              ; xorps performs the same bitwise XOR as pxor
 158         pxor    xmm1, xmm12
 159         xorps   xmm1, xmm13
 160
 161         prefetchnta [arg2+fetch_dist+32]
 162         movdqu  xmm9, [arg2+16*2]
 163         movdqu  xmm12, [arg2+16*3]
 164         movdqa  xmm8, xmm2
 165         movdqa  xmm13, xmm3
 166         pclmulqdq       xmm2, xmm10, 0x10
 167         pclmulqdq       xmm8, xmm10 , 0x1
 168         pclmulqdq       xmm3, xmm10, 0x10
 169         pclmulqdq       xmm13, xmm10 , 0x1
 170         pxor    xmm2, xmm9
 171         xorps   xmm2, xmm8
 172         pxor    xmm3, xmm12
 173         xorps   xmm3, xmm13
 174
 175         prefetchnta [arg2+fetch_dist+64]
 176         movdqu  xmm9, [arg2+16*4]
 177         movdqu  xmm12, [arg2+16*5]
 178         movdqa  xmm8, xmm4
 179         movdqa  xmm13, xmm5
 180         pclmulqdq       xmm4, xmm10, 0x10
 181         pclmulqdq       xmm8, xmm10 , 0x1
 182         pclmulqdq       xmm5, xmm10, 0x10
 183         pclmulqdq       xmm13, xmm10 , 0x1
 184         pxor    xmm4, xmm9
 185         xorps   xmm4, xmm8
 186         pxor    xmm5, xmm12
 187         xorps   xmm5, xmm13
 188
 189         prefetchnta [arg2+fetch_dist+96]
 190         movdqu  xmm9, [arg2+16*6]
 191         movdqu  xmm12, [arg2+16*7]
 192         movdqa  xmm8, xmm6
 193         movdqa  xmm13, xmm7
 194         pclmulqdq       xmm6, xmm10, 0x10
 195         pclmulqdq       xmm8, xmm10 , 0x1
 196         pclmulqdq       xmm7, xmm10, 0x10
 197         pclmulqdq       xmm13, xmm10 , 0x1
 198         pxor    xmm6, xmm9
 199         xorps   xmm6, xmm8
 200         pxor    xmm7, xmm12
 201         xorps   xmm7, xmm13
 202
 203         sub     arg3, 128
 204         ; arg3 is now (unread bytes - 128); the flags from this sub drive the branch below
 205         ; check if there is another 128B in the buffer to be able to fold
 206         jge     _fold_128_B_loop
 207         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 208
 209
 210         add     arg2, 128
 211         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 212         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 213
 214
 215         ; fold the 8 xmm registers to 1 xmm register with different constants
 216         ; each of xmm0-xmm6 is carry-less multiplied by its own 128-bit constant pair and XORed into xmm7
 217         movdqa  xmm10, [rk9]            ; rk9 (low 64 bits) and rk10 (high 64 bits)
 218         movdqa  xmm8, xmm0
 219         pclmulqdq       xmm0, xmm10, 0x1        ; xmm0.high64 * rk9
 220         pclmulqdq       xmm8, xmm10, 0x10       ; xmm8.low64 * rk10
 221         pxor    xmm7, xmm8
 222         xorps   xmm7, xmm0
 223
 224         movdqa  xmm10, [rk11]           ; rk11 and rk12
 225         movdqa  xmm8, xmm1
 226         pclmulqdq       xmm1, xmm10, 0x1
 227         pclmulqdq       xmm8, xmm10, 0x10
 228         pxor    xmm7, xmm8
 229         xorps   xmm7, xmm1
 230
 231         movdqa  xmm10, [rk13]           ; rk13 and rk14
 232         movdqa  xmm8, xmm2
 233         pclmulqdq       xmm2, xmm10, 0x1
 234         pclmulqdq       xmm8, xmm10, 0x10
 235         pxor    xmm7, xmm8
 236         pxor    xmm7, xmm2
 237
 238         movdqa  xmm10, [rk15]           ; rk15 and rk16
 239         movdqa  xmm8, xmm3
 240         pclmulqdq       xmm3, xmm10, 0x1
 241         pclmulqdq       xmm8, xmm10, 0x10
 242         pxor    xmm7, xmm8
 243         xorps   xmm7, xmm3
 244
 245         movdqa  xmm10, [rk17]           ; rk17 and rk18
 246         movdqa  xmm8, xmm4
 247         pclmulqdq       xmm4, xmm10, 0x1
 248         pclmulqdq       xmm8, xmm10, 0x10
 249         pxor    xmm7, xmm8
 250         pxor    xmm7, xmm4
 251
 252         movdqa  xmm10, [rk19]           ; rk19 and rk20
 253         movdqa  xmm8, xmm5
 254         pclmulqdq       xmm5, xmm10, 0x1
 255         pclmulqdq       xmm8, xmm10, 0x10
 256         pxor    xmm7, xmm8
 257         xorps   xmm7, xmm5
 258
 259         movdqa  xmm10, [rk1]            ; rk1 and rk2; xmm10 keeps this pair for the 16B loop and the tail fold below
 260         movdqa  xmm8, xmm6
 261         pclmulqdq       xmm6, xmm10, 0x1
 262         pclmulqdq       xmm8, xmm10, 0x10
 263         pxor    xmm7, xmm8
 264         pxor    xmm7, xmm6
 265
 266         ; arg3 is (y - 128) here, so after the add below it is (y - 16)
 267         ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
 268         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 269         add     arg3, 128-16
 270         jl      _final_reduction_for_128
 271
 272         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 273         ; we can fold 16 bytes at a time if y>=16
 274         ; continue folding 16B at a time
 275         ; on entry (also reached from _less_than_256): xmm7 = running 16B value, xmm10 = rk1/rk2, arg2 -> next unread byte, arg3 = unread bytes - 16 (>= 0)
 276 _16B_reduction_loop:
 277         movdqa  xmm8, xmm7
 278         pclmulqdq       xmm7, xmm10, 0x1        ; xmm7.high64 * rk1
 279         pclmulqdq       xmm8, xmm10, 0x10       ; xmm8.low64 * rk2
 280         pxor    xmm7, xmm8
 281         movdqu  xmm0, [arg2]            ; next 16B of data
 282         pxor    xmm7, xmm0
 283         add     arg2, 16
 284         sub     arg3, 16
 285         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 286         ; equivalent of: cmp arg3, 16-16
 287         ; check if there is any more 16B in the buffer to be able to fold
 288         jge     _16B_reduction_loop
 289
 290         ;now we have 16+z bytes left to reduce, where 0<= z < 16 (arg3 = z - 16).
 291         ;first, we reduce the data in the xmm7 register
 292
 293
 294 _final_reduction_for_128:
 295         add arg3, 16            ; arg3 = z, the number of unread tail bytes (0..15)
 296         je _128_done            ; no tail bytes: xmm7 already holds the final 16B value
 297
 298 ; here we are getting data that is less than 16 bytes.
 299         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 300         ; after that the registers need to be adjusted.
 301 _get_last_two_xmms:
 302         ; on entry (also reached from _less_than_32): xmm7 = running 16B value, xmm10 = rk1/rk2, arg2 -> first tail byte, arg3 = tail length z (1..15)
 303
 304         movdqa xmm2, xmm7
 305         movdqu xmm1, [arg2 - 16 + arg3]         ; the last 16 bytes of the buffer; its top z bytes are the unread tail
 306
 307         ; get rid of the extra data that was loaded before
 308         ; load the shift constant
 309         lea     rax, [pshufb_shf_table]
 310         add     rax, arg3
 311         movdqu  xmm0, [rax]             ; pshufb control mask at table offset z
 312
 313
 314         pshufb  xmm7, xmm0              ; xmm7 = value shifted left by (16 - z) bytes
 315         pxor    xmm0, [mask3]           ; flip bit 7 of every control byte, turning the mask into its complement
 316         pshufb  xmm2, xmm0              ; xmm2 = value shifted right by z bytes (top z bytes zeroed)
 317
 318         pblendvb        xmm2, xmm1     ;xmm0 is implicit: bytes whose mask byte has bit 7 set are taken from xmm1
 319         ;;;;;;;;;;
 320         movdqa  xmm8, xmm7
 321         pclmulqdq       xmm7, xmm10, 0x1
 322
 323         pclmulqdq       xmm8, xmm10, 0x10
 324         pxor    xmm7, xmm8
 325         pxor    xmm7, xmm2
 326
 327 _128_done:
 328         ; compute crc of a 128-bit value (held in xmm7)
 329         movdqa  xmm10, [rk5]            ; rk5 (low 64 bits) and rk6 (high 64 bits)
 330         movdqa  xmm0, xmm7
 331
 332         ;64b fold
 333         pclmulqdq       xmm7, xmm10, 0          ; xmm7.low64 * rk5
 334         psrldq  xmm0, 8                 ; original upper 64 bits moved down to the low half
 335         pxor    xmm7, xmm0
 336
 337         ;32b fold
 338         movdqa  xmm0, xmm7
 339         pslldq  xmm7, 4
 340         pclmulqdq       xmm7, xmm10, 0x10       ; (shifted xmm7).low64 * rk6
 341
 342         pxor    xmm7, xmm0
 343
 344         ; _barrett is also entered directly from the 1-3 byte paths, which skip the two folds above
 345         ;barrett reduction
 346 _barrett:
 347         pand    xmm7, [mask2]           ; clear the low 32 bits
 348         movdqa  xmm1, xmm7
 349         movdqa  xmm2, xmm7
 350         movdqa  xmm10, [rk7]            ; rk7 (low 64 bits) and rk8 (high 64 bits)
 351
 352         pclmulqdq       xmm7, xmm10, 0          ; xmm7.low64 * rk7
 353         pxor    xmm7, xmm2
 354         pand    xmm7, [mask]            ; keep only the low 64 bits
 355         movdqa  xmm2, xmm7
 356         pclmulqdq       xmm7, xmm10, 0x10       ; xmm7.low64 * rk8
 357         pxor    xmm7, xmm2
 358         pxor    xmm7, xmm1
 359         pextrd  eax, xmm7, 2            ; the 32-bit result is dword 2 (bits 64-95) of xmm7
 360
 361 _cleanup:
 362         ; return c ^ 0xffffffffL;
 363         not     eax                     ; undo the inversion applied at function entry
 364
 365         ; common epilogue: restore the win64 xmm save area, release the stack frame, return eax
 366 %ifidn __OUTPUT_FORMAT__, win64
 367         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
 368         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 369         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 370         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 371         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 372         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 373         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 374         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 375 %endif
 376         add     rsp, VARIABLE_OFFSET
 377         ret
 378
 379
 380 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 381 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 384
 385 align 16
 386 _less_than_256:
 387         ; len < 256: skip the 128B folding and go straight to 16B-at-a-time folding when len >= 32
 388         ; check if there is enough buffer to be able to fold 16B at a time
 389         cmp     arg3, 32
 390         jl      _less_than_32
 391
 392         ; if there is, load the constants
 393         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 394
 395         movd    xmm0, arg1_low32       ; get the initial crc value (already inverted at entry)
 396         movdqu  xmm7, [arg2]            ; load the first 16B of the buffer
 397         pxor    xmm7, xmm0              ; xor the initial crc value into its low 32 bits
 398
 399         ; update the buffer pointer
 400         add     arg2, 16
 401
 402         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 403         sub     arg3, 32
 404         ; arg3 = unread bytes - 16 (>= 0), which is the counter convention _16B_reduction_loop expects
 405         jmp     _16B_reduction_loop
 406
 407
 408 align 16
 409 _less_than_32:
 410         ; mov initial crc to the return value. this is necessary for zero-length buffers:
 411         mov     eax, arg1_low32         ; eax = inverted crc; _cleanup inverts it back, so len == 0 returns init_crc unchanged
 412         test    arg3, arg3
 413         je      _cleanup
 414
 415         movd    xmm0, arg1_low32        ; get the initial crc value
 416         ; dispatch on len: exactly 16, fewer than 16, or 17..31 (fall through)
 417         cmp     arg3, 16
 418         je      _exact_16_left
 419         jl      _less_than_16_left
 420         ; 17 <= len <= 31: take the first 16B now and handle the remaining 1..15 bytes as a tail
 421         movdqu  xmm7, [arg2]            ; load the first 16B of the buffer
 422         pxor    xmm7, xmm0              ; xor the initial crc value
 423         add     arg2, 16
 424         sub     arg3, 16                ; arg3 = tail length (1..15)
 425         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 426         jmp     _get_last_two_xmms
 427
 428
 429 align 16
 430 _less_than_16_left:
 431         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 432         ; xmm0 holds the inverted initial crc (set in _less_than_32); r11 walks the scratch slot at [rsp]
 433         pxor    xmm1, xmm1
 434         mov     r11, rsp
 435         movdqa  [r11], xmm1
 436
 437         cmp     arg3, 4
 438         jl      _only_less_than_4
 439         ; 4 <= len <= 15 from here on: copy the data to the scratch slot in 8/4/2/1-byte pieces
 440         ;       backup the counter value
 441         mov     r9, arg3                ; r9 = original len, used below to index pshufb_shf_table
 442         cmp     arg3, 8
 443         jl      _less_than_8_left
 444
 445         ; load 8 Bytes
 446         mov     rax, [arg2]
 447         mov     [r11], rax
 448         add     r11, 8
 449         sub     arg3, 8
 450         add     arg2, 8
 451 _less_than_8_left:
 452
 453         cmp     arg3, 4
 454         jl      _less_than_4_left
 455
 456         ; load 4 Bytes
 457         mov     eax, [arg2]
 458         mov     [r11], eax
 459         add     r11, 4
 460         sub     arg3, 4
 461         add     arg2, 4
 462 _less_than_4_left:
 463
 464         cmp     arg3, 2
 465         jl      _less_than_2_left
 466
 467         ; load 2 Bytes
 468         mov     ax, [arg2]
 469         mov     [r11], ax
 470         add     r11, 2
 471         sub     arg3, 2
 472         add     arg2, 2
 473 _less_than_2_left:
 474         cmp     arg3, 1
 475         jl      _zero_left
 476
 477         ; load 1 Byte
 478         mov     al, [arg2]
 479         mov     [r11], al
 480         ; the scratch slot now holds the len data bytes followed by zero padding
 481 _zero_left:
 482         movdqa  xmm7, [rsp]
 483         pxor    xmm7, xmm0      ; xor the initial crc value
 484
 485         lea rax,[pshufb_shf_table]
 486         movdqu  xmm0, [rax + r9]        ; control mask at table offset len
 487         pshufb  xmm7,xmm0               ; shift the value left by (16 - len) bytes, zero-filling the low bytes
 488
 489
 490
 491         jmp     _128_done
 492
 493 align 16
 494 _exact_16_left:
 495         movdqu  xmm7, [arg2]    ; len == 16: the whole buffer fits in one register, no folding needed
 496         pxor    xmm7, xmm0      ; xor the initial crc value
 497
 498         jmp     _128_done
 499
 500 _only_less_than_4:
 501         cmp     arg3, 3         ; 1 <= len <= 3; r11 = rsp and the 16B at [rsp] were zeroed in _less_than_16_left
 502         jl      _only_less_than_3
 503
 504         ; load 3 Bytes
 505         mov     al, [arg2]
 506         mov     [r11], al
 507
 508         mov     al, [arg2+1]
 509         mov     [r11+1], al
 510
 511         mov     al, [arg2+2]
 512         mov     [r11+2], al
 513
 514         movdqa  xmm7, [rsp]
 515         pxor    xmm7, xmm0      ; xor the initial crc value
 516
 517         pslldq  xmm7, 5         ; position the value for _barrett (the 64b/32b folds in _128_done are skipped)
 518
 519         jmp     _barrett
 520 _only_less_than_3:
 521         cmp     arg3, 2
 522         jl      _only_less_than_2
 523
 524         ; load 2 Bytes
 525         mov     al, [arg2]
 526         mov     [r11], al
 527
 528         mov     al, [arg2+1]
 529         mov     [r11+1], al
 530
 531         movdqa  xmm7, [rsp]
 532         pxor    xmm7, xmm0      ; xor the initial crc value
 533
 534         pslldq  xmm7, 6         ; one byte fewer than the 3-byte case, so shift one byte further
 535
 536         jmp     _barrett
 537 _only_less_than_2:
 538
 539         ; load 1 Byte
 540         mov     al, [arg2]
 541         mov     [r11], al
 542
 543         movdqa  xmm7, [rsp]
 544         pxor    xmm7, xmm0      ; xor the initial crc value
 545
 546         pslldq  xmm7, 7
 547
 548         jmp     _barrett
 549
 550 section .data
 551
 552 ; precomputed constants; the code loads them in 16-byte pairs (rkN in the low 64 bits, rkN+1 in the high 64 bits)
 553 align 16
 554 rk1 :
 555 DQ 0x00000000ccaa009e
 556 rk2 :
 557 DQ 0x00000001751997d0
 558 rk3 :
 559 DQ 0x000000014a7fe880
 560 rk4 :
 561 DQ 0x00000001e88ef372
 562 rk5 :
 563 DQ 0x00000000ccaa009e
 564 rk6 :
 565 DQ 0x0000000163cd6124
 566 rk7 :
 567 DQ 0x00000001f7011640
 568 rk8 :
 569 DQ 0x00000001db710640
 570 rk9 :
 571 DQ 0x00000001d7cfc6ac
 572 rk10 :
 573 DQ 0x00000001ea89367e
 574 rk11 :
 575 DQ 0x000000018cb44e58
 576 rk12 :
 577 DQ 0x00000000df068dc2
 578 rk13 :
 579 DQ 0x00000000ae0b5394
 580 rk14 :
 581 DQ 0x00000001c7569e54
 582 rk15 :
 583 DQ 0x00000001c6e41596
 584 rk16 :
 585 DQ 0x0000000154442bd4
 586 rk17 :
 587 DQ 0x0000000174359406
 588 rk18 :
 589 DQ 0x000000003db1ecdc
 590 rk19 :
 591 DQ 0x000000015a546366
 592 rk20 :
 593 DQ 0x00000000f1da05aa
 594 ; mask keeps the low 64 bits, mask2 clears the low 32 bits, mask3 has bit 7 set in every byte (XORed into a pshufb control mask to complement it)
 595 mask:
 596 dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 597 mask2:
 598 dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 599 mask3:
 600 dq     0x8080808080808080, 0x8080808080808080
 601
 602 pshufb_shf_table:
 603 ; use these values for shift constants for the pshufb instruction
 604 ; the 16 bytes read at pshufb_shf_table + n (n = 1..15) give the values shown below:
 605 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 606 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 607 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 608 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 609 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 610 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 611 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 612 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 613 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 614 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 615 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 616 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 617 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 618 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 619 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 620 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 621 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 622 ; slversion is not defined in this file (presumably a macro provided by reg_sizes.asm -- confirm)
 623 ;;;       func        core, ver, snum
 624 slversion crc32_gzip_refl_by8, 01,   00,  002c