ceph/src/isa-l/crc/crc32_ieee_01.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;       Function API:
  31 ;       UINT32 crc32_ieee_01(
  32 ;               UINT32 init_crc, //initial CRC value, 32 bits
  33 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  34 ;               UINT64 len //buffer length in bytes (64-bit data)
  35 ;       );
  36 ;
  37 ;       Authors:
  38 ;               Erdinc Ozturk
  39 ;               Vinodh Gopal
  40 ;               James Guilford
  41 ;
  42 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  43 ;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  44
  45 %include "reg_sizes.asm"
  46
  47 [bits 64]
  48 default rel
  49
  50 section .text
  51
  52 %ifidn __OUTPUT_FORMAT__, win64
  53         %xdefine        arg1 rcx
  54         %xdefine        arg2 rdx
  55         %xdefine        arg3 r8
  56         ; 32-bit view of arg1; the function's first argument (init_crc) is a 32-bit value
  57         %xdefine        arg1_low32 ecx
  58 %else
  59         %xdefine        arg1 rdi
  60         %xdefine        arg2 rsi
  61         %xdefine        arg3 rdx
  62         ; 32-bit view of arg1; the function's first argument (init_crc) is a 32-bit value
  63         %xdefine        arg1_low32 edi
  64 %endif
  65 ; stack frame, as byte offsets from rsp after the prologue's 'sub rsp,VARIABLE_OFFSET': XMM_SAVE is the win64 save area for xmm6-xmm13, and the 16 bytes at [rsp] are scratch for _less_than_16_left (TMP is not referenced by name in this file). The trailing +8 makes rsp 16-byte aligned after the subtraction (rsp is 8 mod 16 at function entry), which the movdqa accesses to the frame require.
  66 %define TMP 16*0
  67 %ifidn __OUTPUT_FORMAT__, win64
  68         %define XMM_SAVE 16*2
  69         %define VARIABLE_OFFSET 16*10+8
  70 %else
  71         %define VARIABLE_OFFSET 16*2+8
  72 %endif
  73 align 16
  74 global  crc32_ieee_01:function
  75 crc32_ieee_01:
  76         ; in: arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes; out: eax = crc
  77         not     arg1_low32      ; arg1_low32 = ~init_crc; the result is inverted again at _cleanup
  78
  79         sub     rsp,VARIABLE_OFFSET     ; reserve the stack frame (released at _cleanup)
  80
  81 %ifidn __OUTPUT_FORMAT__, win64
  82         ; save xmm6-xmm13, which this routine overwrites; they are reloaded at _cleanup
  83         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
  84         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
  85         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
  86         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
  87         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
  88         movdqa  [rsp + XMM_SAVE + 16*5], xmm11
  89         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
  90         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
  91 %endif
  92
  93
  94         ; check if smaller than 256
  95         cmp     arg3, 256
  96
  97         ; for sizes less than 256, we can't fold 128B at a time...
  98         jl      _less_than_256
  99         ; NOTE(review): jl is a signed branch, so a len with bit 63 set would also take the short-buffer path
 100
 101         ; load the initial crc value
 102         movd    xmm10, arg1_low32       ; xmm10 = ~init_crc in the low dword, upper bits zero
 103
 104         ; the crc value is not byte-reflected; it is shifted into the top 4 bytes of the register
 105         ; so that it lines up with the first 4 buffer bytes once the data has been byte-reflected.
 106         pslldq  xmm10, 12
 107
 108         movdqa xmm11, [SHUF_MASK]       ; xmm11 = pshufb control that reverses the 16 bytes of a register
 109         ; load the first 128B of data into xmm0-xmm7
 110         movdqu  xmm0, [arg2+16*0]
 111         movdqu  xmm1, [arg2+16*1]
 112         movdqu  xmm2, [arg2+16*2]
 113         movdqu  xmm3, [arg2+16*3]
 114         movdqu  xmm4, [arg2+16*4]
 115         movdqu  xmm5, [arg2+16*5]
 116         movdqu  xmm6, [arg2+16*6]
 117         movdqu  xmm7, [arg2+16*7]
 118
 119         pshufb  xmm0, xmm11
 120         ; XOR the initial crc into the first (byte-reflected) 16B block only
 121         pxor    xmm0, xmm10
 122         pshufb  xmm1, xmm11
 123         pshufb  xmm2, xmm11
 124         pshufb  xmm3, xmm11
 125         pshufb  xmm4, xmm11
 126         pshufb  xmm5, xmm11
 127         pshufb  xmm6, xmm11
 128         pshufb  xmm7, xmm11
 129
 130         movdqa  xmm10, [rk3]    ;xmm10 has rk3 (low qword) and rk4 (high qword)
 131                                         ;imm value of pclmulqdq instruction will determine which constant to use
 132         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 133         ; we subtract 256 instead of 128 to save one instruction from the loop
 134         sub     arg3, 256
 135
 136         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 137         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 138
 139
 140         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 141 _fold_128_B_loop:
 142         ; each pass: xmmN = (low qword of xmmN * rk3) xor (high qword of xmmN * rk4) xor next 16B of data, for N = 0..7
 143         ; update the buffer pointer
 144         add     arg2, 128               ;    buf += 128;
 145
 146         movdqu  xmm9, [arg2+16*0]       ; next two 16B blocks of input
 147         movdqu  xmm12, [arg2+16*1]
 148         pshufb  xmm9, xmm11             ; byte-reflect the new data
 149         pshufb  xmm12, xmm11
 150         movdqa  xmm8, xmm0              ; copies, because each register is multiplied twice
 151         movdqa  xmm13, xmm1
 152         pclmulqdq       xmm0, xmm10, 0x0        ; low qword of xmm0 * low qword of xmm10 (rk3)
 153         pclmulqdq       xmm8, xmm10 , 0x11      ; high qword of old xmm0 * high qword of xmm10 (rk4)
 154         pclmulqdq       xmm1, xmm10, 0x0
 155         pclmulqdq       xmm13, xmm10 , 0x11
 156         pxor    xmm0, xmm9              ; fold in the new data
 157         xorps   xmm0, xmm8              ; xorps performs the same bitwise XOR as pxor
 158         pxor    xmm1, xmm12
 159         xorps   xmm1, xmm13
 160         ; same fold for xmm2/xmm3 with the next two 16B blocks
 161         movdqu  xmm9, [arg2+16*2]
 162         movdqu  xmm12, [arg2+16*3]
 163         pshufb  xmm9, xmm11
 164         pshufb  xmm12, xmm11
 165         movdqa  xmm8, xmm2
 166         movdqa  xmm13, xmm3
 167         pclmulqdq       xmm2, xmm10, 0x0
 168         pclmulqdq       xmm8, xmm10 , 0x11
 169         pclmulqdq       xmm3, xmm10, 0x0
 170         pclmulqdq       xmm13, xmm10 , 0x11
 171         pxor    xmm2, xmm9
 172         xorps   xmm2, xmm8
 173         pxor    xmm3, xmm12
 174         xorps   xmm3, xmm13
 175         ; same fold for xmm4/xmm5
 176         movdqu  xmm9, [arg2+16*4]
 177         movdqu  xmm12, [arg2+16*5]
 178         pshufb  xmm9, xmm11
 179         pshufb  xmm12, xmm11
 180         movdqa  xmm8, xmm4
 181         movdqa  xmm13, xmm5
 182         pclmulqdq       xmm4, xmm10, 0x0
 183         pclmulqdq       xmm8, xmm10 , 0x11
 184         pclmulqdq       xmm5, xmm10, 0x0
 185         pclmulqdq       xmm13, xmm10 , 0x11
 186         pxor    xmm4, xmm9
 187         xorps   xmm4, xmm8
 188         pxor    xmm5, xmm12
 189         xorps   xmm5, xmm13
 190         ; same fold for xmm6/xmm7
 191         movdqu  xmm9, [arg2+16*6]
 192         movdqu  xmm12, [arg2+16*7]
 193         pshufb  xmm9, xmm11
 194         pshufb  xmm12, xmm11
 195         movdqa  xmm8, xmm6
 196         movdqa  xmm13, xmm7
 197         pclmulqdq       xmm6, xmm10, 0x0
 198         pclmulqdq       xmm8, xmm10 , 0x11
 199         pclmulqdq       xmm7, xmm10, 0x0
 200         pclmulqdq       xmm13, xmm10 , 0x11
 201         pxor    xmm6, xmm9
 202         xorps   xmm6, xmm8
 203         pxor    xmm7, xmm12
 204         xorps   xmm7, xmm13
 205
 206         sub     arg3, 128               ; arg3 was pre-biased by -256, so it stays >= 0 only while another full 128B block is unread
 207
 208         ; check if there is another 128B in the buffer to be able to fold
 209         jge     _fold_128_B_loop
 210         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 211
 212
 213         add     arg2, 128
 214         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
 215         ; the 128 bytes of folded data are in 8 xmm registers: xmm0 - xmm7
 216
 217
 218         ; fold the 8 xmm registers into xmm7: each of xmm0..xmm6 is multiplied by its own constant pair and XORed into xmm7
 219
 220         movdqa  xmm10, [rk9]            ; xmm10 = rk9 (low qword) : rk10 (high qword)
 221         movdqa  xmm8, xmm0
 222         pclmulqdq       xmm0, xmm10, 0x11       ; high qword of xmm0 * rk10
 223         pclmulqdq       xmm8, xmm10, 0x0        ; low qword of xmm0 * rk9
 224         pxor    xmm7, xmm8
 225         xorps   xmm7, xmm0
 226
 227         movdqa  xmm10, [rk11]           ; rk11 : rk12, applied to xmm1
 228         movdqa  xmm8, xmm1
 229         pclmulqdq       xmm1, xmm10, 0x11
 230         pclmulqdq       xmm8, xmm10, 0x0
 231         pxor    xmm7, xmm8
 232         xorps   xmm7, xmm1
 233
 234         movdqa  xmm10, [rk13]           ; rk13 : rk14, applied to xmm2
 235         movdqa  xmm8, xmm2
 236         pclmulqdq       xmm2, xmm10, 0x11
 237         pclmulqdq       xmm8, xmm10, 0x0
 238         pxor    xmm7, xmm8
 239         pxor    xmm7, xmm2
 240
 241         movdqa  xmm10, [rk15]           ; rk15 : rk16, applied to xmm3
 242         movdqa  xmm8, xmm3
 243         pclmulqdq       xmm3, xmm10, 0x11
 244         pclmulqdq       xmm8, xmm10, 0x0
 245         pxor    xmm7, xmm8
 246         xorps   xmm7, xmm3
 247
 248         movdqa  xmm10, [rk17]           ; rk17 : rk18, applied to xmm4
 249         movdqa  xmm8, xmm4
 250         pclmulqdq       xmm4, xmm10, 0x11
 251         pclmulqdq       xmm8, xmm10, 0x0
 252         pxor    xmm7, xmm8
 253         pxor    xmm7, xmm4
 254
 255         movdqa  xmm10, [rk19]           ; rk19 : rk20, applied to xmm5
 256         movdqa  xmm8, xmm5
 257         pclmulqdq       xmm5, xmm10, 0x11
 258         pclmulqdq       xmm8, xmm10, 0x0
 259         pxor    xmm7, xmm8
 260         xorps   xmm7, xmm5
 261
 262         movdqa  xmm10, [rk1]    ;xmm10 has rk1 and rk2; it stays loaded for the 16B folds that follow
 263                                                                         ;imm value of pclmulqdq instruction will determine which constant to use
 264         movdqa  xmm8, xmm6
 265         pclmulqdq       xmm6, xmm10, 0x11
 266         pclmulqdq       xmm8, xmm10, 0x0
 267         pxor    xmm7, xmm8
 268         pxor    xmm7, xmm6
 269
 270
 271         ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
 272         ; instead of a cmp instruction, we use the flags set by the add with the jl instruction
 273         add     arg3, 128-16            ; arg3 = (bytes still unread in memory) - 16
 274         jl      _final_reduction_for_128        ; fewer than 16 unread bytes: skip the 16B loop
 275
 276         ; now we have 16+y bytes left to reduce. 16 Bytes are in register xmm7 and the rest is in memory
 277         ; we can fold 16 bytes at a time if y>=16
 278         ; continue folding 16B at a time
 279         ; expects xmm7 = 16B accumulator, xmm10 = rk1:rk2, xmm11 = SHUF_MASK, arg3 = unread bytes - 16; also entered by jmp from _less_than_256
 280 _16B_reduction_loop:
 281         movdqa  xmm8, xmm7
 282         pclmulqdq       xmm7, xmm10, 0x11       ; high qword of xmm7 * rk2
 283         pclmulqdq       xmm8, xmm10, 0x0        ; low qword of xmm7 * rk1
 284         pxor    xmm7, xmm8
 285         movdqu  xmm0, [arg2]            ; next 16 bytes of input
 286         pshufb  xmm0, xmm11             ; byte-reflect them
 287         pxor    xmm7, xmm0              ; xmm7 = folded accumulator xor new data
 288         add     arg2, 16
 289         sub     arg3, 16
 290         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 291         ; equivalent of: cmp arg3, 16-16
 292         ; check if there is any more 16B in the buffer to be able to fold
 293         jge     _16B_reduction_loop
 294
 295         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 296         ;first, we reduce the data in the xmm7 register
 297
 298
 299 _final_reduction_for_128:
 300         ; check if any more data to fold. If not, compute the CRC of the final 128 bits
 301         add     arg3, 16                ; undo the -16 bias: arg3 = number of bytes still unread (0..15)
 302         je      _128_done
 303
 304         ; here fewer than 16 bytes of data remain.
 305         ; since at least 16 bytes of the buffer precede the end, we load the 16 bytes that end exactly at the end of the buffer (starting before arg2).
 306         ; the bytes already consumed are then discarded by shifting and blending the registers.
 307 _get_last_two_xmms:
 308         movdqa  xmm2, xmm7
 309
 310         movdqu  xmm1, [arg2 - 16 + arg3]        ; last 16 bytes of the buffer; overlaps data already folded
 311         pshufb  xmm1, xmm11
 312
 313         ; get rid of the extra data that was loaded before
 314         ; load the shift constant: 16 bytes at byte offset (16 - arg3) into pshufb_shf_table
 315         lea     rax, [pshufb_shf_table + 16]
 316         sub     rax, arg3
 317         movdqu  xmm0, [rax]
 318
 319         ; shift xmm2 to the left by arg3 bytes
 320         pshufb  xmm2, xmm0
 321
 322         ; shift xmm7 to the right by 16-arg3 bytes
 323         pxor    xmm0, [mask1]           ; flip bit 7 of every control byte: zeroed lanes become selected and vice versa
 324         pshufb  xmm7, xmm0
 325         pblendvb        xmm1, xmm2      ;xmm0 is implicit: bytes whose xmm0 byte has bit 7 set are taken from xmm2
 326
 327         ; fold 16 Bytes
 328         movdqa  xmm2, xmm1
 329         movdqa  xmm8, xmm7
 330         pclmulqdq       xmm7, xmm10, 0x11       ; xmm10 still holds rk1:rk2 here
 331         pclmulqdq       xmm8, xmm10, 0x0
 332         pxor    xmm7, xmm8
 333         pxor    xmm7, xmm2
 334
 335 _128_done:
 336         ; compute crc of the 128-bit value in xmm7
 337         movdqa  xmm10, [rk5]    ; rk5 (low qword) and rk6 (high qword) in xmm10
 338         movdqa  xmm0, xmm7
 339
 340         ;64b fold
 341         pclmulqdq       xmm7, xmm10, 0x1        ; high qword of xmm7 * rk5
 342         pslldq  xmm0, 8                 ; move the low qword of the old value into the high qword
 343         pxor    xmm7, xmm0
 344
 345         ;32b fold
 346         movdqa  xmm0, xmm7
 347
 348         pand    xmm0, [mask2]           ; keep the low 96 bits, clear the top 32
 349
 350         psrldq  xmm7, 12                ; isolate the top 32 bits in the low dword
 351         pclmulqdq       xmm7, xmm10, 0x10       ; low qword of xmm7 * rk6
 352         pxor    xmm7, xmm0
 353
 354         ;barrett reduction (also entered directly from the 1..3 byte paths, which skip the folds above)
 355 _barrett:
 356         movdqa  xmm10, [rk7]    ; rk7 (low qword) and rk8 (high qword) in xmm10
 357         movdqa  xmm0, xmm7
 358         pclmulqdq       xmm7, xmm10, 0x01       ; high qword of xmm7 * rk7
 359         pslldq  xmm7, 4
 360         pclmulqdq       xmm7, xmm10, 0x11       ; high qword of xmm7 * rk8
 361
 362         pslldq  xmm7, 4
 363         pxor    xmm7, xmm0
 364         pextrd  eax, xmm7,1             ; eax = bits 32..63 of xmm7; inverted at _cleanup before returning
 365
 366 _cleanup:
 367         not     eax                     ; final inversion of the result; pairs with 'not arg1_low32' at entry
 368 %ifidn __OUTPUT_FORMAT__, win64
 369         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]   ; reload the registers saved in the prologue
 370         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 371         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 372         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 373         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 374         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 375         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 376         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 377 %endif
 378         add     rsp,VARIABLE_OFFSET     ; release the stack frame reserved at entry
 379         ret
 380
 381
 382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 385 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 386
 387 align 16
 388 _less_than_256:
 389         ; reached from the entry check when len < 256; arg1_low32 already holds ~init_crc
 390         ; check if there is enough buffer to be able to fold 16B at a time
 391         cmp     arg3, 32
 392         jl      _less_than_32
 393         movdqa xmm11, [SHUF_MASK]       ; byte-reversal control, as expected by _16B_reduction_loop
 394
 395         ; if there is, load the constants
 396         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 397
 398         movd    xmm0, arg1_low32        ; get the initial crc value
 399         pslldq  xmm0, 12        ; move it to the top 4 bytes, where the first data bytes land after reflection
 400         movdqu  xmm7, [arg2]    ; load the first 16 bytes of data
 401         pshufb  xmm7, xmm11     ; byte-reflect the data
 402         pxor    xmm7, xmm0      ; xor the initial crc value
 403
 404
 405         ; update the buffer pointer
 406         add     arg2, 16
 407
 408         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 409         sub     arg3, 32        ; arg3 = (unread bytes) - 16, the bias _16B_reduction_loop expects
 410
 411         jmp     _16B_reduction_loop
 412
 413
 414 align 16
 415 _less_than_32:
 416         ; mov initial crc to the return value. this is necessary for zero-length buffers:
 417         mov     eax, arg1_low32         ; eax = ~init_crc; _cleanup inverts it again, so len == 0 returns init_crc
 418         test    arg3, arg3
 419         je      _cleanup
 420
 421         movdqa xmm11, [SHUF_MASK]       ; byte-reversal control for pshufb
 422
 423         movd    xmm0, arg1_low32        ; get the initial crc value
 424         pslldq  xmm0, 12        ; move it to the top 4 bytes of the register
 425
 426         cmp     arg3, 16
 427         je      _exact_16_left
 428         jl      _less_than_16_left
 429         ; 17..31 bytes: take the first 16 as the accumulator and let _get_last_two_xmms handle the remaining 1..15
 430         movdqu  xmm7, [arg2]    ; load the first 16 bytes of data
 431         pshufb  xmm7, xmm11     ; byte-reflect the data
 432         pxor    xmm7, xmm0      ; xor the initial crc value
 433         add     arg2, 16
 434         sub     arg3, 16        ; arg3 = bytes still unread (1..15)
 435         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 436         jmp     _get_last_two_xmms
 437
 438
 439 align 16
 440 _less_than_16_left:
 441         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 442         ; exactly arg3 bytes are copied (in 8/4/2/1-byte pieces), so nothing beyond the end of the buffer is read
 443         pxor    xmm1, xmm1
 444         mov     r11, rsp                ; r11 = write cursor into the 16B scratch area at [rsp]
 445         movdqa  [r11], xmm1
 446
 447         cmp     arg3, 4
 448         jl      _only_less_than_4
 449
 450         ;       backup the counter value (arg3 is consumed by the copy below)
 451         mov     r9, arg3
 452         cmp     arg3, 8
 453         jl      _less_than_8_left
 454
 455         ; load 8 Bytes
 456         mov     rax, [arg2]
 457         mov     [r11], rax
 458         add     r11, 8
 459         sub     arg3, 8
 460         add     arg2, 8
 461 _less_than_8_left:
 462
 463         cmp     arg3, 4
 464         jl      _less_than_4_left
 465
 466         ; load 4 Bytes
 467         mov     eax, [arg2]
 468         mov     [r11], eax
 469         add     r11, 4
 470         sub     arg3, 4
 471         add     arg2, 4
 472 _less_than_4_left:
 473
 474         cmp     arg3, 2
 475         jl      _less_than_2_left
 476
 477         ; load 2 Bytes
 478         mov     ax, [arg2]
 479         mov     [r11], ax
 480         add     r11, 2
 481         sub     arg3, 2
 482         add     arg2, 2
 483 _less_than_2_left:
 484         cmp     arg3, 1
 485         jl      _zero_left
 486
 487         ; load 1 Byte
 488         mov     al, [arg2]
 489         mov     [r11], al
 490 _zero_left:
 491         movdqa  xmm7, [rsp]             ; the copied bytes followed by zero padding
 492         pshufb  xmm7, xmm11
 493         pxor    xmm7, xmm0      ; xor the initial crc value
 494
 495         ; r9 (original byte count, 4..15) is used unscaled as a byte offset; the old 'shl r9, 4' is not executed
 496         lea     rax, [pshufb_shf_table + 16]
 497         sub     rax, r9
 498         movdqu  xmm0, [rax]
 499         pxor    xmm0, [mask1]           ; flip bit 7 of each control byte to get the right-shift form
 500         ; shift xmm7 to the right by 16-r9 bytes
 501         pshufb  xmm7, xmm0
 502         jmp     _128_done
 503
 504 align 16
 505 _exact_16_left:
 506         movdqu  xmm7, [arg2]            ; the whole 16-byte buffer
 507         pshufb  xmm7, xmm11             ; byte-reflect the data
 508         pxor    xmm7, xmm0      ; xor the initial crc value
 509         ; nothing left to fold: go straight to the 128-bit reduction
 510         jmp     _128_done
 511
 512 _only_less_than_4:
 513         cmp     arg3, 3                 ; reached from _less_than_16_left with len 1..3; [r11] = [rsp] is 16 zeroed bytes
 514         jl      _only_less_than_3
 515
 516         ; load 3 Bytes, one at a time
 517         mov     al, [arg2]
 518         mov     [r11], al
 519
 520         mov     al, [arg2+1]
 521         mov     [r11+1], al
 522
 523         mov     al, [arg2+2]
 524         mov     [r11+2], al
 525
 526         movdqa  xmm7, [rsp]
 527         pshufb  xmm7, xmm11
 528         pxor    xmm7, xmm0      ; xor the initial crc value
 529
 530         psrldq  xmm7, 5         ; shift right by 8-3 bytes before entering _barrett
 531         ; the 64b/32b folds at _128_done are skipped on this path
 532         jmp     _barrett
 533 _only_less_than_3:
 534         cmp     arg3, 2
 535         jl      _only_less_than_2
 536
 537         ; load 2 Bytes
 538         mov     al, [arg2]
 539         mov     [r11], al
 540
 541         mov     al, [arg2+1]
 542         mov     [r11+1], al
 543
 544         movdqa  xmm7, [rsp]
 545         pshufb  xmm7, xmm11
 546         pxor    xmm7, xmm0      ; xor the initial crc value
 547
 548         psrldq  xmm7, 6         ; shift right by 8-2 bytes before entering _barrett
 549
 550         jmp     _barrett
 551 _only_less_than_2:
 552         ; len == 1 here (len == 0 was already handled at _less_than_32)
 553         ; load 1 Byte
 554         mov     al, [arg2]
 555         mov     [r11], al
 556
 557         movdqa  xmm7, [rsp]
 558         pshufb  xmm7, xmm11
 559         pxor    xmm7, xmm0      ; xor the initial crc value
 560
 561         psrldq  xmm7, 7         ; shift right by 8-1 bytes before entering _barrett
 562
 563         jmp     _barrett
 564
 565 section .data
 566 ; NOTE(review): nothing in this file writes to these tables; a read-only section may be worth considering
 567 ; precomputed constants
 568 align 16
 569 ; the rk constants are laid out as adjacent qword pairs (rk1:rk2, rk3:rk4, ...): the code loads a pair with one 16-byte movdqa and selects the low or high qword through the pclmulqdq immediate
 570 rk1 :
 571 DQ 0xf200aa6600000000
 572 rk2 :
 573 DQ 0x17d3315d00000000
 574 rk3 :
 575 DQ 0x022ffca500000000
 576 rk4 :
 577 DQ 0x9d9ee22f00000000
 578 rk5 :
 579 DQ 0xf200aa6600000000
 580 rk6 :
 581 DQ 0x490d678d00000000
 582 rk7 :
 583 DQ 0x0000000104d101df
 584 rk8 :
 585 DQ 0x0000000104c11db7 ; presumably the CRC-32 IEEE polynomial 0x04C11DB7 with its leading 1 bit -- confirm against the reference paper
 586 rk9 :
 587 DQ 0x6ac7e7d700000000
 588 rk10 :
 589 DQ 0xfcd922af00000000
 590 rk11 :
 591 DQ 0x34e45a6300000000
 592 rk12 :
 593 DQ 0x8762c1f600000000
 594 rk13 :
 595 DQ 0x5395a0ea00000000
 596 rk14 :
 597 DQ 0x54f2d5c700000000
 598 rk15 :
 599 DQ 0xd3504ec700000000
 600 rk16 :
 601 DQ 0x57a8445500000000
 602 rk17 :
 603 DQ 0xc053585d00000000
 604 rk18 :
 605 DQ 0x766f1b7800000000
 606 rk19 :
 607 DQ 0xcd8c54b500000000
 608 rk20 :
 609 DQ 0xab40b71e00000000
 610 ; the even-numbered rk labels are never referenced by name in the code; each is reached as the high qword of the 16-byte load of the preceding odd-numbered constant
 611
 612
 613
 614
 615
 616
 617
 618 ; mask1, mask2 and SHUF_MASK are used as aligned 16-byte memory operands (pxor/pand/movdqa); they are 16-byte aligned only because exactly 20 qwords follow the 'align 16' above
 619 mask1:
 620 dq 0x8080808080808080, 0x8080808080808080 ; bit 7 of every byte: XORed into a pshufb control to swap which bytes are zeroed and which are selected
 621 mask2:
 622 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF ; keeps the low 96 bits, clears the top 32 bits
 623
 624 SHUF_MASK:
 625 dq 0x08090A0B0C0D0E0F, 0x0001020304050607 ; pshufb control that reverses the order of the 16 bytes
 626
 627 pshufb_shf_table:
 628 ; use these values for shift constants for the pshufb instruction
 629 ; reading 16 bytes at byte offset k (1..15) into the table gives the values shown below:
 630 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 631 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 632 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 633 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 634 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 635 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 636 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 637 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 638 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 639 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 640 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 641 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 642 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 643 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 644 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 645 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 646 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 647 ; the code indexes this 32-byte table at byte offset (16 - n), where n is the number of leftover bytes (1..15)
 648 ;;;       func        core, ver, snum
 649 slversion crc32_ieee_01, 01,   06,  0011
 650