1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT32 crc32_ieee_01(
32 ; UINT32 init_crc, //initial CRC value, 32 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
45 %include "reg_sizes.asm"
; Software-prefetch distance (bytes ahead of the current fold position)
; used by the prefetchnta instructions inside the 128B fold loop.
47 %define fetch_dist 1024
; ABI-dependent argument mapping: Microsoft x64 passes arg1 in rcx,
; System V AMD64 passes it in rdi.  arg1_low32 is the 32-bit init_crc.
; NOTE(review): the %else/%endif lines of this conditional are not visible
; in this excerpt of the file.
53 %ifidn __OUTPUT_FORMAT__, win64
58 %xdefine arg1_low32 ecx
64 %xdefine arg1_low32 edi
; Stack-frame size.  win64 reserves extra room (16*8 bytes at XMM_SAVE) to
; preserve xmm6-xmm13, which are callee-saved in the Microsoft x64 ABI;
; System V needs only scratch space.  NOTE(review): the matching
; %else/%endif lines are not visible in this excerpt.
68 %ifidn __OUTPUT_FORMAT__, win64
70 %define VARIABLE_OFFSET 16*10+8
72 %define VARIABLE_OFFSET 16*2+8
75 global crc32_ieee_01:function
;------------------------------------------------------------------------
; UINT32 crc32_ieee_01(UINT32 init_crc, const unsigned char *buf, UINT64 len)
;
; Carry-less-multiply (PCLMULQDQ) folding CRC.  The 0x04c11db7 constant in
; the rk table below identifies the IEEE 802.3 CRC-32 polynomial.  The
; algorithm follows the Intel "Fast CRC Computation for Generic Polynomials
; Using PCLMULQDQ Instruction" white paper cited in the file header:
; fold 128 bytes at a time across 8 xmm registers, reduce 8 -> 1 register,
; fold remaining 16B chunks, then perform a final 128-bit reduction.
;
; NOTE(review): this excerpt is a sampled view of the file -- the function
; entry label, counter updates, several branch targets and %else/%endif
; lines are missing here.  Comments below describe only the visible code.
;------------------------------------------------------------------------
; init_crc is bitwise-complemented on entry (standard CRC pre-conditioning)
78 not arg1_low32 ;~init_crc
; allocate the stack frame (size is ABI-dependent, see VARIABLE_OFFSET)
80 sub rsp,VARIABLE_OFFSET
82 %ifidn __OUTPUT_FORMAT__, win64
83 ; push the xmm registers into the stack to maintain
; xmm6-xmm13 are callee-saved in the Microsoft x64 ABI and are clobbered
; by the fold loops below, so they are spilled here and restored on exit.
84 movdqa [rsp + XMM_SAVE + 16*0], xmm6
85 movdqa [rsp + XMM_SAVE + 16*1], xmm7
86 movdqa [rsp + XMM_SAVE + 16*2], xmm8
87 movdqa [rsp + XMM_SAVE + 16*3], xmm9
88 movdqa [rsp + XMM_SAVE + 16*4], xmm10
89 movdqa [rsp + XMM_SAVE + 16*5], xmm11
90 movdqa [rsp + XMM_SAVE + 16*6], xmm12
91 movdqa [rsp + XMM_SAVE + 16*7], xmm13
95 ; check if smaller than 256
98 ; for sizes less than 256, we can't fold 128B at a time...
102 ; load the initial crc value
103 movd xmm10, arg1_low32 ; initial crc
105 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
106 ; because data will be byte-reflected and will align with initial crc at correct place.
; SHUF_MASK (data section below) byte-reflects a 16B lane via pshufb
109 movdqa xmm11, [SHUF_MASK]
110 ; receive the initial 128B data, xor the initial crc value
111 movdqu xmm0, [arg2+16*0]
112 movdqu xmm1, [arg2+16*1]
113 movdqu xmm2, [arg2+16*2]
114 movdqu xmm3, [arg2+16*3]
115 movdqu xmm4, [arg2+16*4]
116 movdqu xmm5, [arg2+16*5]
117 movdqu xmm6, [arg2+16*6]
118 movdqu xmm7, [arg2+16*7]
121 ; XOR the initial_crc value
; rk3/rk4 are the fold-by-128-byte constants (x^(128*8+64) and x^(128*8)
; mod P per the white-paper scheme -- TODO confirm, rk labels not visible)
131 movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
132 ;imm value of pclmulqdq instruction will determine which constant to use
133 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
134 ; we subtract 256 instead of 128 to save one instruction from the loop
137 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
138 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
141 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
144 ; update the buffer pointer
145 add arg2, 128 ; buf += 128;
; non-temporal prefetch: stream the buffer through without polluting cache
147 prefetchnta [arg2+fetch_dist+0]
148 movdqu xmm9, [arg2+16*0]
149 movdqu xmm12, [arg2+16*1]
; each pair folds one xmm: low*rk3 (imm 0x0) xor high*rk4 (imm 0x11),
; then (in lines not visible here) xor with the newly loaded data
154 pclmulqdq xmm0, xmm10, 0x0
155 pclmulqdq xmm8, xmm10 , 0x11
156 pclmulqdq xmm1, xmm10, 0x0
157 pclmulqdq xmm13, xmm10 , 0x11
163 prefetchnta [arg2+fetch_dist+32]
164 movdqu xmm9, [arg2+16*2]
165 movdqu xmm12, [arg2+16*3]
170 pclmulqdq xmm2, xmm10, 0x0
171 pclmulqdq xmm8, xmm10 , 0x11
172 pclmulqdq xmm3, xmm10, 0x0
173 pclmulqdq xmm13, xmm10 , 0x11
179 prefetchnta [arg2+fetch_dist+64]
180 movdqu xmm9, [arg2+16*4]
181 movdqu xmm12, [arg2+16*5]
186 pclmulqdq xmm4, xmm10, 0x0
187 pclmulqdq xmm8, xmm10 , 0x11
188 pclmulqdq xmm5, xmm10, 0x0
189 pclmulqdq xmm13, xmm10 , 0x11
195 prefetchnta [arg2+fetch_dist+96]
196 movdqu xmm9, [arg2+16*6]
197 movdqu xmm12, [arg2+16*7]
202 pclmulqdq xmm6, xmm10, 0x0
203 pclmulqdq xmm8, xmm10 , 0x11
204 pclmulqdq xmm7, xmm10, 0x0
205 pclmulqdq xmm13, xmm10 , 0x11
213 ; check if there is another 128B in the buffer to be able to fold
215 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
219 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
220 ; the 128 of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
223 ; fold the 8 xmm registers to 1 xmm register with different constants
; cascade: each step folds one more register into the running xmm7
; accumulator using a distance-specific constant pair (loads not visible)
227 pclmulqdq xmm0, xmm10, 0x11
228 pclmulqdq xmm8, xmm10, 0x0
234 pclmulqdq xmm1, xmm10, 0x11
235 pclmulqdq xmm8, xmm10, 0x0
241 pclmulqdq xmm2, xmm10, 0x11
242 pclmulqdq xmm8, xmm10, 0x0
248 pclmulqdq xmm3, xmm10, 0x11
249 pclmulqdq xmm8, xmm10, 0x0
255 pclmulqdq xmm4, xmm10, 0x11
256 pclmulqdq xmm8, xmm10, 0x0
262 pclmulqdq xmm5, xmm10, 0x11
263 pclmulqdq xmm8, xmm10, 0x0
; rk1/rk2 fold by a single 16B distance; also used by _16B_reduction_loop
267 movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268 ;imm value of pclmulqdq instruction will determine which constant to use
270 pclmulqdq xmm6, xmm10, 0x11
271 pclmulqdq xmm8, xmm10, 0x0
276 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277 ; instead of a cmp instruction, we use the negative flag with the jl instruction
279 jl _final_reduction_for_128
281 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282 ; we can fold 16 bytes at a time if y>=16
283 ; continue folding 16B at a time
; fold accumulator one 16B step: high half * rk2, low half * rk1
287 pclmulqdq xmm7, xmm10, 0x11
288 pclmulqdq xmm8, xmm10, 0x0
295 ; instead of a cmp instruction, we utilize the flags with the jge instruction
296 ; equivalent of: cmp arg3, 16-16
297 ; check if there is any more 16B in the buffer to be able to fold
298 jge _16B_reduction_loop
300 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
301 ;first, we reduce the data in the xmm7 register
304 _final_reduction_for_128:
305 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
309 ; here we are getting data that is less than 16 bytes.
310 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311 ; after that the registers need to be adjusted.
; overlapped tail load: last 16 bytes ending exactly at buf+len
315 movdqu xmm1, [arg2 - 16 + arg3]
318 ; get rid of the extra data that was loaded before
319 ; load the shift constant
; pshufb_shf_table+16-arg3 yields the control mask that shifts by arg3
320 lea rax, [pshufb_shf_table + 16]
324 ; shift xmm2 to the left by arg3 bytes
327 ; shift xmm7 to the right by 16-arg3 bytes
; merge the shifted old accumulator with the tail bytes (xmm0 = mask)
330 pblendvb xmm1, xmm2 ;xmm0 is implicit
; fold the merged two registers down to one
335 pclmulqdq xmm7, xmm10, 0x11
336 pclmulqdq xmm8, xmm10, 0x0
341 ; compute crc of a 128-bit value
; rk5/rk6 reduce 128 bits -> 64 bits
342 movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
346 pclmulqdq xmm7, xmm10, 0x1
356 pclmulqdq xmm7, xmm10, 0x10
; Barrett reduction: rk7 = floor(x^64/P) quotient, rk8 = polynomial P
361 movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
363 pclmulqdq xmm7, xmm10, 0x01
365 pclmulqdq xmm7, xmm10, 0x11
373 %ifidn __OUTPUT_FORMAT__, win64
; restore the callee-saved xmm registers spilled in the prologue
374 movdqa xmm6, [rsp + XMM_SAVE + 16*0]
375 movdqa xmm7, [rsp + XMM_SAVE + 16*1]
376 movdqa xmm8, [rsp + XMM_SAVE + 16*2]
377 movdqa xmm9, [rsp + XMM_SAVE + 16*3]
378 movdqa xmm10, [rsp + XMM_SAVE + 16*4]
379 movdqa xmm11, [rsp + XMM_SAVE + 16*5]
380 movdqa xmm12, [rsp + XMM_SAVE + 16*6]
381 movdqa xmm13, [rsp + XMM_SAVE + 16*7]
; release the stack frame (matches the prologue sub)
383 add rsp,VARIABLE_OFFSET
387 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; --- small-buffer path: len < 256, fold 16B at a time ---
395 ; check if there is enough buffer to be able to fold 16B at a time
398 movdqa xmm11, [SHUF_MASK]
400 ; if there is, load the constants
401 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
403 movd xmm0, arg1_low32 ; get the initial crc value
404 pslldq xmm0, 12 ; align it to its correct place
405 movdqu xmm7, [arg2] ; load the plaintext
406 pshufb xmm7, xmm11 ; byte-reflect the plaintext
410 ; update the buffer pointer
413 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
416 jmp _16B_reduction_loop
; --- very small buffer path (len < 32) ---
421 ; mov initial crc to the return value. this is necessary for zero-length buffers.
426 movdqa xmm11, [SHUF_MASK]
428 movd xmm0, arg1_low32 ; get the initial crc value
429 pslldq xmm0, 12 ; align it to its correct place
433 jl _less_than_16_left
; exactly 16..31 bytes: load one full block and fold/merge the remainder
435 movdqu xmm7, [arg2] ; load the plaintext
436 pshufb xmm7, xmm11 ; byte-reflect the plaintext
437 pxor xmm7, xmm0 ; xor the initial crc value
440 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
441 jmp _get_last_two_xmms
446 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
455 ; backup the counter value
; the remaining visible lines are the per-size tail handlers; each merges
; the byte-reflected input with the positioned initial crc value
498 pxor xmm7, xmm0 ; xor the initial crc value
501 lea rax, [pshufb_shf_table + 16]
513 pxor xmm7, xmm0 ; xor the initial crc value
533 pxor xmm7, xmm0 ; xor the initial crc value
551 pxor xmm7, xmm0 ; xor the initial crc value
564 pxor xmm7, xmm0 ; xor the initial crc value
572 ; precomputed constants
; Fold/reduction constants for the PCLMULQDQ CRC scheme (white paper cited
; in the header).  NOTE(review): the rk1:..rk20: labels between these DQ
; lines are not visible in this excerpt; pairing below is presumed from the
; [rk1]/[rk3]/[rk5]/[rk7] loads in the code -- confirm against full file.
576 DQ 0xf200aa6600000000
578 DQ 0x17d3315d00000000
580 DQ 0x022ffca500000000
582 DQ 0x9d9ee22f00000000
584 DQ 0xf200aa6600000000
586 DQ 0x490d678d00000000
; Barrett constants: quotient floor(x^64/P) and the IEEE 802.3 polynomial
; P = 0x104c11db7 (the presence of 0x04c11db7 grounds the "ieee" name)
588 DQ 0x0000000104d101df
590 DQ 0x0000000104c11db7
; remaining constants presumably serve the 8-register fold cascade
; (rk9..rk20) -- TODO confirm, labels not visible here
592 DQ 0x6ac7e7d700000000
594 DQ 0xfcd922af00000000
596 DQ 0x34e45a6300000000
598 DQ 0x8762c1f600000000
600 DQ 0x5395a0ea00000000
602 DQ 0x54f2d5c700000000
604 DQ 0xd3504ec700000000
606 DQ 0x57a8445500000000
608 DQ 0xc053585d00000000
610 DQ 0x766f1b7800000000
612 DQ 0xcd8c54b500000000
614 DQ 0xab40b71e00000000
; mask data below (labels not visible): 0x80-filled pshufb zeroing pattern,
; a low-96-bit keep mask, and the SHUF_MASK byte-reflection control
625 dq 0x8080808080808080, 0x8080808080808080
627 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
628 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
633 ; use these values for shift constants for the pshufb instruction
634 ; different alignments result in values as shown:
; (loaded via lea rax,[pshufb_shf_table+16]; sub rax,len so the control
; mask read at [rax] straddles the two 16B rows below)
635 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
636 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
637 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
638 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
639 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
640 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
641 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
642 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
643 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
644 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
645 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
646 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
647 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
648 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
649 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; The 0x00 bytes at offsets 0 and 31 are never read: len here is 1..15, so
; the load window [16-len, 31-len] excludes both extreme bytes.
650 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
651 dq 0x0706050403020100, 0x000e0d0c0b0a0908
653 ;;; func core, ver, snum
; embeds a version string for this function (slversion macro presumably
; defined in reg_sizes.asm -- not visible in this excerpt)
654 slversion crc32_ieee_01, 01, 06, 0011