1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT16 crc16_t10dif_02(
32 ; UINT16 init_crc, //initial CRC value, 16 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
; NASM x86-64 source (Intel syntax).
; NOTE(review): this chunk is a partial listing -- the embedded original line
; numbers are non-contiguous, so intervening lines (e.g. the %else/%endif of
; the conditionals below, and the arg2/arg3 register %defines) are not visible.
45 %include "reg_sizes.asm"
; Prefetch distance (bytes) used by the prefetchnta instructions inside the
; 128-byte fold loop.
47 %define fetch_dist 1024
; Map the low 32 bits of the first integer argument per ABI:
; rcx on Win64 ...
54 %ifidn __OUTPUT_FORMAT__, win64
59 %xdefine arg1_low32 ecx
; ... rdi on System V AMD64 (the %else between these two lines is outside
; this view).
65 %xdefine arg1_low32 edi
68 %ifidn __OUTPUT_FORMAT__, win64
; Win64 frame: slots at [rsp+16*2]..[rsp+16*9] hold callee-saved xmm6-xmm13
; (8 registers x 16 bytes), plus scratch/alignment below them.
70 %define VARIABLE_OFFSET 16*10+8
; Non-Windows frame: only scratch space -- SysV has no callee-saved xmm regs.
72 %define VARIABLE_OFFSET 16*2+8
;-----------------------------------------------------------------------------
; UINT16 crc16_t10dif_02(UINT16 init_crc, const unsigned char *buf, UINT64 len)
;
; Folded-carryless-multiply (VPCLMULQDQ/AVX) CRC16 T10-DIF implementation.
; In:   arg1_low32 = init_crc (ecx on Win64, edi on SysV),
;       arg2 = buf, arg3 = len (arg2/arg3 %defines are outside this view).
; Out:  eax = CRC16. The CRC is computed as a 32-bit CRC of the scaled
;       polynomial; the result is shifted right 16 bits before returning.
; Clobbers xmm0-xmm13; on Win64, callee-saved xmm6-xmm13 are spilled to the
; stack frame and restored in the epilogue.
;
; NOTE(review): partial listing -- the function label, several loads/XORs,
; loop labels (_fold_128_B_loop, _16B_reduction_loop, ...), compare/branch
; instructions and %endif lines fall in the gaps between the embedded line
; numbers and are not visible here.
;-----------------------------------------------------------------------------
76 global crc16_t10dif_02:ISAL_SYM_TYPE_FUNCTION
79 ; adjust the 16-bit initial_crc value, scale it to 32 bits
82 ; After this point, code flow is exactly same as a 32-bit CRC.
83 ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
; Allocate the stack frame (size depends on ABI -- see VARIABLE_OFFSET).
85 sub rsp, VARIABLE_OFFSET
86 %ifidn __OUTPUT_FORMAT__, win64
87 ; push the xmm registers into the stack to maintain
; (Win64 requires xmm6-xmm15 to be preserved across calls; only xmm6-xmm13
; are used by this routine.)
88 vmovdqa [rsp+16*2],xmm6
89 vmovdqa [rsp+16*3],xmm7
90 vmovdqa [rsp+16*4],xmm8
91 vmovdqa [rsp+16*5],xmm9
92 vmovdqa [rsp+16*6],xmm10
93 vmovdqa [rsp+16*7],xmm11
94 vmovdqa [rsp+16*8],xmm12
95 vmovdqa [rsp+16*9],xmm13
98 ; check if smaller than 256
101 ; for sizes less than 256, we can't fold 128B at a time...
105 ; load the initial crc value
106 vmovd xmm10, arg1_low32 ; initial crc
108 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
109 ; because data will be byte-reflected and will align with initial crc at correct place.
; SHUF_MASK reverses byte order within the xmm register (big-endian reflect).
112 vmovdqa xmm11, [SHUF_MASK]
113 ; receive the initial 128B data, xor the initial crc value
114 vmovdqu xmm0, [arg2+16*0]
115 vmovdqu xmm1, [arg2+16*1]
116 vmovdqu xmm2, [arg2+16*2]
117 vmovdqu xmm3, [arg2+16*3]
118 vmovdqu xmm4, [arg2+16*4]
119 vmovdqu xmm5, [arg2+16*5]
120 vmovdqu xmm6, [arg2+16*6]
121 vmovdqu xmm7, [arg2+16*7]
124 ; XOR the initial_crc value
; (the vpshufb byte reflections and the vpxor of xmm10 into xmm0 sit in the
; listing gap above)
134 vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
135 ;imm value of pclmulqdq instruction will determine which constant to use
136 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ; we subtract 256 instead of 128 to save one instruction from the loop
140 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
141 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
144 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
147 ; update the buffer pointer
148 add arg2, 128 ; buf += 128;
150 prefetchnta [arg2+fetch_dist+0]
151 vmovdqu xmm9, [arg2+16*0]
152 vmovdqu xmm12, [arg2+16*1]
; NASM two-operand AVX shorthand: dst is also first source.
; imm 0x00 multiplies the low 64-bit halves, 0x11 the high halves; each
; 128-bit lane is folded as lo*rk3 XOR hi*rk4 XOR new_data (the vmovdqa
; copies to xmm8/xmm13 and the combining vpxor/vpshufb lines are in the gaps).
157 vpclmulqdq xmm0, xmm10, 0x0
158 vpclmulqdq xmm8, xmm10 , 0x11
159 vpclmulqdq xmm1, xmm10, 0x0
160 vpclmulqdq xmm13, xmm10 , 0x11
166 prefetchnta [arg2+fetch_dist+32]
167 vmovdqu xmm9, [arg2+16*2]
168 vmovdqu xmm12, [arg2+16*3]
; Same fold step for lanes 2 and 3.
173 vpclmulqdq xmm2, xmm10, 0x0
174 vpclmulqdq xmm8, xmm10 , 0x11
175 vpclmulqdq xmm3, xmm10, 0x0
176 vpclmulqdq xmm13, xmm10 , 0x11
182 prefetchnta [arg2+fetch_dist+64]
183 vmovdqu xmm9, [arg2+16*4]
184 vmovdqu xmm12, [arg2+16*5]
; Same fold step for lanes 4 and 5.
189 vpclmulqdq xmm4, xmm10, 0x0
190 vpclmulqdq xmm8, xmm10 , 0x11
191 vpclmulqdq xmm5, xmm10, 0x0
192 vpclmulqdq xmm13, xmm10 , 0x11
198 prefetchnta [arg2+fetch_dist+96]
199 vmovdqu xmm9, [arg2+16*6]
200 vmovdqu xmm12, [arg2+16*7]
; Same fold step for lanes 6 and 7.
205 vpclmulqdq xmm6, xmm10, 0x0
206 vpclmulqdq xmm8, xmm10 , 0x11
207 vpclmulqdq xmm7, xmm10, 0x0
208 vpclmulqdq xmm13, xmm10 , 0x11
216 ; check if there is another 128B in the buffer to be able to fold
218 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
223 ; fold the 8 xmm registers to 1 xmm register with different constants
; Each step below folds one xmm lane into the running xmm7 accumulator using
; a dedicated rk pair (rk9/rk10 ... rk19/rk20, loaded from the table at the
; end of the file; the vmovdqa [rk9] and the vpxor combines are in the gaps).
227 vpclmulqdq xmm0, xmm10, 0x11
228 vpclmulqdq xmm8, xmm10, 0x0
232 vmovdqa xmm10, [rk11]
234 vpclmulqdq xmm1, xmm10, 0x11
235 vpclmulqdq xmm8, xmm10, 0x0
239 vmovdqa xmm10, [rk13]
241 vpclmulqdq xmm2, xmm10, 0x11
242 vpclmulqdq xmm8, xmm10, 0x0
246 vmovdqa xmm10, [rk15]
248 vpclmulqdq xmm3, xmm10, 0x11
249 vpclmulqdq xmm8, xmm10, 0x0
253 vmovdqa xmm10, [rk17]
255 vpclmulqdq xmm4, xmm10, 0x11
256 vpclmulqdq xmm8, xmm10, 0x0
260 vmovdqa xmm10, [rk19]
262 vpclmulqdq xmm5, xmm10, 0x11
263 vpclmulqdq xmm8, xmm10, 0x0
267 vmovdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268 ;imm value of pclmulqdq instruction will determine which constant to use
270 vpclmulqdq xmm6, xmm10, 0x11
271 vpclmulqdq xmm8, xmm10, 0x0
276 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277 ; instead of a cmp instruction, we use the negative flag with the jl instruction
279 jl _final_reduction_for_128
281 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282 ; we can fold 16 bytes at a time if y>=16
283 ; continue folding 16B at a time
; _16B_reduction_loop body: fold xmm7 with rk1/rk2 and XOR in the next 16
; input bytes (label, load and XOR lines are in the listing gaps).
287 vpclmulqdq xmm7, xmm10, 0x11
288 vpclmulqdq xmm8, xmm10, 0x0
295 ; instead of a cmp instruction, we utilize the flags with the jge instruction
296 ; equivalent of: cmp arg3, 16-16
297 ; check if there is any more 16B in the buffer to be able to fold
298 jge _16B_reduction_loop
300 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
301 ;first, we reduce the data in the xmm7 register
304 _final_reduction_for_128:
305 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
309 ; here we are getting data that is less than 16 bytes.
310 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311 ; after that the registers need to be adjusted.
; _get_last_two_xmms path: load the final (overlapping) 16 bytes.
315 vmovdqu xmm1, [arg2 - 16 + arg3]
318 ; get rid of the extra data that was loaded before
319 ; load the shift constant
; pshufb_shf_table+16 is indexed backwards by (16 - remaining) bytes to
; produce the shuffle control for the variable shift.
320 lea rax, [pshufb_shf_table + 16]
324 ; shift xmm2 to the left by arg3 bytes
327 ; shift xmm7 to the right by 16-arg3 bytes
; Merge the shifted pieces: blend xmm2 into xmm1 under mask xmm0.
330 vpblendvb xmm1, xmm1, xmm2, xmm0
; Fold the two final xmm registers into one with rk1/rk2.
335 vpclmulqdq xmm7, xmm10, 0x11
336 vpclmulqdq xmm8, xmm10, 0x0
341 ; compute crc of a 128-bit value
342 vmovdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
; 128 -> 64 bit reduction: imm 0x01 = hi(xmm7) * lo(rk pair) ...
346 vpclmulqdq xmm7, xmm10, 0x1
; ... then imm 0x10 = lo * hi(rk6) to finish folding to 32+32 bits.
356 vpclmulqdq xmm7, xmm10, 0x10
; Barrett reduction: rk7 = floor(2^64/Q), rk8 = Q (the polynomial).
361 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
363 vpclmulqdq xmm7, xmm10, 0x01
365 vpclmulqdq xmm7, xmm10, 0x11
372 ; scale the result back to 16 bits
; (the shr eax, 16 lives in the gap above; epilogue follows)
374 %ifidn __OUTPUT_FORMAT__, win64
; Restore the Win64 callee-saved xmm registers spilled in the prologue.
375 vmovdqa xmm6, [rsp+16*2]
376 vmovdqa xmm7, [rsp+16*3]
377 vmovdqa xmm8, [rsp+16*4]
378 vmovdqa xmm9, [rsp+16*5]
379 vmovdqa xmm10, [rsp+16*6]
380 vmovdqa xmm11, [rsp+16*7]
381 vmovdqa xmm12, [rsp+16*8]
382 vmovdqa xmm13, [rsp+16*9]
; Tear down the stack frame (ret is in the listing gap).
384 add rsp, VARIABLE_OFFSET
388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; _less_than_256 path (label in gap): buffers of 32..255 bytes, folded 16B
; at a time.
396 ; check if there is enough buffer to be able to fold 16B at a time
399 vmovdqa xmm11, [SHUF_MASK]
401 ; if there is, load the constants
402 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
404 vmovd xmm0, arg1_low32 ; get the initial crc value
405 vpslldq xmm0, 12 ; align it to its correct place
406 vmovdqu xmm7, [arg2] ; load the plaintext
407 vpshufb xmm7, xmm11 ; byte-reflect the plaintext
411 ; update the buffer pointer
414 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
417 jmp _16B_reduction_loop
; _less_than_32 path (label in gap): 0..31 bytes remain.
422 ; mov initial crc to the return value. this is necessary for zero-length buffers.
427 vmovdqa xmm11, [SHUF_MASK]
429 vmovd xmm0, arg1_low32 ; get the initial crc value
430 vpslldq xmm0, 12 ; align it to its correct place
434 jl _less_than_16_left
436 vmovdqu xmm7, [arg2] ; load the plaintext
437 vpshufb xmm7, xmm11 ; byte-reflect the plaintext
438 vpxor xmm7, xmm0 ; xor the initial crc value
441 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
442 jmp _get_last_two_xmms
; _less_than_16_left and the per-size tail cases follow; only fragments are
; visible below (embedded line numbers jump 456 -> 499 -> 513 -> ... -> 564).
447 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
456 ; backup the counter value
499 vpxor xmm7, xmm0 ; xor the initial crc value
501 lea rax, [pshufb_shf_table + 16]
513 vpxor xmm7, xmm0 ; xor the initial crc value
533 vpxor xmm7, xmm0 ; xor the initial crc value
551 vpxor xmm7, xmm0 ; xor the initial crc value
564 vpxor xmm7, xmm0 ; xor the initial crc value
; NOTE(review): the data labels (rk1: ... rk20:, the mask tables, SHUF_MASK,
; pshufb_shf_table) fall in the listing gaps between the embedded line
; numbers; only the DQ payloads are visible here.
572 ; precomputed constants
573 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
576 ; rk1 = 2^(32*3) mod Q << 32
577 ; rk2 = 2^(32*5) mod Q << 32
578 ; rk3 = 2^(32*15) mod Q << 32
579 ; rk4 = 2^(32*17) mod Q << 32
580 ; rk5 = 2^(32*3) mod Q << 32
581 ; rk6 = 2^(32*2) mod Q << 32
582 ; rk7 = floor(2^64/Q)
; rk1..rk8 payloads (note rk5 == rk1 by construction; rk8 = Q = 0x8bb70000
; with the leading bit, per the rk7/rk8 Barrett pair).
585 DQ 0x2d56000000000000
587 DQ 0x06df000000000000
589 DQ 0x9d9d000000000000
591 DQ 0x7cf5000000000000
593 DQ 0x2d56000000000000
595 DQ 0x1368000000000000
597 DQ 0x00000001f65a57f8
599 DQ 0x000000018bb70000
; rk9..rk20: per-lane fold constants used when collapsing the 8 parallel
; xmm accumulators down to one (distances 128-16 ... 128-112 bytes).
602 DQ 0xceae000000000000
604 DQ 0xbfd6000000000000
606 DQ 0x1e16000000000000
608 DQ 0x713c000000000000
610 DQ 0xf7f9000000000000
612 DQ 0x80a6000000000000
614 DQ 0x044c000000000000
616 DQ 0xe658000000000000
618 DQ 0xad18000000000000
620 DQ 0xa497000000000000
622 DQ 0x6ee3000000000000
624 DQ 0xe7b5000000000000
; 0x80 in every byte: pshufb control that zeroes the result byte -- used for
; the variable-shift masking (label presumably mask3; not visible here).
627 dq 0x8080808080808080, 0x8080808080808080
; Partial-register mask (label not visible in this listing).
629 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; SHUF_MASK payload: reverses all 16 bytes of an xmm register (byte reflect).
632 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
635 ; use these values for shift constants for the pshufb instruction
636 ; different alignments result in values as shown:
637 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
638 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
639 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
640 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
641 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
642 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
643 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
644 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
645 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
646 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
647 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
648 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
649 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
650 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
651 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; The two table rows actually emitted; the lea [pshufb_shf_table + 16] in the
; code indexes into/around these to derive the per-remainder shuffle control.
652 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
653 dq 0x0706050403020100, 0x000e0d0c0b0a0908