1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT16 crc16_t10dif_01(
32 ; UINT16 init_crc, //initial CRC value, 16 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
45 %include "reg_sizes.asm"
; how many bytes ahead of the current buffer pointer the main loop prefetches
47 %define fetch_dist 1024
; first-integer-argument register differs by ABI: rcx on win64, rdi on SysV;
; arg1_low32 aliases its low 32 bits (holds the 16-bit init_crc zero-extended)
54 %ifidn __OUTPUT_FORMAT__, win64
59 %xdefine arg1_low32 ecx
65 %xdefine arg1_low32 edi
68 %ifidn __OUTPUT_FORMAT__, win64
; win64 frame is larger: room to spill the callee-saved xmm6-xmm13 (8 x 16B)
; plus 8 bytes to keep rsp 16-byte aligned
70 %define VARIABLE_OFFSET 16*10+8
72 %define VARIABLE_OFFSET 16*2+8
;-----------------------------------------------------------------------
; UINT16 crc16_t10dif_01(UINT16 init_crc, const unsigned char *buf, UINT64 len)
; PCLMULQDQ carry-less-multiply CRC folding (see the Intel white paper
; referenced above). The 16-bit CRC is scaled up to 32 bits on entry so
; the folding flow matches the 32-bit CRC code, then scaled back at exit.
;-----------------------------------------------------------------------
76 global crc16_t10dif_01:function
79 ; adjust the 16-bit initial_crc value, scale it to 32 bits
82 ; After this point, code flow is exactly same as a 32-bit CRC.
83 ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
; allocate the stack frame (size is ABI-dependent, see VARIABLE_OFFSET)
85 sub rsp, VARIABLE_OFFSET
86 %ifidn __OUTPUT_FORMAT__, win64
87 ; push the xmm registers into the stack to maintain
; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI; this routine
; clobbers xmm6-xmm13, so spill them here and restore them in the epilogue
88 movdqa [rsp+16*2],xmm6
89 movdqa [rsp+16*3],xmm7
90 movdqa [rsp+16*4],xmm8
91 movdqa [rsp+16*5],xmm9
92 movdqa [rsp+16*6],xmm10
93 movdqa [rsp+16*7],xmm11
94 movdqa [rsp+16*8],xmm12
95 movdqa [rsp+16*9],xmm13
98 ; check if smaller than 256
101 ; for sizes less than 256, we can't fold 128B at a time...
105 ; load the initial crc value
106 movd xmm10, arg1_low32 ; initial crc
108 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
109 ; because data will be byte-reflected and will align with initial crc at correct place.
; SHUF_MASK reverses byte order within the 16B lane (see pshufb uses below)
112 movdqa xmm11, [SHUF_MASK]
113 ; receive the initial 128B data, xor the initial crc value
; xmm0-xmm7 hold the 8 parallel 16-byte folding lanes for the main loop
114 movdqu xmm0, [arg2+16*0]
115 movdqu xmm1, [arg2+16*1]
116 movdqu xmm2, [arg2+16*2]
117 movdqu xmm3, [arg2+16*3]
118 movdqu xmm4, [arg2+16*4]
119 movdqu xmm5, [arg2+16*5]
120 movdqu xmm6, [arg2+16*6]
121 movdqu xmm7, [arg2+16*7]
124 ; XOR the initial_crc value
; rk3/rk4 are the 128-byte-distance folding constants (see constant table)
134 movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
135 ;imm value of pclmulqdq instruction will determine which constant to use
136 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ; we subtract 256 instead of 128 to save one instruction from the loop
140 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
141 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
144 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
147 ; update the buffer pointer
148 add arg2, 128 ; buf += 128;
; Main loop body: each 32B chunk is prefetched, loaded into xmm9/xmm12,
; and folded into two of the eight accumulator lanes. For each lane the
; low qword (imm 0x0) and high qword (imm 0x11) are multiplied by rk3/rk4
; separately and recombined (the xor/pshufb recombination lines are between
; the visible instructions in the full file).
150 prefetchnta [arg2+fetch_dist+0]
151 movdqu xmm9, [arg2+16*0]
152 movdqu xmm12, [arg2+16*1]
; fold lanes 0 and 1
157 pclmulqdq xmm0, xmm10, 0x0
158 pclmulqdq xmm8, xmm10 , 0x11
159 pclmulqdq xmm1, xmm10, 0x0
160 pclmulqdq xmm13, xmm10 , 0x11
166 prefetchnta [arg2+fetch_dist+32]
167 movdqu xmm9, [arg2+16*2]
168 movdqu xmm12, [arg2+16*3]
; fold lanes 2 and 3
173 pclmulqdq xmm2, xmm10, 0x0
174 pclmulqdq xmm8, xmm10 , 0x11
175 pclmulqdq xmm3, xmm10, 0x0
176 pclmulqdq xmm13, xmm10 , 0x11
182 prefetchnta [arg2+fetch_dist+64]
183 movdqu xmm9, [arg2+16*4]
184 movdqu xmm12, [arg2+16*5]
; fold lanes 4 and 5
189 pclmulqdq xmm4, xmm10, 0x0
190 pclmulqdq xmm8, xmm10 , 0x11
191 pclmulqdq xmm5, xmm10, 0x0
192 pclmulqdq xmm13, xmm10 , 0x11
198 prefetchnta [arg2+fetch_dist+96]
199 movdqu xmm9, [arg2+16*6]
200 movdqu xmm12, [arg2+16*7]
; fold lanes 6 and 7
205 pclmulqdq xmm6, xmm10, 0x0
206 pclmulqdq xmm8, xmm10 , 0x11
207 pclmulqdq xmm7, xmm10, 0x0
208 pclmulqdq xmm13, xmm10 , 0x11
216 ; check if there is another 128B in the buffer to be able to fold
218 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
223 ; fold the 8 xmm registers to 1 xmm register with different constants
; each step folds one accumulator lane into xmm7 using a distance-specific
; rk constant pair (the movdqa [rk9..rk19] loads sit between these steps
; in the full file); xmm8 carries the low-qword partial product
227 pclmulqdq xmm0, xmm10, 0x11
228 pclmulqdq xmm8, xmm10, 0x0
234 pclmulqdq xmm1, xmm10, 0x11
235 pclmulqdq xmm8, xmm10, 0x0
241 pclmulqdq xmm2, xmm10, 0x11
242 pclmulqdq xmm8, xmm10, 0x0
248 pclmulqdq xmm3, xmm10, 0x11
249 pclmulqdq xmm8, xmm10, 0x0
255 pclmulqdq xmm4, xmm10, 0x11
256 pclmulqdq xmm8, xmm10, 0x0
262 pclmulqdq xmm5, xmm10, 0x11
263 pclmulqdq xmm8, xmm10, 0x0
; last fold uses rk1/rk2 (the 16-byte-distance constants), which are also
; what the 16B reduction loop below needs already loaded in xmm10
267 movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268 ;imm value of pclmulqdq instruction will determine which constant to use
270 pclmulqdq xmm6, xmm10, 0x11
271 pclmulqdq xmm8, xmm10, 0x0
276 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277 ; instead of a cmp instruction, we use the negative flag with the jl instruction
279 jl _final_reduction_for_128
281 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282 ; we can fold 16 bytes at a time if y>=16
283 ; continue folding 16B at a time
; fold the running 16B remainder in xmm7 over the next 16B of input
; using rk1 (high qword, imm 0x11) and rk2 (low qword, imm 0x0)
287 pclmulqdq xmm7, xmm10, 0x11
288 pclmulqdq xmm8, xmm10, 0x0
295 ; instead of a cmp instruction, we utilize the flags with the jge instruction
296 ; equivalent of: cmp arg3, 16-16
297 ; check if there is any more 16B in the buffer to be able to fold
298 jge _16B_reduction_loop
300 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
301 ;first, we reduce the data in the xmm7 register
304 _final_reduction_for_128:
305 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
309 ; here we are getting data that is less than 16 bytes.
310 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311 ; after that the registers need to be adjusted.
; backward-overlapping load: the last z bytes plus (16-z) bytes already folded
315 movdqu xmm1, [arg2 - 16 + arg3]
318 ; get rid of the extra data that was loaded before
319 ; load the shift constant
; rax indexes into pshufb_shf_table by (16 - z) to build the byte-shift masks
320 lea rax, [pshufb_shf_table + 16]
324 ; shift xmm2 to the left by arg3 bytes
327 ; shift xmm7 to the right by 16-arg3 bytes
; merge the shifted halves; pblendvb selects per-byte using xmm0's sign bits
330 pblendvb xmm1, xmm2 ;xmm0 is implicit
; one more 16B fold to collapse the merged data into a single 128-bit value
335 pclmulqdq xmm7, xmm10, 0x11
336 pclmulqdq xmm8, xmm10, 0x0
341 ; compute crc of a 128-bit value
; 128 -> 64 -> 32-bit reduction with rk5/rk6, then Barrett reduction with
; rk7 (floor(2^64/Q)) and rk8 (the polynomial Q) to get the final CRC
342 movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
346 pclmulqdq xmm7, xmm10, 0x1
356 pclmulqdq xmm7, xmm10, 0x10
361 movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
363 pclmulqdq xmm7, xmm10, 0x01
365 pclmulqdq xmm7, xmm10, 0x11
372 ; scale the result back to 16 bits
; epilogue: restore the win64 callee-saved xmm registers spilled in the
; prologue (same slots, reverse purpose), then release the stack frame
374 %ifidn __OUTPUT_FORMAT__, win64
375 movdqa xmm6, [rsp+16*2]
376 movdqa xmm7, [rsp+16*3]
377 movdqa xmm8, [rsp+16*4]
378 movdqa xmm9, [rsp+16*5]
379 movdqa xmm10, [rsp+16*6]
380 movdqa xmm11, [rsp+16*7]
381 movdqa xmm12, [rsp+16*8]
382 movdqa xmm13, [rsp+16*9]
384 add rsp, VARIABLE_OFFSET
388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Small-buffer path (len < 256): skip the 128B loop and fold 16B at a time.
396 ; check if there is enough buffer to be able to fold 16B at a time
399 movdqa xmm11, [SHUF_MASK]
401 ; if there is, load the constants
402 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
; seed xmm7 with first 16B of (byte-reflected) data xored with the crc,
; which is positioned in the top dword via the 12-byte left shift
404 movd xmm0, arg1_low32 ; get the initial crc value
405 pslldq xmm0, 12 ; align it to its correct place
406 movdqu xmm7, [arg2] ; load the plaintext
407 pshufb xmm7, xmm11 ; byte-reflect the plaintext
411 ; update the buffer pointer
414 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
417 jmp _16B_reduction_loop
; Path for buffers of fewer than 32 bytes
422 ; mov initial crc to the return value. this is necessary for zero-length buffers.
427 movdqa xmm11, [SHUF_MASK]
429 movd xmm0, arg1_low32 ; get the initial crc value
430 pslldq xmm0, 12 ; align it to its correct place
434 jl _less_than_16_left
; exactly 16..31 bytes: one full 16B block in xmm7, remainder handled
; by the overlapping-load merge at _get_last_two_xmms
436 movdqu xmm7, [arg2] ; load the plaintext
437 pshufb xmm7, xmm11 ; byte-reflect the plaintext
438 pxor xmm7, xmm0 ; xor the initial crc value
441 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
442 jmp _get_last_two_xmms
; Sub-16-byte path: the input is copied byte-by-byte into a zeroed 16B
; stack slot, then reduced. The pxor lines below belong to the several
; size-specific branches (8/4/2/1 bytes etc.) of this path — the branch
; labels and copies sit between them in the full file.
447 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
456 ; backup the counter value
499 pxor xmm7, xmm0 ; xor the initial crc value
501 lea rax, [pshufb_shf_table + 16]
513 pxor xmm7, xmm0 ; xor the initial crc value
533 pxor xmm7, xmm0 ; xor the initial crc value
551 pxor xmm7, xmm0 ; xor the initial crc value
564 pxor xmm7, xmm0 ; xor the initial crc value
572 ; precomputed constants
573 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
576 ; rk1 = 2^(32*3) mod Q << 32
577 ; rk2 = 2^(32*5) mod Q << 32
578 ; rk3 = 2^(32*15) mod Q << 32
579 ; rk4 = 2^(32*17) mod Q << 32
580 ; rk5 = 2^(32*3) mod Q << 32
581 ; rk6 = 2^(32*2) mod Q << 32
582 ; rk7 = floor(2^64/Q)
; the quadwords below appear in rk1..rk20 order (their rkN labels are on
; the intervening lines); rk1..rk8 drive the 16B folds and the final
; Barrett reduction, rk9..rk20 the distance-specific 8-to-1 register folds
585 DQ 0x2d56000000000000
587 DQ 0x06df000000000000
589 DQ 0x9d9d000000000000
591 DQ 0x7cf5000000000000
593 DQ 0x2d56000000000000
595 DQ 0x1368000000000000
597 DQ 0x00000001f65a57f8
599 DQ 0x000000018bb70000
602 DQ 0xceae000000000000
604 DQ 0xbfd6000000000000
606 DQ 0x1e16000000000000
608 DQ 0x713c000000000000
610 DQ 0xf7f9000000000000
612 DQ 0x80a6000000000000
614 DQ 0x044c000000000000
616 DQ 0xe658000000000000
618 DQ 0xad18000000000000
620 DQ 0xa497000000000000
622 DQ 0x6ee3000000000000
624 DQ 0xe7b5000000000000
; byte-blend mask (sign bit set in every byte, as required by pblendvb)
635 dq 0x8080808080808080, 0x8080808080808080
637 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; SHUF_MASK: pshufb control that byte-reflects a 16-byte lane
640 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
643 ; use these values for shift constants for the pshufb instruction
644 ; different alignments result in values as shown:
645 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
646 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
647 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
648 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
649 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
650 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
651 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
652 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
653 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
654 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
655 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
656 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
657 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
658 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
659 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; the table data actually emitted (the list above is illustrative, commented out);
; code indexes at [pshufb_shf_table + 16 - z] to derive both shift masks
660 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
661 dq 0x0706050403020100, 0x000e0d0c0b0a0908
663 ;;; func core, ver, snum
; version-stamp macro, presumably defined in reg_sizes.asm — confirm there
664 slversion crc16_t10dif_01, 01, 06, 0010