ceph/src/isa-l/crc/crc16_t10dif_01.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;       Function API:
  31 ;       UINT16 crc16_t10dif_01(
  32 ;               UINT16 init_crc, //initial CRC value, 16 bits
  33 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  34 ;               UINT64 len //buffer length in bytes (64-bit data)
  35 ;       );
  36 ;
  37 ;       Authors:
  38 ;               Erdinc Ozturk
  39 ;               Vinodh Gopal
  40 ;               James Guilford
  41 ;
  42 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  43 ;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  44
  45 %include "reg_sizes.asm"
  46
  47 [bits 64]
  48 default rel
  49
  50 section .text
  51         ; Argument registers (arg1 = init_crc, arg2 = buf, arg3 = len): rcx/rdx/r8 for the win64 output format, rdi/rsi/rdx otherwise.
  52 %ifidn __OUTPUT_FORMAT__, win64
  53         %xdefine        arg1 rcx
  54         %xdefine        arg2 rdx
  55         %xdefine        arg3 r8
  56         ; 32-bit view of arg1; the 16-bit init_crc is shifted and read through this alias
  57         %xdefine        arg1_low32 ecx
  58 %else
  59         %xdefine        arg1 rdi
  60         %xdefine        arg2 rsi
  61         %xdefine        arg3 rdx
  62         ; 32-bit view of arg1; the 16-bit init_crc is shifted and read through this alias
  63         %xdefine        arg1_low32 edi
  64 %endif
  65         ; VARIABLE_OFFSET = bytes the function subtracts from rsp. win64 additionally holds eight xmm save slots; XMM_SAVE (16*2) is the offset of the first slot.
  66 %ifidn __OUTPUT_FORMAT__, win64
  67         %define XMM_SAVE 16*2
  68         %define VARIABLE_OFFSET 16*10+8
  69 %else
  70         %define VARIABLE_OFFSET 16*2+8
  71 %endif
  72
  73 align 16
  74 global  crc16_t10dif_01:function
  75 crc16_t10dif_01:
  76         ; in: arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes; out: eax = 16-bit CRC (finalised at _cleanup)
  77         ; adjust the 16-bit initial_crc value, scale it to 32 bits
  78         shl     arg1_low32, 16
  79
  80         ; After this point, code flow is exactly same as a 32-bit CRC.
  81         ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
  82
  83         sub     rsp, VARIABLE_OFFSET    ; reserve the frame; it is released again at _cleanup
  84 %ifidn __OUTPUT_FORMAT__, win64
  85         ; save xmm6-xmm13 (callee-saved in the Microsoft x64 convention) in the slots starting at rsp+XMM_SAVE
  86         movdqa [rsp+XMM_SAVE+16*0],xmm6
  87         movdqa [rsp+XMM_SAVE+16*1],xmm7
  88         movdqa [rsp+XMM_SAVE+16*2],xmm8
  89         movdqa [rsp+XMM_SAVE+16*3],xmm9
  90         movdqa [rsp+XMM_SAVE+16*4],xmm10
  91         movdqa [rsp+XMM_SAVE+16*5],xmm11
  92         movdqa [rsp+XMM_SAVE+16*6],xmm12
  93         movdqa [rsp+XMM_SAVE+16*7],xmm13
  94 %endif
  95
  96         ; check if smaller than 256
  97         cmp     arg3, 256
  98         ; NOTE(review): jl is a signed test, so a len with bit 63 set also takes the short-buffer path
  99         ; for sizes less than 256, we can't fold 128B at a time...
 100         jl      _less_than_256
 101
 102
 103         ; load the initial crc value
 104         movd    xmm10, arg1_low32       ; initial crc
 105
 106         ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
 107         ; because data will be byte-reflected and will align with initial crc at correct place.
 108         pslldq  xmm10, 12
 109
 110         movdqa xmm11, [SHUF_MASK]       ; xmm11 = byte-reversal control, kept for every later pshufb on input data
 111         ; receive the initial 128B data, xor the initial crc value
 112         movdqu  xmm0, [arg2+16*0]
 113         movdqu  xmm1, [arg2+16*1]
 114         movdqu  xmm2, [arg2+16*2]
 115         movdqu  xmm3, [arg2+16*3]
 116         movdqu  xmm4, [arg2+16*4]
 117         movdqu  xmm5, [arg2+16*5]
 118         movdqu  xmm6, [arg2+16*6]
 119         movdqu  xmm7, [arg2+16*7]
 120
 121         pshufb  xmm0, xmm11
 122         ; XOR the initial_crc value
 123         pxor    xmm0, xmm10
 124         pshufb  xmm1, xmm11
 125         pshufb  xmm2, xmm11
 126         pshufb  xmm3, xmm11
 127         pshufb  xmm4, xmm11
 128         pshufb  xmm5, xmm11
 129         pshufb  xmm6, xmm11
 130         pshufb  xmm7, xmm11
 131
 132         movdqa  xmm10, [rk3]    ;xmm10 has rk3 and rk4
 133                                         ;imm value of pclmulqdq instruction will determine which constant to use
 134         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 135         ; we subtract 256 instead of 128 to save one instruction from the loop
 136         sub     arg3, 256
 137
 138         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 139         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 140
 141
 142         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 143 _fold_128_B_loop:
 144         ; on entry: xmm0-xmm7 = 128-byte running state, xmm10 = rk3 (low qword) : rk4 (high qword), xmm11 = SHUF_MASK
 145         ; update the buffer pointer
 146         add     arg2, 128               ;    buf += 128;
 147
 148         movdqu  xmm9, [arg2+16*0]
 149         movdqu  xmm12, [arg2+16*1]
 150         pshufb  xmm9, xmm11             ; byte-reverse the freshly loaded data
 151         pshufb  xmm12, xmm11
 152         movdqa  xmm8, xmm0              ; copy, so that both qword halves of the state can be multiplied
 153         movdqa  xmm13, xmm1
 154         pclmulqdq       xmm0, xmm10, 0x0        ; low qword of state * low qword of xmm10
 155         pclmulqdq       xmm8, xmm10 , 0x11      ; high qword of state * high qword of xmm10
 156         pclmulqdq       xmm1, xmm10, 0x0
 157         pclmulqdq       xmm13, xmm10 , 0x11
 158         pxor    xmm0, xmm9              ; new state = low product ^ new data ...
 159         xorps   xmm0, xmm8              ; ... ^ high product
 160         pxor    xmm1, xmm12
 161         xorps   xmm1, xmm13
 162         ; the same two-register step is repeated for xmm2/xmm3, xmm4/xmm5 and xmm6/xmm7
 163         movdqu  xmm9, [arg2+16*2]
 164         movdqu  xmm12, [arg2+16*3]
 165         pshufb  xmm9, xmm11
 166         pshufb  xmm12, xmm11
 167         movdqa  xmm8, xmm2
 168         movdqa  xmm13, xmm3
 169         pclmulqdq       xmm2, xmm10, 0x0
 170         pclmulqdq       xmm8, xmm10 , 0x11
 171         pclmulqdq       xmm3, xmm10, 0x0
 172         pclmulqdq       xmm13, xmm10 , 0x11
 173         pxor    xmm2, xmm9
 174         xorps   xmm2, xmm8
 175         pxor    xmm3, xmm12
 176         xorps   xmm3, xmm13
 177
 178         movdqu  xmm9, [arg2+16*4]
 179         movdqu  xmm12, [arg2+16*5]
 180         pshufb  xmm9, xmm11
 181         pshufb  xmm12, xmm11
 182         movdqa  xmm8, xmm4
 183         movdqa  xmm13, xmm5
 184         pclmulqdq       xmm4, xmm10, 0x0
 185         pclmulqdq       xmm8, xmm10 , 0x11
 186         pclmulqdq       xmm5, xmm10, 0x0
 187         pclmulqdq       xmm13, xmm10 , 0x11
 188         pxor    xmm4, xmm9
 189         xorps   xmm4, xmm8
 190         pxor    xmm5, xmm12
 191         xorps   xmm5, xmm13
 192
 193         movdqu  xmm9, [arg2+16*6]
 194         movdqu  xmm12, [arg2+16*7]
 195         pshufb  xmm9, xmm11
 196         pshufb  xmm12, xmm11
 197         movdqa  xmm8, xmm6
 198         movdqa  xmm13, xmm7
 199         pclmulqdq       xmm6, xmm10, 0x0
 200         pclmulqdq       xmm8, xmm10 , 0x11
 201         pclmulqdq       xmm7, xmm10, 0x0
 202         pclmulqdq       xmm13, xmm10 , 0x11
 203         pxor    xmm6, xmm9
 204         xorps   xmm6, xmm8
 205         pxor    xmm7, xmm12
 206         xorps   xmm7, xmm13
 207
 208         sub     arg3, 128               ; the flags of this sub drive the jge below (signed test)
 209
 210         ; check if there is another 128B in the buffer to be able to fold
 211         jge     _fold_128_B_loop
 212         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 213
 214         ; loop exit: arg3 is negative here and arg3+128 is the number of bytes not yet loaded
 215         add     arg2, 128
 216         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
 217         ; fold the 8 xmm registers to 1 xmm register with different constants
 218         ; each of xmm0..xmm6 is multiplied by its own constant pair and XORed into xmm7
 219         movdqa  xmm10, [rk9]            ; rk9 (low qword) and rk10 (high qword)
 220         movdqa  xmm8, xmm0
 221         pclmulqdq       xmm0, xmm10, 0x11       ; high qword * high qword of xmm10
 222         pclmulqdq       xmm8, xmm10, 0x0        ; low qword * low qword of xmm10
 223         pxor    xmm7, xmm8
 224         xorps   xmm7, xmm0
 225
 226         movdqa  xmm10, [rk11]           ; rk11 and rk12
 227         movdqa  xmm8, xmm1
 228         pclmulqdq       xmm1, xmm10, 0x11
 229         pclmulqdq       xmm8, xmm10, 0x0
 230         pxor    xmm7, xmm8
 231         xorps   xmm7, xmm1
 232
 233         movdqa  xmm10, [rk13]           ; rk13 and rk14
 234         movdqa  xmm8, xmm2
 235         pclmulqdq       xmm2, xmm10, 0x11
 236         pclmulqdq       xmm8, xmm10, 0x0
 237         pxor    xmm7, xmm8
 238         pxor    xmm7, xmm2
 239
 240         movdqa  xmm10, [rk15]           ; rk15 and rk16
 241         movdqa  xmm8, xmm3
 242         pclmulqdq       xmm3, xmm10, 0x11
 243         pclmulqdq       xmm8, xmm10, 0x0
 244         pxor    xmm7, xmm8
 245         xorps   xmm7, xmm3
 246
 247         movdqa  xmm10, [rk17]           ; rk17 and rk18
 248         movdqa  xmm8, xmm4
 249         pclmulqdq       xmm4, xmm10, 0x11
 250         pclmulqdq       xmm8, xmm10, 0x0
 251         pxor    xmm7, xmm8
 252         pxor    xmm7, xmm4
 253
 254         movdqa  xmm10, [rk19]           ; rk19 and rk20
 255         movdqa  xmm8, xmm5
 256         pclmulqdq       xmm5, xmm10, 0x11
 257         pclmulqdq       xmm8, xmm10, 0x0
 258         pxor    xmm7, xmm8
 259         xorps   xmm7, xmm5
 260
 261         movdqa  xmm10, [rk1]    ;xmm10 has rk1 and rk2
 262                                 ;imm value of pclmulqdq instruction will determine which constant to use
 263         movdqa  xmm8, xmm6
 264         pclmulqdq       xmm6, xmm10, 0x11
 265         pclmulqdq       xmm8, xmm10, 0x0
 266         pxor    xmm7, xmm8
 267         pxor    xmm7, xmm6
 268         ; xmm10 keeps rk1:rk2 from here on; the 16-byte folds below reuse it
 269
 270         ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
 271         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 272         add     arg3, 128-16
 273         jl      _final_reduction_for_128
 274
 275         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 276         ; we can fold 16 bytes at a time if y>=16
 277         ; continue folding 16B at a time
 278
 279 _16B_reduction_loop:                    ; expects xmm7 = accumulator, xmm10 = rk1 (low) : rk2 (high), xmm11 = SHUF_MASK
 280         movdqa  xmm8, xmm7
 281         pclmulqdq       xmm7, xmm10, 0x11       ; high qword of accumulator * high qword of xmm10
 282         pclmulqdq       xmm8, xmm10, 0x0        ; low qword of accumulator * low qword of xmm10
 283         pxor    xmm7, xmm8
 284         movdqu  xmm0, [arg2]            ; next 16 input bytes
 285         pshufb  xmm0, xmm11             ; byte-reverse them
 286         pxor    xmm7, xmm0
 287         add     arg2, 16
 288         sub     arg3, 16                ; arg3 is kept 16 below the true remaining count, so it turns negative when fewer than 16 bytes remain
 289         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 290         ; equivalent of: cmp arg3, 16-16
 291         ; check if there is any more 16B in the buffer to be able to fold
 292         jge     _16B_reduction_loop
 293
 294         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 295         ;first, we reduce the data in the xmm7 register
 296
 297
 298 _final_reduction_for_128:
 299         ; check if any more data to fold. If not, compute the CRC of the final 128 bits
 300         add     arg3, 16                ; undo the -16 bias: arg3 = bytes not yet folded; ZF is set when none are left
 301         je      _128_done               ; otherwise fall through into _get_last_two_xmms
 302
 303         ; here we are getting data that is less than 16 bytes.
 304         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 305         ; after that the registers need to be adjusted.
 306 _get_last_two_xmms:                     ; expects xmm7 = accumulator, arg2 -> the unfolded tail, arg3 = tail length, xmm10 = rk1:rk2
 307         movdqa  xmm2, xmm7
 308         ; arg2 + arg3 is the end of the buffer, so this reads the final 16 bytes (overlapping data that was already folded)
 309         movdqu  xmm1, [arg2 - 16 + arg3]
 310         pshufb  xmm1, xmm11
 311
 312         ; get rid of the extra data that was loaded before
 313         ; load the shift constant
 314         lea     rax, [pshufb_shf_table + 16]
 315         sub     rax, arg3               ; rax = pshufb_shf_table + 16 - arg3
 316         movdqu  xmm0, [rax]             ; unaligned load: the address depends on arg3
 317
 318         ; shift xmm2 to the left by arg3 bytes
 319         pshufb  xmm2, xmm0
 320
 321         ; shift xmm7 to the right by 16-arg3 bytes
 322         pxor    xmm0, [mask1]           ; flip bit 7 of every control byte
 323         pshufb  xmm7, xmm0
 324         pblendvb        xmm1, xmm2      ;xmm0 is implicit
 325
 326         ; fold 16 Bytes
 327         movdqa  xmm2, xmm1
 328         movdqa  xmm8, xmm7
 329         pclmulqdq       xmm7, xmm10, 0x11
 330         pclmulqdq       xmm8, xmm10, 0x0
 331         pxor    xmm7, xmm8
 332         pxor    xmm7, xmm2              ; falls through into _128_done
 333
 334 _128_done:
 335         ; compute crc of a 128-bit value
 336         movdqa  xmm10, [rk5]    ; rk5 and rk6 in xmm10
 337         movdqa  xmm0, xmm7
 338
 339         ;64b fold
 340         pclmulqdq       xmm7, xmm10, 0x1        ; high qword of xmm7 * low qword of xmm10 (rk5)
 341         pslldq  xmm0, 8                 ; move the low qword of the copy into the high qword
 342         pxor    xmm7, xmm0
 343
 344         ;32b fold
 345         movdqa  xmm0, xmm7
 346
 347         pand    xmm0, [mask2]           ; keep the low 96 bits of the copy
 348
 349         psrldq  xmm7, 12                ; bring the top dword down to the bottom
 350         pclmulqdq       xmm7, xmm10, 0x10       ; low qword of xmm7 * high qword of xmm10 (rk6)
 351         pxor    xmm7, xmm0              ; falls through into _barrett
 352
 353         ;barrett reduction
 354 _barrett:                               ; expects the value to reduce in xmm7; also entered directly by the 1-3 byte paths
 355         movdqa  xmm10, [rk7]    ; rk7 and rk8 in xmm10
 356         movdqa  xmm0, xmm7              ; keep the unreduced value for the final XOR
 357         pclmulqdq       xmm7, xmm10, 0x01       ; high qword of xmm7 * low qword of xmm10 (rk7)
 358         pslldq  xmm7, 4
 359         pclmulqdq       xmm7, xmm10, 0x11       ; high qword of xmm7 * high qword of xmm10 (rk8)
 360
 361         pslldq  xmm7, 4
 362         pxor    xmm7, xmm0
 363         pextrd  eax, xmm7,1             ; result = dword 1 (bits 32..63); falls through into _cleanup
 364
 365 _cleanup:                               ; common exit: expects the 32-bit scaled CRC in eax
 366         ; scale the result back to 16 bits
 367         shr     eax, 16
 368 %ifidn __OUTPUT_FORMAT__, win64
 369         movdqa  xmm6, [rsp+XMM_SAVE+16*0]       ; restore xmm6-xmm13 from the slots starting at rsp+XMM_SAVE
 370         movdqa  xmm7, [rsp+XMM_SAVE+16*1]
 371         movdqa  xmm8, [rsp+XMM_SAVE+16*2]
 372         movdqa  xmm9, [rsp+XMM_SAVE+16*3]
 373         movdqa  xmm10, [rsp+XMM_SAVE+16*4]
 374         movdqa  xmm11, [rsp+XMM_SAVE+16*5]
 375         movdqa  xmm12, [rsp+XMM_SAVE+16*6]
 376         movdqa  xmm13, [rsp+XMM_SAVE+16*7]
 377 %endif
 378         add     rsp, VARIABLE_OFFSET    ; release the frame reserved at function entry
 379         ret
 380
 381
 382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 385 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 386
 387 align 16
 388 _less_than_256:
 389         ; reached from the entry check with arg3 below 256 (signed compare)
 390         ; check if there is enough buffer to be able to fold 16B at a time
 391         cmp     arg3, 32
 392         jl      _less_than_32
 393         movdqa xmm11, [SHUF_MASK]       ; byte-reversal control for pshufb
 394
 395         ; if there is, load the constants
 396         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 397
 398         movd    xmm0, arg1_low32        ; get the initial crc value
 399         pslldq  xmm0, 12        ; align it to its correct place
 400         movdqu  xmm7, [arg2]    ; load the plaintext
 401         pshufb  xmm7, xmm11     ; byte-reflect the plaintext
 402         pxor    xmm7, xmm0      ; xor the initial crc value into the first 16 bytes
 403
 404
 405         ; update the buffer pointer
 406         add     arg2, 16
 407
 408         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 409         sub     arg3, 32
 410         ; arg3 now carries the same -16 bias that _16B_reduction_loop expects
 411         jmp     _16B_reduction_loop
 412
 413
 414 align 16
 415 _less_than_32:
 416         ; mov initial crc to the return value. this is necessary for zero-length buffers.
 417         mov     eax, arg1_low32         ; still scaled by 16 bits; _cleanup shifts it back down
 418         test    arg3, arg3
 419         je      _cleanup
 420
 421         movdqa xmm11, [SHUF_MASK]
 422
 423         movd    xmm0, arg1_low32        ; get the initial crc value
 424         pslldq  xmm0, 12        ; align it to its correct place
 425
 426         cmp     arg3, 16
 427         je      _exact_16_left
 428         jl      _less_than_16_left      ; reuses the flags of the cmp above (je does not modify them)
 429         ; 17..31 bytes: take the first 16 as the accumulator and merge the rest in _get_last_two_xmms
 430         movdqu  xmm7, [arg2]    ; load the plaintext
 431         pshufb  xmm7, xmm11     ; byte-reflect the plaintext
 432         pxor    xmm7, xmm0      ; xor the initial crc value
 433         add     arg2, 16
 434         sub     arg3, 16                ; arg3 = length of the tail (1..15)
 435         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 436         jmp     _get_last_two_xmms
 437
 438
 439 align 16
 440 _less_than_16_left:
 441         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 442         ; expects xmm0 = positioned initial crc and xmm11 = SHUF_MASK (both set up in _less_than_32)
 443         pxor    xmm1, xmm1
 444         mov     r11, rsp                ; r11 = write cursor into the 16-byte scratch area at [rsp]
 445         movdqa  [r11], xmm1
 446
 447         cmp     arg3, 4
 448         jl      _only_less_than_4
 449
 450         ;       backup the counter value
 451         mov     r9, arg3                ; arg3 is consumed by the copy below; r9 keeps the length for the shift-table lookup
 452         cmp     arg3, 8
 453         jl      _less_than_8_left
 454
 455         ; load 8 Bytes
 456         mov     rax, [arg2]
 457         mov     [r11], rax
 458         add     r11, 8
 459         sub     arg3, 8
 460         add     arg2, 8
 461 _less_than_8_left:
 462
 463         cmp     arg3, 4
 464         jl      _less_than_4_left
 465
 466         ; load 4 Bytes
 467         mov     eax, [arg2]
 468         mov     [r11], eax
 469         add     r11, 4
 470         sub     arg3, 4
 471         add     arg2, 4
 472 _less_than_4_left:
 473
 474         cmp     arg3, 2
 475         jl      _less_than_2_left
 476
 477         ; load 2 Bytes
 478         mov     ax, [arg2]
 479         mov     [r11], ax
 480         add     r11, 2
 481         sub     arg3, 2
 482         add     arg2, 2
 483 _less_than_2_left:
 484         cmp     arg3, 1
 485         jl      _zero_left
 486
 487         ; load 1 Byte
 488         mov     al, [arg2]
 489         mov     [r11], al
 490 _zero_left:
 491         movdqa  xmm7, [rsp]             ; the copied bytes followed by the zero padding written above
 492         pshufb  xmm7, xmm11
 493         pxor    xmm7, xmm0      ; xor the initial crc value
 494
 495         lea     rax, [pshufb_shf_table + 16]
 496         sub     rax, r9                 ; rax = pshufb_shf_table + 16 - original length
 497         movdqu  xmm0, [rax]
 498         pxor    xmm0, [mask1]           ; flip bit 7 of every control byte
 499
 500         pshufb  xmm7, xmm0
 501         jmp     _128_done
 502
 503 align 16
 504 _exact_16_left:                         ; exactly 16 bytes: a single block, so no folding is needed before _128_done
 505         movdqu  xmm7, [arg2]
 506         pshufb  xmm7, xmm11             ; byte-reflect the plaintext
 507         pxor    xmm7, xmm0      ; xor the initial crc value
 508
 509         jmp     _128_done
 510
 511 _only_less_than_4:                      ; 1..3 bytes; r11 = rsp, [rsp] holds 16 zero bytes, xmm0 = positioned initial crc
 512         cmp     arg3, 3
 513         jl      _only_less_than_3
 514
 515         ; load 3 Bytes
 516         mov     al, [arg2]
 517         mov     [r11], al
 518
 519         mov     al, [arg2+1]
 520         mov     [r11+1], al
 521
 522         mov     al, [arg2+2]
 523         mov     [r11+2], al
 524
 525         movdqa  xmm7, [rsp]
 526         pshufb  xmm7, xmm11
 527         pxor    xmm7, xmm0      ; xor the initial crc value
 528
 529         psrldq  xmm7, 5                 ; byte shift for the 3-byte case (5 = 8 - 3)
 530         ; go straight to the Barrett step, skipping the 64b/32b folds of _128_done
 531         jmp     _barrett
 532 _only_less_than_3:
 533         cmp     arg3, 2
 534         jl      _only_less_than_2
 535
 536         ; load 2 Bytes
 537         mov     al, [arg2]
 538         mov     [r11], al
 539
 540         mov     al, [arg2+1]
 541         mov     [r11+1], al
 542
 543         movdqa  xmm7, [rsp]
 544         pshufb  xmm7, xmm11
 545         pxor    xmm7, xmm0      ; xor the initial crc value
 546
 547         psrldq  xmm7, 6                 ; byte shift for the 2-byte case (6 = 8 - 2)
 548
 549         jmp     _barrett
 550 _only_less_than_2:
 551         ; only the 1-byte case remains (zero length already left through _cleanup)
 552         ; load 1 Byte
 553         mov     al, [arg2]
 554         mov     [r11], al
 555
 556         movdqa  xmm7, [rsp]
 557         pshufb  xmm7, xmm11
 558         pxor    xmm7, xmm0      ; xor the initial crc value
 559
 560         psrldq  xmm7, 7                 ; byte shift for the 1-byte case (7 = 8 - 1)
 561
 562         jmp     _barrett
 563
 564 section .data
 565
 566 ; precomputed constants
 567 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
 568 align 16
 569 ; Q = 0x18BB70000
 570 ; rk1 = 2^(32*3) mod Q << 32
 571 ; rk2 = 2^(32*5) mod Q << 32
 572 ; rk3 = 2^(32*15) mod Q << 32
 573 ; rk4 = 2^(32*17) mod Q << 32
 574 ; rk5 = 2^(32*3) mod Q << 32
 575 ; rk6 = 2^(32*2) mod Q << 32
 576 ; rk7 = floor(2^64/Q)
 577 ; rk8 = Q
 578 rk1:
 579 DQ 0x2d56000000000000
 580 rk2:
 581 DQ 0x06df000000000000
 582 rk3:
 583 DQ 0x9d9d000000000000
 584 rk4:
 585 DQ 0x7cf5000000000000
 586 rk5:
 587 DQ 0x2d56000000000000
 588 rk6:
 589 DQ 0x1368000000000000
 590 rk7:
 591 DQ 0x00000001f65a57f8
 592 rk8:
 593 DQ 0x000000018bb70000
 594 ; rk9..rk20 are loaded as 16-byte pairs (rk9:rk10 ... rk19:rk20) when the eight 128-byte-loop accumulators are folded into xmm7
 595 rk9:
 596 DQ 0xceae000000000000
 597 rk10:
 598 DQ 0xbfd6000000000000
 599 rk11:
 600 DQ 0x1e16000000000000
 601 rk12:
 602 DQ 0x713c000000000000
 603 rk13:
 604 DQ 0xf7f9000000000000
 605 rk14:
 606 DQ 0x80a6000000000000
 607 rk15:
 608 DQ 0x044c000000000000
 609 rk16:
 610 DQ 0xe658000000000000
 611 rk17:
 612 DQ 0xad18000000000000
 613 rk18:
 614 DQ 0xa497000000000000
 615 rk19:
 616 DQ 0x6ee3000000000000
 617 rk20:
 618 DQ 0xe7b5000000000000
 619
 620 ; mask1, mask2 and SHUF_MASK are used as aligned 128-bit memory operands (pxor/pand/movdqa).
 621 ; They have no align directive of their own: they are 16-byte aligned only because the
 622 ; 20 rk qwords above occupy exactly 160 bytes after the "align 16".
 623 ; Keep that in mind when adding or removing constants.
 624
 625 ; mask1: XORed into a pshufb control vector to flip bit 7 of every control byte
 626 ; mask2: keeps the low 96 bits of a register and clears the top 32 bits
 627 ; SHUF_MASK: pshufb control that reverses the order of the 16 bytes
 628 mask1:
 629 dq 0x8080808080808080, 0x8080808080808080
 630 mask2:
 631 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
 632
 633 SHUF_MASK:
 634 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
 635
 636 pshufb_shf_table:
 637 ; use these values for shift constants for the pshufb instruction
 638 ; different alignments result in values as shown:
 639 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 640 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 641 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 642 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 643 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 644 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 645 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 646 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 647 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 648 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 649 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 650 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 651 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 652 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 653 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 654 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 655 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 656 ; the code reads 16 bytes at (pshufb_shf_table + 16 - n), i.e. a sliding window over the 32 bytes above
 657 ;;;       func          core, ver, snum
 658 slversion crc16_t10dif_01, 01,   06,  0010
 659