ceph/src/isa-l/crc/crc64_ecma_refl_by8.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  31 ;       Function API:
  32 ;       uint64_t crc64_ecma_refl_by8(
  33 ;               uint64_t init_crc, //initial CRC value, 64 bits
  34 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  35 ;               uint64_t len //buffer length in bytes (64-bit data)
  36 ;       );
  37 ;
  38 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  39 ;       sample yasm command line:
  40 ;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
  41 %include "reg_sizes.asm"
  42 ; fetch_dist: byte offset added to the read pointer by the prefetchnta hints in _fold_128_B_loop
  43 %define fetch_dist      1024
  44 ; 64-bit code; "default rel" makes bare [label] memory operands RIP-relative
  45 [bits 64]
  46 default rel
  47
  48 section .text
  49 ; argument registers: win64 output follows the Microsoft x64 convention (rcx, rdx, r8),
  50 ; every other output format follows System V AMD64 (rdi, rsi, rdx)
  51 %ifidn __OUTPUT_FORMAT__, win64
  52         %xdefine        arg1 rcx
  53         %xdefine        arg2 rdx
  54         %xdefine        arg3 r8
  55 %else
  56         %xdefine        arg1 rdi
  57         %xdefine        arg2 rsi
  58         %xdefine        arg3 rdx
  59 %endif
  60 ; stack frame layout: byte offsets from rsp once "sub rsp, VARIABLE_OFFSET" has run
     ;   TMP             = 0   : start of a 16-byte scratch slot. The symbol is not referenced by
     ;                           name in this file; the <16-byte input path uses [rsp] directly.
     ;   XMM_SAVE        = 32  : (win64 only) start of the 8 x 16-byte save area for xmm6-xmm13
     ;   VARIABLE_OFFSET = 168 on win64 (32 + 8*16 + 8), 40 otherwise (32 + 8)
     ; The trailing +8 leaves rsp 16-byte aligned after the sub, given that rsp is 8 mod 16 at
     ; function entry (return address pushed); the movdqa accesses to the frame need that alignment.
  61 %define TMP 16*0
  62 %ifidn __OUTPUT_FORMAT__, win64
  63         %define XMM_SAVE 16*2
  64         %define VARIABLE_OFFSET 16*10+8
  65 %else
  66         %define VARIABLE_OFFSET 16*2+8
  67 %endif
  68 ;-----------------------------------------------------------------------
     ; uint64_t crc64_ecma_refl_by8(uint64_t init_crc, const unsigned char *buf, uint64_t len)
     ; In:    arg1 = init_crc, arg2 = buf, arg3 = len (registers chosen by the arg1-arg3 %xdefines)
     ; Out:   rax = bitwise-inverted CRC state; the seed is inverted on entry as well.
     ;        len == 0 returns init_crc unchanged.
     ; Clobb: rax, r9, r11, arg1-arg3, flags, xmm0-xmm5; also xmm6-xmm10, xmm12, xmm13
     ;        (the win64 build saves and restores xmm6-xmm13 around the body)
     ; Stack: VARIABLE_OFFSET bytes; the 16 bytes at [rsp] are scratch for inputs shorter than 16 bytes
     ; Paths: len >= 256 runs straight through this block; shorter inputs branch to _less_than_256
     ;        and re-enter at _16B_reduction_loop, _get_last_two_xmms, _128_done, _barrett or _cleanup.
     ; NOTE(review): all length tests use signed branches (jl/jge), so a len with bit 63 set is
     ;        handled as a negative count - confirm callers never pass such a length.
     ; mk_global and endbranch are macros that are not defined in this file (presumably reg_sizes.asm).
  69 ;-----------------------------------------------------------------------
  70 align 16
  71 mk_global crc64_ecma_refl_by8, function
  72 crc64_ecma_refl_by8:
  73         endbranch
  74         ; uint64_t c = crc ^ 0xffffffff,ffffffffL;   (the seed is bitwise-inverted on entry)
  75         not arg1
  76         sub     rsp, VARIABLE_OFFSET
  77
  78 %ifidn __OUTPUT_FORMAT__, win64
  79         ; save xmm6-xmm13 in the stack frame: they are callee-saved under the Microsoft x64 convention
  80         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
  81         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
  82         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
  83         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
  84         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
  85         movdqa  [rsp + XMM_SAVE + 16*5], xmm11
  86         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
  87         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
  88 %endif
  89
  90         ; check if smaller than 256B (signed compare)
  91         cmp     arg3, 256
  92
  93         ; for sizes less than 256, we can't fold 128B at a time...
  94         jl      _less_than_256
  95
  96
  97         ; load the initial crc value
  98         movq    xmm10, arg1      ; xmm10 = inverted seed in the low qword, high qword zeroed
  99         ; load the first 128B of input into xmm0-xmm7
 100         movdqu  xmm0, [arg2+16*0]
 101         movdqu  xmm1, [arg2+16*1]
 102         movdqu  xmm2, [arg2+16*2]
 103         movdqu  xmm3, [arg2+16*3]
 104         movdqu  xmm4, [arg2+16*4]
 105         movdqu  xmm5, [arg2+16*5]
 106         movdqu  xmm6, [arg2+16*6]
 107         movdqu  xmm7, [arg2+16*7]
 108
 109         ; XOR the seed into the first 8 input bytes (the low qword of xmm0)
 110         pxor    xmm0, xmm10
 111         movdqa  xmm10, [rk3]    ; xmm10 = rk3 (low qword) and rk4 (high qword)
 112                                         ; pclmulqdq imm8: bit 0 picks the qword of the destination, bit 4 the qword of xmm10
 113         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 114         ; we subtract 256 instead of 128 to save one instruction from the loop
 115         sub     arg3, 256
 116
 117         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 118         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 119         ; loop invariant: arg3 = (bytes not yet loaded) - 128, arg2 = address of the chunk loaded last
 120
 121         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 122 _fold_128_B_loop:
 123
 124         ; update the buffer pointer
 125         add     arg2, 128
 126         ; each accumulator becomes (acc.low * rk4) ^ (acc.high * rk3) ^ next 16 input bytes
 127         prefetchnta [arg2+fetch_dist+0]
 128         movdqu  xmm9, [arg2+16*0]
 129         movdqu  xmm12, [arg2+16*1]
 130         movdqa  xmm8, xmm0
 131         movdqa  xmm13, xmm1
 132         pclmulqdq       xmm0, xmm10, 0x10        ; xmm0 = xmm0[63:0] * rk4 (carry-less multiply)
 133         pclmulqdq       xmm8, xmm10 , 0x1        ; xmm8 = old xmm0[127:64] * rk3
 134         pclmulqdq       xmm1, xmm10, 0x10
 135         pclmulqdq       xmm13, xmm10 , 0x1
 136         pxor    xmm0, xmm9                       ; mix in the new input bytes
 137         xorps   xmm0, xmm8                       ; xorps is the same 128-bit XOR as pxor
 138         pxor    xmm1, xmm12
 139         xorps   xmm1, xmm13
 140         ; same fold for xmm2/xmm3 with bytes 32..63 of this chunk
 141         prefetchnta [arg2+fetch_dist+32]
 142         movdqu  xmm9, [arg2+16*2]
 143         movdqu  xmm12, [arg2+16*3]
 144         movdqa  xmm8, xmm2
 145         movdqa  xmm13, xmm3
 146         pclmulqdq       xmm2, xmm10, 0x10
 147         pclmulqdq       xmm8, xmm10 , 0x1
 148         pclmulqdq       xmm3, xmm10, 0x10
 149         pclmulqdq       xmm13, xmm10 , 0x1
 150         pxor    xmm2, xmm9
 151         xorps   xmm2, xmm8
 152         pxor    xmm3, xmm12
 153         xorps   xmm3, xmm13
 154         ; same fold for xmm4/xmm5 with bytes 64..95 of this chunk
 155         prefetchnta [arg2+fetch_dist+64]
 156         movdqu  xmm9, [arg2+16*4]
 157         movdqu  xmm12, [arg2+16*5]
 158         movdqa  xmm8, xmm4
 159         movdqa  xmm13, xmm5
 160         pclmulqdq       xmm4, xmm10, 0x10
 161         pclmulqdq       xmm8, xmm10 , 0x1
 162         pclmulqdq       xmm5, xmm10, 0x10
 163         pclmulqdq       xmm13, xmm10 , 0x1
 164         pxor    xmm4, xmm9
 165         xorps   xmm4, xmm8
 166         pxor    xmm5, xmm12
 167         xorps   xmm5, xmm13
 168         ; same fold for xmm6/xmm7 with bytes 96..127 of this chunk
 169         prefetchnta [arg2+fetch_dist+96]
 170         movdqu  xmm9, [arg2+16*6]
 171         movdqu  xmm12, [arg2+16*7]
 172         movdqa  xmm8, xmm6
 173         movdqa  xmm13, xmm7
 174         pclmulqdq       xmm6, xmm10, 0x10
 175         pclmulqdq       xmm8, xmm10 , 0x1
 176         pclmulqdq       xmm7, xmm10, 0x10
 177         pclmulqdq       xmm13, xmm10 , 0x1
 178         pxor    xmm6, xmm9
 179         xorps   xmm6, xmm8
 180         pxor    xmm7, xmm12
 181         xorps   xmm7, xmm13
 182
 183         sub     arg3, 128
 184
 185         ; check if there is another 128B in the buffer to be able to fold
 186         jge     _fold_128_B_loop
 187         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 188
 189         add     arg2, 128
 190         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 191         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 192         ; arg3 = y - 128 here (negative)
 193
 194         ; fold the 8 xmm registers to 1 xmm register with different constants
 195         ; xmm0 to xmm7 (rk9/rk10): xmm7 ^= (xmm0.high * rk9) ^ (xmm0.low * rk10)
 196         movdqa  xmm10, [rk9]
 197         movdqa  xmm8, xmm0
 198         pclmulqdq       xmm0, xmm10, 0x1
 199         pclmulqdq       xmm8, xmm10, 0x10
 200         pxor    xmm7, xmm8
 201         xorps   xmm7, xmm0
 202         ; xmm1 to xmm7 (rk11/rk12)
 203         movdqa  xmm10, [rk11]
 204         movdqa  xmm8, xmm1
 205         pclmulqdq       xmm1, xmm10, 0x1
 206         pclmulqdq       xmm8, xmm10, 0x10
 207         pxor    xmm7, xmm8
 208         xorps   xmm7, xmm1
 209         ; xmm2 to xmm7 (rk13/rk14)
 210         movdqa  xmm10, [rk13]
 211         movdqa  xmm8, xmm2
 212         pclmulqdq       xmm2, xmm10, 0x1
 213         pclmulqdq       xmm8, xmm10, 0x10
 214         pxor    xmm7, xmm8
 215         pxor    xmm7, xmm2
 216         ; xmm3 to xmm7 (rk15/rk16)
 217         movdqa  xmm10, [rk15]
 218         movdqa  xmm8, xmm3
 219         pclmulqdq       xmm3, xmm10, 0x1
 220         pclmulqdq       xmm8, xmm10, 0x10
 221         pxor    xmm7, xmm8
 222         xorps   xmm7, xmm3
 223         ; xmm4 to xmm7 (rk17/rk18)
 224         movdqa  xmm10, [rk17]
 225         movdqa  xmm8, xmm4
 226         pclmulqdq       xmm4, xmm10, 0x1
 227         pclmulqdq       xmm8, xmm10, 0x10
 228         pxor    xmm7, xmm8
 229         pxor    xmm7, xmm4
 230         ; xmm5 to xmm7 (rk19/rk20)
 231         movdqa  xmm10, [rk19]
 232         movdqa  xmm8, xmm5
 233         pclmulqdq       xmm5, xmm10, 0x1
 234         pclmulqdq       xmm8, xmm10, 0x10
 235         pxor    xmm7, xmm8
 236         xorps   xmm7, xmm5
 237         ; xmm6 to xmm7 (rk1/rk2); xmm10 keeps rk1/rk2 for the 16B loop and the tail merge below
 238         movdqa  xmm10, [rk1]
 239         movdqa  xmm8, xmm6
 240         pclmulqdq       xmm6, xmm10, 0x1
 241         pclmulqdq       xmm8, xmm10, 0x10
 242         pxor    xmm7, xmm8
 243         pxor    xmm7, xmm6
 244
 245
 246         ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
 247         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 248         add     arg3, 128-16            ; arg3 = y - 16
 249         jl      _final_reduction_for_128
 250
 251         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 252         ; we can fold 16 bytes at a time if y>=16
 253         ; continue folding 16B at a time
 254         ; entry contract (also used by _less_than_256): xmm7 = accumulator, xmm10 = rk1/rk2,
 255 _16B_reduction_loop:                    ; arg2 = next unread byte, arg3 = (unread bytes) - 16 >= 0
 256         movdqa  xmm8, xmm7
 257         pclmulqdq       xmm7, xmm10, 0x1
 258         pclmulqdq       xmm8, xmm10, 0x10
 259         pxor    xmm7, xmm8
 260         movdqu  xmm0, [arg2]
 261         pxor    xmm7, xmm0
 262         add     arg2, 16
 263         sub     arg3, 16
 264         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 265         ; equivalent of: cmp arg3, 16-16
 266         ; check if there is any more 16B in the buffer to be able to fold
 267         jge     _16B_reduction_loop
 268
 269         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 270         ;first, we reduce the data in the xmm7 register
 271
 272
 273 _final_reduction_for_128:
 274         add arg3, 16                    ; arg3 = z, the number of trailing bytes (0..15)
 275         je _128_done                    ; nothing left: xmm7 already holds all the data
 276         ; here we are getting data that is less than 16 bytes.
 277         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 278         ; after that the registers need to be adjusted.
 279 _get_last_two_xmms:                     ; entry contract: xmm7 = accumulator, xmm10 = rk1/rk2, arg3 = z (1..15)
 280
 281
 282         movdqa xmm2, xmm7
 283         movdqu xmm1, [arg2 - 16 + arg3]  ; last 16 bytes of the buffer; its top z bytes are the unread ones
 284
 285         ; get rid of the extra data that was loaded before
 286         ; load the shift constant
 287         lea     rax, [pshufb_shf_table]
 288         add     rax, arg3
 289         movdqu  xmm0, [rax]             ; pshufb control read at table offset z
 290
 291
 292         pshufb  xmm7, xmm0              ; xmm7 = accumulator shifted left by 16-z bytes (zero fill)
 293         pxor    xmm0, [mask3]           ; flip bit 7 of every control byte
 294         pshufb  xmm2, xmm0              ; xmm2 = accumulator shifted right by z bytes (top z bytes zero)
 295
 296         pblendvb        xmm2, xmm1     ; take the xmm1 byte where the xmm0 byte has bit 7 set (xmm0 is implicit): top z bytes = new input
 297         ;;;;;;;;;;
 298         movdqa  xmm8, xmm7
 299         pclmulqdq       xmm7, xmm10, 0x1
 300
 301         pclmulqdq       xmm8, xmm10, 0x10
 302         pxor    xmm7, xmm8
 303         pxor    xmm7, xmm2
 304
 305 _128_done:
 306         ; compute crc of a 128-bit value
 307         movdqa  xmm10, [rk5]            ; xmm10 = rk5 (low qword) and rk6 (high qword)
 308         movdqa  xmm0, xmm7
 309
 310         ;64b fold: xmm7 = (xmm7.low * rk5) ^ (xmm7.high moved down to the low qword)
 311         pclmulqdq       xmm7, xmm10, 0
 312         psrldq  xmm0, 8
 313         pxor    xmm7, xmm0
 314
 315         ;barrett reduction (also entered directly from _end_1to7, which skips the 64b fold)
 316 _barrett:
 317         movdqa  xmm1, xmm7
 318         movdqa  xmm10, [rk7]            ; xmm10 = rk7 (low qword) and rk8 (high qword)
 319
 320         pclmulqdq       xmm7, xmm10, 0          ; xmm7 = xmm7.low * rk7
 321         movdqa  xmm2, xmm7
 322         pclmulqdq       xmm7, xmm10, 0x10       ; xmm7 = (low qword of that product) * rk8
 323         pslldq  xmm2, 8                         ; first product shifted left by 8 bytes
 324         pxor    xmm7, xmm2
 325         pxor    xmm7, xmm1
 326         pextrq  rax, xmm7, 1                    ; the result is the high qword of xmm7
 327
 328 _cleanup:
 329         ; return c ^ 0xffffffff, ffffffffL;   (_less_than_32 jumps here with rax = inverted seed when len == 0)
 330         not     rax
 331
 332
 333 %ifidn __OUTPUT_FORMAT__, win64
 334         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
 335         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 336         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 337         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 338         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 339         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 340         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 341         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 342 %endif
 343         add     rsp, VARIABLE_OFFSET
 344         ret
 345
 346 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 347 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 348 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 349 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 350
 351 align 16
 352 _less_than_256:
 353         ; reached from the entry check with arg1 = inverted seed, arg2 = buf, arg3 = len, len < 256 (signed)
 354         ; check if there is enough buffer to be able to fold 16B at a time
 355         cmp     arg3, 32
 356         jl      _less_than_32
 357
 358         ; if there is, load the constants
 359         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 360
 361         movq    xmm0, arg1       ; xmm0 = inverted seed in the low qword, high qword zeroed
 362         movdqu  xmm7, [arg2]            ; xmm7 = first 16 input bytes
 363         pxor    xmm7, xmm0              ; seed XORed into the first 8 bytes
 364
 365         ; update the buffer pointer
 366         add     arg2, 16
 367
 368         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 369         sub     arg3, 32                ; arg3 = (unread bytes) - 16, which is >= 0 here
 370
 371         jmp     _16B_reduction_loop
 372
 373 align 16
 374 _less_than_32:
 375         ; mov initial crc to the return value. this is necessary for zero-length buffers:
 376         mov     rax, arg1               ; rax = inverted seed; _cleanup inverts it back, so len == 0 returns init_crc
 377         test    arg3, arg3
 378         je      _cleanup
 379
 380         movq    xmm0, arg1       ; xmm0 = inverted seed in the low qword (also read by _exact_16_left and _less_than_16_left)
 381         ; three-way split on the length: exactly 16, fewer than 16, or 17..31 bytes
 382         cmp     arg3, 16
 383         je      _exact_16_left
 384         jl      _less_than_16_left
 385         ; 17..31 bytes: take the first 16 bytes, then let _get_last_two_xmms merge the remaining 1..15
 386         movdqu  xmm7, [arg2]            ; load the plaintext
 387         pxor    xmm7, xmm0              ; xor the initial crc value
 388         add     arg2, 16
 389         sub     arg3, 16                ; arg3 = number of trailing bytes (1..15)
 390         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 391         jmp     _get_last_two_xmms
 392
 393
 394 align 16
 395 _less_than_16_left:
 396         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 397         ; on entry: arg3 = len (nonzero, below 16), arg2 = buf, xmm0 = inverted seed in the low qword
 398         pxor    xmm1, xmm1
 399         mov     r11, rsp                ; r11 = write cursor into the 16-byte scratch slot at [rsp]
 400         movdqa  [r11], xmm1
 401
 402         ;       backup the counter value: r9 keeps the original length for the shuffle lookup below
 403         mov     r9, arg3
 404         cmp     arg3, 8
 405         jl      _less_than_8_left
 406
 407         ; load 8 Bytes
 408         mov     rax, [arg2]
 409         mov     [r11], rax
 410         add     r11, 8
 411         sub     arg3, 8
 412         add     arg2, 8
 413 _less_than_8_left:
 414
 415         cmp     arg3, 4
 416         jl      _less_than_4_left
 417
 418         ; load 4 Bytes
 419         mov     eax, [arg2]
 420         mov     [r11], eax
 421         add     r11, 4
 422         sub     arg3, 4
 423         add     arg2, 4
 424 _less_than_4_left:
 425
 426         cmp     arg3, 2
 427         jl      _less_than_2_left
 428
 429         ; load 2 Bytes
 430         mov     ax, [arg2]
 431         mov     [r11], ax
 432         add     r11, 2
 433         sub     arg3, 2
 434         add     arg2, 2
 435 _less_than_2_left:
 436         cmp     arg3, 1
 437         jl      _zero_left
 438
 439         ; load 1 Byte
 440         mov     al, [arg2]
 441         mov     [r11], al
 442
 443 _zero_left:
 444         movdqa  xmm7, [rsp]     ; xmm7 = the input bytes, zero-padded to 16 bytes
 445         pxor    xmm7, xmm0      ; xor the initial crc value
 446
 447         lea rax,[pshufb_shf_table]
 448
 449         cmp     r9, 8
 450         jl      _end_1to7
 451
 452 _end_8to15:
 453         movdqu  xmm0, [rax + r9]        ; control at table offset len: left shift by (16-length) bytes
 454         pshufb  xmm7,xmm0
 455         jmp     _128_done
 456
 457 _end_1to7:
 458         ; Left shift (8-length) bytes in XMM: the control is read at table offset length+8
 459         movdqu  xmm0, [rax + r9 + 8]
 460         pshufb  xmm7,xmm0
 461         ; enter at _barrett, which skips the 64b fold done at _128_done
 462         jmp     _barrett
 463
 464 align 16
 465 _exact_16_left:
 466         movdqu  xmm7, [arg2]    ; the whole 16-byte input
 467         pxor    xmm7, xmm0      ; xor the initial crc value (xmm0 was loaded in _less_than_32)
 468         ; no folding needed: go straight to the 128-bit reduction
 469         jmp     _128_done
 470
 471 section .data
 472 ; NOTE(review): nothing in this file writes to the tables below - confirm whether a read-only section would suit the build
 473 ; precomputed constants
     ; Each odd/even pair (rk1/rk2, rk3/rk4, ...) is loaded as one 16-byte movdqa into xmm10, so
     ; every pair must stay 16-byte aligned: the odd constant lands in the low qword, the even one
     ; in the high qword, and the pclmulqdq imm8 selects between them.
 474 align 16
 475 ; rk7 = floor(2^128/Q)
 476 ; rk8 = Q
 477 rk1 :
 478 DQ 0xdabe95afc7875f40           ; rk1/rk2: 16-byte folds (_16B_reduction_loop, _get_last_two_xmms, xmm6 into xmm7)
 479 rk2 :
 480 DQ 0xe05dd497ca393ae4
 481 rk3 :
 482 DQ 0xd7d86b2af73de740           ; rk3/rk4: 128-byte fold in _fold_128_B_loop
 483 rk4 :
 484 DQ 0x8757d71d4fcc1000
 485 rk5 :
 486 DQ 0xdabe95afc7875f40           ; rk5: 64b fold at _128_done (imm8 0 selects only the low qword, so rk6 is not multiplied)
 487 rk6 :
 488 DQ 0x0000000000000000
 489 rk7 :
 490 DQ 0x9c3e466c172963d5           ; rk7/rk8: the two multiplies at _barrett
 491 rk8 :
 492 DQ 0x92d8af2baf0e1e84
 493 rk9 :
 494 DQ 0x947874de595052cb           ; rk9/rk10: fold xmm0 into xmm7
 495 rk10 :
 496 DQ 0x9e735cb59b4724da
 497 rk11 :
 498 DQ 0xe4ce2cd55fea0037           ; rk11/rk12: fold xmm1 into xmm7
 499 rk12 :
 500 DQ 0x2fe3fd2920ce82ec
 501 rk13 :
 502 DQ 0xe31d519421a63a5            ; rk13/rk14: fold xmm2 into xmm7
 503 rk14 :
 504 DQ 0x2e30203212cac325
 505 rk15 :
 506 DQ 0x81f6054a7842df4            ; rk15/rk16: fold xmm3 into xmm7
 507 rk16 :
 508 DQ 0x6ae3efbb9dd441f3
 509 rk17 :
 510 DQ 0x69a35d91c3730254           ; rk17/rk18: fold xmm4 into xmm7
 511 rk18 :
 512 DQ 0xb5ea1af9c013aca4
 513 rk19 :
 514 DQ 0x3be653a30fe1af51           ; rk19/rk20: fold xmm5 into xmm7
 515 rk20 :
 516 DQ 0x60095b008a9efa44
 517
 518
 519 pshufb_shf_table:
 520 ; use these values for shift constants for the pshufb instruction
     ; The table is 32 bytes: 0x00, 0x81..0x8f, then 0x00..0x0e, 0x00. A 16-byte read at offset n
     ; (1 <= n <= 15, always done with movdqu) gives a pshufb control whose first 16-n bytes have
     ; bit 7 set (those lanes become zero) and whose last n bytes select source bytes 0..n-1,
     ; i.e. a left shift by 16-n bytes. XORing the control with mask3 flips bit 7 of every byte
     ; and turns it into a right shift by n bytes.
 521 ; different alignments result in values as shown:
 522 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 523 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 524 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 525 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 526 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 527 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 528 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 529 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 530 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 531 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 532 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 533 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 534 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 535 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 536 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 537 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 538 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 539
 540
 541 mask:                                   ; not referenced anywhere in this file
 542 dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 543 mask2:                                  ; not referenced anywhere in this file
 544 dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 545 mask3:                                  ; bit 7 of every byte: XORed into a pshufb control in _get_last_two_xmms
 546 dq     0x8080808080808080, 0x8080808080808080
     ; mask3 is used as a memory operand of pxor, which needs a 16-byte aligned address. There is
     ; no align directive here: the alignment holds only because every item placed since the
     ; "align 16" ahead of rk1 is a multiple of 16 bytes. Keep that true when editing this section.
 547
 548 ;;; slversion is not defined in this file (presumably a macro from reg_sizes.asm - confirm); argument columns:
     ;;;       func        core, ver, snum
 549 slversion crc64_ecma_refl_by8, 01,   00,  001d