ceph/src/isa-l/crc/crc64_ecma_refl_by8.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  31 ;       Function API:
  32 ;       uint64_t crc64_ecma_refl_by8(
  33 ;               uint64_t init_crc, //initial CRC value, 64 bits
  34 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  35 ;               uint64_t len //buffer length in bytes (64-bit data)
  36 ;       );
  37 ;
  38 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  39 ;       sample yasm command line:
  40 ;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
  41 %include "reg_sizes.asm"
  42 ; fetch_dist: byte distance ahead of the read pointer used by the prefetchnta hints in the 128B fold loop
  43 %define fetch_dist      1024
  44
  45 [bits 64]
  46 default rel
  47
  48 section .text
  49 ; Argument registers: the win64 output format uses rcx/rdx/r8, every other format uses rdi/rsi/rdx.
  50 ; Per the API comment above: arg1 = init_crc, arg2 = buf, arg3 = len.
  51 %ifidn __OUTPUT_FORMAT__, win64
  52         %xdefine        arg1 rcx
  53         %xdefine        arg2 rdx
  54         %xdefine        arg3 r8
  55 %else
  56         %xdefine        arg1 rdi
  57         %xdefine        arg2 rsi
  58         %xdefine        arg3 rdx
  59 %endif
  60 ; Stack frame: offsets are relative to rsp after the prologue's `sub rsp, VARIABLE_OFFSET`. XMM_SAVE is where the win64 build stores xmm6-xmm13. The frame is accessed with movdqa, so rsp must be 16-byte aligned after the sub; NOTE(review): the +8 presumably compensates for the return address pushed by the caller - confirm. TMP is not referenced by the visible code (the short-input path addresses [rsp] directly).
  61 %define TMP 16*0
  62 %ifidn __OUTPUT_FORMAT__, win64
  63         %define XMM_SAVE 16*2
  64         %define VARIABLE_OFFSET 16*10+8
  65 %else
  66         %define VARIABLE_OFFSET 16*2+8
  67 %endif
  68 ; crc64_ecma_refl_by8 entry point. arg1 = init_crc, arg2 = buf, arg3 = len (see the API comment); the CRC is returned in rax.
  69 ; Lengths of 256 bytes or more fall through to the 128-byte folding path; shorter lengths branch to _less_than_256.
  70 align 16
  71 global crc64_ecma_refl_by8:function
  72 crc64_ecma_refl_by8:
  73         ; uint64_t c = crc ^ 0xffffffffffffffffL;
  74         not arg1                        ; arg1 now holds the inverted initial CRC
  75         sub     rsp, VARIABLE_OFFSET    ; reserve the stack frame (scratch slot and, on win64, the xmm save area)
  76
  77 %ifidn __OUTPUT_FORMAT__, win64
  78         ; save xmm6-xmm13 on the stack; the epilogue after _cleanup restores them before returning
  79         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
  80         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
  81         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
  82         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
  83         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
  84         movdqa  [rsp + XMM_SAVE + 16*5], xmm11     ; NOTE(review): xmm11 is saved/restored but never written by the visible code
  85         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
  86         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
  87 %endif
  88
  89         ; check if smaller than 256B
  90         cmp     arg3, 256
  91
  92         ; for sizes less than 256, we can't fold 128B at a time...
  93         jl      _less_than_256          ; signed compare: arg3 is treated as a signed 64-bit length here
  94
  95
  96         ; load the initial crc value
  97         movq    xmm10, arg1      ; inverted initial crc in the low 64 bits of xmm10, upper 64 bits zero
  98       ; load the first 128B of data into xmm0-xmm7; the initial crc value is XORed in below
  99         movdqu  xmm0, [arg2+16*0]
 100         movdqu  xmm1, [arg2+16*1]
 101         movdqu  xmm2, [arg2+16*2]
 102         movdqu  xmm3, [arg2+16*3]
 103         movdqu  xmm4, [arg2+16*4]
 104         movdqu  xmm5, [arg2+16*5]
 105         movdqu  xmm6, [arg2+16*6]
 106         movdqu  xmm7, [arg2+16*7]
 107
 108         ; XOR the initial_crc value into the low 8 bytes of the first 16 data bytes
 109         pxor    xmm0, xmm10
 110         movdqa  xmm10, [rk3]    ;xmm10 has rk3 (low qword) and rk4 (high qword)
 111                                         ;the imm value of each pclmulqdq instruction determines which of the two constants is used
 112         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 113         ; we subtract 256 instead of 128 to save one instruction from the loop:
 114         sub     arg3, 256               ; the loop's own `sub arg3, 128` then sets the flags its `jge` tests
 115
 116         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 117         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 118 ; Per 16-byte accumulator (two are handled per group below): acc = clmul(acc.low, xmm10.high) ^ clmul(acc.high, xmm10.low) ^ next 16 data bytes.
 119 ; xmm8/xmm13 hold copies of the accumulators for the second product; xmm9/xmm12 hold the newly loaded data. xmm10 = rk3:rk4 for the whole loop.
 120         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 121 _fold_128_B_loop:
 122
 123         ; update the buffer pointer
 124         add     arg2, 128
 125
 126         prefetchnta [arg2+fetch_dist+0]         ; prefetch hint fetch_dist bytes ahead of the data loaded in this iteration
 127         movdqu  xmm9, [arg2+16*0]               ; next data for xmm0
 128         movdqu  xmm12, [arg2+16*1]              ; next data for xmm1
 129         movdqa  xmm8, xmm0
 130         movdqa  xmm13, xmm1
 131         pclmulqdq       xmm0, xmm10, 0x10       ; imm 0x10: low qword of xmm0 times high qword of xmm10
 132         pclmulqdq       xmm8, xmm10 , 0x1       ; imm 0x01: high qword of the xmm0 copy times low qword of xmm10
 133         pclmulqdq       xmm1, xmm10, 0x10
 134         pclmulqdq       xmm13, xmm10 , 0x1
 135         pxor    xmm0, xmm9
 136         xorps   xmm0, xmm8                      ; xorps is a bitwise XOR like pxor; NOTE(review): the pxor/xorps mix is presumably a scheduling choice - confirm
 137         pxor    xmm1, xmm12
 138         xorps   xmm1, xmm13
 139
 140         prefetchnta [arg2+fetch_dist+32]
 141         movdqu  xmm9, [arg2+16*2]               ; same fold for xmm2 / xmm3
 142         movdqu  xmm12, [arg2+16*3]
 143         movdqa  xmm8, xmm2
 144         movdqa  xmm13, xmm3
 145         pclmulqdq       xmm2, xmm10, 0x10
 146         pclmulqdq       xmm8, xmm10 , 0x1
 147         pclmulqdq       xmm3, xmm10, 0x10
 148         pclmulqdq       xmm13, xmm10 , 0x1
 149         pxor    xmm2, xmm9
 150         xorps   xmm2, xmm8
 151         pxor    xmm3, xmm12
 152         xorps   xmm3, xmm13
 153
 154         prefetchnta [arg2+fetch_dist+64]
 155         movdqu  xmm9, [arg2+16*4]               ; same fold for xmm4 / xmm5
 156         movdqu  xmm12, [arg2+16*5]
 157         movdqa  xmm8, xmm4
 158         movdqa  xmm13, xmm5
 159         pclmulqdq       xmm4, xmm10, 0x10
 160         pclmulqdq       xmm8, xmm10 , 0x1
 161         pclmulqdq       xmm5, xmm10, 0x10
 162         pclmulqdq       xmm13, xmm10 , 0x1
 163         pxor    xmm4, xmm9
 164         xorps   xmm4, xmm8
 165         pxor    xmm5, xmm12
 166         xorps   xmm5, xmm13
 167
 168         prefetchnta [arg2+fetch_dist+96]
 169         movdqu  xmm9, [arg2+16*6]               ; same fold for xmm6 / xmm7
 170         movdqu  xmm12, [arg2+16*7]
 171         movdqa  xmm8, xmm6
 172         movdqa  xmm13, xmm7
 173         pclmulqdq       xmm6, xmm10, 0x10
 174         pclmulqdq       xmm8, xmm10 , 0x1
 175         pclmulqdq       xmm7, xmm10, 0x10
 176         pclmulqdq       xmm13, xmm10 , 0x1
 177         pxor    xmm6, xmm9
 178         xorps   xmm6, xmm8
 179         pxor    xmm7, xmm12
 180         xorps   xmm7, xmm13
 181
 182         sub     arg3, 128
 183
 184         ; check if there is another 128B in the buffer to be able to fold (signed test on the biased counter)
 185         jge     _fold_128_B_loop
 186         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 187
 188         add     arg2, 128
 189         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 190         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 191         ; arg3 is negative here and equals y-128 because of the extra 128 subtracted before the loop
 192
 193         ; fold the 8 xmm registers to 1 xmm register with different constants
 194         ; xmm0 to xmm7
 195         movdqa  xmm10, [rk9]                    ; xmm10 = rk9 (low qword) : rk10 (high qword)
 196         movdqa  xmm8, xmm0
 197         pclmulqdq       xmm0, xmm10, 0x1        ; high qword of xmm0 times low qword of xmm10
 198         pclmulqdq       xmm8, xmm10, 0x10       ; low qword of the xmm0 copy times high qword of xmm10
 199         pxor    xmm7, xmm8
 200         xorps   xmm7, xmm0
 201         ;xmm1 to xmm7
 202         movdqa  xmm10, [rk11]
 203         movdqa  xmm8, xmm1
 204         pclmulqdq       xmm1, xmm10, 0x1
 205         pclmulqdq       xmm8, xmm10, 0x10
 206         pxor    xmm7, xmm8
 207         xorps   xmm7, xmm1
 208         ; xmm2 to xmm7
 209         movdqa  xmm10, [rk13]
 210         movdqa  xmm8, xmm2
 211         pclmulqdq       xmm2, xmm10, 0x1
 212         pclmulqdq       xmm8, xmm10, 0x10
 213         pxor    xmm7, xmm8
 214         pxor    xmm7, xmm2
 215         ; xmm3 to xmm7
 216         movdqa  xmm10, [rk15]
 217         movdqa  xmm8, xmm3
 218         pclmulqdq       xmm3, xmm10, 0x1
 219         pclmulqdq       xmm8, xmm10, 0x10
 220         pxor    xmm7, xmm8
 221         xorps   xmm7, xmm3
 222         ; xmm4 to xmm7
 223         movdqa  xmm10, [rk17]
 224         movdqa  xmm8, xmm4
 225         pclmulqdq       xmm4, xmm10, 0x1
 226         pclmulqdq       xmm8, xmm10, 0x10
 227         pxor    xmm7, xmm8
 228         pxor    xmm7, xmm4
 229         ; xmm5 to xmm7
 230         movdqa  xmm10, [rk19]
 231         movdqa  xmm8, xmm5
 232         pclmulqdq       xmm5, xmm10, 0x1
 233         pclmulqdq       xmm8, xmm10, 0x10
 234         pxor    xmm7, xmm8
 235         xorps   xmm7, xmm5
 236         ; xmm6 to xmm7
 237         movdqa  xmm10, [rk1]                    ; xmm10 stays rk1:rk2 from here on; the 16B loop and the tail fold below reuse it
 238         movdqa  xmm8, xmm6
 239         pclmulqdq       xmm6, xmm10, 0x1
 240         pclmulqdq       xmm8, xmm10, 0x10
 241         pxor    xmm7, xmm8
 242         pxor    xmm7, xmm6
 243
 244
 245         ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
 246         ; instead of a cmp instruction, we use the flags set by the add with the jl instruction (signed less-than): arg3 becomes y-16
 247         add     arg3, 128-16
 248         jl      _final_reduction_for_128        ; fewer than 16 bytes remain in memory
 249
 250         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 251         ; we can fold 16 bytes at a time if y>=16
 252         ; continue folding 16B at a time
 253 ; Also entered by a jmp from _less_than_256. On entry: xmm7 = 16-byte accumulator, xmm10 = rk1:rk2, arg2 = next unread byte, arg3 = unread bytes minus 16.
 254 _16B_reduction_loop:
 255         movdqa  xmm8, xmm7
 256         pclmulqdq       xmm7, xmm10, 0x1        ; high qword of xmm7 times low qword of xmm10
 257         pclmulqdq       xmm8, xmm10, 0x10       ; low qword of the xmm7 copy times high qword of xmm10
 258         pxor    xmm7, xmm8
 259         movdqu  xmm0, [arg2]                    ; next 16 data bytes
 260         pxor    xmm7, xmm0
 261         add     arg2, 16
 262         sub     arg3, 16
 263         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 264         ; equivalent of: cmp arg3, 16-16
 265         ; check if there is any more 16B in the buffer to be able to fold
 266         jge     _16B_reduction_loop
 267
 268         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 269         ;first, we reduce the data in the xmm7 register
 270 ; On entry arg3 = z-16 (negative); xmm7 = 16-byte accumulator; xmm10 = rk1:rk2.
 271
 272 _final_reduction_for_128:
 273         add arg3, 16            ; arg3 = z, the number of bytes still unread in memory
 274         je _128_done            ; z == 0: nothing left to merge
 275   ; here we are getting data that is less than 16 bytes.
 276         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 277         ; after that the registers need to be adjusted.
 278 _get_last_two_xmms:
 279 ; Also entered by a jmp from _less_than_32 with xmm7 = first 16 bytes ^ crc, xmm10 = rk1:rk2 and arg3 = z (1..15).
 280
 281         movdqa xmm2, xmm7
 282         movdqu xmm1, [arg2 - 16 + arg3]         ; the last 16 bytes of the buffer; only its top z bytes are new data
 283
 284         ; get rid of the extra data that was loaded before
 285         ; load the shift constant: 16 bytes of pshufb_shf_table starting at byte offset z
 286         lea     rax, [pshufb_shf_table]
 287         add     rax, arg3
 288         movdqu  xmm0, [rax]
 289
 290
 291         pshufb  xmm7, xmm0              ; xmm7 = its low z bytes moved to the top z byte positions, remaining bytes zero
 292         pxor    xmm0, [mask3]           ; flip bit 7 of every selector byte, so the complementary byte positions are zeroed
 293         pshufb  xmm2, xmm0              ; xmm2 = accumulator bytes z..15 moved down to positions 0..15-z, top z bytes zero
 294
 295         pblendvb        xmm2, xmm1     ;xmm0 is implicit: the top z bytes of xmm2 are taken from xmm1 (the last z input bytes)
 296         ;;;;;;;;;;
 297         movdqa  xmm8, xmm7
 298         pclmulqdq       xmm7, xmm10, 0x1        ; fold the shifted-out part with rk1:rk2, same pattern as the 16B loop
 299
 300         pclmulqdq       xmm8, xmm10, 0x10
 301         pxor    xmm7, xmm8
 302         pxor    xmm7, xmm2              ; combine with the merged final 16 bytes
 303
 304 _128_done:
 305         ; compute crc of a 128-bit value held in xmm7
 306         movdqa  xmm10, [rk5]            ; xmm10 = rk5 (low qword) : rk6 (high qword); only the low qword is used below
 307         movdqa  xmm0, xmm7
 308
 309         ;64b fold: xmm7 = clmul(xmm7.low, rk5) ^ (xmm7 >> 64)
 310         pclmulqdq       xmm7, xmm10, 0  ; imm 0: low qword times low qword
 311         psrldq  xmm0, 8                 ; byte shift: moves the high qword of the copy into the low qword
 312         pxor    xmm7, xmm0
 313
 314         ;barrett reduction (also entered directly from _end_1to7, skipping the 64b fold)
 315 _barrett:
 316         movdqa  xmm1, xmm7              ; keep the input for the final XOR
 317         movdqa  xmm10, [rk7]            ; xmm10 = rk7 (low qword) : rk8 (high qword)
 318
 319         pclmulqdq       xmm7, xmm10, 0          ; xmm7 = clmul(xmm7.low, rk7)
 320         movdqa  xmm2, xmm7
 321         pclmulqdq       xmm7, xmm10, 0x10       ; xmm7 = clmul(xmm7.low, rk8)
 322         pslldq  xmm2, 8                         ; low qword of the first product moved into the high qword
 323         pxor    xmm7, xmm2
 324         pxor    xmm7, xmm1
 325         pextrq  rax, xmm7, 1            ; the result is the high qword of xmm7
 326
 327 _cleanup:
 328         ; return c ^ 0xffffffffffffffffL;  (also reached from _less_than_32 with rax = inverted init_crc when len == 0)
 329         not     rax
 330
 331
 332 %ifidn __OUTPUT_FORMAT__, win64
 333         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]   ; restore the xmm registers saved by the prologue
 334         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 335         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 336         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 337         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 338         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 339         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 340         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 341 %endif
 342         add     rsp, VARIABLE_OFFSET    ; release the frame reserved at entry
 343         ret
 344
 345 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 346 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 347 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 348 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 349
 350 align 16
 351 _less_than_256:
 352 ; Reached from the entry point when len < 256 (signed compare). arg1 = inverted init_crc, arg2 = buf, arg3 = len.
 353         ; check if there is enough buffer to be able to fold 16B at a time
 354         cmp     arg3, 32
 355         jl      _less_than_32
 356
 357         ; if there is, load the constants
 358         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 359
 360         movq    xmm0, arg1       ; get the (inverted) initial crc value in the low 64 bits, upper 64 bits zero
 361         movdqu  xmm7, [arg2]            ; load the first 16 data bytes
 362         pxor    xmm7, xmm0              ; xor the initial crc value
 363
 364         ; update the buffer pointer
 365         add     arg2, 16
 366
 367         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 368         sub     arg3, 32
 369
 370         jmp     _16B_reduction_loop
 371
 372 align 16
 373 _less_than_32:
 374         ; mov initial crc to the return value. this is necessary for zero-length buffers: _cleanup inverts rax again, so init_crc is returned unchanged.
 375         mov     rax, arg1
 376         test    arg3, arg3
 377         je      _cleanup
 378
 379         movq    xmm0, arg1       ; get the (inverted) initial crc value
 380
 381         cmp     arg3, 16
 382         je      _exact_16_left
 383         jl      _less_than_16_left      ; reuses the flags of the cmp above (je does not modify them)
 384         ; 17..31 bytes: take the first 16 bytes as the accumulator and merge the remaining 1..15 bytes in _get_last_two_xmms
 385         movdqu  xmm7, [arg2]            ; load the first 16 data bytes
 386         pxor    xmm7, xmm0              ; xor the initial crc value
 387         add     arg2, 16
 388         sub     arg3, 16                ; arg3 = number of trailing bytes (1..15)
 389         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 390         jmp     _get_last_two_xmms
 391
 392
 393 align 16
 394 _less_than_16_left:
 395         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 396         ; the data is copied in 8/4/2/1-byte pieces so that exactly arg3 bytes are read from the buffer
 397         pxor    xmm1, xmm1
 398         mov     r11, rsp                ; r11 = write cursor into the 16-byte scratch slot at [rsp]
 399         movdqa  [r11], xmm1
 400
 401         ;       backup the counter value (r9 keeps the original length, 1..15)
 402         mov     r9, arg3
 403         cmp     arg3, 8
 404         jl      _less_than_8_left
 405
 406         ; load 8 Bytes
 407         mov     rax, [arg2]
 408         mov     [r11], rax
 409         add     r11, 8
 410         sub     arg3, 8
 411         add     arg2, 8
 412 _less_than_8_left:
 413
 414         cmp     arg3, 4
 415         jl      _less_than_4_left
 416
 417         ; load 4 Bytes
 418         mov     eax, [arg2]
 419         mov     [r11], eax
 420         add     r11, 4
 421         sub     arg3, 4
 422         add     arg2, 4
 423 _less_than_4_left:
 424
 425         cmp     arg3, 2
 426         jl      _less_than_2_left
 427
 428         ; load 2 Bytes
 429         mov     ax, [arg2]
 430         mov     [r11], ax
 431         add     r11, 2
 432         sub     arg3, 2
 433         add     arg2, 2
 434 _less_than_2_left:
 435         cmp     arg3, 1
 436         jl      _zero_left
 437
 438         ; load 1 Byte
 439         mov     al, [arg2]
 440         mov     [r11], al
 441
 442 _zero_left:
 443         movdqa  xmm7, [rsp]     ; the copied data, zero-padded to 16 bytes
 444         pxor    xmm7, xmm0      ; xor the initial crc value
 445
 446         lea rax,[pshufb_shf_table]
 447
 448         cmp     r9, 8
 449         jl      _end_1to7
 450
 451 _end_8to15:
 452         movdqu  xmm0, [rax + r9]        ; selector that shifts xmm7 left by (16-length) bytes, filling with zeros
 453         pshufb  xmm7,xmm0
 454         jmp     _128_done
 455
 456 _end_1to7:
 457         ; Left shift (8-length) bytes in XMM, then go straight to the barrett reduction (the 64b fold is skipped)
 458         movdqu  xmm0, [rax + r9 + 8]
 459         pshufb  xmm7,xmm0
 460
 461         jmp     _barrett
 462
 463 align 16
 464 _exact_16_left:
 465         movdqu  xmm7, [arg2]    ; exactly 16 bytes: the whole buffer is the accumulator
 466         pxor    xmm7, xmm0      ; xor the initial crc value
 467
 468         jmp     _128_done       ; no folding needed, go to the final 128-bit reduction
 469
 470 section .data   ; NOTE(review): nothing in the visible code writes these tables; a read-only section may be preferable - confirm against project conventions
 471
 472 ; precomputed constants; the code loads them as 16-byte pairs (rkN, rkN+1) with movdqa, hence the 16-byte alignment
 473 align 16
 474 ; rk7 = floor(2^128/Q)
 475 ; rk8 = Q
 476 rk1 :           ; rk1:rk2 - 16-byte fold steps (_16B_reduction_loop, tail merge) and the last step of the 8-to-1 fold
 477 DQ 0xdabe95afc7875f40
 478 rk2 :
 479 DQ 0xe05dd497ca393ae4
 480 rk3 :           ; rk3:rk4 - 128-byte fold loop
 481 DQ 0xd7d86b2af73de740
 482 rk4 :
 483 DQ 0x8757d71d4fcc1000
 484 rk5 :           ; rk5:rk6 - 128-bit to 64-bit fold in _128_done (only rk5 is multiplied)
 485 DQ 0xdabe95afc7875f40
 486 rk6 :
 487 DQ 0x0000000000000000
 488 rk7 :           ; rk7:rk8 - barrett reduction
 489 DQ 0x9c3e466c172963d5
 490 rk8 :
 491 DQ 0x92d8af2baf0e1e84
 492 rk9 :           ; rk9..rk20 - loaded pairwise when folding xmm0..xmm5 into xmm7 after the 128-byte loop
 493 DQ 0x947874de595052cb
 494 rk10 :
 495 DQ 0x9e735cb59b4724da
 496 rk11 :
 497 DQ 0xe4ce2cd55fea0037
 498 rk12 :
 499 DQ 0x2fe3fd2920ce82ec
 500 rk13 :
 501 DQ 0xe31d519421a63a5
 502 rk14 :
 503 DQ 0x2e30203212cac325
 504 rk15 :
 505 DQ 0x81f6054a7842df4
 506 rk16 :
 507 DQ 0x6ae3efbb9dd441f3
 508 rk17 :
 509 DQ 0x69a35d91c3730254
 510 rk18 :
 511 DQ 0xb5ea1af9c013aca4
 512 rk19 :
 513 DQ 0x3be653a30fe1af51
 514 rk20 :
 515 DQ 0x60095b008a9efa44
 516
 517
 518 pshufb_shf_table:
 519 ; use these values for shift constants for the pshufb instruction
 520 ; a 16-byte load at byte offset k of the 32-byte table below yields the selector shown for "shr k":
 521 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 522 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 523 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 524 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 525 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 526 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 527 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 528 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 529 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 530 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 531 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 532 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 533 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 534 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 535 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 536 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 537 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 538
 539 ; mask and mask2 are not referenced by the visible code; mask3 is XORed into a pshufb selector to flip bit 7 of every byte
 540 mask:
 541 dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 542 mask2:
 543 dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 544 mask3:
 545 dq     0x8080808080808080, 0x8080808080808080
 546
 547 ;;;       func        core, ver, snum
 548 slversion crc64_ecma_refl_by8, 01,   00,  001d