ceph/src/isa-l/crc/crc64_jones_norm_by8.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;       Function API:
  31 ;       uint64_t crc64_jones_norm_by8(
  32 ;               uint64_t init_crc, //initial CRC value, 64 bits
  33 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  34 ;               uint64_t len //buffer length in bytes (64-bit data)
  35 ;       );
  36 ;
  37 %include "reg_sizes.asm"
  38
  39 %define fetch_dist      1024
  40
  41 [bits 64]
  42 default rel
  43
  44 section .text
  45
  46 %ifidn __OUTPUT_FORMAT__, win64
  47         %xdefine        arg1 rcx
  48         %xdefine        arg2 rdx
  49         %xdefine        arg3 r8
  50 %else
  51         %xdefine        arg1 rdi
  52         %xdefine        arg2 rsi
  53         %xdefine        arg3 rdx
  54 %endif
  55
  56 %define TMP 16*0
  57 %ifidn __OUTPUT_FORMAT__, win64
  58         %define XMM_SAVE 16*2
  59         %define VARIABLE_OFFSET 16*10+8
  60 %else
  61         %define VARIABLE_OFFSET 16*2+8
  62 %endif
  63 align 16
  64 global  crc64_jones_norm_by8:function
  65 crc64_jones_norm_by8:
  66
  67         not     arg1      ;~init_crc
  68
  69         sub     rsp,VARIABLE_OFFSET
  70
  71 %ifidn __OUTPUT_FORMAT__, win64
  72         ; push the xmm registers into the stack to maintain
  73         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
  74         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
  75         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
  76         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
  77         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
  78         movdqa  [rsp + XMM_SAVE + 16*5], xmm11
  79         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
  80         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
  81 %endif
  82
  83
  84         ; check if smaller than 256
  85         cmp     arg3, 256
  86
  87         ; for sizes less than 256, we can't fold 128B at a time...
  88         jl      _less_than_256
  89
  90
  91         ; load the initial crc value
  92         movq    xmm10, arg1     ; initial crc
  93
  94         ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
  95         ; because data will be byte-reflected and will align with initial crc at correct place.
  96         pslldq  xmm10, 8
  97
  98         movdqa xmm11, [SHUF_MASK]
  99         ; receive the initial 128B data, xor the initial crc value
 100         movdqu  xmm0, [arg2+16*0]
 101         movdqu  xmm1, [arg2+16*1]
 102         movdqu  xmm2, [arg2+16*2]
 103         movdqu  xmm3, [arg2+16*3]
 104         movdqu  xmm4, [arg2+16*4]
 105         movdqu  xmm5, [arg2+16*5]
 106         movdqu  xmm6, [arg2+16*6]
 107         movdqu  xmm7, [arg2+16*7]
 108
 109         pshufb  xmm0, xmm11
 110         ; XOR the initial_crc value
 111         pxor    xmm0, xmm10
 112         pshufb  xmm1, xmm11
 113         pshufb  xmm2, xmm11
 114         pshufb  xmm3, xmm11
 115         pshufb  xmm4, xmm11
 116         pshufb  xmm5, xmm11
 117         pshufb  xmm6, xmm11
 118         pshufb  xmm7, xmm11
 119
 120         movdqa  xmm10, [rk3]    ;xmm10 has rk3 and rk4
 121                                 ;imm value of pclmulqdq instruction will determine which constant to use
 122         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 123         ; we subtract 256 instead of 128 to save one instruction from the loop
 124         sub     arg3, 256
 125
 126         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 127         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 128
 129
 130         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 131 _fold_128_B_loop:
 132
 133         ; update the buffer pointer
 134         add     arg2, 128               ;    buf += 128;
 135
 136         prefetchnta [arg2+fetch_dist+0]
 137         movdqu  xmm9, [arg2+16*0]
 138         movdqu  xmm12, [arg2+16*1]
 139         pshufb  xmm9, xmm11
 140         pshufb  xmm12, xmm11
 141         movdqa  xmm8, xmm0
 142         movdqa  xmm13, xmm1
 143         pclmulqdq       xmm0, xmm10, 0x0
 144         pclmulqdq       xmm8, xmm10 , 0x11
 145         pclmulqdq       xmm1, xmm10, 0x0
 146         pclmulqdq       xmm13, xmm10 , 0x11
 147         pxor    xmm0, xmm9
 148         xorps   xmm0, xmm8
 149         pxor    xmm1, xmm12
 150         xorps   xmm1, xmm13
 151
 152         prefetchnta [arg2+fetch_dist+32]
 153         movdqu  xmm9, [arg2+16*2]
 154         movdqu  xmm12, [arg2+16*3]
 155         pshufb  xmm9, xmm11
 156         pshufb  xmm12, xmm11
 157         movdqa  xmm8, xmm2
 158         movdqa  xmm13, xmm3
 159         pclmulqdq       xmm2, xmm10, 0x0
 160         pclmulqdq       xmm8, xmm10 , 0x11
 161         pclmulqdq       xmm3, xmm10, 0x0
 162         pclmulqdq       xmm13, xmm10 , 0x11
 163         pxor    xmm2, xmm9
 164         xorps   xmm2, xmm8
 165         pxor    xmm3, xmm12
 166         xorps   xmm3, xmm13
 167
 168         prefetchnta [arg2+fetch_dist+64]
 169         movdqu  xmm9, [arg2+16*4]
 170         movdqu  xmm12, [arg2+16*5]
 171         pshufb  xmm9, xmm11
 172         pshufb  xmm12, xmm11
 173         movdqa  xmm8, xmm4
 174         movdqa  xmm13, xmm5
 175         pclmulqdq       xmm4, xmm10, 0x0
 176         pclmulqdq       xmm8, xmm10 , 0x11
 177         pclmulqdq       xmm5, xmm10, 0x0
 178         pclmulqdq       xmm13, xmm10 , 0x11
 179         pxor    xmm4, xmm9
 180         xorps   xmm4, xmm8
 181         pxor    xmm5, xmm12
 182         xorps   xmm5, xmm13
 183
 184         prefetchnta [arg2+fetch_dist+96]
 185         movdqu  xmm9, [arg2+16*6]
 186         movdqu  xmm12, [arg2+16*7]
 187         pshufb  xmm9, xmm11
 188         pshufb  xmm12, xmm11
 189         movdqa  xmm8, xmm6
 190         movdqa  xmm13, xmm7
 191         pclmulqdq       xmm6, xmm10, 0x0
 192         pclmulqdq       xmm8, xmm10 , 0x11
 193         pclmulqdq       xmm7, xmm10, 0x0
 194         pclmulqdq       xmm13, xmm10 , 0x11
 195         pxor    xmm6, xmm9
 196         xorps   xmm6, xmm8
 197         pxor    xmm7, xmm12
 198         xorps   xmm7, xmm13
 199
 200         sub     arg3, 128
 201
 202         ; check if there is another 128B in the buffer to be able to fold
 203         jge     _fold_128_B_loop
 204         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 205
 206         add     arg2, 128
 207         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 208         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 209
 210
 211         ; fold the 8 xmm registers to 1 xmm register with different constants
 212
 213         movdqa  xmm10, [rk9]
 214         movdqa  xmm8, xmm0
 215         pclmulqdq       xmm0, xmm10, 0x11
 216         pclmulqdq       xmm8, xmm10, 0x0
 217         pxor    xmm7, xmm8
 218         xorps   xmm7, xmm0
 219
 220         movdqa  xmm10, [rk11]
 221         movdqa  xmm8, xmm1
 222         pclmulqdq       xmm1, xmm10, 0x11
 223         pclmulqdq       xmm8, xmm10, 0x0
 224         pxor    xmm7, xmm8
 225         xorps   xmm7, xmm1
 226
 227         movdqa  xmm10, [rk13]
 228         movdqa  xmm8, xmm2
 229         pclmulqdq       xmm2, xmm10, 0x11
 230         pclmulqdq       xmm8, xmm10, 0x0
 231         pxor    xmm7, xmm8
 232         pxor    xmm7, xmm2
 233
 234         movdqa  xmm10, [rk15]
 235         movdqa  xmm8, xmm3
 236         pclmulqdq       xmm3, xmm10, 0x11
 237         pclmulqdq       xmm8, xmm10, 0x0
 238         pxor    xmm7, xmm8
 239         xorps   xmm7, xmm3
 240
 241         movdqa  xmm10, [rk17]
 242         movdqa  xmm8, xmm4
 243         pclmulqdq       xmm4, xmm10, 0x11
 244         pclmulqdq       xmm8, xmm10, 0x0
 245         pxor    xmm7, xmm8
 246         pxor    xmm7, xmm4
 247
 248         movdqa  xmm10, [rk19]
 249         movdqa  xmm8, xmm5
 250         pclmulqdq       xmm5, xmm10, 0x11
 251         pclmulqdq       xmm8, xmm10, 0x0
 252         pxor    xmm7, xmm8
 253         xorps   xmm7, xmm5
 254
 255         movdqa  xmm10, [rk1]    ;xmm10 has rk1 and rk2
 256
 257         movdqa  xmm8, xmm6
 258         pclmulqdq       xmm6, xmm10, 0x11
 259         pclmulqdq       xmm8, xmm10, 0x0
 260         pxor    xmm7, xmm8
 261         pxor    xmm7, xmm6
 262
 263
 264         ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
 265         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 266         add     arg3, 128-16
 267         jl      _final_reduction_for_128
 268
 269         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 270         ; we can fold 16 bytes at a time if y>=16
 271         ; continue folding 16B at a time
 272
 273 _16B_reduction_loop:
 274         movdqa  xmm8, xmm7
 275         pclmulqdq       xmm7, xmm10, 0x11
 276         pclmulqdq       xmm8, xmm10, 0x0
 277         pxor    xmm7, xmm8
 278         movdqu  xmm0, [arg2]
 279         pshufb  xmm0, xmm11
 280         pxor    xmm7, xmm0
 281         add     arg2, 16
 282         sub     arg3, 16
 283         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 284         ; equivalent of: cmp arg3, 16-16
 285         ; check if there is any more 16B in the buffer to be able to fold
 286         jge     _16B_reduction_loop
 287
 288         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 289         ;first, we reduce the data in the xmm7 register
 290
 291
 292 _final_reduction_for_128:
 293         ; check if any more data to fold. If not, compute the CRC of the final 128 bits
 294         add     arg3, 16
 295         je      _128_done
 296
 297         ; here we are getting data that is less than 16 bytes.
 298         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 299         ; after that the registers need to be adjusted.
 300 _get_last_two_xmms:
 301         movdqa  xmm2, xmm7
 302
 303         movdqu  xmm1, [arg2 - 16 + arg3]
 304         pshufb  xmm1, xmm11
 305
 306         ; get rid of the extra data that was loaded before
 307         ; load the shift constant
 308         lea     rax, [pshufb_shf_table + 16]
 309         sub     rax, arg3
 310         movdqu  xmm0, [rax]
 311
 312         ; shift xmm2 to the left by arg3 bytes
 313         pshufb  xmm2, xmm0
 314
 315         ; shift xmm7 to the right by 16-arg3 bytes
 316         pxor    xmm0, [mask1]
 317         pshufb  xmm7, xmm0
 318         pblendvb        xmm1, xmm2      ;xmm0 is implicit
 319
 320         ; fold 16 Bytes
 321         movdqa  xmm2, xmm1
 322         movdqa  xmm8, xmm7
 323         pclmulqdq       xmm7, xmm10, 0x11
 324         pclmulqdq       xmm8, xmm10, 0x0
 325         pxor    xmm7, xmm8
 326         pxor    xmm7, xmm2
 327
 328 _128_done:
 329         ; compute crc of a 128-bit value
 330         movdqa  xmm10, [rk5]    ; rk5 and rk6 in xmm10
 331         movdqa  xmm0, xmm7
 332
 333         ;64b fold
 334         pclmulqdq       xmm7, xmm10, 0x01       ; H*L
 335         pslldq  xmm0, 8
 336         pxor    xmm7, xmm0
 337
 338         ;barrett reduction
 339 _barrett:
 340         movdqa  xmm10, [rk7]    ; rk7 and rk8 in xmm10
 341         movdqa  xmm0, xmm7
 342
 343         movdqa  xmm1, xmm7
 344         pand    xmm1, [mask3]
 345         pclmulqdq       xmm7, xmm10, 0x01
 346         pxor    xmm7, xmm1
 347
 348         pclmulqdq       xmm7, xmm10, 0x11
 349         pxor    xmm7, xmm0
 350         pextrq  rax, xmm7, 0
 351
 352 _cleanup:
 353         not     rax
 354 %ifidn __OUTPUT_FORMAT__, win64
 355         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
 356         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 357         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 358         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 359         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 360         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 361         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 362         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 363 %endif
 364         add     rsp, VARIABLE_OFFSET
 365         ret
 366
 367 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 368 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 369 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 370 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 371
 372 align 16
 373 _less_than_256:
 374
 375         ; check if there is enough buffer to be able to fold 16B at a time
 376         cmp     arg3, 32
 377         jl      _less_than_32
 378         movdqa xmm11, [SHUF_MASK]
 379
 380         ; if there is, load the constants
 381         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 382
 383         movq    xmm0, arg1      ; get the initial crc value
 384         pslldq  xmm0, 8 ; align it to its correct place
 385         movdqu  xmm7, [arg2]    ; load the plaintext
 386         pshufb  xmm7, xmm11     ; byte-reflect the plaintext
 387         pxor    xmm7, xmm0
 388
 389
 390         ; update the buffer pointer
 391         add     arg2, 16
 392
 393         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 394         sub     arg3, 32
 395
 396         jmp     _16B_reduction_loop
 397 align 16
 398 _less_than_32:
 399         ; mov initial crc to the return value. this is necessary for zero-length buffers.
 400         mov     rax, arg1
 401         test    arg3, arg3
 402         je      _cleanup
 403
 404         movdqa xmm11, [SHUF_MASK]
 405
 406         movq    xmm0, arg1      ; get the initial crc value
 407         pslldq  xmm0, 8 ; align it to its correct place
 408
 409         cmp     arg3, 16
 410         je      _exact_16_left
 411         jl      _less_than_16_left
 412
 413         movdqu  xmm7, [arg2]    ; load the plaintext
 414         pshufb  xmm7, xmm11     ; byte-reflect the plaintext
 415         pxor    xmm7, xmm0      ; xor the initial crc value
 416         add     arg2, 16
 417         sub     arg3, 16
 418         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 419         jmp     _get_last_two_xmms
 420 align 16
 421 _less_than_16_left:
 422         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 423         pxor    xmm1, xmm1
 424         mov     r11, rsp
 425         movdqa  [r11], xmm1
 426
 427         ;       backup the counter value
 428         mov     r9, arg3
 429         cmp     arg3, 8
 430         jl      _less_than_8_left
 431
 432         ; load 8 Bytes
 433         mov     rax, [arg2]
 434         mov     [r11], rax
 435         add     r11, 8
 436         sub     arg3, 8
 437         add     arg2, 8
 438 _less_than_8_left:
 439
 440         cmp     arg3, 4
 441         jl      _less_than_4_left
 442
 443         ; load 4 Bytes
 444         mov     eax, [arg2]
 445         mov     [r11], eax
 446         add     r11, 4
 447         sub     arg3, 4
 448         add     arg2, 4
 449 _less_than_4_left:
 450
 451         cmp     arg3, 2
 452         jl      _less_than_2_left
 453
 454         ; load 2 Bytes
 455         mov     ax, [arg2]
 456         mov     [r11], ax
 457         add     r11, 2
 458         sub     arg3, 2
 459         add     arg2, 2
 460 _less_than_2_left:
 461         cmp     arg3, 1
 462         jl      _zero_left
 463
 464         ; load 1 Byte
 465         mov     al, [arg2]
 466         mov     [r11], al
 467 _zero_left:
 468         movdqa  xmm7, [rsp]
 469         pshufb  xmm7, xmm11
 470         pxor    xmm7, xmm0      ; xor the initial crc value
 471
 472         ; shl r9, 4
 473         lea     rax, [pshufb_shf_table + 16]
 474         sub     rax, r9
 475
 476         cmp     r9, 8
 477         jl      _end_1to7
 478
 479 _end_8to15:
 480         movdqu  xmm0, [rax]
 481         pxor    xmm0, [mask1]
 482
 483         pshufb  xmm7, xmm0
 484         jmp     _128_done
 485
 486 _end_1to7:
 487         ; Right shift (8-length) bytes in XMM
 488         add     rax, 8
 489         movdqu  xmm0, [rax]
 490         pshufb  xmm7,xmm0
 491
 492         jmp     _barrett
 493 align 16
 494 _exact_16_left:
 495         movdqu  xmm7, [arg2]
 496         pshufb  xmm7, xmm11
 497         pxor    xmm7, xmm0      ; xor the initial crc value
 498
 499         jmp     _128_done
 500
 501 section .data
 502
 503 ; precomputed constants
 504 align 16
 505
 506 rk1:
 507 DQ 0x4445ed2750017038
 508 rk2:
 509 DQ 0x698b74157cfbd736
 510 rk3:
 511 DQ 0x0cfcfb5101c4b775
 512 rk4:
 513 DQ 0x65403fd47cbec866
 514 rk5:
 515 DQ 0x4445ed2750017038
 516 rk6:
 517 DQ 0x0000000000000000
 518 rk7:
 519 DQ 0xddf3eeb298be6cf8
 520 rk8:
 521 DQ 0xad93d23594c935a9
 522 rk9:
 523 DQ 0xd8dc208e2ba527b4
 524 rk10:
 525 DQ 0xf032cfec76bb2bc5
 526 rk11:
 527 DQ 0xb536044f357f4238
 528 rk12:
 529 DQ 0xfdbf104d938ba67a
 530 rk13:
 531 DQ 0xeeddad9297a843e7
 532 rk14:
 533 DQ 0x3550bce629466473
 534 rk15:
 535 DQ 0x4e501e58ca43d25e
 536 rk16:
 537 DQ 0x13c961588f27f643
 538 rk17:
 539 DQ 0x3b60d00dcb1099bc
 540 rk18:
 541 DQ 0x44bf1f468c53b9a3
 542 rk19:
 543 DQ 0x96f2236e317179ee
 544 rk20:
 545 DQ 0xf00839aa0dd64bac
 546
 547 mask1:
 548 dq 0x8080808080808080, 0x8080808080808080
 549 mask2:
 550 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
 551 mask3:
 552 dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
 553
 554 SHUF_MASK:
 555 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
 556
 557 pshufb_shf_table:
 558 ; use these values for shift constants for the pshufb instruction
 559 ; different alignments result in values as shown:
 560 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 561 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
 562 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
 563 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 564 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 565 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 566 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 567 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 568 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 569 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 570 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 571 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 572 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 573 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 574 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 575 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 576 dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
 577 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
 578 dq 0x8080808080808080, 0x8080808080808080
 579
 580 ;;;       func        core, ver, snum
 581 slversion crc64_jones_norm_by8, 01,   00,  0026