;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_iso_refl_by8(
;               uint64_t init_crc,        //initial CRC value, 64 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               uint64_t len              //buffer length in bytes (64-bit data)
;       );
;
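;       For reference, the value this routine computes is equivalent to the
;       following bitwise C sketch (a hedged illustration, not part of the
;       original source; it assumes the standard reflected CRC64-ISO
;       definition: polynomial x^64 + x^4 + x^3 + x + 1, whose bit-reflected
;       constant is 0xD800000000000000, with the seed and result complemented
;       as done by the not instructions below):
;
;       /* requires <stdint.h> */
;       uint64_t crc64_iso_refl_ref(uint64_t init_crc,
;                                   const unsigned char *buf, uint64_t len)
;       {
;               uint64_t rem = ~init_crc;
;               while (len--) {
;                       rem ^= *buf++;
;                       for (int i = 0; i < 8; i++)
;                               rem = (rem >> 1) ^
;                                     ((rem & 1) ? 0xD800000000000000ULL : 0);
;               }
;               return ~rem;
;       }
;
;       A typical first call passes init_crc = 0; subsequent calls chain the
;       previous return value.
;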
%include "reg_sizes.asm"

%define	fetch_dist	1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*10+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif


align 16
global crc64_iso_refl_by8:function
crc64_iso_refl_by8:
	; uint64_t c = crc ^ 0xffffffffffffffffL;
	not	arg1
	sub	rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save the callee-saved xmm registers on the stack, as required by
	; the Windows x64 calling convention
	movdqa	[rsp + XMM_SAVE + 16*0], xmm6
	movdqa	[rsp + XMM_SAVE + 16*1], xmm7
	movdqa	[rsp + XMM_SAVE + 16*2], xmm8
	movdqa	[rsp + XMM_SAVE + 16*3], xmm9
	movdqa	[rsp + XMM_SAVE + 16*4], xmm10
	movdqa	[rsp + XMM_SAVE + 16*5], xmm11
	movdqa	[rsp + XMM_SAVE + 16*6], xmm12
	movdqa	[rsp + XMM_SAVE + 16*7], xmm13
%endif

	; check if smaller than 256B
	cmp	arg3, 256

	; for sizes less than 256B, we can't fold 128B at a time...
	jl	_less_than_256


	; load the initial crc value
	movq	xmm10, arg1	; initial crc
	; load the initial 128B of data, then xor in the initial crc value
	movdqu	xmm0, [arg2+16*0]
	movdqu	xmm1, [arg2+16*1]
	movdqu	xmm2, [arg2+16*2]
	movdqu	xmm3, [arg2+16*3]
	movdqu	xmm4, [arg2+16*4]
	movdqu	xmm5, [arg2+16*5]
	movdqu	xmm6, [arg2+16*6]
	movdqu	xmm7, [arg2+16*7]

	; XOR the initial_crc value
	pxor	xmm0, xmm10
	movdqa	xmm10, [rk3]	; xmm10 has rk3 and rk4; the imm value of the
				; pclmulqdq instruction selects which constant is used
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 256 instead of 128 to save one instruction from the loop
	sub	arg3, 256

	; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left;
	; _fold_128_B_loop will fold 128B at a time until 128+y bytes remain


	; fold 128B at a time; this section of the code folds 8 xmm registers in parallel
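	; each 16B lane below computes new = (old.lo * rk4) xor (old.hi * rk3)
	; xor data, carried out as two pclmulqdq halves. a conceptual C
	; intrinsics sketch of one lane (a hedged illustration, not part of
	; the original source):
	;
	;       #include <immintrin.h>
	;       static inline __m128i fold_16(__m128i fold, __m128i rk3_rk4,
	;                                     __m128i data)
	;       {
	;               /* imm 0x10: fold.lo * rk4 (high qword of rk3_rk4) */
	;               __m128i lo = _mm_clmulepi64_si128(fold, rk3_rk4, 0x10);
	;               /* imm 0x01: fold.hi * rk3 (low qword of rk3_rk4) */
	;               __m128i hi = _mm_clmulepi64_si128(fold, rk3_rk4, 0x01);
	;               return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
	;       }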
_fold_128_B_loop:

	; update the buffer pointer
	add	arg2, 128

	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm9, [arg2+16*0]
	movdqu	xmm12, [arg2+16*1]
	movdqa	xmm8, xmm0
	movdqa	xmm13, xmm1
	pclmulqdq	xmm0, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm1, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm0, xmm9
	xorps	xmm0, xmm8
	pxor	xmm1, xmm12
	xorps	xmm1, xmm13

	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm9, [arg2+16*2]
	movdqu	xmm12, [arg2+16*3]
	movdqa	xmm8, xmm2
	movdqa	xmm13, xmm3
	pclmulqdq	xmm2, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm3, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm2, xmm9
	xorps	xmm2, xmm8
	pxor	xmm3, xmm12
	xorps	xmm3, xmm13

	prefetchnta [arg2+fetch_dist+64]
	movdqu	xmm9, [arg2+16*4]
	movdqu	xmm12, [arg2+16*5]
	movdqa	xmm8, xmm4
	movdqa	xmm13, xmm5
	pclmulqdq	xmm4, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm5, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm4, xmm9
	xorps	xmm4, xmm8
	pxor	xmm5, xmm12
	xorps	xmm5, xmm13

	prefetchnta [arg2+fetch_dist+96]
	movdqu	xmm9, [arg2+16*6]
	movdqu	xmm12, [arg2+16*7]
	movdqa	xmm8, xmm6
	movdqa	xmm13, xmm7
	pclmulqdq	xmm6, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm7, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm6, xmm9
	xorps	xmm6, xmm8
	pxor	xmm7, xmm12
	xorps	xmm7, xmm13

	sub	arg3, 128

	; check if there is another 128B in the buffer to be able to fold
	jge	_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	arg2, 128
	; at this point, the buffer pointer is pointing at the last y bytes
	; of the buffer, where 0 <= y < 128
	; the 128B of folded data is in 8 xmm registers: xmm0 through xmm7


	; fold the 8 xmm registers to 1 xmm register with different constants
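	; each constant pair bridges the distance between its source register
	; and xmm7: rk9/rk10 fold xmm0 across 112B, rk11/rk12 fold xmm1 across
	; 96B, and so on down to rk1/rk2, which fold xmm6 across the final 16B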
	; xmm0 to xmm7
	movdqa	xmm10, [rk9]
	movdqa	xmm8, xmm0
	pclmulqdq	xmm0, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm0
	; xmm1 to xmm7
	movdqa	xmm10, [rk11]
	movdqa	xmm8, xmm1
	pclmulqdq	xmm1, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm1
	; xmm2 to xmm7
	movdqa	xmm10, [rk13]
	movdqa	xmm8, xmm2
	pclmulqdq	xmm2, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2
	; xmm3 to xmm7
	movdqa	xmm10, [rk15]
	movdqa	xmm8, xmm3
	pclmulqdq	xmm3, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm3
	; xmm4 to xmm7
	movdqa	xmm10, [rk17]
	movdqa	xmm8, xmm4
	pclmulqdq	xmm4, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm4
	; xmm5 to xmm7
	movdqa	xmm10, [rk19]
	movdqa	xmm8, xmm5
	pclmulqdq	xmm5, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm5
	; xmm6 to xmm7
	movdqa	xmm10, [rk1]
	movdqa	xmm8, xmm6
	pclmulqdq	xmm6, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm6


	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the sign flag with the jl instruction
	add	arg3, 128-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7
	; and the rest is in memory; we can fold 16 bytes at a time if y >= 16
	; continue folding 16B at a time

_16B_reduction_loop:
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	movdqu	xmm0, [arg2]
	pxor	xmm7, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	; now we have 16+z bytes left to reduce, where 0 <= z < 16.
	; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
	add	arg3, 16
	je	_128_done
	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point to receive exactly 16 bytes.
	; after that, the registers need to be adjusted.
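	; the mask loaded from pshufb_shf_table below shifts the folded
	; remainder in xmm7 into position; the complementary mask (xor with
	; mask3 flips the top bit of each byte) selects the opposite bytes
	; for xmm2, and pblendvb merges in the reloaded tail from xmm1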
_get_last_two_xmms:


	movdqa	xmm2, xmm7
	movdqu	xmm1, [arg2 - 16 + arg3]

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea	rax, [pshufb_shf_table]
	add	rax, arg3
	movdqu	xmm0, [rax]


	pshufb	xmm7, xmm0
	pxor	xmm0, [mask3]
	pshufb	xmm2, xmm0

	pblendvb	xmm2, xmm1	; xmm0 is implicit
	;;;;;;;;;;
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x01

	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm10, [rk5]
	movdqa	xmm0, xmm7

	; 64b fold
	pclmulqdq	xmm7, xmm10, 0
	psrldq	xmm0, 8
	pxor	xmm7, xmm0

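	; the remaining 128-bit value is reduced to a 64-bit crc via Barrett
	; reduction, using rk7 = floor(2^128/Q) and rk8 = Q. a conceptual C
	; intrinsics sketch of the same steps, reusing the immintrin.h
	; intrinsics above (a hedged illustration, not part of this source):
	;
	;       static inline uint64_t barrett64(__m128i crc, __m128i rk7_rk8)
	;       {
	;               /* imm 0x00: crc.lo * rk7 (low qword of rk7_rk8)  */
	;               __m128i t1 = _mm_clmulepi64_si128(crc, rk7_rk8, 0x00);
	;               /* imm 0x10: t1.lo * rk8 (high qword of rk7_rk8)  */
	;               __m128i t2 = _mm_clmulepi64_si128(t1, rk7_rk8, 0x10);
	;               t2 = _mm_xor_si128(t2, _mm_slli_si128(t1, 8));
	;               t2 = _mm_xor_si128(t2, crc);
	;               return (uint64_t)_mm_extract_epi64(t2, 1); /* high qword */
	;       }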
; barrett reduction
_barrett:
	movdqa	xmm1, xmm7
	movdqa	xmm10, [rk7]

	pclmulqdq	xmm7, xmm10, 0
	movdqa	xmm2, xmm7
	pclmulqdq	xmm7, xmm10, 0x10
	pslldq	xmm2, 8
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pextrq	rax, xmm7, 1

_cleanup:
	; return c ^ 0xffffffffffffffffL;
	not	rax


%ifidn __OUTPUT_FORMAT__, win64
	movdqa	xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa	xmm7, [rsp + XMM_SAVE + 16*1]
	movdqa	xmm8, [rsp + XMM_SAVE + 16*2]
	movdqa	xmm9, [rsp + XMM_SAVE + 16*3]
	movdqa	xmm10, [rsp + XMM_SAVE + 16*4]
	movdqa	xmm11, [rsp + XMM_SAVE + 16*5]
	movdqa	xmm12, [rsp + XMM_SAVE + 16*6]
	movdqa	xmm13, [rsp + XMM_SAVE + 16*7]
%endif
	add	rsp, VARIABLE_OFFSET
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32

	; if there is, load the constants
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10

	movq	xmm0, arg1	; get the initial crc value
	movdqu	xmm7, [arg2]	; load the plaintext
	pxor	xmm7, xmm0

	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	sub	arg3, 32

	jmp	_16B_reduction_loop

align 16
_less_than_32:
	; move the initial crc to the return value. this is necessary for zero-length buffers.
	mov	rax, arg1
	test	arg3, arg3
	je	_cleanup

	movq	xmm0, arg1	; get the initial crc value

	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	movdqu	xmm7, [arg2]	; load the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms


align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes; zero out the 16B of memory first

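	; conceptually, the byte-granular copies below stage the tail in a
	; zeroed 16B stack slot, as in this hedged C illustration (not part
	; of the original source):
	;
	;       unsigned char tmp[16] = {0};
	;       memcpy(tmp, buf, len);  /* len < 16 */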
	pxor	xmm1, xmm1
	mov	r11, rsp
	movdqa	[r11], xmm1

	; back up the counter value
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg3, 1
	jl	_zero_left

	; load 1 byte
	mov	al, [arg2]
	mov	[r11], al

_zero_left:
	movdqa	xmm7, [rsp]
	pxor	xmm7, xmm0	; xor the initial crc value

	lea	rax, [pshufb_shf_table]

	cmp	r9, 8
	jl	_end_1to7

_end_8to15:
	movdqu	xmm0, [rax + r9]
	pshufb	xmm7, xmm0
	jmp	_128_done

_end_1to7:
	; left shift (8-length) bytes in XMM
	movdqu	xmm0, [rax + r9 + 8]
	pshufb	xmm7, xmm0

	jmp	_barrett

align 16
_exact_16_left:
	movdqu	xmm7, [arg2]
	pxor	xmm7, xmm0	; xor the initial crc value

	jmp	_128_done

section .data

; precomputed constants
align 16
; rk7 = floor(2^128/Q)
; rk8 = Q
rk1:
	DQ 0xf500000000000001
rk2:
	DQ 0x6b70000000000001
rk3:
	DQ 0xb001000000010000
rk4:
	DQ 0xf501b0000001b000
rk5:
	DQ 0xf500000000000001
rk6:
	DQ 0x0000000000000000
rk7:
	DQ 0xb000000000000001
rk8:
	DQ 0xb000000000000000
rk9:
	DQ 0xe014514514501501
rk10:
	DQ 0x771db6db6db71c71
rk11:
	DQ 0xa101101101110001
rk12:
	DQ 0x1ab1ab1ab1aab001
rk13:
	DQ 0xf445014445000001
rk14:
	DQ 0x6aab71daab700001
rk15:
	DQ 0xb100010100000001
rk16:
	DQ 0x01b001b1b0000001
rk17:
	DQ 0xe145150000000001
rk18:
	DQ 0x76db6c7000000001
rk19:
	DQ 0xa011000000000001
rk20:
	DQ 0x1b1ab00000000001
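
; each rk pair above is a precomputed remainder of a power of x modulo the
; (bit-reflected) polynomial Q, chosen to match the fold distance at the
; site where it is used; rk7/rk8 are the Barrett pair noted above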

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c	; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d	; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e	; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f	; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100	; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201	; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302	; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403	; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504	; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605	; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706	; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807	; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908


mask:
	dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
	dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
	dq 0x8080808080808080, 0x8080808080808080

;;;       func            core, ver, snum
slversion crc64_iso_refl_by8, 01, 00, 0023