;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_copy_by4(
;               UINT16 init_crc,          // initial CRC value, 16 bits
;               unsigned char *dst,       // buffer pointer destination for copy
;               const unsigned char *src, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
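;       As a usage illustration only (not part of this file's build): a minimal C
;       caller sketch based on the prototype above. The extern declaration and the
;       zero seed are assumptions typical for T10 DIF, not taken from a real header.
;
;           #include <stdint.h>
;
;           extern uint16_t crc16_t10dif_copy_by4(uint16_t init_crc, unsigned char *dst,
;                                                 const unsigned char *src, uint64_t len);
;
;           /* copy src into dst and return the T10 DIF CRC of the copied bytes */
;           uint16_t copy_with_crc(unsigned char *dst, const unsigned char *src, uint64_t len)
;           {
;                   return crc16_t10dif_copy_by4(0, dst, src, len);
;           }
;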

%include "reg_sizes.asm"

%define fetch_dist      1024

[bits 64]
default rel

section .text
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
        %xdefine        arg4 r9
        %xdefine        tmp1 r10
        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
        %xdefine        arg4 rcx
        %xdefine        tmp1 r10
        %xdefine        arg1_low32 edi
%endif

align 16
global crc16_t10dif_copy_by4:ISAL_SYM_TYPE_FUNCTION
crc16_t10dif_copy_by4:

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning, eax is shifted right by
        ; 16 bits to scale the result back to 16 bits.
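        ; (The 16-bit T10 DIF polynomial 0x8bb7 is used throughout in its 32-bit
        ; scaled form 0x8bb70000, so 32-bit folding constants and a 32-bit Barrett
        ; reduction can be reused unchanged; the low 16 bits of the scaled result
        ; are always zero, which is why "shr eax, 16" at _cleanup recovers the
        ; 16-bit CRC.)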

        sub     rsp, 16*4+8

        ; push the xmm registers onto the stack so they can be restored on return
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if smaller than 128B
        cmp     arg4, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to
        ; be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align
        ; with the initial crc in the correct place.
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; receive the initial 64B data, xor the initial crc value
        movdqu  xmm0, [arg3]
        movdqu  xmm1, [arg3+16]
        movdqu  xmm2, [arg3+32]
        movdqu  xmm3, [arg3+48]

        ; copy initial data
        movdqu  [arg2], xmm0
        movdqu  [arg2+16], xmm1
        movdqu  [arg2+32], xmm2
        movdqu  [arg2+48], xmm3

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg4, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes in the buffer.
        ; the _fold_64_B_loop below
        ; folds 64B at a time until only 64+y bytes of buffer remain


        ; fold 64B at a time. This section of the code folds 4 xmm
        ; registers in parallel
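        ; Each iteration below multiplies the four 128-bit accumulators by the
        ; rk3/rk4 folding constants (high and low 64-bit halves with separate
        ; carry-less multiplies), XORs the two products together, and then XORs
        ; in the next 64 bytes of input, so the accumulators stay congruent
        ; (modulo the polynomial) to the CRC state of all data consumed so far.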
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg3, 64                ; buf += 64;
        add     arg2, 64

        prefetchnta [arg3+fetch_dist+0]
        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm1, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg3+fetch_dist+32]
        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm3, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg3]
        movdqu  xmm5, [arg3+16]
        movdqu  [arg2], xmm4
        movdqu  [arg2+16], xmm5
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg3+32]
        movdqu  xmm5, [arg3+48]
        movdqu  [arg2+32], xmm4
        movdqu  [arg2+48], xmm5
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg4, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg3, 64
        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction will
                                ; determine which constant to use

        movdqa  xmm4, xmm0
        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq xmm1, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of adding back 64, we add 48 (64-16) to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg4, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce: 16 bytes
        ; are in register xmm3 and the rest are in memory.
        ; we can fold 16 bytes at a time if y >= 16, so
        ; continue folding 16B at a time

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg3]
        movdqu  [arg2], xmm0
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg3, 16
        add     arg2, 16
        sub     arg4, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg4, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg4, 16
        je      _128_done

        ; here we have less than 16 bytes of data left.
        ; since we know there was data before the pointer, we can offset
        ; the input pointer back before the actual point, so that exactly
        ; 16 bytes are loaded.
        ; after that the registers need to be adjusted.
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg3 - 16 + arg4]
        movdqu  [arg2 - 16 + arg4], xmm1
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg4
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg4 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg4 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb xmm1, xmm2             ; xmm0 is implicit
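        ; xmm1 now holds a single 16-byte block that combines the high part of
        ; the folded value with the last arg4 input bytes (xmm0's sign bits
        ; select between the two shifted copies in the pblendvb above); one more
        ; 16-byte fold below absorbs it into xmm3.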

        ; fold 16 Bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm6, [rk5]             ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; Barrett reduction
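        ; rk7 = floor(2^64/Q) is the precomputed Barrett reciprocal and rk8 = Q is
        ; the polynomial itself: the two carry-less multiplies below estimate the
        ; quotient of the remaining value by Q and XOR away quotient*Q, leaving
        ; the 32-bit remainder, which pextrd then extracts.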
_barrett:
        movdqa  xmm6, [rk7]             ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg4, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg3]            ; load the plaintext
        movdqu  [arg2], xmm3            ; store copy
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0


        ; update the buffer pointer
        add     arg3, 16
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg4, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; move the initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg4, arg4
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg4, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg3]            ; load the plaintext
        movdqu  [arg2], xmm3            ; store the copy
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg3, 16
        add     arg2, 16
        sub     arg4, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B of stack memory first.

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg4, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     tmp1, arg4
        cmp     arg4, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg3]
        mov     [arg2], rax
        mov     [r11], rax
        add     r11, 8
        sub     arg4, 8
        add     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg4, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg3]
        mov     [arg2], eax
        mov     [r11], eax
        add     r11, 4
        sub     arg4, 4
        add     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg4, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg3]
        mov     [arg2], ax
        mov     [r11], ax
        add     r11, 2
        sub     arg4, 2
        add     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg4, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        ; shl tmp1, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, tmp1
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg3]
        movdqu  [arg2], xmm3
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     arg4, 3
        jl      _only_less_than_3

        ; load 3 Bytes
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al

        mov     al, [arg3+1]
        mov     [arg2+1], al
        mov     [r11+1], al

        mov     al, [arg3+2]
        mov     [arg2+2], al
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg4, 2
        jl      _only_less_than_2

        ; load 2 Bytes
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al

        mov     al, [arg3+1]
        mov     [arg2+1], al
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 Byte
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
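;
; A hedged sketch (not part of the build) of how the "2^(32*k) mod Q << 32"
; constants above could be regenerated in C; the helper name is illustrative only:
;
;     #include <stdint.h>
;
;     /* x^n mod Q over GF(2), with Q = 0x18BB70000 (degree 32) */
;     static uint32_t xn_mod_q(unsigned n)
;     {
;             uint32_t r = 1;                         /* x^0 */
;             while (n--)                             /* multiply by x, then reduce */
;                     r = (r << 1) ^ ((r & 0x80000000u) ? 0x8BB70000u : 0);
;             return r;
;     }
;
;     /* e.g. rk1 = (uint64_t)xn_mod_q(32*3) << 32, rk3 = (uint64_t)xn_mod_q(32*15) << 32 */
;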
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x044c000000000000
rk4:
DQ 0xe658000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
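; SHUF_MASK reverses the byte order of a 16-byte lane (pshufb control byte i
; selects source byte 15-i), so the first buffer byte lands in the most
; significant byte of the register, as this MSB-first (non-reflected) CRC requires.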

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func            core, ver, snum
slversion crc16_t10dif_copy_by4, 05, 02, 0000