ceph/src/isa-l/crc/crc64_ecma_norm_by8.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;       Function API:
  31 ;       uint64_t crc64_ecma_norm_by8(
  32 ;               uint64_t init_crc, //initial CRC value, 64 bits
  33 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  34 ;               uint64_t len //buffer length in bytes (64-bit data)
  35 ;       );
  36 ;
  37 ;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
  38 %include "reg_sizes.asm"
  39
  40 %define fetch_dist      1024
  41
  42 [bits 64]
  43 default rel
  44
  45 section .text
  46
  47 %ifidn __OUTPUT_FORMAT__, win64
  48         %xdefine        arg1 rcx
  49         %xdefine        arg2 rdx
  50         %xdefine        arg3 r8
  51 %else
  52         %xdefine        arg1 rdi
  53         %xdefine        arg2 rsi
  54         %xdefine        arg3 rdx
  55 %endif
  56
  57 %define TMP 16*0
  58 %ifidn __OUTPUT_FORMAT__, win64
  59         %define XMM_SAVE 16*2
  60         %define VARIABLE_OFFSET 16*10+8
  61 %else
  62         %define VARIABLE_OFFSET 16*2+8
  63 %endif
  64 align 16
  65 global  crc64_ecma_norm_by8:function
  66 crc64_ecma_norm_by8:
  67
  68         not     arg1      ;~init_crc
  69
  70         sub     rsp,VARIABLE_OFFSET
  71
  72 %ifidn __OUTPUT_FORMAT__, win64
  73         ; push the xmm registers into the stack to maintain
  74         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
  75         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
  76         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
  77         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
  78         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
  79         movdqa  [rsp + XMM_SAVE + 16*5], xmm11
  80         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
  81         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
  82 %endif
  83
  84
  85         ; check if smaller than 256
  86         cmp     arg3, 256
  87
  88         ; for sizes less than 256, we can't fold 128B at a time...
  89         jl      _less_than_256
  90
  91
  92         ; load the initial crc value
  93         movq    xmm10, arg1     ; initial crc
  94
  95         ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
  96         ; because data will be byte-reflected and will align with initial crc at correct place.
  97         pslldq  xmm10, 8
  98
  99         movdqa xmm11, [SHUF_MASK]
 100         ; receive the initial 128B data, xor the initial crc value
 101         movdqu  xmm0, [arg2+16*0]
 102         movdqu  xmm1, [arg2+16*1]
 103         movdqu  xmm2, [arg2+16*2]
 104         movdqu  xmm3, [arg2+16*3]
 105         movdqu  xmm4, [arg2+16*4]
 106         movdqu  xmm5, [arg2+16*5]
 107         movdqu  xmm6, [arg2+16*6]
 108         movdqu  xmm7, [arg2+16*7]
 109
 110         pshufb  xmm0, xmm11
 111         ; XOR the initial_crc value
 112         pxor    xmm0, xmm10
 113         pshufb  xmm1, xmm11
 114         pshufb  xmm2, xmm11
 115         pshufb  xmm3, xmm11
 116         pshufb  xmm4, xmm11
 117         pshufb  xmm5, xmm11
 118         pshufb  xmm6, xmm11
 119         pshufb  xmm7, xmm11
 120
 121         movdqa  xmm10, [rk3]    ;xmm10 has rk3 and rk4
 122                                         ;imm value of pclmulqdq instruction will determine which constant to use
 123         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 124         ; we subtract 256 instead of 128 to save one instruction from the loop
 125         sub     arg3, 256
 126
 127         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 128         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 129
 130
 131         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 132 _fold_128_B_loop:
 133
 134         ; update the buffer pointer
 135         add     arg2, 128               ;    buf += 128;
 136
 137         prefetchnta [arg2+fetch_dist+0]
 138         movdqu  xmm9, [arg2+16*0]
 139         movdqu  xmm12, [arg2+16*1]
 140         pshufb  xmm9, xmm11
 141         pshufb  xmm12, xmm11
 142         movdqa  xmm8, xmm0
 143         movdqa  xmm13, xmm1
 144         pclmulqdq       xmm0, xmm10, 0x0
 145         pclmulqdq       xmm8, xmm10 , 0x11
 146         pclmulqdq       xmm1, xmm10, 0x0
 147         pclmulqdq       xmm13, xmm10 , 0x11
 148         pxor    xmm0, xmm9
 149         xorps   xmm0, xmm8
 150         pxor    xmm1, xmm12
 151         xorps   xmm1, xmm13
 152
 153         prefetchnta [arg2+fetch_dist+32]
 154         movdqu  xmm9, [arg2+16*2]
 155         movdqu  xmm12, [arg2+16*3]
 156         pshufb  xmm9, xmm11
 157         pshufb  xmm12, xmm11
 158         movdqa  xmm8, xmm2
 159         movdqa  xmm13, xmm3
 160         pclmulqdq       xmm2, xmm10, 0x0
 161         pclmulqdq       xmm8, xmm10 , 0x11
 162         pclmulqdq       xmm3, xmm10, 0x0
 163         pclmulqdq       xmm13, xmm10 , 0x11
 164         pxor    xmm2, xmm9
 165         xorps   xmm2, xmm8
 166         pxor    xmm3, xmm12
 167         xorps   xmm3, xmm13
 168
 169         prefetchnta [arg2+fetch_dist+64]
 170         movdqu  xmm9, [arg2+16*4]
 171         movdqu  xmm12, [arg2+16*5]
 172         pshufb  xmm9, xmm11
 173         pshufb  xmm12, xmm11
 174         movdqa  xmm8, xmm4
 175         movdqa  xmm13, xmm5
 176         pclmulqdq       xmm4, xmm10, 0x0
 177         pclmulqdq       xmm8, xmm10 , 0x11
 178         pclmulqdq       xmm5, xmm10, 0x0
 179         pclmulqdq       xmm13, xmm10 , 0x11
 180         pxor    xmm4, xmm9
 181         xorps   xmm4, xmm8
 182         pxor    xmm5, xmm12
 183         xorps   xmm5, xmm13
 184
 185         prefetchnta [arg2+fetch_dist+96]
 186         movdqu  xmm9, [arg2+16*6]
 187         movdqu  xmm12, [arg2+16*7]
 188         pshufb  xmm9, xmm11
 189         pshufb  xmm12, xmm11
 190         movdqa  xmm8, xmm6
 191         movdqa  xmm13, xmm7
 192         pclmulqdq       xmm6, xmm10, 0x0
 193         pclmulqdq       xmm8, xmm10 , 0x11
 194         pclmulqdq       xmm7, xmm10, 0x0
 195         pclmulqdq       xmm13, xmm10 , 0x11
 196         pxor    xmm6, xmm9
 197         xorps   xmm6, xmm8
 198         pxor    xmm7, xmm12
 199         xorps   xmm7, xmm13
 200
 201         sub     arg3, 128
 202
 203         ; check if there is another 128B in the buffer to be able to fold
 204         jge     _fold_128_B_loop
 205         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 206
 207         add     arg2, 128
 208         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 209         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 210
 211
 212         ; fold the 8 xmm registers to 1 xmm register with different constants
 213
 214         movdqa  xmm10, [rk9]
 215         movdqa  xmm8, xmm0
 216         pclmulqdq       xmm0, xmm10, 0x11
 217         pclmulqdq       xmm8, xmm10, 0x0
 218         pxor    xmm7, xmm8
 219         xorps   xmm7, xmm0
 220
 221         movdqa  xmm10, [rk11]
 222         movdqa  xmm8, xmm1
 223         pclmulqdq       xmm1, xmm10, 0x11
 224         pclmulqdq       xmm8, xmm10, 0x0
 225         pxor    xmm7, xmm8
 226         xorps   xmm7, xmm1
 227
 228         movdqa  xmm10, [rk13]
 229         movdqa  xmm8, xmm2
 230         pclmulqdq       xmm2, xmm10, 0x11
 231         pclmulqdq       xmm8, xmm10, 0x0
 232         pxor    xmm7, xmm8
 233         pxor    xmm7, xmm2
 234
 235         movdqa  xmm10, [rk15]
 236         movdqa  xmm8, xmm3
 237         pclmulqdq       xmm3, xmm10, 0x11
 238         pclmulqdq       xmm8, xmm10, 0x0
 239         pxor    xmm7, xmm8
 240         xorps   xmm7, xmm3
 241
 242         movdqa  xmm10, [rk17]
 243         movdqa  xmm8, xmm4
 244         pclmulqdq       xmm4, xmm10, 0x11
 245         pclmulqdq       xmm8, xmm10, 0x0
 246         pxor    xmm7, xmm8
 247         pxor    xmm7, xmm4
 248
 249         movdqa  xmm10, [rk19]
 250         movdqa  xmm8, xmm5
 251         pclmulqdq       xmm5, xmm10, 0x11
 252         pclmulqdq       xmm8, xmm10, 0x0
 253         pxor    xmm7, xmm8
 254         xorps   xmm7, xmm5
 255
 256         movdqa  xmm10, [rk1]    ;xmm10 has rk1 and rk2
 257
 258         movdqa  xmm8, xmm6
 259         pclmulqdq       xmm6, xmm10, 0x11
 260         pclmulqdq       xmm8, xmm10, 0x0
 261         pxor    xmm7, xmm8
 262         pxor    xmm7, xmm6
 263
 264
 265         ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
 266         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 267         add     arg3, 128-16
 268         jl      _final_reduction_for_128
 269
 270         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 271         ; we can fold 16 bytes at a time if y>=16
 272         ; continue folding 16B at a time
 273
 274 _16B_reduction_loop:
 275         movdqa  xmm8, xmm7
 276         pclmulqdq       xmm7, xmm10, 0x11
 277         pclmulqdq       xmm8, xmm10, 0x0
 278         pxor    xmm7, xmm8
 279         movdqu  xmm0, [arg2]
 280         pshufb  xmm0, xmm11
 281         pxor    xmm7, xmm0
 282         add     arg2, 16
 283         sub     arg3, 16
 284         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 285         ; equivalent of: cmp arg3, 16-16
 286         ; check if there is any more 16B in the buffer to be able to fold
 287         jge     _16B_reduction_loop
 288
 289         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 290         ;first, we reduce the data in the xmm7 register
 291
 292
 293 _final_reduction_for_128:
 294         ; check if any more data to fold. If not, compute the CRC of the final 128 bits
 295         add     arg3, 16
 296         je      _128_done
 297
 298         ; here we are getting data that is less than 16 bytes.
 299         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 300         ; after that the registers need to be adjusted.
 301 _get_last_two_xmms:
 302         movdqa  xmm2, xmm7
 303
 304         movdqu  xmm1, [arg2 - 16 + arg3]
 305         pshufb  xmm1, xmm11
 306
 307         ; get rid of the extra data that was loaded before
 308         ; load the shift constant
 309         lea     rax, [pshufb_shf_table + 16]
 310         sub     rax, arg3
 311         movdqu  xmm0, [rax]
 312
 313         ; shift xmm2 to the left by arg3 bytes
 314         pshufb  xmm2, xmm0
 315
 316         ; shift xmm7 to the right by 16-arg3 bytes
 317         pxor    xmm0, [mask1]
 318         pshufb  xmm7, xmm0
 319         pblendvb        xmm1, xmm2      ;xmm0 is implicit
 320
 321         ; fold 16 Bytes
 322         movdqa  xmm2, xmm1
 323         movdqa  xmm8, xmm7
 324         pclmulqdq       xmm7, xmm10, 0x11
 325         pclmulqdq       xmm8, xmm10, 0x0
 326         pxor    xmm7, xmm8
 327         pxor    xmm7, xmm2
 328
 329 _128_done:
 330         ; compute crc of a 128-bit value
 331         movdqa  xmm10, [rk5]    ; rk5 and rk6 in xmm10
 332         movdqa  xmm0, xmm7
 333
 334         ;64b fold
 335         pclmulqdq       xmm7, xmm10, 0x01       ; H*L
 336         pslldq  xmm0, 8
 337         pxor    xmm7, xmm0
 338
 339         ;barrett reduction
 340 _barrett:
 341         movdqa  xmm10, [rk7]    ; rk7 and rk8 in xmm10
 342         movdqa  xmm0, xmm7
 343
 344         movdqa  xmm1, xmm7
 345         pand    xmm1, [mask3]
 346         pclmulqdq       xmm7, xmm10, 0x01
 347         pxor    xmm7, xmm1
 348
 349         pclmulqdq       xmm7, xmm10, 0x11
 350         pxor    xmm7, xmm0
 351         pextrq  rax, xmm7, 0
 352
 353 _cleanup:
 354         not     rax
 355 %ifidn __OUTPUT_FORMAT__, win64
 356         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
 357         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 358         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 359         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 360         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 361         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 362         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 363         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 364 %endif
 365         add     rsp, VARIABLE_OFFSET
 366         ret
 367
 368 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 369 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 370 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 371 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 372
 373 align 16
 374 _less_than_256:
 375
 376         ; check if there is enough buffer to be able to fold 16B at a time
 377         cmp     arg3, 32
 378         jl      _less_than_32
 379         movdqa xmm11, [SHUF_MASK]
 380
 381         ; if there is, load the constants
 382         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 383
 384         movq    xmm0, arg1      ; get the initial crc value
 385         pslldq  xmm0, 8 ; align it to its correct place
 386         movdqu  xmm7, [arg2]    ; load the plaintext
 387         pshufb  xmm7, xmm11     ; byte-reflect the plaintext
 388         pxor    xmm7, xmm0
 389
 390
 391         ; update the buffer pointer
 392         add     arg2, 16
 393
 394         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 395         sub     arg3, 32
 396
 397         jmp     _16B_reduction_loop
 398 align 16
 399 _less_than_32:
 400         ; mov initial crc to the return value. this is necessary for zero-length buffers.
 401         mov     rax, arg1
 402         test    arg3, arg3
 403         je      _cleanup
 404
 405         movdqa xmm11, [SHUF_MASK]
 406
 407         movq    xmm0, arg1      ; get the initial crc value
 408         pslldq  xmm0, 8 ; align it to its correct place
 409
 410         cmp     arg3, 16
 411         je      _exact_16_left
 412         jl      _less_than_16_left
 413
 414         movdqu  xmm7, [arg2]    ; load the plaintext
 415         pshufb  xmm7, xmm11     ; byte-reflect the plaintext
 416         pxor    xmm7, xmm0      ; xor the initial crc value
 417         add     arg2, 16
 418         sub     arg3, 16
 419         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 420         jmp     _get_last_two_xmms
 421 align 16
 422 _less_than_16_left:
 423         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 424         pxor    xmm1, xmm1
 425         mov     r11, rsp
 426         movdqa  [r11], xmm1
 427
 428         ;       backup the counter value
 429         mov     r9, arg3
 430         cmp     arg3, 8
 431         jl      _less_than_8_left
 432
 433         ; load 8 Bytes
 434         mov     rax, [arg2]
 435         mov     [r11], rax
 436         add     r11, 8
 437         sub     arg3, 8
 438         add     arg2, 8
 439 _less_than_8_left:
 440
 441         cmp     arg3, 4
 442         jl      _less_than_4_left
 443
 444         ; load 4 Bytes
 445         mov     eax, [arg2]
 446         mov     [r11], eax
 447         add     r11, 4
 448         sub     arg3, 4
 449         add     arg2, 4
 450 _less_than_4_left:
 451
 452         cmp     arg3, 2
 453         jl      _less_than_2_left
 454
 455         ; load 2 Bytes
 456         mov     ax, [arg2]
 457         mov     [r11], ax
 458         add     r11, 2
 459         sub     arg3, 2
 460         add     arg2, 2
 461 _less_than_2_left:
 462         cmp     arg3, 1
 463         jl      _zero_left
 464
 465         ; load 1 Byte
 466         mov     al, [arg2]
 467         mov     [r11], al
 468 _zero_left:
 469         movdqa  xmm7, [rsp]
 470         pshufb  xmm7, xmm11
 471         pxor    xmm7, xmm0      ; xor the initial crc value
 472
 473         ; shl r9, 4
 474         lea     rax, [pshufb_shf_table + 16]
 475         sub     rax, r9
 476
 477         cmp     r9, 8
 478         jl      _end_1to7
 479
 480 _end_8to15:
 481         movdqu  xmm0, [rax]
 482         pxor    xmm0, [mask1]
 483
 484         pshufb  xmm7, xmm0
 485         jmp     _128_done
 486
 487 _end_1to7:
 488         ; Right shift (8-length) bytes in XMM
 489         add     rax, 8
 490         movdqu  xmm0, [rax]
 491         pshufb  xmm7,xmm0
 492
 493         jmp     _barrett
 494 align 16
 495 _exact_16_left:
 496         movdqu  xmm7, [arg2]
 497         pshufb  xmm7, xmm11
 498         pxor    xmm7, xmm0      ; xor the initial crc value
 499
 500         jmp     _128_done
 501
 502 section .data
 503
 504 ; precomputed constants
 505 align 16
 506
 507 rk1 :
 508 DQ 0x5f5c3c7eb52fab6
 509 rk2 :
 510 DQ 0x4eb938a7d257740e
 511 rk3 :
 512 DQ 0x5cf79dea9ac37d6
 513 rk4 :
 514 DQ 0x001067e571d7d5c2
 515 rk5 :
 516 DQ 0x5f5c3c7eb52fab6
 517 rk6 :
 518 DQ 0x0000000000000000
 519 rk7 :
 520 DQ 0x578d29d06cc4f872
 521 rk8 :
 522 DQ 0x42f0e1eba9ea3693
 523 rk9 :
 524 DQ 0xe464f4df5fb60ac1
 525 rk10 :
 526 DQ 0xb649c5b35a759cf2
 527 rk11 :
 528 DQ 0x9af04e1eff82d0dd
 529 rk12 :
 530 DQ 0x6e82e609297f8fe8
 531 rk13 :
 532 DQ 0x97c516e98bd2e73
 533 rk14 :
 534 DQ 0xb76477b31e22e7b
 535 rk15 :
 536 DQ 0x5f6843ca540df020
 537 rk16 :
 538 DQ 0xddf4b6981205b83f
 539 rk17 :
 540 DQ 0x54819d8713758b2c
 541 rk18 :
 542 DQ 0x4a6b90073eb0af5a
 543 rk19 :
 544 DQ 0x571bee0a227ef92b
 545 rk20 :
 546 DQ 0x44bef2a201b5200c
 547
 548
 549 mask1:
 550 dq 0x8080808080808080, 0x8080808080808080
 551 mask2:
 552 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
 553 mask3:
 554 dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
 555
 556 SHUF_MASK:
 557 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
 558
 559 pshufb_shf_table:
 560 ; use these values for shift constants for the pshufb instruction
 561 ; different alignments result in values as shown:
 562 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 563 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
 564 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
 565 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 566 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 567 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 568 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 569 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 570 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 571 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 572 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 573 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 574 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 575 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 576 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 577 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 578 dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
 579 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
 580 dq 0x8080808080808080, 0x8080808080808080
 581
 582 ;;;       func        core, ver, snum
 583 slversion crc64_ecma_norm_by8, 01,   00,  001a