ceph/src/isa-l/igzip/crc32_gzip.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  31 ;       Function API:
  32 ;       UINT32 crc32_gzip(
  33 ;               UINT32 init_crc, //initial CRC value, 32 bits
  34 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  35 ;               UINT64 len //buffer length in bytes (64-bit data)
  36 ;       );
  37 ;
  38 ;       Authors:
  39 ;               Erdinc Ozturk
  40 ;               Vinodh Gopal
  41 ;               James Guilford
  42 ;
  43 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  44 ;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
  45 ;
  46 ;
  47 ;       sample yasm command line:
  48 ;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip
  49 ;
  50 ;       As explained here:
  51 ;       http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
  52 ;       CRC-32 checksum is described in RFC 1952
  53 ;       Implementing RFC 1952 CRC:
  54 ;       http://www.ietf.org/rfc/rfc1952.txt
  55
  56 %include "reg_sizes.asm"
  57         ; assemble as 64-bit code; "default rel" makes bare [label] memory operands RIP-relative
  58 [bits 64]
  59 default rel
  60
  61 section .text
  62
  63         ; argument registers: win64 output uses rcx/rdx/r8, every other output format uses rdi/rsi/rdx
  64 %ifidn __OUTPUT_FORMAT__, win64
  65         %xdefine        arg1 rcx
  66         %xdefine        arg2 rdx
  67         %xdefine        arg3 r8
  68         ; 32-bit view of arg1, used for the 32-bit crc argument
  69         %xdefine        arg1_low32 ecx
  70 %else
  71         %xdefine        arg1 rdi
  72         %xdefine        arg2 rsi
  73         %xdefine        arg3 rdx
  74         ; 32-bit view of arg1, used for the 32-bit crc argument
  75         %xdefine        arg1_low32 edi
  76 %endif
  77         ; frame size (VARIABLE_OFFSET) and offsets from rsp; win64 adds a 16*8-byte save area at XMM_SAVE for xmm6-xmm13
  78 %define TMP 16*0
  79 %ifidn __OUTPUT_FORMAT__, win64
  80         %define XMM_SAVE 16*2
  81         %define VARIABLE_OFFSET 16*10+8
  82 %else
  83         %define VARIABLE_OFFSET 16*2+8
  84 %endif
  85         ; the +8 leaves rsp 16-byte aligned after "sub rsp, VARIABLE_OFFSET" (rsp is 8 mod 16 at entry), as the movdqa accesses to [rsp] require; TMP is not referenced by name in the code shown
  86 align 16
  87 global  crc32_gzip_01
  88 crc32_gzip_01:
  89         ; in: arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes; out: eax = crc (see "Function API" above)
  90         ; unsigned long c = crc ^ 0xffffffffL;
  91         not     arg1_low32      ; invert the incoming crc; the result is inverted again at _cleanup
  92
  93         ; reserve the stack frame; its first 16 bytes ([rsp]) are used as scratch by the <16-byte paths
  94         sub     rsp, VARIABLE_OFFSET
  95 %ifidn __OUTPUT_FORMAT__, win64
  96         ; save xmm6-xmm13 on the stack; they are callee-saved in the Microsoft x64 ABI and are restored at _cleanup
  97         movdqa  [rsp + XMM_SAVE + 16*0], xmm6
  98         movdqa  [rsp + XMM_SAVE + 16*1], xmm7
  99         movdqa  [rsp + XMM_SAVE + 16*2], xmm8
 100         movdqa  [rsp + XMM_SAVE + 16*3], xmm9
 101         movdqa  [rsp + XMM_SAVE + 16*4], xmm10
 102         movdqa  [rsp + XMM_SAVE + 16*5], xmm11
 103         movdqa  [rsp + XMM_SAVE + 16*6], xmm12
 104         movdqa  [rsp + XMM_SAVE + 16*7], xmm13
 105 %endif
 106
 107         ; check if smaller than 256B
 108         cmp     arg3, 256
 109         ; NOTE(review): jl is a signed compare, so a len >= 2^63 would be seen as negative and take the short-buffer path
 110         ; for sizes less than 256, we can't fold 128B at a time...
 111         jl      _less_than_256
 112
 113
 114         ; load the initial crc value
 115         movd    xmm10, arg1_low32      ; initial crc in the low dword, upper 96 bits zeroed
 116
 117         ; receive the initial 128B of data (8 x 16B); the initial crc value is xored in below
 118         movdqu  xmm0, [arg2+16*0]
 119         movdqu  xmm1, [arg2+16*1]
 120         movdqu  xmm2, [arg2+16*2]
 121         movdqu  xmm3, [arg2+16*3]
 122         movdqu  xmm4, [arg2+16*4]
 123         movdqu  xmm5, [arg2+16*5]
 124         movdqu  xmm6, [arg2+16*6]
 125         movdqu  xmm7, [arg2+16*7]
 126
 127         ; XOR the initial_crc value into the low 32 bits of the first 16B block
 128         pxor    xmm0, xmm10
 129         movdqa  xmm10, [rk3]    ;xmm10 has rk3 (low qword) and rk4 (high qword)
 130                                         ;imm value of pclmulqdq instruction will determine which constant to use
 131         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 132         ; we subtract 256 instead of 128 to save one instruction from the loop
 133         sub     arg3, 256
 134
 135         ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
 136         ; loop will fold 128B at a time until we have 128+y Bytes of buffer
 137
 138
 139         ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 140 _fold_128_B_loop:
 141         ; per accumulator: acc = clmul(acc.lo, rk4) ^ clmul(acc.hi, rk3) ^ next 16B of input
 142         ; update the buffer pointer
 143         add     arg2, 128
 144         ; xmm9/xmm12 hold the next two input blocks, xmm8/xmm13 copies of the two accumulators being folded
 145         movdqu  xmm9, [arg2+16*0]
 146         movdqu  xmm12, [arg2+16*1]
 147         movdqa  xmm8, xmm0
 148         movdqa  xmm13, xmm1
 149         pclmulqdq       xmm0, xmm10, 0x10      ; imm 0x10: low qword of xmm0 * high qword of xmm10 (rk4)
 150         pclmulqdq       xmm8, xmm10 , 0x1      ; imm 0x1: high qword of xmm8 * low qword of xmm10 (rk3)
 151         pclmulqdq       xmm1, xmm10, 0x10
 152         pclmulqdq       xmm13, xmm10 , 0x1
 153         pxor    xmm0, xmm9
 154         xorps   xmm0, xmm8      ; xorps performs the same bitwise xor as pxor (mix presumably chosen for scheduling - TODO confirm)
 155         pxor    xmm1, xmm12
 156         xorps   xmm1, xmm13
 157         ; same fold for accumulators xmm2/xmm3 with input blocks 2 and 3
 158         movdqu  xmm9, [arg2+16*2]
 159         movdqu  xmm12, [arg2+16*3]
 160         movdqa  xmm8, xmm2
 161         movdqa  xmm13, xmm3
 162         pclmulqdq       xmm2, xmm10, 0x10
 163         pclmulqdq       xmm8, xmm10 , 0x1
 164         pclmulqdq       xmm3, xmm10, 0x10
 165         pclmulqdq       xmm13, xmm10 , 0x1
 166         pxor    xmm2, xmm9
 167         xorps   xmm2, xmm8
 168         pxor    xmm3, xmm12
 169         xorps   xmm3, xmm13
 170         ; same fold for accumulators xmm4/xmm5 with input blocks 4 and 5
 171         movdqu  xmm9, [arg2+16*4]
 172         movdqu  xmm12, [arg2+16*5]
 173         movdqa  xmm8, xmm4
 174         movdqa  xmm13, xmm5
 175         pclmulqdq       xmm4, xmm10, 0x10
 176         pclmulqdq       xmm8, xmm10 , 0x1
 177         pclmulqdq       xmm5, xmm10, 0x10
 178         pclmulqdq       xmm13, xmm10 , 0x1
 179         pxor    xmm4, xmm9
 180         xorps   xmm4, xmm8
 181         pxor    xmm5, xmm12
 182         xorps   xmm5, xmm13
 183         ; same fold for accumulators xmm6/xmm7 with input blocks 6 and 7
 184         movdqu  xmm9, [arg2+16*6]
 185         movdqu  xmm12, [arg2+16*7]
 186         movdqa  xmm8, xmm6
 187         movdqa  xmm13, xmm7
 188         pclmulqdq       xmm6, xmm10, 0x10
 189         pclmulqdq       xmm8, xmm10 , 0x1
 190         pclmulqdq       xmm7, xmm10, 0x10
 191         pclmulqdq       xmm13, xmm10 , 0x1
 192         pxor    xmm6, xmm9
 193         xorps   xmm6, xmm8
 194         pxor    xmm7, xmm12
 195         xorps   xmm7, xmm13
 196
 197         sub     arg3, 128       ; the flags set here are consumed by the jge below (SSE instructions do not touch them)
 198
 199         ; check if there is another 128B in the buffer to be able to fold
 200         jge     _fold_128_B_loop
 201         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 202
 203
 204         add     arg2, 128
 205         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 206         ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
 207         ; each of xmm0..xmm6 is multiplied by its own constant pair and the products are xored into xmm7
 208
 209         ; fold the 8 xmm registers to 1 xmm register with different constants
 210
 211         movdqa  xmm10, [rk9]            ; xmm10 = rk9 (low qword) : rk10 (high qword)
 212         movdqa  xmm8, xmm0
 213         pclmulqdq       xmm0, xmm10, 0x1        ; high qword of xmm0 * rk9
 214         pclmulqdq       xmm8, xmm10, 0x10       ; low qword of xmm0 * rk10
 215         pxor    xmm7, xmm8
 216         xorps   xmm7, xmm0
 217
 218         movdqa  xmm10, [rk11]           ; xmm10 = rk11 : rk12, applied to xmm1
 219         movdqa  xmm8, xmm1
 220         pclmulqdq       xmm1, xmm10, 0x1
 221         pclmulqdq       xmm8, xmm10, 0x10
 222         pxor    xmm7, xmm8
 223         xorps   xmm7, xmm1
 224
 225         movdqa  xmm10, [rk13]           ; xmm10 = rk13 : rk14, applied to xmm2
 226         movdqa  xmm8, xmm2
 227         pclmulqdq       xmm2, xmm10, 0x1
 228         pclmulqdq       xmm8, xmm10, 0x10
 229         pxor    xmm7, xmm8
 230         pxor    xmm7, xmm2
 231
 232         movdqa  xmm10, [rk15]           ; xmm10 = rk15 : rk16, applied to xmm3
 233         movdqa  xmm8, xmm3
 234         pclmulqdq       xmm3, xmm10, 0x1
 235         pclmulqdq       xmm8, xmm10, 0x10
 236         pxor    xmm7, xmm8
 237         xorps   xmm7, xmm3
 238
 239         movdqa  xmm10, [rk17]           ; xmm10 = rk17 : rk18, applied to xmm4
 240         movdqa  xmm8, xmm4
 241         pclmulqdq       xmm4, xmm10, 0x1
 242         pclmulqdq       xmm8, xmm10, 0x10
 243         pxor    xmm7, xmm8
 244         pxor    xmm7, xmm4
 245
 246         movdqa  xmm10, [rk19]           ; xmm10 = rk19 : rk20, applied to xmm5
 247         movdqa  xmm8, xmm5
 248         pclmulqdq       xmm5, xmm10, 0x1
 249         pclmulqdq       xmm8, xmm10, 0x10
 250         pxor    xmm7, xmm8
 251         xorps   xmm7, xmm5
 252
 253         movdqa  xmm10, [rk1]            ; xmm10 = rk1 : rk2, applied to xmm6 and left in xmm10 for the 16B loop below
 254         movdqa  xmm8, xmm6
 255         pclmulqdq       xmm6, xmm10, 0x1
 256         pclmulqdq       xmm8, xmm10, 0x10
 257         pxor    xmm7, xmm8
 258         pxor    xmm7, xmm6
 259
 260
 261         ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
 262         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 263         add     arg3, 128-16            ; arg3 = y - 16
 264         jl      _final_reduction_for_128        ; fewer than 16 bytes remain in memory
 265
 266         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 267         ; we can fold 16 bytes at a time if y>=16
 268         ; continue folding 16B at a time
 269         ; also entered from _less_than_256 with xmm10 = rk1:rk2, xmm7 = first 16B ^ initial crc, arg3 = len - 32
 270 _16B_reduction_loop:
 271         movdqa  xmm8, xmm7
 272         pclmulqdq       xmm7, xmm10, 0x1        ; high qword of xmm7 * rk1
 273         pclmulqdq       xmm8, xmm10, 0x10       ; low qword of xmm7 * rk2
 274         pxor    xmm7, xmm8
 275         movdqu  xmm0, [arg2]            ; next 16 bytes of input
 276         pxor    xmm7, xmm0
 277         add     arg2, 16
 278         sub     arg3, 16
 279         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 280         ; equivalent of: cmp arg3, 16-16
 281         ; check if there is any more 16B in the buffer to be able to fold
 282         jge     _16B_reduction_loop
 283
 284         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 285         ;first, we reduce the data in the xmm7 register
 286
 287
 288 _final_reduction_for_128:
 289         add arg3, 16            ; arg3 = z, the number of tail bytes still in memory (0..15)
 290         je _128_done            ; z == 0: nothing left to merge
 291
 292 ; here we are getting data that is less than 16 bytes.
 293         ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
 294         ; after that the registers need to be adjusted.
 295 _get_last_two_xmms:
 296         ; also entered from _less_than_32 with xmm7 = first 16B ^ initial crc, xmm10 = rk1:rk2, arg3 = len - 16
 297
 298         movdqa xmm2, xmm7
 299         movdqu xmm1, [arg2 - 16 + arg3]         ; xmm1 = the last 16 bytes of the buffer (overlaps bytes already consumed)
 300
 301         ; get rid of the extra data that was loaded before
 302         ; load the shift constant
 303         lea     rax, [pshufb_shf_table]
 304         add     rax, arg3
 305         movdqu  xmm0, [rax]             ; xmm0 = pshufb mask taken at table offset z
 306
 307
 308         pshufb  xmm7, xmm0              ; xmm7 = accumulator shifted left by (16 - z) bytes, low bytes zeroed
 309         pxor    xmm0, [mask3]           ; flip bit 7 of every selector byte: the mask now shifts right by z bytes
 310         pshufb  xmm2, xmm0              ; xmm2 = accumulator copy shifted right by z bytes, top z bytes zeroed
 311
 312         pblendvb        xmm2, xmm1     ;xmm0 is implicit: bytes whose mask high bit is set (the top z) are taken from xmm1
 313         ;;;;;;;;;;
 314         movdqa  xmm8, xmm7
 315         pclmulqdq       xmm7, xmm10, 0x1
 316
 317         pclmulqdq       xmm8, xmm10, 0x10
 318         pxor    xmm7, xmm8
 319         pxor    xmm7, xmm2              ; falls through to _128_done with the final 128-bit value in xmm7
 320
 321 _128_done:
 322         ; compute crc of a 128-bit value
 323         movdqa  xmm10, [rk5]            ; xmm10 = rk5 (low qword) : rk6 (high qword)
 324         movdqa  xmm0, xmm7
 325
 326         ;64b fold
 327         pclmulqdq       xmm7, xmm10, 0          ; low qword of xmm7 * rk5
 328         psrldq  xmm0, 8                 ; bring the old high qword down to the low qword
 329         pxor    xmm7, xmm0
 330
 331         ;32b fold
 332         movdqa  xmm0, xmm7
 333         pslldq  xmm7, 4
 334         pclmulqdq       xmm7, xmm10, 0x10       ; low qword of the shifted value * rk6
 335
 336         pxor    xmm7, xmm0
 337
 338         ; _barrett is also entered directly by the 1..3-byte paths (_only_less_than_4 and below)
 339         ;barrett reduction
 340 _barrett:
 341         pand    xmm7, [mask2]           ; clear the low 32 bits
 342         movdqa  xmm1, xmm7
 343         movdqa  xmm2, xmm7
 344         movdqa  xmm10, [rk7]            ; xmm10 = rk7 (low qword) : rk8 (high qword)
 345
 346         pclmulqdq       xmm7, xmm10, 0          ; low qword of xmm7 * rk7
 347         pxor    xmm7, xmm2
 348         pand    xmm7, [mask]            ; keep only the low 64 bits
 349         movdqa  xmm2, xmm7
 350         pclmulqdq       xmm7, xmm10, 0x10       ; low qword of xmm7 * rk8
 351         pxor    xmm7, xmm2
 352         pxor    xmm7, xmm1
 353         pextrd  eax, xmm7, 2            ; result = dword 2 (bits 64..95) of xmm7
 354
 355 _cleanup:
 356         ; return c ^ 0xffffffffL;
 357         not     eax             ; for len == 0 eax holds the inverted initial crc, so the caller's init_crc is returned unchanged
 358
 359         ; restore the xmm registers saved in the prologue (win64 only), then release the frame
 360 %ifidn __OUTPUT_FORMAT__, win64
 361         movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
 362         movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
 363         movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
 364         movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
 365         movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
 366         movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
 367         movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
 368         movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 369 %endif
 370         add     rsp, VARIABLE_OFFSET
 371         ret
 372
 373
 374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 375 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 376 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 377 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 378
 379 align 16
 380 _less_than_256:
 381         ; reached from the entry when len < 256; the crc in arg1_low32 is already inverted
 382         ; check if there is enough buffer to be able to fold 16B at a time
 383         cmp     arg3, 32
 384         jl      _less_than_32
 385
 386         ; if there is, load the constants
 387         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 388
 389         movd    xmm0, arg1_low32       ; get the initial crc value
 390         movdqu  xmm7, [arg2]            ; load the plaintext
 391         pxor    xmm7, xmm0              ; xor the initial crc value into the low 32 bits
 392
 393         ; update the buffer pointer
 394         add     arg2, 16
 395
 396         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 397         sub     arg3, 32
 398
 399         jmp     _16B_reduction_loop
 400
 401
 402 align 16
 403 _less_than_32:
 404         ; mov initial crc to the return value. this is necessary for zero-length buffers.
 405         mov     eax, arg1_low32
 406         test    arg3, arg3
 407         je      _cleanup                ; len == 0: _cleanup inverts eax back to the caller's init_crc
 408
 409         movd    xmm0, arg1_low32        ; get the initial crc value
 410
 411         cmp     arg3, 16
 412         je      _exact_16_left
 413         jl      _less_than_16_left      ; reuses the flags of the cmp above (je does not modify them)
 414         ; 17..31 bytes: take the first 16 bytes here, the remaining 1..15 are merged at _get_last_two_xmms
 415         movdqu  xmm7, [arg2]            ; load the plaintext
 416         pxor    xmm7, xmm0              ; xor the initial crc value
 417         add     arg2, 16
 418         sub     arg3, 16
 419         movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
 420         jmp     _get_last_two_xmms
 421
 422
 423 align 16
 424 _less_than_16_left:
 425         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 426         ; r11 = write cursor into the zeroed 16B scratch at [rsp]; the input is copied in 8/4/2/1-byte pieces
 427         pxor    xmm1, xmm1
 428         mov     r11, rsp
 429         movdqa  [r11], xmm1
 430
 431         cmp     arg3, 4
 432         jl      _only_less_than_4       ; 1..3 bytes take a separate path that skips the 64b/32b folds
 433
 434         ;       backup the counter value
 435         mov     r9, arg3                ; r9 = original length, used at _zero_left to index pshufb_shf_table
 436         cmp     arg3, 8
 437         jl      _less_than_8_left
 438
 439         ; load 8 Bytes
 440         mov     rax, [arg2]
 441         mov     [r11], rax
 442         add     r11, 8
 443         sub     arg3, 8
 444         add     arg2, 8
 445 _less_than_8_left:
 446
 447         cmp     arg3, 4
 448         jl      _less_than_4_left
 449
 450         ; load 4 Bytes
 451         mov     eax, [arg2]
 452         mov     [r11], eax
 453         add     r11, 4
 454         sub     arg3, 4
 455         add     arg2, 4
 456 _less_than_4_left:
 457
 458         cmp     arg3, 2
 459         jl      _less_than_2_left
 460
 461         ; load 2 Bytes
 462         mov     ax, [arg2]
 463         mov     [r11], ax
 464         add     r11, 2
 465         sub     arg3, 2
 466         add     arg2, 2
 467 _less_than_2_left:
 468         cmp     arg3, 1
 469         jl      _zero_left
 470
 471         ; load 1 Byte
 472         mov     al, [arg2]
 473         mov     [r11], al
 474
 475 _zero_left:
 476         movdqa  xmm7, [rsp]             ; xmm7 = the copied bytes, zero-padded to 16 bytes
 477         pxor    xmm7, xmm0      ; xor the initial crc value
 478
 479         lea rax,[pshufb_shf_table]
 480         movdqu  xmm0, [rax + r9]        ; pshufb mask taken at table offset r9 (the original length)
 481         pshufb  xmm7,xmm0               ; shift the data left by (16 - r9) bytes, zero-filling the low bytes
 482
 483
 484
 485         jmp     _128_done
 486
 487 align 16
 488 _exact_16_left:
 489         movdqu  xmm7, [arg2]    ; the whole 16-byte buffer; xmm0 already holds the initial crc (set in _less_than_32)
 490         pxor    xmm7, xmm0      ; xor the initial crc value
 491
 492         jmp     _128_done
 493
 494 _only_less_than_4:
 495         cmp     arg3, 3         ; on entry r11 = rsp and the 16 bytes at [rsp] are zero (set up in _less_than_16_left)
 496         jl      _only_less_than_3
 497
 498         ; load 3 Bytes
 499         mov     al, [arg2]
 500         mov     [r11], al
 501
 502         mov     al, [arg2+1]
 503         mov     [r11+1], al
 504
 505         mov     al, [arg2+2]
 506         mov     [r11+2], al
 507
 508         movdqa  xmm7, [rsp]
 509         pxor    xmm7, xmm0      ; xor the initial crc value
 510
 511         pslldq  xmm7, 5         ; shift left by 5 bytes: the 3 data bytes land in bytes 5..7
 512
 513         jmp     _barrett        ; goes straight to the Barrett step, skipping the 64b/32b folds
 514 _only_less_than_3:
 515         cmp     arg3, 2
 516         jl      _only_less_than_2
 517
 518         ; load 2 Bytes
 519         mov     al, [arg2]
 520         mov     [r11], al
 521
 522         mov     al, [arg2+1]
 523         mov     [r11+1], al
 524
 525         movdqa  xmm7, [rsp]
 526         pxor    xmm7, xmm0      ; xor the initial crc value
 527
 528         pslldq  xmm7, 6         ; shift left by 6 bytes: the 2 data bytes land in bytes 6..7
 529
 530         jmp     _barrett
 531 _only_less_than_2:
 532
 533         ; load 1 Byte
 534         mov     al, [arg2]
 535         mov     [r11], al
 536
 537         movdqa  xmm7, [rsp]
 538         pxor    xmm7, xmm0      ; xor the initial crc value
 539
 540         pslldq  xmm7, 7         ; shift left by 7 bytes: the data byte lands in byte 7
 541
 542         jmp     _barrett
 543
 544 section .data
 545         ; NOTE(review): nothing in the code shown writes to these tables; a read-only section may be more appropriate - confirm for every output format
 546 ; precomputed constants
 547 align 16
 548 rk1 :           ; rk1:rk2 - loaded as one 128-bit pair for the 16B folds and the last step of the 8-to-1 fold
 549 DQ 0x00000000ccaa009e
 550 rk2 :
 551 DQ 0x00000001751997d0
 552 rk3 :           ; rk3:rk4 - pair used by _fold_128_B_loop
 553 DQ 0x000000014a7fe880
 554 rk4 :
 555 DQ 0x00000001e88ef372
 556 rk5 :           ; rk5:rk6 - pair used by the 64b and 32b folds at _128_done
 557 DQ 0x00000000ccaa009e
 558 rk6 :
 559 DQ 0x0000000163cd6124
 560 rk7 :           ; rk7:rk8 - pair used by the Barrett reduction at _barrett
 561 DQ 0x00000001f7011640
 562 rk8 :
 563 DQ 0x00000001db710640
 564 rk9 :           ; rk9..rk20 - six pairs used to fold xmm0..xmm5 into xmm7 after the 128B loop
 565 DQ 0x00000001d7cfc6ac
 566 rk10 :
 567 DQ 0x00000001ea89367e
 568 rk11 :
 569 DQ 0x000000018cb44e58
 570 rk12 :
 571 DQ 0x00000000df068dc2
 572 rk13 :
 573 DQ 0x00000000ae0b5394
 574 rk14 :
 575 DQ 0x00000001c7569e54
 576 rk15 :
 577 DQ 0x00000001c6e41596
 578 rk16 :
 579 DQ 0x0000000154442bd4
 580 rk17 :
 581 DQ 0x0000000174359406
 582 rk18 :
 583 DQ 0x000000003db1ecdc
 584 rk19 :
 585 DQ 0x000000015a546366
 586 rk20 :
 587 DQ 0x00000000f1da05aa
 588
 589         ; 32-byte table: a 16-byte load at pshufb_shf_table + n (n = 1..15) yields the mask listed below for "shr n"
 590 pshufb_shf_table:
 591 ; use these values for shift constants for the pshufb instruction
 592 ; different alignments result in values as shown:
 593 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 594 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 595 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 596 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 597 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 598 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 599 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 600 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 601 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 602 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 603 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 604 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 605 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 606 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 607 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 608 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 609 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 610         ; mask/mask2/mask3 are used as 16-byte memory operands of pand/pxor, which require 16-byte alignment;
 611         ; they are aligned only because everything since the "align 16" above is a multiple of 16 bytes
 612 mask:           ; keeps the low 64 bits
 613 dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 614 mask2:          ; clears the low 32 bits
 615 dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 616 mask3:          ; 0x80 in every byte; xored into a pshufb mask to flip the zeroing bit of each selector
 617 dq     0x8080808080808080, 0x8080808080808080