; Source: ceph/src/isa-l/igzip/encode_df_04.asm (Intel ISA-L, imported with ceph 15.2.8)
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "stdmac.asm"
34
35 %define ARCH 04
36 %define USE_HSWNI
37
38 ; tree entry is 4 bytes:
39 ; lit/len tree (513 entries)
40 ; | 3 | 2 | 1 | 0 |
41 ; | len | code |
42 ;
43 ; dist tree
44 ; | 3 | 2 | 1 | 0 |
45 ; |eblen:codlen| code |
46
47 ; token format:
48 ; DIST_OFFSET:0 : lit/len
49 ; 31:(DIST_OFFSET + 5) : dist Extra Bits
50 ; (DIST_OFFSET + 5):DIST_OFFSET : dist code
51 ; lit/len: 0-256 (literal)
52 ; 257-512 (dist + 254)
53
54 ; returns final token pointer
55 ; equal to token_end if successful
56 ; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
57 ; BitBuf *out_buf, uint32_t *trees);
58
;; Register assignment.  The function is ABI-dual: win64 passes args in
;; rcx/rdx/r8/r9, System V in rdi/rsi/rdx/rcx, so the working registers
;; are chosen per-ABI to avoid the argument registers that remain live.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define sym rsi			; scratch: lit/len table entry
%define dsym rdi		; scratch: dist table entry
%define hufftables r9		; aliases arg4, no copy needed on win64
%define ptr r11			; current token pointer
%else
; Linux
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym r9
%define dsym r8
%define hufftables r11		; arg4 (rcx) is needed as the bit count, so copy
%define ptr rdi			; aliases arg1, no copy needed on SysV
%endif

%define in_buf_end arg2		; one-past-last token (adjusted by VECTOR_LOOP_PROCESSED)
%define bitbuf arg3		; BitBuf pointer; only valid until out_buf is loaded
%define out_buf bitbuf		; output write pointer (reuses arg3)
; bit_count is rcx
%define bits rax		; pending-bits accumulator
%define data r12		; current raw token (scalar tail path)
%define tmp rbx
%define len dsym		; NOTE(review): `len` appears unused in this file
%define tmp2 r10
%define end_ptr rbp		; output-end sentinel (VECTOR_SLOP subtracted in vector loop)

;; Masks extracting the lit/len and dist fields of a 32-bit token
;; (see the token-format comment above).
%define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)

;; ymm register roles for the vector loop.
%define codes1 ymm1		; lit/len Huffman codes
%define code_lens1 ymm2		; lit/len code bit lengths
%define codes2 ymm3		; dist codes (later merged with extra bits)
%define code_lens2 ymm4		; dist code bit lengths
%define codes3 ymm5		; dist extra bits
%define code_lens3 ymm6		; extra-bit counts / combined lengths
%define codes4 ymm7
%define syms ymm7		; lit/len symbols (shares ymm7 with codes4)

%define code_lens4 ymm8
%define dsyms ymm8		; dist symbols (shares ymm8 with code_lens4)

%define ytmp ymm9
%define codes_lookup1 ymm10	; gathered lit/len table entries
%define codes_lookup2 ymm11	; gathered dist table entries
%define datas ymm12		; 8 raw 32-bit tokens
%define ybits ymm13		; vector image of the bit accumulator
%define ybits_count ymm14	; vector image of the bit count
%define yoffset_mask ymm15

%define VECTOR_SIZE 0x20			; 32 bytes = 8 four-byte tokens per load
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)	; loop keeps one vector of lookahead
%define VECTOR_SLOP 0x20 - 8			; output headroom a vector store may overrun

;; Stack frame layout used by FUNC_SAVE/FUNC_RESTORE plus one slot for
;; the BitBuf pointer (out_buf clobbers the register holding it).
gpr_save_mem_offset equ 0
gpr_save_mem_size equ 8 * 6
xmm_save_mem_offset equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size equ 10 * 16
bitbuf_mem_offset equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size equ 8
stack_size equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
125
126
;; FUNC_SAVE / FUNC_RESTORE: spill and reload callee-saved state around the
;; function body.  rbx/rbp/r12 are callee-saved in both ABIs; win64
;; additionally requires rsi, rdi and xmm6-xmm15 to be preserved.
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	;; Each MOVDQU transfers 16 bytes, so the save slots must be spaced
	;; 16 bytes apart (xmm_save_mem_size reserves 10 * 16 bytes).  The
	;; previous 8-byte stride made consecutive stores overlap, so
	;; FUNC_RESTORE reloaded corrupted values into the callee-saved
	;; xmm registers on win64.
	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	;; Must mirror FUNC_SAVE's 16-byte slot stride exactly.
	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro
174
;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_04(uint32_t *token_start, uint32_t *token_end,
;                                 BitBuf *out_buf, uint32_t *trees)
;
; Huffman-encodes the ICF token stream [token_start, token_end) into the
; bit buffer, 8 tokens per iteration, using AVX2 gathers and variable
; shifts plus BMI2 SHLX/SHRX (ARCH 04, USE_HSWNI).
; Returns in rax the address of the first token NOT consumed; this equals
; token_end when the whole stream fit in the output buffer.
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	FUNC_SAVE

	;; Copy args into working registers only where they differ per-ABI
	;; (see the %define blocks above).
%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Unpack the BitBuf state.  out_buf aliases the bitbuf register,
	;; so the struct pointer is stashed on the stack first and re-read
	;; at .overflow to write the state back.
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	;; Reserve output headroom for a full vector store, and require two
	;; vectors of input lookahead (the main loop pre-gathers the next
	;; batch).  Otherwise fall through to the scalar tail.
	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish

	;; Prime the first batch: gather the lit/len and dist table entries
	;; for 8 tokens.  vpgatherdd consumes its all-ones mask register, so
	;; ytmp is re-set before each gather.
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Seed the vector bit accumulator/count from the scalar BitBuf state.
	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	vmovdqa	yoffset_mask, [offset_mask]

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpand	codes1, codes_lookup1, [lit_icr_mask]

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vpblendw codes2, ybits, codes_lookup2, 0x55	;Bits 8 and above of ybits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpand	code_lens3, [eb_icr_mask]

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	;; Bail out before writing if the output buffer may be exceeded.
	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	add	ptr, VECTOR_SIZE
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxor	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes: if any lane's combined length exceeds its
	;; per-lane threshold in max_write_d, take the slow scalar-emit path.
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd	ytmp, code_lens3, [max_write_d]
	vptest	ytmp, ytmp
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	;; Split buffer data into qwords, ytmp is 0 after last branch
	vpblendd codes3, ytmp, codes1, 0x55
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxor	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Split buffer data into dqwords, ytmp is 0 after last branch
	vpblendd codes2, ytmp, codes1, 0x33
	vpblendd code_lens2, ytmp, code_lens1, 0x33
	vpsrldq	codes1, 8
	vpsrldq	code_lens1, 8

	;; Bit align dqwords
	vpaddq	code_lens1, code_lens1, code_lens2
	vpand	ybits_count, code_lens1, yoffset_mask	;Extra bits
	vpermq	ybits_count, ybits_count, 0xcf
	vpaddq	code_lens2, ybits_count
	vpsllvq	codes2, codes2, ybits_count

	;; Merge two qwords into dqwords
	vmovdqa	ytmp, [q_64]
	vpsubq	code_lens3, ytmp, code_lens2
	vpsrlvq	codes3, codes1, code_lens3
	vpslldq	codes3, codes3, 8

	vpsllvq	codes1, codes1, code_lens2

	vpxor	codes1, codes1, codes3
	vpxor	codes1, codes1, codes2

	vmovq	tmp, code_lens1 %+ x	;Number of bytes
	shr	tmp, 3

	;; Extract last bytes
	vpaddq	code_lens2, code_lens1, ybits_count
	vpsrlq	code_lens2, code_lens2, 3
	vpshufb	codes2, codes1, code_lens2
	vpand	codes2, codes2, [bytes_mask]
	vextracti128 ybits %+ x, codes2, 1

	;; Check for short codes
	vptest	code_lens2, [min_write_mask]
	jz	.short_codes
.short_codes_next:

	vpermq	codes2, codes2, 0x45
	vpor	codes1, codes1, codes2

	;; bit shift upper dqword combined bits to line up with lower dqword
	vextracti128 code_lens2 %+ x, code_lens1, 1

	; Write out lower dqword of combined bits
	vmovdqu	[out_buf], codes1
	vpaddq	code_lens1, code_lens1, code_lens2

	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
	shr	tmp2, 3
	vpand	ybits_count, code_lens1, yoffset_mask	;Extra bits

	; Write out upper dqword of combined bits
	vextracti128 [out_buf + tmp], codes1, 1
	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Move the vector bit-buffer state back into the scalar registers
	;; before the scalar tail / state write-back.
	vmovq	rcx, ybits_count %+ x
	vmovq	bits, ybits %+ x
	jmp	.finish

.short_codes:
	;; Merge last bytes when the second dqword contains less than a byte
	vpor	ybits %+ x, codes2 %+ x
	jmp	.short_codes_next

.long_codes:
	;; Slow path for batches containing very long codes: combine each
	;; token's codes into qword lanes, then emit them one qword at a
	;; time with scalar SHLX/SHRX bit packing.  ptr is rewound because
	;; the fast path already advanced it for the prefetch.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vpxor	ytmp, ytmp, ytmp
	vpblendd codes3, ytmp, codes1, 0x55
	vpblendd code_lens3, ytmp, code_lens1, 0x55
	vpblendd codes4, ytmp, codes2, 0x55

	vpsllvq	codes4, codes4, code_lens3
	vpxor	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Scalar state restarts empty: the pending bits were merged into
	;; the first lane (codes3) above.
	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3
	;; Emit 4 qword lanes per repetition (low xmm lane of codes3/codes1,
	;; then their high qwords); the %rep's trailing vextracti128s move
	;; the upper 128 bits down for the second pass.
%rep 2
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	vextracti128 codes3 %+ x, codes3, 1
	vextracti128 code_lens3 %+ x, code_lens3, 1
	vextracti128 codes1 %+ x, codes1, 1
	vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
	sub	end_ptr, VECTOR_SLOP

	;; Return the scalar bit state to the vector registers and rejoin
	;; the fast loop if enough input remains.
	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Scalar tail: undo the lookahead/slop adjustments and encode one
	;; token at a time.
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits: flush whole bytes to the output, keep 0-7 leftovers
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Normal exit AND out-of-space exit: write the bit-buffer state
	;; back into the BitBuf struct and return the current token pointer
	;; (== token_end on full success).
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret
549
;; Read-only constant tables for the vector loop (32-byte aligned for
;; vmovdqa loads).
section .data
	align 32
;; Per-lane limits on the combined (lit/len + dist + extra) bit length;
;; exceeding any lane's limit diverts to .long_codes.
;; NOTE(review): the per-lane values presumably account for bits carried
;; between lanes during the qword/dqword merges — confirm against the
;; merge math in .main_loop before changing.
max_write_d:
	dd	0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
;; vptest mask selecting the low byte of qword 2 of code_lens2; zero
;; there means the upper dqword holds less than a byte (.short_codes).
min_write_mask:
	dq	0x00, 0x00, 0xff, 0x00
;; Keeps only the low 3 bits (bit-within-byte offset) of the first qword.
offset_mask:
	dq	0x0000000000000007, 0x0000000000000000
	dq	0x0000000000000000, 0x0000000000000000
;; Constant 64 in the low qword of each 128-bit lane, used to compute
;; complementary shift counts (64 - len).
q_64:
	dq	0x0000000000000040, 0x0000000000000000
	dq	0x0000000000000040, 0x0000000000000000
;; LIT_MASK broadcast across all 8 dwords (lit/len field extraction).
lit_mask:
	dd	LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
	dd	LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
;; DIST_MASK broadcast across all 8 dwords (dist code field extraction).
dist_mask:
	dd	DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
	dd	DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
;; Low 24 bits of each lit/len table entry = the Huffman code itself
;; (bits 31:24 hold the code length).
lit_icr_mask:
	dd	0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
	dd	0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
;; Low byte of the shifted dist entry = extra-bit count.
eb_icr_mask:
	dd	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
	dd	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
;; Keeps only the byte selected by vpshufb in each 128-bit lane.
bytes_mask:
	dq	0x00000000000000ff, 0x0000000000000000
	dq	0x00000000000000ff, 0x0000000000000000