ceph/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 %include "reg_sizes.asm"
  31 %include "lz0a_const.asm"
  32 %include "data_struct2.asm"
  33 %include "igzip_compare_types.asm"
  34 %define NEQ 4
     ;; NOTE(review): NEQ is not referenced anywhere in the visible part of
     ;; this file; presumably it is consumed by one of the included macro
     ;; files -- confirm before removing.
  35
     ;; Make memory operands that name a bare symbol (e.g. [long_len])
     ;; RIP-relative by default.
  36 default rel
  37
  38 %ifidn __OUTPUT_FORMAT__, win64
     ;; win64 output: the four arguments are read from rcx, rdx, r8, r9.
     ;; len/tmp2/dist use rdi and rsi here; FUNC_SAVE/FUNC_RESTORE for
     ;; win64 save and restore both of those registers.
  39 %define arg1 rcx
  40 %define arg2 rdx
  41 %define arg3 r8
  42 %define arg4 r9
     ;; len and tmp2 are two names for the same register.
  43 %define len rdi
  44 %define tmp2 rdi
  45 %define dist rsi
  46 %else
     ;; Every other output format: the four arguments are read from
     ;; rdi, rsi, rdx, rcx, and the scratch names map to r8/r9.
  47 %define arg1 rdi
  48 %define arg2 rsi
  49 %define arg3 rdx
  50 %define arg4 rcx
     ;; len and tmp2 are two names for the same register.
  51 %define len r8
  52 %define tmp2 r8
  53 %define dist r9
  54 %endif
  55
  56 %define next_in arg1
     ;; next_in: current input position; advanced as entries are scanned.
  57 %define end_processed arg2
     ;; end_processed: arrives as an offset and is turned into a pointer by
     ;; adding next_in at function entry.
  58 %define end_in arg3
     ;; end_in: arrives as an offset and is turned into a pointer by adding
     ;; next_in at function entry.
  59 %define match_lookup arg4
     ;; match_lookup: pointer into the entry table; stepped by
     ;; ICF_CODE_BYTES for every byte next_in advances.
  60 %define match_in rax
     ;; match_in: next_in minus the decoded distance.
  61 %define match_offset r10
     ;; match_offset: lane index of the first entry flagged in a vector.
  62 %define tmp1 r11
  63 %define end_processed_orig r12
     ;; end_processed_orig: copy of the end_processed pointer taken before
     ;; it is reduced by VECT_SIZE - 1.
  64 %define dist_code r13
     ;; NOTE(review): dist_code is not referenced in the visible code; it
     ;; names the same register as tmp3.
  65 %define tmp3 r13
  66
  67 %define ymatch_lookup ymm0
     ;; ymatch_lookup: vector of entries loaded one step ahead of the
     ;; vector currently being examined.
  68 %define ymatch_lookup2 ymm1
     ;; ymatch_lookup2: the vector being examined; after a hit it holds the
     ;; matched entry broadcast to every lane and masked with ynlen_mask.
  69 %define ylens ymm2
     ;; ylens: masked length fields, later the clamped candidate values.
  70 %define ycmp2 ymm3
     ;; ycmp2: lanes whose candidate value exceeds ymax_len.
  71 %define ylens1 ymm4
     ;; ylens1: per-lane candidate length values for the entries that
     ;; follow a hit.
  72 %define ylens2 ymm5
     ;; ylens2: length fields currently stored, then the values to store.
  73 %define ycmp ymm6
     ;; ycmp: comparison result; also used as the masked-store mask.
  74 %define ytmp1 ymm7
  75 %define ytmp2 ymm8
     ;; ymm9-ymm15 hold constants loaded once at function entry from the
     ;; tables at the end of this file.
  76 %define yvect_size ymm9
  77 %define ymax_len ymm10
  78 %define ytwofiftysix ymm11
  79 %define ynlen_mask ymm12
     ;; NOTE(review): ydists_mask is loaded at entry but not used by any
     ;; other instruction in the visible code.
  80 %define ydists_mask ymm13
  81 %define ylong_lens ymm14
  82 %define ylens_mask ymm15
  83
  84 %ifidn __OUTPUT_FORMAT__, win64
     ;; win64 frame: ten 16-byte slots for xmm6-xmm15, four 8-byte slots
     ;; for rsi/rdi/r12/r13, plus 8 more bytes (200 bytes in total).
     ;; NOTE(review): the trailing 8 presumably brings rsp to a 16-byte
     ;; boundary after the return-address push, which the vmovdqa
     ;; saves below require -- confirm.
  85 %define stack_size  10*16 + 4 * 8 + 8
     ;; func(x) opens the function with proc_frame (defined outside this
     ;; listing).
  86 %define func(x) proc_frame x
     ;; FUNC_SAVE: allocate the frame and save every callee-saved register
     ;; this routine modifies (xmm6-xmm15, rsi, rdi, r12, r13).
  87 %macro FUNC_SAVE 0
  88         alloc_stack     stack_size
  89         vmovdqa [rsp + 0*16], xmm6
  90         vmovdqa [rsp + 1*16], xmm7
  91         vmovdqa [rsp + 2*16], xmm8
  92         vmovdqa [rsp + 3*16], xmm9
  93         vmovdqa [rsp + 4*16], xmm10
  94         vmovdqa [rsp + 5*16], xmm11
  95         vmovdqa [rsp + 6*16], xmm12
  96         vmovdqa [rsp + 7*16], xmm13
  97         vmovdqa [rsp + 8*16], xmm14
  98         vmovdqa [rsp + 9*16], xmm15
             ;; General-purpose saves sit directly above the ten XMM slots.
  99         save_reg        rsi, 10*16 + 0*8
 100         save_reg        rdi, 10*16 + 1*8
 101         save_reg        r12, 10*16 + 2*8
 102         save_reg        r13, 10*16 + 3*8
 103         end_prolog
 104 %endm
 105
     ;; FUNC_RESTORE: reload everything FUNC_SAVE stored (same slot offsets)
     ;; and release the frame.
 106 %macro FUNC_RESTORE 0
 107         vmovdqa xmm6, [rsp + 0*16]
 108         vmovdqa xmm7, [rsp + 1*16]
 109         vmovdqa xmm8, [rsp + 2*16]
 110         vmovdqa xmm9, [rsp + 3*16]
 111         vmovdqa xmm10, [rsp + 4*16]
 112         vmovdqa xmm11, [rsp + 5*16]
 113         vmovdqa xmm12, [rsp + 6*16]
 114         vmovdqa xmm13, [rsp + 7*16]
 115         vmovdqa xmm14, [rsp + 8*16]
 116         vmovdqa xmm15, [rsp + 9*16]
 117
 118         mov     rsi, [rsp + 10*16 + 0*8]
 119         mov     rdi, [rsp + 10*16 + 1*8]
 120         mov     r12, [rsp + 10*16 + 2*8]
 121         mov     r13, [rsp + 10*16 + 3*8]
 122         add     rsp, stack_size
 123 %endm
 124 %else
     ;; Other output formats: func(x) is a plain label, and only r12 and
     ;; r13 are saved, with push/pop.
 125 %define func(x) x:
 126 %macro FUNC_SAVE 0
 127         push r12
 128         push r13
 129 %endm
 130
     ;; Pops are in the reverse order of the pushes in FUNC_SAVE.
 131 %macro FUNC_RESTORE 0
 132         pop r13
 133         pop r12
 134 %endm
 135 %endif
 136 %define VECT_SIZE 8
     ;; VECT_SIZE: number of 32-bit lanes in one YMM register, i.e. how many
     ;; table entries / input positions one vector step covers.
 137
 138 global set_long_icf_fg_04
     ;; ---------------------------------------------------------------------
     ;; set_long_icf_fg_04(arg1, arg2, arg3, arg4)
     ;;
     ;; Scans a table of 32-bit entries (arg4) that runs parallel to an input
     ;; buffer (arg1). Whenever an entry's length field is greater than
     ;; long_len, the match it describes is re-measured against the input
     ;; with compare_y, and the entries that follow are overwritten where
     ;; the re-measured match yields a larger length value than the one
     ;; already stored.
     ;;
     ;; In:   arg1 = next_in        pointer to the current input position
     ;;       arg2 = end_processed  offset from next_in (made a pointer here)
     ;;       arg3 = end_in         offset from next_in (made a pointer here)
     ;;       arg4 = match_lookup   pointer to the entry for next_in
     ;; Out:  no return value is set; entries are updated in place.
     ;; Regs: rax, r10, r11, r12, r13, len/tmp2, dist, ymm0-ymm15 are used;
     ;;       FUNC_SAVE/FUNC_RESTORE preserve the callee-saved ones.
     ;;
     ;; NOTE(review): ICF_CODE_BYTES, LA_STATELESS, DIST_OFFSET,
     ;; LIT_DIST_MASK, EXTRA_BITS_OFFSET and compare_y are defined outside
     ;; this listing. The dword loads and 8-lane vectors below assume
     ;; ICF_CODE_BYTES == 4 -- confirm.
     ;; ---------------------------------------------------------------------
 139 func(set_long_icf_fg_04)
 140         FUNC_SAVE
 141
             ;; Turn the two offsets into pointers (end_in and arg3 are the
             ;; same register, as are end_processed and arg2).
 142         lea     end_in, [next_in + arg3]
 143         add     end_processed, next_in
 144         mov     end_processed_orig, end_processed
             ;; end_in = min(end_in, end_processed + LA_STATELESS).
             ;; The comparison is signed (cmovg).
 145         lea     tmp1, [end_processed + LA_STATELESS]
 146         cmp     end_in, tmp1
 147         cmovg   end_in, tmp1
             ;; Pull the loop bound back so the main loop only runs while a
             ;; full vector of VECT_SIZE positions remains; the tail is
             ;; handled at .end_fill.
 148         sub     end_processed, VECT_SIZE - 1
             ;; Load the constant vectors once.
 149         vmovdqu ylong_lens, [long_len]
 150         vmovdqu ylens_mask, [len_mask]
 151         vmovdqu ydists_mask, [dists_mask]
 152         vmovdqu ynlen_mask, [nlen_mask]
 153         vmovdqu yvect_size, [vect_size]
 154         vmovdqu ymax_len, [max_len]
 155         vmovdqu ytwofiftysix, [twofiftysix]
             ;; Preload the first vector of entries.
 156         vmovdqu ymatch_lookup, [match_lookup]
 157
 158 .fill_loop: ; top of the scan loop: ymatch_lookup holds the entries at match_lookup
             ;; Current vector moves to ymatch_lookup2; the following vector
             ;; is loaded ahead of time.
 159         vmovdqu ymatch_lookup2, ymatch_lookup
 160         vmovdqu ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]
 161
             ;; Leave the main loop once next_in reaches the (reduced) bound.
 162         cmp     next_in, end_processed
 163         jae     .end_fill
 164
 165 .finish_entry:
             ;; Isolate the length field of each entry and flag the lanes
             ;; whose field is greater than long_len. vpmovmskb yields one
             ;; bit per byte, so each dword lane contributes four mask bits.
 166         vpand   ylens, ymatch_lookup2, ylens_mask
 167         vpcmpgtd ycmp, ylens, ylong_lens
 168         vpmovmskb tmp1, ycmp
 169
 170 ;; Speculatively increment
 171         add     next_in, VECT_SIZE
 172         add     match_lookup, ICF_CODE_BYTES * VECT_SIZE
 173
             ;; No lane flagged: continue with the next vector.
 174         test    tmp1, tmp1
 175         jz      .fill_loop
 176
             ;; Lane index of the first flagged entry = first set bit / 4.
 177         tzcnt   match_offset, tmp1
 178         shr     match_offset, 2
 179
             ;; Undo the speculative step and move to the flagged entry.
 180         lea     next_in, [next_in + match_offset - VECT_SIZE]
 181         lea     match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
             ;; Fetch that entry; keep a copy in the low lane of
             ;; ymatch_lookup2 for the broadcast further down.
 182         mov     dist %+ d, [match_lookup]
 183         vmovd   ymatch_lookup2 %+ x, dist %+ d
 184
             ;; Decode the distance:
             ;;   index = (entry >> DIST_OFFSET) & LIT_DIST_MASK
             ;;   extra = entry >> EXTRA_BITS_OFFSET
             ;;   dist  = dist_start[index] + extra
             ;; tmp2 holds the table address because a RIP-relative operand
             ;; cannot take an index register.
 185         mov     tmp1, dist
 186         shr     dist, DIST_OFFSET
 187         and     dist, LIT_DIST_MASK
 188         shr     tmp1, EXTRA_BITS_OFFSET
 189         lea     tmp2, [dist_start]
 190         mov     dist %+ w, [tmp2 +  2 * dist]
 191         add     dist, tmp1
 192
             ;; match_in = start of the earlier occurrence.
 193         mov     match_in, next_in
 194         sub     match_in, dist
 195
             ;; len starts at 8 and tmp3 = end_in - next_in.
             ;; NOTE(review): compare_y is defined outside this listing;
             ;; presumably it extends len to the number of equal bytes at
             ;; next_in / match_in, limited by tmp3 -- confirm. (len shares
             ;; its register with tmp2, which is dead by now.)
 196         mov     len, 8
 197         mov     tmp3, end_in
 198         sub     tmp3, next_in
 199
 200         compare_y next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2
 201
             ;; ylens1[i] = len - i + 0xfe for lanes i = 0..7: the candidate
             ;; length value for each of the next eight positions.
 202         vmovd   ylens1 %+ x, len %+ d
 203         vpbroadcastd ylens1, ylens1 %+ x
 204         vpsubd  ylens1, ylens1, [increment]
 205         vpaddd  ylens1, ylens1, [twofiftyfour]
 206
             ;; len = min(len, end_processed - next_in), signed compare.
             ;; ylens1 was built from the unclamped len above.
 207         mov     tmp3, end_processed
 208         sub     tmp3, next_in
 209         cmp     len, tmp3
 210         cmovg   len, tmp3
 211
             ;; Skip past the match and preload the entries at the new
             ;; position for the next .fill_loop pass.
 212         add     next_in, len
 213         lea     match_lookup, [match_lookup + ICF_CODE_BYTES * len]
 214         vmovdqu ymatch_lookup, [match_lookup]
 215
             ;; Broadcast the matched entry to all lanes and clear the bits
             ;; that nlen_mask (0xfffffc00) does not keep.
 216         vpbroadcastd ymatch_lookup2, ymatch_lookup2 %+ x
 217         vpand   ymatch_lookup2, ymatch_lookup2, ynlen_mask
 218
             ;; len becomes a negative index that counts up from the matched
             ;; entry toward the new match_lookup.
 219         neg     len
 220
 221 .update_match_lookup:
             ;; Length fields currently stored for these eight entries.
 222         vpand   ylens2, ylens_mask, [match_lookup + ICF_CODE_BYTES * len]
 223
             ;; Select lanes where the candidate is greater than the stored
             ;; field AND greater than 0x100.
 224         vpcmpgtd ycmp, ylens1, ylens2
 225         vpcmpgtd ytmp1, ylens1, ytwofiftysix
 226         vpand   ycmp, ycmp, ytmp1
 227         vpmovmskb tmp1, ycmp
 228
             ;; ylens = min(ylens1, max_len), built as a select on ycmp2.
 229         vpcmpgtd ycmp2, ylens1, ymax_len
 230         vpandn ylens, ycmp2, ylens1
 231         vpand ycmp2, ymax_len, ycmp2
 232         vpor ylens, ycmp2
 233
             ;; New entry = clamped length value + the kept bits of the
             ;; matched entry; zeroed in lanes that are not selected.
 234         vpaddd  ylens2, ylens, ymatch_lookup2
 235         vpand   ylens2, ylens2, ycmp
 236
             ;; Store only the selected lanes.
 237         vpmaskmovd [match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2
 238
             ;; Stop updating once a whole vector had no lane selected.
 239         test    tmp1 %+ d, tmp1 %+ d
 240         jz      .fill_loop
 241
             ;; Move on by one vector; every candidate shrinks by VECT_SIZE.
 242         add     len, VECT_SIZE
 243         vpsubd  ylens1, ylens1, yvect_size
 244
 245         jmp     .update_match_lookup
 246
 247 .end_fill:
             ;; Tail: restore the real end pointer; finish if nothing is
             ;; left (signed compare).
 248         mov     end_processed, end_processed_orig
 249         cmp     next_in, end_processed
 250         jge     .finish
 251
             ;; tmp1 = number of positions left. Zero the lanes of
             ;; ymatch_lookup2 whose index is not below that count, then
             ;; reuse the main-loop body for the partial vector.
 252         mov     tmp1, end_processed
 253         sub     tmp1, next_in
 254         vmovd   ytmp1 %+ x, tmp1 %+ d
 255         vpbroadcastd ytmp1, ytmp1 %+ x
 256         vpcmpgtd ytmp1, ytmp1, [increment]
 257         vpand   ymatch_lookup2, ymatch_lookup2, ytmp1
 258         jmp     .finish_entry
 259
 260 .finish:
 261         FUNC_RESTORE
 262         ret
 263
     ;; Pairs with the proc_frame that func() emits for win64 output.
 264 endproc_frame
 265
 266 section .data
     ;; Constant tables for set_long_icf_fg_04. The code in this file only
     ;; reads them.
 267 align 64
     ;; dist_start: 16-bit base distance per distance index, looked up as
     ;; dist_start[index]. The 30 non-zero values equal the base distances
     ;; of the DEFLATE distance codes (RFC 1951, section 3.2.5); the last
     ;; two entries are zero.
 268 dist_start:
 269         dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
 270         dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
 271         dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
 272         dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
     ;; len_mask: ANDed with each entry to isolate its length field.
 273 len_mask:
 274         dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
 275         dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
     ;; dists_mask: loaded into ydists_mask at function entry.
     ;; NOTE(review): that register is not used afterwards in the visible
     ;; code.
 276 dists_mask:
 277         dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
 278         dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
     ;; long_len: an entry is re-measured when its length field is greater
     ;; than 0x105 (= 7 + 0xfe, using the same 0xfe offset the code adds
     ;; to byte counts).
 279 long_len:
 280         dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
     ;; increment: the lane index of each of the eight lanes.
 281 increment:
 282         dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
     ;; vect_size: VECT_SIZE in every lane; subtracted from the candidate
     ;; values after each update step.
 283 vect_size:
 284         dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
 285         dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
     ;; twofiftyfour: offset added to a byte count to form the value that
     ;; is compared with, and stored into, the length field.
 286 twofiftyfour:
 287         dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
     ;; twofiftysix: a candidate is stored only when it is greater than
     ;; 0x100 (= 2 + 0xfe).
 288 twofiftysix:
 289         dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
     ;; nlen_mask: keeps the upper 22 bits of an entry and clears the low
     ;; 10 bits.
     ;; NOTE(review): presumably the complement of LIT_LEN_MASK -- confirm.
 290 nlen_mask:
 291         dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
 292         dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
     ;; max_len: upper clamp for stored length values, 0x102 + 0xfe = 0x200.
 293 max_len:
 294         dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102
 295         dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102