git.proxmox.com Git - ceph.git/blobdiff - ceph/src/spdk/isa-l/igzip/igzip_set_long_icf_fg_06.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / isa-l / igzip / igzip_set_long_icf_fg_06.asm
index 7e2cb22061ba5933228730bbb1d3975cae9a9eec..39708eda7b28300bda527ec6968f93c4ad7148b2 100644 (file)
 %define arg1 rcx
 %define arg2 rdx
 %define arg3 r8
-%define dist_code rsi
+%define arg4 r9
 %define len rdi
+%define dist rsi
 %else
 %define arg1 rdi
 %define arg2 rsi
 %define arg3 rdx
-%define dist_code rcx
+%define arg4 rcx
 %define len r8
+%define dist r9
 %endif
 
 %define next_in arg1
-%define end_in arg2
-%define match_lookup arg3
+%define end_processed arg2
+%define end_in arg3
+%define match_lookup arg4
 %define match_in rax
-%define dist r9
 %define match_offset r10
 %define tmp1 r11
-%define end_in_orig r12
+%define end_processed_orig r12
+%define dist_code r13
+%define tmp2 r13
 
 %define zmatch_lookup zmm0
 %define zmatch_lookup2 zmm1
@@ -71,7 +75,8 @@
 %define datas zmm11
 %define ztmp1 zmm12
 %define ztmp2 zmm13
-%define zvect_size zmm17
+%define zvect_size zmm16
+%define zmax_len zmm17
 %define ztwofiftyfour zmm18
 %define ztwofiftysix zmm19
 %define ztwosixtytwo zmm20
        save_reg        rsi, 8*16 + 0*8
        save_reg        rdi, 8*16 + 1*8
        save_reg        r12, 8*16 + 2*8
+       save_reg        r13, 8*16 + 3*8
        end_prolog
 %endm
 
        mov     rsi, [rsp + 8*16 + 0*8]
        mov     rdi, [rsp + 8*16 + 1*8]
        mov     r12, [rsp + 8*16 + 2*8]
+       mov     r13, [rsp + 8*16 + 3*8]
        add     rsp, stack_size
 %endm
 %else
 %define func(x) x:
 %macro FUNC_SAVE 0
        push    r12
+       push    r13
 %endm
 
 %macro FUNC_RESTORE 0
+       pop     r13
        pop     r12
 %endm
 %endif
@@ -137,30 +146,36 @@ global set_long_icf_fg_06
 func(set_long_icf_fg_06)
        FUNC_SAVE
 
-       mov     end_in_orig, end_in
-       sub     end_in, 15
-       vmovdqu32 zlong_lens, [long_len]
-       vmovdqu32 zlong_lens2, [long_len2]
-       vmovdqu32 zlens_mask, [len_mask]
+       lea     end_in, [next_in + arg3]
+       add     end_processed, next_in
+       mov     end_processed_orig, end_processed
+       lea     tmp1, [end_processed + LA_STATELESS]
+       cmp     end_in, tmp1
+       cmovg   end_in, tmp1
+       sub     end_processed, 15
+       vpbroadcastd zlong_lens, [long_len]
+       vpbroadcastd zlong_lens2, [long_len2]
+       vpbroadcastd zlens_mask, [len_mask]
        vmovdqu16 zdists_start, [dist_start]
-       vmovdqu32 zdists_mask, [dists_mask]
+       vpbroadcastd zdists_mask, [dists_mask]
        vmovdqu32 zincrement, [increment]
-       vmovdqu64 zdatas_perm2, [datas_perm2]
-       vmovdqu64 zdatas_perm3, [datas_perm3]
+       vbroadcasti64x2 zdatas_perm2, [datas_perm2]
+       vbroadcasti64x2 zdatas_perm3, [datas_perm3]
        vmovdqu64 zqword_shuf, [qword_shuf]
-       vmovdqu64 zbswap, [bswap_shuf]
-       vmovdqu64 znlen_mask, [nlen_mask]
-       vmovdqu64 zvect_size, [vect_size]
-       vmovdqu64 ztwofiftyfour, [twofiftyfour]
-       vmovdqu64 ztwofiftysix, [twofiftysix]
-       vmovdqu64 ztwosixtytwo, [twosixtytwo]
+       vbroadcasti64x2 zbswap, [bswap_shuf]
+       vpbroadcastd znlen_mask, [nlen_mask]
+       vpbroadcastd zvect_size, [vect_size]
+       vpbroadcastd zmax_len, [max_len]
+       vpbroadcastd ztwofiftyfour, [twofiftyfour]
+       vpbroadcastd ztwofiftysix, [twofiftysix]
+       vpbroadcastd ztwosixtytwo, [twosixtytwo]
        vmovdqu32 zmatch_lookup, [match_lookup]
 
 .fill_loop: ; Tahiti is a magical place
        vmovdqu32 zmatch_lookup2, zmatch_lookup
        vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]
 
-       cmp     next_in, end_in
+       cmp     next_in, end_processed
        jae     .end_fill
 
 .finish_entry:
@@ -230,25 +245,21 @@ func(set_long_icf_fg_06)
        mov     match_in, next_in
        sub     match_in, dist
 
-       mov     len, 2
-%rep 3
-       vmovdqu8 ztmp1, [next_in + len]
-       vmovdqu8 ztmp2, [match_in + len]
-       vpcmpb  k3, ztmp1, [match_in + len], NEQ
-       ktestq  k3, k3
-       jnz     .miscompare
-
-       add     len, 64
-%endrep
-
-       vmovdqu8 ztmp1, [next_in + len]
-       vmovdqu8 ztmp2, [match_in + len]
-       vpcmpb  k3, ztmp1, ztmp2, 4
-
-.miscompare:
-       kmovq   tmp1, k3
-       tzcnt   tmp1, tmp1
-       add     len, tmp1
+       mov     len, 16
+       mov     tmp2, end_in
+       sub     tmp2, next_in
+
+       compare_z next_in, match_in, len, tmp2, tmp1, k3, ztmp1, ztmp2
+
+       vpbroadcastd zlens1, len %+ d
+       vpsubd  zlens1, zlens1, zincrement
+       vpaddd  zlens1, zlens1, ztwofiftyfour
+
+       mov     tmp2, end_processed
+       sub     tmp2, next_in
+       cmp     len, tmp2
+       cmovg   len, tmp2
+
        add     next_in, len
        lea     match_lookup, [match_lookup + ICF_CODE_BYTES * len]
        vmovdqu32 zmatch_lookup, [match_lookup]
@@ -256,9 +267,6 @@ func(set_long_icf_fg_06)
        vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x
        vpandd  zmatch_lookup2, zmatch_lookup2, znlen_mask
 
-       vpbroadcastd zlens1, len %+ d
-       vpsubd  zlens1, zlens1, zincrement
-       vpaddd  zlens1, zlens1, ztwofiftyfour
        neg     len
 
 .update_match_lookup:
@@ -267,7 +275,11 @@ func(set_long_icf_fg_06)
        vpcmpgtd k4, zlens1, ztwofiftysix
        kandw   k3, k3, k4
 
-       vpaddd  zlens2 {k3}{z}, zlens1, zmatch_lookup2
+       vpcmpgtd k4, zlens1, zmax_len
+       vmovdqu32 zlens, zlens1
+       vmovdqu32 zlens {k4}, zmax_len
+
+       vpaddd  zlens2 {k3}{z}, zlens, zmatch_lookup2
 
        vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2
 
@@ -281,11 +293,11 @@ func(set_long_icf_fg_06)
        jmp     .update_match_lookup
 
 .end_fill:
-       mov     end_in, end_in_orig
-       cmp     next_in, end_in
+       mov     end_processed, end_processed_orig
+       cmp     next_in, end_processed
        jge     .finish
 
-       mov     tmp1, end_in
+       mov     tmp1, end_processed
        sub     tmp1, next_in
        vpbroadcastd ztmp1, tmp1 %+ d
        vpcmpd k3, ztmp1, zincrement, 6
@@ -301,44 +313,12 @@ endproc_frame
 
 section .data
 align 64
+;; 64 byte data
 dist_start:
        dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
        dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
        dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
        dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
-len_mask:
-       dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
-       dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
-       dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
-       dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
-dists_mask:
-       dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
-       dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
-       dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
-       dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
-long_len:
-       dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
-       dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
-long_len2:
-       dd 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
-       dd 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7
-
-increment:
-       dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
-       dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
-datas_perm2:
-       dq 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1
-datas_perm3:
-       dq 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2
-bswap_shuf:
-       db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-       db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
-       db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-       db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
-       db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-       db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
-       db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-       db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
 qword_shuf:
        db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
@@ -349,23 +329,39 @@ qword_shuf:
        db 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
        db 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
        db 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+
+;; 16 byte data
+increment:
+       dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+       dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+
+datas_perm2:
+       dq 0x0, 0x1
+datas_perm3:
+       dq 0x1, 0x2
+bswap_shuf:
+       db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+       db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+
+;; 4 byte data
+len_mask:
+       dd LIT_LEN_MASK
+dists_mask:
+       dd LIT_DIST_MASK
+long_len:
+       dd 0x105
+long_len2:
+       dd 0x7
+max_len:
+       dd 0xfe + 0x102
 vect_size:
-       dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
-       dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
-       dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
-       dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
+       dd VECT_SIZE
 twofiftyfour:
-       dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
-       dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
+       dd 0xfe
 twofiftysix:
-       dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
-       dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
+       dd 0x100
 twosixtytwo:
-       dd 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106
-       dd 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106, 0x106
+       dd 0x106
 nlen_mask:
-       dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
-       dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
-       dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
-       dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
+       dd 0xfffffc00
 %endif