;; File: encode_df_04.asm — ceph/src/isa-l/igzip (vendored from Intel ISA-L,
;; imported with ceph 15.2.8).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

;; ARCH 04 is the AVX2 build of this routine; USE_HSWNI enables the
;; Haswell new-instruction (BMI2) forms of SHLX/SHRX used below.
%define ARCH 04
%define USE_HSWNI
; tree entry is 4 bytes:
; lit/len tree (513 entries)
; |  3  |  2  |  1  |  0  |
; | len |       code      |
;
; dist tree
; |  3  |  2  |  1  |  0  |
; |eblen:codlen|   code   |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; returns final token pointer
; equal to token_end if successful
; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                     BitBuf *out_buf, uint32_t *trees);

;; Per-ABI register assignments for the C argument registers and the
;; scratch roles used throughout the function.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1		rcx
%define arg2		rdx
%define arg3		r8
%define arg4		r9
%define sym		rsi
%define dsym		rdi
%define hufftables	r9
%define ptr		r11
%else
; Linux (System V AMD64)
%define arg1		rdi
%define arg2		rsi
%define arg3		rdx
%define arg4		rcx
%define sym		r9
%define dsym		r8
%define hufftables	r11
%define ptr		rdi
%endif

%define in_buf_end	arg2
%define bitbuf		arg3
%define out_buf		bitbuf		; out_buf reuses bitbuf's register once loaded
; bit_count is rcx
%define bits		rax
%define data		r12
%define tmp		rbx
%define len		dsym
%define tmp2		r10
%define end_ptr		rbp

;; Masks extracting the lit/len symbol and the dist symbol from a token.
%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)

;; YMM register roles.  Note the deliberate aliases: syms shares ymm7 with
;; codes4 and dsyms shares ymm8 with code_lens4 — the symbol registers are
;; only live during the gather phase, the code_lens only in .long_codes.
%define codes1		ymm1
%define code_lens1	ymm2
%define codes2		ymm3
%define code_lens2	ymm4
%define codes3		ymm5
%define code_lens3	ymm6
%define codes4		ymm7
%define syms		ymm7

%define code_lens4	ymm8
%define dsyms		ymm8

%define ytmp		ymm9
%define codes_lookup1	ymm10
%define codes_lookup2	ymm11
%define datas		ymm12
%define ybits		ymm13
%define ybits_count	ymm14
%define yoffset_mask	ymm15

%define VECTOR_SIZE 0x20			; bytes of tokens consumed per iteration
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)	; lookahead: loop prefetches next vector
%define VECTOR_SLOP (0x20 - 8)			; output slack reserved for a full vector store

;; Stack frame layout (offsets from rsp after FUNC_SAVE).
gpr_save_mem_offset	equ 0
gpr_save_mem_size	equ 8 * 6
xmm_save_mem_offset	equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size	equ 10 * 16		; ten 16-byte slots for xmm6-xmm15 (win64)
bitbuf_mem_offset	equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size		equ 8
stack_size		equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
126
;; Save callee-saved registers used by this function.  On win64 this also
;; covers rsi/rdi and xmm6-xmm15 (callee-saved under the Microsoft x64 ABI).
;; NOTE(fix): each xmm slot must be 16 bytes wide — the previous 8-byte
;; stride made successive 16-byte MOVDQU stores overlap, corrupting the
;; saved xmm6-xmm14 on restore.  xmm_save_mem_size already reserves 10*16.
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

;; Restore everything FUNC_SAVE stored, using the same 16-byte xmm stride.
%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro
174
;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_04(token *ptr, token *end, BitBuf *bb,
;                                 uint32_t *trees)
; AVX2/BMI2 deflate token encoder: converts 4-byte icf tokens into
; Huffman-coded bits appended to the BitBuf.  Returns (in rax) the final
; token pointer — equal to token_end on full success, earlier on output
; overflow.  Register roles are defined by the %define block above.
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	FUNC_SAVE

%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Unpack the BitBuf state into registers; the BitBuf pointer itself
	;; is spilled to the stack because its register is reused as out_buf.
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish

	;; Prime the pipeline: gather lit/len and dist codes for the first
	;; vector of 8 tokens (vpgatherdd consumes its mask, so it is
	;; re-materialized with vpcmpeqq before every gather).
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	vmovdqa	yoffset_mask, [offset_mask]

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpand	codes1, codes_lookup1, [lit_icr_mask]

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpand	code_lens3, [eb_icr_mask]

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	add	ptr, VECTOR_SIZE
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxor	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd	ytmp, code_lens3, [max_write_d]
	vptest	ytmp, ytmp
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	;; Split buffer data into qwords, ytmp is 0 after last branch
	vpblendd codes3, ytmp, codes1, 0x55
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxor	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Split buffer data into dqwords, ytmp is 0 after last branch
	vpblendd codes2, ytmp, codes1, 0x33
	vpblendd code_lens2, ytmp, code_lens1, 0x33
	vpsrldq	codes1, 8
	vpsrldq	code_lens1, 8

	;; Bit align dqwords
	vpaddq	code_lens1, code_lens1, code_lens2
	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
	vpermq	ybits_count, ybits_count, 0xcf
	vpaddq	code_lens2, ybits_count
	vpsllvq	codes2, codes2, ybits_count

	;; Merge two qwords into dqwords
	vmovdqa	ytmp, [q_64]
	vpsubq	code_lens3, ytmp, code_lens2
	vpsrlvq	codes3, codes1, code_lens3
	vpslldq	codes3, codes3, 8

	vpsllvq	codes1, codes1, code_lens2

	vpxor	codes1, codes1, codes3
	vpxor	codes1, codes1, codes2

	vmovq	tmp, code_lens1 %+ x	;Number of bytes
	shr	tmp, 3

	;; Extract last bytes
	vpaddq	code_lens2, code_lens1, ybits_count
	vpsrlq	code_lens2, code_lens2, 3
	vpshufb	codes2, codes1, code_lens2
	vpand	codes2, codes2, [bytes_mask]
	vextracti128 ybits %+ x, codes2, 1

	;; Check for short codes
	vptest	code_lens2, [min_write_mask]
	jz	.short_codes
.short_codes_next:

	vpermq	codes2, codes2, 0x45
	vpor	codes1, codes1, codes2

	;; bit shift upper dqword combined bits to line up with lower dqword
	vextracti128 code_lens2 %+ x, code_lens1, 1

	; Write out lower dqword of combined bits
	vmovdqu	[out_buf], codes1
	vpaddq	code_lens1, code_lens1, code_lens2

	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
	shr	tmp2, 3
	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits

	; Write out upper dqword of combined bits
	vextracti128 [out_buf + tmp], codes1, 1
	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Fold the vector bit-buffer tail back into scalar bits/bit_count.
	vmovq	rcx, ybits_count %+ x
	vmovq	bits, ybits %+ x
	jmp	.finish

.short_codes:
	;; Merge last bytes when the second dqword contains less than a byte
	vpor	ybits %+ x, codes2 %+ x
	jmp	.short_codes_next

.long_codes:
	;; Slow path: some token's combined code exceeds the packed-write
	;; limit, so emit this vector's 8 tokens one qword pair at a time.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vpxor	ytmp, ytmp, ytmp
	vpblendd codes3, ytmp, codes1, 0x55
	vpblendd code_lens3, ytmp, code_lens1, 0x55
	vpblendd codes4, ytmp, codes2, 0x55

	vpsllvq	codes4, codes4, code_lens3
	vpxor	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3
%rep 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;; Second %rep pass processes the upper 128-bit lanes.
	vextracti128 codes3 %+ x, codes3, 1
	vextracti128 code_lens3 %+ x, code_lens3, 1
	vextracti128 codes1 %+ x, codes1, 1
	vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
	sub	end_ptr, VECTOR_SLOP

	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Scalar tail: undo the lookahead adjustments, then emit the
	;; remaining tokens one at a time.
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Write the bit-buffer state back to the BitBuf and return the
	;; final token pointer in rax.
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret
549
section .data
	align 32
;; Per-dword packed-write limits for the long-code check in .main_loop.
max_write_d:
	dd 0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
;; Mask used by vptest to detect the short-code case.
min_write_mask:
	dq 0x00, 0x00, 0xff, 0x00
;; Keeps only the low 3 bits (bit offset within a byte) of a bit count.
offset_mask:
	dq 0x0000000000000007, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000
;; Constant 64 per 128-bit lane, for computing complementary shifts.
q_64:
	dq 0x0000000000000040, 0x0000000000000000
	dq 0x0000000000000040, 0x0000000000000000
;; Broadcast copies of the token-field masks for the vector loop.
lit_mask:
	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
;; Low 24 bits of a lit/len table entry hold the code itself.
lit_icr_mask:
	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
;; Byte 2 of a dist table entry holds the extra-bit count.
eb_icr_mask:
	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
;; Keeps only the lowest byte of each 128-bit lane.
bytes_mask:
	dq 0x00000000000000ff, 0x0000000000000000
	dq 0x00000000000000ff, 0x0000000000000000