[ceph.git] / ceph / src / erasure-code / isa / isa-l / erasure_code / gf_4vect_dot_prod_sse.asm.s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"

;;; elf64 build: bind the generic argument/scratch names used by the routine
;;; to 64-bit registers and define its prologue/epilogue macros.
%ifidn __OUTPUT_FORMAT__, elf64
 ;; arg0..arg4 carry the five parameters of the routine; arg5 is only
 ;; used by the routine as a scratch pointer (it is aliased to "ptr").
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 ;; Scratch registers. tmp3..tmp6 live in r12-r15, which FUNC_SAVE pushes
 ;; and FUNC_RESTORE pops.
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 ;; SLDR/SSTR expand to nothing in this build: every value keeps its own
 ;; register, so no stack load/store is emitted (and flags are untouched).
 %macro  SLDR   2
 %endmacro
 %define SSTR   SLDR
 ;; PS is the byte stride used to step through pointer arrays;
 ;; LOG_PS is log2(PS), used as a shift count.
 %define PS     8
 %define LOG_PS 3

 ;; func(x) simply emits the label x:
 %define func(x) x:
 ;; Prologue: preserve r12-r15 (used as tmp3..tmp6).
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 ;; Epilogue: pop in the reverse order of FUNC_SAVE.
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

;;; win64 build: bind the generic argument/scratch names to 64-bit registers
;;; and define a prologue/epilogue that preserves xmm6-xmm14 and six
;;; general-purpose registers in a local stack frame.
%ifidn __OUTPUT_FORMAT__, win64
 ;; First four parameters arrive in registers.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 ;; The fifth parameter is read from the stack into r12 at the end of
 ;; FUNC_SAVE; arg5 is only used by the routine as a scratch pointer.
 %define arg4   r12 		; must be saved, loaded and restored
 %define arg5   r15 		; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 ;; SLDR/SSTR expand to nothing in this build: no stack load/store is emitted.
 %macro  SLDR   2
 %endmacro
 %define SSTR   SLDR
 ;; PS is the byte stride used to step through pointer arrays; LOG_PS is log2(PS).
 %define PS     8
 %define LOG_PS 3
 ;; Frame layout: nine 16-byte slots (xmm6-xmm14) followed by seven 8-byte
 ;; slots, of which FUNC_SAVE writes the first six (r12-r15, rdi, rsi).
 %define stack_size  9*16 + 7*8		; must be an odd multiple of 8
 ;; arg(x) addresses the x-th PS-sized slot above the local frame, after
 ;; skipping one extra PS-sized slot (presumably the return address --
 ;; confirm against the Win64 calling convention).
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 ;; NOTE(review): proc_frame, alloc_stack, save_xmm128, save_reg and
 ;; end_prolog are not defined in this file; presumably they come from
 ;; reg_sizes.asm or the assembler's Win64 unwind support -- confirm.
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_reg	r12,  9*16 + 0*8
	save_reg	r13,  9*16 + 1*8
	save_reg	r14,  9*16 + 2*8
	save_reg	r15,  9*16 + 3*8
	save_reg	rdi,  9*16 + 4*8
	save_reg	rsi,  9*16 + 5*8
	end_prolog
	;; Fetch the fifth parameter from the caller's stack area.
	mov	arg4, arg(4)
 %endmacro

 ;; Epilogue: reload every register from the same offsets FUNC_SAVE used,
 ;; then release the frame.
 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm8, [rsp + 2*16]
	movdqa	xmm9, [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	mov	r12,  [rsp + 9*16 + 0*8]
	mov	r13,  [rsp + 9*16 + 1*8]
	mov	r14,  [rsp + 9*16 + 2*8]
	mov	r15,  [rsp + 9*16 + 3*8]
	mov	rdi,  [rsp + 9*16 + 4*8]
	mov	rsi,  [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

;;; elf32 build: there are too few registers to give every name its own,
;;; so several names share the two "transfer" registers (trans, trans2) and
;;; keep their real value in a stack slot (the matching *_m name). SLDR/SSTR
;;; move a value between its stack slot and the shared register.
%ifidn __OUTPUT_FORMAT__, elf32

;;; Stack layout after FUNC_SAVE:
;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 ;; PS is the byte stride used to step through pointer arrays; LOG_PS is log2(PS).
 %define PS     4
 %define LOG_PS 2
 %define func(x) x:
 ;; arg(x): x-th stack argument, above the saved ebp and the return address.
 %define arg(x) [ebp + PS*2 + PS*x]
 ;; var(x): x-th local slot, directly below the saved ebp.
 %define var(x) [ebp - PS - PS*x]

 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m	 arg(0)
 ;; arg1 gets a dedicated register (ebx), loaded once in FUNC_SAVE.
 %define arg1	 ebx
 ;; arg2 is never held in a register: the name expands to its stack slot.
 %define arg2	 arg2_m
 %define arg2_m	 arg(2)
 %define arg3	 trans
 %define arg3_m	 arg(3)
 %define arg4	 trans
 %define arg4_m	 arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp2	 edi
 ;; tmp3..tmp6 all share trans2; their values live in var(0)..var(3).
 %define tmp3	 trans2
 %define tmp3_m	 var(0)
 %define tmp4	 trans2
 %define tmp4_m	 var(1)
 %define tmp5	 trans2
 %define tmp5_m	 var(2)
 %define tmp6	 trans2
 %define tmp6_m	 var(3)
 %define return	 eax
 ;; SLDR dst, src / SSTR dst, src both expand to a plain mov, which does
 ;; not modify the flags.
 %macro SLDR 2				;stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 ;; Prologue: set up the ebp frame, reserve the four local slots, save
 ;; esi/edi/ebx and load the second argument into its register.
 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4		;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 ;; Epilogue: undo FUNC_SAVE in reverse order.
 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4		;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

;;; Symbolic names used by the routine, mapped onto the per-format
;;; argument and scratch names.
%define len    arg0
%define vec    arg1
%define mul_array arg2
%define	src    arg3
%define dest1  arg4
;; ptr reuses the arg5 name as a scratch pointer to the current source buffer.
%define ptr    arg5
%define vec_i  tmp2
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6
;; pos shares the return register; the routine overwrites it with the
;; status code just before returning.
%define pos    return

;; Stack homes of the values above; only needed (and only defined) when
;; SLDR/SSTR really move data, i.e. in the 32-bit build.
 %ifidn PS,4				;32-bit code
	%define  len_m 	arg0_m
	%define  src_m 	arg3_m
	%define  dest1_m arg4_m
	%define  dest2_m tmp3_m
	%define  dest3_m tmp4_m
	%define  dest4_m tmp5_m
	%define  vskip3_m tmp6_m
 %endif

;;; XLDR/XSTR are the instructions used to load source data and store results.
%ifndef EC_ALIGNED_ADDR
;;; Default: use unaligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; EC_ALIGNED_ADDR is defined: use aligned load/store (when NO_NT_LDST is
;;; defined) or non-temporal load/store (otherwise)
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

;;; 64-bit builds: RIP-relative addressing by default, 64-bit mode.
%ifidn PS,8				; 64-bit code
 default rel
  [bits 64]
%endif


section .text

;;; xmm register assignment.
;;;   xmask0f        - mask selecting the low nibble of every byte
;;;   xgftN_lo/_hi   - low-nibble / high-nibble lookup tables for output N
;;;   x0, xtmpa      - high-nibble / low-nibble indices of the source data
;;;   xp1..xp4       - accumulators for the four outputs
%ifidn PS,8				;64-bit code
 ;; Every table and accumulator has its own register.
 %define xmask0f   xmm14
 %define xgft1_lo  xmm2
 %define xgft1_hi  xmm3
 %define xgft2_lo  xmm11
 %define xgft2_hi  xmm4
 %define xgft3_lo  xmm9
 %define xgft3_hi  xmm5
 %define xgft4_lo  xmm7
 %define xgft4_hi  xmm6

 %define x0     xmm0
 %define xtmpa  xmm1
 %define xp1    xmm8
 %define xp2    xmm10
 %define xp3    xmm12
 %define xp4    xmm13
%else
 ;; 32-bit code only uses xmm0-xmm7: all four table pairs share
 ;; xmm7/xmm6, and the mask shares xmm7 with the low table, so the routine
 ;; has to reload the mask and each table pair right before using them.
 %define xmm_trans xmm7			;reuse xmask0f and xgft1_lo
 %define xmask0f   xmm_trans
 %define xgft1_lo  xmm_trans
 %define xgft1_hi  xmm6
 %define xgft2_lo  xgft1_lo
 %define xgft2_hi  xgft1_hi
 %define xgft3_lo  xgft1_lo
 %define xgft3_hi  xgft1_hi
 %define xgft4_lo  xgft1_lo
 %define xgft4_hi  xgft1_hi

 %define x0     xmm0
 %define xtmpa  xmm1
 %define xp1    xmm2
 %define xp2    xmm3
 %define xp3    xmm4
 %define xp4    xmm5
%endif
;;;
;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;;
;;; For every 16-byte position, looks up each source byte's low and high
;;; nibble in per-source 16-entry tables (pshufb), XORs the two partial
;;; results together and XOR-accumulates them over all sources, producing
;;; four output streams at once.
;;;
;;;   len    - number of bytes to process; must be at least 16
;;;   vec    - number of source buffers
;;;   g_tbls - lookup tables, 32 bytes per source (low-nibble table then
;;;            high-nibble table); the tables for outputs 2, 3 and 4 start
;;;            vec*32, vec*64 and vec*96 bytes after those for output 1
;;;   buffs  - array of pointers to the source buffers
;;;   dests  - array of four pointers to the output buffers
;;;
;;; Returns 0 on success, or 1 (without writing any output) when len < 16.
;;; When len is not a multiple of 16, the last 16 bytes are handled by one
;;; extra pass at offset len-16 that overlaps the previous block.
;;;
align 16
global gf_4vect_dot_prod_sse:function
func(gf_4vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16
	SSTR	len_m, len
	;; Flags still come from the sub above: SSTR expands to nothing or to
	;; a mov, neither of which changes them.
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	;; vskip3 = vec*96: byte offset from output 1's tables to output 4's
	mov	vskip3,  vec
	imul	vskip3,  96
	SSTR	vskip3_m, vskip3
	sal	vec, 	 LOG_PS		;vec *= PS. Make vec_i count by PS
	;; Fetch the four destination pointers from the dests array. dest1 is
	;; replaced last because it is also the base of that array.
	SLDR	dest1, 	 dest1_m
	mov	dest2, 	 [dest1+PS]
	SSTR	dest2_m, dest2
	mov	dest3, 	 [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, 	 [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, 	 [dest1]
	SSTR	dest1_m, dest1

	;; Outer loop: one 16-byte block of output per iteration.
.loop16:
	;; Clear the four accumulators and restart at the first source/table.
	pxor	xp1, xp1
	pxor	xp2, xp2
	pxor	xp3, xp3
	pxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

	;; Inner loop: fold one source buffer into all four accumulators.
.next_vect:
	SLDR 	src, src_m
	mov	ptr, [src+vec_i]

 %ifidn PS,8				;64-bit code
	;; vec already holds count*PS, so vec*(32/PS) is count*32 bytes and
	;; vec*(64/PS) is count*64 bytes.
	movdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]		;     "     Ax{00}, Ax{10}, ..., Ax{f0}
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Dx{00}, Dx{10}, ..., Dx{f0}

	XLDR	x0, 	[ptr+pos]	;Get next source vector
	add	tmp, 	32
	add	vec_i, 	PS

	;; psraw shifts whole words, so bits from the neighbouring byte and the
	;; sign land in the upper nibble; the pand with xmask0f removes them.
	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 3-0
	pand	x0, xmask0f		;Mask high src nibble in bits 3-0
	pand	xtmpa, 	xmask0f		;Mask low src nibble in bits 3-0
 %else					;32-bit code
	;; xmask0f shares its register with the low table, so the mask is
	;; reloaded for every source and the tables are loaded afterwards.
	XLDR	x0, 	 [ptr+pos]	;Get next source vector
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	movdqa	xtmpa, 	x0		;Keep unshifted copy of src
	psraw	x0, 	4		;Shift to put high nibble into bits 3-0
	pand	x0, 	xmask0f		;Mask high src nibble in bits 3-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 3-0

	movdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]		;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 %endif

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

 %ifidn PS,4				;32-bit code
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 %endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

 %ifidn PS,4				;32-bit code
	;; Temporarily double vec so that vec*(32/PS) reaches the third table
	;; set (count*64 bytes), then restore it.
	sal	vec, 1
	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
	sar 	vec, 1
 %endif
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial

 %ifidn PS,4				;32-bit code
	SLDR	vskip3, vskip3_m
	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Dx{00}, Dx{10}, ..., Dx{f0}
	;; Advance to the next source's tables and pointer slot.
	add	tmp, 32
	add	vec_i, PS
 %endif
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pxor	xp4, xgft4_hi		;xp4 += partial

	;; vec holds count*PS, matching the PS-sized steps of vec_i.
	cmp	vec_i, vec
	jl	.next_vect

	;; All sources folded in: write the four 16-byte results.
	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	;; len holds (original length - 16), i.e. the last valid block offset.
	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	;; pos == original length means everything has been processed.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len	;Overlapped offset length-16
	jmp	.loop16		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
;;; 16-byte constant with 0x0f in every byte, used to isolate the low nibble
;;; of each source byte. Emitted as two 64-bit words rather than a single
;;; 128-bit integer literal: the bytes in memory are identical, but this form
;;; does not depend on assembler support for 128-bit integer constants.
mask0f:	dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f

;;;       func                  core, ver, snum
slversion gf_4vect_dot_prod_sse, 00,  06,  0064
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	;;;
	31	;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
	32	;;;
	33
	34	%include "reg_sizes.asm"
	35
	36	%ifidn __OUTPUT_FORMAT__, elf64
	37	%define arg0 rdi
	38	%define arg1 rsi
	39	%define arg2 rdx
	40	%define arg3 rcx
	41	%define arg4 r8
	42	%define arg5 r9
	43
	44	%define tmp r11
	45	%define tmp2 r10
	46	%define tmp3 r13 ; must be saved and restored
	47	%define tmp4 r12 ; must be saved and restored
	48	%define tmp5 r14 ; must be saved and restored
	49	%define tmp6 r15 ; must be saved and restored
	50	%define return rax
	51	%macro SLDR 2
	52	%endmacro
	53	%define SSTR SLDR
	54	%define PS 8
	55	%define LOG_PS 3
	56
	57	%define func(x) x:
	58	%macro FUNC_SAVE 0
	59	push r12
	60	push r13
	61	push r14
	62	push r15
	63	%endmacro
	64	%macro FUNC_RESTORE 0
65	pop r15
66	pop r14
67	pop r13
68	pop r12
69	%endmacro
70	%endif
71
72	%ifidn __OUTPUT_FORMAT__, win64
73	%define arg0 rcx
74	%define arg1 rdx
75	%define arg2 r8
76	%define arg3 r9
77
78	%define arg4 r12 ; must be saved, loaded and restored
79	%define arg5 r15 ; must be saved and restored
80	%define tmp r11
81	%define tmp2 r10
82	%define tmp3 r13 ; must be saved and restored
83	%define tmp4 r14 ; must be saved and restored
84	%define tmp5 rdi ; must be saved and restored
85	%define tmp6 rsi ; must be saved and restored
86	%define return rax
87	%macro SLDR 2
88	%endmacro
89	%define SSTR SLDR
90	%define PS 8
91	%define LOG_PS 3
	92	%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
93	%define arg(x) [rsp + stack_size + PS + PS*x]
94
95	%define func(x) proc_frame x
96	%macro FUNC_SAVE 0
97	alloc_stack stack_size
98	save_xmm128 xmm6, 0*16
99	save_xmm128 xmm7, 1*16
100	save_xmm128 xmm8, 2*16
101	save_xmm128 xmm9, 3*16
102	save_xmm128 xmm10, 4*16
103	save_xmm128 xmm11, 5*16
104	save_xmm128 xmm12, 6*16
105	save_xmm128 xmm13, 7*16
106	save_xmm128 xmm14, 8*16
	107	save_reg r12, 9*16 + 0*8
	108	save_reg r13, 9*16 + 1*8
	109	save_reg r14, 9*16 + 2*8
	110	save_reg r15, 9*16 + 3*8
	111	save_reg rdi, 9*16 + 4*8
	112	save_reg rsi, 9*16 + 5*8
113	end_prolog
114	mov arg4, arg(4)
115	%endmacro
116
117	%macro FUNC_RESTORE 0
118	movdqa xmm6, [rsp + 0*16]
119	movdqa xmm7, [rsp + 1*16]
120	movdqa xmm8, [rsp + 2*16]
121	movdqa xmm9, [rsp + 3*16]
122	movdqa xmm10, [rsp + 4*16]
123	movdqa xmm11, [rsp + 5*16]
124	movdqa xmm12, [rsp + 6*16]
125	movdqa xmm13, [rsp + 7*16]
126	movdqa xmm14, [rsp + 8*16]
	127	mov r12, [rsp + 9*16 + 0*8]
	128	mov r13, [rsp + 9*16 + 1*8]
	129	mov r14, [rsp + 9*16 + 2*8]
	130	mov r15, [rsp + 9*16 + 3*8]
	131	mov rdi, [rsp + 9*16 + 4*8]
	132	mov rsi, [rsp + 9*16 + 5*8]
133	add rsp, stack_size
134	%endmacro
135	%endif
136
137	%ifidn __OUTPUT_FORMAT__, elf32
138
139	;;;================== High Address;
140	;;; arg4
141	;;; arg3
142	;;; arg2
143	;;; arg1
144	;;; arg0
145	;;; return
146	;;;<================= esp of caller
147	;;; ebp
148	;;;<================= ebp = esp
149	;;; var0
150	;;; var1
151	;;; var2
152	;;; var3
153	;;; esi
154	;;; edi
155	;;; ebx
156	;;;<================= esp of callee
157	;;;
158	;;;================== Low Address;
159
160	%define PS 4
161	%define LOG_PS 2
162	%define func(x) x:
	163	%define arg(x) [ebp + PS*2 + PS*x]
164	%define var(x) [ebp - PS - PS*x]
165
166	%define trans ecx
167	%define trans2 esi
168	%define arg0 trans ;trans and trans2 are for the variables in stack
169	%define arg0_m arg(0)
170	%define arg1 ebx
171	%define arg2 arg2_m
172	%define arg2_m arg(2)
173	%define arg3 trans
174	%define arg3_m arg(3)
175	%define arg4 trans
176	%define arg4_m arg(4)
177	%define arg5 trans2
178	%define tmp edx
179	%define tmp2 edi
180	%define tmp3 trans2
181	%define tmp3_m var(0)
182	%define tmp4 trans2
183	%define tmp4_m var(1)
184	%define tmp5 trans2
185	%define tmp5_m var(2)
186	%define tmp6 trans2
187	%define tmp6_m var(3)
188	%define return eax
189	%macro SLDR 2 ;stack load/restore
190	mov %1, %2
191	%endmacro
192	%define SSTR SLDR
193
194	%macro FUNC_SAVE 0
195	push ebp
196	mov ebp, esp
197	sub esp, PS*4 ;4 local variables
198	push esi
199	push edi
200	push ebx
201	mov arg1, arg(1)
202	%endmacro
203
204	%macro FUNC_RESTORE 0
205	pop ebx
206	pop edi
207	pop esi
208	add esp, PS*4 ;4 local variables
209	pop ebp
210	%endmacro
211
212	%endif ; output formats
213
214	%define len arg0
215	%define vec arg1
216	%define mul_array arg2
217	%define src arg3
218	%define dest1 arg4
219	%define ptr arg5
220	%define vec_i tmp2
221	%define dest2 tmp3
222	%define dest3 tmp4
223	%define dest4 tmp5
224	%define vskip3 tmp6
225	%define pos return
226
227	%ifidn PS,4 ;32-bit code
228	%define len_m arg0_m
229	%define src_m arg3_m
230	%define dest1_m arg4_m
231	%define dest2_m tmp3_m
232	%define dest3_m tmp4_m
233	%define dest4_m tmp5_m
234	%define vskip3_m tmp6_m
235	%endif
236
237	%ifndef EC_ALIGNED_ADDR
238	;;; Use Un-aligned load/store
239	%define XLDR movdqu
240	%define XSTR movdqu
241	%else
242	;;; Use Non-temporal load/stor
243	%ifdef NO_NT_LDST
244	%define XLDR movdqa
245	%define XSTR movdqa
246	%else
247	%define XLDR movntdqa
248	%define XSTR movntdq
249	%endif
250	%endif
251
252	%ifidn PS,8 ; 64-bit code
253	default rel
254	[bits 64]
255	%endif
256
257
258	section .text
259
260	%ifidn PS,8 ;64-bit code
261	%define xmask0f xmm14
262	%define xgft1_lo xmm2
263	%define xgft1_hi xmm3
264	%define xgft2_lo xmm11
265	%define xgft2_hi xmm4
266	%define xgft3_lo xmm9
267	%define xgft3_hi xmm5
268	%define xgft4_lo xmm7
269	%define xgft4_hi xmm6
270
271	%define x0 xmm0
272	%define xtmpa xmm1
273	%define xp1 xmm8
274	%define xp2 xmm10
275	%define xp3 xmm12
276	%define xp4 xmm13
277	%else
278	%define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo
279	%define xmask0f xmm_trans
280	%define xgft1_lo xmm_trans
281	%define xgft1_hi xmm6
282	%define xgft2_lo xgft1_lo
283	%define xgft2_hi xgft1_hi
284	%define xgft3_lo xgft1_lo
285	%define xgft3_hi xgft1_hi
286	%define xgft4_lo xgft1_lo
287	%define xgft4_hi xgft1_hi
288
289	%define x0 xmm0
290	%define xtmpa xmm1
291	%define xp1 xmm2
292	%define xp2 xmm3
293	%define xp3 xmm4
294	%define xp4 xmm5
295	%endif
296	align 16
297	global gf_4vect_dot_prod_sse:function
298	func(gf_4vect_dot_prod_sse)
299	FUNC_SAVE
300	SLDR len, len_m
301	sub len, 16
302	SSTR len_m, len
303	jl .return_fail
304	xor pos, pos
305	movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
306	mov vskip3, vec
307	imul vskip3, 96
308	SSTR vskip3_m, vskip3
309	sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
310	SLDR dest1, dest1_m
311	mov dest2, [dest1+PS]
312	SSTR dest2_m, dest2
313	mov dest3, [dest1+2*PS]
314	SSTR dest3_m, dest3
315	mov dest4, [dest1+3*PS]
316	SSTR dest4_m, dest4
317	mov dest1, [dest1]
318	SSTR dest1_m, dest1
319
320	.loop16:
321	pxor xp1, xp1
322	pxor xp2, xp2
323	pxor xp3, xp3
324	pxor xp4, xp4
325	mov tmp, mul_array
326	xor vec_i, vec_i
327
328	.next_vect:
329	SLDR src, src_m
330	mov ptr, [src+vec_i]
331
332	%ifidn PS,8 ;64-bit code
333	movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
334	movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
335	movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
336	movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
337	movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
338	movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
339	movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
340	movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
341
342	XLDR x0, [ptr+pos] ;Get next source vector
343	add tmp, 32
344	add vec_i, PS
345
346	movdqa xtmpa, x0 ;Keep unshifted copy of src
347	psraw x0, 4 ;Shift to put high nibble into bits 4-0
348	pand x0, xmask0f ;Mask high src nibble in bits 4-0
349	pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
350	%else ;32-bit code
351	XLDR x0, [ptr+pos] ;Get next source vector
352	movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
353
354	movdqa xtmpa, x0 ;Keep unshifted copy of src
355	psraw x0, 4 ;Shift to put high nibble into bits 4-0
356	pand x0, xmask0f ;Mask high src nibble in bits 4-0
357	pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
358
359	movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
360	movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
361	%endif
362
363	pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
364	pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
365	pxor xgft1_hi, xgft1_lo ;GF add high and low partials
366	pxor xp1, xgft1_hi ;xp1 += partial
367
368	%ifidn PS,4 ;32-bit code
369	movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
370	movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
371	%endif
372	pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
373	pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
374	pxor xgft2_hi, xgft2_lo ;GF add high and low partials
375	pxor xp2, xgft2_hi ;xp2 += partial
376
377	%ifidn PS,4 ;32-bit code
378	sal vec, 1
379	movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
380	movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
381	sar vec, 1
382	%endif
383	pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
384	pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
385	pxor xgft3_hi, xgft3_lo ;GF add high and low partials
386	pxor xp3, xgft3_hi ;xp3 += partial
387
388	%ifidn PS,4 ;32-bit code
389	SLDR vskip3, vskip3_m
390	movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
391	movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
392	add tmp, 32
393	add vec_i, PS
394	%endif
395	pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
396	pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
397	pxor xgft4_hi, xgft4_lo ;GF add high and low partials
398	pxor xp4, xgft4_hi ;xp4 += partial
399
400	cmp vec_i, vec
401	jl .next_vect
402
403	SLDR dest1, dest1_m
404	SLDR dest2, dest2_m
405	XSTR [dest1+pos], xp1
406	XSTR [dest2+pos], xp2
407	SLDR dest3, dest3_m
408	XSTR [dest3+pos], xp3
409	SLDR dest4, dest4_m
410	XSTR [dest4+pos], xp4
411
412	SLDR len, len_m
413	add pos, 16 ;Loop on 16 bytes at a time
414	cmp pos, len
415	jle .loop16
416
417	lea tmp, [len + 16]
418	cmp pos, tmp
419	je .return_pass
420
421	;; Tail len
422	mov pos, len ;Overlapped offset length-16
423	jmp .loop16 ;Do one more overlap pass
424
425	.return_pass:
426	mov return, 0
427	FUNC_RESTORE
428	ret
429
430	.return_fail:
431	mov return, 1
432	FUNC_RESTORE
433	ret
434
435	endproc_frame
436
437	section .data
438
439	align 16
440	mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
441
442	;;; func core, ver, snum
443	slversion gf_4vect_dot_prod_sse, 00, 06, 0064