[ceph.git] / ceph / src / isa-l / raid / xor_check_sse.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized xor check of N vectors using SSE
;;; int xor_check_sse(int vects, int len, void **array)

;;; Checks that the N (vects) vectors in the array of pointers (**array)
;;; xor to zero at every byte position.  Returns 0 on pass, 1 on fail.
;;; Vectors must be aligned to 16 bytes.  Length can be any value.

%include "reg_sizes.asm"

;;; Per-output-format register assignment.  Every branch provides the same
;;; vocabulary (arg0-arg3, tmp/tmp2/tmp2.b/tmp3, return, PS, func,
;;; FUNC_SAVE, FUNC_RESTORE) so the routine body can be written once.

%ifidn __OUTPUT_FORMAT__, elf64
 ;; 64-bit ELF: arguments arrive in registers; nothing to save or restore
 %define PS      8
 %define arg0    rdi
 %define arg1    rsi
 %define arg2    rdx
 %define arg3    rcx
 %define arg4    r8
 %define arg5    r9
 %define return  rax
 %define tmp     r11
 %define tmp2    rax
 %define tmp2.b  al
 %define tmp3    arg4
 %define func(x) x: endbranch
 ;; Prologue and epilogue expand to nothing for this format
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
 ;; 64-bit Windows: arguments arrive in rcx, rdx, r8, r9
 %define PS      8
 %define arg0    rcx
 %define arg1    rdx
 %define arg2    r8
 %define arg3    r9
 %define return  rax
 %define tmp     r11
 %define tmp2    rax
 %define tmp2.b  al
 %define tmp3    r10
 %define func(x) proc_frame x
 %define stack_size  2*16 + 8 	; must be an odd multiple of 8

 ;; Prologue: reserve stack_size bytes and spill xmm6/xmm7 into them
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm6, 0*16
	end_prolog
 %endmacro

 ;; Epilogue: reload xmm6/xmm7, then release the stack area
 %macro FUNC_RESTORE 0
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm6, [rsp + 0*16]
	add	rsp, stack_size
 %endmacro


%elifidn __OUTPUT_FORMAT__, elf32
 ;; 32-bit ELF: arguments live on the stack and are addressed through ebp
 %define PS      4
 %define arg(x)  [ebp+8+PS*x]
 %define arg0    arg(0)
 %define arg1    ecx
 %define arg2    edi	; must save/restore
 %define arg3    esi
 %define return  eax
 %define tmp     ebx
 %define tmp2    eax
 %define tmp2.b  al
 %define tmp3    edx
 %define func(x) x: endbranch

 ;; Prologue: build the ebp frame, save esi/edi/ebx, then load the
 ;; second and third stack arguments into their registers
 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	push	ebx
	mov	arg2, arg(2)
	mov	arg1, arg(1)
 %endmacro

 ;; Epilogue: pop the saved registers in reverse order and drop the frame
 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	mov	esp, ebp	;if has frame pointer
	pop	ebp
 %endmacro

%endif	; output formats


;;; Symbolic names used by the routine body.  pos is an alias of tmp3, so
;;; the two names refer to the same register.
%define vec arg0
%define	len arg1
%define ptr arg3
%define pos tmp3

;;; Pointer size 8 means a 64-bit target
%ifidn PS,8
 default rel
 [bits 64]
%endif

;;; Vector load/store flavour: non-temporal unless NO_NT_LDST is defined
%ifndef NO_NT_LDST
 %define XLDR movntdqa
 %define XSTR movntdq
%else
 %define XLDR movdqa
 %define XSTR movdqa
%endif

section .text

;;; ---------------------------------------------------------------------
;;; int xor_check_sse(int vects, int len, void **array)
;;;
;;; XORs together the bytes at the same position of every buffer in
;;; array[0 .. vects-1] and checks that the result is zero for each
;;; position in [0, len).
;;;
;;; In:    vec (arg0) = vects, number of buffers; fewer than 2 fails
;;;        len (arg1) = bytes per buffer; 0 passes immediately
;;;        arg2       = array of buffer pointers
;;; Out:   return = 0  every position XORs to zero
;;;        return = 1  a non-zero XOR was found, or vects < 2
;;;
;;; Registers: tmp  = buffer index, counts down from vects-1 and goes
;;;                   negative when all buffers have been visited
;;;            tmp2 = pointer of the first buffer loaded (128-byte loop)
;;;                   or the XOR accumulator (byte / word loops)
;;;            ptr  = pointer of the buffer currently being XORed in
;;;            pos  = byte offset in the 128-byte loop; same register as
;;;                   tmp3, which counts tail bytes in loopN_bytes
;;;            xmm0-xmm7 = 128 bytes of running XOR
;;;
;;; Strategy: the tail (len mod 128) is checked first, walking backwards
;;; from the end of the buffers -- byte by byte while len is not a
;;; multiple of PS, then PS bytes at a time.  What remains is a multiple
;;; of 128 bytes and is checked front to back with SSE.
;;;
;;; Requirements: the 128-byte loop uses aligned SSE memory operands, so
;;; buffers must be 16-byte aligned.  ptest (and movntdqa, unless
;;; NO_NT_LDST is defined) are SSE4.1 instructions.
;;;
;;; NOTE(review): only len == 0 is special-cased; a negative len is not
;;; rejected.  On 64-bit targets vects and len are used as full-width
;;; registers -- presumably callers pass them extended to 64 bits; confirm.
;;; ---------------------------------------------------------------------

align 16
mk_global  xor_check_sse, function
func(xor_check_sse)
	FUNC_SAVE
%ifidn PS,8				;64-bit code
	sub	vec, 1			; Keep as offset to last source
%else					;32-bit code
	mov	tmp, arg(0)		; Update vec length arg to last source
	sub	tmp, 1
	mov	arg(0), tmp
%endif

	;; Flags are still those of the "sub ..., 1" above (mov leaves them alone)
	jng	return_fail		;Must have at least 2 sources
	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;Check alignment of length
	jnz	len_not_aligned


	;; len is a non-zero multiple of 128 here
len_aligned_128bytes:
	sub	len, 128		;len = offset of the last 128-byte block
	mov	pos, 0
	mov	tmp, vec		;Preset to last vector

loop128:
	mov	tmp2, [arg2+tmp*PS]	;Fetch last pointer in array
	sub	tmp, 1			;Next vect
	XLDR	xmm0, [tmp2+pos]	;Start with end of array in last vector
	XLDR	xmm1, [tmp2+pos+16]	;Keep xor parity in xmm0-7
	XLDR	xmm2, [tmp2+pos+(2*16)]
	XLDR	xmm3, [tmp2+pos+(3*16)]
	XLDR	xmm4, [tmp2+pos+(4*16)]
	XLDR	xmm5, [tmp2+pos+(5*16)]
	XLDR	xmm6, [tmp2+pos+(6*16)]
	XLDR	xmm7, [tmp2+pos+(7*16)]

next_vect:
	mov 	ptr, [arg2+tmp*PS]
	sub	tmp, 1			;Sets the flags consumed by jge below
	xorpd	xmm0, [ptr+pos]		;Get next vector (source)
	xorpd	xmm1, [ptr+pos+16]
	xorpd	xmm2, [ptr+pos+(2*16)]
	xorpd	xmm3, [ptr+pos+(3*16)]
	xorpd	xmm4, [ptr+pos+(4*16)]
	xorpd	xmm5, [ptr+pos+(5*16)]
	xorpd	xmm6, [ptr+pos+(6*16)]
	xorpd	xmm7, [ptr+pos+(7*16)]
;;;  	prefetch [ptr+pos+(8*16)]
	jge	next_vect		;Loop for each vect (xorpd does not touch flags)

	;; End of vects, check that all parity regs = 0
	mov	tmp, vec		;Back to last vector
	por	xmm0, xmm1		;Fold the eight accumulators into xmm0
	por	xmm0, xmm2
	por	xmm0, xmm3
	por	xmm0, xmm4
	por	xmm0, xmm5
	por	xmm0, xmm6
	por	xmm0, xmm7
	ptest	xmm0, xmm0		;ZF = 1 only if all 128 bits are zero
	jnz	return_fail

	add	pos, 128
	cmp	pos, len
	jle	loop128			;Continue through the last block offset

return_pass:
	;; Set the result before the epilogue, exactly as return_fail does.
	;; On win64 FUNC_RESTORE ends in "add rsp, ..."; keeping only ret
	;; after it preserves the epilogue form the unwinder expects.
	mov	return, 0
	FUNC_RESTORE
	ret


;;; Do one byte at a time for no alignment case

xor_gen_byte:
	mov	tmp, vec		;Preset to last vector

loop_1byte:
	mov 	ptr, [arg2+tmp*PS] 	;Fetch last pointer in array
	mov	tmp2.b, [ptr+len-1]	;Get array n
	sub	tmp, 1
nextvect_1byte:
	mov 	ptr, [arg2+tmp*PS]
	xor	tmp2.b, [ptr+len-1]
	sub	tmp, 1
	jge	nextvect_1byte

	mov	tmp, vec		;Back to last vector
	cmp	tmp2.b, 0
	jne	return_fail
	sub	len, 1			;Last byte verified, shrink len
	test	len, (8-1)
	jnz	loop_1byte		;Repeat until len is a multiple of 8

	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;If not 0 and 128-byte aligned
	jz	len_aligned_128bytes	; then do aligned case. len = y * 128

	;; else we are 8-byte aligned so fall through to recheck


	;; Unaligned length cases
len_not_aligned:
	test	len, (PS-1)
	jne	xor_gen_byte		;Not a multiple of PS: trim bytes first
	mov	tmp3, len
	and	tmp3, (128-1)		;Do the unaligned bytes 4-8 at a time
	mov	tmp, vec		;Preset to last vector

	;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
loopN_bytes:
	mov 	ptr, [arg2+tmp*PS] 	;Fetch last pointer in array
	mov	tmp2, [ptr+len-PS]	;Get array n
	sub	tmp, 1
nextvect_Nbytes:
	mov 	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
	xor	tmp2, [ptr+len-PS]
	sub	tmp, 1
	jge	nextvect_Nbytes		;Loop for each source

	mov	tmp, vec		;Back to last vector
	cmp	tmp2, 0
	jne	return_fail
	sub	len, PS
	sub	tmp3, PS
	jg	loopN_bytes

	cmp	len, 128		;Now len is aligned to 128B
	jge	len_aligned_128bytes	;We can do the rest aligned

	cmp	len, 0
	je	return_pass
	;; Any other value of len falls through and reports failure

return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;; Version tag for xor_check_sse.  slversion is a macro defined outside
;;; this file.  NOTE(review): presumably it emits a version record into the
;;; current (.data) section -- confirm against its definition.
;;;       func           core, ver, snum
slversion xor_check_sse, 00,   03,  0031
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	;;; Optimized xor of N source vectors using SSE
	31	;;; int xor_gen_sse(int vects, int len, void **array)
	32
	33	;;; Generates xor parity vector from N (vects-1) sources in array of pointers
	34	;;; (**array). Last pointer is the dest.
	35	;;; Vectors must be aligned to 16 bytes. Length can be any value.
	36
	37	%include "reg_sizes.asm"
	38
	39	%ifidn __OUTPUT_FORMAT__, elf64
	40	%define arg0 rdi
	41	%define arg1 rsi
	42	%define arg2 rdx
	43	%define arg3 rcx
	44	%define arg4 r8
	45	%define arg5 r9
	46	%define tmp r11
	47	%define tmp2 rax
	48	%define tmp2.b al
	49	%define tmp3 arg4
	50	%define return rax
	51	%define PS 8
20effc67	52	%define func(x) x: endbranch
7c673cae FG	53	%define FUNC_SAVE
	54	%define FUNC_RESTORE
	55
	56	%elifidn __OUTPUT_FORMAT__, win64
	57	%define arg0 rcx
	58	%define arg1 rdx
	59	%define arg2 r8
	60	%define arg3 r9
	61	%define return rax
	62	%define tmp2 rax
	63	%define tmp2.b al
	64	%define PS 8
	65	%define tmp r11
	66	%define tmp3 r10
	67	%define stack_size 2*16 + 8 ; must be an odd multiple of 8
	68	%define func(x) proc_frame x
	69
	70	%macro FUNC_SAVE 0
	71	alloc_stack stack_size
	72	save_xmm128 xmm6, 0*16
	73	save_xmm128 xmm7, 1*16
	74	end_prolog
	75	%endmacro
	76	%macro FUNC_RESTORE 0
	77	movdqa xmm6, [rsp + 0*16]
	78	movdqa xmm7, [rsp + 1*16]
	79	add rsp, stack_size
	80	%endmacro
	81
	82
	83	%elifidn __OUTPUT_FORMAT__, elf32
	84	%define arg0 arg(0)
	85	%define arg1 ecx
	86	%define tmp2 eax
	87	%define tmp2.b al
	88	%define tmp3 edx
	89	%define return eax
	90	%define PS 4
20effc67	91	%define func(x) x: endbranch
7c673cae FG	92	%define arg(x) [ebp+8+PS*x]
	93	%define arg2 edi ; must sav/restore
	94	%define arg3 esi
	95	%define tmp ebx
	96
	97	%macro FUNC_SAVE 0
	98	push ebp
	99	mov ebp, esp
	100	push esi
	101	push edi
	102	push ebx
	103	mov arg1, arg(1)
	104	mov arg2, arg(2)
	105	%endmacro
	106
	107	%macro FUNC_RESTORE 0
	108	pop ebx
	109	pop edi
	110	pop esi
	111	mov esp, ebp ;if has frame pointer
	112	pop ebp
	113	%endmacro
	114
	115	%endif ; output formats
	116
	117
	118	%define vec arg0
	119	%define len arg1
	120	%define ptr arg3
	121	%define pos tmp3
	122
	123	%ifidn PS,8 ; 64-bit code
	124	default rel
	125	[bits 64]
	126	%endif
	127
	128	;;; Use Non-temporal load/stor
	129	%ifdef NO_NT_LDST
	130	%define XLDR movdqa
	131	%define XSTR movdqa
	132	%else
	133	%define XLDR movntdqa
	134	%define XSTR movntdq
	135	%endif
	136
	137	section .text
	138
	139	align 16
20effc67	140	mk_global xor_check_sse, function
7c673cae FG	141	func(xor_check_sse)
	142	FUNC_SAVE
	143	%ifidn PS,8 ;64-bit code
	144	sub vec, 1 ; Keep as offset to last source
	145	%else ;32-bit code
	146	mov tmp, arg(0) ; Update vec length arg to last source
	147	sub tmp, 1
	148	mov arg(0), tmp
	149	%endif
	150
	151	jng return_fail ;Must have at least 2 sources
	152	cmp len, 0
	153	je return_pass
	154	test len, (128-1) ;Check alignment of length
	155	jnz len_not_aligned
	156
	157
	158	len_aligned_128bytes:
	159	sub len, 128
	160	mov pos, 0
	161	mov tmp, vec ;Preset to last vector
	162
	163	loop128:
	164	mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
	165	sub tmp, 1 ;Next vect
	166	XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
	167	XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
	168	XLDR xmm2, [tmp2+pos+(2*16)]
	169	XLDR xmm3, [tmp2+pos+(3*16)]
	170	XLDR xmm4, [tmp2+pos+(4*16)]
	171	XLDR xmm5, [tmp2+pos+(5*16)]
	172	XLDR xmm6, [tmp2+pos+(6*16)]
	173	XLDR xmm7, [tmp2+pos+(7*16)]
	174
	175	next_vect:
	176	mov ptr, [arg2+tmp*PS]
	177	sub tmp, 1
	178	xorpd xmm0, [ptr+pos] ;Get next vector (source)
	179	xorpd xmm1, [ptr+pos+16]
	180	xorpd xmm2, [ptr+pos+(2*16)]
	181	xorpd xmm3, [ptr+pos+(3*16)]
	182	xorpd xmm4, [ptr+pos+(4*16)]
	183	xorpd xmm5, [ptr+pos+(5*16)]
	184	xorpd xmm6, [ptr+pos+(6*16)]
	185	xorpd xmm7, [ptr+pos+(7*16)]
	186	;;; prefetch [ptr+pos+(8*16)]
	187	jge next_vect ;Loop for each vect
	188
	189	;; End of vects, chech that all parity regs = 0
	190	mov tmp, vec ;Back to last vector
	191	por xmm0, xmm1
	192	por xmm0, xmm2
	193	por xmm0, xmm3
	194	por xmm0, xmm4
	195	por xmm0, xmm5
	196	por xmm0, xmm6
	197	por xmm0, xmm7
	198	ptest xmm0, xmm0
	199	jnz return_fail
	200
	201	add pos, 128
	202	cmp pos, len
	203	jle loop128
	204
205	return_pass:
206	FUNC_RESTORE
207	mov return, 0
208	ret
209
210
211
212	;;; Do one byte at a time for no alignment case
213
214	xor_gen_byte:
215	mov tmp, vec ;Preset to last vector
216
217	loop_1byte:
218	mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
219	mov tmp2.b, [ptr+len-1] ;Get array n
220	sub tmp, 1
221	nextvect_1byte:
222	mov ptr, [arg2+tmp*PS]
223	xor tmp2.b, [ptr+len-1]
224	sub tmp, 1
225	jge nextvect_1byte
226
227	mov tmp, vec ;Back to last vector
228	cmp tmp2.b, 0
229	jne return_fail
230	sub len, 1
231	test len, (8-1)
232	jnz loop_1byte
233
234	cmp len, 0
235	je return_pass
236	test len, (128-1) ;If not 0 and 128bit aligned
237	jz len_aligned_128bytes ; then do aligned case. len = y * 128
238
239	;; else we are 8-byte aligned so fall through to recheck
240
241
242	;; Unaligned length cases
243	len_not_aligned:
244	test len, (PS-1)
245	jne xor_gen_byte
246	mov tmp3, len
247	and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
248	mov tmp, vec ;Preset to last vector
249
250	;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
251	loopN_bytes:
252	mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
253	mov tmp2, [ptr+len-PS] ;Get array n
254	sub tmp, 1
255	nextvect_Nbytes:
256	mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
257	xor tmp2, [ptr+len-PS]
258	sub tmp, 1
259	jge nextvect_Nbytes ;Loop for each source
260
261	mov tmp, vec ;Back to last vector
262	cmp tmp2, 0
263	jne return_fail
264	sub len, PS
265	sub tmp3, PS
266	jg loopN_bytes
267
268	cmp len, 128 ;Now len is aligned to 128B
269	jge len_aligned_128bytes ;We can do the rest aligned
270
271	cmp len, 0
272	je return_pass
273
274	return_fail:
275	mov return, 1
276	FUNC_RESTORE
277	ret
278
279	endproc_frame
280
281	section .data
282
283	;;; func core, ver, snum
284	slversion xor_check_sse, 00, 03, 0031
285