]>
Commit | Line | Data |
---|---|---|
224ce89b WB |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | ;;; Optimized xor of N source vectors using AVX512 | |
31 | ;;; int xor_gen_avx512(int vects, int len, void **array) | |
32 | ||
33 | ;;; Generates xor parity vector from N (vects-1) sources in array of pointers | |
34 | ;;; (**array). Last pointer is the dest. | |
35 | ;;; Vectors must be aligned to 32 bytes. Length can be any value. | |
36 | ||
37 | %include "reg_sizes.asm" | |
38 | ||
39 | %ifdef HAVE_AS_KNOWS_AVX512 | |
40 | ||
41 | %ifidn __OUTPUT_FORMAT__, elf64 | |
42 | %define arg0 rdi | |
43 | %define arg1 rsi | |
44 | %define arg2 rdx | |
45 | %define arg3 rcx | |
46 | %define arg4 r8 | |
47 | %define arg5 r9 | |
48 | %define tmp r11 | |
49 | %define tmp3 arg4 | |
20effc67 | 50 | %define func(x) x: endbranch |
224ce89b WB |
51 | %define return rax |
52 | %define FUNC_SAVE | |
53 | %define FUNC_RESTORE | |
54 | ||
55 | %elifidn __OUTPUT_FORMAT__, win64 | |
56 | %define arg0 rcx | |
57 | %define arg1 rdx | |
58 | %define arg2 r8 | |
59 | %define arg3 r9 | |
60 | %define tmp r11 | |
61 | %define tmp3 r10 | |
62 | %define func(x) proc_frame x | |
63 | %define return rax | |
64 | %define stack_size 2*16 + 8 ;must be an odd multiple of 8 | |
65 | ||
66 | %macro FUNC_SAVE 0 | |
67 | alloc_stack stack_size ;Allocate the xmm save area (released by 'add rsp, stack_size' in FUNC_RESTORE) | |
68 | vmovdqu [rsp + 0*16], xmm6 ;Save slot 0: xmm6 (callee-saved in the Windows x64 ABI) | |
69 | vmovdqu [rsp + 1*16], xmm7 ;Save slot 1: xmm7 (callee-saved in the Windows x64 ABI) | |
70 | end_prolog | |
71 | %endmacro | |
72 | %macro FUNC_RESTORE 0 | |
73 | vmovdqu xmm6, [rsp + 0*16] ;Reload xmm6 from save slot 0 | |
74 | vmovdqu xmm7, [rsp + 1*16] ;Reload xmm7 from save slot 1; offset must match the store in FUNC_SAVE | |
75 | add rsp, stack_size ;Release the save area | |
76 | %endmacro | |
77 | ||
78 | %endif ;output formats | |
79 | ||
80 | ||
81 | %define vec arg0 | |
82 | %define len arg1 | |
83 | %define ptr arg3 | |
84 | %define tmp2 rax | |
85 | %define tmp2.b al | |
86 | %define pos tmp3 | |
87 | %define PS 8 | |
88 | ||
89 | %define NO_NT_LDST | |
90 | ;;; Use non-temporal load/store unless NO_NT_LDST is defined | |
91 | %ifdef NO_NT_LDST | |
92 | %define XLDR vmovdqu8 | |
93 | %define XSTR vmovdqu8 | |
94 | %else | |
95 | %define XLDR vmovntdqa | |
96 | %define XSTR vmovntdq | |
97 | %endif | |
98 | ||
99 | ||
100 | default rel | |
101 | [bits 64] | |
102 | ||
103 | section .text | |
104 | ||
105 | align 16 | |
20effc67 | 106 | mk_global xor_gen_avx512, function |
224ce89b WB |
107 | func(xor_gen_avx512) |
108 | FUNC_SAVE | |
109 | sub vec, 2 ;vec = vects-2: index of the last source pointer (dest pointer is at vec+1) | |
110 | jng return_fail ;Fail unless vects > 2, i.e. at least 2 sources | |
111 | cmp len, 0 | |
112 | je return_pass | |
113 | test len, (128-1) ;Is len a multiple of 128 bytes? | |
114 | jnz len_not_aligned | |
115 | ||
116 | len_aligned_128bytes: | |
117 | sub len, 128 | |
118 | mov pos, 0 | |
119 | ||
120 | loop128: | |
121 | mov tmp, vec ;Restart at the last source index | |
122 | mov tmp2, [arg2+vec*PS] ;tmp2 = array[vec], pointer to the last source | |
123 | sub tmp, 1 ;Step to the previous source index | |
124 | XLDR zmm0, [tmp2+pos] ;Seed parity with 128 bytes of the last source at offset pos | |
125 | XLDR zmm1, [tmp2+pos+64] ;Running xor parity is kept in zmm0 and zmm1 | |
126 | ||
127 | next_vect: | |
128 | mov ptr, [arg2+tmp*PS] | |
129 | sub tmp, 1 | |
130 | XLDR zmm4, [ptr+pos] ;Load 128 bytes of this source at offset pos | |
131 | XLDR zmm5, [ptr+pos+64] | |
132 | vpxorq zmm0, zmm0, zmm4 ;Fold this source into the xor parity | |
133 | vpxorq zmm1, zmm1, zmm5 | |
134 | jge next_vect ;Loop for each source; tests the flags of 'sub tmp, 1' above (the vector ops in between do not write flags) | |
135 | ||
136 | mov ptr, [arg2+PS+vec*PS] ;ptr = array[vec+1], the parity (dest) vector | |
137 | XSTR [ptr+pos], zmm0 ;Write 128 bytes of xor parity at offset pos | |
138 | XSTR [ptr+pos+64], zmm1 | |
139 | add pos, 128 | |
140 | cmp pos, len | |
141 | jle loop128 | |
142 | ||
143 | return_pass: | |
144 | FUNC_RESTORE | |
145 | mov return, 0 | |
146 | ret | |
147 | ||
148 | ||
149 | ;;; len is not a multiple of 8: xor the last byte of every vector, one byte per pass, shrinking len until it is | |
150 | loop_1byte: | |
151 | mov tmp, vec ;Restart at the last source index | |
152 | mov ptr, [arg2+vec*PS] ;ptr = array[vec], pointer to the last source | |
153 | mov tmp2.b, [ptr+len-1] ;Seed parity byte from the last source at offset len-1 | |
154 | sub tmp, 1 | |
155 | nextvect_1byte: | |
156 | mov ptr, [arg2+tmp*PS] | |
157 | xor tmp2.b, [ptr+len-1] | |
158 | sub tmp, 1 | |
159 | jge nextvect_1byte | |
160 | ||
161 | mov tmp, vec | |
162 | add tmp, 1 ;tmp = vec+1, index of the parity (dest) pointer | |
163 | mov ptr, [arg2+tmp*PS] | |
164 | mov [ptr+len-1], tmp2.b ;Write parity byte at offset len-1 | |
165 | sub len, 1 | |
166 | test len, (PS-1) | |
167 | jnz loop_1byte | |
168 | ||
169 | cmp len, 0 | |
170 | je return_pass | |
171 | test len, (128-1) ;If not 0 and a multiple of 128 bytes | |
172 | jz len_aligned_128bytes ; then do aligned case. len = y * 128 | |
173 | ||
174 | ;; else len is a multiple of 8 but not of 128, so fall through to the 8-byte path | |
175 | ||
176 | ||
177 | ;; len is not a multiple of 128 bytes | |
178 | len_not_aligned: | |
179 | test len, (PS-1) | |
180 | jne loop_1byte | |
181 | mov tmp3, len | |
182 | and tmp3, (128-1) ;tmp3 = len mod 128, the tail handled 8 bytes at a time | |
183 | ||
184 | ;; Walk backwards from offset len, 8 bytes at a time, for (tmp3) bytes | |
185 | loop8_bytes: | |
186 | mov tmp, vec ;Restart at the last source index | |
187 | mov ptr, [arg2+vec*PS] ;ptr = array[vec], pointer to the last source | |
188 | mov tmp2, [ptr+len-PS] ;Seed parity with the 8 bytes ending at offset len | |
189 | sub tmp, 1 | |
190 | nextvect_8bytes: | |
191 | mov ptr, [arg2+tmp*PS] ;Get pointer to next source vector | |
192 | xor tmp2, [ptr+len-PS] | |
193 | sub tmp, 1 | |
194 | jge nextvect_8bytes ;Loop for each source | |
195 | ||
196 | mov tmp, vec | |
197 | add tmp, 1 ;tmp = vec+1, index of the parity (dest) pointer | |
198 | mov ptr, [arg2+tmp*PS] | |
199 | mov [ptr+len-PS], tmp2 ;Write 8 bytes of parity | |
200 | sub len, PS | |
201 | sub tmp3, PS | |
202 | jg loop8_bytes | |
203 | ||
204 | cmp len, 128 ;len is now a multiple of 128 bytes | |
205 | jge len_aligned_128bytes ;At least one full 128-byte block remains: use the zmm loop | |
206 | ||
207 | cmp len, 0 | |
208 | je return_pass | |
209 | ||
210 | return_fail: | |
211 | FUNC_RESTORE | |
212 | mov return, 1 | |
213 | ret | |
214 | ||
215 | endproc_frame | |
216 | ||
217 | %endif ; ifdef HAVE_AS_KNOWS_AVX512 |