]>
Commit | Line | Data |
---|---|---|
f91f0fd5 TL |
1 | /************************************************************** |
2 | Copyright (c) 2019 Huawei Technologies Co., Ltd. | |
3 | ||
4 | Redistribution and use in source and binary forms, with or without | |
5 | modification, are permitted provided that the following conditions | |
6 | are met: | |
7 | * Redistributions of source code must retain the above copyright | |
8 | notice, this list of conditions and the following disclaimer. | |
9 | * Redistributions in binary form must reproduce the above copyright | |
10 | notice, this list of conditions and the following disclaimer in | |
11 | the documentation and/or other materials provided with the | |
12 | distribution. | |
13 | * Neither the name of Huawei Corporation nor the names of its | |
14 | contributors may be used to endorse or promote products derived | |
15 | from this software without specific prior written permission. | |
16 | ||
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | **********************************************************************/ | |
29 | .text /* AArch64 NEON routine: symbol export plus the register aliases used by gf_4vect_dot_prod_neon */ |
30 | ||
31 | .global gf_4vect_dot_prod_neon |
32 | .type gf_4vect_dot_prod_neon, %function |
33 | ||
34 | ||
35 | /* arguments (incoming registers x0-x4) */ |
36 | x_len .req x0 /* number of bytes to process per buffer; compared against 16 and 64 with signed branches */ |
37 | x_vec .req x1 /* number of source buffers; multiplied by 8 on entry and used as a byte count afterwards */ |
38 | x_tbl .req x2 /* base of the lookup tables: 32 bytes per (source, destination) pair */ |
39 | x_src .req x3 /* array of 8-byte source buffer pointers */ |
40 | x_dest .req x4 /* array of four 8-byte destination buffer pointers */ |
41 | ||
42 | /* returns (shares x0 with x_len) */ |
43 | w_ret .req w0 /* 0 on the pass path, 1 on the fail path */ |
44 | ||
45 | /* local variables (x5-x15 only; x4 is reused for x_dest4) */ |
46 | x_vec_i .req x5 /* byte offset into the x_src pointer array */ |
47 | x_ptr .req x6 /* working pointer for the current source or destination */ |
48 | x_pos .req x7 /* byte offset of the chunk currently being produced */ |
49 | x_tmp .req x8 /* scratch for the tail-length check */ |
50 | x_dest1 .req x9 |
51 | x_tbl1 .req x10 /* x_tblN: table cursor for destination N, advanced 32 bytes per source */ |
52 | x_dest2 .req x11 |
53 | x_tbl2 .req x12 |
54 | x_dest3 .req x13 |
55 | x_tbl3 .req x14 |
56 | x_dest4 .req x_dest /* same register as x_dest: valid only after the pointer array has been read */ |
57 | x_tbl4 .req x15 |
58 | ||
59 | /* vectors (v_ names take an arrangement suffix, q_ names are the same registers for 128-bit load/store) */ |
60 | v_mask0f .req v0 /* 0x0f in every byte, selects the low nibble */ |
61 | q_mask0f .req q0 |
62 | v_tmp1_lo .req v1 /* v1-v3: temporaries for nibble indices and lookup results */ |
63 | v_tmp1_hi .req v2 |
64 | v_tmp1 .req v3 |
65 | q_tmp1 .req q3 |
66 | ||
67 | v_p1_0 .req v4 /* v_pN_M: accumulator for destination N, 16-byte lane M of a 64-byte block */ |
68 | v_p2_0 .req v5 |
69 | v_p3_0 .req v6 |
70 | v_p4_0 .req v7 |
71 | ||
72 | q_p1_0 .req q4 |
73 | q_p2_0 .req q5 |
74 | q_p3_0 .req q6 |
75 | q_p4_0 .req q7 |
76 | ||
77 | v_data_0 .req v8 /* v8-v11: the four 16-byte source lanes of a 64-byte block; v8-v15 are callee-saved (low 64 bits) */ |
78 | v_data_1 .req v9 |
79 | v_data_2 .req v10 |
80 | v_data_3 .req v11 |
81 | q_data_0 .req q8 |
82 | q_data_1 .req q9 |
83 | q_data_2 .req q10 |
84 | q_data_3 .req q11 |
85 | ||
86 | v_p1_3 .req v12 /* lane-3 accumulators also live in the callee-saved range v12-v15 */ |
87 | v_p2_3 .req v13 |
88 | v_p3_3 .req v14 |
89 | v_p4_3 .req v15 |
90 | q_p1_3 .req q12 |
91 | q_p2_3 .req q13 |
92 | q_p3_3 .req q14 |
93 | q_p4_3 .req q15 |
94 | ||
95 | v_gft1_lo .req v16 /* v_gftN_lo / v_gftN_hi: 16-byte tables for destination N, indexed by low / high nibble */ |
96 | v_gft1_hi .req v17 |
97 | v_gft2_lo .req v18 |
98 | v_gft2_hi .req v19 |
99 | v_gft3_lo .req v20 |
100 | v_gft3_hi .req v21 |
101 | v_gft4_lo .req v22 |
102 | v_gft4_hi .req v23 |
103 | q_gft1_lo .req q16 |
104 | q_gft1_hi .req q17 |
105 | q_gft2_lo .req q18 |
106 | q_gft2_hi .req q19 |
107 | q_gft3_lo .req q20 |
108 | q_gft3_hi .req q21 |
109 | q_gft4_lo .req q22 |
110 | q_gft4_hi .req q23 |
111 | ||
112 | v_p1_1 .req v24 /* lane-1 and lane-2 accumulators for the four destinations */ |
113 | v_p1_2 .req v25 |
114 | v_p2_1 .req v26 |
115 | v_p2_2 .req v27 |
116 | v_p3_1 .req v28 |
117 | v_p3_2 .req v29 |
118 | v_p4_1 .req v30 |
119 | v_p4_2 .req v31 |
120 | ||
121 | q_p1_1 .req q24 |
122 | q_p1_2 .req q25 |
123 | q_p2_1 .req q26 |
124 | q_p2_2 .req q27 |
125 | q_p3_1 .req q28 |
126 | q_p3_2 .req q29 |
127 | q_p4_1 .req q30 |
128 | q_p4_2 .req q31 |
129 | ||
130 | v_data .req v_tmp1 /* 16-byte loop names: the source chunk and its nibble indices reuse v1-v3 */ |
131 | q_data .req q_tmp1 |
132 | v_data_lo .req v_tmp1_lo |
133 | v_data_hi .req v_tmp1_hi |
134 | ||
135 | gf_4vect_dot_prod_neon: /* For every byte position, XORs together per-source nibble-table lookups and writes the result to four destination buffers. In: x0 = byte length, x1 = source count, x2 = tables, x3 = source pointer array, x4 = destination pointer array. Out: w0 = 0 when done, 1 if the length is below 16. NOTE(review): name suggests a GF(2^8) dot product - confirm against the table generator. */ |
136 | /* fewer than 16 bytes: nothing is read or written, return 1. NOTE(review): x_len and x_vec are used as full 64-bit values with signed compares - confirm the caller passes them extended to 64 bits. */ |
137 | cmp x_len, #16 |
138 | blt .return_fail |
139 | ||
140 | movi v_mask0f.16b, #0x0f /* low-nibble mask in every byte */ |
141 | mov x_pos, #0 /* start at byte offset 0 */ |
142 | lsl x_vec, x_vec, #3 /* x_vec = count * 8 = size in bytes of the source pointer array */ |
143 | ldr x_dest1, [x_dest, #8*0] |
144 | ldr x_dest2, [x_dest, #8*1] |
145 | ldr x_dest3, [x_dest, #8*2] |
146 | ldr x_dest4, [x_dest, #8*3] /* x_dest4 is x_dest itself, so this overwriting load has to come last */ |
147 | ||
148 | .Lloop64_init: |
149 | /* fewer than 64 bytes: skip the 64-byte loop and its register save, go to .Lloop16_init */ |
150 | cmp x_len, #64 |
151 | blt .Lloop16_init |
152 | ||
153 | /* save d8 ~ d15 to stack: the 64-byte loop uses v8-v15, whose low halves are callee-saved */ |
154 | sub sp, sp, #64 |
155 | stp d8, d9, [sp] |
156 | stp d10, d11, [sp, #16] |
157 | stp d12, d13, [sp, #32] |
158 | stp d14, d15, [sp, #48] |
159 | ||
160 | sub x_len, x_len, #64 /* bias the length so that x_pos <= x_len means a full 64-byte block remains */ |
161 | ||
162 | .Lloop64: |
163 | movi v_p1_0.16b, #0 /* clear all 16 accumulators: 4 destinations x 4 lanes of 16 bytes */ |
164 | movi v_p1_1.16b, #0 |
165 | movi v_p1_2.16b, #0 |
166 | movi v_p1_3.16b, #0 |
167 | movi v_p2_0.16b, #0 |
168 | movi v_p2_1.16b, #0 |
169 | movi v_p2_2.16b, #0 |
170 | movi v_p2_3.16b, #0 |
171 | movi v_p3_0.16b, #0 |
172 | movi v_p3_1.16b, #0 |
173 | movi v_p3_2.16b, #0 |
174 | movi v_p3_3.16b, #0 |
175 | movi v_p4_0.16b, #0 |
176 | movi v_p4_1.16b, #0 |
177 | movi v_p4_2.16b, #0 |
178 | movi v_p4_3.16b, #0 |
179 | ||
180 | mov x_tbl1, x_tbl /* rewind the four table cursors for this block */ |
181 | add x_tbl2, x_tbl1, x_vec, lsl #2 /* x_vec is already count*8, so lsl #2 gives a stride of count*32 bytes per destination */ |
182 | add x_tbl3, x_tbl2, x_vec, lsl #2 |
183 | add x_tbl4, x_tbl3, x_vec, lsl #2 |
184 | mov x_vec_i, #0 /* byte offset of the first source pointer */ |
185 | prfm pldl1keep, [x_tbl1] |
186 | prfm pldl1keep, [x_tbl2] |
187 | prfm pldl1keep, [x_tbl3] |
188 | prfm pldl1keep, [x_tbl4] |
189 | ||
190 | .Lloop64_vects: |
191 | ldr x_ptr, [x_src, x_vec_i] /* pointer to the current source buffer */ |
192 | add x_vec_i, x_vec_i, #8 /* step to the next pointer slot */ |
193 | add x_ptr, x_ptr, x_pos /* address of this block inside the source */ |
194 | ||
195 | ldr q_data_0, [x_ptr], #16 /* 64 source bytes are loaded as four 16-byte lanes, post-incrementing x_ptr */ |
196 | ldr q_data_1, [x_ptr], #16 |
197 | ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* per destination: 16-byte low-nibble table then 16-byte high-nibble table */ |
198 | ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 |
199 | ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 |
200 | ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 |
201 | ldr q_data_2, [x_ptr], #16 |
202 | ldr q_data_3, [x_ptr], #16 |
203 | ||
204 | prfm pldl1strm, [x_ptr] /* prefetch the source bytes that follow this block */ |
205 | prfm pldl1keep, [x_tbl1] /* prefetch the tables for the next source */ |
206 | prfm pldl1keep, [x_tbl2] |
207 | prfm pldl1keep, [x_tbl3] |
208 | prfm pldl1keep, [x_tbl4] |
209 | ||
210 | /* data_0: bytes 0-15 of the block, accumulated into lane 0 of each destination */ |
211 | and v_tmp1.16b, v_data_0.16b, v_mask0f.16b /* low nibble of each byte becomes a table index */ |
212 | ushr v_data_0.16b, v_data_0.16b, #4 /* high nibble of each byte becomes a table index (in place) */ |
213 | ||
214 | tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b /* destination 1: look up the low nibbles */ |
215 | tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b /* destination 1: look up the high nibbles */ |
216 | eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b /* XOR both lookups into the accumulator */ |
217 | eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b |
218 | ||
219 | tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b /* same pattern for destinations 2, 3 and 4 */ |
220 | tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b |
221 | eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b |
222 | eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b |
223 | ||
224 | tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b |
225 | tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b |
226 | eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b |
227 | eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b |
228 | ||
229 | tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b |
230 | tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b |
231 | eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b |
232 | eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b |
233 | ||
234 | /* data_1: bytes 16-31 of the block, accumulated into lane 1 */ |
235 | and v_tmp1.16b, v_data_1.16b, v_mask0f.16b |
236 | ushr v_data_1.16b, v_data_1.16b, #4 |
237 | ||
238 | tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b |
239 | tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b |
240 | eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b |
241 | eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b |
242 | ||
243 | tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b |
244 | tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b |
245 | eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b |
246 | eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b |
247 | ||
248 | tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b |
249 | tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b |
250 | eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b |
251 | eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b |
252 | ||
253 | tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b |
254 | tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b |
255 | eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b |
256 | eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b |
257 | ||
258 | /* data_2: bytes 32-47 of the block, accumulated into lane 2 */ |
259 | and v_tmp1.16b, v_data_2.16b, v_mask0f.16b |
260 | ushr v_data_2.16b, v_data_2.16b, #4 |
261 | ||
262 | tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b |
263 | tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b |
264 | eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b |
265 | eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b |
266 | ||
267 | tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b |
268 | tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b |
269 | eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b |
270 | eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b |
271 | ||
272 | tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b |
273 | tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b |
274 | eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b |
275 | eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b |
276 | ||
277 | tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b |
278 | tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b |
279 | eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b |
280 | eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b |
281 | ||
282 | /* data_3: bytes 48-63 of the block, accumulated into lane 3 */ |
283 | and v_tmp1.16b, v_data_3.16b, v_mask0f.16b |
284 | ushr v_data_3.16b, v_data_3.16b, #4 |
285 | ||
286 | tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b |
287 | tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b |
288 | eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b |
289 | eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b |
290 | ||
291 | tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b |
292 | tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b |
293 | eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b |
294 | eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b |
295 | ||
296 | tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b |
297 | tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b |
298 | eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b |
299 | eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b |
300 | ||
301 | tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b |
302 | tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b |
303 | eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b |
304 | eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b |
305 | ||
306 | cmp x_vec_i, x_vec /* both are byte offsets into the source pointer array */ |
307 | blt .Lloop64_vects /* loop while sources remain */ |
308 | ||
309 | .Lloop64_vects_end: |
310 | add x_ptr, x_dest1, x_pos /* write the 64 accumulated bytes of each destination at offset x_pos */ |
311 | stp q_p1_0, q_p1_1, [x_ptr], #32 |
312 | stp q_p1_2, q_p1_3, [x_ptr] |
313 | ||
314 | add x_ptr, x_dest2, x_pos |
315 | stp q_p2_0, q_p2_1, [x_ptr], #32 |
316 | stp q_p2_2, q_p2_3, [x_ptr] |
317 | ||
318 | add x_ptr, x_dest3, x_pos |
319 | stp q_p3_0, q_p3_1, [x_ptr], #32 |
320 | stp q_p3_2, q_p3_3, [x_ptr] |
321 | ||
322 | add x_ptr, x_dest4, x_pos |
323 | stp q_p4_0, q_p4_1, [x_ptr], #32 |
324 | stp q_p4_2, q_p4_3, [x_ptr] |
325 | ||
326 | add x_pos, x_pos, #64 |
327 | cmp x_pos, x_len /* x_len is still biased by -64 here */ |
328 | ble .Lloop64 /* another full 64-byte block fits */ |
329 | ||
330 | .Lloop64_end: |
331 | /* restore d8 ~ d15 and release the 64-byte save area */ |
332 | ldp d8, d9, [sp] |
333 | ldp d10, d11, [sp, #16] |
334 | ldp d12, d13, [sp, #32] |
335 | ldp d14, d15, [sp, #48] |
336 | add sp, sp, #64 |
337 | ||
338 | add x_len, x_len, #64 /* undo the -64 bias */ |
339 | cmp x_pos, x_len |
340 | beq .return_pass /* the length was a multiple of 64: finished */ |
341 | ||
342 | .Lloop16_init: |
343 | sub x_len, x_len, #16 /* bias the length so that x_pos <= x_len means a full 16-byte chunk remains */ |
344 | cmp x_pos, x_len |
345 | bgt .lessthan16_init /* fewer than 16 bytes left: handle them with one overlapping chunk */ |
346 | ||
347 | .Lloop16: /* 16-byte loop: touches only v0-v7 and v16-v23, so it needs no d8-d15 save */ |
348 | movi v_p1_0.16b, #0 /* one accumulator per destination */ |
349 | movi v_p2_0.16b, #0 |
350 | movi v_p3_0.16b, #0 |
351 | movi v_p4_0.16b, #0 |
352 | mov x_tbl1, x_tbl /* rewind the four table cursors (stride count*32 bytes) */ |
353 | add x_tbl2, x_tbl1, x_vec, lsl #2 |
354 | add x_tbl3, x_tbl2, x_vec, lsl #2 |
355 | add x_tbl4, x_tbl3, x_vec, lsl #2 |
356 | mov x_vec_i, #0 |
357 | ||
358 | .Lloop16_vects: |
359 | ldr x_ptr, [x_src, x_vec_i] /* pointer to the current source buffer */ |
360 | add x_vec_i, x_vec_i, #8 |
361 | ldr q_data, [x_ptr, x_pos] /* 16 source bytes at offset x_pos */ |
362 | ||
363 | ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 |
364 | ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 |
365 | ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 |
366 | ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 |
367 | ||
368 | prfm pldl1keep, [x_tbl1] |
369 | prfm pldl1keep, [x_tbl2] |
370 | prfm pldl1keep, [x_tbl3] |
371 | prfm pldl1keep, [x_tbl4] |
372 | ||
373 | and v_data_lo.16b, v_data.16b, v_mask0f.16b /* low-nibble indices */ |
374 | ushr v_data_hi.16b, v_data.16b, #4 /* high-nibble indices */ |
375 | ||
376 | tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b /* lookups overwrite the table registers with their results */ |
377 | tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b |
378 | tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b |
379 | tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b |
380 | tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b |
381 | tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b |
382 | tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b |
383 | tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b |
384 | ||
385 | eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b /* XOR both lookups into each destination accumulator */ |
386 | eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b |
387 | eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b |
388 | eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b |
389 | eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b |
390 | eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b |
391 | eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b |
392 | eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b |
393 | ||
394 | cmp x_vec_i, x_vec |
395 | bne .Lloop16_vects /* NOTE(review): exits only on equality, unlike the blt in .Lloop64_vects; a source count of 0 would not end this loop normally - confirm callers guarantee at least one source */ |
396 | ||
397 | .Lloop16_vects_end: |
398 | str q_p1_0, [x_dest1, x_pos] /* write 16 result bytes to each destination */ |
399 | str q_p2_0, [x_dest2, x_pos] |
400 | str q_p3_0, [x_dest3, x_pos] |
401 | str q_p4_0, [x_dest4, x_pos] |
402 | add x_pos, x_pos, #16 |
403 | cmp x_pos, x_len /* x_len is biased by -16 here */ |
404 | ble .Lloop16 /* another full 16-byte chunk fits */ |
405 | ||
406 | .Lloop16_end: |
407 | sub x_tmp, x_pos, x_len /* with the -16 bias, a difference of exactly 16 means x_pos reached the true length */ |
408 | cmp x_tmp, #16 |
409 | beq .return_pass |
410 | ||
411 | .lessthan16_init: /* NOTE(review): this label and the two return labels lack the .L prefix used by the loop labels */ |
412 | mov x_pos, x_len /* back up to (true length - 16): the last chunk overlaps bytes already written */ |
413 | b .Lloop16 /* one more pass; afterwards x_pos - x_len is 16 and .Lloop16_end returns */ |
414 | ||
415 | .return_pass: |
416 | mov w_ret, #0 /* all bytes processed */ |
417 | ret |
418 | ||
419 | .return_fail: |
420 | mov w_ret, #1 /* length below 16: no output written */ |
421 | ret |