1 /**************************************************************
2 Copyright (c) 2019 Huawei Technologies Co., Ltd.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 * Neither the name of Huawei Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
/* Export gf_5vect_dot_prod_neon and mark the symbol as a function. */
32 .global gf_5vect_dot_prod_neon
33 .type gf_5vect_dot_prod_neon, %function
/*
 * High-nibble vectors for the 64-byte loop. Each v_data_N_hi is only another
 * name for v_data_N, so the ushr that produces the high nibbles overwrites the
 * loaded source bytes in place. Anything that still needs the raw bytes (the
 * and that extracts the low nibbles into v_data_N_lo) has to be issued before
 * that ushr.
 */
96 v_data_0_hi .req v_data_0
97 v_data_1_hi .req v_data_1
98 v_data_2_hi .req v_data_2
99 v_data_3_hi .req v_data_3
/*
 * Working registers for the 16-bytes-per-iteration code. Every name below is
 * an alias of a v_pX_1 / v_pX_2 / v_pX_3 register (or its q_ counterpart),
 * i.e. of the accumulators the 64-byte loop uses for bytes 16..63 of each
 * destination. The 16-byte code visible in this file accumulates only into
 * v_pX_0, so these registers are used there as scratch.
 * NOTE(review): the v_pX_Y / q_pX_Y definitions are outside this excerpt;
 * presumably each pair names the same physical register - confirm.
 */

/* Low and high nibbles of the 16 source bytes; used as tbl indices. */
137 v_data_lo .req v_p2_1
138 v_data_hi .req v_p3_1

/*
 * Low-nibble / high-nibble lookup tables for destinations 1..5. Each tbl in
 * the 16-byte code writes its result back over the table register it read.
 */
140 v_gft1_lo .req v_p4_1
141 v_gft1_hi .req v_p5_1
142 v_gft2_lo .req v_p1_2
143 v_gft2_hi .req v_p2_2
144 v_gft3_lo .req v_p3_2
145 v_gft3_hi .req v_p4_2
146 v_gft4_lo .req v_p5_2
147 v_gft4_hi .req v_p1_3
148 v_gft5_lo .req v_p2_3
149 v_gft5_hi .req v_p3_3

/* q-register spellings of the table names above, as used by the ldp loads. */
150 q_gft1_lo .req q_p4_1
151 q_gft1_hi .req q_p5_1
152 q_gft2_lo .req q_p1_2
153 q_gft2_hi .req q_p2_2
154 q_gft3_lo .req q_p3_2
155 q_gft3_hi .req q_p4_2
156 q_gft4_lo .req q_p5_2
157 q_gft4_hi .req q_p1_3
158 q_gft5_lo .req q_p2_3
159 q_gft5_hi .req q_p3_3
162 gf_5vect_dot_prod_neon:
163 /* less than 16 bytes, return_fail */
169 ldr x_dest1, [x_dest, #8*0]
170 ldr x_dest2, [x_dest, #8*1]
171 ldr x_dest3, [x_dest, #8*2]
172 ldr x_dest4, [x_dest, #8*3]
173 ldr x_dest5, [x_dest, #8*4]
176 /* less than 64 bytes, goto Lloop16_init */
180 /* save d8 ~ d15 to stack */
183 stp d10, d11, [sp, #16]
184 stp d12, d13, [sp, #32]
185 stp d14, d15, [sp, #48]
187 sub x_len, x_len, #64
213 ldr x_ptr, [x_src, x_vec_i]
214 add x_ptr, x_ptr, x_pos
216 ldr q_data_0, [x_ptr], #16
217 ldr q_data_1, [x_ptr], #16
218 ldr q_data_2, [x_ptr], #16
219 ldr q_data_3, [x_ptr], #16
220 prfm pldl2keep, [x_ptr]
222 movi v_mask0f.16b, #0x0f
223 and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
224 and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
225 and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
226 and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
227 ushr v_data_0_hi.16b, v_data_0.16b, #4
228 ushr v_data_1_hi.16b, v_data_1.16b, #4
229 ushr v_data_2_hi.16b, v_data_2.16b, #4
230 ushr v_data_3_hi.16b, v_data_3.16b, #4
233 add x_tmp, x_tbl, x_vec_i, lsl #2
234 add x_vec_i, x_vec_i, #8
235 ldp q_gft_lo, q_gft_hi, [x_tmp]
236 prfm pldl3keep, [x_tmp, #32]
237 add x_tmp, x_tmp, x_vec, lsl #2
239 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
240 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
241 eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
242 eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
244 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
245 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
246 eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
247 eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
249 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
250 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
251 eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
252 eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
254 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
255 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
256 eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
257 eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
260 ldp q_gft_lo, q_gft_hi, [x_tmp]
261 prfm pldl3keep, [x_tmp, #32]
262 add x_tmp, x_tmp, x_vec, lsl #2
264 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
265 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
266 eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
267 eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
269 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
270 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
271 eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
272 eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
274 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
275 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
276 eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
277 eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
279 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
280 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
281 eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
282 eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
285 ldp q_gft_lo, q_gft_hi, [x_tmp]
286 prfm pldl3keep, [x_tmp, #32]
287 add x_tmp, x_tmp, x_vec, lsl #2
289 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
290 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
291 eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
292 eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
294 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
295 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
296 eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
297 eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
299 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
300 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
301 eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
302 eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
304 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
305 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
306 eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
307 eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
310 ldp q_gft_lo, q_gft_hi, [x_tmp]
311 prfm pldl3keep, [x_tmp, #32]
312 add x_tmp, x_tmp, x_vec, lsl #2
314 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
315 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
316 eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
317 eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
319 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
320 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
321 eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
322 eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
324 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
325 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
326 eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
327 eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
329 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
330 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
331 eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
332 eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
335 ldp q_gft_lo, q_gft_hi, [x_tmp]
336 prfm pldl3keep, [x_tmp, #32]
338 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
339 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
340 eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
341 eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
343 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
344 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
345 eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
346 eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
348 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
349 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
350 eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
351 eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
353 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
354 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
355 eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
356 eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
362 add x_ptr, x_dest1, x_pos
363 stp q_p1_0, q_p1_1, [x_ptr], #32
364 stp q_p1_2, q_p1_3, [x_ptr]
366 add x_ptr, x_dest2, x_pos
367 stp q_p2_0, q_p2_1, [x_ptr], #32
368 stp q_p2_2, q_p2_3, [x_ptr]
370 add x_ptr, x_dest3, x_pos
371 stp q_p3_0, q_p3_1, [x_ptr], #32
372 stp q_p3_2, q_p3_3, [x_ptr]
374 add x_ptr, x_dest4, x_pos
375 stp q_p4_0, q_p4_1, [x_ptr], #32
376 stp q_p4_2, q_p4_3, [x_ptr]
378 add x_ptr, x_dest5, x_pos
379 stp q_p5_0, q_p5_1, [x_ptr], #32
380 stp q_p5_2, q_p5_3, [x_ptr]
382 add x_pos, x_pos, #64
387 /* restore d8 ~ d15 */
389 ldp d10, d11, [sp, #16]
390 ldp d12, d13, [sp, #32]
391 ldp d14, d15, [sp, #48]
394 add x_len, x_len, #64
399 sub x_len, x_len, #16
412 ldr x_ptr, [x_src, x_vec_i]
413 ldr q_data, [x_ptr, x_pos]
415 movi v_mask0f.16b, #0x0f
416 and v_data_lo.16b, v_data.16b, v_mask0f.16b
417 ushr v_data_hi.16b, v_data.16b, #4
419 add x_tmp, x_tbl, x_vec_i, lsl #2
420 add x_vec_i, x_vec_i, #8
421 ldp q_gft1_lo, q_gft1_hi, [x_tmp]
422 add x_tmp, x_tmp, x_vec, lsl #2
423 ldp q_gft2_lo, q_gft2_hi, [x_tmp]
424 add x_tmp, x_tmp, x_vec, lsl #2
425 ldp q_gft3_lo, q_gft3_hi, [x_tmp]
426 add x_tmp, x_tmp, x_vec, lsl #2
427 ldp q_gft4_lo, q_gft4_hi, [x_tmp]
428 add x_tmp, x_tmp, x_vec, lsl #2
429 ldp q_gft5_lo, q_gft5_hi, [x_tmp]
431 tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
432 tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
433 tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
434 tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
435 tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
436 tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
437 tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
438 tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
439 tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
440 tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
442 eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
443 eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
444 eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
445 eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
446 eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
447 eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
448 eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
449 eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
450 eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
451 eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b
457 str q_p1_0, [x_dest1, x_pos]
458 str q_p2_0, [x_dest2, x_pos]
459 str q_p3_0, [x_dest3, x_pos]
460 str q_p4_0, [x_dest4, x_pos]
461 str q_p5_0, [x_dest5, x_pos]
462 add x_pos, x_pos, #16
467 sub x_tmp, x_pos, x_len