/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
29
30 .text
31
32 .global gf_5vect_dot_prod_neon
33 .type gf_5vect_dot_prod_neon, %function
34
35
36 /* arguments */
37 x_len .req x0
38 x_vec .req x1
39 x_tbl .req x2
40 x_src .req x3
41 x_dest .req x4
42
43 /* returns */
44 w_ret .req w0
45
46 /* local variables */
47 x_vec_i .req x5
48 x_ptr .req x6
49 x_pos .req x7
50 x_tmp .req x8
51 x_dest1 .req x9
52 x_dest2 .req x10
53 x_dest3 .req x11
54 x_dest4 .req x12
55 x_dest5 .req x13
56
57 /* vectors */
58 v_tmp1 .req v0
59 q_tmp1 .req q0
60 v_tmp2 .req v1
61 q_tmp2 .req q1
62
63 v_mask0f .req v_tmp1
64 q_mask0f .req q_tmp1
65 v_tmp_lo .req v_tmp1
66 v_tmp_hi .req v_tmp2
67
68 v_gft_lo .req v2
69 v_gft_hi .req v3
70 q_gft_lo .req q2
71 q_gft_hi .req q3
72
73 v_p1_0 .req v4
74 v_p2_0 .req v5
75 v_p3_0 .req v6
76 v_p4_0 .req v7
77
78 q_p1_0 .req q4
79 q_p2_0 .req q5
80 q_p3_0 .req q6
81 q_p4_0 .req q7
82
83 v_data_0 .req v8
84 v_data_1 .req v9
85 v_data_2 .req v10
86 v_data_3 .req v11
87 q_data_0 .req q8
88 q_data_1 .req q9
89 q_data_2 .req q10
90 q_data_3 .req q11
91
92 v_data_0_lo .req v12
93 v_data_1_lo .req v13
94 v_data_2_lo .req v14
95 v_data_3_lo .req v15
96 v_data_0_hi .req v_data_0
97 v_data_1_hi .req v_data_1
98 v_data_2_hi .req v_data_2
99 v_data_3_hi .req v_data_3
100
101 v_p5_0 .req v16
102 v_p1_1 .req v17
103 v_p2_1 .req v18
104 v_p3_1 .req v19
105 v_p4_1 .req v20
106 v_p5_1 .req v21
107 v_p1_2 .req v22
108 v_p2_2 .req v23
109 v_p3_2 .req v24
110 v_p4_2 .req v25
111 v_p5_2 .req v26
112 v_p1_3 .req v27
113 v_p2_3 .req v28
114 v_p3_3 .req v29
115 v_p4_3 .req v30
116 v_p5_3 .req v31
117
118 q_p5_0 .req q16
119 q_p1_1 .req q17
120 q_p2_1 .req q18
121 q_p3_1 .req q19
122 q_p4_1 .req q20
123 q_p5_1 .req q21
124 q_p1_2 .req q22
125 q_p2_2 .req q23
126 q_p3_2 .req q24
127 q_p4_2 .req q25
128 q_p5_2 .req q26
129 q_p1_3 .req q27
130 q_p2_3 .req q28
131 q_p3_3 .req q29
132 q_p4_3 .req q30
133 q_p5_3 .req q31
134
135 v_data .req v_p1_1
136 q_data .req q_p1_1
137 v_data_lo .req v_p2_1
138 v_data_hi .req v_p3_1
139
140 v_gft1_lo .req v_p4_1
141 v_gft1_hi .req v_p5_1
142 v_gft2_lo .req v_p1_2
143 v_gft2_hi .req v_p2_2
144 v_gft3_lo .req v_p3_2
145 v_gft3_hi .req v_p4_2
146 v_gft4_lo .req v_p5_2
147 v_gft4_hi .req v_p1_3
148 v_gft5_lo .req v_p2_3
149 v_gft5_hi .req v_p3_3
150 q_gft1_lo .req q_p4_1
151 q_gft1_hi .req q_p5_1
152 q_gft2_lo .req q_p1_2
153 q_gft2_hi .req q_p2_2
154 q_gft3_lo .req q_p3_2
155 q_gft3_hi .req q_p4_2
156 q_gft4_lo .req q_p5_2
157 q_gft4_hi .req q_p1_3
158 q_gft5_lo .req q_p2_3
159 q_gft5_hi .req q_p3_3
160
161
162 gf_5vect_dot_prod_neon:
163 /* less than 16 bytes, return_fail */
164 cmp x_len, #16
165 blt .return_fail
166
167 mov x_pos, #0
168 lsl x_vec, x_vec, #3
169 ldr x_dest1, [x_dest, #8*0]
170 ldr x_dest2, [x_dest, #8*1]
171 ldr x_dest3, [x_dest, #8*2]
172 ldr x_dest4, [x_dest, #8*3]
173 ldr x_dest5, [x_dest, #8*4]
174
175 .Lloop64_init:
176 /* less than 64 bytes, goto Lloop16_init */
177 cmp x_len, #64
178 blt .Lloop16_init
179
180 /* save d8 ~ d15 to stack */
181 sub sp, sp, #64
182 stp d8, d9, [sp]
183 stp d10, d11, [sp, #16]
184 stp d12, d13, [sp, #32]
185 stp d14, d15, [sp, #48]
186
187 sub x_len, x_len, #64
188
189 .Lloop64:
190 movi v_p1_0.16b, #0
191 movi v_p1_1.16b, #0
192 movi v_p1_2.16b, #0
193 movi v_p1_3.16b, #0
194 movi v_p2_0.16b, #0
195 movi v_p2_1.16b, #0
196 movi v_p2_2.16b, #0
197 movi v_p2_3.16b, #0
198 movi v_p3_0.16b, #0
199 movi v_p3_1.16b, #0
200 movi v_p3_2.16b, #0
201 movi v_p3_3.16b, #0
202 movi v_p4_0.16b, #0
203 movi v_p4_1.16b, #0
204 movi v_p4_2.16b, #0
205 movi v_p4_3.16b, #0
206 movi v_p5_0.16b, #0
207 movi v_p5_1.16b, #0
208 movi v_p5_2.16b, #0
209 movi v_p5_3.16b, #0
210 mov x_vec_i, #0
211
212 .Lloop64_vects:
213 ldr x_ptr, [x_src, x_vec_i]
214 add x_ptr, x_ptr, x_pos
215
216 ldr q_data_0, [x_ptr], #16
217 ldr q_data_1, [x_ptr], #16
218 ldr q_data_2, [x_ptr], #16
219 ldr q_data_3, [x_ptr], #16
220 prfm pldl2keep, [x_ptr]
221
222 movi v_mask0f.16b, #0x0f
223 and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
224 and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
225 and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
226 and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
227 ushr v_data_0_hi.16b, v_data_0.16b, #4
228 ushr v_data_1_hi.16b, v_data_1.16b, #4
229 ushr v_data_2_hi.16b, v_data_2.16b, #4
230 ushr v_data_3_hi.16b, v_data_3.16b, #4
231
232 /* v_p1_x */
233 add x_tmp, x_tbl, x_vec_i, lsl #2
234 add x_vec_i, x_vec_i, #8
235 ldp q_gft_lo, q_gft_hi, [x_tmp]
236 prfm pldl3keep, [x_tmp, #32]
237 add x_tmp, x_tmp, x_vec, lsl #2
238
239 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
240 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
241 eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
242 eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
243
244 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
245 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
246 eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
247 eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
248
249 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
250 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
251 eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
252 eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
253
254 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
255 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
256 eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
257 eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
258
259 /* v_p2_x */
260 ldp q_gft_lo, q_gft_hi, [x_tmp]
261 prfm pldl3keep, [x_tmp, #32]
262 add x_tmp, x_tmp, x_vec, lsl #2
263
264 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
265 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
266 eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
267 eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
268
269 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
270 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
271 eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
272 eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
273
274 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
275 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
276 eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
277 eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
278
279 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
280 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
281 eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
282 eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
283
284 /* v_p3_x */
285 ldp q_gft_lo, q_gft_hi, [x_tmp]
286 prfm pldl3keep, [x_tmp, #32]
287 add x_tmp, x_tmp, x_vec, lsl #2
288
289 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
290 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
291 eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
292 eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
293
294 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
295 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
296 eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
297 eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
298
299 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
300 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
301 eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
302 eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
303
304 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
305 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
306 eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
307 eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
308
309 /* v_p4_x */
310 ldp q_gft_lo, q_gft_hi, [x_tmp]
311 prfm pldl3keep, [x_tmp, #32]
312 add x_tmp, x_tmp, x_vec, lsl #2
313
314 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
315 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
316 eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
317 eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
318
319 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
320 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
321 eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
322 eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
323
324 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
325 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
326 eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
327 eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
328
329 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
330 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
331 eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
332 eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
333
334 /* v_p5_x */
335 ldp q_gft_lo, q_gft_hi, [x_tmp]
336 prfm pldl3keep, [x_tmp, #32]
337
338 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
339 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
340 eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
341 eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
342
343 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
344 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
345 eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
346 eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
347
348 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
349 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
350 eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
351 eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
352
353 tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
354 tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
355 eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
356 eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
357
358 cmp x_vec_i, x_vec
359 blt .Lloop64_vects
360
361 .Lloop64_vects_end:
362 add x_ptr, x_dest1, x_pos
363 stp q_p1_0, q_p1_1, [x_ptr], #32
364 stp q_p1_2, q_p1_3, [x_ptr]
365
366 add x_ptr, x_dest2, x_pos
367 stp q_p2_0, q_p2_1, [x_ptr], #32
368 stp q_p2_2, q_p2_3, [x_ptr]
369
370 add x_ptr, x_dest3, x_pos
371 stp q_p3_0, q_p3_1, [x_ptr], #32
372 stp q_p3_2, q_p3_3, [x_ptr]
373
374 add x_ptr, x_dest4, x_pos
375 stp q_p4_0, q_p4_1, [x_ptr], #32
376 stp q_p4_2, q_p4_3, [x_ptr]
377
378 add x_ptr, x_dest5, x_pos
379 stp q_p5_0, q_p5_1, [x_ptr], #32
380 stp q_p5_2, q_p5_3, [x_ptr]
381
382 add x_pos, x_pos, #64
383 cmp x_pos, x_len
384 ble .Lloop64
385
386 .Lloop64_end:
387 /* restore d8 ~ d15 */
388 ldp d8, d9, [sp]
389 ldp d10, d11, [sp, #16]
390 ldp d12, d13, [sp, #32]
391 ldp d14, d15, [sp, #48]
392 add sp, sp, #64
393
394 add x_len, x_len, #64
395 cmp x_pos, x_len
396 beq .return_pass
397
398 .Lloop16_init:
399 sub x_len, x_len, #16
400 cmp x_pos, x_len
401 bgt .lessthan16_init
402
403 .Lloop16:
404 movi v_p1_0.16b, #0
405 movi v_p2_0.16b, #0
406 movi v_p3_0.16b, #0
407 movi v_p4_0.16b, #0
408 movi v_p5_0.16b, #0
409 mov x_vec_i, #0
410
411 .Lloop16_vects:
412 ldr x_ptr, [x_src, x_vec_i]
413 ldr q_data, [x_ptr, x_pos]
414
415 movi v_mask0f.16b, #0x0f
416 and v_data_lo.16b, v_data.16b, v_mask0f.16b
417 ushr v_data_hi.16b, v_data.16b, #4
418
419 add x_tmp, x_tbl, x_vec_i, lsl #2
420 add x_vec_i, x_vec_i, #8
421 ldp q_gft1_lo, q_gft1_hi, [x_tmp]
422 add x_tmp, x_tmp, x_vec, lsl #2
423 ldp q_gft2_lo, q_gft2_hi, [x_tmp]
424 add x_tmp, x_tmp, x_vec, lsl #2
425 ldp q_gft3_lo, q_gft3_hi, [x_tmp]
426 add x_tmp, x_tmp, x_vec, lsl #2
427 ldp q_gft4_lo, q_gft4_hi, [x_tmp]
428 add x_tmp, x_tmp, x_vec, lsl #2
429 ldp q_gft5_lo, q_gft5_hi, [x_tmp]
430
431 tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
432 tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
433 tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
434 tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
435 tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
436 tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
437 tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
438 tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
439 tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
440 tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
441
442 eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
443 eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
444 eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
445 eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
446 eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
447 eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
448 eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
449 eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
450 eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
451 eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b
452
453 cmp x_vec_i, x_vec
454 bne .Lloop16_vects
455
456 .Lloop16_vects_end:
457 str q_p1_0, [x_dest1, x_pos]
458 str q_p2_0, [x_dest2, x_pos]
459 str q_p3_0, [x_dest3, x_pos]
460 str q_p4_0, [x_dest4, x_pos]
461 str q_p5_0, [x_dest5, x_pos]
462 add x_pos, x_pos, #16
463 cmp x_pos, x_len
464 ble .Lloop16
465
466 .Lloop16_end:
467 sub x_tmp, x_pos, x_len
468 cmp x_tmp, #16
469 beq .return_pass
470
471 .lessthan16_init:
472 mov x_pos, x_len
473 b .Lloop16
474
475 .return_pass:
476 mov w_ret, #0
477 ret
478
479 .return_fail:
480 mov w_ret, #1
481 ret