/**********************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_4vect_dot_prod_neon
.type gf_4vect_dot_prod_neon, %function

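/*
 * gf_4vect_dot_prod_neon(): compute GF(2^8) dot products of "vec"
 * source buffers against four sets of field coefficients, writing one
 * output buffer per set.
 *
 * Per the register aliases below: x0 = length in bytes, x1 = number of
 * source vectors, x2 = coefficient table array, x3 = array of source
 * pointers, x4 = array of four destination pointers. w0 returns 0 on
 * success and 1 on failure (length below 16 bytes); the exact C
 * prototype is not stated in this file.
 */
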
/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_dest1 .req x9
x_tbl1 .req x10
x_dest2 .req x11
x_tbl2 .req x12
x_dest3 .req x13
x_tbl3 .req x14
x_dest4 .req x_dest
x_tbl4 .req x15

/* vectors */
v_mask0f .req v0
q_mask0f .req q0
v_tmp1_lo .req v1
v_tmp1_hi .req v2
v_tmp1 .req v3
q_tmp1 .req q3

v_p1_0 .req v4
v_p2_0 .req v5
v_p3_0 .req v6
v_p4_0 .req v7

q_p1_0 .req q4
q_p2_0 .req q5
q_p3_0 .req q6
q_p4_0 .req q7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_p1_3 .req v12
v_p2_3 .req v13
v_p3_3 .req v14
v_p4_3 .req v15
q_p1_3 .req q12
q_p2_3 .req q13
q_p3_3 .req q14
q_p4_3 .req q15

v_gft1_lo .req v16
v_gft1_hi .req v17
v_gft2_lo .req v18
v_gft2_hi .req v19
v_gft3_lo .req v20
v_gft3_hi .req v21
v_gft4_lo .req v22
v_gft4_hi .req v23
q_gft1_lo .req q16
q_gft1_hi .req q17
q_gft2_lo .req q18
q_gft2_hi .req q19
q_gft3_lo .req q20
q_gft3_hi .req q21
q_gft4_lo .req q22
q_gft4_hi .req q23

v_p1_1 .req v24
v_p1_2 .req v25
v_p2_1 .req v26
v_p2_2 .req v27
v_p3_1 .req v28
v_p3_2 .req v29
v_p4_1 .req v30
v_p4_2 .req v31

q_p1_1 .req q24
q_p1_2 .req q25
q_p2_1 .req q26
q_p2_2 .req q27
q_p3_1 .req q28
q_p3_2 .req q29
q_p4_1 .req q30
q_p4_2 .req q31

/* the 16-byte loop reuses the tmp registers as data registers */
v_data .req v_tmp1
q_data .req q_tmp1
v_data_lo .req v_tmp1_lo
v_data_hi .req v_tmp1_hi

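/*
 * A GF(2^8) multiply is performed as two 16-entry table lookups: the
 * low nibble of each data byte indexes a "lo" table and the high
 * nibble a "hi" table (tbl), and the two partial products are combined
 * with eor. The main loop handles 64 bytes per source vector per
 * iteration using all 32 NEON registers; a 16-byte loop covers the
 * remainder, and any final partial block is handled by re-running the
 * last full 16-byte block at offset len - 16.
 */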
gf_4vect_dot_prod_neon:
        /* less than 16 bytes, return_fail */
        cmp x_len, #16
        blt .return_fail

        movi v_mask0f.16b, #0x0f
        mov x_pos, #0
        /* x_vec := vec * 8, the byte size of the src pointer array */
        lsl x_vec, x_vec, #3
        ldr x_dest1, [x_dest, #8*0]
        ldr x_dest2, [x_dest, #8*1]
        ldr x_dest3, [x_dest, #8*2]
        ldr x_dest4, [x_dest, #8*3]

.Lloop64_init:
        /* less than 64 bytes, goto Lloop16_init */
        cmp x_len, #64
        blt .Lloop16_init

        /* save d8 ~ d15 to stack (v8-v15 are callee-saved under AAPCS64) */
        sub sp, sp, #64
        stp d8, d9, [sp]
        stp d10, d11, [sp, #16]
        stp d12, d13, [sp, #32]
        stp d14, d15, [sp, #48]

        /* bias x_len by 64 so the loop runs while >= 64 bytes remain */
        sub x_len, x_len, #64

.Lloop64:
        /* clear the 16 accumulators: 4 destinations x 64 bytes */
        movi v_p1_0.16b, #0
        movi v_p1_1.16b, #0
        movi v_p1_2.16b, #0
        movi v_p1_3.16b, #0
        movi v_p2_0.16b, #0
        movi v_p2_1.16b, #0
        movi v_p2_2.16b, #0
        movi v_p2_3.16b, #0
        movi v_p3_0.16b, #0
        movi v_p3_1.16b, #0
        movi v_p3_2.16b, #0
        movi v_p3_3.16b, #0
        movi v_p4_0.16b, #0
        movi v_p4_1.16b, #0
        movi v_p4_2.16b, #0
        movi v_p4_3.16b, #0

        /* each destination's table set is vec * 32 bytes long
         * (x_vec = vec * 8 here, so x_vec << 2 = vec * 32) */
        mov x_tbl1, x_tbl
        add x_tbl2, x_tbl1, x_vec, lsl #2
        add x_tbl3, x_tbl2, x_vec, lsl #2
        add x_tbl4, x_tbl3, x_vec, lsl #2
        mov x_vec_i, #0
        prfm pldl1keep, [x_tbl1]
        prfm pldl1keep, [x_tbl2]
        prfm pldl1keep, [x_tbl3]
        prfm pldl1keep, [x_tbl4]

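/*
 * One pass per source vector: x_src holds 8-byte pointers indexed by
 * x_vec_i, and the four table pointers advance 32 bytes per vector in
 * step with it.
 */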
.Lloop64_vects:
        ldr x_ptr, [x_src, x_vec_i]
        add x_vec_i, x_vec_i, #8
        add x_ptr, x_ptr, x_pos

        /* load 64 bytes of source data and the 4 x 32-byte table pairs */
        ldr q_data_0, [x_ptr], #16
        ldr q_data_1, [x_ptr], #16
        ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
        ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
        ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
        ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
        ldr q_data_2, [x_ptr], #16
        ldr q_data_3, [x_ptr], #16

        prfm pldl1strm, [x_ptr]
        prfm pldl1keep, [x_tbl1]
        prfm pldl1keep, [x_tbl2]
        prfm pldl1keep, [x_tbl3]
        prfm pldl1keep, [x_tbl4]

        /* data_0: low nibbles select from the lo tables, high nibbles
         * from the hi tables; XOR the partial products into p*_0 */
        and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
        ushr v_data_0.16b, v_data_0.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
        eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
        eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
        eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
        eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
        eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
        eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
        eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
        eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b

        /* data_1 */
        and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
        ushr v_data_1.16b, v_data_1.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
        eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
        eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
        eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
        eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
        eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
        eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
        eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
        eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b

        /* data_2 */
        and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
        ushr v_data_2.16b, v_data_2.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
        eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
        eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
        eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
        eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
        eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
        eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
        eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
        eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b

        /* data_3 */
        and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
        ushr v_data_3.16b, v_data_3.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
        eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
        eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
        eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
        eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
        eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
        eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

        tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
        tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
        eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
        eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b

        cmp x_vec_i, x_vec
        blt .Lloop64_vects

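/* write the four 64-byte accumulated results out to the destinations */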
.Lloop64_vects_end:
        add x_ptr, x_dest1, x_pos
        stp q_p1_0, q_p1_1, [x_ptr], #32
        stp q_p1_2, q_p1_3, [x_ptr]

        add x_ptr, x_dest2, x_pos
        stp q_p2_0, q_p2_1, [x_ptr], #32
        stp q_p2_2, q_p2_3, [x_ptr]

        add x_ptr, x_dest3, x_pos
        stp q_p3_0, q_p3_1, [x_ptr], #32
        stp q_p3_2, q_p3_3, [x_ptr]

        add x_ptr, x_dest4, x_pos
        stp q_p4_0, q_p4_1, [x_ptr], #32
        stp q_p4_2, q_p4_3, [x_ptr]

        add x_pos, x_pos, #64
        cmp x_pos, x_len
        ble .Lloop64

.Lloop64_end:
        /* restore d8 ~ d15 */
        ldp d8, d9, [sp]
        ldp d10, d11, [sp, #16]
        ldp d12, d13, [sp, #32]
        ldp d14, d15, [sp, #48]
        add sp, sp, #64

        /* undo the bias on x_len; done if exactly len bytes were processed */
        add x_len, x_len, #64
        cmp x_pos, x_len
        beq .return_pass

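/*
 * Tail processing in 16-byte blocks. x_len is biased by 16 so the
 * loop condition "x_pos <= x_len" means a full block remains.
 */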
.Lloop16_init:
        sub x_len, x_len, #16
        cmp x_pos, x_len
        /* fewer than 16 bytes left: handle them as an overlapped block */
        bgt .lessthan16_init

.Lloop16:
        movi v_p1_0.16b, #0
        movi v_p2_0.16b, #0
        movi v_p3_0.16b, #0
        movi v_p4_0.16b, #0
        mov x_tbl1, x_tbl
        add x_tbl2, x_tbl1, x_vec, lsl #2
        add x_tbl3, x_tbl2, x_vec, lsl #2
        add x_tbl4, x_tbl3, x_vec, lsl #2
        mov x_vec_i, #0

.Lloop16_vects:
        ldr x_ptr, [x_src, x_vec_i]
        add x_vec_i, x_vec_i, #8
        ldr q_data, [x_ptr, x_pos]

        ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
        ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
        ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
        ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32

        prfm pldl1keep, [x_tbl1]
        prfm pldl1keep, [x_tbl2]
        prfm pldl1keep, [x_tbl3]
        prfm pldl1keep, [x_tbl4]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
        tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b

        eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
        eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
        eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
        eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
        eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
        eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
        eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
        eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b

        cmp x_vec_i, x_vec
        bne .Lloop16_vects

.Lloop16_vects_end:
        str q_p1_0, [x_dest1, x_pos]
        str q_p2_0, [x_dest2, x_pos]
        str q_p3_0, [x_dest3, x_pos]
        str q_p4_0, [x_dest4, x_pos]
        add x_pos, x_pos, #16
        cmp x_pos, x_len
        ble .Lloop16

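/*
 * With x_len still biased to len - 16, x_pos - x_len == 16 means the
 * position reached exactly len; otherwise a partial block remains.
 */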
.Lloop16_end:
        sub x_tmp, x_pos, x_len
        cmp x_tmp, #16
        beq .return_pass

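/*
 * Fewer than 16 bytes left: rewind to offset len - 16 and run one
 * more full block. Bytes overlapping the previous block are simply
 * recomputed to the same values, so the rewrite is harmless.
 */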
.lessthan16_init:
        mov x_pos, x_len
        b .Lloop16

.return_pass:
        mov w_ret, #0
        ret

.return_fail:
        mov w_ret, #1
        ret