/* gf_2vect_mad_neon.S — AArch64 NEON GF(2^8) 2-vector multiply-accumulate
 * (isa-l erasure_code; imported with ceph 15.2.8)
 */
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_2vect_mad_neon
.type gf_2vect_mad_neon, %function


/* arguments (AAPCS64: x0-x5)
 * NOTE(review): roles inferred from the code below; confirm against the
 * isa-l erasure_code prototypes.
 */
x_len      .req x0      /* buffer length in bytes */
x_vec      .req x1      /* number of source vectors (table stride multiplier) */
x_vec_i    .req x2      /* index of this source vector within the tables */
x_tbl      .req x3      /* base of GF multiply tables, 32 bytes per entry */
x_src      .req x4      /* source buffer */
x_dest     .req x5      /* array of two destination buffer pointers */

/* returns */
w_ret      .req w0      /* 0 = pass, 1 = fail (len < 16) */

/* local variables */
x_src_end  .req x6      /* loop bound derived from x_src + x_len */
x_dest1    .req x7      /* dest[0] */
x_dest2    .req x8      /* dest[1] */
x_tmp      .req x9      /* tail-length scratch */
x_tbl1     .req x10     /* 32-byte table entry used for dest1 */
x_tbl2     .req x11     /* 32-byte table entry used for dest2 */
x_const    .req x12     /* pointer into const_tbl tail mask */

/* vectors */
v_mask0f   .req v0      /* 16 x 0x0f, splits bytes into nibbles */
v_tmp_lo   .req v1      /* low-nibble TBL product */
v_tmp_hi   .req v2      /* high-nibble TBL product */
v_tmp      .req v3      /* tail byte-mask */
q_tmp      .req q3

v_gft1_lo  .req v4      /* dest1 low-nibble lookup table */
v_gft1_hi  .req v5      /* dest1 high-nibble lookup table */
v_gft2_lo  .req v6      /* dest2 low-nibble lookup table */
v_gft2_hi  .req v7      /* dest2 high-nibble lookup table */
q_gft1_lo  .req q4
q_gft1_hi  .req q5
q_gft2_lo  .req q6
q_gft2_hi  .req q7

/* 128-byte main loop: eight source registers (v8-v15 are callee-saved
 * in their low 64 bits, so d8-d15 are spilled around .Lloop128) */
v_data_0   .req v8
v_data_1   .req v9
v_data_2   .req v10
v_data_3   .req v11
v_data_4   .req v12
v_data_5   .req v13
v_data_6   .req v14
v_data_7   .req v15
q_data_0   .req q8
q_data_1   .req q9
q_data_2   .req q10
q_data_3   .req q11
q_data_4   .req q12
q_data_5   .req q13
q_data_6   .req q14
q_data_7   .req q15

v_data_0_lo .req v16
v_data_1_lo .req v17
v_data_2_lo .req v18
v_data_3_lo .req v19
v_data_4_lo .req v20
v_data_5_lo .req v21
v_data_6_lo .req v22
v_data_7_lo .req v23
/* the high nibbles overwrite the source registers in place */
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3
v_data_4_hi .req v_data_4
v_data_5_hi .req v_data_5
v_data_6_hi .req v_data_6
v_data_7_hi .req v_data_7

v_d0       .req v24     /* destination accumulators */
v_d1       .req v25
v_d2       .req v26
v_d3       .req v27
v_d4       .req v28
v_d5       .req v29
v_d6       .req v30
v_d7       .req v31
q_d0       .req q24
q_d1       .req q25
q_d2       .req q26
q_d3       .req q27
q_d4       .req q28
q_d5       .req q29
q_d6       .req q30
q_d7       .req q31

/* 16-byte loop / tail aliases (reuse caller-saved v16-v18) */
v_data     .req v16
q_data     .req q16
v_data_lo  .req v17
v_data_hi  .req v18

/* gf_2vect_mad_neon: multiply one source buffer by two GF(2^8) constants
 * (one per destination) and XOR the products into dest[0] and dest[1].
 *
 * A GF(2^8) product is computed with two TBL lookups: one 16-byte table
 * for the low nibble of each source byte and one for the high nibble,
 * XORed together (hence the 32-byte table stride, lsl #5).
 *
 * Structure: 128-byte main loop (v8-v15, with d8-d15 saved per AAPCS64),
 * then a 16-byte loop, then a masked, overlapping 16-byte window for the
 * final partial block.
 *
 * Returns (w0): 0 on success, 1 if len < 16.
 */
gf_2vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f		// nibble mask
	lsl	x_vec_i, x_vec_i, #5		// *32: bytes per table entry
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i		// entry for this vector, dest1
	add	x_tbl2, x_tbl1, x_vec		// entry for this vector, dest2
	add	x_src_end, x_src, x_len

	ldr	x_dest1, [x_dest]
	ldr	x_dest2, [x_dest, #8]
	ldr	q_gft1_lo, [x_tbl1]		// dest1 low-nibble table
	ldr	q_gft1_hi, [x_tbl1, #16]	// dest1 high-nibble table
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp	x_len, #128
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved low halves, AAPCS64) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	sub	x_src_end, x_src_end, #128	// last position with 128 B left

.Lloop128:
	/* load 128 B of source */
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	ldr	q_data_4, [x_src, #16*4]
	ldr	q_data_5, [x_src, #16*5]
	ldr	q_data_6, [x_src, #16*6]
	ldr	q_data_7, [x_src, #16*7]

	/* load current dest1 contents to accumulate into */
	ldr	q_d0, [x_dest1, #16*0]
	ldr	q_d1, [x_dest1, #16*1]
	ldr	q_d2, [x_dest1, #16*2]
	ldr	q_d3, [x_dest1, #16*3]
	ldr	q_d4, [x_dest1, #16*4]
	ldr	q_d5, [x_dest1, #16*5]
	ldr	q_d6, [x_dest1, #16*6]
	ldr	q_d7, [x_dest1, #16*7]

	/* split each source byte into low nibble ... */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	and	v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
	and	v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
	and	v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
	and	v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

	/* ... and high nibble (in place, clobbering the source regs) */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4
	ushr	v_data_4_hi.16b, v_data_4.16b, #4
	ushr	v_data_5_hi.16b, v_data_5.16b, #4
	ushr	v_data_6_hi.16b, v_data_6.16b, #4
	ushr	v_data_7_hi.16b, v_data_7.16b, #4

	/* dest1 ^= gf_mul(src, c1): two table lookups + two XORs per 16 B */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d0.16b, v_tmp_lo.16b, v_d0.16b
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1.16b, v_tmp_lo.16b, v_d1.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d2.16b, v_tmp_lo.16b, v_d2.16b
	eor	v_d2.16b, v_d2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d3.16b, v_tmp_lo.16b, v_d3.16b
	eor	v_d3.16b, v_d3.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
	eor	v_d4.16b, v_tmp_lo.16b, v_d4.16b
	eor	v_d4.16b, v_d4.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
	eor	v_d5.16b, v_tmp_lo.16b, v_d5.16b
	eor	v_d5.16b, v_d5.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
	eor	v_d6.16b, v_tmp_lo.16b, v_d6.16b
	eor	v_d6.16b, v_d6.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
	eor	v_d7.16b, v_tmp_lo.16b, v_d7.16b
	eor	v_d7.16b, v_d7.16b, v_tmp_hi.16b

	str	q_d0, [x_dest1, #16*0]
	str	q_d1, [x_dest1, #16*1]
	str	q_d2, [x_dest1, #16*2]
	str	q_d3, [x_dest1, #16*3]
	str	q_d4, [x_dest1, #16*4]
	str	q_d5, [x_dest1, #16*5]
	str	q_d6, [x_dest1, #16*6]
	str	q_d7, [x_dest1, #16*7]

	/* second pass over the same nibbles: dest2 ^= gf_mul(src, c2) */
	ldr	q_d0, [x_dest2, #16*0]
	ldr	q_d1, [x_dest2, #16*1]
	ldr	q_d2, [x_dest2, #16*2]
	ldr	q_d3, [x_dest2, #16*3]
	ldr	q_d4, [x_dest2, #16*4]
	ldr	q_d5, [x_dest2, #16*5]
	ldr	q_d6, [x_dest2, #16*6]
	ldr	q_d7, [x_dest2, #16*7]

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d0.16b, v_tmp_lo.16b, v_d0.16b
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d1.16b, v_tmp_lo.16b, v_d1.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2.16b, v_tmp_lo.16b, v_d2.16b
	eor	v_d2.16b, v_d2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d3.16b, v_tmp_lo.16b, v_d3.16b
	eor	v_d3.16b, v_d3.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
	eor	v_d4.16b, v_tmp_lo.16b, v_d4.16b
	eor	v_d4.16b, v_d4.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
	eor	v_d5.16b, v_tmp_lo.16b, v_d5.16b
	eor	v_d5.16b, v_d5.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
	eor	v_d6.16b, v_tmp_lo.16b, v_d6.16b
	eor	v_d6.16b, v_d6.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
	eor	v_d7.16b, v_tmp_lo.16b, v_d7.16b
	eor	v_d7.16b, v_d7.16b, v_tmp_hi.16b

	str	q_d0, [x_dest2, #16*0]
	str	q_d1, [x_dest2, #16*1]
	str	q_d2, [x_dest2, #16*2]
	str	q_d3, [x_dest2, #16*3]
	str	q_d4, [x_dest2, #16*4]
	str	q_d5, [x_dest2, #16*5]
	str	q_d6, [x_dest2, #16*6]
	str	q_d7, [x_dest2, #16*7]

	add	x_src, x_src, #128
	add	x_dest1, x_dest1, #128
	add	x_dest2, x_dest2, #128
	cmp	x_src, x_src_end
	bls	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	add	x_src_end, x_src_end, #128	// undo the main-loop bias

.Lloop16_init:
	sub	x_src_end, x_src_end, #16	// last position with 16 B left
	cmp	x_src, x_src_end
	bhi	.Llessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d0, [x_dest1]
	ldr	q_d1, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d0.16b, v_tmp_lo.16b, v_d0.16b
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d1.16b, v_tmp_lo.16b, v_d1.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	str	q_d0, [x_dest1]
	str	q_d1, [x_dest2]

	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_src, x_src, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.Llessthan16_init:
	/* x_tmp = bytes of the final 16-byte window already processed */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.Lreturn_pass			// length was a multiple of 16

.Llessthan16:
	/* re-run the last full 16-byte window, masking out the x_tmp
	 * bytes that were already accumulated by the previous loop */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp

	/* const_tbl + 16 - x_tmp: x_tmp zero bytes then (16-x_tmp) 0xff */
	ldr	x_const, =const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d0, [x_dest1]
	ldr	q_d1, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b	// keep only tail bytes
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	str	q_d0, [x_dest1]
	str	q_d1, [x_dest2]

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

396
/* Sliding tail mask: loading 16 bytes at const_tbl + 16 - n yields a
 * vector whose first n bytes are 0x00 and remaining bytes are 0xff.
 * Read-only, so it belongs in .rodata (not writable .data). */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff