]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S
4886440ba82087de4edd2f0aa9f4219c95ca48e8
[ceph.git] / ceph / src / isa-l / erasure_code / aarch64 / gf_6vect_mad_neon.S
1 /**************************************************************
2 Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 * Neither the name of Huawei Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29
/*
 * Symbol declaration and register allocation for gf_6vect_mad_neon.
 *
 * All names below are assembler aliases (.req); they emit no code.
 * Several aliases deliberately share one physical register because the
 * values they name are never live at the same time.
 */
	.text
	.global	gf_6vect_mad_neon
	.type	gf_6vect_mad_neon, %function


/* arguments */
x_len		.req	x0
x_vec		.req	x1
x_vec_i		.req	x2
x_tbl		.req	x3
x_src		.req	x4
x_dest		.req	x5

/* returns */
w_ret		.req	w0

/* local variables */
x_src_end	.req	x6
x_dest1		.req	x7
x_dest2		.req	x8
x_dest3		.req	x9
x_dest4		.req	x10
x_dest5		.req	x11
x_dest6		.req	x_dest		/* reuses the dest-array pointer once it has been read */
x_tmp		.req	x12
x_tbl1		.req	x13
x_tbl2		.req	x14
x_tbl3		.req	x15
x_tbl4		.req	x16
x_tbl5		.req	x17
x_tbl6		.req	x_tbl		/* reuses the table base once tbl1..tbl5 are derived */
/*
 * x_const shares x0 with x_len.  The length is read for the last time
 * when choosing between the 64-byte and 16-byte paths, while x_const is
 * only written in the sub-16-byte tail, so the lifetimes do not overlap.
 * x18 is intentionally not used: AAPCS64 designates it the platform
 * register and some operating systems reserve it.
 */
x_const		.req	x0

/* vectors */
v_mask0f	.req	v0
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3
q_tmp		.req	q3

/* lookup tables 1..3 (lo = low-nibble half, hi = high-nibble half) */
v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17

/* lookup tables 4..6; tables 5 and 6 overlay the registers of 2 and 3 */
v_gft4_lo	.req	v18
v_gft4_hi	.req	v19
q_gft4_lo	.req	q18
q_gft4_hi	.req	q19
v_gft5_lo	.req	v_gft2_lo
v_gft5_hi	.req	v_gft2_hi
q_gft5_lo	.req	q_gft2_lo
q_gft5_hi	.req	q_gft2_hi
v_gft6_lo	.req	v_gft3_lo
v_gft6_hi	.req	v_gft3_hi
q_gft6_lo	.req	q_gft3_lo
q_gft6_hi	.req	q_gft3_hi

/* source data for the 64-byte loop (v8-v11 are callee-saved in their low halves) */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

/* nibble indices: low nibbles get their own registers, high nibbles replace the data */
v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

/* destination accumulators for outputs 1..3 */
v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/* outputs 4..6 overlay the accumulators of outputs 1..3 */
v_d4_0		.req	v_d1_0
v_d4_1		.req	v_d1_1
v_d4_2		.req	v_d1_2
v_d4_3		.req	v_d1_3
q_d4_0		.req	q_d1_0
q_d4_1		.req	q_d1_1
q_d4_2		.req	q_d1_2
q_d4_3		.req	q_d1_3
v_d5_0		.req	v_d2_0
v_d5_1		.req	v_d2_1
v_d5_2		.req	v_d2_2
v_d5_3		.req	v_d2_3
q_d5_0		.req	q_d2_0
q_d5_1		.req	q_d2_1
q_d5_2		.req	q_d2_2
q_d5_3		.req	q_d2_3
v_d6_0		.req	v_d3_0
v_d6_1		.req	v_d3_1
v_d6_2		.req	v_d3_2
v_d6_3		.req	v_d3_3
q_d6_0		.req	q_d3_0
q_d6_1		.req	q_d3_1
q_d6_2		.req	q_d3_2
q_d6_3		.req	q_d3_3

/*
 * Single-block (16-byte) paths: only the *_0 accumulators are in use
 * there, so the data and its nibble indices borrow v21-v23.
 */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23
168
/*
 * gf_mad_acc acc, gft_lo, gft_hi, nib_lo, nib_hi
 *
 * For each of the 16 byte lanes:
 *	acc ^= gft_lo[nib_lo] ^ gft_hi[nib_hi]
 * All operands are vector register names without an arrangement suffix.
 * Clobbers v_tmp_lo and v_tmp_hi.
 */
	.macro	gf_mad_acc acc, gft_lo, gft_hi, nib_lo, nib_hi
	tbl	v_tmp_lo.16b, {\gft_lo\().16b}, \nib_lo\().16b
	tbl	v_tmp_hi.16b, {\gft_hi\().16b}, \nib_hi\().16b
	eor	\acc\().16b, v_tmp_lo.16b, \acc\().16b
	eor	\acc\().16b, \acc\().16b, v_tmp_hi.16b
	.endm

/*
 * gf_mad_acc_masked acc, gft_lo, gft_hi, nib_lo, nib_hi
 *
 * Same as gf_mad_acc, but the looked-up product is ANDed with the byte
 * mask held in v_tmp before it is folded into acc, so lanes whose mask
 * byte is 0x00 leave acc unchanged.
 * Clobbers v_tmp_lo and v_tmp_hi; v_tmp is read only.
 */
	.macro	gf_mad_acc_masked acc, gft_lo, gft_hi, nib_lo, nib_hi
	tbl	v_tmp_lo.16b, {\gft_lo\().16b}, \nib_lo\().16b
	tbl	v_tmp_hi.16b, {\gft_hi\().16b}, \nib_hi\().16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	\acc\().16b, \acc\().16b, v_tmp_hi.16b
	.endm

/*
 * gf_6vect_mad_neon
 *
 * Multiply-and-accumulate one source buffer into six destination
 * buffers using per-destination 32-byte nibble lookup tables
 * (16 bytes indexed by the low nibble, 16 bytes by the high nibble):
 *
 *	dest[k][i] ^= tbl_k.lo[src[i] & 0x0f] ^ tbl_k.hi[src[i] >> 4]
 *
 * In:	x_len	 number of bytes to process
 *	x_vec	 table stride between consecutive destinations, in
 *		 32-byte units
 *	x_vec_i	 index (32-byte units) of the first table within x_tbl
 *	x_tbl	 base of the lookup tables
 *	x_src	 source buffer
 *	x_dest	 array of six destination pointers
 * Out:	w_ret	 0 on success, 1 if x_len < 16 (nothing is written)
 *
 * NOTE(review): the names suggest the C prototype
 *	int gf_6vect_mad_neon(int len, int vec, int vec_i,
 *			      unsigned char *gftbls, unsigned char *src,
 *			      unsigned char **dest);
 * confirm against the declaring header.
 *
 * Processing is done in three stages: 64-byte blocks, then 16-byte
 * blocks, then one overlapping 16-byte block for a remaining 1..15
 * bytes.  d8-d15 are saved and restored around the 64-byte stage, the
 * only stage that touches v8-v15.
 */
gf_6vect_mad_neon:
	/* fewer than 16 bytes: even the overlapping tail cannot run */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f

	/* each table is 32 bytes, so scale both indices by 32 */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_tbl4, x_tbl3, x_vec
	add	x_tbl5, x_tbl4, x_vec
	add	x_tbl6, x_tbl5, x_vec		/* overwrites x_tbl (same register) */
	add	x_src_end, x_src, x_len

	/* x_dest6 aliases x_dest, so it must be the last pointer loaded */
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	ldr	x_dest5, [x_dest, #8*4]
	ldr	x_dest6, [x_dest, #8*5]

	/*
	 * Tables 1 and 4 have dedicated registers and stay resident.
	 * Tables 2/5 and 3/6 share registers and are reloaded as needed.
	 */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft4_lo, [x_tbl4]
	ldr	q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
	/* fewer than 64 bytes: skip straight to the 16-byte stage */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* v8-v15 are used below; their low halves are callee-saved */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_src_end now marks the last position a full 64-byte block may start at */
	sub	x_src_end, x_src_end, #64

.Lloop64:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	/*
	 * Split every source byte into nibble indices.  The low nibbles
	 * must be extracted first: the high nibbles are written over the
	 * source data registers.
	 */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1 */
	gf_mad_acc	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_0_lo, v_data_0_hi
	gf_mad_acc	v_d1_1, v_gft1_lo, v_gft1_hi, v_data_1_lo, v_data_1_hi
	gf_mad_acc	v_d1_2, v_gft1_lo, v_gft1_hi, v_data_2_lo, v_data_2_hi
	gf_mad_acc	v_d1_3, v_gft1_lo, v_gft1_hi, v_data_3_lo, v_data_3_hi

	/* dest2 */
	gf_mad_acc	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_0_lo, v_data_0_hi
	gf_mad_acc	v_d2_1, v_gft2_lo, v_gft2_hi, v_data_1_lo, v_data_1_hi
	gf_mad_acc	v_d2_2, v_gft2_lo, v_gft2_hi, v_data_2_lo, v_data_2_hi
	gf_mad_acc	v_d2_3, v_gft2_lo, v_gft2_hi, v_data_3_lo, v_data_3_hi

	/* dest3 */
	gf_mad_acc	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_0_lo, v_data_0_hi
	gf_mad_acc	v_d3_1, v_gft3_lo, v_gft3_hi, v_data_1_lo, v_data_1_hi
	gf_mad_acc	v_d3_2, v_gft3_lo, v_gft3_hi, v_data_2_lo, v_data_2_hi
	gf_mad_acc	v_d3_3, v_gft3_lo, v_gft3_hi, v_data_3_lo, v_data_3_hi

	/* outputs 1..3 must be written back before their registers are reused for 4..6 */
	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	ldr	q_d4_0, [x_dest4, #16*0]
	ldr	q_d4_1, [x_dest4, #16*1]
	ldr	q_d4_2, [x_dest4, #16*2]
	ldr	q_d4_3, [x_dest4, #16*3]

	ldr	q_d5_0, [x_dest5, #16*0]
	ldr	q_d5_1, [x_dest5, #16*1]
	ldr	q_d5_2, [x_dest5, #16*2]
	ldr	q_d5_3, [x_dest5, #16*3]

	ldr	q_d6_0, [x_dest6, #16*0]
	ldr	q_d6_1, [x_dest6, #16*1]
	ldr	q_d6_2, [x_dest6, #16*2]
	ldr	q_d6_3, [x_dest6, #16*3]

	/* tables 5 and 6 replace tables 2 and 3 in the shared registers */
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	/* dest4 */
	gf_mad_acc	v_d4_0, v_gft4_lo, v_gft4_hi, v_data_0_lo, v_data_0_hi
	gf_mad_acc	v_d4_1, v_gft4_lo, v_gft4_hi, v_data_1_lo, v_data_1_hi
	gf_mad_acc	v_d4_2, v_gft4_lo, v_gft4_hi, v_data_2_lo, v_data_2_hi
	gf_mad_acc	v_d4_3, v_gft4_lo, v_gft4_hi, v_data_3_lo, v_data_3_hi

	/* dest5 */
	gf_mad_acc	v_d5_0, v_gft5_lo, v_gft5_hi, v_data_0_lo, v_data_0_hi
	gf_mad_acc	v_d5_1, v_gft5_lo, v_gft5_hi, v_data_1_lo, v_data_1_hi
	gf_mad_acc	v_d5_2, v_gft5_lo, v_gft5_hi, v_data_2_lo, v_data_2_hi
	gf_mad_acc	v_d5_3, v_gft5_lo, v_gft5_hi, v_data_3_lo, v_data_3_hi

	/* dest6 */
	gf_mad_acc	v_d6_0, v_gft6_lo, v_gft6_hi, v_data_0_lo, v_data_0_hi
	gf_mad_acc	v_d6_1, v_gft6_lo, v_gft6_hi, v_data_1_lo, v_data_1_hi
	gf_mad_acc	v_d6_2, v_gft6_lo, v_gft6_hi, v_data_2_lo, v_data_2_hi
	gf_mad_acc	v_d6_3, v_gft6_lo, v_gft6_hi, v_data_3_lo, v_data_3_hi

	str	q_d4_0, [x_dest4, #16*0]
	str	q_d4_1, [x_dest4, #16*1]
	str	q_d4_2, [x_dest4, #16*2]
	str	q_d4_3, [x_dest4, #16*3]
	add	x_dest4, x_dest4, #64

	str	q_d5_0, [x_dest5, #16*0]
	str	q_d5_1, [x_dest5, #16*1]
	str	q_d5_2, [x_dest5, #16*2]
	str	q_d5_3, [x_dest5, #16*3]
	add	x_dest5, x_dest5, #64

	str	q_d6_0, [x_dest6, #16*0]
	str	q_d6_1, [x_dest6, #16*1]
	str	q_d6_2, [x_dest6, #16*2]
	str	q_d6_3, [x_dest6, #16*3]
	add	x_dest6, x_dest6, #64

	/* another full 64-byte block available? (unsigned pointer compare) */
	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 and undo the 64-byte bias on x_src_end */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	add	x_src_end, x_src_end, #64

.Lloop16_init:
	/* x_src_end now marks the last position a full 16-byte block may start at */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.Llessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	gf_mad_acc	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_lo, v_data_hi
	gf_mad_acc	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_lo, v_data_hi
	gf_mad_acc	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_lo, v_data_hi

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_d6_0, [x_dest6]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	gf_mad_acc	v_d4_0, v_gft4_lo, v_gft4_hi, v_data_lo, v_data_hi
	gf_mad_acc	v_d5_0, v_gft5_lo, v_gft5_hi, v_data_lo, v_data_hi
	gf_mad_acc	v_d6_0, v_gft6_lo, v_gft6_hi, v_data_lo, v_data_hi

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]
	str	q_d6_0, [x_dest6]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	add	x_dest4, x_dest4, #16
	add	x_dest5, x_dest5, #16
	add	x_dest6, x_dest6, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.Llessthan16_init:
	/*
	 * x_src_end is (end - 16), so x_tmp = 16 - (bytes still to do).
	 * x_tmp == 16 means the buffer was consumed exactly.
	 */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16:
	/*
	 * 1..15 bytes remain.  Step back x_tmp bytes so one final 16-byte
	 * block ends exactly at the end of the buffers; its first x_tmp
	 * bytes were already processed and are masked out below.
	 */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp
	sub	x_dest4, x_dest4, x_tmp
	sub	x_dest5, x_dest5, x_tmp
	sub	x_dest6, x_dest6, x_tmp

	/*
	 * v_tmp = 16 bytes read from (const_tbl + 16 - x_tmp): x_tmp bytes
	 * of 0x00 followed by (16 - x_tmp) bytes of 0xff.
	 * The address is formed PC-relatively (adrp + :lo12:) rather than
	 * through a literal pool, so no absolute address is embedded in
	 * the text section.
	 */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	gf_mad_acc_masked	v_d1_0, v_gft1_lo, v_gft1_hi, v_data_lo, v_data_hi
	gf_mad_acc_masked	v_d2_0, v_gft2_lo, v_gft2_hi, v_data_lo, v_data_hi
	gf_mad_acc_masked	v_d3_0, v_gft3_lo, v_gft3_hi, v_data_lo, v_data_hi

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_d6_0, [x_dest6]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	gf_mad_acc_masked	v_d4_0, v_gft4_lo, v_gft4_hi, v_data_lo, v_data_hi
	gf_mad_acc_masked	v_d5_0, v_gft5_lo, v_gft5_hi, v_data_lo, v_data_hi
	gf_mad_acc_masked	v_d6_0, v_gft6_lo, v_gft6_hi, v_data_lo, v_data_hi

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]
	str	q_d6_0, [x_dest6]

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

	.size	gf_6vect_mad_neon, .-gf_6vect_mad_neon
604
/*
 * Byte-mask source for the sub-16-byte tail: 16 bytes of 0x00 followed
 * by 16 bytes of 0xff.  The tail code loads 16 bytes starting at
 * (const_tbl + 16 - k), which yields k zero bytes followed by (16 - k)
 * 0xff bytes.
 *
 * The table is only ever read, so it is placed in .rodata rather than
 * in a writable data section.
 */
	.section .rodata
	.balign	16
const_tbl:
	.dword	0x0000000000000000, 0x0000000000000000
	.dword	0xffffffffffffffff, 0xffffffffffffffff