1 /**************************************************************
2 Copyright (c) 2019 Huawei Technologies Co., Ltd.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 * Neither the name of Huawei Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
/*
 * Export gf_2vect_mad_neon so other objects can link against it, and mark
 * the symbol as a function in the object file's symbol table.
 */
31 .global gf_2vect_mad_neon
32 .type gf_2vect_mad_neon, %function
/*
 * Each v_data_N_hi name is an alias of v_data_N itself, not a separate
 * register. The code that writes v_data_N_hi (ushr ..., #4) therefore
 * overwrites the source bytes held in v_data_N, so anything that still
 * needs the unshifted bytes has to read them before that point.
 */
96 v_data_0_hi .req v_data_0
97 v_data_1_hi .req v_data_1
98 v_data_2_hi .req v_data_2
99 v_data_3_hi .req v_data_3
100 v_data_4_hi .req v_data_4
101 v_data_5_hi .req v_data_5
102 v_data_6_hi .req v_data_6
103 v_data_7_hi .req v_data_7
/*
 * Entry setup, as established by the instructions below:
 *   v_mask0f   = 0x0f in all 16 byte lanes (used to isolate low nibbles)
 *   x_vec_i   *= 32 (lsl #5), then used as a byte offset into x_tbl
 *   x_tbl1     = x_tbl + x_vec_i
 *   x_tbl2     = x_tbl1 + x_vec
 *   x_src_end  = x_src + x_len
 *   x_dest1    = 64-bit pointer loaded from [x_dest]
 *   x_dest2    = 64-bit pointer loaded from [x_dest + 8]
 *   gft1_lo/hi = 16 bytes at x_tbl1 + 0 and x_tbl1 + 16
 *   gft2_lo/hi = 16 bytes at x_tbl2 + 0 and x_tbl2 + 16
 * NOTE(review): x_vec is added to x_tbl1 as a plain byte offset. If it is
 * a count like x_vec_i, it needs the same *32 scaling before this add -
 * confirm that scaling is present.
 */
129 /* less than 16 bytes, return_fail */
133 movi v_mask0f.16b, #0x0f
134 lsl x_vec_i, x_vec_i, #5
136 add x_tbl1, x_tbl, x_vec_i
137 add x_tbl2, x_tbl1, x_vec
138 add x_src_end, x_src, x_len
140 ldr x_dest1, [x_dest]
141 ldr x_dest2, [x_dest, #8]
142 ldr q_gft1_lo, [x_tbl1]
143 ldr q_gft1_hi, [x_tbl1, #16]
144 ldr q_gft2_lo, [x_tbl2]
145 ldr q_gft2_hi, [x_tbl2, #16]
148 /* less than 128 bytes, goto Lloop16_init */
/*
 * d8-d15 are the low 64 bits of v8-v15, which AAPCS64 requires a callee to
 * preserve. The stores below place d10-d15 at sp+16, sp+32 and sp+48.
 * NOTE(review): the comment says d8 ~ d15, but no stack allocation and no
 * d8/d9 store at [sp] appear among these lines - confirm both precede the
 * first stp.
 */
152 /* save d8 ~ d15 to stack */
155 stp d10, d11, [sp, #16]
156 stp d12, d13, [sp, #32]
157 stp d14, d15, [sp, #48]
/*
 * Bias the end pointer by one 128-byte block (x_src_end -= 128);
 * presumably so the 128-byte loop can stop once fewer than 128 source
 * bytes remain - the loop's compare is not shown here.
 */
159 sub x_src_end, x_src_end, #128
/*
 * 128-byte step, part 1: load operands and split the source into nibbles.
 * (q_* and v_* names are presumably views of the same registers; their
 * .req definitions are not among these lines.)
 */
/* q_data_0..7 <- 128 source bytes at x_src */
162 ldr q_data_0, [x_src, #16*0]
163 ldr q_data_1, [x_src, #16*1]
164 ldr q_data_2, [x_src, #16*2]
165 ldr q_data_3, [x_src, #16*3]
166 ldr q_data_4, [x_src, #16*4]
167 ldr q_data_5, [x_src, #16*5]
168 ldr q_data_6, [x_src, #16*6]
169 ldr q_data_7, [x_src, #16*7]
/* q_d0..7 <- the 128 bytes currently stored at x_dest1 */
171 ldr q_d0, [x_dest1, #16*0]
172 ldr q_d1, [x_dest1, #16*1]
173 ldr q_d2, [x_dest1, #16*2]
174 ldr q_d3, [x_dest1, #16*3]
175 ldr q_d4, [x_dest1, #16*4]
176 ldr q_d5, [x_dest1, #16*5]
177 ldr q_d6, [x_dest1, #16*6]
178 ldr q_d7, [x_dest1, #16*7]
/* v_data_N_lo = low 4 bits of every source byte (AND with 0x0f) */
180 and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
181 and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
182 and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
183 and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
184 and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
185 and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
186 and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
187 and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
/*
 * v_data_N_hi = high 4 bits of every source byte (logical shift right 4).
 * v_data_N_hi aliases v_data_N, so each ushr destroys the original source
 * bytes; the low-nibble AND for the same N must stay ahead of it.
 */
189 ushr v_data_0_hi.16b, v_data_0.16b, #4
190 ushr v_data_1_hi.16b, v_data_1.16b, #4
191 ushr v_data_2_hi.16b, v_data_2.16b, #4
192 ushr v_data_3_hi.16b, v_data_3.16b, #4
193 ushr v_data_4_hi.16b, v_data_4.16b, #4
194 ushr v_data_5_hi.16b, v_data_5.16b, #4
195 ushr v_data_6_hi.16b, v_data_6.16b, #4
196 ushr v_data_7_hi.16b, v_data_7.16b, #4
/*
 * 128-byte step, part 2: update the first destination.
 * For each 16-byte vector N = 0..7:
 *   v_tmp_lo = gft1_lo[v_data_N_lo]   (byte-wise table lookup)
 *   v_tmp_hi = gft1_hi[v_data_N_hi]
 *   v_dN    ^= v_tmp_lo ^ v_tmp_hi
 * The indices are 4-bit values, so each lookup selects one of the 16 bytes
 * of the table register. Presumably this is a split-nibble GF(2^8)
 * multiply of the source by the dest1 coefficient - the table contents are
 * supplied by the caller and not visible here.
 * v_tmp_lo / v_tmp_hi are reused as scratch for every N.
 */
198 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
199 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
200 eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
201 eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
203 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
204 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
205 eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
206 eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
208 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
209 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
210 eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
211 eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
213 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
214 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
215 eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
216 eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
218 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
219 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
220 eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
221 eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
223 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
224 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
225 eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
226 eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
228 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
229 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
230 eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
231 eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
233 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
234 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
235 eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
236 eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
/* Write the 128 updated bytes back to x_dest1. */
238 str q_d0, [x_dest1, #16*0]
239 str q_d1, [x_dest1, #16*1]
240 str q_d2, [x_dest1, #16*2]
241 str q_d3, [x_dest1, #16*3]
242 str q_d4, [x_dest1, #16*4]
243 str q_d5, [x_dest1, #16*5]
244 str q_d6, [x_dest1, #16*6]
245 str q_d7, [x_dest1, #16*7]
/*
 * 128-byte step, part 3: update the second destination.
 * q_d0..7 are reused as accumulators: they are reloaded with the 128 bytes
 * at x_dest2, the gft2 lookups are XORed in using the nibble indices still
 * held in v_data_N_lo / v_data_N_hi, and the result is stored back.
 */
247 ldr q_d0, [x_dest2, #16*0]
248 ldr q_d1, [x_dest2, #16*1]
249 ldr q_d2, [x_dest2, #16*2]
250 ldr q_d3, [x_dest2, #16*3]
251 ldr q_d4, [x_dest2, #16*4]
252 ldr q_d5, [x_dest2, #16*5]
253 ldr q_d6, [x_dest2, #16*6]
254 ldr q_d7, [x_dest2, #16*7]
/* For each N: v_dN ^= gft2_lo[v_data_N_lo] ^ gft2_hi[v_data_N_hi] */
256 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
257 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
258 eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
259 eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
261 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
262 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
263 eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
264 eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
266 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
267 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
268 eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
269 eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
271 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
272 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
273 eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
274 eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
276 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
277 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
278 eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
279 eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
281 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
282 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
283 eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
284 eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
286 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
287 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
288 eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
289 eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
291 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
292 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
293 eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
294 eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
/* Write the 128 updated bytes back to x_dest2. */
296 str q_d0, [x_dest2, #16*0]
297 str q_d1, [x_dest2, #16*1]
298 str q_d2, [x_dest2, #16*2]
299 str q_d3, [x_dest2, #16*3]
300 str q_d4, [x_dest2, #16*4]
301 str q_d5, [x_dest2, #16*5]
302 str q_d6, [x_dest2, #16*6]
303 str q_d7, [x_dest2, #16*7]
/* Advance the source and both destination pointers past this block. */
305 add x_src, x_src, #128
306 add x_dest1, x_dest1, #128
307 add x_dest2, x_dest2, #128
/*
 * Reload d10-d15 from the stack slots at sp+16, sp+32 and sp+48, the same
 * offsets used by the stp stores before the 128-byte loop.
 * NOTE(review): the comment says d8 ~ d15, but no d8/d9 reload and no
 * stack release appear among these lines - confirm both are present.
 */
312 /* restore d8 ~ d15 */
314 ldp d10, d11, [sp, #16]
315 ldp d12, d13, [sp, #32]
316 ldp d14, d15, [sp, #48]
/* Undo the earlier "sub x_src_end, x_src_end, #128" bias. */
318 add x_src_end, x_src_end, #128
/* Bias the end pointer by 16 bytes for the 16-byte-at-a-time code below. */
321 sub x_src_end, x_src_end, #16
/*
 * 16-byte step: split v_data into low/high nibble indices, then XOR the
 * gft1 lookups into v_d0 and the gft2 lookups into v_d1.
 * NOTE(review): the loads that fill v_data / v_d0 / v_d1 and the stores
 * that write v_d0 / v_d1 back are not among these lines - confirm they
 * surround this sequence.
 */
331 and v_data_lo.16b, v_data.16b, v_mask0f.16b
332 ushr v_data_hi.16b, v_data.16b, #4
/* v_d0 ^= gft1_lo[lo nibble] ^ gft1_hi[hi nibble] */
334 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
335 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
336 eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
337 eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
/* v_d1 ^= gft2_lo[lo nibble] ^ gft2_hi[hi nibble] */
339 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
340 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
341 eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
342 eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
/* Advance both destination pointers and the source pointer by 16 bytes. */
347 add x_dest1, x_dest1, #16
348 add x_dest2, x_dest2, #16
349 add x_src, x_src, #16
/*
 * Set-up for the final, overlapping 16-byte step.
 *
 * x_tmp = x_src - x_src_end is the distance x_src has moved past the
 * biased end pointer. Both destination pointers are moved back by that
 * distance, and a 16-byte lane mask is loaded from
 * (const_tbl + 16 - x_tmp).
 *
 * Assuming const_tbl labels the 32-byte table of sixteen 0x00 bytes
 * followed by sixteen 0xff bytes, and 0 <= x_tmp <= 16, the mask in q_tmp
 * has its first x_tmp bytes clear and its remaining 16 - x_tmp bytes set.
 */
	sub	x_tmp, x_src, x_src_end

	/* Step both destinations back by the overshoot. */
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp

	/*
	 * Form the address of const_tbl PC-relatively (adrp for the 4 KiB
	 * page, :lo12: add for the offset within it) rather than with
	 * "ldr x_const, =const_tbl". The ldr pseudo-instruction fetches an
	 * absolute address from a literal pool emitted into the code
	 * section, which requires a run-time relocation inside that section
	 * when this object is linked into a shared library or PIE.
	 * The :lo12: operator is ELF-specific, matching the ELF-only
	 * ".type ..., %function" directive already used by this file.
	 */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl

	/* q_tmp = 16 bytes at const_tbl + 16 - x_tmp */
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]
/*
 * Masked 16-byte step. Unlike the unmasked step, the two table lookups
 * for each destination are XORed together first, ANDed with the byte mask
 * in v_tmp, and only then XORed into the accumulator. Lanes whose mask
 * byte is 0x00 therefore leave v_d0 / v_d1 unchanged; lanes whose mask
 * byte is 0xff receive the full update.
 * (v_tmp is presumably the vector view of the q_tmp mask register.)
 * NOTE(review): the loads of v_data / v_d0 / v_d1 and the stores of the
 * results are not among these lines - confirm they surround this sequence.
 */
371 and v_data_lo.16b, v_data.16b, v_mask0f.16b
372 ushr v_data_hi.16b, v_data.16b, #4
/* v_d0 ^= (gft1_lo[lo nibble] ^ gft1_hi[hi nibble]) & mask */
374 tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
375 tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
376 eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
377 and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
378 eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
/* v_d1 ^= (gft2_lo[lo nibble] ^ gft2_hi[hi nibble]) & mask */
380 tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
381 tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
382 eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
383 and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
384 eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
/*
 * 32 bytes of lane-mask data: sixteen 0x00 bytes followed by sixteen 0xff
 * bytes. A 16-byte load that starts k bytes into this table (0 <= k <= 16)
 * yields 16 - k zero bytes followed by k 0xff bytes.
 * (Presumably this is the data addressed as const_tbl; the label itself is
 * not among these lines.)
 */
400 .dword 0x0000000000000000, 0x0000000000000000
401 .dword 0xffffffffffffffff, 0xffffffffffffffff