1 /**************************************************************
2 Copyright (c) 2019 Huawei Technologies Co., Ltd.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 * Neither the name of Huawei Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
/* Export gf_vect_mad_neon: GF(2^8) vector multiply-and-add (dest ^= gftbl * src)
 * implemented with NEON 4-bit table lookups. Marked %function for ELF symbol
 * typing. NOTE(review): the function label and most .req register aliases
 * (x_src, x_dest1, x_tbl, x_len, x_vec_i, v_data_N, v_gft1_lo/hi, ...) are
 * defined outside this chunk. */
31 .global gf_vect_mad_neon
32 .type gf_vect_mad_neon, %function
/* High-nibble views alias the data registers themselves: the `ushr #4` below
 * overwrites v_data_N in place once the low nibble has been copied out to the
 * (separately defined) v_data_N_lo registers. */
92 v_data_0_hi .req v_data_0
93 v_data_1_hi .req v_data_1
94 v_data_2_hi .req v_data_2
95 v_data_3_hi .req v_data_3
96 v_data_4_hi .req v_data_4
97 v_data_5_hi .req v_data_5
98 v_data_6_hi .req v_data_6
99 v_data_7_hi .req v_data_7
/* In the 16-byte and tail paths only one data vector is live, so the unused
 * destination accumulators d1_2/d1_3 are reused as nibble scratch. */
120 v_data_lo .req v_d1_2
121 v_data_hi .req v_d1_3
/* Entry guard: lengths under 16 bytes cannot be processed with full q-register
 * loads, so they fail out (the compare/branch itself is not in this chunk). */
125 /* less than 16 bytes, return_fail */
/* Broadcast 0x0f to all 16 lanes — used to isolate the low nibble of each
 * source byte before the table lookup. */
129 movi v_mask0f.16b, #0x0f
/* Each vector's GF multiply table entry is 32 bytes (16-byte low-nibble table
 * + 16-byte high-nibble table), hence index << 5. */
130 lsl x_vec_i, x_vec_i, #5
131 add x_tbl, x_tbl, x_vec_i
/* x_src_end = one past the last source byte; loop bounds are derived from it. */
132 add x_src_end, x_src, x_len
/* Load the two 16-entry lookup tables once; they are invariant for the whole
 * call (one constant multiplier per call). */
134 ldr q_gft1_lo, [x_tbl]
135 ldr q_gft1_hi, [x_tbl, #16]
138 /* less than 128 bytes, goto Lloop16_init */
/* AAPCS64 requires the low 64 bits of v8-v15 to be preserved; the main loop
 * uses them, so spill here. NOTE(review): the matching
 * `stp d8, d9, [sp, #-64]!` (or equivalent sp allocation) is outside this
 * chunk — these stores assume sp was already adjusted by 64. */
142 /* save d8 ~ d15 to stack */
145 stp d10, d11, [sp, #16]
146 stp d12, d13, [sp, #32]
147 stp d14, d15, [sp, #48]
/* Bias the end pointer by the loop stride so the (not shown) loop condition
 * can compare x_src directly against x_src_end. */
149 sub x_src_end, x_src_end, #128
/* Main loop body: 128 bytes (8 x 16-byte vectors) per iteration.
 * For each source byte b with multiplier c (encoded in the tables):
 *   product = gft1_lo[b & 0x0f] ^ gft1_hi[b >> 4]
 * and the destination is updated as dest ^= product (multiply-AND-ADD).
 * NOTE(review): the loop label and backward branch are outside this chunk. */
152 ldr q_data_0, [x_src, #16*0]
153 ldr q_data_1, [x_src, #16*1]
154 ldr q_data_2, [x_src, #16*2]
155 ldr q_data_3, [x_src, #16*3]
156 ldr q_data_4, [x_src, #16*4]
157 ldr q_data_5, [x_src, #16*5]
158 ldr q_data_6, [x_src, #16*6]
159 ldr q_data_7, [x_src, #16*7]
/* Load the current destination contents — this is a read-modify-write update. */
161 ldr q_d1_0, [x_dest1, #16*0]
162 ldr q_d1_1, [x_dest1, #16*1]
163 ldr q_d1_2, [x_dest1, #16*2]
164 ldr q_d1_3, [x_dest1, #16*3]
165 ldr q_d1_4, [x_dest1, #16*4]
166 ldr q_d1_5, [x_dest1, #16*5]
167 ldr q_d1_6, [x_dest1, #16*6]
168 ldr q_d1_7, [x_dest1, #16*7]
/* Split every byte into nibbles: low nibble copied out via AND ... */
170 and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
171 and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
172 and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
173 and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
174 and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
175 and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
176 and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
177 and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
/* ... high nibble shifted down in place (v_data_N_hi aliases v_data_N, so the
 * original bytes are destroyed here — the ANDs above must come first). */
179 ushr v_data_0_hi.16b, v_data_0.16b, #4
180 ushr v_data_1_hi.16b, v_data_1.16b, #4
181 ushr v_data_2_hi.16b, v_data_2.16b, #4
182 ushr v_data_3_hi.16b, v_data_3.16b, #4
183 ushr v_data_4_hi.16b, v_data_4.16b, #4
184 ushr v_data_5_hi.16b, v_data_5.16b, #4
185 ushr v_data_6_hi.16b, v_data_6.16b, #4
186 ushr v_data_7_hi.16b, v_data_7.16b, #4
/* Per-byte GF(2^8) multiply: tbl indexes the 16-entry partial-product tables
 * with each nibble; vectors are handled in pairs to overlap tbl latency. */
188 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
189 tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
190 tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
191 tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
/* Accumulate: d1 ^= lo_part ^ hi_part (XOR is GF(2^8) addition). */
193 eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
194 eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
195 eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b
196 eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b
198 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
199 tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
200 tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
201 tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
203 eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b
204 eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
205 eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b
206 eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b
208 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
209 tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
210 tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
211 tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
213 eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b
214 eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
215 eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b
216 eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b
218 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
219 tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
220 tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
221 tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
223 eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b
224 eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
225 eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b
226 eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b
/* Write the 128 updated destination bytes back. */
228 str q_d1_0, [x_dest1, #16*0]
229 str q_d1_1, [x_dest1, #16*1]
230 str q_d1_2, [x_dest1, #16*2]
231 str q_d1_3, [x_dest1, #16*3]
232 str q_d1_4, [x_dest1, #16*4]
233 str q_d1_5, [x_dest1, #16*5]
234 str q_d1_6, [x_dest1, #16*6]
235 str q_d1_7, [x_dest1, #16*7]
/* Advance both cursors by one full stride; the loop-back compare against the
 * pre-biased x_src_end (see setup) is not visible in this chunk. */
237 add x_src, x_src, #128
238 add x_dest1, x_dest1, #128
/* After the 128-byte loop: restore callee-saved v8-v15 (low halves) per
 * AAPCS64. NOTE(review): the matching `ldp d8, d9, [sp], #64` (or equivalent
 * sp deallocation) is outside this chunk. */
243 /* restore d8 ~ d15 */
245 ldp d10, d11, [sp, #16]
246 ldp d12, d13, [sp, #32]
247 ldp d14, d15, [sp, #48]
/* Undo the -128 bias from setup, then re-bias by the 16-byte stride so the
 * smaller loop can again compare x_src against x_src_end directly. */
249 add x_src_end, x_src_end, #128
252 sub x_src_end, x_src_end, #16
/* 16-byte loop body: same nibble-split / tbl / eor update as the main loop,
 * one vector at a time. NOTE(review): the `ldr q_data, [x_src]` load and the
 * loop label/branch are outside this chunk. */
258 ldr q_d1_0, [x_dest1]
260 and v_data_lo.16b, v_data.16b, v_mask0f.16b
261 ushr v_data_hi.16b, v_data.16b, #4
263 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
264 tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
/* dest ^= gft1_lo[lo_nibble] ^ gft1_hi[hi_nibble] */
265 eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
266 eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
268 str q_d1_0, [x_dest1]
270 add x_dest1, x_dest1, #16
271 add x_src, x_src, #16
/* Tail path: fewer than 16 bytes remain. Strategy: rewind the pointers so one
 * final full 16-byte vector op ends exactly at the buffer end, then mask off
 * the leading bytes that were already updated by the previous iteration. */
/* x_tmp = overshoot = 16 - remaining (x_src_end was biased by -16 above). */
276 sub x_tmp, x_src, x_src_end
/* Rewind dest so the final 16-byte window is flush with the end of the buffer.
 * NOTE(review): the matching rewind of x_src appears to be outside this chunk. */
282 sub x_dest1, x_dest1, x_tmp
/* Build the byte mask from the 32-byte constant table (16 x 0x00 then
 * 16 x 0xff): loading at const_tbl + 16 - x_tmp yields a vector whose first
 * x_tmp bytes are 0x00 (already-processed overlap — update suppressed) and
 * remaining bytes 0xff (new bytes — update applied). `=const_tbl` is a
 * literal-pool load of the table's address. */
284 ldr x_const, =const_tbl
285 sub x_const, x_const, x_tmp
286 ldr q_tmp, [x_const, #16]
289 ldr q_d1_0, [x_dest1]
/* Same GF multiply as the main loops... */
291 and v_data_lo.16b, v_data.16b, v_mask0f.16b
292 ushr v_data_hi.16b, v_data.16b, #4
294 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
295 tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
296 eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
/* ... but the product is ANDed with the mask before accumulation, so bytes in
 * the overlap region XOR with zero and keep their prior value. */
297 and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b
298 eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
300 str q_d1_0, [x_dest1]
/* Mask source: 16 zero bytes followed by 16 all-ones bytes.
 * NOTE(review): the `const_tbl:` label (and any section/align directive) sits
 * outside this chunk, immediately before these data words. */
313 .dword 0x0000000000000000, 0x0000000000000000
314 .dword 0xffffffffffffffff, 0xffffffffffffffff