1 /**************************************************************
2 Copyright (c) 2019 Huawei Technologies Co., Ltd.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 * Neither the name of Huawei Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
/* Make the entry point visible to the linker and tag the symbol as a function. */
31 .global gf_vect_dot_prod_neon
32 .type gf_vect_dot_prod_neon, %function
/*
 * High-nibble aliases.
 *
 * Each v_data_N_hi is a second name for the register already called
 * v_data_N (those base names are defined outside this excerpt), not a
 * separate register.  Writing through a _hi name therefore replaces the
 * value held under the plain name.
 *
 * In the 128-byte loop the low nibbles are masked out of v_data_N into
 * v_data_N_lo first, and only afterwards is v_data_N shifted right by 4
 * into v_data_N_hi.  That order matters because the shift overwrites the
 * loaded source bytes.
 */
84 v_data_0_hi .req v_data_0
85 v_data_1_hi .req v_data_1
86 v_data_2_hi .req v_data_2
87 v_data_3_hi .req v_data_3
88 v_data_4_hi .req v_data_4
89 v_data_5_hi .req v_data_5
90 v_data_6_hi .req v_data_6
91 v_data_7_hi .req v_data_7
118 gf_vect_dot_prod_neon:
119 /* less than 16 bytes, return_fail */
123 movi v_mask0f.16b, #0x0f
129 /* less than 128 bytes, goto Lloop16_init */
133 /* save d8 ~ d15 to stack */
136 stp d10, d11, [sp, #16]
137 stp d12, d13, [sp, #32]
138 stp d14, d15, [sp, #48]
140 sub x_len, x_len, #128
156 ldr x_ptr, [x_src, x_vec_i]
157 add x_vec_i, x_vec_i, #8
158 add x_ptr, x_ptr, x_pos
160 ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
162 ldp q_data_0, q_data_1, [x_ptr], #32
163 ldp q_data_2, q_data_3, [x_ptr], #32
164 ldp q_data_4, q_data_5, [x_ptr], #32
165 ldp q_data_6, q_data_7, [x_ptr]
167 prfm pldl1keep, [x_tbl1]
168 prfm pldl1strm, [x_ptr]
170 and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
171 and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
172 and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
173 and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
174 and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
175 and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
176 and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
177 and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
179 ushr v_data_0_hi.16b, v_data_0.16b, #4
180 ushr v_data_1_hi.16b, v_data_1.16b, #4
181 ushr v_data_2_hi.16b, v_data_2.16b, #4
182 ushr v_data_3_hi.16b, v_data_3.16b, #4
183 ushr v_data_4_hi.16b, v_data_4.16b, #4
184 ushr v_data_5_hi.16b, v_data_5.16b, #4
185 ushr v_data_6_hi.16b, v_data_6.16b, #4
186 ushr v_data_7_hi.16b, v_data_7.16b, #4
188 tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
189 tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
190 tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
191 tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
192 tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
193 tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
194 tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
195 tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
197 tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
198 tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
199 tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
200 tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
201 tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
202 tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
203 tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
204 tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
206 eor v_p0.16b, v_data_0_lo.16b, v_p0.16b
207 eor v_p0.16b, v_p0.16b, v_data_0_hi.16b
208 eor v_p1.16b, v_data_1_lo.16b, v_p1.16b
209 eor v_p1.16b, v_p1.16b, v_data_1_hi.16b
210 eor v_p2.16b, v_data_2_lo.16b, v_p2.16b
211 eor v_p2.16b, v_p2.16b, v_data_2_hi.16b
212 eor v_p3.16b, v_data_3_lo.16b, v_p3.16b
213 eor v_p3.16b, v_p3.16b, v_data_3_hi.16b
214 eor v_p4.16b, v_data_4_lo.16b, v_p4.16b
215 eor v_p4.16b, v_p4.16b, v_data_4_hi.16b
216 eor v_p5.16b, v_data_5_lo.16b, v_p5.16b
217 eor v_p5.16b, v_p5.16b, v_data_5_hi.16b
218 eor v_p6.16b, v_data_6_lo.16b, v_p6.16b
219 eor v_p6.16b, v_p6.16b, v_data_6_hi.16b
220 eor v_p7.16b, v_data_7_lo.16b, v_p7.16b
221 eor v_p7.16b, v_p7.16b, v_data_7_hi.16b
227 add x_ptr, x_dest1, x_pos
228 stp q_p0, q_p1, [x_ptr], #32
229 stp q_p2, q_p3, [x_ptr], #32
230 stp q_p4, q_p5, [x_ptr], #32
231 stp q_p6, q_p7, [x_ptr]
233 add x_pos, x_pos, #128
238 /* restore d8 ~ d15 */
240 ldp d10, d11, [sp, #16]
241 ldp d12, d13, [sp, #32]
242 ldp d14, d15, [sp, #48]
245 add x_len, x_len, #128
250 sub x_len, x_len, #16
260 ldr x_ptr, [x_src, x_vec_i]
261 ldr q_data, [x_ptr, x_pos]
262 add x_vec_i, x_vec_i, #8
264 ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
266 and v_data_lo.16b, v_data.16b, v_mask0f.16b
267 ushr v_data_hi.16b, v_data.16b, #4
269 tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
270 tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
271 eor v_p.16b, v_data_lo.16b, v_p.16b
272 eor v_p.16b, v_p.16b, v_data_hi.16b
278 str q_p, [x_dest1, x_pos]
279 add x_pos, x_pos, #16
284 sub x_tmp, x_pos, x_len