/*
 * Retrieved from git.proxmox.com, ceph.git ("Import ceph 15.2.8"):
 * ceph/src/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S
 */
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
/*
 * Section setup, symbol export and register aliases for
 * gf_vect_dot_prod_neon (AArch64, GNU as syntax, AdvSIMD).
 *
 * The alias names below are referenced by the function body; do not rename
 * them without updating every use.
 */
	.text

	.global	gf_vect_dot_prod_neon
	.type	gf_vect_dot_prod_neon, %function

/* arguments */
x_len		.req	x0	/* byte count; compared against 16 and 128 */
x_vec		.req	x1	/* number of sources; scaled by 8 into a byte bound */
x_tbl		.req	x2	/* table base; 32 bytes are consumed per source */
x_src		.req	x3	/* array of 8-byte source pointers */
x_dest1		.req	x4	/* output buffer */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail */

/* local variables */
x_vec_i		.req	x5	/* byte offset into the x_src pointer array */
x_ptr		.req	x6	/* current source / destination pointer */
x_pos		.req	x7	/* byte offset of the chunk being processed */
x_tmp		.req	x8	/* scratch */
x_tbl1		.req	x9	/* running table pointer, reset for every chunk */

/* vectors */

/* current source's two 16-byte lookup tables and the low-nibble mask */
v_gft1_lo	.req	v0
v_gft1_hi	.req	v1
q_gft1_lo	.req	q0
q_gft1_hi	.req	q1
v_mask0f	.req	v2
q_mask0f	.req	q2

/*
 * 128-byte path: eight 16-byte lanes of input data.
 * These live in v8-v15, which is why the function spills d8-d15 before
 * entering the 128-byte loop.
 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
v_data_4	.req	v12
v_data_5	.req	v13
v_data_6	.req	v14
v_data_7	.req	v15
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11
q_data_4	.req	q12
q_data_5	.req	q13
q_data_6	.req	q14
q_data_7	.req	q15

/* low-nibble indices / low-table lookups get their own registers ... */
v_data_0_lo	.req	v16
v_data_1_lo	.req	v17
v_data_2_lo	.req	v18
v_data_3_lo	.req	v19
v_data_4_lo	.req	v20
v_data_5_lo	.req	v21
v_data_6_lo	.req	v22
v_data_7_lo	.req	v23
/* ... while the high-nibble values overwrite the data registers in place */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3
v_data_4_hi	.req	v_data_4
v_data_5_hi	.req	v_data_5
v_data_6_hi	.req	v_data_6
v_data_7_hi	.req	v_data_7

/* 128-byte path: eight 16-byte accumulators (one per lane) */
v_p0		.req	v24
v_p1		.req	v25
v_p2		.req	v26
v_p3		.req	v27
v_p4		.req	v28
v_p5		.req	v29
v_p6		.req	v30
v_p7		.req	v31
q_p0		.req	q24
q_p1		.req	q25
q_p2		.req	q26
q_p3		.req	q27
q_p4		.req	q28
q_p5		.req	q29
q_p6		.req	q30
q_p7		.req	q31

/* 16-byte path: reuses the first four accumulator registers */
v_p		.req	v_p0
q_p		.req	q_p0
v_data		.req	v_p1
q_data		.req	q_p1
v_data_lo	.req	v_p2
v_data_hi	.req	v_p3

/*
 * gf_vect_dot_prod_neon
 *
 * For every byte position i in [0, len) computes
 *     dest[i] = XOR over j in [0, vec) of
 *               tbl[32*j + (src[j][i] & 0x0f)] ^ tbl[32*j + 16 + (src[j][i] >> 4)]
 * i.e. a nibble-table lookup per source byte, accumulated with XOR
 * (presumably a GF(2^8) dot product, per the function name).
 *
 * In:    x0 = len   byte count, must be >= 16
 *        x1 = vec   number of source buffers; the per-chunk loop always runs
 *                   at least once, so vec >= 1 is assumed
 *        x2 = tbl   32 bytes of lookup table per source buffer
 *        x3 = src   array of vec pointers to the source buffers
 *        x4 = dest  output buffer of len bytes
 * Out:   w0 = 0 on success, 1 if len < 16 (nothing is written in that case)
 * Clobb: x0, x1, x5-x9, v0-v2, v16-v31, flags
 *        (d8-d15 are used by the 128-byte path but saved and restored)
 * Stack: 64 bytes while the 128-byte path is active
 *
 * Processing order: 128-byte chunks, then 16-byte chunks, then -- if len is
 * not a multiple of 16 -- one final 16-byte chunk at offset len-16 that
 * recomputes some bytes that were already written.
 */
gf_vect_dot_prod_neon:
	/*
	 * NOTE(review): the C prototype is not visible in this file; ISA-L
	 * declares len and vlen as 'int' -- confirm against the caller.  For
	 * 32-bit arguments AAPCS64 leaves bits 63:32 of the register
	 * unspecified, so normalise both to 64-bit signed values before they
	 * are used in 64-bit compares and address arithmetic.
	 */
	sxtw	x_len, w0
	sxtw	x_vec, w1

	/* less than 16 bytes (or negative): fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f
	mov	x_pos, #0

	/* x_vec = vec * 8: end bound, in bytes, for the src pointer array */
	lsl	x_vec, x_vec, #3

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp	x_len, #128
	blt	.Lloop16_init

	/* v8-v15 are used below; their low halves (d8-d15) must be preserved */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias x_len so that "x_pos <= x_len" means a full 128-byte chunk remains */
	sub	x_len, x_len, #128

.Lloop128:
	/* clear the eight accumulators for this chunk */
	movi	v_p0.16b, #0
	movi	v_p1.16b, #0
	movi	v_p2.16b, #0
	movi	v_p3.16b, #0
	movi	v_p4.16b, #0
	movi	v_p5.16b, #0
	movi	v_p6.16b, #0
	movi	v_p7.16b, #0

	mov	x_tbl1, x_tbl
	mov	x_vec_i, #0

.Lloop128_vects:
	/* x_ptr = src[j] + x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	/* low-/high-nibble tables of this source; advance to the next pair */
	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32

	/* 128 bytes of source data */
	ldp	q_data_0, q_data_1, [x_ptr], #32
	ldp	q_data_2, q_data_3, [x_ptr], #32
	ldp	q_data_4, q_data_5, [x_ptr], #32
	ldp	q_data_6, q_data_7, [x_ptr]

	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1strm, [x_ptr]

	/* low nibble of every byte */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	and	v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
	and	v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
	and	v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
	and	v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

	/* high nibble of every byte (in place: *_hi aliases the data register) */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4
	ushr	v_data_4_hi.16b, v_data_4.16b, #4
	ushr	v_data_5_hi.16b, v_data_5.16b, #4
	ushr	v_data_6_hi.16b, v_data_6.16b, #4
	ushr	v_data_7_hi.16b, v_data_7.16b, #4

	/* look the low nibbles up in the low table */
	tbl	v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
	tbl	v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
	tbl	v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
	tbl	v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

	/* look the high nibbles up in the high table */
	tbl	v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	tbl	v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	tbl	v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	tbl	v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	tbl	v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
	tbl	v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
	tbl	v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
	tbl	v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

	/* p_n ^= lo_n ^ hi_n */
	eor	v_p0.16b, v_data_0_lo.16b, v_p0.16b
	eor	v_p0.16b, v_p0.16b, v_data_0_hi.16b
	eor	v_p1.16b, v_data_1_lo.16b, v_p1.16b
	eor	v_p1.16b, v_p1.16b, v_data_1_hi.16b
	eor	v_p2.16b, v_data_2_lo.16b, v_p2.16b
	eor	v_p2.16b, v_p2.16b, v_data_2_hi.16b
	eor	v_p3.16b, v_data_3_lo.16b, v_p3.16b
	eor	v_p3.16b, v_p3.16b, v_data_3_hi.16b
	eor	v_p4.16b, v_data_4_lo.16b, v_p4.16b
	eor	v_p4.16b, v_p4.16b, v_data_4_hi.16b
	eor	v_p5.16b, v_data_5_lo.16b, v_p5.16b
	eor	v_p5.16b, v_p5.16b, v_data_5_hi.16b
	eor	v_p6.16b, v_data_6_lo.16b, v_p6.16b
	eor	v_p6.16b, v_p6.16b, v_data_6_hi.16b
	eor	v_p7.16b, v_data_7_lo.16b, v_p7.16b
	eor	v_p7.16b, v_p7.16b, v_data_7_hi.16b

	/* next source buffer */
	cmp	x_vec_i, x_vec
	blt	.Lloop128_vects

.Lloop128_vects_end:
	/* store the 128 accumulated bytes at dest + x_pos */
	add	x_ptr, x_dest1, x_pos
	stp	q_p0, q_p1, [x_ptr], #32
	stp	q_p2, q_p3, [x_ptr], #32
	stp	q_p4, q_p5, [x_ptr], #32
	stp	q_p6, q_p7, [x_ptr]

	add	x_pos, x_pos, #128
	cmp	x_pos, x_len		/* x_len is still biased by -128 here */
	ble	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the bias; done if the 128-byte loop consumed everything */
	add	x_len, x_len, #128
	cmp	x_pos, x_len
	beq	.Lreturn_pass

.Lloop16_init:
	/* bias x_len so that "x_pos <= x_len" means a full 16-byte chunk remains */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.Llessthan16_init

.Lloop16:
	movi	v_p.16b, #0
	mov	x_tbl1, x_tbl
	mov	x_vec_i, #0

.Lloop16_vects:
	/* 16 bytes from src[j] + x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	ldr	q_data, [x_ptr, x_pos]
	add	x_vec_i, x_vec_i, #8

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_p.16b, v_data_lo.16b, v_p.16b
	eor	v_p.16b, v_p.16b, v_data_hi.16b

	cmp	x_vec_i, x_vec
	blt	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p, [x_dest1, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len		/* x_len is biased by -16 here */
	ble	.Lloop16

.Lloop16_end:
	/* x_pos - (len - 16) == 16  <=>  x_pos == len: everything is written */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16_init:
	/*
	 * 1..15 bytes remain: run one more 16-byte chunk at offset len-16.
	 * Afterwards x_pos == len, so .Lloop16_end exits through .Lreturn_pass.
	 */
	mov	x_pos, x_len
	b	.Lloop16

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

	.size	gf_vect_dot_prod_neon, .-gf_vect_dot_prod_neon