/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_vect_mad_neon
.type gf_vect_mad_neon, %function


/* arguments */
x_len           .req    x0
x_vec           .req    x1
x_vec_i         .req    x2
x_tbl           .req    x3
x_src           .req    x4
x_dest          .req    x5

/* returns */
w_ret           .req    w0

/* local variables */
x_src_end       .req    x6
x_dest1         .req    x_dest
x_tmp           .req    x7
x_const         .req    x8

/* vectors */
v_mask0f        .req    v0
v_tmp           .req    v1
q_tmp           .req    q1

v_tmp1_lo       .req    v2
v_tmp1_hi       .req    v3
v_tmp2_lo       .req    v4
v_tmp2_hi       .req    v5

v_gft1_lo       .req    v6
v_gft1_hi       .req    v7
q_gft1_lo       .req    q6
q_gft1_hi       .req    q7

v_data_0        .req    v8
v_data_1        .req    v9
v_data_2        .req    v10
v_data_3        .req    v11
v_data_4        .req    v12
v_data_5        .req    v13
v_data_6        .req    v14
v_data_7        .req    v15
q_data_0        .req    q8
q_data_1        .req    q9
q_data_2        .req    q10
q_data_3        .req    q11
q_data_4        .req    q12
q_data_5        .req    q13
q_data_6        .req    q14
q_data_7        .req    q15

v_data_0_lo     .req    v16
v_data_1_lo     .req    v17
v_data_2_lo     .req    v18
v_data_3_lo     .req    v19
v_data_4_lo     .req    v20
v_data_5_lo     .req    v21
v_data_6_lo     .req    v22
v_data_7_lo     .req    v23
v_data_0_hi     .req    v_data_0
v_data_1_hi     .req    v_data_1
v_data_2_hi     .req    v_data_2
v_data_3_hi     .req    v_data_3
v_data_4_hi     .req    v_data_4
v_data_5_hi     .req    v_data_5
v_data_6_hi     .req    v_data_6
v_data_7_hi     .req    v_data_7

v_d1_0          .req    v24
v_d1_1          .req    v25
v_d1_2          .req    v26
v_d1_3          .req    v27
v_d1_4          .req    v28
v_d1_5          .req    v29
v_d1_6          .req    v30
v_d1_7          .req    v31
q_d1_0          .req    q24
q_d1_1          .req    q25
q_d1_2          .req    q26
q_d1_3          .req    q27
q_d1_4          .req    q28
q_d1_5          .req    q29
q_d1_6          .req    q30
q_d1_7          .req    q31

v_data          .req    v_d1_1
q_data          .req    q_d1_1
v_data_lo       .req    v_d1_2
v_data_hi       .req    v_d1_3

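/*
 * GF(2^8) multiply-and-add over a single source vector:
 *   dest[i] ^= gf_mul(coef, src[i])   for i in [0, len)
 * implemented with two 16-byte nibble lookup tables and the NEON tbl
 * instruction.
 *
 * The register aliases above imply a C prototype of roughly
 *
 *   gf_vect_mad_neon(int len, int vec, int vec_i,
 *                    unsigned char *gftbls, unsigned char *src,
 *                    unsigned char *dest)
 *
 * (see the isa-l erasure_code headers for the authoritative
 * declaration). x_vec is not used in this single-destination variant;
 * w0 reports status: 0 on success, 1 when len < 16.
 */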
gf_vect_mad_neon:
        /* less than 16 bytes, return_fail */
        cmp     x_len, #16
        blt     .return_fail

        movi    v_mask0f.16b, #0x0f
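        /* Each source vector's multiplier occupies 32 bytes of gftbls
         * (16-byte low-nibble table + 16-byte high-nibble table), so
         * the table offset is vec_i * 32. */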
        lsl     x_vec_i, x_vec_i, #5
        add     x_tbl, x_tbl, x_vec_i
        add     x_src_end, x_src, x_len

        ldr     q_gft1_lo, [x_tbl]
        ldr     q_gft1_hi, [x_tbl, #16]

.Lloop128_init:
        /* less than 128 bytes, goto Lloop16_init */
        cmp     x_len, #128
        blt     .Lloop16_init

        /* save d8 ~ d15 to stack */
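        /* AAPCS64 keeps the low 64 bits of v8-v15 callee-saved, and
         * the wide loop below clobbers q8-q15, so spill d8-d15 first. */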
        sub     sp, sp, #64
        stp     d8, d9, [sp]
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]

        sub     x_src_end, x_src_end, #128

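/*
 * Wide loop: 128 bytes (eight 16-byte vectors) per iteration.
 * Each byte b is multiplied in GF(2^8) as
 *   gft1_lo[b & 0x0f] ^ gft1_hi[b >> 4]
 * and XORed into the destination (multiply-and-add).
 */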
.Lloop128:
        ldr     q_data_0, [x_src, #16*0]
        ldr     q_data_1, [x_src, #16*1]
        ldr     q_data_2, [x_src, #16*2]
        ldr     q_data_3, [x_src, #16*3]
        ldr     q_data_4, [x_src, #16*4]
        ldr     q_data_5, [x_src, #16*5]
        ldr     q_data_6, [x_src, #16*6]
        ldr     q_data_7, [x_src, #16*7]

        ldr     q_d1_0, [x_dest1, #16*0]
        ldr     q_d1_1, [x_dest1, #16*1]
        ldr     q_d1_2, [x_dest1, #16*2]
        ldr     q_d1_3, [x_dest1, #16*3]
        ldr     q_d1_4, [x_dest1, #16*4]
        ldr     q_d1_5, [x_dest1, #16*5]
        ldr     q_d1_6, [x_dest1, #16*6]
        ldr     q_d1_7, [x_dest1, #16*7]

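        /* Split each data byte into nibbles: low halves land in
         * v16-v23, high halves are shifted down in place (v8-v15). */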
        and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and     v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and     v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
        and     v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
        and     v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
        and     v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
        and     v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

        ushr    v_data_0_hi.16b, v_data_0.16b, #4
        ushr    v_data_1_hi.16b, v_data_1.16b, #4
        ushr    v_data_2_hi.16b, v_data_2.16b, #4
        ushr    v_data_3_hi.16b, v_data_3.16b, #4
        ushr    v_data_4_hi.16b, v_data_4.16b, #4
        ushr    v_data_5_hi.16b, v_data_5.16b, #4
        ushr    v_data_6_hi.16b, v_data_6.16b, #4
        ushr    v_data_7_hi.16b, v_data_7.16b, #4

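        /* Look up both nibbles in the GF tables and XOR the two
         * partial products into the destination, two vectors at a
         * time. */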
        tbl     v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl     v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        tbl     v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl     v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b

        eor     v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
        eor     v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
        eor     v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b
        eor     v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b

        tbl     v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl     v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        tbl     v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl     v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b

        eor     v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b
        eor     v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
        eor     v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b
        eor     v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b

        tbl     v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
        tbl     v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
        tbl     v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
        tbl     v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b

        eor     v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b
        eor     v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
        eor     v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b
        eor     v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b

        tbl     v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
        tbl     v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
        tbl     v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
        tbl     v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

        eor     v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b
        eor     v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
        eor     v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b
        eor     v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b

        str     q_d1_0, [x_dest1, #16*0]
        str     q_d1_1, [x_dest1, #16*1]
        str     q_d1_2, [x_dest1, #16*2]
        str     q_d1_3, [x_dest1, #16*3]
        str     q_d1_4, [x_dest1, #16*4]
        str     q_d1_5, [x_dest1, #16*5]
        str     q_d1_6, [x_dest1, #16*6]
        str     q_d1_7, [x_dest1, #16*7]

        add     x_src, x_src, #128
        add     x_dest1, x_dest1, #128
        cmp     x_src, x_src_end
        bls     .Lloop128

.Lloop128_end:
        /* restore d8 ~ d15 */
        ldp     d8, d9, [sp]
        ldp     d10, d11, [sp, #16]
        ldp     d12, d13, [sp, #32]
        ldp     d14, d15, [sp, #48]
        add     sp, sp, #64
        add     x_src_end, x_src_end, #128

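/* Process the remaining full 16-byte blocks one vector at a time. */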
.Lloop16_init:
        sub     x_src_end, x_src_end, #16
        cmp     x_src, x_src_end
        bhi     .lessthan16_init

.Lloop16:
        ldr     q_data, [x_src]
        ldr     q_d1_0, [x_dest1]

        and     v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr    v_data_hi.16b, v_data.16b, #4

        tbl     v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl     v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor     v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
        eor     v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b

        str     q_d1_0, [x_dest1]

        add     x_dest1, x_dest1, #16
        add     x_src, x_src, #16
        cmp     x_src, x_src_end
        bls     .Lloop16

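/*
 * 1..15 trailing bytes: redo the last full 16-byte window ending at
 * the buffer's end, and mask the update so the overlap with bytes
 * already processed is left unchanged.
 */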
.lessthan16_init:
        sub     x_tmp, x_src, x_src_end
        cmp     x_tmp, #16
        beq     .return_pass

.lessthan16:
        mov     x_src, x_src_end
        sub     x_dest1, x_dest1, x_tmp

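        /* Build the byte mask from the sliding window in const_tbl:
         * x_tmp zero bytes (the already-done overlap) followed by
         * 0xff bytes covering the live tail. */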
        ldr     x_const, =const_tbl
        sub     x_const, x_const, x_tmp
        ldr     q_tmp, [x_const, #16]

        ldr     q_data, [x_src]
        ldr     q_d1_0, [x_dest1]

        and     v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr    v_data_hi.16b, v_data.16b, #4

        tbl     v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl     v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor     v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        and     v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b
        eor     v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b

        str     q_d1_0, [x_dest1]

.return_pass:
        mov     w_ret, #0
        ret

.return_fail:
        mov     w_ret, #1
        ret

.section .data
.balign 8
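/* 16 zero bytes followed by 16 0xff bytes: a 16-byte load from
 * const_tbl + 16 - x_tmp yields x_tmp zeros, then (16 - x_tmp) ones. */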
const_tbl:
        .dword 0x0000000000000000, 0x0000000000000000
        .dword 0xffffffffffffffff, 0xffffffffffffffff