/**********************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
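
/*
 * gf_vect_mul_neon: multiply each byte of a region by a GF(2^8)
 * constant using NEON nibble table lookups.
 *
 * Register interface (a summary; argument order follows the isa-l
 * gf_vect_mul() C prototype):
 *   x0 (x_len)  - length in bytes; must be >= 32 and a multiple of 32
 *   x1 (x_tbl)  - 32-byte table built by gf_vect_mul_init(): 16 products
 *                 for the low-nibble values followed by 16 products for
 *                 the high-nibble values
 *   x2 (x_src)  - source buffer
 *   x3 (x_dest) - destination buffer
 * Returns 0 in w0 on success, 1 on a bad length.
 *
 * Typical use from C (a sketch, assuming the isa-l public API):
 *   unsigned char gftbl[32];
 *   gf_vect_mul_init(c, gftbl);          // expand constant c into the table
 *   gf_vect_mul(len, gftbl, src, dest);  // selects this routine on aarch64
 */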

.text

.global gf_vect_mul_neon
.type gf_vect_mul_neon, %function


/* arguments */
x_len           .req    x0
x_tbl           .req    x1
x_src           .req    x2
x_dest          .req    x3

/* returns */
w_ret           .req    w0

/* local variables */
x_dest1         .req    x_dest
x_src_end       .req    x4
x_tmp           .req    x5

/* vectors */
v_mask0f        .req    v0

v_gft1_lo       .req    v2
v_gft1_hi       .req    v3
q_gft1_lo       .req    q2
q_gft1_hi       .req    q3

v_data_0        .req    v16
v_data_1        .req    v17
v_data_2        .req    v18
v_data_3        .req    v19
v_data_4        .req    v20
v_data_5        .req    v21
v_data_6        .req    v22
v_data_7        .req    v23
q_data_0        .req    q16
q_data_1        .req    q17
q_data_2        .req    q18
q_data_3        .req    q19
q_data_4        .req    q20
q_data_5        .req    q21
q_data_6        .req    q22
q_data_7        .req    q23

v_data_0_lo     .req    v24
v_data_1_lo     .req    v25
v_data_2_lo     .req    v26
v_data_3_lo     .req    v27
v_data_4_lo     .req    v28
v_data_5_lo     .req    v29
v_data_6_lo     .req    v30
v_data_7_lo     .req    v31
v_data_0_hi     .req    v_data_0
v_data_1_hi     .req    v_data_1
v_data_2_hi     .req    v_data_2
v_data_3_hi     .req    v_data_3
v_data_4_hi     .req    v_data_4
v_data_5_hi     .req    v_data_5
v_data_6_hi     .req    v_data_6
v_data_7_hi     .req    v_data_7


gf_vect_mul_neon:
        /* less than 32 bytes, return_fail */
        cmp     x_len, #32
        blt     .return_fail

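        /* 0x0f mask for nibble extraction; the two 16-byte lookup
         * tables hold the constant's products for low/high nibbles */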
        movi    v_mask0f.16b, #0x0f
        add     x_src_end, x_src, x_len
        ldr     q_gft1_lo, [x_tbl]
        ldr     q_gft1_hi, [x_tbl, #16]

.Lloop128_init:
        /* less than 128 bytes, goto .Lloop32_init */
        cmp     x_len, #128
        blt     .Lloop32_init

        /* save d8 ~ d15 to stack (callee-saved per AAPCS64) */
        sub     sp, sp, #64
        stp     d8, d9, [sp]
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]

        sub     x_src_end, x_src_end, #128

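        /*
         * Main loop: 128 bytes (8 x 16B vectors) per iteration. Each
         * byte b becomes gft1_lo[b & 0xf] ^ gft1_hi[b >> 4]; xor is
         * addition in GF(2^8), so this is the constant multiply.
         */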
.Lloop128:
        ldr     q_data_0, [x_src, #16*0]
        ldr     q_data_1, [x_src, #16*1]
        ldr     q_data_2, [x_src, #16*2]
        ldr     q_data_3, [x_src, #16*3]
        ldr     q_data_4, [x_src, #16*4]
        ldr     q_data_5, [x_src, #16*5]
        ldr     q_data_6, [x_src, #16*6]
        ldr     q_data_7, [x_src, #16*7]

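        /* isolate the low nibble of each byte */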
        and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and     v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and     v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
        and     v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
        and     v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
        and     v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
        and     v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

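        /* shift the high nibble down to bits 3:0 */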
        ushr    v_data_0_hi.16b, v_data_0.16b, #4
        ushr    v_data_1_hi.16b, v_data_1.16b, #4
        ushr    v_data_2_hi.16b, v_data_2.16b, #4
        ushr    v_data_3_hi.16b, v_data_3.16b, #4
        ushr    v_data_4_hi.16b, v_data_4.16b, #4
        ushr    v_data_5_hi.16b, v_data_5.16b, #4
        ushr    v_data_6_hi.16b, v_data_6.16b, #4
        ushr    v_data_7_hi.16b, v_data_7.16b, #4

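        /* table lookup: partial products for the low nibbles */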
        tbl     v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl     v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl     v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl     v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl     v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
        tbl     v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
        tbl     v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
        tbl     v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

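        /* table lookup: partial products for the high nibbles */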
        tbl     v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        tbl     v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        tbl     v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        tbl     v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
        tbl     v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
        tbl     v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
        tbl     v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
        tbl     v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

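        /* xor the partial products (GF(2^8) addition) */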
        eor     v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
        eor     v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
        eor     v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b
        eor     v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b
        eor     v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b
        eor     v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b
        eor     v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b
        eor     v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b

        str     q_data_0, [x_dest1, #16*0]
        str     q_data_1, [x_dest1, #16*1]
        str     q_data_2, [x_dest1, #16*2]
        str     q_data_3, [x_dest1, #16*3]
        str     q_data_4, [x_dest1, #16*4]
        str     q_data_5, [x_dest1, #16*5]
        str     q_data_6, [x_dest1, #16*6]
        str     q_data_7, [x_dest1, #16*7]

        add     x_src, x_src, #128
        add     x_dest1, x_dest1, #128
        cmp     x_src, x_src_end
        bls     .Lloop128

.Lloop128_end:
        /* restore d8 ~ d15 */
        ldp     d8, d9, [sp]
        ldp     d10, d11, [sp, #16]
        ldp     d12, d13, [sp, #32]
        ldp     d14, d15, [sp, #48]
        add     sp, sp, #64
        add     x_src_end, x_src_end, #128

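        /* tail loop: 32 bytes per iteration over whatever the 128-byte
         * loop left behind (0..127 bytes) */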
.Lloop32_init:
        sub     x_src_end, x_src_end, #32
        cmp     x_src, x_src_end
        /* nothing (or a sub-32-byte tail) is left: skip the loop and
         * run the final length check */
        bhi     .Lloop32_end

.Lloop32:
        ldr     q_data_0, [x_src, #16*0]
        ldr     q_data_1, [x_src, #16*1]

        and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        ushr    v_data_0_hi.16b, v_data_0.16b, #4
        ushr    v_data_1_hi.16b, v_data_1.16b, #4
        tbl     v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl     v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl     v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        tbl     v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        eor     v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
        eor     v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
        str     q_data_0, [x_dest1, #16*0]
        str     q_data_1, [x_dest1, #16*1]

        add     x_dest1, x_dest1, #32
        add     x_src, x_src, #32
        cmp     x_src, x_src_end
        bls     .Lloop32

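        /* pass iff the source was consumed exactly: x_src stops 32 bytes
         * past the adjusted end only when len is a multiple of 32 */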
.Lloop32_end:
        sub     x_tmp, x_src, x_src_end
        cmp     x_tmp, #32
        beq     .return_pass
        b       .return_fail

.return_pass:
        mov     w_ret, #0
        ret

.return_fail:
        mov     w_ret, #1
        ret