]>
Commit | Line | Data |
---|---|---|
f91f0fd5 TL |
1 | /************************************************************** |
2 | Copyright (c) 2019 Huawei Technologies Co., Ltd. | |
3 | ||
4 | Redistribution and use in source and binary forms, with or without | |
5 | modification, are permitted provided that the following conditions | |
6 | are met: | |
7 | * Redistributions of source code must retain the above copyright | |
8 | notice, this list of conditions and the following disclaimer. | |
9 | * Redistributions in binary form must reproduce the above copyright | |
10 | notice, this list of conditions and the following disclaimer in | |
11 | the documentation and/or other materials provided with the | |
12 | distribution. | |
13 | * Neither the name of Huawei Corporation nor the names of its | |
14 | contributors may be used to endorse or promote products derived | |
15 | from this software without specific prior written permission. | |
16 | ||
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | **********************************************************************/ | |
29 | ||
.text

	.globl	gf_vect_mul_neon
	.type	gf_vect_mul_neon, %function

/*
 * Symbolic register names for gf_vect_mul_neon.
 * Each alias is bound directly to its physical register.
 */

/* incoming arguments */
x_len		.req	x0	/* number of bytes to process */
x_tbl		.req	x1	/* base of the two 16-byte lookup tables */
x_src		.req	x2	/* read cursor */
x_dest		.req	x3	/* write cursor */

/* return value (overlays x_len) */
w_ret		.req	w0

/* scratch general-purpose registers */
x_dest1		.req	x3	/* same register as x_dest */
x_src_end	.req	x4
x_tmp		.req	x5

/* constant vector: holds the low-nibble mask */
v_mask0f	.req	v0

/* lookup tables: low-nibble table and high-nibble table, v and q views */
v_gft1_lo	.req	v2
q_gft1_lo	.req	q2
v_gft1_hi	.req	v3
q_gft1_hi	.req	q3

/* eight data vectors, v and q views of the same registers */
v_data_0	.req	v16
q_data_0	.req	q16
v_data_1	.req	v17
q_data_1	.req	q17
v_data_2	.req	v18
q_data_2	.req	q18
v_data_3	.req	v19
q_data_3	.req	q19
v_data_4	.req	v20
q_data_4	.req	q20
v_data_5	.req	v21
q_data_5	.req	q21
v_data_6	.req	v22
q_data_6	.req	q22
v_data_7	.req	v23
q_data_7	.req	q23

/* low-nibble work registers, one per data vector */
v_data_0_lo	.req	v24
v_data_1_lo	.req	v25
v_data_2_lo	.req	v26
v_data_3_lo	.req	v27
v_data_4_lo	.req	v28
v_data_5_lo	.req	v29
v_data_6_lo	.req	v30
v_data_7_lo	.req	v31

/* high-nibble work registers reuse the data registers in place */
v_data_0_hi	.req	v16
v_data_1_hi	.req	v17
v_data_2_hi	.req	v18
v_data_3_hi	.req	v19
v_data_4_hi	.req	v20
v_data_5_hi	.req	v21
v_data_6_hi	.req	v22
v_data_7_hi	.req	v23


91 | ||
92 | ||
/*
 * gf_vect_mul_neon
 *
 * For each source byte, looks up its low nibble in the 16-byte table at
 * [x_tbl] and its high nibble in the 16-byte table at [x_tbl, #16], XORs
 * the two results and stores the byte to the destination.
 *
 * In:    x0 (x_len)   number of bytes
 *        x1 (x_tbl)   32 bytes: low-nibble table followed by high-nibble table
 *        x2 (x_src)   source buffer
 *        x3 (x_dest)  destination buffer
 * Out:   w0 = 0 when x_len >= 32 and every byte was processed
 *             (x_len is a multiple of 32);
 *        w0 = 1 when x_len < 32 (nothing written) or when 1..31 trailing
 *             bytes remain (all whole 32-byte blocks before them have
 *             already been written).
 * Clobb: x0, x2-x5, v0, v2, v3, v16-v31, flags.  No callee-saved register
 *        (x19-x28, v8-v15) is touched, so nothing is spilled to the stack.
 *
 * NOTE(review): x_len is compared as a signed 64-bit value; this assumes
 * the caller passes the length extended to 64 bits - confirm against the
 * C prototype.
 */
gf_vect_mul_neon:
	/* less than 32 bytes, return_fail */
	cmp	x_len, #32
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	add	x_src_end, x_src, x_len		/* one past the last source byte */
	ldr	q_gft1_lo, [x_tbl]
	ldr	q_gft1_hi, [x_tbl, #16]


.Lloop128_init:
	/* less than 128 bytes, goto .Lloop32_init */
	cmp	x_len, #128
	blt	.Lloop32_init

	/* bias the end pointer: x_src <= x_src_end <=> a whole 128-byte block remains */
	sub	x_src_end, x_src_end, #128

.Lloop128:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	ldr	q_data_4, [x_src, #16*4]
	ldr	q_data_5, [x_src, #16*5]
	ldr	q_data_6, [x_src, #16*6]
	ldr	q_data_7, [x_src, #16*7]

	/* low nibbles -> *_lo */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	and	v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
	and	v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
	and	v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
	and	v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

	/* high nibbles, in place (*_hi aliases the data register) */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4
	ushr	v_data_4_hi.16b, v_data_4.16b, #4
	ushr	v_data_5_hi.16b, v_data_5.16b, #4
	ushr	v_data_6_hi.16b, v_data_6.16b, #4
	ushr	v_data_7_hi.16b, v_data_7.16b, #4

	/* table lookup for the low nibbles */
	tbl	v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
	tbl	v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
	tbl	v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
	tbl	v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

	/* table lookup for the high nibbles */
	tbl	v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	tbl	v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	tbl	v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	tbl	v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	tbl	v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
	tbl	v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
	tbl	v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
	tbl	v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

	/* combine the two partial results */
	eor	v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
	eor	v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
	eor	v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b
	eor	v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b
	eor	v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b
	eor	v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b
	eor	v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b
	eor	v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b

	str	q_data_0, [x_dest1, #16*0]
	str	q_data_1, [x_dest1, #16*1]
	str	q_data_2, [x_dest1, #16*2]
	str	q_data_3, [x_dest1, #16*3]
	str	q_data_4, [x_dest1, #16*4]
	str	q_data_5, [x_dest1, #16*5]
	str	q_data_6, [x_dest1, #16*6]
	str	q_data_7, [x_dest1, #16*7]

	add	x_src, x_src, #128
	add	x_dest1, x_dest1, #128
	cmp	x_src, x_src_end
	bls	.Lloop128

.Lloop128_end:
	/* undo the 128-byte bias: x_src_end is the true end again */
	add	x_src_end, x_src_end, #128

.Lloop32_init:
	/* bias the end pointer: x_src <= x_src_end <=> a whole 32-byte block remains */
	sub	x_src_end, x_src_end, #32
	cmp	x_src, x_src_end
	/*
	 * No whole 32-byte block left.  This happens only after the 128-byte
	 * loop; let the remainder check decide, so that a length that is an
	 * exact multiple of 128 reports success instead of failure.
	 */
	bhi	.Lloop32_end

.Lloop32:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]

	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	tbl	v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	tbl	v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
	eor	v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
	str	q_data_0, [x_dest1, #16*0]
	str	q_data_1, [x_dest1, #16*1]

	add	x_dest1, x_dest1, #32
	add	x_src, x_src, #32
	cmp	x_src, x_src_end
	bls	.Lloop32

.Lloop32_end:
	/* x_src_end is (true end - 32), so a difference of 32 means no bytes are left over */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #32
	bne	.return_fail

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret