]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | /********************************************************************** |
2 | Copyright(c) 2020 Arm Corporation All rights reserved. | |
3 | ||
4 | Redistribution and use in source and binary forms, with or without | |
5 | modification, are permitted provided that the following conditions | |
6 | are met: | |
7 | * Redistributions of source code must retain the above copyright | |
8 | notice, this list of conditions and the following disclaimer. | |
9 | * Redistributions in binary form must reproduce the above copyright | |
10 | notice, this list of conditions and the following disclaimer in | |
11 | the documentation and/or other materials provided with the | |
12 | distribution. | |
13 | * Neither the name of Arm Corporation nor the names of its | |
14 | contributors may be used to endorse or promote products derived | |
15 | from this software without specific prior written permission. | |
16 | ||
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | **********************************************************************/ | |
29 | ||
30 | .macro declare_generic_reg name:req, reg:req, default:req /* define three aliases for general-purpose register number reg: name (using the width prefix given in default), w_name (32-bit view) and x_name (64-bit view) */ | |
31 | \name .req \default\reg | |
32 | w_\name .req w\reg | |
33 | x_\name .req x\reg | |
34 | .endm | |
35 | ||
36 | .macro declare_neon_reg name:req, reg:req, default:req /* define five aliases for SIMD register number reg: name (using the prefix given in default) plus the v_, q_, d_ and s_ prefixed views of the same register */ | |
37 | \name .req \default\reg | |
38 | v_\name .req v\reg | |
39 | q_\name .req q\reg | |
40 | d_\name .req d\reg | |
41 | s_\name .req s\reg | |
42 | .endm | |
43 | ||
44 | /********************************************************************** | |
45 | variables: general-purpose register aliases (name, register number, default width). Several names intentionally share one register; they are used in different phases of the code below. | |
46 | **********************************************************************/ | |
47 | declare_generic_reg crc, 0,w | |
48 | declare_generic_reg buf, 1,x | |
49 | declare_generic_reg len, 2,x | |
50 | declare_generic_reg buf_saved, 3,x | |
51 | declare_generic_reg buf_iter, 4,x | |
52 | declare_generic_reg len_saved, 5,x | |
53 | declare_generic_reg buf_tmp, 6,x | |
54 | ||
55 | declare_generic_reg crc0, 7,x /* registers 7..13: working set of the 2048-byte mix macro */ | |
56 | declare_generic_reg crc1, 8,x | |
57 | declare_generic_reg crc2, 9,x | |
58 | declare_generic_reg pconst, 10,x | |
59 | declare_generic_reg data_crc0, 11,x | |
60 | declare_generic_reg data_crc1, 12,x | |
61 | declare_generic_reg data_crc2, 13,x | |
62 | ||
63 | declare_generic_reg size, 9,x /* registers 9..13 again, renamed for the tail handling: size shares register 9 with crc2 */ | |
64 | declare_generic_reg crc_tmp, 10,w /* shares register 10 with pconst */ | |
65 | declare_generic_reg size_tmp, 11,x /* size_tmp and data_tmp1 are the same register (11) */ | |
66 | declare_generic_reg data_tmp1, 11,x | |
67 | declare_generic_reg data_tmp2, 12,x | |
68 | declare_generic_reg data_tmp3, 13,x | |
69 | ||
70 | declare_generic_reg tmp, 14,x | |
71 | declare_generic_reg tmp1, 15,x | |
72 | ||
73 | // return value: ret_crc is register 0, the same register as the incoming crc | |
74 | declare_generic_reg ret_crc, 0,w | |
75 | ||
76 | /********************************************************************** | |
77 | simd variables: SIMD register aliases (name, register number, default prefix). Only registers 0..7 and 16..31 are named here. | |
78 | **********************************************************************/ | |
79 | declare_neon_reg a0, 0,v /* holds the current fold constant in the macros below */ | |
80 | declare_neon_reg a1, 1,v /* a1..a4: the four 128-bit folding accumulators */ | |
81 | declare_neon_reg a2, 2,v | |
82 | declare_neon_reg a3, 3,v | |
83 | declare_neon_reg a4, 4,v | |
84 | ||
85 | declare_neon_reg a5, 16,v /* a5..a8: scratch for the low-half products */ | |
86 | declare_neon_reg a6, 17,v | |
87 | declare_neon_reg a7, 18,v | |
88 | declare_neon_reg a8, 19,v | |
89 | ||
90 | declare_neon_reg y5, 20,v /* y5..y8: freshly loaded input data */ | |
91 | declare_neon_reg y6, 21,v | |
92 | declare_neon_reg y7, 22,v | |
93 | declare_neon_reg y8, 23,v | |
94 | ||
95 | declare_neon_reg neon_zero, 24,v /* neon_zero and neon_tmp are the same register (24) */ | |
96 | declare_neon_reg neon_tmp, 24,v | |
97 | ||
98 | declare_neon_reg k5k0, 25,v | |
99 | declare_neon_reg neon_tmp1, 26,v | |
100 | declare_neon_reg neon_tmp2, 27,v | |
101 | declare_neon_reg neon_tmp3, 28,v | |
102 | ||
103 | declare_neon_reg crc_pmull, 29,v | |
104 | declare_neon_reg neon_crc0, 30,v | |
105 | declare_neon_reg neon_crc1, 31,v | |
106 | ||
107 | declare_neon_reg neon_const0, 5,v | |
108 | declare_neon_reg neon_const1, 6,v | |
109 | declare_neon_reg neon_const2, 7,v | |
110 | ||
111 | // constants: byte offsets of the 16-byte entries read from the table at lanchor_crc32 (the k1k2 entry is read at offset 0; three entries are read starting at offset_crc32_const) | |
112 | .equ offset_k3k4, 16 | |
113 | .equ offset_k5k0, 32 | |
114 | .equ offset_poly, 48 | |
115 | .equ offset_crc32_const, 64 | |
116 | ||
117 | // pmull fold: one 64-byte folding step on v_a1..v_a4, interleaved with eight crc32_u64 steps (64 bytes) on each of three scalar streams | |
118 | .macro pmull_fold /* reads x_buf_tmp and the fold constant in v_a0; updates v_a1..v_a4 and w_crc0..w_crc2; overwrites v_a5..v_a8, v_y5..v_y8 and x_data_crc0..x_data_crc2. crc32_u64 is defined outside this view. */ | |
119 | ldr x_data_crc0, [x_buf_tmp, 464] /* the three scalar streams are read at 464, 976 and 1488 bytes past x_buf_tmp */ | |
120 | ldr x_data_crc1, [x_buf_tmp, 976] | |
121 | ldr x_data_crc2, [x_buf_tmp, 1488] | |
122 | ||
123 | pmull v_a5.1q, v_a1.1d, v_a0.1d /* a5..a8 = low 64 bits of a1..a4 times low 64 bits of a0 */ | |
124 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
125 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
126 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
127 | ||
128 | ldr x_data_crc0, [x_buf_tmp, 472] | |
129 | ldr x_data_crc1, [x_buf_tmp, 984] | |
130 | ldr x_data_crc2, [x_buf_tmp, 1496] | |
131 | ||
132 | pmull v_a6.1q, v_a2.1d, v_a0.1d | |
133 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
134 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
135 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
136 | ||
137 | ldr x_data_crc0, [x_buf_tmp, 480] | |
138 | ldr x_data_crc1, [x_buf_tmp, 992] | |
139 | ldr x_data_crc2, [x_buf_tmp, 1504] | |
140 | ||
141 | pmull v_a7.1q, v_a3.1d, v_a0.1d | |
142 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
143 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
144 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
145 | ||
146 | ldr x_data_crc0, [x_buf_tmp, 488] | |
147 | ldr x_data_crc1, [x_buf_tmp, 1000] | |
148 | ldr x_data_crc2, [x_buf_tmp, 1512] | |
149 | ||
150 | pmull v_a8.1q, v_a4.1d, v_a0.1d | |
151 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
152 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
153 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
154 | ||
155 | ldr x_data_crc0, [x_buf_tmp, 496] | |
156 | ldr x_data_crc1, [x_buf_tmp, 1008] | |
157 | ldr x_data_crc2, [x_buf_tmp, 1520] | |
158 | ||
159 | pmull2 v_a1.1q, v_a1.2d, v_a0.2d /* a1..a4 = high 64 bits of a1..a4 times high 64 bits of a0, written in place */ | |
160 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
161 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
162 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
163 | ||
164 | ld1 {v_y5.4s, v_y6.4s, v_y7.4s, v_y8.4s}, [x_buf_tmp] /* next 64 input bytes for the PMULL path */ | |
165 | ||
166 | ldr x_data_crc0, [x_buf_tmp, 504] | |
167 | ldr x_data_crc1, [x_buf_tmp, 1016] | |
168 | ldr x_data_crc2, [x_buf_tmp, 1528] | |
169 | ||
170 | pmull2 v_a2.1q, v_a2.2d, v_a0.2d | |
171 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
172 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
173 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
174 | ||
175 | pmull2 v_a3.1q, v_a3.2d, v_a0.2d | |
176 | pmull2 v_a4.1q, v_a4.2d, v_a0.2d | |
177 | ||
178 | eor v_y5.16b, v_y5.16b, v_a5.16b /* y = new data xor low-half product */ | |
179 | eor v_y6.16b, v_y6.16b, v_a6.16b | |
180 | eor v_y7.16b, v_y7.16b, v_a7.16b | |
181 | eor v_y8.16b, v_y8.16b, v_a8.16b | |
182 | ||
183 | ldr x_data_crc0, [x_buf_tmp, 512] | |
184 | ldr x_data_crc1, [x_buf_tmp, 1024] | |
185 | ldr x_data_crc2, [x_buf_tmp, 1536] | |
186 | ||
187 | eor v_a1.16b, v_y5.16b, v_a1.16b /* new accumulator = y xor high-half product */ | |
188 | eor v_a2.16b, v_y6.16b, v_a2.16b | |
189 | eor v_a3.16b, v_y7.16b, v_a3.16b | |
190 | eor v_a4.16b, v_y8.16b, v_a4.16b | |
191 | ||
192 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
193 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
194 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
195 | ||
196 | ldr x_data_crc0, [x_buf_tmp, 520] /* eighth and last 8-byte word of this step for each scalar stream */ | |
197 | ldr x_data_crc1, [x_buf_tmp, 1032] | |
198 | ldr x_data_crc2, [x_buf_tmp, 1544] | |
199 | ||
200 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
201 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
202 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
203 | .endm | |
204 | ||
205 | // crc32 mix for 2048 byte input data: bytes 0..511 of x_buf are folded with PMULL while bytes 512..1023, 1024..1535 and 1536..2047 run through three independent crc32_u64 streams; the four partial results are merged at the end | |
206 | .macro crc32_mix2048 /* in: w_crc (running crc), x_buf (block start); out: w_ret_crc, which is the same register as w_crc. Overwrites x_buf_tmp, x_crc0..x_crc2, x_pconst, x_data_crc0..x_data_crc2, x_tmp, x_tmp1 and every SIMD alias it names. Relies on lanchor_crc32, .lanchor_mask and crc32_u64, all defined outside this view. */ | |
207 | fmov s_a1, w_crc | |
208 | movi v_neon_tmp.4s, 0 | |
209 | ||
210 | adrp x_pconst, lanchor_crc32 | |
211 | add x_buf_tmp, x_buf, 64 /* pmull_fold reads relative to x_buf_tmp; the first 64 bytes are loaded directly below */ | |
212 | ||
213 | ldr x_data_crc0, [x_buf, 512] | |
214 | ldr x_data_crc1, [x_buf, 1024] | |
215 | ldr x_data_crc2, [x_buf, 1536] | |
216 | ||
217 | crc32_u64 w_crc0, wzr, x_data_crc0 /* the three scalar streams start from a zero crc state */ | |
218 | crc32_u64 w_crc1, wzr, x_data_crc1 | |
219 | crc32_u64 w_crc2, wzr, x_data_crc2 | |
220 | ||
221 | #ifdef CRC32 | |
222 | mvn v_a1.8b, v_a1.8b /* CRC32 build only: invert the incoming crc before use */ | |
223 | #endif | |
224 | ||
225 | ins v_neon_tmp.s[0], v_a1.s[0] /* neon_tmp = incoming crc in lane 0, all other lanes zero */ | |
226 | ||
227 | ld1 {v_a1.4s, v_a2.4s, v_a3.4s, v_a4.4s}, [x_buf] | |
228 | ||
229 | ldr x_data_crc0, [x_buf, 520] | |
230 | ldr x_data_crc1, [x_buf, 1032] | |
231 | ldr x_data_crc2, [x_buf, 1544] | |
232 | ||
233 | eor v_a1.16b, v_a1.16b, v_neon_tmp.16b /* xor the incoming crc into the first four data bytes */ | |
234 | ldr q_a0, [x_pconst, #:lo12:lanchor_crc32] // k1k2 | |
235 | ||
236 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
237 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
238 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
239 | ||
240 | // loop start, unroll the loop: seven fold steps, x_buf_tmp advancing 64 bytes between steps | |
241 | .align 4 | |
242 | pmull_fold | |
243 | ||
244 | add x_buf_tmp, x_buf_tmp, 64 | |
245 | pmull_fold | |
246 | ||
247 | add x_buf_tmp, x_buf_tmp, 64 | |
248 | pmull_fold | |
249 | ||
250 | add x_buf_tmp, x_buf_tmp, 64 | |
251 | pmull_fold | |
252 | ||
253 | add x_buf_tmp, x_buf_tmp, 64 | |
254 | pmull_fold | |
255 | ||
256 | add x_buf_tmp, x_buf_tmp, 64 | |
257 | pmull_fold | |
258 | ||
259 | add x_buf_tmp, x_buf_tmp, 64 | |
260 | pmull_fold | |
261 | // loop end | |
262 | ||
263 | // PMULL: fold into 128-bits (a2, a3 and a4 are folded into a1 one after another) | |
264 | add x_pconst, x_pconst, :lo12:lanchor_crc32 /* x_pconst now holds the full address of the constant table */ | |
265 | ||
266 | ldr x_data_crc0, [x_buf, 976] /* last 48 bytes of each scalar stream are consumed alongside the reduction */ | |
267 | ldr x_data_crc1, [x_buf, 1488] | |
268 | ldr x_data_crc2, [x_buf, 2000] | |
269 | ||
270 | ldr q_a0, [x_pconst, offset_k3k4] // k3k4 | |
271 | ||
272 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
273 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
274 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
275 | ||
276 | pmull v_a5.1q, v_a1.1d, v_a0.1d | |
277 | pmull2 v_a1.1q, v_a1.2d, v_a0.2d | |
278 | ||
279 | eor v_a1.16b, v_a5.16b, v_a1.16b | |
280 | eor v_a1.16b, v_a1.16b, v_a2.16b | |
281 | ||
282 | ldr x_data_crc0, [x_buf, 984] | |
283 | ldr x_data_crc1, [x_buf, 1496] | |
284 | ldr x_data_crc2, [x_buf, 2008] | |
285 | ||
286 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
287 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
288 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
289 | ||
290 | pmull v_a5.1q, v_a1.1d, v_a0.1d | |
291 | pmull2 v_a1.1q, v_a1.2d, v_a0.2d | |
292 | ||
293 | ldr x_data_crc0, [x_buf, 992] | |
294 | ldr x_data_crc1, [x_buf, 1504] | |
295 | ldr x_data_crc2, [x_buf, 2016] | |
296 | ||
297 | eor v_a1.16b, v_a5.16b, v_a1.16b | |
298 | eor v_a1.16b, v_a1.16b, v_a3.16b | |
299 | ||
300 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
301 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
302 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
303 | ||
304 | pmull v_a5.1q, v_a1.1d, v_a0.1d | |
305 | pmull2 v_a1.1q, v_a1.2d, v_a0.2d | |
306 | ||
307 | ldr x_data_crc0, [x_buf, 1000] | |
308 | ldr x_data_crc1, [x_buf, 1512] | |
309 | ldr x_data_crc2, [x_buf, 2024] | |
310 | ||
311 | eor v_a1.16b, v_a5.16b, v_a1.16b | |
312 | eor v_a1.16b, v_a1.16b, v_a4.16b | |
313 | ||
314 | // PMULL: fold 128-bits to 64-bits | |
315 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
316 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
317 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
318 | ||
319 | dup d_a0, v_a0.d[1] /* move the high half of k3k4 into the low lane for the next pmull */ | |
320 | pmull v_a2.1q, v_a1.1d, v_a0.1d | |
321 | ||
322 | movi v_neon_zero.4s, 0 | |
323 | ldr q_k5k0, [x_pconst, offset_k5k0] // k5k0 | |
324 | adrp x_tmp, .lanchor_mask | |
325 | ||
326 | ldr x_data_crc0, [x_buf, 1008] | |
327 | ldr x_data_crc1, [x_buf, 1520] | |
328 | ldr x_data_crc2, [x_buf, 2032] | |
329 | ||
330 | ext v_a1.16b, v_a1.16b, v_neon_zero.16b, #8 /* shift a1 right by 8 bytes, filling with zeros */ | |
331 | eor v_a1.16b, v_a2.16b, v_a1.16b | |
332 | ldr q_neon_tmp3, [x_tmp, #:lo12:.lanchor_mask] | |
333 | ||
334 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
335 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
336 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
337 | ||
341 | ext v_a2.16b, v_a1.16b, v_neon_zero.16b, #4 /* a2 = a1 shifted right by 4 bytes */ | |
342 | and v_a1.16b, v_a1.16b, v_neon_tmp3.16b | |
343 | pmull v_a1.1q, v_a1.1d, v_k5k0.1d | |
344 | eor v_a1.16b, v_a2.16b, v_a1.16b | |
345 | ||
346 | // PMULL: Barrett reduce to 32-bits | |
347 | ldr q_neon_tmp1, [x_pconst, offset_poly] // poly | |
348 | ||
349 | ldr x_data_crc0, [x_buf, 1016] /* final 8-byte word of each scalar stream */ | |
350 | ldr x_data_crc1, [x_buf, 1528] | |
351 | ldr x_data_crc2, [x_buf, 2040] | |
352 | ||
353 | dup d_neon_tmp2, v_neon_tmp1.d[1] /* high half of the poly entry, used by the first reduction multiply */ | |
354 | ||
355 | crc32_u64 w_crc0, w_crc0, x_data_crc0 | |
356 | crc32_u64 w_crc1, w_crc1, x_data_crc1 | |
357 | crc32_u64 w_crc2, w_crc2, x_data_crc2 | |
358 | ||
359 | and v_a2.16b, v_a1.16b, v_neon_tmp3.16b | |
360 | pmull v_a2.1q, v_a2.1d, v_neon_tmp2.1d | |
361 | and v_a2.16b, v_neon_tmp3.16b, v_a2.16b | |
362 | pmull v_a2.1q, v_a2.1d, v_neon_tmp1.1d | |
363 | ||
364 | // crc_pmull result: 32-bit lane 1 of a1 after the final xor | |
365 | eor v_a1.16b, v_a1.16b, v_a2.16b | |
366 | dup s_crc_pmull, v_a1.s[1] | |
367 | ||
368 | // merge crc_pmull, crc0, crc1, crc2 using pmull instruction: the first three are each multiplied by their own table constant and the 64-bit product is passed through crc32_u64 from a zero state; crc2 is used as is | |
369 | fmov s_neon_crc0, w_crc0 | |
370 | fmov s_neon_crc1, w_crc1 | |
371 | ||
372 | ldr q_neon_const0, [x_pconst, offset_crc32_const] | |
373 | ldr q_neon_const1, [x_pconst, offset_crc32_const+16] | |
374 | ldr q_neon_const2, [x_pconst, offset_crc32_const+32] | |
375 | ||
376 | pmull v_crc_pmull.1q, v_crc_pmull.1d, v_neon_const0.1d | |
377 | pmull v_neon_crc0.1q, v_neon_crc0.1d, v_neon_const1.1d | |
378 | pmull v_neon_crc1.1q, v_neon_crc1.1d, v_neon_const2.1d | |
379 | ||
380 | fmov x_tmp1, d_neon_crc0 | |
381 | crc32_u64 w_crc0, wzr, x_tmp1 | |
382 | ||
383 | fmov x_tmp1, d_neon_crc1 | |
384 | crc32_u64 w_crc1, wzr, x_tmp1 | |
385 | ||
386 | eor w_ret_crc, w_crc1, w_crc0 | |
387 | ||
388 | fmov x_tmp1, d_crc_pmull | |
389 | crc32_u64 w_tmp, wzr, x_tmp1 | |
390 | ||
391 | eor w_crc2, w_tmp, w_crc2 | |
392 | ||
393 | // handle crc32/crc32c: the CRC32 build combines with eon, i.e. the combined value is also bit-inverted | |
394 | #ifdef CRC32 | |
395 | eon w_ret_crc, w_crc2, w_ret_crc | |
396 | #else | |
397 | eor w_ret_crc, w_crc2, w_ret_crc | |
398 | #endif | |
399 | .endm | |
400 | ||
401 | // crc32 mix main default: complete routine body. Whole 2048-byte blocks go through crc32_mix2048; the remaining 0..2047 bytes are handled with crc32_u64/u32/u16/u8 steps. Input crc/buf/len and the result ret_crc use the registers declared above. | |
402 | .macro crc32_mix_main_default /* crc32_u64, crc32_u32, crc32_u16 and crc32_u8 are defined outside this view */ | |
403 | cmp x_len, 2047 | |
404 | mov x_len_saved, x_len | |
405 | mov x_buf_saved, x_buf | |
406 | bls .less_than_2048 /* unsigned len <= 2047: no whole block to process */ | |
407 | ||
408 | sub x_buf_iter, x_len, #2048 | |
409 | stp x29, x30, [sp, -16]! | |
410 | ||
411 | mov x29, sp | |
412 | and x_buf_iter, x_buf_iter, -2048 | |
413 | add x_buf_iter, x_buf_iter, 2048 /* x_buf_iter = len rounded down to a multiple of 2048 */ | |
414 | add x_buf_iter, x_buf, x_buf_iter /* x_buf_iter = address just past the last whole 2048-byte block */ | |
415 | ||
416 | .align 4 | |
417 | .loop_mix: | |
418 | mov x_buf, x_buf_saved /* crc32_mix2048 reads the block at x_buf and updates the crc in register 0 */ | |
419 | crc32_mix2048 | |
420 | ||
421 | add x_buf_saved, x_buf_saved, 2048 | |
422 | cmp x_buf_saved, x_buf_iter | |
423 | bne .loop_mix | |
424 | ||
425 | and x_len_saved, x_len_saved, 2047 /* bytes left after the whole blocks */ | |
426 | cbnz x_len_saved, .remain_ldp | |
427 | ||
428 | ldp x29, x30, [sp], 16 | |
429 | ret | |
430 | ||
431 | .align 4 | |
432 | .remain_ldp: /* tail after at least one whole block: restore the frame first, then continue at x_buf_iter */ | |
433 | mov w_crc_tmp, crc | |
434 | ldp x29, x30, [sp], 16 | |
435 | mov size, x_len_saved | |
436 | mov buf, x_buf_iter | |
437 | b .crc32_hw_handle | |
438 | ||
439 | .remain: /* whole input is shorter than 2048 bytes: nothing was pushed, start at the original buffer */ | |
440 | mov w_crc_tmp, crc | |
441 | mov size, x_len_saved | |
442 | mov buf, x_buf_saved | |
443 | b .crc32_hw_handle | |
444 | ||
445 | .align 4 | |
446 | .less_than_2048: | |
447 | cbnz x_len, .remain | |
448 | ret /* len == 0: register 0 still holds the incoming crc, returned unchanged */ | |
449 | ||
450 | .crc32_hw_handle: /* in: crc_tmp = running crc, buf = data pointer, size = byte count (1..2047 on both paths above) */ | |
451 | cmp size, 63 | |
452 | ||
453 | #ifdef CRC32 | |
454 | mvn crc_tmp, crc_tmp /* CRC32 build: invert on entry (inverted back at .crc32_hw_done); mvn leaves the flags from the cmp above intact */ | |
455 | #endif | |
456 | ||
457 | bls .less_than_64 | |
458 | sub buf_saved, size, #64 | |
459 | and buf_saved, buf_saved, -64 | |
460 | add buf_saved, buf_saved, 64 | |
461 | add buf_saved, buf, buf_saved /* buf_saved = buf + (size rounded down to a multiple of 64): end pointer for .loop_64 */ | |
462 | ||
463 | .align 4 | |
464 | .loop_64: /* 64 bytes per iteration as eight 8-byte words in address order; buf is advanced mid-loop, so later loads use negative offsets */ | |
465 | ldp data_tmp1, data_tmp2, [buf] | |
466 | ldr data_tmp3, [buf, 16] | |
467 | crc32_u64 crc_tmp, crc_tmp, data_tmp1 | |
468 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
469 | ||
470 | ldp data_tmp1, data_tmp2, [buf, 24] | |
471 | add buf, buf, 64 | |
472 | ||
473 | crc32_u64 crc_tmp, crc_tmp, data_tmp3 | |
474 | ldr data_tmp3, [buf, -24] | |
475 | ||
476 | crc32_u64 crc_tmp, crc_tmp, data_tmp1 | |
477 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
478 | ||
479 | ldp data_tmp1, data_tmp2, [buf, -16] | |
480 | cmp buf_saved, buf | |
481 | crc32_u64 crc_tmp, crc_tmp, data_tmp3 | |
482 | ||
483 | crc32_u64 crc_tmp, crc_tmp, data_tmp1 | |
484 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
485 | bne .loop_64 | |
486 | ||
487 | and size, size, 63 | |
488 | .less_than_64: /* size is 0..63 here: consume up to seven 8-byte words without advancing buf */ | |
489 | cmp size, 7 | |
490 | bls .crc32_hw_w | |
491 | ||
492 | ldr data_tmp2, [buf] | |
493 | sub size_tmp, size, #8 /* size_tmp = size - 8 is kept until .crc32_hw_w_pre; only data_tmp2 and data_tmp3 are used as scratch below */ | |
494 | cmp size_tmp, 7 | |
495 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
496 | bls .crc32_hw_w_pre | |
497 | ||
498 | ldr data_tmp2, [buf, 8] | |
499 | sub data_tmp3, size, #16 | |
500 | cmp data_tmp3, 7 | |
501 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
502 | bls .crc32_hw_w_pre | |
503 | ||
504 | ldr data_tmp2, [buf, 16] | |
505 | sub data_tmp3, size, #24 | |
506 | cmp data_tmp3, 7 | |
507 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
508 | bls .crc32_hw_w_pre | |
509 | ||
510 | ldr data_tmp2, [buf, 24] | |
511 | sub data_tmp3, size, #32 | |
512 | cmp data_tmp3, 7 | |
513 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
514 | bls .crc32_hw_w_pre | |
515 | ||
516 | ldr data_tmp2, [buf, 32] | |
517 | sub data_tmp3, size, #40 | |
518 | cmp data_tmp3, 7 | |
519 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
520 | bls .crc32_hw_w_pre | |
521 | ||
522 | ldr data_tmp2, [buf, 40] | |
523 | sub data_tmp3, size, #48 | |
524 | cmp data_tmp3, 7 | |
525 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
526 | bls .crc32_hw_w_pre | |
527 | ||
528 | ldr data_tmp2, [buf, 48] | |
529 | crc32_u64 crc_tmp, crc_tmp, data_tmp2 | |
530 | ||
531 | .crc32_hw_w_pre: /* advance buf past the 8-byte words consumed above and keep only the low three bits of size */ | |
532 | and size_tmp, size_tmp, -8 | |
533 | and size, size, 7 | |
534 | add size_tmp, size_tmp, 8 /* size_tmp = original size rounded down to a multiple of 8 */ | |
535 | add buf, buf, size_tmp | |
536 | ||
537 | .crc32_hw_w: /* size is 0..7: optional 4-byte step */ | |
538 | cmp size, 3 | |
539 | bls .crc32_hw_h | |
540 | ldr w_data_tmp2, [buf], 4 | |
541 | sub size, size, #4 | |
542 | crc32_u32 crc_tmp, crc_tmp, w_data_tmp2 | |
543 | ||
544 | .crc32_hw_h: /* size is 0..3: optional 2-byte step */ | |
545 | cmp size, 1 | |
546 | bls .crc32_hw_b | |
547 | ldrh w_data_tmp2, [buf], 2 | |
548 | sub size, size, #2 | |
549 | crc32_u16 crc_tmp, crc_tmp, w_data_tmp2 | |
550 | ||
551 | .crc32_hw_b: /* size is 0 or 1: optional final byte */ | |
552 | cbz size, .crc32_hw_done | |
553 | ldrb w_data_tmp2, [buf] | |
554 | crc32_u8 crc_tmp, crc_tmp, w_data_tmp2 | |
555 | ||
556 | .crc32_hw_done: /* move the result into the return register, undoing the entry inversion in the CRC32 build */ | |
557 | #ifdef CRC32 | |
558 | mvn ret_crc, crc_tmp | |
559 | #else | |
560 | mov ret_crc, crc_tmp | |
561 | #endif | |
562 | ret | |
563 | .endm | |