// Source: ceph/src/isa-l/crc/aarch64/crc32_mix_default_common.S
// (Ceph import of Intel ISA-L, Quincy 17.1.0 beta)
/**********************************************************************
  Copyright(c) 2020 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

// Declare a general-purpose register alias \name plus explicit 32-bit
// (w_\name) and 64-bit (x_\name) views of the same register number.
// \default selects which view ("w" or "x") the bare alias binds to.
.macro declare_generic_reg name:req, reg:req, default:req
	\name		.req	\default\reg
	w_\name		.req	w\reg
	x_\name		.req	x\reg
.endm

// Declare a NEON register alias \name plus its vector (v_), 128-bit
// scalar (q_), 64-bit scalar (d_) and 32-bit scalar (s_) views.
// \default selects which view the bare alias binds to.
.macro declare_neon_reg name:req, reg:req, default:req
	\name		.req	\default\reg
	v_\name		.req	v\reg
	q_\name		.req	q\reg
	d_\name		.req	d\reg
	s_\name		.req	s\reg
.endm

/**********************************************************************
  variables
**********************************************************************/
// Arguments / main-path state (AAPCS64: crc in w0, buf in x1, len in x2).
	declare_generic_reg	crc,		0,w
	declare_generic_reg	buf,		1,x
	declare_generic_reg	len,		2,x
	declare_generic_reg	buf_saved,	3,x
	declare_generic_reg	buf_iter,	4,x
	declare_generic_reg	len_saved,	5,x
	declare_generic_reg	buf_tmp,	6,x

// Three independent hardware-crc32 accumulators and their data words,
// used by the 2048-byte mix path.
	declare_generic_reg	crc0,		7,x
	declare_generic_reg	crc1,		8,x
	declare_generic_reg	crc2,		9,x
	declare_generic_reg	pconst,		10,x
	declare_generic_reg	data_crc0,	11,x
	declare_generic_reg	data_crc1,	12,x
	declare_generic_reg	data_crc2,	13,x

// Remainder (hardware-only) path.  NOTE: these deliberately reuse the
// register numbers above (size=x9=crc2, crc_tmp=w10=pconst, ...);
// the two paths are never live at the same time.
	declare_generic_reg	size,		9,x
	declare_generic_reg	crc_tmp,	10,w
	declare_generic_reg	size_tmp,	11,x
	declare_generic_reg	data_tmp1,	11,x
	declare_generic_reg	data_tmp2,	12,x
	declare_generic_reg	data_tmp3,	13,x

	declare_generic_reg	tmp,		14,x
	declare_generic_reg	tmp1,		15,x

// return value (aliases w0)
	declare_generic_reg	ret_crc,	0,w

/**********************************************************************
  simd variables
**********************************************************************/
// PMULL folding accumulators (a1..a4) and fold constants (a0).
	declare_neon_reg	a0,	0,v
	declare_neon_reg	a1,	1,v
	declare_neon_reg	a2,	2,v
	declare_neon_reg	a3,	3,v
	declare_neon_reg	a4,	4,v

// Low halves of the per-iteration fold products.
	declare_neon_reg	a5,	16,v
	declare_neon_reg	a6,	17,v
	declare_neon_reg	a7,	18,v
	declare_neon_reg	a8,	19,v

// Fresh 64 bytes of input loaded each fold iteration.
	declare_neon_reg	y5,	20,v
	declare_neon_reg	y6,	21,v
	declare_neon_reg	y7,	22,v
	declare_neon_reg	y8,	23,v

// neon_zero and neon_tmp intentionally share v24 (not live together).
	declare_neon_reg	neon_zero,	24,v
	declare_neon_reg	neon_tmp,	24,v

	declare_neon_reg	k5k0,		25,v
	declare_neon_reg	neon_tmp1,	26,v
	declare_neon_reg	neon_tmp2,	27,v
	declare_neon_reg	neon_tmp3,	28,v

	declare_neon_reg	crc_pmull,	29,v
	declare_neon_reg	neon_crc0,	30,v
	declare_neon_reg	neon_crc1,	31,v

	declare_neon_reg	neon_const0,	5,v
	declare_neon_reg	neon_const1,	6,v
	declare_neon_reg	neon_const2,	7,v

// constants: byte offsets of the folding/merge constants relative to
// the lanchor_crc32 constant table (defined elsewhere).
	.equ	offset_k3k4,		16
	.equ	offset_k5k0,		32
	.equ	offset_poly,		48
	.equ	offset_crc32_const,	64

// pmull fold: fold one 64-byte chunk at [x_buf_tmp] into the 4x128-bit
// PMULL accumulator v_a1..v_a4 (fold constants in v_a0), while
// interleaving hardware crc32 updates of three further streams read at
// x_buf_tmp+464/976/1488 .. +520/1032/1544 (streams 512 bytes apart).
// The interleaving hides PMULL latency behind the crc32 instructions.
// Clobbers: v_a5-v_a8, v_y5-v_y8, w_crc0-w_crc2, x_data_crc0-x_data_crc2.
.macro pmull_fold
	ldr	x_data_crc0, [x_buf_tmp, 464]
	ldr	x_data_crc1, [x_buf_tmp, 976]
	ldr	x_data_crc2, [x_buf_tmp, 1488]

	// low-half products of the fold: a5..a8 = lo64(aN) * lo64(a0)
	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 472]
	ldr	x_data_crc1, [x_buf_tmp, 984]
	ldr	x_data_crc2, [x_buf_tmp, 1496]

	pmull	v_a6.1q, v_a2.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 480]
	ldr	x_data_crc1, [x_buf_tmp, 992]
	ldr	x_data_crc2, [x_buf_tmp, 1504]

	pmull	v_a7.1q, v_a3.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 488]
	ldr	x_data_crc1, [x_buf_tmp, 1000]
	ldr	x_data_crc2, [x_buf_tmp, 1512]

	pmull	v_a8.1q, v_a4.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 496]
	ldr	x_data_crc1, [x_buf_tmp, 1008]
	ldr	x_data_crc2, [x_buf_tmp, 1520]

	// high-half products overwrite the accumulators: aN = hi64(aN) * hi64(a0)
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// next 64 bytes of the PMULL stream
	ld1	{v_y5.4s, v_y6.4s, v_y7.4s, v_y8.4s}, [x_buf_tmp]

	ldr	x_data_crc0, [x_buf_tmp, 504]
	ldr	x_data_crc1, [x_buf_tmp, 1016]
	ldr	x_data_crc2, [x_buf_tmp, 1528]

	pmull2	v_a2.1q, v_a2.2d, v_a0.2d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	pmull2	v_a3.1q, v_a3.2d, v_a0.2d
	pmull2	v_a4.1q, v_a4.2d, v_a0.2d

	// fold: accumulator = new_data XOR lo_product XOR hi_product
	eor	v_y5.16b, v_y5.16b, v_a5.16b
	eor	v_y6.16b, v_y6.16b, v_a6.16b
	eor	v_y7.16b, v_y7.16b, v_a7.16b
	eor	v_y8.16b, v_y8.16b, v_a8.16b

	ldr	x_data_crc0, [x_buf_tmp, 512]
	ldr	x_data_crc1, [x_buf_tmp, 1024]
	ldr	x_data_crc2, [x_buf_tmp, 1536]

	eor	v_a1.16b, v_y5.16b, v_a1.16b
	eor	v_a2.16b, v_y6.16b, v_a2.16b
	eor	v_a3.16b, v_y7.16b, v_a3.16b
	eor	v_a4.16b, v_y8.16b, v_a4.16b

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 520]
	ldr	x_data_crc1, [x_buf_tmp, 1032]
	ldr	x_data_crc2, [x_buf_tmp, 1544]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2
.endm

// crc32 mix for 2048 byte input data.
// Splits the block at x_buf into four 512-byte streams processed in parallel:
//   bytes [   0,  512): PMULL folding, seeded with the incoming w_crc
//   bytes [ 512, 1024): hardware crc32, accumulator w_crc0 (seeded from wzr)
//   bytes [1024, 1536): hardware crc32, accumulator w_crc1 (seeded from wzr)
//   bytes [1536, 2048): hardware crc32, accumulator w_crc2 (seeded from wzr)
// and finally merges the four partial CRCs by PMULL against the shift
// constants stored at lanchor_crc32 + offset_crc32_const.
// Result in w_ret_crc (w0).  The CRC32 (vs CRC32-C) variant is selected
// by #ifdef CRC32, which also controls the bit-inversion convention.
// Relies on: crc32_u64 macro, lanchor_crc32 / .lanchor_mask constant
// tables, and the pmull_fold macro above.
.macro crc32_mix2048
	fmov	s_a1, w_crc
	movi	v_neon_tmp.4s, 0

	adrp	x_pconst, lanchor_crc32
	add	x_buf_tmp, x_buf, 64		// PMULL stream continues at buf+64

	ldr	x_data_crc0, [x_buf, 512]
	ldr	x_data_crc1, [x_buf, 1024]
	ldr	x_data_crc2, [x_buf, 1536]

	// seed the three hardware streams from a zero CRC
	crc32_u64	w_crc0, wzr, x_data_crc0
	crc32_u64	w_crc1, wzr, x_data_crc1
	crc32_u64	w_crc2, wzr, x_data_crc2

#ifdef CRC32
	mvn	v_a1.8b, v_a1.8b		// crc32 convention: invert seed
#endif

	ins	v_neon_tmp.s[0], v_a1.s[0]

	// first 64 bytes of the PMULL stream
	ld1	{v_a1.4s, v_a2.4s, v_a3.4s, v_a4.4s}, [x_buf]

	ldr	x_data_crc0, [x_buf, 520]
	ldr	x_data_crc1, [x_buf, 1032]
	ldr	x_data_crc2, [x_buf, 1544]

	// XOR the (possibly inverted) seed into the low word of the stream
	eor	v_a1.16b, v_a1.16b, v_neon_tmp.16b
	ldr	q_a0, [x_pconst, #:lo12:lanchor_crc32]	// k1k2

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

// loop start, unroll the loop: 7 folds x 64 bytes cover buf+64 .. buf+512
	.align	4
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold
// loop end

// PMULL: fold the 4x128-bit accumulator into 128 bits
	add	x_pconst, x_pconst, :lo12:lanchor_crc32

	ldr	x_data_crc0, [x_buf, 976]
	ldr	x_data_crc1, [x_buf, 1488]
	ldr	x_data_crc2, [x_buf, 2000]

	ldr	q_a0, [x_pconst, offset_k3k4]	// k3k4

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// a1 = fold(a1) XOR a2
	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d

	eor	v_a1.16b, v_a5.16b, v_a1.16b
	eor	v_a1.16b, v_a1.16b, v_a2.16b

	ldr	x_data_crc0, [x_buf, 984]
	ldr	x_data_crc1, [x_buf, 1496]
	ldr	x_data_crc2, [x_buf, 2008]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// a1 = fold(a1) XOR a3
	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d

	ldr	x_data_crc0, [x_buf, 992]
	ldr	x_data_crc1, [x_buf, 1504]
	ldr	x_data_crc2, [x_buf, 2016]

	eor	v_a1.16b, v_a5.16b, v_a1.16b
	eor	v_a1.16b, v_a1.16b, v_a3.16b

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// a1 = fold(a1) XOR a4
	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d

	ldr	x_data_crc0, [x_buf, 1000]
	ldr	x_data_crc1, [x_buf, 1512]
	ldr	x_data_crc2, [x_buf, 2024]

	eor	v_a1.16b, v_a5.16b, v_a1.16b
	eor	v_a1.16b, v_a1.16b, v_a4.16b

// PMULL: fold 128-bits to 64-bits
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	dup	d_a0, v_a0.d[1]
	pmull	v_a2.1q, v_a1.1d, v_a0.1d

	movi	v_neon_zero.4s, 0
	ldr	q_k5k0, [x_pconst, offset_k5k0]	// k5k0
	adrp	x_tmp, .lanchor_mask

	ldr	x_data_crc0, [x_buf, 1008]
	ldr	x_data_crc1, [x_buf, 1520]
	ldr	x_data_crc2, [x_buf, 2032]

	ext	v_a1.16b, v_a1.16b, v_neon_zero.16b, #8	// logical >> 64 bits
	eor	v_a1.16b, v_a2.16b, v_a1.16b
	ldr	q_neon_tmp3, [x_tmp, #:lo12:.lanchor_mask]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	dup	d_a0, v_k5k0.d[1]
	pmull	v_a3.1q, v_a2.1d, v_a0.1d

	ext	v_a2.16b, v_a1.16b, v_neon_zero.16b, #4	// logical >> 32 bits
	and	v_a1.16b, v_a1.16b, v_neon_tmp3.16b
	pmull	v_a1.1q, v_a1.1d, v_k5k0.1d
	eor	v_a1.16b, v_a2.16b, v_a1.16b

// PMULL: Barrett reduce to 32-bits
	ldr	q_neon_tmp1, [x_pconst, offset_poly]	// poly

	ldr	x_data_crc0, [x_buf, 1016]
	ldr	x_data_crc1, [x_buf, 1528]
	ldr	x_data_crc2, [x_buf, 2040]

	dup	d_neon_tmp2, v_neon_tmp1.d[1]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	and	v_a2.16b, v_a1.16b, v_neon_tmp3.16b
	pmull	v_a2.1q, v_a2.1d, v_neon_tmp2.1d
	and	v_a2.16b, v_neon_tmp3.16b, v_a2.16b
	pmull	v_a2.1q, v_a2.1d, v_neon_tmp1.1d

// crc_pmull result
	eor	v_a1.16b, v_a1.16b, v_a2.16b
	dup	s_crc_pmull, v_a1.s[1]

// merge crc_pmull, crc0, crc1, crc2 using pmull instruction:
// multiply each partial CRC by the x^N shift constant for its stream,
// then fold the product back down with a zero-seeded crc32.
	fmov	s_neon_crc0, w_crc0
	fmov	s_neon_crc1, w_crc1

	ldr	q_neon_const0, [x_pconst, offset_crc32_const]
	ldr	q_neon_const1, [x_pconst, offset_crc32_const+16]
	ldr	q_neon_const2, [x_pconst, offset_crc32_const+32]

	pmull	v_crc_pmull.1q, v_crc_pmull.1d, v_neon_const0.1d
	pmull	v_neon_crc0.1q, v_neon_crc0.1d, v_neon_const1.1d
	pmull	v_neon_crc1.1q, v_neon_crc1.1d, v_neon_const2.1d

	fmov	x_tmp1, d_neon_crc0
	crc32_u64	w_crc0, wzr, x_tmp1

	fmov	x_tmp1, d_neon_crc1
	crc32_u64	w_crc1, wzr, x_tmp1

	eor	w_ret_crc, w_crc1, w_crc0

	fmov	x_tmp1, d_crc_pmull
	crc32_u64	w_tmp, wzr, x_tmp1

	eor	w_crc2, w_tmp, w_crc2

// handle crc32/crc32c: eon = XOR + final inversion for plain CRC32
#ifdef CRC32
	eon	w_ret_crc, w_crc2, w_ret_crc
#else
	eor	w_ret_crc, w_crc2, w_ret_crc
#endif
.endm

// crc32 mix main default: top-level body for the CRC entry point.
// In:  w_crc (w0) = seed, x_buf (x1) = data, x_len (x2) = length.
// Out: w0 = CRC.
// Flow: whole 2048-byte blocks go through crc32_mix2048 (.loop_mix);
// any remaining < 2048 bytes are handled with hardware crc32 only
// (.crc32_hw_handle: 64-byte unrolled loop, then 8/4/2/1-byte tail).
// NOTE: defines local labels (.loop_mix, .remain, ...), so this macro
// must be instantiated at most once per compilation unit.
.macro crc32_mix_main_default
	cmp	x_len, 2047
	mov	x_len_saved, x_len
	mov	x_buf_saved, x_buf
	bls	.less_than_2048

	// buf_iter = buf + (len rounded down to a multiple of 2048)
	sub	x_buf_iter, x_len, #2048
	stp	x29, x30, [sp, -16]!

	mov	x29, sp
	and	x_buf_iter, x_buf_iter, -2048
	add	x_buf_iter, x_buf_iter, 2048
	add	x_buf_iter, x_buf, x_buf_iter

	.align	4
.loop_mix:
	mov	x_buf, x_buf_saved
	crc32_mix2048				// result becomes the next seed in w_crc (w0)

	add	x_buf_saved, x_buf_saved, 2048
	cmp	x_buf_saved, x_buf_iter
	bne	.loop_mix

	and	x_len_saved, x_len_saved, 2047	// tail length
	cbnz	x_len_saved, .remain_ldp

	ldp	x29, x30, [sp], 16
	ret

	// tail after at least one 2048-byte block: restore frame first
	.align	4
.remain_ldp:
	mov	w_crc_tmp, crc
	ldp	x29, x30, [sp], 16
	mov	size, x_len_saved
	mov	buf, x_buf_iter
	b	.crc32_hw_handle

	// input shorter than 2048 bytes: no frame was pushed
.remain:
	mov	w_crc_tmp, crc
	mov	size, x_len_saved
	mov	buf, x_buf_saved
	b	.crc32_hw_handle

	.align	4
.less_than_2048:
	cbnz	x_len, .remain
	ret					// len == 0: return seed unchanged

// hardware-crc32-only path for the final < 2048 bytes
.crc32_hw_handle:
	cmp	size, 63

#ifdef CRC32
	mvn	crc_tmp, crc_tmp		// crc32 convention: invert before/after
#endif

	bls	.less_than_64
	// buf_saved = end of the 64-byte-aligned portion
	sub	buf_saved, size, #64
	and	buf_saved, buf_saved, -64
	add	buf_saved, buf_saved, 64
	add	buf_saved, buf, buf_saved

	.align	4
.loop_64:
	ldp	data_tmp1, data_tmp2, [buf]
	ldr	data_tmp3, [buf, 16]
	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	ldp	data_tmp1, data_tmp2, [buf, 24]
	add	buf, buf, 64

	crc32_u64	crc_tmp, crc_tmp, data_tmp3
	ldr	data_tmp3, [buf, -24]

	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	ldp	data_tmp1, data_tmp2, [buf, -16]
	cmp	buf_saved, buf
	crc32_u64	crc_tmp, crc_tmp, data_tmp3

	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bne	.loop_64

	and	size, size, 63
.less_than_64:
	// up to seven 8-byte words, branching out as soon as < 8 remain
	cmp	size, 7
	bls	.crc32_hw_w

	ldr	data_tmp2, [buf]
	sub	size_tmp, size, #8
	cmp	size_tmp, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 8]
	sub	data_tmp3, size, #16
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 16]
	sub	data_tmp3, size, #24
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 24]
	sub	data_tmp3, size, #32
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 32]
	sub	data_tmp3, size, #40
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 40]
	sub	data_tmp3, size, #48
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 48]
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

.crc32_hw_w_pre:
	// advance buf by the number of whole words consumed; size %= 8
	and	size_tmp, size_tmp, -8
	and	size, size, 7
	add	size_tmp, size_tmp, 8
	add	buf, buf, size_tmp

.crc32_hw_w:
	cmp	size, 3
	bls	.crc32_hw_h
	ldr	w_data_tmp2, [buf], 4
	sub	size, size, #4
	crc32_u32	crc_tmp, crc_tmp, w_data_tmp2

.crc32_hw_h:
	cmp	size, 1
	bls	.crc32_hw_b
	ldrh	w_data_tmp2, [buf], 2
	sub	size, size, #2
	crc32_u16	crc_tmp, crc_tmp, w_data_tmp2

.crc32_hw_b:
	cbz	size, .crc32_hw_done
	ldrb	w_data_tmp2, [buf]
	crc32_u8	crc_tmp, crc_tmp, w_data_tmp2

.crc32_hw_done:
#ifdef CRC32
	mvn	ret_crc, crc_tmp
#else
	mov	ret_crc, crc_tmp
#endif
	ret
.endm