########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Arm Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

.arch armv8-a+crc+crypto
.text
.align 3
.global crc16_t10dif_pmull
.type crc16_t10dif_pmull, %function

/* uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len) */
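
/*
 * Overview: computes the non-reflected CRC16 T10-DIF (polynomial 0x8bb7).
 * Buffers longer than 1023 bytes are folded 64 bytes per iteration with
 * PMULL carry-less multiplies and finished with Barrett reduction; short
 * buffers and the (len % 64) tail fall through to the byte-wise table
 * loop below.
 *
 * Illustrative scalar equivalent of the table path (a sketch for the
 * reader, not part of the build):
 *
 *     uint16_t crc16_t10dif_ref(uint16_t crc, const uint8_t *buf,
 *                               uint64_t len)
 *     {
 *             for (uint64_t i = 0; i < len; i++)
 *                     crc = (crc << 8) ^ crc16tab[(crc >> 8) ^ buf[i]];
 *             return crc;
 *     }
 */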

/* arguments */
w_seed .req w0
x_buf .req x1
x_len .req x2
w_len .req w2

/* returns */
w_ret .req w0

/* these are used as global temporary registers */
w_tmp .req w5
x_tmp .req x5
x_tmp1 .req x6
x_tmp2 .req x7

d_tmp1 .req d0
d_tmp2 .req d1
q_tmp1 .req q0
q_tmp2 .req q1
v_tmp1 .req v0
v_tmp2 .req v1

/* local variables */
w_counter .req w3
w_crc .req w0
x_crc .req x0
x_counter .req x3
x_crc16tab .req x4
x_buf_saved .req x0

crc16_t10dif_pmull:
        cmp x_len, 1023
        sub sp, sp, #16
        uxth w_seed, w_seed
        bhi .crc_fold

        mov x_tmp, 0
        mov w_counter, 0

.crc_table_loop_pre:
        cmp x_len, x_tmp
        bls .end

        sxtw x_counter, w_counter
        adrp x_crc16tab, .LANCHOR0
        sub x_buf, x_buf, x_counter
        add x_crc16tab, x_crc16tab, :lo12:.LANCHOR0

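/*
 * Classic table-driven update, one byte per iteration:
 *     crc = (crc << 8) ^ crc16tab[(crc >> 8) ^ *buf++]
 * with uxth keeping the running value to 16 bits.
 */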
        .align 2
.crc_table_loop:
        ldrb w_tmp, [x_buf, x_counter]
        add x_counter, x_counter, 1
        cmp x_len, x_counter
        eor w_tmp, w_tmp, w_crc, lsr 8
        ldrh w_tmp, [x_crc16tab, w_tmp, sxtw 1]
        eor w_crc, w_tmp, w_crc, lsl 8
        uxth w_crc, w_crc
        bhi .crc_table_loop

.end:
        add sp, sp, 16
        ret

/* carry-less multiplication, part1 - before loop */
q_x0 .req q2
q_x1 .req q3
q_x2 .req q4
q_x3 .req q5

v_x0 .req v2
v_x1 .req v3
v_x2 .req v4
v_x3 .req v5

d_x0 .req d2
d_x1 .req d3
d_x2 .req d4
d_x3 .req d5

// the following registers are only used in part 1
d_tmp3 .req d16
v_tmp3 .req v16

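/*
 * part1: load the first 64 bytes, byte-reverse each 16-byte block with
 * the shuffle mask (T10-DIF is a non-reflected CRC, so the most
 * significant byte must sit in the highest lane), and xor the seed,
 * pre-shifted left by 48, into the top 16 bits of the first vector.
 */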
        .align 3
.crc_fold:
        fmov d_tmp1, x_crc
        fmov d_tmp2, xzr
        dup d_tmp3, v_tmp2.d[0]
        shl d_tmp1, d_tmp1, 48
        ins v_tmp3.d[1], v_tmp1.d[0]

        and x_counter, x_len, -64
        sub x_counter, x_counter, #64
        cmp x_counter, 63
        add x_buf_saved, x_buf, 64

        ldr q_x0, [x_buf]
        ldr q_x1, [x_buf, 16]
        ldr q_x2, [x_buf, 32]
        ldr q_x3, [x_buf, 48]

        adrp x_tmp, .shuffle_mask_lanchor
        ldr q7, [x_tmp, :lo12:.shuffle_mask_lanchor]

        tbl v_tmp1.16b, {v_x0.16b}, v7.16b
        eor v_x0.16b, v_tmp3.16b, v_tmp1.16b

        tbl v_x1.16b, {v_x1.16b}, v7.16b
        tbl v_x2.16b, {v_x2.16b}, v7.16b
        tbl v_x3.16b, {v_x3.16b}, v7.16b
        bls .crc_fold_loop_end

/* carry-less multiplication, part2 - loop */
q_y0 .req q28
q_y1 .req q29
q_y2 .req q30
q_y3 .req q31

v_y0 .req v28
v_y1 .req v29
v_y2 .req v30
v_y3 .req v31

d_x0_h .req d24
d_x0_l .req d2
d_x1_h .req d25
d_x1_l .req d3
d_x2_h .req d26
d_x2_l .req d4
d_x3_h .req d27
d_x3_l .req d5

v_x0_h .req v24
v_x0_l .req v2
v_x1_h .req v25
v_x1_l .req v3
v_x2_h .req v26
v_x2_l .req v4
v_x3_h .req v27
v_x3_l .req v5

v_tmp1_x0 .req v24
v_tmp1_x1 .req v25
v_tmp1_x2 .req v26
v_tmp1_x3 .req v27

d_p4_h .req d19
v_p4_h .req v19
d_p4_l .req d17
v_p4_l .req v17

        mov x_tmp, 0x371d0000 /* p4 [1] */
        fmov d_p4_h, x_tmp
        mov x_tmp, 0x87e70000 /* p4 [0] */
        fmov d_p4_l, x_tmp
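
/*
 * Fold-by-4: each loop iteration folds the four 128-bit accumulators
 * forward over the next 64 bytes of input:
 *     x[i] = pmull(x[i].hi, p4[1]) ^ pmull(x[i].lo, p4[0]) ^ y[i]
 * where pmull is a 64x64 carry-less multiply and the p4 values are
 * precomputed remainders of powers of x modulo the CRC polynomial,
 * matching the 512-bit fold distance.
 */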

        .align 2
.crc_fold_loop:
        add x_buf_saved, x_buf_saved, 64
        sub x_counter, x_counter, #64
        cmp x_counter, 63

        dup d_x0_h, v_x0.d[1]
        dup d_x1_h, v_x1.d[1]
        dup d_x2_h, v_x2.d[1]
        dup d_x3_h, v_x3.d[1]

        dup d_x0_l, v_x0.d[0]
        dup d_x1_l, v_x1.d[0]
        dup d_x2_l, v_x2.d[0]
        dup d_x3_l, v_x3.d[0]

        ldr q_y0, [x_buf_saved, -64]
        ldr q_y1, [x_buf_saved, -48]
        ldr q_y2, [x_buf_saved, -32]
        ldr q_y3, [x_buf_saved, -16]

        pmull v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
        pmull v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
        pmull v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
        pmull v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
        pmull v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
        pmull v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
        pmull v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
        pmull v_x3_l.1q, v_x3_l.1d, v_p4_l.1d

        tbl v_y0.16b, {v_y0.16b}, v7.16b
        tbl v_y1.16b, {v_y1.16b}, v7.16b
        tbl v_y2.16b, {v_y2.16b}, v7.16b
        tbl v_y3.16b, {v_y3.16b}, v7.16b

        eor v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
        eor v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
        eor v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
        eor v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b

        eor v_x0.16b, v_tmp1_x0.16b, v_y0.16b
        eor v_x1.16b, v_tmp1_x1.16b, v_y1.16b
        eor v_x2.16b, v_tmp1_x2.16b, v_y2.16b
        eor v_x3.16b, v_tmp1_x3.16b, v_y3.16b

        bhi .crc_fold_loop

/* carry-less multiplication, part3 - after loop */
/* folding 512 bit ---> 128 bit */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v30 are tmp registers

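/*
 * Reduce the four accumulators to one with three 128-bit folds using the
 * p1 constants: x1 ^= fold(x0); x2 ^= fold(x1); then v_x0 = fold(x2) ^ x3,
 * where fold(x) = pmull(x.hi, p1[1]) ^ pmull(x.lo, p1[0]).
 */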
.crc_fold_loop_end:
        mov x_tmp, 0x4c1a0000 /* p1 [1] */
        fmov d0, x_tmp
        mov x_tmp, 0xfb0b0000 /* p1 [0] */
        fmov d1, x_tmp

        and w_counter, w_len, -64
        sxtw x_tmp, w_counter
        add x_buf, x_buf, x_tmp

        dup d6, v_x0.d[1]
        dup d30, v_x0.d[0]
        pmull v6.1q, v6.1d, v0.1d
        pmull v30.1q, v30.1d, v1.1d
        eor v6.16b, v6.16b, v30.16b
        eor v_x1.16b, v6.16b, v_x1.16b

        dup d6, v_x1.d[1]
        dup d30, v_x1.d[0]
        pmull v6.1q, v6.1d, v0.1d
        pmull v16.1q, v30.1d, v1.1d
        eor v6.16b, v6.16b, v16.16b
        eor v_x2.16b, v6.16b, v_x2.16b

        dup d_x0, v_x2.d[1]
        dup d30, v_x2.d[0]
        pmull v0.1q, v_x0.1d, v0.1d
        pmull v_x0.1q, v30.1d, v1.1d
        eor v1.16b, v0.16b, v_x0.16b
        eor v_x0.16b, v1.16b, v_x3.16b

/* carry-less multiplication, part3 - after loop */
/* crc16 fold function */
d_16fold_p0_h .req d18
v_16fold_p0_h .req v18

d_16fold_p0_l .req d4
v_16fold_p0_l .req v4

v_16fold_from .req v_x0
d_16fold_from_h .req d3
v_16fold_from_h .req v3

v_16fold_zero .req v7

v_16fold_from1 .req v16

v_16fold_from2 .req v0
d_16fold_from2_h .req d6
v_16fold_from2_h .req v6

v_16fold_tmp .req v0

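/*
 * Fold the 128-bit remainder toward 32 bits with the p0 constants: the
 * two ext instructions effectively shift the low 64 bits of the input up
 * by 32, then the high half is folded down in two pmull/xor passes,
 * leaving a value in v_x0 short enough for the Barrett reduction below.
 */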
        movi v_16fold_zero.4s, 0
        mov x_tmp1, 0x2d560000 /* p0 [1] */
        mov x_tmp2, 0x13680000 /* p0 [0] */

        ext v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
        ext v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

        dup d_16fold_from_h, v_16fold_from.d[1]
        fmov d_16fold_p0_h, x_tmp1
        pmull v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
        eor v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

        dup d_16fold_from2_h, v_16fold_from2.d[1]
        fmov d_16fold_p0_l, x_tmp2
        pmull v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
        eor v_x0.16b, v0.16b, v6.16b

/* carry-less multiplication, part3 - after loop */
/* crc16 barrett reduction function */

// input parameters:
// v_x0: v2
// barrett reduction constant: br[0], br[1]

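/*
 * br[1] encodes the T10-DIF polynomial with its implicit x^16 term
 * (0x18bb7) shifted into position; br[0] is the matching precomputed
 * quotient constant. pmull by br[0] estimates the quotient, pmull by
 * br[1] multiplies it back by the polynomial, and the final eor cancels
 * everything but the 16-bit remainder, left in bits [31:16] of v_x0.
 */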
d_br0 .req d3
v_br0 .req v3
d_br1 .req d5
v_br1 .req v5

        mov x_tmp1, 0x57f9 /* br[0] low */
        movk x_tmp1, 0xf65a, lsl 16 /* br[0] high */
        movk x_tmp1, 0x1, lsl 32
        fmov d_br0, x_tmp1

        dup d1, v_x0.d[0]
        dup d1, v1.d[0]
        ext v1.16b, v1.16b, v7.16b, #4
        pmull v4.1q, v1.1d, v_br0.1d

        ext v1.16b, v4.16b, v7.16b, #4
        mov x_tmp1, 0x8bb70000 /* br[1] low */
        movk x_tmp1, 0x1, lsl 32 /* br[1] high */

        fmov d_br1, x_tmp1
        pmull v_br1.1q, v1.1d, v_br1.1d
        eor v_x0.16b, v_x0.16b, v_br1.16b

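/*
 * Extract the 16-bit remainder from bits [31:16] of lane 0, then branch
 * back so the table loop finishes the remaining (len % 64) tail bytes.
 */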
        umov x0, v_x0.d[0]
        ubfx x0, x0, 16, 16
        b .crc_table_loop_pre

.size crc16_t10dif_pmull, .-crc16_t10dif_pmull

.section .rodata

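/*
 * tbl mask that byte-reverses a 16-byte vector, converting little-endian
 * memory order into the most-significant-byte-first lane order the
 * non-reflected folds expect.
 */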
.align 4
.shuffle_mask_lanchor = . + 0
.type shuffle_mask, %object
.size shuffle_mask, 16
shuffle_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8
        .byte 7, 6, 5, 4, 3, 2, 1, 0

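/*
 * Byte-wise lookup table for polynomial 0x8bb7: entry i is the CRC of
 * the single byte i with a zero seed (note crc16tab[0x01] = 0x8bb7).
 */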
.align 4
.LANCHOR0 = . + 0
.type crc16tab, %object
.size crc16tab, 512
crc16tab:
        .hword 0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
        .hword 0xefbd, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
        .hword 0x54cd, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
        .hword 0xbb70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
        .hword 0xa99a, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
        .hword 0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
        .hword 0xfd57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
        .hword 0x12ea, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
        .hword 0xd883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
        .hword 0x373e, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
        .hword 0x8c4e, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
        .hword 0x63f3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
        .hword 0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
        .hword 0x9ea4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
        .hword 0x25d4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
        .hword 0xca69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
        .hword 0x3ab1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
        .hword 0xd50c, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
        .hword 0x6e7c, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
        .hword 0x81c1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
        .hword 0x932b, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
        .hword 0x7c96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
        .hword 0xc7e6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
        .hword 0x285b, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
        .hword 0xe232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
        .hword 0x0d8f, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
        .hword 0xb6ff, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
        .hword 0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
        .hword 0x4ba8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
        .hword 0xa415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
        .hword 0x1f65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
        .hword 0xf0d8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3