1 ########################################################################
2 # Copyright(c) 2019 Arm Corporation All rights reserved.
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions
7 # * Redistributions of source code must retain the above copyright
8 # notice, this list of conditions and the following disclaimer.
9 # * Redistributions in binary form must reproduce the above copyright
10 # notice, this list of conditions and the following disclaimer in
11 # the documentation and/or other materials provided with the
13 # * Neither the name of Arm Corporation nor the names of its
14 # contributors may be used to endorse or promote products derived
15 # from this software without specific prior written permission.
17 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #########################################################################
// ===========================================================================
// crc16_t10dif_pmull — CRC-16/T10-DIF over buf[0..len) seeded with `seed`.
//
// Strategy (visible in this excerpt):
//   1. A table-driven byte-at-a-time loop (uses crc16tab below) for short
//      buffers and for the tail after folding.
//   2. A PMULL carry-less-multiply folding path that consumes 64 bytes per
//      iteration, then folds 512 -> 128 -> 16 bits and finishes with a
//      Barrett reduction, before jumping back to the table loop for the
//      remaining len % 64 bytes.
//
// NOTE(review): this is an excerpt — the register aliases (x_counter, x_buf,
// w_crc, v_x0..v_x3, x_buf_saved, the p4/p1/p0 constant registers, v_br0/
// v_br1, etc.) are `.req` definitions, and several labels/instructions
// (.crc_table_loop_pre, the loop head, the flag-setting compare feeding
// `bls`), fall outside the visible lines. Confirm against the full file
// before modifying any instruction here.
// ===========================================================================
30 .arch armv8-a+crc+crypto
33 .global crc16_t10dif_pmull
34 .type crc16_t10dif_pmull, %function
36 /* uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len) */
47 /* these as global temporary registers */
// ---------------------------------------------------------------------------
// Table-driven path: set up a pointer to crc16tab (via .LANCHOR0) and rewind
// x_buf by the (negative-counting) counter so [x_buf, x_counter] walks the
// remaining bytes.
// ---------------------------------------------------------------------------
81 sxtw x_counter, w_counter
82 adrp x_crc16tab, .LANCHOR0
83 sub x_buf, x_buf, x_counter
84 add x_crc16tab, x_crc16tab, :lo12:.LANCHOR0
// Byte-at-a-time MSB-first update:  crc = tab[(crc >> 8) ^ byte] ^ (crc << 8)
88 ldrb w_tmp, [x_buf, x_counter]
89 add x_counter, x_counter, 1
91 eor w_tmp, w_tmp, w_crc, lsr 8
92 ldrh w_tmp, [x_crc16tab, w_tmp, sxtw 1]    // index scaled by 2 (halfword table)
93 eor w_crc, w_tmp, w_crc, lsl 8
101 /* carry less multiplication, part1 - before loop */
117 // the following registers are only used in part1
// Seed injection: the 16-bit CRC is shifted to the top of a 64-bit lane
// (shl #48) and placed in the high half of v_tmp3, to be XORed into the
// first byte-reversed data block.
125 dup d_tmp3, v_tmp2.d[0]
126 shl d_tmp1, d_tmp1, 48
127 ins v_tmp3.d[1], v_tmp1.d[0]
// x_counter = (len & ~63) - 64 : bytes the 64-byte folding loop will consume.
129 and x_counter, x_len, -64
130 sub x_counter, x_counter, #64
132 add x_buf_saved, x_buf, 64                 // read pointer for the next 64B
// Load the first 64 bytes into four 128-bit accumulators x0..x3
// (the q_x0 load is outside this excerpt).
135 ldr q_x1, [x_buf, 16]
136 ldr q_x2, [x_buf, 32]
137 ldr q_x3, [x_buf, 48]
// v7 = shuffle_mask: byte-reverse each 16-byte block so the little-endian
// loads match the MSB-first bit ordering of the CRC.
139 adrp x_tmp, .shuffle_mask_lanchor
140 ldr q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
142 tbl v_tmp1.16b, {v_x0.16b}, v7.16b
143 eor v_x0.16b, v_tmp3.16b, v_tmp1.16b       // fold the seed into block 0
145 tbl v_x1.16b, {v_x1.16b}, v7.16b
146 tbl v_x2.16b, {v_x2.16b}, v7.16b
147 tbl v_x3.16b, {v_x3.16b}, v7.16b
// NOTE(review): the flags tested here are set by an instruction outside this
// excerpt (presumably a compare of the remaining length) — skip the loop when
// no further 64-byte chunk remains.
148 bls .crc_fold_loop_end
150 /* carry less multiplication, part2 - loop */
// ---------------------------------------------------------------------------
// Main folding loop: per iteration, multiply each accumulator by the
// 4-block-distance folding constant p4 = {0x371d0000, 0x87e70000} and XOR in
// the next 64 bytes of byte-reversed input.
// ---------------------------------------------------------------------------
189 mov x_tmp, 0x371d0000 /* p4 [1] */
191 mov x_tmp, 0x87e70000 /* p4 [0] */
196 add x_buf_saved, x_buf_saved, 64
197 sub x_counter, x_counter, #64
// Split each 128-bit accumulator into high/low 64-bit halves; each half is
// carry-less multiplied separately and the two products XORed back together.
200 dup d_x0_h, v_x0.d[1]
201 dup d_x1_h, v_x1.d[1]
202 dup d_x2_h, v_x2.d[1]
203 dup d_x3_h, v_x3.d[1]
205 dup d_x0_l, v_x0.d[0]
206 dup d_x1_l, v_x1.d[0]
207 dup d_x2_l, v_x2.d[0]
208 dup d_x3_l, v_x3.d[0]
// Load the next 64 input bytes (x_buf_saved was already advanced, hence the
// negative offsets).
210 ldr q_y0, [x_buf_saved, -64]
211 ldr q_y1, [x_buf_saved, -48]
212 ldr q_y2, [x_buf_saved, -32]
213 ldr q_y3, [x_buf_saved, -16]
215 pmull v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
216 pmull v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
217 pmull v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
218 pmull v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
219 pmull v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
220 pmull v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
221 pmull v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
222 pmull v_x3_l.1q, v_x3_l.1d, v_p4_l.1d
// Byte-reverse the new data blocks to the CRC bit order.
224 tbl v_y0.16b, {v_y0.16b}, v7.16b
225 tbl v_y1.16b, {v_y1.16b}, v7.16b
226 tbl v_y2.16b, {v_y2.16b}, v7.16b
227 tbl v_y3.16b, {v_y3.16b}, v7.16b
// Combine the two half-products of each accumulator...
229 eor v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
230 eor v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
231 eor v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
232 eor v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b
// ...and fold in the new data: x[i] = (x[i] * p4) ^ y[i]  (GF(2) arithmetic).
234 eor v_x0.16b, v_tmp1_x0.16b, v_y0.16b
235 eor v_x1.16b, v_tmp1_x1.16b, v_y1.16b
236 eor v_x2.16b, v_tmp1_x2.16b, v_y2.16b
237 eor v_x3.16b, v_tmp1_x3.16b, v_y3.16b
241 /* carry less multiplication, part3 - after loop */
242 /* folding 512bit ---> 128bit */
250 // v0, v1, v6, v30 are temporary registers
// Fold the four accumulators into one using the 1-block-distance constant
// p1 = {0x4c1a0000, 0xfb0b0000}: x1 ^= fold(x0), x2 ^= fold(x1), x3 ^= fold(x2).
253 mov x_tmp, 0x4c1a0000 /* p1 [1] */
255 mov x_tmp, 0xfb0b0000 /* p1 [0] */
// Advance x_buf past the folded region (len & ~63); the table loop at the end
// then processes the remaining len % 64 tail bytes.
258 and w_counter, w_len, -64
259 sxtw x_tmp, w_counter
260 add x_buf, x_buf, x_tmp
264 pmull v6.1q, v6.1d, v0.1d
265 pmull v30.1q, v30.1d, v1.1d
266 eor v6.16b, v6.16b, v30.16b
267 eor v_x1.16b, v6.16b, v_x1.16b
271 pmull v6.1q, v6.1d, v0.1d
272 pmull v16.1q, v30.1d, v1.1d
273 eor v6.16b, v6.16b, v16.16b
274 eor v_x2.16b, v6.16b, v_x2.16b
278 pmull v0.1q, v_x0.1d, v0.1d
279 pmull v_x0.1q, v30.1d, v1.1d
280 eor v1.16b, v0.16b, v_x0.16b
281 eor v_x0.16b, v1.16b, v_x3.16b              // single 128-bit remainder in x0
283 /* carry less multiplication, part3 - after loop */
284 /* crc16 fold function */
// Register aliases local to the 128-bit -> 16-bit fold step.
285 d_16fold_p0_h .req d18
286 v_16fold_p0_h .req v18
288 d_16fold_p0_l .req d4
289 v_16fold_p0_l .req v4
291 v_16fold_from .req v_x0
292 d_16fold_from_h .req d3
293 v_16fold_from_h .req v3
295 v_16fold_zero .req v7
297 v_16fold_from1 .req v16
299 v_16fold_from2 .req v0
300 d_16fold_from2_h .req d6
301 v_16fold_from2_h .req v6
// Reduce the 128-bit remainder using p0 = {0x2d560000, 0x13680000}:
// shift/align with ext against a zero vector, multiply the high half, XOR.
305 movi v_16fold_zero.4s, 0
306 mov x_tmp1, 0x2d560000 /* p0 [1] */
307 mov x_tmp2, 0x13680000 /* p0 [0] */
309 ext v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
310 ext v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4
312 dup d_16fold_from_h, v_16fold_from.d[1]
313 fmov d_16fold_p0_h, x_tmp1
314 pmull v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
315 eor v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b
317 dup d_16fold_from2_h, v_16fold_from2.d[1]
318 fmov d_16fold_p0_l, x_tmp2
319 pmull v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
320 eor v_x0.16b, v0.16b, v6.16b
322 /* carry less multiplication, part3 - after loop */
323 /* crc16 barrett reduction function */
327 // barrett reduction constant: br[0], br[1]
// br[0] = 0x1f65a57f9 (33-bit quotient constant), assembled in x_tmp1 via
// mov/movk halfword pieces.
334 mov x_tmp1, 0x57f9 /* br[0] low */
335 movk x_tmp1, 0xf65a, lsl 16 /* br[0] high */
336 movk x_tmp1, 0x1, lsl 32
// Barrett step 1: multiply the top of the remainder by br[0].
341 ext v1.16b, v1.16b, v7.16b, #4
342 pmull v4.1q, v1.1d, v_br0.1d
// Barrett step 2: multiply by br[1] = 0x18bb70000 (polynomial 0x8bb7 shifted)
// and XOR to obtain the final 16-bit CRC in v_x0.
344 ext v1.16b, v4.16b, v7.16b, #4
345 mov x_tmp1, 0x8bb70000 /* br[1] low */
346 movk x_tmp1, 0x1, lsl 32 /* br[1] high */
349 pmull v_br1.1q, v1.1d, v_br1.1d
350 eor v_x0.16b, v_x0.16b, v_br1.16b
// Tail: re-enter the table-driven loop (label outside this excerpt) to
// process the remaining len % 64 bytes with the folded CRC as the new seed.
354 b .crc_table_loop_pre
356 .size crc16_t10dif_pmull, .-crc16_t10dif_pmull
// shuffle_mask: 16-byte index vector for `tbl` that reverses the byte order
// of a 128-bit register (indices 15..0). Used to convert little-endian loads
// into the MSB-first bit ordering the PMULL folding math expects.
// NOTE(review): the `shuffle_mask:` label line itself is outside this
// excerpt; .shuffle_mask_lanchor is defined here as the current location.
361 .shuffle_mask_lanchor = . + 0
362 .type shuffle_mask, %object
363 .size shuffle_mask, 16
365 .byte 15, 14, 13, 12, 11, 10, 9, 8
366 .byte 7, 6, 5, 4, 3, 2, 1, 0
// crc16tab: 256-entry halfword lookup table for the byte-at-a-time
// CRC-16/T10-DIF loop (crc = tab[(crc >> 8) ^ byte] ^ (crc << 8)).
// Entry [1] = 0x8bb7, the T10-DIF generator polynomial. Values are data —
// do not edit. NOTE(review): the `crc16tab:` label and any alignment
// directive are outside this excerpt.
370 .type crc16tab, %object
373 .hword 0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
374 .hword 0xEFBD, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
375 .hword 0x54CD, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
376 .hword 0xBB70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
377 .hword 0xA99A, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
378 .hword 0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
379 .hword 0xFD57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
380 .hword 0x12EA, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
381 .hword 0xD883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
382 .hword 0x373E, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
383 .hword 0x8C4E, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
384 .hword 0x63F3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
385 .hword 0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
386 .hword 0x9EA4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
387 .hword 0x25D4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
388 .hword 0xCA69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
389 .hword 0x3AB1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
390 .hword 0xD50C, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
391 .hword 0x6E7C, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
392 .hword 0x81C1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
393 .hword 0x932B, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
394 .hword 0x7C96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
395 .hword 0xC7E6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
396 .hword 0x285B, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
397 .hword 0xE232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
398 .hword 0x0D8F, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
399 .hword 0xB6FF, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
400 .hword 0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
401 .hword 0x4BA8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
402 .hword 0xA415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
403 .hword 0x1F65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
404 .hword 0xF0D8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3