]>
Commit | Line | Data |
---|---|---|
f91f0fd5 TL |
1 | ######################################################################## |
2 | # Copyright (c) 2019 Microsoft Corporation. | |
3 | # | |
4 | # Redistribution and use in source and binary forms, with or without | |
5 | # modification, are permitted provided that the following conditions | |
6 | # are met: | |
7 | # * Redistributions of source code must retain the above copyright | |
8 | # notice, this list of conditions and the following disclaimer. | |
9 | # * Redistributions in binary form must reproduce the above copyright | |
10 | # notice, this list of conditions and the following disclaimer in | |
11 | # the documentation and/or other materials provided with the | |
12 | # distribution. | |
13 | # * Neither the name of Microsoft Corporation nor the names of its | |
14 | # contributors may be used to endorse or promote products derived | |
15 | # from this software without specific prior written permission. | |
16 | # | |
17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ######################################################################### | |
29 | ||
// General-purpose register aliases used by the macros in this file.
// x0/w0 carry the seed, x1 the buffer pointer and x2/w2 the length
// (presumably the initial CRC value and a byte count - confirm against the
// routine that includes this file).
30 | // parameters | |
31 | #define w_seed w0 | |
32 | #define x_seed x0 | |
33 | #define x_buf x1 | |
34 | #define w_len w2 | |
35 | #define x_len x2 | |
36 | ||
// The result is returned in the same register that carried the seed (x0/w0).
37 | // return | |
38 | #define w_crc_ret w0 | |
39 | #define x_crc_ret x0 | |
40 | ||
// NOTE(review): FOLD_SIZE is not referenced by the macros visible in this part
// of the file; they spell the block size as a literal 64.
41 | // constant | |
42 | #define FOLD_SIZE 64 | |
43 | ||
// Scratch registers shared by all macros (register aliases, not memory
// variables). x_crc_tab_addr and x_tmp2 both name x6, so only one of the two
// roles can be live at a time. w_counter/x_counter and w_tmp/x_tmp are the
// 32-bit and 64-bit views of x4 and x7 respectively.
44 | // global variables | |
45 | #define x_buf_end x3 | |
46 | #define w_counter w4 | |
47 | #define x_counter x4 | |
48 | #define x_buf_iter x5 | |
49 | #define x_crc_tab_addr x6 | |
50 | #define x_tmp2 x6 | |
51 | #define w_tmp w7 | |
52 | #define x_tmp x7 | |
53 | ||
// SIMD register aliases, part 1: working data.
// v0..v3 (x0..x3) are the four 128-bit accumulators that the loop and fold
// macros update in place. The q_/v_/d_/s_ prefixes are different-width views
// of the same physical register.
54 | #define v_x0 v0 | |
55 | #define d_x0 d0 | |
56 | #define s_x0 s0 | |
57 | ||
58 | #define q_x1 q1 | |
59 | #define v_x1 v1 | |
60 | ||
61 | #define q_x2 q2 | |
62 | #define v_x2 v2 | |
63 | ||
64 | #define q_x3 q3 | |
65 | #define v_x3 v3 | |
66 | #define d_x3 d3 | |
67 | #define s_x3 s3 | |
68 | ||
// v4..v7 (y0..y3) receive the 64 bytes loaded on each loop iteration.
// v_tmp_high and v_tmp_low reuse v4 and v5; the fold macros use them as
// temporaries, so y0/y1 contents do not survive a fold.
69 | #define q_y0 q4 | |
70 | #define v_y0 v4 | |
71 | #define v_tmp_high v4 | |
72 | #define d_tmp_high d4 | |
73 | ||
74 | #define q_y1 q5 | |
75 | #define v_y1 v5 | |
76 | #define v_tmp_low v5 | |
77 | ||
78 | #define q_y2 q6 | |
79 | #define v_y2 v6 | |
80 | ||
81 | #define q_y3 q7 | |
82 | #define v_y3 v7 | |
83 | ||
// SIMD register aliases, part 2: constants and temporaries.
// v30 is time-shared: x0_tmp (first loaded block), then the p4, p1, p0 and
// br_low constants all map onto it, so loading one overwrites the previous.
// The *_low names are the D-register view (low 64 bits); the *_high / *2
// names expand to the lane syntax v30.d[1] (high 64 bits) and are used as
// fmov destinations by the macros in this file.
84 | #define q_x0_tmp q30 | |
85 | #define v_x0_tmp v30 | |
86 | #define d_p4_high v30.d[1] | |
87 | #define d_p4_low d30 | |
88 | #define v_p4 v30 | |
89 | #define d_p1_high v30.d[1] | |
90 | #define d_p1_low d30 | |
91 | #define v_p1 v30 | |
92 | #define d_p0_high v30.d[1] | |
93 | #define d_p0_low d30 | |
94 | #define v_p0 v30 | |
95 | #define d_br_low d30 | |
96 | #define d_br_low2 v30.d[1] | |
97 | #define v_br_low v30 | |
98 | ||
// v31 is likewise shared between the tbl shuffle mask, br_high and a second
// p0 constant (p02).
99 | #define q_shuffle q31 | |
100 | #define v_shuffle v31 | |
101 | #define d_br_high d31 | |
102 | #define d_br_high2 v31.d[1] | |
103 | #define v_br_high v31 | |
104 | #define d_p0_low2 d31 | |
105 | #define d_p0_high2 v31.d[1] | |
106 | #define v_p02 v31 | |
107 | ||
// v16..v19 hold the pmull2 (high-half) products of x0..x3 inside the loop
// macros until they are XORed back into the accumulators.
108 | #define v_x0_high v16 | |
109 | #define v_x1_high v17 | |
110 | #define v_x2_high v18 | |
111 | #define v_x3_high v19 | |
112 | ||
// crc_refl_load_first_block
// Loads the first 64 bytes at x_buf into v_x0_tmp, v_x1, v_x2 and v_x3, XORs
// the first 16 of them into whatever v_x0 holds on entry (presumably the seed
// placed there by the including routine - confirm), and sets up the loop
// bookkeeping registers.
// In:       x_buf, x_len, v_x0
// Out:      x_counter  = x_len rounded down to a multiple of 64
//           x_tmp      = x_counter - 64
//           x_buf_iter = x_buf + 64
//           v_x0..v_x3 = first block (v_x0 combined with its entry value)
//           flags      = x_tmp compared with 63; nothing in this macro
//                        consumes them, they are left for the code that
//                        follows the expansion
// Clobbers: v_x0_tmp (v30)
113 | .macro crc_refl_load_first_block | |
// Four unconditional 16-byte loads: bytes x_buf+0 .. x_buf+63 must be readable.
114 | ldr q_x0_tmp, [x_buf] | |
115 | ldr q_x1, [x_buf, 16] | |
116 | ldr q_x2, [x_buf, 32] | |
117 | ldr q_x3, [x_buf, 48] | |
118 | ||
// Clear the low six bits of the length, then subtract the block just loaded.
119 | and x_counter, x_len, -64 | |
120 | sub x_tmp, x_counter, #64 | |
121 | cmp x_tmp, 63 | |
122 | ||
// add (not adds) and the vector eor below leave the compare flags intact.
123 | add x_buf_iter, x_buf, 64 | |
124 | ||
125 | eor v_x0.16b, v_x0.16b, v_x0_tmp.16b | |
126 | .endm | |
127 | ||
// crc_norm_load_first_block
// Same job as crc_refl_load_first_block, with one extra step: every loaded
// 16-byte vector is permuted through tbl with the mask read from
// .shuffle_data before it is used. .shuffle_data is not defined in this part
// of the file (presumably a byte-reversal mask for the non-reflected CRC
// variants - confirm).
// In:       x_buf, x_len, v_x0
// Out:      x_counter  = x_len rounded down to a multiple of 64
//           x_tmp      = x_counter - 64
//           x_buf_iter = x_buf + 64
//           v_shuffle  = 16-byte mask loaded from .shuffle_data
//           v_x0..v_x3 = permuted first block (v_x0 combined with its entry
//                        value)
//           flags      = x_tmp compared with 63, left for the code that
//                        follows the expansion
// Clobbers: v_x0_tmp (v30)
128 | .macro crc_norm_load_first_block | |
// x_tmp is only a scratch address here; it is overwritten further down.
129 | adrp x_tmp, .shuffle_data | |
130 | ldr q_shuffle, [x_tmp, #:lo12:.shuffle_data] | |
131 | ||
// Four unconditional 16-byte loads: bytes x_buf+0 .. x_buf+63 must be readable.
132 | ldr q_x0_tmp, [x_buf] | |
133 | ldr q_x1, [x_buf, 16] | |
134 | ldr q_x2, [x_buf, 32] | |
135 | ldr q_x3, [x_buf, 48] | |
136 | ||
137 | and x_counter, x_len, -64 | |
138 | sub x_tmp, x_counter, #64 | |
139 | cmp x_tmp, 63 | |
140 | ||
141 | add x_buf_iter, x_buf, 64 | |
142 | ||
// Reorder the bytes of each loaded vector in place according to v_shuffle.
143 | tbl v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b | |
144 | tbl v_x1.16b, {v_x1.16b}, v_shuffle.16b | |
145 | tbl v_x2.16b, {v_x2.16b}, v_shuffle.16b | |
146 | tbl v_x3.16b, {v_x3.16b}, v_shuffle.16b | |
147 | ||
// The XOR into v_x0 uses the already-permuted first vector.
148 | eor v_x0.16b, v_x0.16b, v_x0_tmp.16b | |
149 | .endm | |
150 | ||
// crc32_load_p4
// Computes the loop end pointer and loads the p4 constant pair into v_p4
// (v30), each 64-bit lane built from two 16-bit pieces.
// In:       x_buf_iter, x_tmp (the *_load_first_block macros in this file
//           leave x_tmp = x_counter - 64)
// Out:      x_buf_end = x_buf_iter + x_tmp
//           v_p4 low lane  = value assembled from p4_low_b0 / p4_low_b1
//           v_p4 high lane = value assembled from p4_high_b0 / p4_high_b1
// Clobbers: x_tmp, x_tmp2 (x6, also named x_crc_tab_addr), v30 (also named
//           v_x0_tmp)
// The p4_* symbols are not defined in this part of the file; they are
// expected to come from the file that includes this one.
151 | .macro crc32_load_p4 | |
// Must run first: x_tmp is reused as a scratch register right below.
152 | add x_buf_end, x_buf_iter, x_tmp | |
153 | ||
// mov sets the register from the first piece, movk inserts bits 16..31.
154 | mov x_tmp, p4_low_b0 | |
155 | movk x_tmp, p4_low_b1, lsl 16 | |
156 | fmov d_p4_low, x_tmp | |
157 | ||
// The low lane is written first, then lane d[1] receives the high constant.
158 | mov x_tmp2, p4_high_b0 | |
159 | movk x_tmp2, p4_high_b1, lsl 16 | |
160 | fmov d_p4_high, x_tmp2 | |
161 | .endm | |
162 | ||
// crc64_load_p4
// 64-bit-constant counterpart of crc32_load_p4: computes the loop end pointer
// and loads the p4 constant pair into v_p4 (v30), each 64-bit lane built from
// four 16-bit pieces (b0 at bits 0..15 up to b3 at bits 48..63).
// In:       x_buf_iter, x_tmp (the *_load_first_block macros in this file
//           leave x_tmp = x_counter - 64)
// Out:      x_buf_end = x_buf_iter + x_tmp
//           v_p4 low lane  = value assembled from p4_low_b0 .. p4_low_b3
//           v_p4 high lane = value assembled from p4_high_b0 .. p4_high_b3
// Clobbers: x_tmp, x_tmp2 (x6, also named x_crc_tab_addr), v30 (also named
//           v_x0_tmp)
// The p4_* symbols are not defined in this part of the file; they are
// expected to come from the file that includes this one.
163 | .macro crc64_load_p4 | |
// Must run first: x_tmp is reused as a scratch register right below.
164 | add x_buf_end, x_buf_iter, x_tmp | |
165 | ||
166 | mov x_tmp, p4_low_b0 | |
167 | movk x_tmp, p4_low_b1, lsl 16 | |
168 | movk x_tmp, p4_low_b2, lsl 32 | |
169 | movk x_tmp, p4_low_b3, lsl 48 | |
170 | fmov d_p4_low, x_tmp | |
171 | ||
// The low lane is written first, then lane d[1] receives the high constant.
172 | mov x_tmp2, p4_high_b0 | |
173 | movk x_tmp2, p4_high_b1, lsl 16 | |
174 | movk x_tmp2, p4_high_b2, lsl 32 | |
175 | movk x_tmp2, p4_high_b3, lsl 48 | |
176 | fmov d_p4_high, x_tmp2 | |
177 | .endm | |
178 | ||
// crc_refl_loop
// Main folding loop. Each iteration reads 64 bytes at x_buf_iter into
// y0..y3 and updates the four accumulators as
//     x_i = pmull(x_i.low, p4.low) ^ pmull2(x_i.high, p4.high) ^ y_i
// for i = 0..3, then advances x_buf_iter by 64.
// In:       v_x0..v_x3, v_p4, x_buf_iter, x_buf_end
// Out:      v_x0..v_x3 updated, x_buf_iter == x_buf_end
// Clobbers: v_y0..v_y3 (v4..v7), v_x0_high..v_x3_high (v16..v19), flags
// The body runs before the exit test and the test is an equality check, so
// on entry x_buf_end must be x_buf_iter plus a positive multiple of 64.
// NOTE(review): .clmul_loop is an ordinary label, not a macro-local one, and
// crc_norm_loop defines the same name; expanding both macros, or one of them
// twice, in the same assembly unit would define the label twice.
179 | .macro crc_refl_loop | |
// Presumably 8-byte alignment of the loop head (power-of-two form of .align)
// - confirm for the toolchain in use.
180 | .align 3 | |
181 | .clmul_loop: | |
182 | // interleave ldr and pmull(2) for arch which can only issue quadword load every |
183 | // other cycle (i.e. A55) | |
// High-half products go to separate registers (v16..v19) because the
// accumulators are still needed for the low-half multiply below.
184 | ldr q_y0, [x_buf_iter] | |
185 | pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d | |
186 | ldr q_y1, [x_buf_iter, 16] | |
187 | pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d | |
188 | ldr q_y2, [x_buf_iter, 32] | |
189 | pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d | |
190 | ldr q_y3, [x_buf_iter, 48] | |
191 | pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d | |
192 | ||
// Low-half products overwrite the accumulators in place. The pointer update
// and the exit compare are slotted between the multiplies; none of the SIMD
// instructions that follow change the flags, so bne still sees this compare.
193 | pmull v_x0.1q, v_x0.1d, v_p4.1d | |
194 | add x_buf_iter, x_buf_iter, 64 | |
195 | pmull v_x1.1q, v_x1.1d, v_p4.1d | |
196 | cmp x_buf_iter, x_buf_end | |
197 | pmull v_x2.1q, v_x2.1d, v_p4.1d | |
198 | pmull v_x3.1q, v_x3.1d, v_p4.1d | |
199 | ||
// Combine low and high products.
200 | eor v_x0.16b, v_x0.16b, v_x0_high.16b | |
201 | eor v_x1.16b, v_x1.16b, v_x1_high.16b | |
202 | eor v_x2.16b, v_x2.16b, v_x2_high.16b | |
203 | eor v_x3.16b, v_x3.16b, v_x3_high.16b | |
204 | ||
// Mix in the freshly loaded data.
205 | eor v_x0.16b, v_x0.16b, v_y0.16b | |
206 | eor v_x1.16b, v_x1.16b, v_y1.16b | |
207 | eor v_x2.16b, v_x2.16b, v_y2.16b | |
208 | eor v_x3.16b, v_x3.16b, v_y3.16b | |
209 | bne .clmul_loop | |
210 | .endm | |
211 | ||
// crc_norm_loop
// Main folding loop for the variant that permutes input bytes. Identical to
// crc_refl_loop except that each freshly loaded vector y_i is passed through
// tbl with v_shuffle before it is XORed into the accumulators:
//     x_i = pmull(x_i.low, p4.low) ^ pmull2(x_i.high, p4.high) ^ tbl(y_i)
// for i = 0..3, then x_buf_iter advances by 64.
// In:       v_x0..v_x3, v_p4, v_shuffle (loaded by
//           crc_norm_load_first_block), x_buf_iter, x_buf_end
// Out:      v_x0..v_x3 updated, x_buf_iter == x_buf_end
// Clobbers: v_y0..v_y3 (v4..v7), v_x0_high..v_x3_high (v16..v19), flags
// The body runs before the exit test and the test is an equality check, so
// on entry x_buf_end must be x_buf_iter plus a positive multiple of 64.
// NOTE(review): .clmul_loop is an ordinary label, not a macro-local one, and
// crc_refl_loop defines the same name; expanding both macros, or one of them
// twice, in the same assembly unit would define the label twice.
212 | .macro crc_norm_loop | |
// Presumably 8-byte alignment of the loop head (power-of-two form of .align)
// - confirm for the toolchain in use.
213 | .align 3 | |
214 | .clmul_loop: | |
215 | // interleave ldr and pmull(2) for arch which can only issue quadword load every |
216 | // other cycle (i.e. A55) | |
// High-half products go to separate registers (v16..v19) because the
// accumulators are still needed for the low-half multiply below.
217 | ldr q_y0, [x_buf_iter] | |
218 | pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d | |
219 | ldr q_y1, [x_buf_iter, 16] | |
220 | pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d | |
221 | ldr q_y2, [x_buf_iter, 32] | |
222 | pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d | |
223 | ldr q_y3, [x_buf_iter, 48] | |
224 | pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d | |
225 | ||
// Low-half products overwrite the accumulators in place. The pointer update
// and the exit compare are slotted between the multiplies; none of the SIMD
// instructions that follow change the flags, so bne still sees this compare.
226 | pmull v_x0.1q, v_x0.1d, v_p4.1d | |
227 | add x_buf_iter, x_buf_iter, 64 | |
228 | pmull v_x1.1q, v_x1.1d, v_p4.1d | |
229 | cmp x_buf_iter, x_buf_end | |
230 | pmull v_x2.1q, v_x2.1d, v_p4.1d | |
231 | pmull v_x3.1q, v_x3.1d, v_p4.1d | |
232 | ||
// Reorder the bytes of the new data in place according to v_shuffle.
233 | tbl v_y0.16b, {v_y0.16b}, v_shuffle.16b | |
234 | tbl v_y1.16b, {v_y1.16b}, v_shuffle.16b | |
235 | tbl v_y2.16b, {v_y2.16b}, v_shuffle.16b | |
236 | tbl v_y3.16b, {v_y3.16b}, v_shuffle.16b | |
237 | ||
// Combine low and high products.
238 | eor v_x0.16b, v_x0.16b, v_x0_high.16b | |
239 | eor v_x1.16b, v_x1.16b, v_x1_high.16b | |
240 | eor v_x2.16b, v_x2.16b, v_x2_high.16b | |
241 | eor v_x3.16b, v_x3.16b, v_x3_high.16b | |
242 | ||
// Mix in the permuted data.
243 | eor v_x0.16b, v_x0.16b, v_y0.16b | |
244 | eor v_x1.16b, v_x1.16b, v_y1.16b | |
245 | eor v_x2.16b, v_x2.16b, v_y2.16b | |
246 | eor v_x3.16b, v_x3.16b, v_y3.16b | |
247 | bne .clmul_loop | |
248 | .endm | |
249 | ||
// crc32_fold_512b_to_128b
// Collapses the four 128-bit accumulators into one. Loads the p1 constant
// pair into v_p1 (each lane built from two 16-bit pieces), then chains three
// folds:
//     x1 ^= pmull2(x0.high, p1.high) ^ pmull(x0.low, p1.low)
//     x2 ^= pmull2(x1.high, p1.high) ^ pmull(x1.low, p1.low)
//     x3 ^= pmull2(x2.high, p1.high) ^ pmull(x2.low, p1.low)
// Each step consumes the result of the previous one, so the order matters.
// In:       v_x0..v_x3
// Out:      v_x3 = folded 128-bit value
// Clobbers: x_tmp, x_tmp2, v_x1, v_x2, v_tmp_high (v4), v_tmp_low (v5) and
//           v30 (v_p1 shares it with v_p4, so the p4 constant is lost)
// The p1_* symbols are not defined in this part of the file; they are
// expected to come from the file that includes this one.
250 | .macro crc32_fold_512b_to_128b | |
// Low lane is written first, then lane d[1] receives the high constant.
251 | mov x_tmp, p1_low_b0 | |
252 | movk x_tmp, p1_low_b1, lsl 16 | |
253 | fmov d_p1_low, x_tmp | |
254 | ||
255 | mov x_tmp2, p1_high_b0 | |
256 | movk x_tmp2, p1_high_b1, lsl 16 | |
257 | fmov d_p1_high, x_tmp2 | |
258 | ||
// Fold x0 into x1.
259 | pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d | |
260 | pmull v_tmp_low.1q, v_x0.1d, v_p1.1d | |
261 | eor v_x1.16b, v_x1.16b, v_tmp_high.16b | |
262 | eor v_x1.16b, v_x1.16b, v_tmp_low.16b | |
263 | ||
// Fold the updated x1 into x2.
264 | pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d | |
265 | pmull v_tmp_low.1q, v_x1.1d, v_p1.1d | |
266 | eor v_x2.16b, v_x2.16b, v_tmp_high.16b | |
267 | eor v_x2.16b, v_x2.16b, v_tmp_low.16b | |
268 | ||
// Fold the updated x2 into x3; x3 now carries the combined state.
269 | pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d | |
270 | pmull v_tmp_low.1q, v_x2.1d, v_p1.1d | |
271 | eor v_x3.16b, v_x3.16b, v_tmp_high.16b | |
272 | eor v_x3.16b, v_x3.16b, v_tmp_low.16b | |
273 | .endm | |
274 | ||
// crc64_fold_512b_to_128b
// 64-bit-constant counterpart of crc32_fold_512b_to_128b. Loads the p1
// constant pair into v_p1 (each lane built from four 16-bit pieces, b0 at
// bits 0..15 up to b3 at bits 48..63), then chains three folds:
//     x1 ^= pmull2(x0.high, p1.high) ^ pmull(x0.low, p1.low)
//     x2 ^= pmull2(x1.high, p1.high) ^ pmull(x1.low, p1.low)
//     x3 ^= pmull2(x2.high, p1.high) ^ pmull(x2.low, p1.low)
// Each step consumes the result of the previous one, so the order matters.
// In:       v_x0..v_x3
// Out:      v_x3 = folded 128-bit value
// Clobbers: x_tmp, x_tmp2, v_x1, v_x2, v_tmp_high (v4), v_tmp_low (v5) and
//           v30 (v_p1 shares it with v_p4, so the p4 constant is lost)
// The p1_* symbols are not defined in this part of the file; they are
// expected to come from the file that includes this one.
275 | .macro crc64_fold_512b_to_128b | |
// Low lane is written first, then lane d[1] receives the high constant.
276 | mov x_tmp, p1_low_b0 | |
277 | movk x_tmp, p1_low_b1, lsl 16 | |
278 | movk x_tmp, p1_low_b2, lsl 32 | |
279 | movk x_tmp, p1_low_b3, lsl 48 | |
280 | fmov d_p1_low, x_tmp | |
281 | ||
282 | mov x_tmp2, p1_high_b0 | |
283 | movk x_tmp2, p1_high_b1, lsl 16 | |
284 | movk x_tmp2, p1_high_b2, lsl 32 | |
285 | movk x_tmp2, p1_high_b3, lsl 48 | |
286 | fmov d_p1_high, x_tmp2 | |
287 | ||
// Fold x0 into x1.
288 | pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d | |
289 | pmull v_tmp_low.1q, v_x0.1d, v_p1.1d | |
290 | eor v_x1.16b, v_x1.16b, v_tmp_high.16b | |
291 | eor v_x1.16b, v_x1.16b, v_tmp_low.16b | |
292 | ||
// Fold the updated x1 into x2.
293 | pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d | |
294 | pmull v_tmp_low.1q, v_x1.1d, v_p1.1d | |
295 | eor v_x2.16b, v_x2.16b, v_tmp_high.16b | |
296 | eor v_x2.16b, v_x2.16b, v_tmp_low.16b | |
297 | ||
// Fold the updated x2 into x3; x3 now carries the combined state.
298 | pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d | |
299 | pmull v_tmp_low.1q, v_x2.1d, v_p1.1d | |
300 | eor v_x3.16b, v_x3.16b, v_tmp_high.16b | |
301 | eor v_x3.16b, v_x3.16b, v_tmp_low.16b | |
302 | .endm | |