]> git.proxmox.com Git - ceph.git/blame - ceph/src/isa-l/crc/aarch64/crc_common_pmull.h
Import ceph 15.2.8
[ceph.git] / ceph / src / isa-l / crc / aarch64 / crc_common_pmull.h
CommitLineData
f91f0fd5
TL
1########################################################################
2# Copyright (c) 2019 Microsoft Corporation.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions
6# are met:
7# * Redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer.
9# * Redistributions in binary form must reproduce the above copyright
10# notice, this list of conditions and the following disclaimer in
11# the documentation and/or other materials provided with the
12# distribution.
13# * Neither the name of Microsoft Corporation nor the names of its
14# contributors may be used to endorse or promote products derived
15# from this software without specific prior written permission.
16#
17# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28#########################################################################
29
30// parameters
31#define w_seed w0
32#define x_seed x0
33#define x_buf x1
34#define w_len w2
35#define x_len x2
36
37// return
38#define w_crc_ret w0
39#define x_crc_ret x0
40
41// constant
42#define FOLD_SIZE 64
43
44// global variables
45#define x_buf_end x3
46#define w_counter w4
47#define x_counter x4
48#define x_buf_iter x5
49#define x_crc_tab_addr x6
50#define x_tmp2 x6
51#define w_tmp w7
52#define x_tmp x7
53
54#define v_x0 v0
55#define d_x0 d0
56#define s_x0 s0
57
58#define q_x1 q1
59#define v_x1 v1
60
61#define q_x2 q2
62#define v_x2 v2
63
64#define q_x3 q3
65#define v_x3 v3
66#define d_x3 d3
67#define s_x3 s3
68
69#define q_y0 q4
70#define v_y0 v4
71#define v_tmp_high v4
72#define d_tmp_high d4
73
74#define q_y1 q5
75#define v_y1 v5
76#define v_tmp_low v5
77
78#define q_y2 q6
79#define v_y2 v6
80
81#define q_y3 q7
82#define v_y3 v7
83
84#define q_x0_tmp q30
85#define v_x0_tmp v30
86#define d_p4_high v30.d[1]
87#define d_p4_low d30
88#define v_p4 v30
89#define d_p1_high v30.d[1]
90#define d_p1_low d30
91#define v_p1 v30
92#define d_p0_high v30.d[1]
93#define d_p0_low d30
94#define v_p0 v30
95#define d_br_low d30
96#define d_br_low2 v30.d[1]
97#define v_br_low v30
98
99#define q_shuffle q31
100#define v_shuffle v31
101#define d_br_high d31
102#define d_br_high2 v31.d[1]
103#define v_br_high v31
104#define d_p0_low2 d31
105#define d_p0_high2 v31.d[1]
106#define v_p02 v31
107
108#define v_x0_high v16
109#define v_x1_high v17
110#define v_x2_high v18
111#define v_x3_high v19
112
113.macro crc_refl_load_first_block
114 ldr q_x0_tmp, [x_buf]
115 ldr q_x1, [x_buf, 16]
116 ldr q_x2, [x_buf, 32]
117 ldr q_x3, [x_buf, 48]
118
119 and x_counter, x_len, -64
120 sub x_tmp, x_counter, #64
121 cmp x_tmp, 63
122
123 add x_buf_iter, x_buf, 64
124
125 eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
126.endm
127
128.macro crc_norm_load_first_block
129 adrp x_tmp, .shuffle_data
130 ldr q_shuffle, [x_tmp, #:lo12:.shuffle_data]
131
132 ldr q_x0_tmp, [x_buf]
133 ldr q_x1, [x_buf, 16]
134 ldr q_x2, [x_buf, 32]
135 ldr q_x3, [x_buf, 48]
136
137 and x_counter, x_len, -64
138 sub x_tmp, x_counter, #64
139 cmp x_tmp, 63
140
141 add x_buf_iter, x_buf, 64
142
143 tbl v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
144 tbl v_x1.16b, {v_x1.16b}, v_shuffle.16b
145 tbl v_x2.16b, {v_x2.16b}, v_shuffle.16b
146 tbl v_x3.16b, {v_x3.16b}, v_shuffle.16b
147
148 eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
149.endm
150
151.macro crc32_load_p4
152 add x_buf_end, x_buf_iter, x_tmp
153
154 mov x_tmp, p4_low_b0
155 movk x_tmp, p4_low_b1, lsl 16
156 fmov d_p4_low, x_tmp
157
158 mov x_tmp2, p4_high_b0
159 movk x_tmp2, p4_high_b1, lsl 16
160 fmov d_p4_high, x_tmp2
161.endm
162
163.macro crc64_load_p4
164 add x_buf_end, x_buf_iter, x_tmp
165
166 mov x_tmp, p4_low_b0
167 movk x_tmp, p4_low_b1, lsl 16
168 movk x_tmp, p4_low_b2, lsl 32
169 movk x_tmp, p4_low_b3, lsl 48
170 fmov d_p4_low, x_tmp
171
172 mov x_tmp2, p4_high_b0
173 movk x_tmp2, p4_high_b1, lsl 16
174 movk x_tmp2, p4_high_b2, lsl 32
175 movk x_tmp2, p4_high_b3, lsl 48
176 fmov d_p4_high, x_tmp2
177.endm
178
179.macro crc_refl_loop
180 .align 3
181.clmul_loop:
182 // interleave ldr and pmull(2) for arch which can only issue quadword load every
183 // other cycle (i.e. A55)
184 ldr q_y0, [x_buf_iter]
185 pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
186 ldr q_y1, [x_buf_iter, 16]
187 pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
188 ldr q_y2, [x_buf_iter, 32]
189 pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
190 ldr q_y3, [x_buf_iter, 48]
191 pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d
192
193 pmull v_x0.1q, v_x0.1d, v_p4.1d
194 add x_buf_iter, x_buf_iter, 64
195 pmull v_x1.1q, v_x1.1d, v_p4.1d
196 cmp x_buf_iter, x_buf_end
197 pmull v_x2.1q, v_x2.1d, v_p4.1d
198 pmull v_x3.1q, v_x3.1d, v_p4.1d
199
200 eor v_x0.16b, v_x0.16b, v_x0_high.16b
201 eor v_x1.16b, v_x1.16b, v_x1_high.16b
202 eor v_x2.16b, v_x2.16b, v_x2_high.16b
203 eor v_x3.16b, v_x3.16b, v_x3_high.16b
204
205 eor v_x0.16b, v_x0.16b, v_y0.16b
206 eor v_x1.16b, v_x1.16b, v_y1.16b
207 eor v_x2.16b, v_x2.16b, v_y2.16b
208 eor v_x3.16b, v_x3.16b, v_y3.16b
209 bne .clmul_loop
210.endm
211
212.macro crc_norm_loop
213 .align 3
214.clmul_loop:
215 // interleave ldr and pmull(2) for arch which can only issue quadword load every
216 // other cycle (i.e. A55)
217 ldr q_y0, [x_buf_iter]
218 pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
219 ldr q_y1, [x_buf_iter, 16]
220 pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
221 ldr q_y2, [x_buf_iter, 32]
222 pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
223 ldr q_y3, [x_buf_iter, 48]
224 pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d
225
226 pmull v_x0.1q, v_x0.1d, v_p4.1d
227 add x_buf_iter, x_buf_iter, 64
228 pmull v_x1.1q, v_x1.1d, v_p4.1d
229 cmp x_buf_iter, x_buf_end
230 pmull v_x2.1q, v_x2.1d, v_p4.1d
231 pmull v_x3.1q, v_x3.1d, v_p4.1d
232
233 tbl v_y0.16b, {v_y0.16b}, v_shuffle.16b
234 tbl v_y1.16b, {v_y1.16b}, v_shuffle.16b
235 tbl v_y2.16b, {v_y2.16b}, v_shuffle.16b
236 tbl v_y3.16b, {v_y3.16b}, v_shuffle.16b
237
238 eor v_x0.16b, v_x0.16b, v_x0_high.16b
239 eor v_x1.16b, v_x1.16b, v_x1_high.16b
240 eor v_x2.16b, v_x2.16b, v_x2_high.16b
241 eor v_x3.16b, v_x3.16b, v_x3_high.16b
242
243 eor v_x0.16b, v_x0.16b, v_y0.16b
244 eor v_x1.16b, v_x1.16b, v_y1.16b
245 eor v_x2.16b, v_x2.16b, v_y2.16b
246 eor v_x3.16b, v_x3.16b, v_y3.16b
247 bne .clmul_loop
248.endm
249
250.macro crc32_fold_512b_to_128b
251 mov x_tmp, p1_low_b0
252 movk x_tmp, p1_low_b1, lsl 16
253 fmov d_p1_low, x_tmp
254
255 mov x_tmp2, p1_high_b0
256 movk x_tmp2, p1_high_b1, lsl 16
257 fmov d_p1_high, x_tmp2
258
259 pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d
260 pmull v_tmp_low.1q, v_x0.1d, v_p1.1d
261 eor v_x1.16b, v_x1.16b, v_tmp_high.16b
262 eor v_x1.16b, v_x1.16b, v_tmp_low.16b
263
264 pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d
265 pmull v_tmp_low.1q, v_x1.1d, v_p1.1d
266 eor v_x2.16b, v_x2.16b, v_tmp_high.16b
267 eor v_x2.16b, v_x2.16b, v_tmp_low.16b
268
269 pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d
270 pmull v_tmp_low.1q, v_x2.1d, v_p1.1d
271 eor v_x3.16b, v_x3.16b, v_tmp_high.16b
272 eor v_x3.16b, v_x3.16b, v_tmp_low.16b
273.endm
274
275.macro crc64_fold_512b_to_128b
276 mov x_tmp, p1_low_b0
277 movk x_tmp, p1_low_b1, lsl 16
278 movk x_tmp, p1_low_b2, lsl 32
279 movk x_tmp, p1_low_b3, lsl 48
280 fmov d_p1_low, x_tmp
281
282 mov x_tmp2, p1_high_b0
283 movk x_tmp2, p1_high_b1, lsl 16
284 movk x_tmp2, p1_high_b2, lsl 32
285 movk x_tmp2, p1_high_b3, lsl 48
286 fmov d_p1_high, x_tmp2
287
288 pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d
289 pmull v_tmp_low.1q, v_x0.1d, v_p1.1d
290 eor v_x1.16b, v_x1.16b, v_tmp_high.16b
291 eor v_x1.16b, v_x1.16b, v_tmp_low.16b
292
293 pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d
294 pmull v_tmp_low.1q, v_x1.1d, v_p1.1d
295 eor v_x2.16b, v_x2.16b, v_tmp_high.16b
296 eor v_x2.16b, v_x2.16b, v_tmp_low.16b
297
298 pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d
299 pmull v_tmp_low.1q, v_x2.1d, v_p1.1d
300 eor v_x3.16b, v_x3.16b, v_tmp_high.16b
301 eor v_x3.16b, v_x3.16b, v_tmp_low.16b
302.endm