#include "acconfig.h"
#include "include/int_types.h"
#include "common/crc32c_aarch64.h"
#include "arch/arm.h"

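/*
 * CRC32C (Castagnoli) for AArch64.  The scalar path uses the ARMv8 CRC32C
 * instructions; when PMULL is available, 1024-byte blocks are processed
 * "by 3" (three interleaved CRC streams) and the partial CRCs are merged
 * with carry-less multiplies.
 */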
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");

#ifdef HAVE_ARMV8_CRYPTO
/* Request crypto extension capabilities from the assembler */
asm(".arch_extension crypto");
#endif

#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))

#define CRC32C3X8(ITR) \
	__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
	__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
	__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

#define CRC32C3X8_ZERO \
	__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

#include <arm_acle.h>
#include <arm_neon.h>

#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))

#define CRC32C3X8(ITR) \
	crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
	crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
	crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C3X8_ZERO \
	crc0 = __crc32cd(crc0, (const uint64_t)0);

#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

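/*
 * One CRC32C3X8 step folds one 8-byte word into each of three CRC streams
 * spaced 42 words (336 bytes) apart; CRC32C7X3X8 below runs seven such steps,
 * so six invocations cover the 42*3*8 = 1008 interior bytes of a 1024-byte
 * block (the first and last 8-byte words are handled separately).
 */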
#define CRC32C7X3X8(ITR) do {\
	CRC32C3X8((ITR)*7+0) \
	CRC32C3X8((ITR)*7+1) \
	CRC32C3X8((ITR)*7+2) \
	CRC32C3X8((ITR)*7+3) \
	CRC32C3X8((ITR)*7+4) \
	CRC32C3X8((ITR)*7+5) \
	CRC32C3X8((ITR)*7+6) \
	} while(0)

#define CRC32C7X3X8_ZERO do {\
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	CRC32C3X8_ZERO \
	} while(0)

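/*
 * PREF1KL1/PREF1KL2 prefetch one full 1 KB block (sixteen 64-byte cache
 * lines) at the given offset from `buffer` into L1/L2 respectively.
 */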
#define PREF4X64L1(PREF_OFFSET, ITR) \
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(PREF_OFFSET) \
	PREF4X64L1((PREF_OFFSET), 0) \
	PREF4X64L1((PREF_OFFSET), 4) \
	PREF4X64L1((PREF_OFFSET), 8) \
	PREF4X64L1((PREF_OFFSET), 12)

#define PREF4X64L2(PREF_OFFSET, ITR) \
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(PREF_OFFSET) \
	PREF4X64L2((PREF_OFFSET), 0) \
	PREF4X64L2((PREF_OFFSET), 4) \
	PREF4X64L2((PREF_OFFSET), 8) \
	PREF4X64L2((PREF_OFFSET), 12)


uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
{
	int64_t length = len;
	uint32_t crc0, crc1, crc2;

	if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
		if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
			/* Calculate reflected crc with PMULL Instruction */
			const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
			uint64_t t0, t1;
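
			/*
			 * k1/k2 are CRC32C folding constants (powers of x modulo the
			 * Castagnoli polynomial, chosen for this 1024-byte layout):
			 * multiplying a stream's partial CRC by them with PMULL and
			 * reducing via crc32cx advances that stream to the end of the
			 * block so the three streams can simply be XORed together.
			 */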
			/* crc done "by 3" for fixed input block size of 1024 bytes */
			while ((length -= 1024) >= 0) {
				/* Prefetch data for following block to avoid cache miss */
				PREF1KL2(1024*3);
				/* Do first 8 bytes here for better pipelining */
				crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
				crc1 = 0;
				crc2 = 0;
				buffer += sizeof(uint64_t);

				/* Process block inline
				   Process crc0 last to avoid dependency with above */
				CRC32C7X3X8(0);
				CRC32C7X3X8(1);
				CRC32C7X3X8(2);
				CRC32C7X3X8(3);
				CRC32C7X3X8(4);
				CRC32C7X3X8(5);

				buffer += 42*3*sizeof(uint64_t);
				/* Prefetch data for following block to avoid cache miss */
				PREF1KL1(1024);

				/* Merge crc0 and crc1 into crc2
				   crc1 multiply by K2
				   crc0 multiply by K1 */

				t1 = (uint64_t)vmull_p64(crc1, k2);
				t0 = (uint64_t)vmull_p64(crc0, k1);
				crc = __crc32cd(crc2, *(const uint64_t *)buffer);
				crc1 = __crc32cd(0, t1);
				crc ^= crc1;
				crc0 = __crc32cd(0, t0);
				crc ^= crc0;

				buffer += sizeof(uint64_t);
			}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
			__asm__("mov    x16, #0xf38a         \n\t"
				"movk   x16, #0xe417, lsl 16 \n\t"
				"mov    v1.2d[0], x16        \n\t"
				"mov    x16, #0x8014         \n\t"
				"movk   x16, #0x8f15, lsl 16 \n\t"
				"mov    v0.2d[0], x16        \n\t"
				:::"x16","v0","v1");

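			/*
			 * The inline asm above builds the same folding constants as the
			 * intrinsics path and parks them in NEON registers for the pmull
			 * instructions below: v1 = k1 (0xe417f38a), v0 = k2 (0x8f158014).
			 */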
			while ((length -= 1024) >= 0) {
				PREF1KL2(1024*3);
				__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
					:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
				crc1 = 0;
				crc2 = 0;
				buffer += sizeof(uint64_t);

				CRC32C7X3X8(0);
				CRC32C7X3X8(1);
				CRC32C7X3X8(2);
				CRC32C7X3X8(3);
				CRC32C7X3X8(4);
				CRC32C7X3X8(5);

				buffer += 42*3*sizeof(uint64_t);
				PREF1KL1(1024);
				__asm__("mov    v2.2d[0], %x[c1]      \n\t"
					"pmull  v2.1q, v2.1d, v0.1d   \n\t"
					"mov    v3.2d[0], %x[c0]      \n\t"
					"pmull  v3.1q, v3.1d, v1.1d   \n\t"
					"crc32cx %w[c], %w[c2], %x[v] \n\t"
					"mov    %x[c1], v2.2d[0]      \n\t"
					"crc32cx %w[c1], wzr, %x[c1]  \n\t"
					"eor    %w[c], %w[c], %w[c1]  \n\t"
					"mov    %x[c0], v3.2d[0]      \n\t"
					"crc32cx %w[c0], wzr, %x[c0]  \n\t"
					"eor    %w[c], %w[c], %w[c0]  \n\t"
					:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
					:[v]"r"(*((const uint64_t *)buffer))
					:"v0","v1","v2","v3");
				buffer += sizeof(uint64_t);
			}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

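			/* The loop test subtracted 1024 once too many; restore it and
			   return early if no tail bytes remain. */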
			if(!(length += 1024))
				return crc;
		}
#endif /* HAVE_ARMV8_CRYPTO */
		while ((length -= sizeof(uint64_t)) >= 0) {
			CRC32CX(crc, *(uint64_t *)buffer);
			buffer += sizeof(uint64_t);
		}

		/* The following is more efficient than the straight loop */
		if (length & sizeof(uint32_t)) {
			CRC32CW(crc, *(uint32_t *)buffer);
			buffer += sizeof(uint32_t);
		}
		if (length & sizeof(uint16_t)) {
			CRC32CH(crc, *(uint16_t *)buffer);
			buffer += sizeof(uint16_t);
		}
		if (length & sizeof(uint8_t))
			CRC32CB(crc, *buffer);
	} else {
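		/*
		 * buffer == NULL: extend the CRC over `len` zero bytes without
		 * reading memory (same structure as the branch above, with xzr/0
		 * in place of loaded data).
		 */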
#ifdef HAVE_ARMV8_CRYPTO
		if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
			const poly64_t k1 = 0xe417f38a;
			uint64_t t0;

			while ((length -= 1024) >= 0) {
				crc0 = __crc32cd(crc, 0);

				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;

				/* Merge crc0 into crc: crc0 multiply by K1 */

				t0 = (uint64_t)vmull_p64(crc0, k1);
				crc = __crc32cd(0, t0);
			}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
			__asm__("mov    x16, #0xf38a         \n\t"
				"movk   x16, #0xe417, lsl 16 \n\t"
				"mov    v1.2d[0], x16        \n\t"
				:::"x16","v1");

			while ((length -= 1024) >= 0) {
				__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
					:[c0]"=r"(crc0):[c]"r"(crc));

				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;
				CRC32C7X3X8_ZERO;

				__asm__("mov    v3.2d[0], %x[c0]    \n\t"
					"pmull  v3.1q, v3.1d, v1.1d \n\t"
					"mov    %x[c0], v3.2d[0]    \n\t"
					"crc32cx %w[c], wzr, %x[c0] \n\t"
					:[c]"=r"(crc)
					:[c0]"r"(crc0)
					:"v1","v3");
			}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

			if(!(length += 1024))
				return crc;
		}
#endif /* HAVE_ARMV8_CRYPTO */
		while ((length -= sizeof(uint64_t)) >= 0)
			CRC32CX(crc, 0);

		/* The following is more efficient than the straight loop */
		if (length & sizeof(uint32_t))
			CRC32CW(crc, 0);

		if (length & sizeof(uint16_t))
			CRC32CH(crc, 0);

		if (length & sizeof(uint8_t))
			CRC32CB(crc, 0);
	}
	return crc;
}
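
/*
 * Usage sketch (assumed caller convention, not part of this file): the seed
 * is whatever running CRC the caller maintains, e.g.
 *
 *   uint32_t crc = ceph_crc32c_aarch64(0, data, data_len);
 *   crc = ceph_crc32c_aarch64(crc, NULL, pad_len);   // pad_len zero bytes
 *
 * provided runtime feature probing (see arch/arm.h) selected this backend.
 */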