#include "acconfig.h"
#include "include/int_types.h"
#include "common/crc32c_aarch64.h"
#include "arch/arm.h"   /* ceph_arch_aarch64_pmull runtime flag */
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");

#ifdef HAVE_ARMV8_CRYPTO
/* Request crypto extension capabilities from the assembler */
asm(".arch_extension crypto");
#endif
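/*
 * CRC32C{X,W,H,B} fold one 8-, 4-, 2- or 1-byte value into the running CRC
 * using the ARMv8 crc32cx/crc32cw/crc32ch/crc32cb instructions, either via
 * inline assembly (this branch) or via the ACLE intrinsics (the #else branch).
 */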
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32C3X8(ITR) \
    __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
    __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
    __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

#define CRC32C3X8_ZERO \
    __asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

#include <arm_acle.h>   /* __crc32cd() and friends */
#include <arm_neon.h>   /* poly64_t, vmull_p64() */

#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
#define CRC32C3X8(ITR) \
    crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
    crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
    crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C3X8_ZERO \
    crc0 = __crc32cd(crc0, (const uint64_t)0);
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
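/*
 * The 1024-byte main-loop block is processed as three interleaved lanes of
 * 42 uint64_t words each (42*3*8 = 1008 bytes, plus 8 bytes before and 8
 * bytes after the lanes).  crc0/crc1/crc2 each accumulate one lane so the
 * three crc32cx dependency chains can run in parallel.  CRC32C7X3X8(ITR)
 * advances every lane by seven words; six invocations cover a whole block.
 */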
#define CRC32C7X3X8(ITR) do {\
    CRC32C3X8((ITR)*7+0) \
    CRC32C3X8((ITR)*7+1) \
    CRC32C3X8((ITR)*7+2) \
    CRC32C3X8((ITR)*7+3) \
    CRC32C3X8((ITR)*7+4) \
    CRC32C3X8((ITR)*7+5) \
    CRC32C3X8((ITR)*7+6) \
    } while(0)

#define CRC32C7X3X8_ZERO do {\
    CRC32C3X8_ZERO \
    CRC32C3X8_ZERO \
    CRC32C3X8_ZERO \
    CRC32C3X8_ZERO \
    CRC32C3X8_ZERO \
    CRC32C3X8_ZERO \
    CRC32C3X8_ZERO \
    } while(0)
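/*
 * Software prefetch helpers: PREF1KL1()/PREF1KL2() issue sixteen PRFM
 * instructions covering 1 KB at the given offset from `buffer`, targeting
 * the L1 or L2 cache respectively, so the next block is already resident
 * when the crc32cx chains reach it.
 */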
#define PREF4X64L1(PREF_OFFSET, ITR) \
    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(PREF_OFFSET) \
    PREF4X64L1((PREF_OFFSET), 0) \
    PREF4X64L1((PREF_OFFSET), 4) \
    PREF4X64L1((PREF_OFFSET), 8) \
    PREF4X64L1((PREF_OFFSET), 12)
#define PREF4X64L2(PREF_OFFSET, ITR) \
    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(PREF_OFFSET) \
    PREF4X64L2((PREF_OFFSET), 0) \
    PREF4X64L2((PREF_OFFSET), 4) \
    PREF4X64L2((PREF_OFFSET), 8) \
    PREF4X64L2((PREF_OFFSET), 12)
uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
{
    int64_t length = len;
    uint32_t crc0, crc1, crc2;

    if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
        if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
            /* Calculate reflected crc with PMULL Instruction */
            const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
            uint64_t t0, t1;
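            /*
             * k1 and k2 are folding constants for the CRC-32C polynomial:
             * carry-less multiplying a lane CRC by one of them (PMULL) and
             * reducing the 64-bit product with crc32cx advances that CRC
             * over the bytes of the block it did not see (in this layout,
             * 680 bytes for crc0 and 344 bytes for crc1), so the three lane
             * CRCs can simply be XORed together.
             */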
            /* crc done "by 3" for fixed input block size of 1024 bytes */
            while ((length -= 1024) >= 0) {
                /* Prefetch data for following block to avoid cache miss */
                PREF1KL2(1024*3);

                /* Do first 8 bytes here for better pipelining */
                crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
                crc1 = 0;
                crc2 = 0;
                buffer += sizeof(uint64_t);
                /* Process block inline
                   Process crc0 last to avoid dependency with above */
                CRC32C7X3X8(0);
                CRC32C7X3X8(1);
                CRC32C7X3X8(2);
                CRC32C7X3X8(3);
                CRC32C7X3X8(4);
                CRC32C7X3X8(5);

                buffer += 42*3*sizeof(uint64_t);

                /* Prefetch data for following block to avoid cache miss */
                PREF1KL1(1024);
                /* Merge crc0 and crc1 into crc2
                   crc1 multiply by K2
                   crc0 multiply by K1 */
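                /*
                 * vmull_p64 gives the 64-bit carry-less product; feeding it
                 * to __crc32cd with a zero accumulator in effect reduces the
                 * product modulo the CRC-32C polynomial (the implicit 32-bit
                 * shift of crc32cx is accounted for in the constants).
                 */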
                t1 = (uint64_t)vmull_p64(crc1, k2);
                t0 = (uint64_t)vmull_p64(crc0, k1);
                crc = __crc32cd(crc2, *(const uint64_t *)buffer);
                crc1 = __crc32cd(0, t1);
                crc ^= crc1;
                crc0 = __crc32cd(0, t0);
                crc ^= crc0;

                buffer += sizeof(uint64_t);
            }
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
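            /*
             * Without the ACLE intrinsics, materialise the two fold
             * constants (k1/k2 above) into v1 and v0 once, up front; the
             * pmull in the merge step below multiplies against them.
             */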
            __asm__("mov    x16, #0xf38a         \n\t"
                    "movk   x16, #0xe417, lsl 16 \n\t"
                    "mov    v1.2d[0], x16        \n\t"
                    "mov    x16, #0x8014         \n\t"
                    "movk   x16, #0x8f15, lsl 16 \n\t"
                    "mov    v0.2d[0], x16        \n\t"
                    :::"x16");
            while ((length -= 1024) >= 0) {
                /* Prefetch data for following block to avoid cache miss */
                PREF1KL2(1024*3);

                __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
                        :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
                crc1 = 0;
                crc2 = 0;
                buffer += sizeof(uint64_t);

                CRC32C7X3X8(0);
                CRC32C7X3X8(1);
                CRC32C7X3X8(2);
                CRC32C7X3X8(3);
                CRC32C7X3X8(4);
                CRC32C7X3X8(5);

                buffer += 42*3*sizeof(uint64_t);

                /* Prefetch data for following block to avoid cache miss */
                PREF1KL1(1024);
                __asm__("mov     v2.2d[0], %x[c1]        \n\t"
                        "pmull   v2.1q,    v2.1d,  v0.1d \n\t"
                        "mov     v3.2d[0], %x[c0]        \n\t"
                        "pmull   v3.1q,    v3.1d,  v1.1d \n\t"
                        "crc32cx %w[c],    %w[c2], %x[v] \n\t"
                        "mov     %x[c1],   v2.2d[0]      \n\t"
                        "crc32cx %w[c1],   wzr,   %x[c1] \n\t"
                        "eor     %w[c],    %w[c], %w[c1] \n\t"
                        "mov     %x[c0],   v3.2d[0]      \n\t"
                        "crc32cx %w[c0],   wzr,   %x[c0] \n\t"
                        "eor     %w[c],    %w[c], %w[c0] \n\t"
                        :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
                        :[v]"r"(*((const uint64_t *)buffer))
                        :"v0","v1","v2","v3");
                buffer += sizeof(uint64_t);
            }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
            if (!(length += 1024))
                return crc;
        }
#endif /* HAVE_ARMV8_CRYPTO */
        while ((length -= sizeof(uint64_t)) >= 0) {
            CRC32CX(crc, *(uint64_t *)buffer);
            buffer += sizeof(uint64_t);
        }

        /* The following is more efficient than the straight loop */
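        /*
         * (After the loop above, length is negative, but its low three bits
         * still equal len % 8, so the bit tests below pick out the 0-7
         * remaining bytes without a byte-by-byte loop.)
         */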
        if (length & sizeof(uint32_t)) {
            CRC32CW(crc, *(uint32_t *)buffer);
            buffer += sizeof(uint32_t);
        }
        if (length & sizeof(uint16_t)) {
            CRC32CH(crc, *(uint16_t *)buffer);
            buffer += sizeof(uint16_t);
        }
        if (length & sizeof(uint8_t))
            CRC32CB(crc, *buffer);
    } else {
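        /*
         * buffer == NULL: compute the CRC of `len` zero bytes without
         * touching memory (callers can use this to checksum runs of zeros
         * without materialising them).
         */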
#ifdef HAVE_ARMV8_CRYPTO
        if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
            const poly64_t k1 = 0xe417f38a;
            uint64_t t0;
            while ((length -= 1024) >= 0) {
                crc0 = __crc32cd(crc, 0);

                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;

                /* Merge crc0 into crc: crc0 multiply by K1 */
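                /*
                 * crc0 now covers the first 344 zero bytes of the block
                 * (8 + 42*8); folding it with k1 advances it over the
                 * remaining 680 zero bytes, completing the 1024-byte step.
                 */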
                t0 = (uint64_t)vmull_p64(crc0, k1);
                crc = __crc32cd(0, t0);
            }
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
            __asm__("mov    x16, #0xf38a         \n\t"
                    "movk   x16, #0xe417, lsl 16 \n\t"
                    "mov    v1.2d[0], x16        \n\t"
                    :::"x16");
            while ((length -= 1024) >= 0) {
                __asm__("crc32cx %w[c0], %w[c], xzr\n\t"
                        :[c0]"=r"(crc0):[c]"r"(crc));

                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                CRC32C7X3X8_ZERO;
                __asm__("mov     v3.2d[0], %x[c0]       \n\t"
                        "pmull   v3.1q,    v3.1d, v1.1d \n\t"
                        "mov     %x[c0],   v3.2d[0]     \n\t"
                        "crc32cx %w[c],    wzr,  %x[c0] \n\t"
                        :[c]"=r"(crc), [c0]"+r"(crc0)
                        ::"v3");
            }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

            if (!(length += 1024))
                return crc;
        }
#endif /* HAVE_ARMV8_CRYPTO */
        while ((length -= sizeof(uint64_t)) >= 0)
            CRC32CX(crc, 0);

        /* The following is more efficient than the straight loop */
        if (length & sizeof(uint32_t))
            CRC32CW(crc, 0);

        if (length & sizeof(uint16_t))
            CRC32CH(crc, 0);

        if (length & sizeof(uint8_t))
            CRC32CB(crc, 0);
    }
    return crc;
}
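
/*
 * Usage sketch (illustrative, not part of the original file): on aarch64,
 * Ceph's generic ceph_crc32c() entry point is expected to dispatch to this
 * routine at runtime.  Called directly it looks like:
 *
 *     uint32_t c = ceph_crc32c_aarch64(-1, data, data_len);   // data != NULL
 *     uint32_t z = ceph_crc32c_aarch64(c, NULL, pad_len);     // CRC of pad_len zero bytes
 *
 * The NULL-buffer form relies on the zero-byte path above and never
 * dereferences the pointer.
 */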