]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/vdev_raidz_math_scalar.c
Add parity generation/rebuild using 128-bits NEON for Aarch64
[mirror_zfs.git] / module / zfs / vdev_raidz_math_scalar.c
CommitLineData
ab9f4b0b
GN
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
24 */
25
26#include <sys/vdev_raidz_impl.h>
27
/*
 * Provide native CPU scalar routines.
 * Support 32bit and 64bit CPUs.
 *
 * ELEM_SIZE is the size in bytes of one scalar element and iv_t is the
 * unsigned integer type of exactly that size.
 *
 * NOTE(review): #if expressions are evaluated as if every integer operand
 * had type intmax_t/uintmax_t, so with a 64-bit uintmax_t the first test
 * cannot match and the 8-byte branch is the one selected, even when the
 * target is a 32-bit CPU. Confirm whether that is intended.
 */
#if ((~(0x0ULL)) >> 24) == 0xffULL
#define	ELEM_SIZE	4
typedef uint32_t iv_t;
#elif ((~(0x0ULL)) >> 56) == 0xffULL
#define	ELEM_SIZE	8
typedef uint64_t iv_t;
#else
/* Fail here rather than with unrelated errors about ELEM_SIZE/iv_t later. */
#error "scalar RAID-Z math: unable to select a 4 or 8 byte element type"
#endif

/*
 * Vector type used in scalar implementation
 *
 * The union is expected to be of native CPU register size. Since addition
 * uses XOR operation, it can be performed on all byte elements at once.
 * Multiplication requires per byte access.
 */
typedef union {
	iv_t e;			/* whole element, used for XOR and shifts */
	uint8_t b[ELEM_SIZE];	/* the same storage viewed byte by byte */
} v_t;
51
52/*
53 * Precomputed lookup tables for multiplication by a constant
54 *
55 * The reconstruction path requires multiplication by constant factors. Instead
56 * of performing a two-step lookup (log & exp tables), a direct lookup can be
57 * used. Multiplication of element 'a' by a constant 'c' is obtained as:
58 *
59 * r = vdev_raidz_mul_lt[c_log][a];
60 *
61 * where c_log = vdev_raidz_log2[c]. Log of coefficient factors is used because
62 * they are faster to obtain while solving the syndrome equations.
63 *
64 * PERFORMANCE NOTE:
65 * Even though the complete lookup table uses 64KiB, only a relatively small
66 * portion of it is used at the same time. The following shows the number of
67 * bytes accessed in different cases:
68 * - 1 failed disk: 256B (1 mul. coefficient)
69 * - 2 failed disks: 512B (2 mul. coefficients)
70 * - 3 failed disks: 1536B (6 mul. coefficients)
71 *
72 * Size of actually accessed lookup table regions is only larger for
73 * reconstruction of 3 failed disks, when compared to traditional log/exp
74 * method. But since the result is obtained in one lookup step performance is
75 * doubled.
76 */
/*
 * Direct multiplication lookup table: 256 rows of 256 bytes, with the table
 * aligned to the size of one row.
 */
static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));

/*
 * Fill vdev_raidz_mul_lt so that row 'c' holds gf_mul(c, i) for every byte
 * value 'i'.
 *
 * NOTE(review): the table is filled with its first index passed to gf_mul()
 * directly, while the comment above the table describes lookups as
 * vdev_raidz_mul_lt[c_log][a]. Confirm which form the callers of MUL() use.
 */
static void
raidz_init_scalar(void)
{
	int coeff;

	for (coeff = 0; coeff < 256; coeff++) {
		uint8_t *row = vdev_raidz_mul_lt[coeff];
		int val;

		for (val = 0; val < 256; val++)
			row[val] = gf_mul(coeff, val);
	}
}
88
/* The scalar implementation issues no prefetches; both expand to nothing. */
#define	PREFETCHNTA(ptr, offset)	{}
#define	PREFETCH(ptr, offset)		{}

/*
 * Element-wide primitives on v_t values.
 *
 * 'src' and 'dst' are addresses that are reinterpreted as v_t pointers;
 * 'acc', 'val' and the operands of XOR()/COPY() are v_t lvalues. Every
 * parameter is parenthesized so that compound arguments such as 'buf + off'
 * or '*ptr' expand as a single operand, matching MUL2().
 */
#define	XOR_ACC(src, acc)	(acc).e ^= ((v_t *)(src))[0].e
#define	XOR(src, acc)		(acc).e ^= (src).e
#define	ZERO(acc)		(acc).e = 0
#define	COPY(src, dst)		(dst) = (src)
#define	LOAD(src, val)		(val) = ((v_t *)(src))[0]
#define	STORE(dst, val)		((v_t *)(dst))[0] = (val)
98
99/*
100 * Constants used for optimized multiplication by 2.
101 */
102static const struct {
103 iv_t mod;
104 iv_t mask;
105 iv_t msb;
106} scalar_mul2_consts = {
107#if ELEM_SIZE == 8
108 .mod = 0x1d1d1d1d1d1d1d1dULL,
109 .mask = 0xfefefefefefefefeULL,
110 .msb = 0x8080808080808080ULL,
111#else
112 .mod = 0x1d1d1d1dULL,
113 .mask = 0xfefefefeULL,
114 .msb = 0x80808080ULL,
115#endif
116};
117
/* MUL2() needs no setup in the scalar implementation. */
#define	MUL2_SETUP()	{}

/*
 * Multiply every byte of 'a' by 2, all bytes at once. Each byte is shifted
 * left by one; bytes whose top bit was set before the shift additionally get
 * scalar_mul2_consts.mod XORed in.
 */
#define	MUL2(a)								\
{									\
	/* top bit of every byte, taken before the shift */		\
	const iv_t _msb = (a).e & scalar_mul2_consts.msb;		\
	/* spread each set top bit over its byte, keep 'mod' there */	\
	const iv_t _fix = ((_msb << 1) - (_msb >> 7)) &			\
	    scalar_mul2_consts.mod;					\
	/* shift, drop the bit carried in from the byte below, fix up */ \
	(a).e = (((a).e << 1) & scalar_mul2_consts.mask) ^ _fix;	\
}

/* Multiply every byte of 'a' by 4: two successive doublings. */
#define	MUL4(a)								\
{									\
	MUL2(a);							\
	MUL2(a);							\
}
135
/*
 * Multiply every byte of 'a' in place using row 'c' of vdev_raidz_mul_lt:
 * each byte is replaced by the table entry it indexes.
 *
 * 'a' is parenthesized so that any v_t lvalue expression (e.g. '*ptr') can
 * be passed, matching MUL2(). ELEM_SIZE is a compile-time constant, so only
 * one entry point of the switch is ever taken.
 */
#define	MUL(c, a)							\
{									\
	const uint8_t *mul_lt = vdev_raidz_mul_lt[(c)];			\
	switch (ELEM_SIZE) {						\
	case 8:								\
		(a).b[7] = mul_lt[(a).b[7]];				\
		(a).b[6] = mul_lt[(a).b[6]];				\
		(a).b[5] = mul_lt[(a).b[5]];				\
		(a).b[4] = mul_lt[(a).b[4]];				\
		/* FALLTHROUGH: the low four bytes are handled below */	\
	case 4:								\
		(a).b[3] = mul_lt[(a).b[3]];				\
		(a).b[2] = mul_lt[(a).b[2]];				\
		(a).b[1] = mul_lt[(a).b[1]];				\
		(a).b[0] = mul_lt[(a).b[0]];				\
		break;							\
	}								\
}
153
/* Both expand to an empty block in the scalar implementation. */
154#define raidz_math_begin() {}
155#define raidz_math_end() {}
156
/*
 * Variable bindings for the parity generation methods. Each *_DEFINE()
 * declares the v_t working variables and the remaining macros bind role
 * names to them. Every *_STRIDE is 1 - presumably the number of v_t
 * elements handled per step by vdev_raidz_math_impl.h; confirm there.
 */
157#define GEN_P_DEFINE() v_t p0
158#define GEN_P_STRIDE 1
159#define GEN_P_P p0
160
161#define GEN_PQ_DEFINE() v_t d0, p0, q0
162#define GEN_PQ_STRIDE 1
163#define GEN_PQ_D d0
164#define GEN_PQ_P p0
165#define GEN_PQ_Q q0
166
167#define GEN_PQR_DEFINE() v_t d0, p0, q0, r0
168#define GEN_PQR_STRIDE 1
169#define GEN_PQR_D d0
170#define GEN_PQR_P p0
171#define GEN_PQR_Q q0
172#define GEN_PQR_R r0
173
/*
 * Variable bindings for the reconstruction methods, laid out the same way:
 * one v_t declaration list, a stride of 1 and role names bound to the
 * declared variables.
 */
174#define REC_P_DEFINE() v_t x0
175#define REC_P_STRIDE 1
176#define REC_P_X x0
177
178#define REC_Q_DEFINE() v_t x0
179#define REC_Q_STRIDE 1
180#define REC_Q_X x0
181
182#define REC_R_DEFINE() v_t x0
183#define REC_R_STRIDE 1
184#define REC_R_X x0
185
186#define REC_PQ_DEFINE() v_t x0, y0, d0
187#define REC_PQ_STRIDE 1
188#define REC_PQ_X x0
189#define REC_PQ_Y y0
190#define REC_PQ_D d0
191
192#define REC_PR_DEFINE() v_t x0, y0, d0
193#define REC_PR_STRIDE 1
194#define REC_PR_X x0
195#define REC_PR_Y y0
196#define REC_PR_D d0
197
198#define REC_QR_DEFINE() v_t x0, y0, d0
199#define REC_QR_STRIDE 1
200#define REC_QR_X x0
201#define REC_QR_Y y0
202#define REC_QR_D d0
203
/* Note: REC_PQR_D and REC_PQR_XS are both bound to d0. */
204#define REC_PQR_DEFINE() v_t x0, y0, z0, d0, t0
205#define REC_PQR_STRIDE 1
206#define REC_PQR_X x0
207#define REC_PQR_Y y0
208#define REC_PQR_Z z0
209#define REC_PQR_D d0
210#define REC_PQR_XS d0
211#define REC_PQR_YS t0
212
213#include "vdev_raidz_math_impl.h"
214
590c9a09
GN
215/*
216 * If compiled with -O0, gcc doesn't do any stack frame coalescing
217 * and -Wframe-larger-than=1024 is triggered in debug mode.
218 * Starting with gcc 4.8, new opt level -Og is introduced for debugging, which
219 * does not trigger this warning.
220 */
221#pragma GCC diagnostic ignored "-Wframe-larger-than="
222
ab9f4b0b
GN
223DEFINE_GEN_METHODS(scalar);
224DEFINE_REC_METHODS(scalar);
225
c9187d86 226boolean_t
ab9f4b0b
GN
227raidz_will_scalar_work(void)
228{
229 return (B_TRUE); /* always */
230}
231
232const raidz_impl_ops_t vdev_raidz_scalar_impl = {
233 .init = raidz_init_scalar,
234 .fini = NULL,
235 .gen = RAIDZ_GEN_METHODS(scalar),
236 .rec = RAIDZ_REC_METHODS(scalar),
237 .is_supported = &raidz_will_scalar_work,
238 .name = "scalar"
239};
240
241/* Powers of 2 in the RAID-Z Galois field. */
242const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
243 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
244 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
245 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
246 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
247 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
248 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
249 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
250 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
251 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
252 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
253 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
254 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
255 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
256 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
257 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
258 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
259 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
260 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
261 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
262 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
263 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
264 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
265 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
266 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
267 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
268 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
269 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
270 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
271 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
272 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
273 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
274 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
275};
276
277/* Base-2 logarithms in the RAID-Z Galois field (inverse of vdev_raidz_pow2). */
278const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
279 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
280 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
281 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
282 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
283 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
284 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
285 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
286 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
287 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
288 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
289 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
290 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
291 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
292 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
293 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
294 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
295 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
296 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
297 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
298 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
299 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
300 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
301 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
302 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
303 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
304 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
305 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
306 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
307 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
308 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
309 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
310 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
311};