]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/vdev_raidz_math_scalar.c
Fixes and enhancements of SIMD raidz parity
[mirror_zfs.git] / module / zfs / vdev_raidz_math_scalar.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
24 */
25
26 #include <sys/vdev_raidz_impl.h>
27
/*
 * Provide native CPU scalar routines.
 * Support 32bit and 64bit CPUs.
 *
 * ELEM_SIZE is the number of bytes handled per scalar operation and iv_t is
 * the unsigned integer type of exactly that width.
 *
 * Preprocessor arithmetic is always evaluated in the widest integer type, so
 * shifting ~0ULL cannot distinguish a 32-bit target from a 64-bit one (the
 * 32-bit case would never be selected). The data model macros are tested
 * instead. Any target that is not positively identified as 32-bit keeps the
 * 64-bit element.
 */
#if defined(_ILP32) || \
	(defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4))
#define	ELEM_SIZE	4
typedef uint32_t iv_t;
#else
#define	ELEM_SIZE	8
typedef uint64_t iv_t;
#endif
39
40 /*
41 * Vector type used in scalar implementation
42 *
43 * The union is expected to be of native CPU register size. Since addition
44 * uses XOR operation, it can be performed an all byte elements at once.
45 * Multiplication requires per byte access.
46 */
47 typedef union {
48 iv_t e;
49 uint8_t b[ELEM_SIZE];
50 } v_t;
51
52 /*
53 * Precomputed lookup tables for multiplication by a constant
54 *
55 * The reconstruction path requires multiplication by constant factors. Rather
56 * than performing a two-step lookup (log & exp tables), a direct lookup can be
57 * used. Multiplication of element 'a' by a constant 'c' is obtained as:
58 *
59 * r = vdev_raidz_mul_lt[c_log][a];
60 *
61 * where c_log = vdev_raidz_log2[c]. Log of coefficient factors is used because
62 * they are faster to obtain while solving the syndrome equations.
63 *
64 * PERFORMANCE NOTE:
65 * Even though the complete lookup table uses 64kiB, only relatively small
66 * portion of it is used at the same time. Following shows number of accessed
67 * bytes for different cases:
68 * - 1 failed disk: 256B (1 mul. coefficient)
69 * - 2 failed disks: 512B (2 mul. coefficients)
70 * - 3 failed disks: 1536B (6 mul. coefficients)
71 *
72 * Compared to the traditional log/exp method, the accessed lookup table
73 * region is larger only for reconstruction of 3 failed disks. But since the
74 * result is obtained in a single lookup step, performance is
75 * doubled.
76 */
static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));

/*
 * Populate the multiplication lookup table: for every pair of byte values
 * (c, i), vdev_raidz_mul_lt[c][i] is set to gf_mul(c, i).
 *
 * NOTE(review): the rows are filled by the value of c itself, while the
 * comment preceding the table describes the row index as the log of the
 * coefficient. Confirm which convention gf_mul() and the MUL() callers use.
 */
static void
raidz_init_scalar(void)
{
	int coef, elem;

	for (coef = 0; coef < 256; coef++) {
		for (elem = 0; elem < 256; elem++) {
			vdev_raidz_mul_lt[coef][elem] = gf_mul(coef, elem);
		}
	}
}
88
/* Both prefetch hints expand to an empty block in the scalar implementation. */
#define	PREFETCHNTA(ptr, offset)	{}
#define	PREFETCH(ptr, offset)		{}

/*
 * Whole-element helpers.
 *
 * XOR_ACC, LOAD and STORE take a pointer ('src' / 'dst') that is
 * reinterpreted as a pointer to v_t; the other argument is a v_t lvalue.
 * XOR and COPY operate on two v_t lvalues.
 *
 * Every argument is parenthesized so that expression arguments behave as
 * written: with 'buf + off' the cast must apply to the whole sum, not only
 * to 'buf', and with '*vp' the member access must apply to the dereferenced
 * object.
 */
#define	XOR_ACC(src, acc)	((acc).e ^= ((v_t *)(src))[0].e)
#define	XOR(src, acc)		((acc).e ^= (src).e)
#define	COPY(src, dst)		((dst) = (src))
#define	LOAD(src, val)		((val) = ((v_t *)(src))[0])
#define	STORE(dst, val)		(((v_t *)(dst))[0] = (val))
97
98 /*
99 * Constants used for optimized multiplication by 2.
100 */
101 static const struct {
102 iv_t mod;
103 iv_t mask;
104 iv_t msb;
105 } scalar_mul2_consts = {
106 #if ELEM_SIZE == 8
107 .mod = 0x1d1d1d1d1d1d1d1dULL,
108 .mask = 0xfefefefefefefefeULL,
109 .msb = 0x8080808080808080ULL,
110 #else
111 .mod = 0x1d1d1d1dULL,
112 .mask = 0xfefefefeULL,
113 .msb = 0x80808080ULL,
114 #endif
115 };
116
/* Expands to an empty block in the scalar implementation. */
#define	MUL2_SETUP()	{}

/*
 * Multiply every byte of (a) by 2, in place.
 *
 * _mask first collects the top bit of each byte and is then widened to 0xff
 * in exactly those bytes: (m << 1) - (m >> 7) turns each 0x80 into
 * 0x100 - 0x01, relying on unsigned wrap-around for the topmost byte.
 * The element is shifted left, the bits that crossed into the neighbouring
 * byte are masked off, and the bytes whose top bit was set are XORed with
 * the 0x1d pattern.
 */
#define	MUL2(a)								\
{									\
	iv_t _mask = (a).e & scalar_mul2_consts.msb;			\
									\
	_mask = (_mask << 1) - (_mask >> 7);				\
	(a).e <<= 1;							\
	(a).e &= scalar_mul2_consts.mask;				\
	(a).e ^= _mask & scalar_mul2_consts.mod;			\
}

/* Multiply every byte of (a) by 4, in place, as two multiplications by 2. */
#define	MUL4(a)								\
{									\
	MUL2(a);							\
	MUL2(a);							\
}
134
/*
 * Replace every byte of (a) with its entry in row (c) of vdev_raidz_mul_lt,
 * in place.
 *
 * ELEM_SIZE is a compile-time constant, so the switch only selects how many
 * bytes are translated: an 8-byte element handles bytes 7..4 and then falls
 * through to the 4-byte case for bytes 3..0.
 *
 * Both arguments are parenthesized (as MUL2 already does for its operand) so
 * that expression arguments such as '*vp' expand correctly.
 */
#define	MUL(c, a)							\
{									\
	const uint8_t *mul_lt = vdev_raidz_mul_lt[(c)];			\
	switch (ELEM_SIZE) {						\
	case 8:								\
		(a).b[7] = mul_lt[(a).b[7]];				\
		(a).b[6] = mul_lt[(a).b[6]];				\
		(a).b[5] = mul_lt[(a).b[5]];				\
		(a).b[4] = mul_lt[(a).b[4]];				\
		/* FALLTHROUGH */					\
	case 4:								\
		(a).b[3] = mul_lt[(a).b[3]];				\
		(a).b[2] = mul_lt[(a).b[2]];				\
		(a).b[1] = mul_lt[(a).b[1]];				\
		(a).b[0] = mul_lt[(a).b[0]];				\
		break;							\
	}								\
}
152
/* Both hooks expand to an empty block in the scalar implementation. */
#define	raidz_math_begin()	{}
#define	raidz_math_end()	{}

/*
 * Working-variable layout for each parity generation (GEN_*) and
 * reconstruction (REC_*) method.
 *
 * For every method <M>, <M>_DEFINE() declares the v_t locals, <M>_STRIDE is
 * 1, and the remaining <M>_* names expand to those locals.
 *
 * NOTE(review): presumably these are consumed by the generic routines in
 * vdev_raidz_math_impl.h, with the stride counted in v_t elements; confirm
 * there.
 */

/* Generate P */
#define	GEN_P_DEFINE()		v_t p0
#define	GEN_P_STRIDE		1
#define	GEN_P_P			p0

/* Generate P and Q */
#define	GEN_PQ_DEFINE()		v_t d0, p0, q0
#define	GEN_PQ_STRIDE		1
#define	GEN_PQ_D		d0
#define	GEN_PQ_P		p0
#define	GEN_PQ_Q		q0

/* Generate P, Q and R */
#define	GEN_PQR_DEFINE()	v_t d0, p0, q0, r0
#define	GEN_PQR_STRIDE		1
#define	GEN_PQR_D		d0
#define	GEN_PQR_P		p0
#define	GEN_PQR_Q		q0
#define	GEN_PQR_R		r0

/* Reconstruct using P */
#define	REC_P_DEFINE()		v_t x0
#define	REC_P_STRIDE		1
#define	REC_P_X			x0

/* Reconstruct using Q */
#define	REC_Q_DEFINE()		v_t x0
#define	REC_Q_STRIDE		1
#define	REC_Q_X			x0

/* Reconstruct using R */
#define	REC_R_DEFINE()		v_t x0
#define	REC_R_STRIDE		1
#define	REC_R_X			x0

/* Reconstruct using P and Q */
#define	REC_PQ_DEFINE()		v_t x0, y0, d0
#define	REC_PQ_STRIDE		1
#define	REC_PQ_X		x0
#define	REC_PQ_Y		y0
#define	REC_PQ_D		d0

/* Reconstruct using P and R */
#define	REC_PR_DEFINE()		v_t x0, y0, d0
#define	REC_PR_STRIDE		1
#define	REC_PR_X		x0
#define	REC_PR_Y		y0
#define	REC_PR_D		d0

/* Reconstruct using Q and R */
#define	REC_QR_DEFINE()		v_t x0, y0, d0
#define	REC_QR_STRIDE		1
#define	REC_QR_X		x0
#define	REC_QR_Y		y0
#define	REC_QR_D		d0

/*
 * Reconstruct using P, Q and R.
 * REC_PQR_XS names the same local (d0) as REC_PQR_D.
 */
#define	REC_PQR_DEFINE()	v_t x0, y0, z0, d0, t0
#define	REC_PQR_STRIDE		1
#define	REC_PQR_X		x0
#define	REC_PQR_Y		y0
#define	REC_PQR_Z		z0
#define	REC_PQR_D		d0
#define	REC_PQR_XS		d0
#define	REC_PQR_YS		t0
211
212 #include "vdev_raidz_math_impl.h"
213
/*
 * At -O0 gcc does no stack frame coalescing, so debug builds trip
 * -Wframe-larger-than=1024. gcc 4.8 and later provide the -Og optimization
 * level for debugging, which does not trigger the warning.
 *
 * The warning is switched off here, without a matching push/pop, so it stays
 * disabled for the remainder of this file.
 */
#pragma GCC diagnostic ignored "-Wframe-larger-than="

/* Instantiate the generation and reconstruction methods for "scalar". */
DEFINE_GEN_METHODS(scalar);
DEFINE_REC_METHODS(scalar);
224
225 boolean_t
226 raidz_will_scalar_work(void)
227 {
228 return (B_TRUE); /* always */
229 }
230
231 const raidz_impl_ops_t vdev_raidz_scalar_impl = {
232 .init = raidz_init_scalar,
233 .fini = NULL,
234 .gen = RAIDZ_GEN_METHODS(scalar),
235 .rec = RAIDZ_REC_METHODS(scalar),
236 .is_supported = &raidz_will_scalar_work,
237 .name = "scalar"
238 };
239
/*
 * Powers of 2 in the RAID-Z Galois field: entry i holds 2^i.
 *
 * Each entry is the previous one doubled, with 0x1d XORed in whenever the
 * doubling overflows a byte. The final entry (index 0xff) wraps back to 0x01.
 * The trailing comment on each row gives the index of its first entry.
 */
const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,	/* 0x00 */
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,	/* 0x08 */
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,	/* 0x10 */
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,	/* 0x18 */
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,	/* 0x20 */
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,	/* 0x28 */
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,	/* 0x30 */
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,	/* 0x38 */
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,	/* 0x40 */
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,	/* 0x48 */
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,	/* 0x50 */
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,	/* 0x58 */
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,	/* 0x60 */
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,	/* 0x68 */
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,	/* 0x70 */
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,	/* 0x78 */
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,	/* 0x80 */
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,	/* 0x88 */
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,	/* 0x90 */
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,	/* 0x98 */
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,	/* 0xa0 */
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,	/* 0xa8 */
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,	/* 0xb0 */
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,	/* 0xb8 */
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,	/* 0xc0 */
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,	/* 0xc8 */
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,	/* 0xd0 */
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,	/* 0xd8 */
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,	/* 0xe0 */
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,	/* 0xe8 */
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,	/* 0xf0 */
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01	/* 0xf8 */
};
275
/*
 * Logs of 2 in the RAID-Z Galois field: for a nonzero value v, entry v is
 * the exponent i for which vdev_raidz_pow2[i] == v (0x01 maps to 0).
 *
 * Entry 0 is stored as 0x00 although zero has no logarithm.
 * The trailing comment on each row gives the index of its first entry.
 */
const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,	/* 0x00 */
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,	/* 0x08 */
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,	/* 0x10 */
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,	/* 0x18 */
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,	/* 0x20 */
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,	/* 0x28 */
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,	/* 0x30 */
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,	/* 0x38 */
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,	/* 0x40 */
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,	/* 0x48 */
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,	/* 0x50 */
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,	/* 0x58 */
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,	/* 0x60 */
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,	/* 0x68 */
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,	/* 0x70 */
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,	/* 0x78 */
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,	/* 0x80 */
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,	/* 0x88 */
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,	/* 0x90 */
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,	/* 0x98 */
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,	/* 0xa0 */
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,	/* 0xa8 */
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,	/* 0xb0 */
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,	/* 0xb8 */
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,	/* 0xc0 */
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,	/* 0xc8 */
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,	/* 0xd0 */
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,	/* 0xd8 */
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,	/* 0xe0 */
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,	/* 0xe8 */
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,	/* 0xf0 */
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,	/* 0xf8 */
};