]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Ceph - scalable distributed file system | |
3 | * | |
4 | * Copyright (C) 2014 CERN (Switzerland) | |
5 | * | |
6 | * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch> | |
7 | * | |
8 | * This library is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | */ | |
14 | ||
15 | // ----------------------------------------------------------------------------- | |
16 | #include <algorithm> | |
7c673cae FG |
17 | #include <errno.h> |
18 | // ----------------------------------------------------------------------------- | |
19 | #include "common/debug.h" | |
20 | #include "ErasureCodeIsa.h" | |
21 | #include "xor_op.h" | |
22 | #include "crush/CrushWrapper.h" | |
23 | #include "osd/osd_types.h" | |
31f18b77 FG |
24 | using namespace std; |
25 | ||
7c673cae FG |
26 | // ----------------------------------------------------------------------------- |
27 | extern "C" { | |
28 | #include "isa-l/include/erasure_code.h" | |
29 | } | |
30 | // ----------------------------------------------------------------------------- | |
31 | #define dout_context g_ceph_context | |
32 | #define dout_subsys ceph_subsys_osd | |
33 | #undef dout_prefix | |
34 | #define dout_prefix _prefix(_dout) | |
35 | // ----------------------------------------------------------------------------- | |
36 | ||
37 | // ----------------------------------------------------------------------------- | |
38 | ||
39 | static ostream& | |
40 | _prefix(std::ostream* _dout) | |
41 | { | |
42 | return *_dout << "ErasureCodeIsa: "; | |
43 | } | |
44 | // ----------------------------------------------------------------------------- | |
45 | ||
46 | const std::string ErasureCodeIsaDefault::DEFAULT_K("7"); | |
47 | const std::string ErasureCodeIsaDefault::DEFAULT_M("3"); | |
48 | ||
49 | int | |
50 | ErasureCodeIsa::create_ruleset(const string &name, | |
51 | CrushWrapper &crush, | |
52 | ostream *ss) const | |
53 | { | |
31f18b77 FG |
54 | int ruleid = crush.add_simple_rule( |
55 | name, | |
56 | ruleset_root, | |
57 | ruleset_failure_domain, | |
58 | "indep", | |
59 | pg_pool_t::TYPE_ERASURE, | |
60 | ss); | |
7c673cae FG |
61 | |
62 | if (ruleid < 0) | |
63 | return ruleid; | |
64 | else { | |
65 | crush.set_rule_mask_max_size(ruleid, get_chunk_count()); | |
66 | return crush.get_rule_mask_ruleset(ruleid); | |
67 | } | |
68 | } | |
69 | ||
70 | // ----------------------------------------------------------------------------- | |
71 | ||
72 | int | |
73 | ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss) | |
74 | { | |
75 | int err = 0; | |
76 | err |= to_string("ruleset-root", profile, | |
77 | &ruleset_root, | |
78 | DEFAULT_RULESET_ROOT, ss); | |
79 | err |= to_string("ruleset-failure-domain", profile, | |
80 | &ruleset_failure_domain, | |
81 | DEFAULT_RULESET_FAILURE_DOMAIN, ss); | |
82 | err |= parse(profile, ss); | |
83 | if (err) | |
84 | return err; | |
85 | prepare(); | |
86 | ErasureCode::init(profile, ss); | |
87 | return err; | |
88 | } | |
89 | ||
90 | // ----------------------------------------------------------------------------- | |
91 | ||
92 | unsigned int | |
93 | ErasureCodeIsa::get_chunk_size(unsigned int object_size) const | |
94 | { | |
95 | unsigned alignment = get_alignment(); | |
96 | unsigned chunk_size = ( object_size + k - 1 ) / k; | |
97 | dout(20) << "get_chunk_size: chunk_size " << chunk_size | |
98 | << " must be modulo " << alignment << dendl; | |
99 | unsigned modulo = chunk_size % alignment; | |
100 | if (modulo) { | |
101 | dout(10) << "get_chunk_size: " << chunk_size | |
102 | << " padded to " << chunk_size + alignment - modulo << dendl; | |
103 | chunk_size += alignment - modulo; | |
104 | } | |
105 | return chunk_size; | |
106 | } | |
107 | ||
108 | // ----------------------------------------------------------------------------- | |
109 | ||
110 | int ErasureCodeIsa::encode_chunks(const set<int> &want_to_encode, | |
111 | map<int, bufferlist> *encoded) | |
112 | { | |
113 | char *chunks[k + m]; | |
114 | for (int i = 0; i < k + m; i++) | |
115 | chunks[i] = (*encoded)[i].c_str(); | |
116 | isa_encode(&chunks[0], &chunks[k], (*encoded)[0].length()); | |
117 | return 0; | |
118 | } | |
119 | ||
120 | int ErasureCodeIsa::decode_chunks(const set<int> &want_to_read, | |
121 | const map<int, bufferlist> &chunks, | |
122 | map<int, bufferlist> *decoded) | |
123 | { | |
124 | unsigned blocksize = (*chunks.begin()).second.length(); | |
125 | int erasures[k + m + 1]; | |
126 | int erasures_count = 0; | |
127 | char *data[k]; | |
128 | char *coding[m]; | |
129 | for (int i = 0; i < k + m; i++) { | |
130 | if (chunks.find(i) == chunks.end()) { | |
131 | erasures[erasures_count] = i; | |
132 | erasures_count++; | |
133 | } | |
134 | if (i < k) | |
135 | data[i] = (*decoded)[i].c_str(); | |
136 | else | |
137 | coding[i - k] = (*decoded)[i].c_str(); | |
138 | } | |
139 | erasures[erasures_count] = -1; | |
140 | assert(erasures_count > 0); | |
141 | return isa_decode(erasures, data, coding, blocksize); | |
142 | } | |
143 | ||
144 | // ----------------------------------------------------------------------------- | |
145 | ||
146 | void | |
147 | ErasureCodeIsaDefault::isa_encode(char **data, | |
148 | char **coding, | |
149 | int blocksize) | |
150 | { | |
151 | ||
152 | if (m == 1) | |
153 | // single parity stripe | |
154 | region_xor((unsigned char**) data, (unsigned char*) coding[0], k, blocksize); | |
155 | else | |
156 | ec_encode_data(blocksize, k, m, encode_tbls, | |
157 | (unsigned char**) data, (unsigned char**) coding); | |
158 | } | |
159 | ||
160 | // ----------------------------------------------------------------------------- | |
161 | ||
162 | bool | |
163 | ErasureCodeIsaDefault::erasure_contains(int *erasures, int i) | |
164 | { | |
165 | for (int l = 0; erasures[l] != -1; l++) { | |
166 | if (erasures[l] == i) | |
167 | return true; | |
168 | } | |
169 | return false; | |
170 | } | |
171 | ||
172 | // ----------------------------------------------------------------------------- | |
173 | ||
174 | ||
175 | ||
176 | // ----------------------------------------------------------------------------- | |
177 | ||
178 | int | |
179 | ErasureCodeIsaDefault::isa_decode(int *erasures, | |
180 | char **data, | |
181 | char **coding, | |
182 | int blocksize) | |
183 | { | |
184 | int nerrs = 0; | |
185 | int i, r, s; | |
186 | ||
187 | // count the errors | |
188 | for (int l = 0; erasures[l] != -1; l++) { | |
189 | nerrs++; | |
190 | } | |
191 | ||
192 | unsigned char *recover_source[k]; | |
193 | unsigned char *recover_target[m]; | |
194 | ||
195 | memset(recover_source, 0, sizeof (recover_source)); | |
196 | memset(recover_target, 0, sizeof (recover_target)); | |
197 | ||
198 | // --------------------------------------------- | |
199 | // Assign source and target buffers | |
200 | // --------------------------------------------- | |
201 | for (i = 0, s = 0, r = 0; ((r < k) || (s < nerrs)) && (i < (k + m)); i++) { | |
202 | if (!erasure_contains(erasures, i)) { | |
203 | if (r < k) { | |
204 | if (i < k) { | |
205 | recover_source[r] = (unsigned char*) data[i]; | |
206 | } else { | |
207 | recover_source[r] = (unsigned char*) coding[i - k]; | |
208 | } | |
209 | r++; | |
210 | } | |
211 | } else { | |
212 | if (s < m) { | |
213 | if (i < k) { | |
214 | recover_target[s] = (unsigned char*) data[i]; | |
215 | } else { | |
216 | recover_target[s] = (unsigned char*) coding[i - k]; | |
217 | } | |
218 | s++; | |
219 | } | |
220 | } | |
221 | } | |
222 | ||
223 | if (m == 1) { | |
224 | // single parity decoding | |
225 | assert(1 == nerrs); | |
226 | dout(20) << "isa_decode: reconstruct using region xor [" << | |
227 | erasures[0] << "]" << dendl; | |
228 | region_xor(recover_source, recover_target[0], k, blocksize); | |
229 | return 0; | |
230 | } | |
231 | ||
232 | ||
233 | if ((matrixtype == kVandermonde) && | |
234 | (nerrs == 1) && | |
235 | (erasures[0] < (k + 1))) { | |
236 | // use xor decoding if a data chunk is missing or the first coding chunk | |
237 | dout(20) << "isa_decode: reconstruct using region xor [" << | |
238 | erasures[0] << "]" << dendl; | |
239 | assert(1 == s); | |
240 | assert(k == r); | |
241 | region_xor(recover_source, recover_target[0], k, blocksize); | |
242 | return 0; | |
243 | } | |
244 | ||
245 | unsigned char d[k * (m + k)]; | |
246 | unsigned char decode_tbls[k * (m + k)*32]; | |
247 | unsigned char *p_tbls = decode_tbls; | |
248 | ||
249 | int decode_index[k]; | |
250 | ||
251 | if (nerrs > m) | |
252 | return -1; | |
253 | ||
254 | std::string erasure_signature; // describes a matrix configuration for caching | |
255 | ||
256 | // --------------------------------------------- | |
257 | // Construct b by removing error rows | |
258 | // --------------------------------------------- | |
259 | ||
260 | for (i = 0, r = 0; i < k; i++, r++) { | |
261 | char id[128]; | |
262 | while (erasure_contains(erasures, r)) | |
263 | r++; | |
264 | ||
265 | decode_index[i] = r; | |
266 | ||
267 | snprintf(id, sizeof (id), "+%d", r); | |
268 | erasure_signature += id; | |
269 | } | |
270 | ||
271 | for (int p = 0; p < nerrs; p++) { | |
272 | char id[128]; | |
273 | snprintf(id, sizeof (id), "-%d", erasures[p]); | |
274 | erasure_signature += id; | |
275 | } | |
276 | ||
277 | // --------------------------------------------- | |
278 | // Try to get an already computed matrix | |
279 | // --------------------------------------------- | |
280 | if (!tcache.getDecodingTableFromCache(erasure_signature, p_tbls, matrixtype, k, m)) { | |
281 | int j; | |
282 | unsigned char b[k * (m + k)]; | |
283 | unsigned char c[k * (m + k)]; | |
284 | ||
285 | for (i = 0; i < k; i++) { | |
286 | r = decode_index[i]; | |
287 | for (j = 0; j < k; j++) | |
288 | b[k * i + j] = encode_coeff[k * r + j]; | |
289 | } | |
290 | // --------------------------------------------- | |
291 | // Compute inverted matrix | |
292 | // --------------------------------------------- | |
293 | ||
294 | // -------------------------------------------------------- | |
295 | // Remark: this may fail for certain Vandermonde matrices ! | |
296 | // There is an advanced way trying to use different | |
297 | // source chunks to get an invertible matrix, however | |
298 | // there are also (k,m) combinations which cannot be | |
299 | // inverted when m chunks are lost and this optimizations | |
300 | // does not help. Therefor we keep the code simpler. | |
301 | // -------------------------------------------------------- | |
302 | if (gf_invert_matrix(b, d, k) < 0) { | |
303 | dout(0) << "isa_decode: bad matrix" << dendl; | |
304 | return -1; | |
305 | } | |
306 | ||
307 | for (int p = 0; p < nerrs; p++) { | |
308 | if (erasures[p] < k) { | |
309 | // decoding matrix elements for data chunks | |
310 | for (j = 0; j < k; j++) { | |
311 | c[k * p + j] = d[k * erasures[p] + j]; | |
312 | } | |
313 | } else { | |
314 | // decoding matrix element for coding chunks | |
315 | for (i = 0; i < k; i++) { | |
316 | int s = 0; | |
317 | for (j = 0; j < k; j++) | |
318 | s ^= gf_mul(d[j * k + i], | |
319 | encode_coeff[k * erasures[p] + j]); | |
320 | ||
321 | c[k * p + i] = s; | |
322 | } | |
323 | } | |
324 | } | |
325 | ||
326 | // --------------------------------------------- | |
327 | // Initialize Decoding Table | |
328 | // --------------------------------------------- | |
329 | ec_init_tables(k, nerrs, c, decode_tbls); | |
330 | tcache.putDecodingTableToCache(erasure_signature, p_tbls, matrixtype, k, m); | |
331 | } | |
332 | // Recover data sources | |
333 | ec_encode_data(blocksize, | |
334 | k, nerrs, decode_tbls, recover_source, recover_target); | |
335 | ||
336 | ||
337 | return 0; | |
338 | } | |
339 | ||
340 | // ----------------------------------------------------------------------------- | |
341 | ||
342 | unsigned | |
343 | ErasureCodeIsaDefault::get_alignment() const | |
344 | { | |
345 | return EC_ISA_ADDRESS_ALIGNMENT; | |
346 | } | |
347 | ||
348 | // ----------------------------------------------------------------------------- | |
349 | ||
350 | int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile, | |
351 | ostream *ss) | |
352 | { | |
353 | int err = ErasureCode::parse(profile, ss); | |
354 | err |= to_int("k", profile, &k, DEFAULT_K, ss); | |
355 | err |= to_int("m", profile, &m, DEFAULT_M, ss); | |
356 | err |= sanity_check_k(k, ss); | |
357 | ||
358 | if (matrixtype == kVandermonde) { | |
359 | // these are verified safe values evaluated using the | |
360 | // benchmarktool and 10*(combinatoric for maximum loss) random | |
361 | // full erasures | |
362 | if (k > 32) { | |
363 | *ss << "Vandermonde: m=" << m | |
364 | << " should be less/equal than 32 : revert to k=32" << std::endl; | |
365 | k = 32; | |
366 | err = -EINVAL; | |
367 | } | |
368 | ||
369 | if (m > 4) { | |
370 | *ss << "Vandermonde: m=" << m | |
371 | << " should be less than 5 to guarantee an MDS codec:" | |
372 | << " revert to m=4" << std::endl; | |
373 | m = 4; | |
374 | err = -EINVAL; | |
375 | } | |
376 | switch (m) { | |
377 | case 4: | |
378 | if (k > 21) { | |
379 | *ss << "Vandermonde: k=" << k | |
380 | << " should be less than 22 to guarantee an MDS" | |
381 | << " codec with m=4: revert to k=21" << std::endl; | |
382 | k = 21; | |
383 | err = -EINVAL; | |
384 | } | |
385 | break; | |
386 | default: | |
387 | ; | |
388 | } | |
389 | } | |
390 | return err; | |
391 | } | |
392 | ||
393 | // ----------------------------------------------------------------------------- | |
394 | ||
395 | void | |
396 | ErasureCodeIsaDefault::prepare() | |
397 | { | |
398 | // setup shared encoding table and coefficients | |
399 | unsigned char** p_enc_table = | |
400 | tcache.getEncodingTable(matrixtype, k, m); | |
401 | ||
402 | unsigned char** p_enc_coeff = | |
403 | tcache.getEncodingCoefficient(matrixtype, k, m); | |
404 | ||
405 | if (!*p_enc_coeff) { | |
406 | dout(10) << "[ cache tables ] creating coeff for k=" << | |
407 | k << " m=" << m << dendl; | |
408 | // build encoding coefficients which need to be computed once for each (k,m) | |
409 | encode_coeff = (unsigned char*) malloc(k * (m + k)); | |
410 | ||
411 | if (matrixtype == kVandermonde) | |
412 | gf_gen_rs_matrix(encode_coeff, k + m, k); | |
413 | if (matrixtype == kCauchy) | |
414 | gf_gen_cauchy1_matrix(encode_coeff, k + m, k); | |
415 | ||
416 | // either our new created coefficients are stored or if they have been | |
417 | // created in the meanwhile the locally allocated coefficients will be | |
418 | // freed by setEncodingCoefficient | |
419 | encode_coeff = tcache.setEncodingCoefficient(matrixtype, k, m, encode_coeff); | |
420 | } else { | |
421 | encode_coeff = *p_enc_coeff; | |
422 | } | |
423 | ||
424 | if (!*p_enc_table) { | |
425 | dout(10) << "[ cache tables ] creating tables for k=" << | |
426 | k << " m=" << m << dendl; | |
427 | // build encoding table which needs to be computed once for each (k,m) | |
428 | encode_tbls = (unsigned char*) malloc(k * (m + k)*32); | |
429 | ec_init_tables(k, m, &encode_coeff[k * k], encode_tbls); | |
430 | ||
431 | // either our new created table is stored or if it has been | |
432 | // created in the meanwhile the locally allocated table will be | |
433 | // freed by setEncodingTable | |
434 | encode_tbls = tcache.setEncodingTable(matrixtype, k, m, encode_tbls); | |
435 | } else { | |
436 | encode_tbls = *p_enc_table; | |
437 | } | |
438 | ||
439 | unsigned memory_lru_cache = | |
440 | k * (m + k) * 32 * tcache.decoding_tables_lru_length; | |
441 | ||
442 | dout(10) << "[ cache memory ] = " << memory_lru_cache << " bytes" << | |
443 | " [ matrix ] = " << | |
444 | ((matrixtype == kVandermonde) ? "Vandermonde" : "Cauchy") << dendl; | |
445 | ||
446 | assert((matrixtype == kVandermonde) || (matrixtype == kCauchy)); | |
447 | ||
448 | } | |
449 | // ----------------------------------------------------------------------------- |