]> git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/isa/ErasureCodeIsa.cc
4a6841abd6226e35bc7186afd66e1468c38b20d4
[ceph.git] / ceph / src / erasure-code / isa / ErasureCodeIsa.cc
1 /*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 CERN (Switzerland)
5 *
6 * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch>
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 */
14
15 // -----------------------------------------------------------------------------
16 #include <algorithm>
17 #include <errno.h>
18 // -----------------------------------------------------------------------------
19 #include "common/debug.h"
20 #include "ErasureCodeIsa.h"
21 #include "xor_op.h"
22 #include "crush/CrushWrapper.h"
23 #include "osd/osd_types.h"
24 using namespace std;
25
26 // -----------------------------------------------------------------------------
27 extern "C" {
28 #include "isa-l/include/erasure_code.h"
29 }
30 // -----------------------------------------------------------------------------
31 #define dout_context g_ceph_context
32 #define dout_subsys ceph_subsys_osd
33 #undef dout_prefix
34 #define dout_prefix _prefix(_dout)
35 // -----------------------------------------------------------------------------
36
37 // -----------------------------------------------------------------------------
38
39 static ostream&
40 _prefix(std::ostream* _dout)
41 {
42 return *_dout << "ErasureCodeIsa: ";
43 }
44 // -----------------------------------------------------------------------------
45
46 const std::string ErasureCodeIsaDefault::DEFAULT_K("7");
47 const std::string ErasureCodeIsaDefault::DEFAULT_M("3");
48
49 int
50 ErasureCodeIsa::create_ruleset(const string &name,
51 CrushWrapper &crush,
52 ostream *ss) const
53 {
54 int ruleid = crush.add_simple_rule(
55 name,
56 ruleset_root,
57 ruleset_failure_domain,
58 "indep",
59 pg_pool_t::TYPE_ERASURE,
60 ss);
61
62 if (ruleid < 0)
63 return ruleid;
64 else {
65 crush.set_rule_mask_max_size(ruleid, get_chunk_count());
66 return crush.get_rule_mask_ruleset(ruleid);
67 }
68 }
69
70 // -----------------------------------------------------------------------------
71
72 int
73 ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss)
74 {
75 int err = 0;
76 err |= to_string("ruleset-root", profile,
77 &ruleset_root,
78 DEFAULT_RULESET_ROOT, ss);
79 err |= to_string("ruleset-failure-domain", profile,
80 &ruleset_failure_domain,
81 DEFAULT_RULESET_FAILURE_DOMAIN, ss);
82 err |= parse(profile, ss);
83 if (err)
84 return err;
85 prepare();
86 ErasureCode::init(profile, ss);
87 return err;
88 }
89
90 // -----------------------------------------------------------------------------
91
92 unsigned int
93 ErasureCodeIsa::get_chunk_size(unsigned int object_size) const
94 {
95 unsigned alignment = get_alignment();
96 unsigned chunk_size = ( object_size + k - 1 ) / k;
97 dout(20) << "get_chunk_size: chunk_size " << chunk_size
98 << " must be modulo " << alignment << dendl;
99 unsigned modulo = chunk_size % alignment;
100 if (modulo) {
101 dout(10) << "get_chunk_size: " << chunk_size
102 << " padded to " << chunk_size + alignment - modulo << dendl;
103 chunk_size += alignment - modulo;
104 }
105 return chunk_size;
106 }
107
108 // -----------------------------------------------------------------------------
109
110 int ErasureCodeIsa::encode_chunks(const set<int> &want_to_encode,
111 map<int, bufferlist> *encoded)
112 {
113 char *chunks[k + m];
114 for (int i = 0; i < k + m; i++)
115 chunks[i] = (*encoded)[i].c_str();
116 isa_encode(&chunks[0], &chunks[k], (*encoded)[0].length());
117 return 0;
118 }
119
120 int ErasureCodeIsa::decode_chunks(const set<int> &want_to_read,
121 const map<int, bufferlist> &chunks,
122 map<int, bufferlist> *decoded)
123 {
124 unsigned blocksize = (*chunks.begin()).second.length();
125 int erasures[k + m + 1];
126 int erasures_count = 0;
127 char *data[k];
128 char *coding[m];
129 for (int i = 0; i < k + m; i++) {
130 if (chunks.find(i) == chunks.end()) {
131 erasures[erasures_count] = i;
132 erasures_count++;
133 }
134 if (i < k)
135 data[i] = (*decoded)[i].c_str();
136 else
137 coding[i - k] = (*decoded)[i].c_str();
138 }
139 erasures[erasures_count] = -1;
140 assert(erasures_count > 0);
141 return isa_decode(erasures, data, coding, blocksize);
142 }
143
144 // -----------------------------------------------------------------------------
145
146 void
147 ErasureCodeIsaDefault::isa_encode(char **data,
148 char **coding,
149 int blocksize)
150 {
151
152 if (m == 1)
153 // single parity stripe
154 region_xor((unsigned char**) data, (unsigned char*) coding[0], k, blocksize);
155 else
156 ec_encode_data(blocksize, k, m, encode_tbls,
157 (unsigned char**) data, (unsigned char**) coding);
158 }
159
160 // -----------------------------------------------------------------------------
161
162 bool
163 ErasureCodeIsaDefault::erasure_contains(int *erasures, int i)
164 {
165 for (int l = 0; erasures[l] != -1; l++) {
166 if (erasures[l] == i)
167 return true;
168 }
169 return false;
170 }
171
172 // -----------------------------------------------------------------------------
173
174
175
176 // -----------------------------------------------------------------------------
177
178 int
179 ErasureCodeIsaDefault::isa_decode(int *erasures,
180 char **data,
181 char **coding,
182 int blocksize)
183 {
184 int nerrs = 0;
185 int i, r, s;
186
187 // count the errors
188 for (int l = 0; erasures[l] != -1; l++) {
189 nerrs++;
190 }
191
192 unsigned char *recover_source[k];
193 unsigned char *recover_target[m];
194
195 memset(recover_source, 0, sizeof (recover_source));
196 memset(recover_target, 0, sizeof (recover_target));
197
198 // ---------------------------------------------
199 // Assign source and target buffers
200 // ---------------------------------------------
201 for (i = 0, s = 0, r = 0; ((r < k) || (s < nerrs)) && (i < (k + m)); i++) {
202 if (!erasure_contains(erasures, i)) {
203 if (r < k) {
204 if (i < k) {
205 recover_source[r] = (unsigned char*) data[i];
206 } else {
207 recover_source[r] = (unsigned char*) coding[i - k];
208 }
209 r++;
210 }
211 } else {
212 if (s < m) {
213 if (i < k) {
214 recover_target[s] = (unsigned char*) data[i];
215 } else {
216 recover_target[s] = (unsigned char*) coding[i - k];
217 }
218 s++;
219 }
220 }
221 }
222
223 if (m == 1) {
224 // single parity decoding
225 assert(1 == nerrs);
226 dout(20) << "isa_decode: reconstruct using region xor [" <<
227 erasures[0] << "]" << dendl;
228 region_xor(recover_source, recover_target[0], k, blocksize);
229 return 0;
230 }
231
232
233 if ((matrixtype == kVandermonde) &&
234 (nerrs == 1) &&
235 (erasures[0] < (k + 1))) {
236 // use xor decoding if a data chunk is missing or the first coding chunk
237 dout(20) << "isa_decode: reconstruct using region xor [" <<
238 erasures[0] << "]" << dendl;
239 assert(1 == s);
240 assert(k == r);
241 region_xor(recover_source, recover_target[0], k, blocksize);
242 return 0;
243 }
244
245 unsigned char d[k * (m + k)];
246 unsigned char decode_tbls[k * (m + k)*32];
247 unsigned char *p_tbls = decode_tbls;
248
249 int decode_index[k];
250
251 if (nerrs > m)
252 return -1;
253
254 std::string erasure_signature; // describes a matrix configuration for caching
255
256 // ---------------------------------------------
257 // Construct b by removing error rows
258 // ---------------------------------------------
259
260 for (i = 0, r = 0; i < k; i++, r++) {
261 char id[128];
262 while (erasure_contains(erasures, r))
263 r++;
264
265 decode_index[i] = r;
266
267 snprintf(id, sizeof (id), "+%d", r);
268 erasure_signature += id;
269 }
270
271 for (int p = 0; p < nerrs; p++) {
272 char id[128];
273 snprintf(id, sizeof (id), "-%d", erasures[p]);
274 erasure_signature += id;
275 }
276
277 // ---------------------------------------------
278 // Try to get an already computed matrix
279 // ---------------------------------------------
280 if (!tcache.getDecodingTableFromCache(erasure_signature, p_tbls, matrixtype, k, m)) {
281 int j;
282 unsigned char b[k * (m + k)];
283 unsigned char c[k * (m + k)];
284
285 for (i = 0; i < k; i++) {
286 r = decode_index[i];
287 for (j = 0; j < k; j++)
288 b[k * i + j] = encode_coeff[k * r + j];
289 }
290 // ---------------------------------------------
291 // Compute inverted matrix
292 // ---------------------------------------------
293
294 // --------------------------------------------------------
295 // Remark: this may fail for certain Vandermonde matrices !
296 // There is an advanced way trying to use different
297 // source chunks to get an invertible matrix, however
298 // there are also (k,m) combinations which cannot be
299 // inverted when m chunks are lost and this optimizations
300 // does not help. Therefor we keep the code simpler.
301 // --------------------------------------------------------
302 if (gf_invert_matrix(b, d, k) < 0) {
303 dout(0) << "isa_decode: bad matrix" << dendl;
304 return -1;
305 }
306
307 for (int p = 0; p < nerrs; p++) {
308 if (erasures[p] < k) {
309 // decoding matrix elements for data chunks
310 for (j = 0; j < k; j++) {
311 c[k * p + j] = d[k * erasures[p] + j];
312 }
313 } else {
314 // decoding matrix element for coding chunks
315 for (i = 0; i < k; i++) {
316 int s = 0;
317 for (j = 0; j < k; j++)
318 s ^= gf_mul(d[j * k + i],
319 encode_coeff[k * erasures[p] + j]);
320
321 c[k * p + i] = s;
322 }
323 }
324 }
325
326 // ---------------------------------------------
327 // Initialize Decoding Table
328 // ---------------------------------------------
329 ec_init_tables(k, nerrs, c, decode_tbls);
330 tcache.putDecodingTableToCache(erasure_signature, p_tbls, matrixtype, k, m);
331 }
332 // Recover data sources
333 ec_encode_data(blocksize,
334 k, nerrs, decode_tbls, recover_source, recover_target);
335
336
337 return 0;
338 }
339
340 // -----------------------------------------------------------------------------
341
342 unsigned
343 ErasureCodeIsaDefault::get_alignment() const
344 {
345 return EC_ISA_ADDRESS_ALIGNMENT;
346 }
347
348 // -----------------------------------------------------------------------------
349
350 int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile,
351 ostream *ss)
352 {
353 int err = ErasureCode::parse(profile, ss);
354 err |= to_int("k", profile, &k, DEFAULT_K, ss);
355 err |= to_int("m", profile, &m, DEFAULT_M, ss);
356 err |= sanity_check_k(k, ss);
357
358 if (matrixtype == kVandermonde) {
359 // these are verified safe values evaluated using the
360 // benchmarktool and 10*(combinatoric for maximum loss) random
361 // full erasures
362 if (k > 32) {
363 *ss << "Vandermonde: m=" << m
364 << " should be less/equal than 32 : revert to k=32" << std::endl;
365 k = 32;
366 err = -EINVAL;
367 }
368
369 if (m > 4) {
370 *ss << "Vandermonde: m=" << m
371 << " should be less than 5 to guarantee an MDS codec:"
372 << " revert to m=4" << std::endl;
373 m = 4;
374 err = -EINVAL;
375 }
376 switch (m) {
377 case 4:
378 if (k > 21) {
379 *ss << "Vandermonde: k=" << k
380 << " should be less than 22 to guarantee an MDS"
381 << " codec with m=4: revert to k=21" << std::endl;
382 k = 21;
383 err = -EINVAL;
384 }
385 break;
386 default:
387 ;
388 }
389 }
390 return err;
391 }
392
393 // -----------------------------------------------------------------------------
394
395 void
396 ErasureCodeIsaDefault::prepare()
397 {
398 // setup shared encoding table and coefficients
399 unsigned char** p_enc_table =
400 tcache.getEncodingTable(matrixtype, k, m);
401
402 unsigned char** p_enc_coeff =
403 tcache.getEncodingCoefficient(matrixtype, k, m);
404
405 if (!*p_enc_coeff) {
406 dout(10) << "[ cache tables ] creating coeff for k=" <<
407 k << " m=" << m << dendl;
408 // build encoding coefficients which need to be computed once for each (k,m)
409 encode_coeff = (unsigned char*) malloc(k * (m + k));
410
411 if (matrixtype == kVandermonde)
412 gf_gen_rs_matrix(encode_coeff, k + m, k);
413 if (matrixtype == kCauchy)
414 gf_gen_cauchy1_matrix(encode_coeff, k + m, k);
415
416 // either our new created coefficients are stored or if they have been
417 // created in the meanwhile the locally allocated coefficients will be
418 // freed by setEncodingCoefficient
419 encode_coeff = tcache.setEncodingCoefficient(matrixtype, k, m, encode_coeff);
420 } else {
421 encode_coeff = *p_enc_coeff;
422 }
423
424 if (!*p_enc_table) {
425 dout(10) << "[ cache tables ] creating tables for k=" <<
426 k << " m=" << m << dendl;
427 // build encoding table which needs to be computed once for each (k,m)
428 encode_tbls = (unsigned char*) malloc(k * (m + k)*32);
429 ec_init_tables(k, m, &encode_coeff[k * k], encode_tbls);
430
431 // either our new created table is stored or if it has been
432 // created in the meanwhile the locally allocated table will be
433 // freed by setEncodingTable
434 encode_tbls = tcache.setEncodingTable(matrixtype, k, m, encode_tbls);
435 } else {
436 encode_tbls = *p_enc_table;
437 }
438
439 unsigned memory_lru_cache =
440 k * (m + k) * 32 * tcache.decoding_tables_lru_length;
441
442 dout(10) << "[ cache memory ] = " << memory_lru_cache << " bytes" <<
443 " [ matrix ] = " <<
444 ((matrixtype == kVandermonde) ? "Vandermonde" : "Cauchy") << dendl;
445
446 assert((matrixtype == kVandermonde) || (matrixtype == kCauchy));
447
448 }
449 // -----------------------------------------------------------------------------