]> git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/isa/ErasureCodeIsa.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / erasure-code / isa / ErasureCodeIsa.cc
1 /*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 CERN (Switzerland)
5 *
6 * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch>
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 */
14
15 // -----------------------------------------------------------------------------
16 #include <algorithm>
17 #include <dlfcn.h>
18 #include <errno.h>
19 // -----------------------------------------------------------------------------
20 #include "common/debug.h"
21 #include "ErasureCodeIsa.h"
22 #include "xor_op.h"
23 #include "crush/CrushWrapper.h"
24 #include "osd/osd_types.h"
25 // -----------------------------------------------------------------------------
26 extern "C" {
27 #include "isa-l/include/erasure_code.h"
28 }
29 // -----------------------------------------------------------------------------
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_osd
32 #undef dout_prefix
33 #define dout_prefix _prefix(_dout)
34 // -----------------------------------------------------------------------------
35
36 // -----------------------------------------------------------------------------
37
38 static ostream&
39 _prefix(std::ostream* _dout)
40 {
41 return *_dout << "ErasureCodeIsa: ";
42 }
43 // -----------------------------------------------------------------------------
44
45 const std::string ErasureCodeIsaDefault::DEFAULT_K("7");
46 const std::string ErasureCodeIsaDefault::DEFAULT_M("3");
47
48 int
49 ErasureCodeIsa::create_ruleset(const string &name,
50 CrushWrapper &crush,
51 ostream *ss) const
52 {
53 int ruleid = crush.add_simple_ruleset(name,
54 ruleset_root,
55 ruleset_failure_domain,
56 "indep",
57 pg_pool_t::TYPE_ERASURE,
58 ss);
59
60 if (ruleid < 0)
61 return ruleid;
62 else {
63 crush.set_rule_mask_max_size(ruleid, get_chunk_count());
64 return crush.get_rule_mask_ruleset(ruleid);
65 }
66 }
67
68 // -----------------------------------------------------------------------------
69
70 int
71 ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss)
72 {
73 int err = 0;
74 err |= to_string("ruleset-root", profile,
75 &ruleset_root,
76 DEFAULT_RULESET_ROOT, ss);
77 err |= to_string("ruleset-failure-domain", profile,
78 &ruleset_failure_domain,
79 DEFAULT_RULESET_FAILURE_DOMAIN, ss);
80 err |= parse(profile, ss);
81 if (err)
82 return err;
83 prepare();
84 ErasureCode::init(profile, ss);
85 return err;
86 }
87
88 // -----------------------------------------------------------------------------
89
90 unsigned int
91 ErasureCodeIsa::get_chunk_size(unsigned int object_size) const
92 {
93 unsigned alignment = get_alignment();
94 unsigned chunk_size = ( object_size + k - 1 ) / k;
95 dout(20) << "get_chunk_size: chunk_size " << chunk_size
96 << " must be modulo " << alignment << dendl;
97 unsigned modulo = chunk_size % alignment;
98 if (modulo) {
99 dout(10) << "get_chunk_size: " << chunk_size
100 << " padded to " << chunk_size + alignment - modulo << dendl;
101 chunk_size += alignment - modulo;
102 }
103 return chunk_size;
104 }
105
106 // -----------------------------------------------------------------------------
107
108 int ErasureCodeIsa::encode_chunks(const set<int> &want_to_encode,
109 map<int, bufferlist> *encoded)
110 {
111 char *chunks[k + m];
112 for (int i = 0; i < k + m; i++)
113 chunks[i] = (*encoded)[i].c_str();
114 isa_encode(&chunks[0], &chunks[k], (*encoded)[0].length());
115 return 0;
116 }
117
118 int ErasureCodeIsa::decode_chunks(const set<int> &want_to_read,
119 const map<int, bufferlist> &chunks,
120 map<int, bufferlist> *decoded)
121 {
122 unsigned blocksize = (*chunks.begin()).second.length();
123 int erasures[k + m + 1];
124 int erasures_count = 0;
125 char *data[k];
126 char *coding[m];
127 for (int i = 0; i < k + m; i++) {
128 if (chunks.find(i) == chunks.end()) {
129 erasures[erasures_count] = i;
130 erasures_count++;
131 }
132 if (i < k)
133 data[i] = (*decoded)[i].c_str();
134 else
135 coding[i - k] = (*decoded)[i].c_str();
136 }
137 erasures[erasures_count] = -1;
138 assert(erasures_count > 0);
139 return isa_decode(erasures, data, coding, blocksize);
140 }
141
142 // -----------------------------------------------------------------------------
143
144 void
145 ErasureCodeIsaDefault::isa_encode(char **data,
146 char **coding,
147 int blocksize)
148 {
149
150 if (m == 1)
151 // single parity stripe
152 region_xor((unsigned char**) data, (unsigned char*) coding[0], k, blocksize);
153 else
154 ec_encode_data(blocksize, k, m, encode_tbls,
155 (unsigned char**) data, (unsigned char**) coding);
156 }
157
158 // -----------------------------------------------------------------------------
159
160 bool
161 ErasureCodeIsaDefault::erasure_contains(int *erasures, int i)
162 {
163 for (int l = 0; erasures[l] != -1; l++) {
164 if (erasures[l] == i)
165 return true;
166 }
167 return false;
168 }
169
170 // -----------------------------------------------------------------------------
171
172
173
174 // -----------------------------------------------------------------------------
175
176 int
177 ErasureCodeIsaDefault::isa_decode(int *erasures,
178 char **data,
179 char **coding,
180 int blocksize)
181 {
182 int nerrs = 0;
183 int i, r, s;
184
185 // count the errors
186 for (int l = 0; erasures[l] != -1; l++) {
187 nerrs++;
188 }
189
190 unsigned char *recover_source[k];
191 unsigned char *recover_target[m];
192
193 memset(recover_source, 0, sizeof (recover_source));
194 memset(recover_target, 0, sizeof (recover_target));
195
196 // ---------------------------------------------
197 // Assign source and target buffers
198 // ---------------------------------------------
199 for (i = 0, s = 0, r = 0; ((r < k) || (s < nerrs)) && (i < (k + m)); i++) {
200 if (!erasure_contains(erasures, i)) {
201 if (r < k) {
202 if (i < k) {
203 recover_source[r] = (unsigned char*) data[i];
204 } else {
205 recover_source[r] = (unsigned char*) coding[i - k];
206 }
207 r++;
208 }
209 } else {
210 if (s < m) {
211 if (i < k) {
212 recover_target[s] = (unsigned char*) data[i];
213 } else {
214 recover_target[s] = (unsigned char*) coding[i - k];
215 }
216 s++;
217 }
218 }
219 }
220
221 if (m == 1) {
222 // single parity decoding
223 assert(1 == nerrs);
224 dout(20) << "isa_decode: reconstruct using region xor [" <<
225 erasures[0] << "]" << dendl;
226 region_xor(recover_source, recover_target[0], k, blocksize);
227 return 0;
228 }
229
230
231 if ((matrixtype == kVandermonde) &&
232 (nerrs == 1) &&
233 (erasures[0] < (k + 1))) {
234 // use xor decoding if a data chunk is missing or the first coding chunk
235 dout(20) << "isa_decode: reconstruct using region xor [" <<
236 erasures[0] << "]" << dendl;
237 assert(1 == s);
238 assert(k == r);
239 region_xor(recover_source, recover_target[0], k, blocksize);
240 return 0;
241 }
242
243 unsigned char d[k * (m + k)];
244 unsigned char decode_tbls[k * (m + k)*32];
245 unsigned char *p_tbls = decode_tbls;
246
247 int decode_index[k];
248
249 if (nerrs > m)
250 return -1;
251
252 std::string erasure_signature; // describes a matrix configuration for caching
253
254 // ---------------------------------------------
255 // Construct b by removing error rows
256 // ---------------------------------------------
257
258 for (i = 0, r = 0; i < k; i++, r++) {
259 char id[128];
260 while (erasure_contains(erasures, r))
261 r++;
262
263 decode_index[i] = r;
264
265 snprintf(id, sizeof (id), "+%d", r);
266 erasure_signature += id;
267 }
268
269 for (int p = 0; p < nerrs; p++) {
270 char id[128];
271 snprintf(id, sizeof (id), "-%d", erasures[p]);
272 erasure_signature += id;
273 }
274
275 // ---------------------------------------------
276 // Try to get an already computed matrix
277 // ---------------------------------------------
278 if (!tcache.getDecodingTableFromCache(erasure_signature, p_tbls, matrixtype, k, m)) {
279 int j;
280 unsigned char b[k * (m + k)];
281 unsigned char c[k * (m + k)];
282
283 for (i = 0; i < k; i++) {
284 r = decode_index[i];
285 for (j = 0; j < k; j++)
286 b[k * i + j] = encode_coeff[k * r + j];
287 }
288 // ---------------------------------------------
289 // Compute inverted matrix
290 // ---------------------------------------------
291
292 // --------------------------------------------------------
293 // Remark: this may fail for certain Vandermonde matrices !
294 // There is an advanced way trying to use different
295 // source chunks to get an invertible matrix, however
296 // there are also (k,m) combinations which cannot be
297 // inverted when m chunks are lost and this optimizations
298 // does not help. Therefor we keep the code simpler.
299 // --------------------------------------------------------
300 if (gf_invert_matrix(b, d, k) < 0) {
301 dout(0) << "isa_decode: bad matrix" << dendl;
302 return -1;
303 }
304
305 for (int p = 0; p < nerrs; p++) {
306 if (erasures[p] < k) {
307 // decoding matrix elements for data chunks
308 for (j = 0; j < k; j++) {
309 c[k * p + j] = d[k * erasures[p] + j];
310 }
311 } else {
312 // decoding matrix element for coding chunks
313 for (i = 0; i < k; i++) {
314 int s = 0;
315 for (j = 0; j < k; j++)
316 s ^= gf_mul(d[j * k + i],
317 encode_coeff[k * erasures[p] + j]);
318
319 c[k * p + i] = s;
320 }
321 }
322 }
323
324 // ---------------------------------------------
325 // Initialize Decoding Table
326 // ---------------------------------------------
327 ec_init_tables(k, nerrs, c, decode_tbls);
328 tcache.putDecodingTableToCache(erasure_signature, p_tbls, matrixtype, k, m);
329 }
330 // Recover data sources
331 ec_encode_data(blocksize,
332 k, nerrs, decode_tbls, recover_source, recover_target);
333
334
335 return 0;
336 }
337
338 // -----------------------------------------------------------------------------
339
340 unsigned
341 ErasureCodeIsaDefault::get_alignment() const
342 {
343 return EC_ISA_ADDRESS_ALIGNMENT;
344 }
345
346 // -----------------------------------------------------------------------------
347
348 int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile,
349 ostream *ss)
350 {
351 int err = ErasureCode::parse(profile, ss);
352 err |= to_int("k", profile, &k, DEFAULT_K, ss);
353 err |= to_int("m", profile, &m, DEFAULT_M, ss);
354 err |= sanity_check_k(k, ss);
355
356 if (matrixtype == kVandermonde) {
357 // these are verified safe values evaluated using the
358 // benchmarktool and 10*(combinatoric for maximum loss) random
359 // full erasures
360 if (k > 32) {
361 *ss << "Vandermonde: m=" << m
362 << " should be less/equal than 32 : revert to k=32" << std::endl;
363 k = 32;
364 err = -EINVAL;
365 }
366
367 if (m > 4) {
368 *ss << "Vandermonde: m=" << m
369 << " should be less than 5 to guarantee an MDS codec:"
370 << " revert to m=4" << std::endl;
371 m = 4;
372 err = -EINVAL;
373 }
374 switch (m) {
375 case 4:
376 if (k > 21) {
377 *ss << "Vandermonde: k=" << k
378 << " should be less than 22 to guarantee an MDS"
379 << " codec with m=4: revert to k=21" << std::endl;
380 k = 21;
381 err = -EINVAL;
382 }
383 break;
384 default:
385 ;
386 }
387 }
388 return err;
389 }
390
391 // -----------------------------------------------------------------------------
392
393 void
394 ErasureCodeIsaDefault::prepare()
395 {
396 // setup shared encoding table and coefficients
397 unsigned char** p_enc_table =
398 tcache.getEncodingTable(matrixtype, k, m);
399
400 unsigned char** p_enc_coeff =
401 tcache.getEncodingCoefficient(matrixtype, k, m);
402
403 if (!*p_enc_coeff) {
404 dout(10) << "[ cache tables ] creating coeff for k=" <<
405 k << " m=" << m << dendl;
406 // build encoding coefficients which need to be computed once for each (k,m)
407 encode_coeff = (unsigned char*) malloc(k * (m + k));
408
409 if (matrixtype == kVandermonde)
410 gf_gen_rs_matrix(encode_coeff, k + m, k);
411 if (matrixtype == kCauchy)
412 gf_gen_cauchy1_matrix(encode_coeff, k + m, k);
413
414 // either our new created coefficients are stored or if they have been
415 // created in the meanwhile the locally allocated coefficients will be
416 // freed by setEncodingCoefficient
417 encode_coeff = tcache.setEncodingCoefficient(matrixtype, k, m, encode_coeff);
418 } else {
419 encode_coeff = *p_enc_coeff;
420 }
421
422 if (!*p_enc_table) {
423 dout(10) << "[ cache tables ] creating tables for k=" <<
424 k << " m=" << m << dendl;
425 // build encoding table which needs to be computed once for each (k,m)
426 encode_tbls = (unsigned char*) malloc(k * (m + k)*32);
427 ec_init_tables(k, m, &encode_coeff[k * k], encode_tbls);
428
429 // either our new created table is stored or if it has been
430 // created in the meanwhile the locally allocated table will be
431 // freed by setEncodingTable
432 encode_tbls = tcache.setEncodingTable(matrixtype, k, m, encode_tbls);
433 } else {
434 encode_tbls = *p_enc_table;
435 }
436
437 unsigned memory_lru_cache =
438 k * (m + k) * 32 * tcache.decoding_tables_lru_length;
439
440 dout(10) << "[ cache memory ] = " << memory_lru_cache << " bytes" <<
441 " [ matrix ] = " <<
442 ((matrixtype == kVandermonde) ? "Vandermonde" : "Cauchy") << dendl;
443
444 assert((matrixtype == kVandermonde) || (matrixtype == kCauchy));
445
446 }
447 // -----------------------------------------------------------------------------