2 * Ceph - scalable distributed file system
4 * Copyright (C) 2014 CERN (Switzerland)
6 * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch>
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
15 // -----------------------------------------------------------------------------
18 // -----------------------------------------------------------------------------
19 #include "common/debug.h"
20 #include "ErasureCodeIsa.h"
22 #include "crush/CrushWrapper.h"
23 #include "osd/osd_types.h"
26 // -----------------------------------------------------------------------------
28 #include "isa-l/include/erasure_code.h"
30 // -----------------------------------------------------------------------------
31 #define dout_context g_ceph_context
32 #define dout_subsys ceph_subsys_osd
34 #define dout_prefix _prefix(_dout)
35 // -----------------------------------------------------------------------------
37 // -----------------------------------------------------------------------------
40 _prefix(std::ostream
* _dout
)
42 return *_dout
<< "ErasureCodeIsa: ";
44 // -----------------------------------------------------------------------------
46 const std::string
ErasureCodeIsaDefault::DEFAULT_K("7");
47 const std::string
ErasureCodeIsaDefault::DEFAULT_M("3");
50 ErasureCodeIsa::create_ruleset(const string
&name
,
54 int ruleid
= crush
.add_simple_rule(
57 ruleset_failure_domain
,
59 pg_pool_t::TYPE_ERASURE
,
65 crush
.set_rule_mask_max_size(ruleid
, get_chunk_count());
66 return crush
.get_rule_mask_ruleset(ruleid
);
70 // -----------------------------------------------------------------------------
73 ErasureCodeIsa::init(ErasureCodeProfile
&profile
, ostream
*ss
)
76 err
|= to_string("ruleset-root", profile
,
78 DEFAULT_RULESET_ROOT
, ss
);
79 err
|= to_string("ruleset-failure-domain", profile
,
80 &ruleset_failure_domain
,
81 DEFAULT_RULESET_FAILURE_DOMAIN
, ss
);
82 err
|= parse(profile
, ss
);
86 ErasureCode::init(profile
, ss
);
90 // -----------------------------------------------------------------------------
93 ErasureCodeIsa::get_chunk_size(unsigned int object_size
) const
95 unsigned alignment
= get_alignment();
96 unsigned chunk_size
= ( object_size
+ k
- 1 ) / k
;
97 dout(20) << "get_chunk_size: chunk_size " << chunk_size
98 << " must be modulo " << alignment
<< dendl
;
99 unsigned modulo
= chunk_size
% alignment
;
101 dout(10) << "get_chunk_size: " << chunk_size
102 << " padded to " << chunk_size
+ alignment
- modulo
<< dendl
;
103 chunk_size
+= alignment
- modulo
;
108 // -----------------------------------------------------------------------------
110 int ErasureCodeIsa::encode_chunks(const set
<int> &want_to_encode
,
111 map
<int, bufferlist
> *encoded
)
114 for (int i
= 0; i
< k
+ m
; i
++)
115 chunks
[i
] = (*encoded
)[i
].c_str();
116 isa_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
120 int ErasureCodeIsa::decode_chunks(const set
<int> &want_to_read
,
121 const map
<int, bufferlist
> &chunks
,
122 map
<int, bufferlist
> *decoded
)
124 unsigned blocksize
= (*chunks
.begin()).second
.length();
125 int erasures
[k
+ m
+ 1];
126 int erasures_count
= 0;
129 for (int i
= 0; i
< k
+ m
; i
++) {
130 if (chunks
.find(i
) == chunks
.end()) {
131 erasures
[erasures_count
] = i
;
135 data
[i
] = (*decoded
)[i
].c_str();
137 coding
[i
- k
] = (*decoded
)[i
].c_str();
139 erasures
[erasures_count
] = -1;
140 assert(erasures_count
> 0);
141 return isa_decode(erasures
, data
, coding
, blocksize
);
144 // -----------------------------------------------------------------------------
147 ErasureCodeIsaDefault::isa_encode(char **data
,
153 // single parity stripe
154 region_xor((unsigned char**) data
, (unsigned char*) coding
[0], k
, blocksize
);
156 ec_encode_data(blocksize
, k
, m
, encode_tbls
,
157 (unsigned char**) data
, (unsigned char**) coding
);
160 // -----------------------------------------------------------------------------
163 ErasureCodeIsaDefault::erasure_contains(int *erasures
, int i
)
165 for (int l
= 0; erasures
[l
] != -1; l
++) {
166 if (erasures
[l
] == i
)
172 // -----------------------------------------------------------------------------
176 // -----------------------------------------------------------------------------
179 ErasureCodeIsaDefault::isa_decode(int *erasures
,
188 for (int l
= 0; erasures
[l
] != -1; l
++) {
192 unsigned char *recover_source
[k
];
193 unsigned char *recover_target
[m
];
195 memset(recover_source
, 0, sizeof (recover_source
));
196 memset(recover_target
, 0, sizeof (recover_target
));
198 // ---------------------------------------------
199 // Assign source and target buffers
200 // ---------------------------------------------
201 for (i
= 0, s
= 0, r
= 0; ((r
< k
) || (s
< nerrs
)) && (i
< (k
+ m
)); i
++) {
202 if (!erasure_contains(erasures
, i
)) {
205 recover_source
[r
] = (unsigned char*) data
[i
];
207 recover_source
[r
] = (unsigned char*) coding
[i
- k
];
214 recover_target
[s
] = (unsigned char*) data
[i
];
216 recover_target
[s
] = (unsigned char*) coding
[i
- k
];
224 // single parity decoding
226 dout(20) << "isa_decode: reconstruct using region xor [" <<
227 erasures
[0] << "]" << dendl
;
228 region_xor(recover_source
, recover_target
[0], k
, blocksize
);
233 if ((matrixtype
== kVandermonde
) &&
235 (erasures
[0] < (k
+ 1))) {
236 // use xor decoding if a data chunk is missing or the first coding chunk
237 dout(20) << "isa_decode: reconstruct using region xor [" <<
238 erasures
[0] << "]" << dendl
;
241 region_xor(recover_source
, recover_target
[0], k
, blocksize
);
245 unsigned char d
[k
* (m
+ k
)];
246 unsigned char decode_tbls
[k
* (m
+ k
)*32];
247 unsigned char *p_tbls
= decode_tbls
;
254 std::string erasure_signature
; // describes a matrix configuration for caching
256 // ---------------------------------------------
257 // Construct b by removing error rows
258 // ---------------------------------------------
260 for (i
= 0, r
= 0; i
< k
; i
++, r
++) {
262 while (erasure_contains(erasures
, r
))
267 snprintf(id
, sizeof (id
), "+%d", r
);
268 erasure_signature
+= id
;
271 for (int p
= 0; p
< nerrs
; p
++) {
273 snprintf(id
, sizeof (id
), "-%d", erasures
[p
]);
274 erasure_signature
+= id
;
277 // ---------------------------------------------
278 // Try to get an already computed matrix
279 // ---------------------------------------------
280 if (!tcache
.getDecodingTableFromCache(erasure_signature
, p_tbls
, matrixtype
, k
, m
)) {
282 unsigned char b
[k
* (m
+ k
)];
283 unsigned char c
[k
* (m
+ k
)];
285 for (i
= 0; i
< k
; i
++) {
287 for (j
= 0; j
< k
; j
++)
288 b
[k
* i
+ j
] = encode_coeff
[k
* r
+ j
];
290 // ---------------------------------------------
291 // Compute inverted matrix
292 // ---------------------------------------------
294 // --------------------------------------------------------
295 // Remark: this may fail for certain Vandermonde matrices !
296 // There is an advanced way trying to use different
297 // source chunks to get an invertible matrix, however
298 // there are also (k,m) combinations which cannot be
299 // inverted when m chunks are lost and this optimizations
300 // does not help. Therefor we keep the code simpler.
301 // --------------------------------------------------------
302 if (gf_invert_matrix(b
, d
, k
) < 0) {
303 dout(0) << "isa_decode: bad matrix" << dendl
;
307 for (int p
= 0; p
< nerrs
; p
++) {
308 if (erasures
[p
] < k
) {
309 // decoding matrix elements for data chunks
310 for (j
= 0; j
< k
; j
++) {
311 c
[k
* p
+ j
] = d
[k
* erasures
[p
] + j
];
314 // decoding matrix element for coding chunks
315 for (i
= 0; i
< k
; i
++) {
317 for (j
= 0; j
< k
; j
++)
318 s
^= gf_mul(d
[j
* k
+ i
],
319 encode_coeff
[k
* erasures
[p
] + j
]);
326 // ---------------------------------------------
327 // Initialize Decoding Table
328 // ---------------------------------------------
329 ec_init_tables(k
, nerrs
, c
, decode_tbls
);
330 tcache
.putDecodingTableToCache(erasure_signature
, p_tbls
, matrixtype
, k
, m
);
332 // Recover data sources
333 ec_encode_data(blocksize
,
334 k
, nerrs
, decode_tbls
, recover_source
, recover_target
);
340 // -----------------------------------------------------------------------------
343 ErasureCodeIsaDefault::get_alignment() const
345 return EC_ISA_ADDRESS_ALIGNMENT
;
348 // -----------------------------------------------------------------------------
350 int ErasureCodeIsaDefault::parse(ErasureCodeProfile
&profile
,
353 int err
= ErasureCode::parse(profile
, ss
);
354 err
|= to_int("k", profile
, &k
, DEFAULT_K
, ss
);
355 err
|= to_int("m", profile
, &m
, DEFAULT_M
, ss
);
356 err
|= sanity_check_k(k
, ss
);
358 if (matrixtype
== kVandermonde
) {
359 // these are verified safe values evaluated using the
360 // benchmarktool and 10*(combinatoric for maximum loss) random
363 *ss
<< "Vandermonde: m=" << m
364 << " should be less/equal than 32 : revert to k=32" << std::endl
;
370 *ss
<< "Vandermonde: m=" << m
371 << " should be less than 5 to guarantee an MDS codec:"
372 << " revert to m=4" << std::endl
;
379 *ss
<< "Vandermonde: k=" << k
380 << " should be less than 22 to guarantee an MDS"
381 << " codec with m=4: revert to k=21" << std::endl
;
393 // -----------------------------------------------------------------------------
396 ErasureCodeIsaDefault::prepare()
398 // setup shared encoding table and coefficients
399 unsigned char** p_enc_table
=
400 tcache
.getEncodingTable(matrixtype
, k
, m
);
402 unsigned char** p_enc_coeff
=
403 tcache
.getEncodingCoefficient(matrixtype
, k
, m
);
406 dout(10) << "[ cache tables ] creating coeff for k=" <<
407 k
<< " m=" << m
<< dendl
;
408 // build encoding coefficients which need to be computed once for each (k,m)
409 encode_coeff
= (unsigned char*) malloc(k
* (m
+ k
));
411 if (matrixtype
== kVandermonde
)
412 gf_gen_rs_matrix(encode_coeff
, k
+ m
, k
);
413 if (matrixtype
== kCauchy
)
414 gf_gen_cauchy1_matrix(encode_coeff
, k
+ m
, k
);
416 // either our new created coefficients are stored or if they have been
417 // created in the meanwhile the locally allocated coefficients will be
418 // freed by setEncodingCoefficient
419 encode_coeff
= tcache
.setEncodingCoefficient(matrixtype
, k
, m
, encode_coeff
);
421 encode_coeff
= *p_enc_coeff
;
425 dout(10) << "[ cache tables ] creating tables for k=" <<
426 k
<< " m=" << m
<< dendl
;
427 // build encoding table which needs to be computed once for each (k,m)
428 encode_tbls
= (unsigned char*) malloc(k
* (m
+ k
)*32);
429 ec_init_tables(k
, m
, &encode_coeff
[k
* k
], encode_tbls
);
431 // either our new created table is stored or if it has been
432 // created in the meanwhile the locally allocated table will be
433 // freed by setEncodingTable
434 encode_tbls
= tcache
.setEncodingTable(matrixtype
, k
, m
, encode_tbls
);
436 encode_tbls
= *p_enc_table
;
439 unsigned memory_lru_cache
=
440 k
* (m
+ k
) * 32 * tcache
.decoding_tables_lru_length
;
442 dout(10) << "[ cache memory ] = " << memory_lru_cache
<< " bytes" <<
444 ((matrixtype
== kVandermonde
) ? "Vandermonde" : "Cauchy") << dendl
;
446 assert((matrixtype
== kVandermonde
) || (matrixtype
== kCauchy
));
449 // -----------------------------------------------------------------------------