2 * Ceph - scalable distributed file system
4 * Copyright (C) 2014 CERN (Switzerland)
6 * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch>
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
15 // -----------------------------------------------------------------------------
19 // -----------------------------------------------------------------------------
20 #include "common/debug.h"
21 #include "ErasureCodeIsa.h"
23 #include "crush/CrushWrapper.h"
24 #include "osd/osd_types.h"
25 // -----------------------------------------------------------------------------
27 #include "isa-l/include/erasure_code.h"
29 // -----------------------------------------------------------------------------
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_osd
33 #define dout_prefix _prefix(_dout)
34 // -----------------------------------------------------------------------------
36 // -----------------------------------------------------------------------------
39 _prefix(std::ostream
* _dout
)
41 return *_dout
<< "ErasureCodeIsa: ";
43 // -----------------------------------------------------------------------------
45 const std::string
ErasureCodeIsaDefault::DEFAULT_K("7");
46 const std::string
ErasureCodeIsaDefault::DEFAULT_M("3");
49 ErasureCodeIsa::create_ruleset(const string
&name
,
53 int ruleid
= crush
.add_simple_ruleset(name
,
55 ruleset_failure_domain
,
57 pg_pool_t::TYPE_ERASURE
,
63 crush
.set_rule_mask_max_size(ruleid
, get_chunk_count());
64 return crush
.get_rule_mask_ruleset(ruleid
);
68 // -----------------------------------------------------------------------------
71 ErasureCodeIsa::init(ErasureCodeProfile
&profile
, ostream
*ss
)
74 err
|= to_string("ruleset-root", profile
,
76 DEFAULT_RULESET_ROOT
, ss
);
77 err
|= to_string("ruleset-failure-domain", profile
,
78 &ruleset_failure_domain
,
79 DEFAULT_RULESET_FAILURE_DOMAIN
, ss
);
80 err
|= parse(profile
, ss
);
84 ErasureCode::init(profile
, ss
);
88 // -----------------------------------------------------------------------------
91 ErasureCodeIsa::get_chunk_size(unsigned int object_size
) const
93 unsigned alignment
= get_alignment();
94 unsigned chunk_size
= ( object_size
+ k
- 1 ) / k
;
95 dout(20) << "get_chunk_size: chunk_size " << chunk_size
96 << " must be modulo " << alignment
<< dendl
;
97 unsigned modulo
= chunk_size
% alignment
;
99 dout(10) << "get_chunk_size: " << chunk_size
100 << " padded to " << chunk_size
+ alignment
- modulo
<< dendl
;
101 chunk_size
+= alignment
- modulo
;
106 // -----------------------------------------------------------------------------
108 int ErasureCodeIsa::encode_chunks(const set
<int> &want_to_encode
,
109 map
<int, bufferlist
> *encoded
)
112 for (int i
= 0; i
< k
+ m
; i
++)
113 chunks
[i
] = (*encoded
)[i
].c_str();
114 isa_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
118 int ErasureCodeIsa::decode_chunks(const set
<int> &want_to_read
,
119 const map
<int, bufferlist
> &chunks
,
120 map
<int, bufferlist
> *decoded
)
122 unsigned blocksize
= (*chunks
.begin()).second
.length();
123 int erasures
[k
+ m
+ 1];
124 int erasures_count
= 0;
127 for (int i
= 0; i
< k
+ m
; i
++) {
128 if (chunks
.find(i
) == chunks
.end()) {
129 erasures
[erasures_count
] = i
;
133 data
[i
] = (*decoded
)[i
].c_str();
135 coding
[i
- k
] = (*decoded
)[i
].c_str();
137 erasures
[erasures_count
] = -1;
138 assert(erasures_count
> 0);
139 return isa_decode(erasures
, data
, coding
, blocksize
);
142 // -----------------------------------------------------------------------------
145 ErasureCodeIsaDefault::isa_encode(char **data
,
151 // single parity stripe
152 region_xor((unsigned char**) data
, (unsigned char*) coding
[0], k
, blocksize
);
154 ec_encode_data(blocksize
, k
, m
, encode_tbls
,
155 (unsigned char**) data
, (unsigned char**) coding
);
158 // -----------------------------------------------------------------------------
161 ErasureCodeIsaDefault::erasure_contains(int *erasures
, int i
)
163 for (int l
= 0; erasures
[l
] != -1; l
++) {
164 if (erasures
[l
] == i
)
170 // -----------------------------------------------------------------------------
174 // -----------------------------------------------------------------------------
177 ErasureCodeIsaDefault::isa_decode(int *erasures
,
186 for (int l
= 0; erasures
[l
] != -1; l
++) {
190 unsigned char *recover_source
[k
];
191 unsigned char *recover_target
[m
];
193 memset(recover_source
, 0, sizeof (recover_source
));
194 memset(recover_target
, 0, sizeof (recover_target
));
196 // ---------------------------------------------
197 // Assign source and target buffers
198 // ---------------------------------------------
199 for (i
= 0, s
= 0, r
= 0; ((r
< k
) || (s
< nerrs
)) && (i
< (k
+ m
)); i
++) {
200 if (!erasure_contains(erasures
, i
)) {
203 recover_source
[r
] = (unsigned char*) data
[i
];
205 recover_source
[r
] = (unsigned char*) coding
[i
- k
];
212 recover_target
[s
] = (unsigned char*) data
[i
];
214 recover_target
[s
] = (unsigned char*) coding
[i
- k
];
222 // single parity decoding
224 dout(20) << "isa_decode: reconstruct using region xor [" <<
225 erasures
[0] << "]" << dendl
;
226 region_xor(recover_source
, recover_target
[0], k
, blocksize
);
231 if ((matrixtype
== kVandermonde
) &&
233 (erasures
[0] < (k
+ 1))) {
234 // use xor decoding if a data chunk is missing or the first coding chunk
235 dout(20) << "isa_decode: reconstruct using region xor [" <<
236 erasures
[0] << "]" << dendl
;
239 region_xor(recover_source
, recover_target
[0], k
, blocksize
);
243 unsigned char d
[k
* (m
+ k
)];
244 unsigned char decode_tbls
[k
* (m
+ k
)*32];
245 unsigned char *p_tbls
= decode_tbls
;
252 std::string erasure_signature
; // describes a matrix configuration for caching
254 // ---------------------------------------------
255 // Construct b by removing error rows
256 // ---------------------------------------------
258 for (i
= 0, r
= 0; i
< k
; i
++, r
++) {
260 while (erasure_contains(erasures
, r
))
265 snprintf(id
, sizeof (id
), "+%d", r
);
266 erasure_signature
+= id
;
269 for (int p
= 0; p
< nerrs
; p
++) {
271 snprintf(id
, sizeof (id
), "-%d", erasures
[p
]);
272 erasure_signature
+= id
;
275 // ---------------------------------------------
276 // Try to get an already computed matrix
277 // ---------------------------------------------
278 if (!tcache
.getDecodingTableFromCache(erasure_signature
, p_tbls
, matrixtype
, k
, m
)) {
280 unsigned char b
[k
* (m
+ k
)];
281 unsigned char c
[k
* (m
+ k
)];
283 for (i
= 0; i
< k
; i
++) {
285 for (j
= 0; j
< k
; j
++)
286 b
[k
* i
+ j
] = encode_coeff
[k
* r
+ j
];
288 // ---------------------------------------------
289 // Compute inverted matrix
290 // ---------------------------------------------
292 // --------------------------------------------------------
293 // Remark: this may fail for certain Vandermonde matrices !
294 // There is an advanced way trying to use different
295 // source chunks to get an invertible matrix, however
296 // there are also (k,m) combinations which cannot be
297 // inverted when m chunks are lost and this optimizations
298 // does not help. Therefor we keep the code simpler.
299 // --------------------------------------------------------
300 if (gf_invert_matrix(b
, d
, k
) < 0) {
301 dout(0) << "isa_decode: bad matrix" << dendl
;
305 for (int p
= 0; p
< nerrs
; p
++) {
306 if (erasures
[p
] < k
) {
307 // decoding matrix elements for data chunks
308 for (j
= 0; j
< k
; j
++) {
309 c
[k
* p
+ j
] = d
[k
* erasures
[p
] + j
];
312 // decoding matrix element for coding chunks
313 for (i
= 0; i
< k
; i
++) {
315 for (j
= 0; j
< k
; j
++)
316 s
^= gf_mul(d
[j
* k
+ i
],
317 encode_coeff
[k
* erasures
[p
] + j
]);
324 // ---------------------------------------------
325 // Initialize Decoding Table
326 // ---------------------------------------------
327 ec_init_tables(k
, nerrs
, c
, decode_tbls
);
328 tcache
.putDecodingTableToCache(erasure_signature
, p_tbls
, matrixtype
, k
, m
);
330 // Recover data sources
331 ec_encode_data(blocksize
,
332 k
, nerrs
, decode_tbls
, recover_source
, recover_target
);
338 // -----------------------------------------------------------------------------
341 ErasureCodeIsaDefault::get_alignment() const
343 return EC_ISA_ADDRESS_ALIGNMENT
;
346 // -----------------------------------------------------------------------------
348 int ErasureCodeIsaDefault::parse(ErasureCodeProfile
&profile
,
351 int err
= ErasureCode::parse(profile
, ss
);
352 err
|= to_int("k", profile
, &k
, DEFAULT_K
, ss
);
353 err
|= to_int("m", profile
, &m
, DEFAULT_M
, ss
);
354 err
|= sanity_check_k(k
, ss
);
356 if (matrixtype
== kVandermonde
) {
357 // these are verified safe values evaluated using the
358 // benchmarktool and 10*(combinatoric for maximum loss) random
361 *ss
<< "Vandermonde: m=" << m
362 << " should be less/equal than 32 : revert to k=32" << std::endl
;
368 *ss
<< "Vandermonde: m=" << m
369 << " should be less than 5 to guarantee an MDS codec:"
370 << " revert to m=4" << std::endl
;
377 *ss
<< "Vandermonde: k=" << k
378 << " should be less than 22 to guarantee an MDS"
379 << " codec with m=4: revert to k=21" << std::endl
;
391 // -----------------------------------------------------------------------------
394 ErasureCodeIsaDefault::prepare()
396 // setup shared encoding table and coefficients
397 unsigned char** p_enc_table
=
398 tcache
.getEncodingTable(matrixtype
, k
, m
);
400 unsigned char** p_enc_coeff
=
401 tcache
.getEncodingCoefficient(matrixtype
, k
, m
);
404 dout(10) << "[ cache tables ] creating coeff for k=" <<
405 k
<< " m=" << m
<< dendl
;
406 // build encoding coefficients which need to be computed once for each (k,m)
407 encode_coeff
= (unsigned char*) malloc(k
* (m
+ k
));
409 if (matrixtype
== kVandermonde
)
410 gf_gen_rs_matrix(encode_coeff
, k
+ m
, k
);
411 if (matrixtype
== kCauchy
)
412 gf_gen_cauchy1_matrix(encode_coeff
, k
+ m
, k
);
414 // either our new created coefficients are stored or if they have been
415 // created in the meanwhile the locally allocated coefficients will be
416 // freed by setEncodingCoefficient
417 encode_coeff
= tcache
.setEncodingCoefficient(matrixtype
, k
, m
, encode_coeff
);
419 encode_coeff
= *p_enc_coeff
;
423 dout(10) << "[ cache tables ] creating tables for k=" <<
424 k
<< " m=" << m
<< dendl
;
425 // build encoding table which needs to be computed once for each (k,m)
426 encode_tbls
= (unsigned char*) malloc(k
* (m
+ k
)*32);
427 ec_init_tables(k
, m
, &encode_coeff
[k
* k
], encode_tbls
);
429 // either our new created table is stored or if it has been
430 // created in the meanwhile the locally allocated table will be
431 // freed by setEncodingTable
432 encode_tbls
= tcache
.setEncodingTable(matrixtype
, k
, m
, encode_tbls
);
434 encode_tbls
= *p_enc_table
;
437 unsigned memory_lru_cache
=
438 k
* (m
+ k
) * 32 * tcache
.decoding_tables_lru_length
;
440 dout(10) << "[ cache memory ] = " << memory_lru_cache
<< " bytes" <<
442 ((matrixtype
== kVandermonde
) ? "Vandermonde" : "Cauchy") << dendl
;
444 assert((matrixtype
== kVandermonde
) || (matrixtype
== kCauchy
));
447 // -----------------------------------------------------------------------------