2 * Ceph - scalable distributed file system
4 * Copyright (C) 2014 CERN (Switzerland)
6 * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch>
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
15 // -----------------------------------------------------------------------------
18 // -----------------------------------------------------------------------------
19 #include "common/debug.h"
20 #include "ErasureCodeIsa.h"
22 #include "include/assert.h"
25 // -----------------------------------------------------------------------------
27 #include "isa-l/include/erasure_code.h"
29 // -----------------------------------------------------------------------------
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_osd
33 #define dout_prefix _prefix(_dout)
34 // -----------------------------------------------------------------------------
36 // -----------------------------------------------------------------------------
39 _prefix(std::ostream
* _dout
)
41 return *_dout
<< "ErasureCodeIsa: ";
43 // -----------------------------------------------------------------------------
45 const std::string
ErasureCodeIsaDefault::DEFAULT_K("7");
46 const std::string
ErasureCodeIsaDefault::DEFAULT_M("3");
49 // -----------------------------------------------------------------------------
52 ErasureCodeIsa::init(ErasureCodeProfile
&profile
, ostream
*ss
)
55 err
|= parse(profile
, ss
);
59 return ErasureCode::init(profile
, ss
);
62 // -----------------------------------------------------------------------------
65 ErasureCodeIsa::get_chunk_size(unsigned int object_size
) const
67 unsigned alignment
= get_alignment();
68 unsigned chunk_size
= ( object_size
+ k
- 1 ) / k
;
69 dout(20) << "get_chunk_size: chunk_size " << chunk_size
70 << " must be modulo " << alignment
<< dendl
;
71 unsigned modulo
= chunk_size
% alignment
;
73 dout(10) << "get_chunk_size: " << chunk_size
74 << " padded to " << chunk_size
+ alignment
- modulo
<< dendl
;
75 chunk_size
+= alignment
- modulo
;
80 // -----------------------------------------------------------------------------
82 int ErasureCodeIsa::encode_chunks(const set
<int> &want_to_encode
,
83 map
<int, bufferlist
> *encoded
)
86 for (int i
= 0; i
< k
+ m
; i
++)
87 chunks
[i
] = (*encoded
)[i
].c_str();
88 isa_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
92 int ErasureCodeIsa::decode_chunks(const set
<int> &want_to_read
,
93 const map
<int, bufferlist
> &chunks
,
94 map
<int, bufferlist
> *decoded
)
96 unsigned blocksize
= (*chunks
.begin()).second
.length();
97 int erasures
[k
+ m
+ 1];
98 int erasures_count
= 0;
101 for (int i
= 0; i
< k
+ m
; i
++) {
102 if (chunks
.find(i
) == chunks
.end()) {
103 erasures
[erasures_count
] = i
;
107 data
[i
] = (*decoded
)[i
].c_str();
109 coding
[i
- k
] = (*decoded
)[i
].c_str();
111 erasures
[erasures_count
] = -1;
112 assert(erasures_count
> 0);
113 return isa_decode(erasures
, data
, coding
, blocksize
);
116 // -----------------------------------------------------------------------------
119 ErasureCodeIsaDefault::isa_encode(char **data
,
125 // single parity stripe
126 region_xor((unsigned char**) data
, (unsigned char*) coding
[0], k
, blocksize
);
128 ec_encode_data(blocksize
, k
, m
, encode_tbls
,
129 (unsigned char**) data
, (unsigned char**) coding
);
132 // -----------------------------------------------------------------------------
135 ErasureCodeIsaDefault::erasure_contains(int *erasures
, int i
)
137 for (int l
= 0; erasures
[l
] != -1; l
++) {
138 if (erasures
[l
] == i
)
144 // -----------------------------------------------------------------------------
148 // -----------------------------------------------------------------------------
151 ErasureCodeIsaDefault::isa_decode(int *erasures
,
160 for (int l
= 0; erasures
[l
] != -1; l
++) {
164 unsigned char *recover_source
[k
];
165 unsigned char *recover_target
[m
];
167 memset(recover_source
, 0, sizeof (recover_source
));
168 memset(recover_target
, 0, sizeof (recover_target
));
170 // ---------------------------------------------
171 // Assign source and target buffers
172 // ---------------------------------------------
173 for (i
= 0, s
= 0, r
= 0; ((r
< k
) || (s
< nerrs
)) && (i
< (k
+ m
)); i
++) {
174 if (!erasure_contains(erasures
, i
)) {
177 recover_source
[r
] = (unsigned char*) data
[i
];
179 recover_source
[r
] = (unsigned char*) coding
[i
- k
];
186 recover_target
[s
] = (unsigned char*) data
[i
];
188 recover_target
[s
] = (unsigned char*) coding
[i
- k
];
196 // single parity decoding
198 dout(20) << "isa_decode: reconstruct using region xor [" <<
199 erasures
[0] << "]" << dendl
;
200 region_xor(recover_source
, recover_target
[0], k
, blocksize
);
205 if ((matrixtype
== kVandermonde
) &&
207 (erasures
[0] < (k
+ 1))) {
208 // use xor decoding if a data chunk is missing or the first coding chunk
209 dout(20) << "isa_decode: reconstruct using region xor [" <<
210 erasures
[0] << "]" << dendl
;
213 region_xor(recover_source
, recover_target
[0], k
, blocksize
);
217 unsigned char d
[k
* (m
+ k
)];
218 unsigned char decode_tbls
[k
* (m
+ k
)*32];
219 unsigned char *p_tbls
= decode_tbls
;
226 std::string erasure_signature
; // describes a matrix configuration for caching
228 // ---------------------------------------------
229 // Construct b by removing error rows
230 // ---------------------------------------------
232 for (i
= 0, r
= 0; i
< k
; i
++, r
++) {
234 while (erasure_contains(erasures
, r
))
239 snprintf(id
, sizeof (id
), "+%d", r
);
240 erasure_signature
+= id
;
243 for (int p
= 0; p
< nerrs
; p
++) {
245 snprintf(id
, sizeof (id
), "-%d", erasures
[p
]);
246 erasure_signature
+= id
;
249 // ---------------------------------------------
250 // Try to get an already computed matrix
251 // ---------------------------------------------
252 if (!tcache
.getDecodingTableFromCache(erasure_signature
, p_tbls
, matrixtype
, k
, m
)) {
254 unsigned char b
[k
* (m
+ k
)];
255 unsigned char c
[k
* (m
+ k
)];
257 for (i
= 0; i
< k
; i
++) {
259 for (j
= 0; j
< k
; j
++)
260 b
[k
* i
+ j
] = encode_coeff
[k
* r
+ j
];
262 // ---------------------------------------------
263 // Compute inverted matrix
264 // ---------------------------------------------
266 // --------------------------------------------------------
267 // Remark: this may fail for certain Vandermonde matrices !
268 // There is an advanced way trying to use different
269 // source chunks to get an invertible matrix, however
270 // there are also (k,m) combinations which cannot be
271 // inverted when m chunks are lost and this optimizations
272 // does not help. Therefor we keep the code simpler.
273 // --------------------------------------------------------
274 if (gf_invert_matrix(b
, d
, k
) < 0) {
275 dout(0) << "isa_decode: bad matrix" << dendl
;
279 for (int p
= 0; p
< nerrs
; p
++) {
280 if (erasures
[p
] < k
) {
281 // decoding matrix elements for data chunks
282 for (j
= 0; j
< k
; j
++) {
283 c
[k
* p
+ j
] = d
[k
* erasures
[p
] + j
];
286 // decoding matrix element for coding chunks
287 for (i
= 0; i
< k
; i
++) {
289 for (j
= 0; j
< k
; j
++)
290 s
^= gf_mul(d
[j
* k
+ i
],
291 encode_coeff
[k
* erasures
[p
] + j
]);
298 // ---------------------------------------------
299 // Initialize Decoding Table
300 // ---------------------------------------------
301 ec_init_tables(k
, nerrs
, c
, decode_tbls
);
302 tcache
.putDecodingTableToCache(erasure_signature
, p_tbls
, matrixtype
, k
, m
);
304 // Recover data sources
305 ec_encode_data(blocksize
,
306 k
, nerrs
, decode_tbls
, recover_source
, recover_target
);
312 // -----------------------------------------------------------------------------
315 ErasureCodeIsaDefault::get_alignment() const
317 return EC_ISA_ADDRESS_ALIGNMENT
;
320 // -----------------------------------------------------------------------------
322 int ErasureCodeIsaDefault::parse(ErasureCodeProfile
&profile
,
325 int err
= ErasureCode::parse(profile
, ss
);
326 err
|= to_int("k", profile
, &k
, DEFAULT_K
, ss
);
327 err
|= to_int("m", profile
, &m
, DEFAULT_M
, ss
);
328 err
|= sanity_check_k(k
, ss
);
330 if (matrixtype
== kVandermonde
) {
331 // these are verified safe values evaluated using the
332 // benchmarktool and 10*(combinatoric for maximum loss) random
335 *ss
<< "Vandermonde: m=" << m
336 << " should be less/equal than 32 : revert to k=32" << std::endl
;
342 *ss
<< "Vandermonde: m=" << m
343 << " should be less than 5 to guarantee an MDS codec:"
344 << " revert to m=4" << std::endl
;
351 *ss
<< "Vandermonde: k=" << k
352 << " should be less than 22 to guarantee an MDS"
353 << " codec with m=4: revert to k=21" << std::endl
;
365 // -----------------------------------------------------------------------------
368 ErasureCodeIsaDefault::prepare()
370 // setup shared encoding table and coefficients
371 unsigned char** p_enc_table
=
372 tcache
.getEncodingTable(matrixtype
, k
, m
);
374 unsigned char** p_enc_coeff
=
375 tcache
.getEncodingCoefficient(matrixtype
, k
, m
);
378 dout(10) << "[ cache tables ] creating coeff for k=" <<
379 k
<< " m=" << m
<< dendl
;
380 // build encoding coefficients which need to be computed once for each (k,m)
381 encode_coeff
= (unsigned char*) malloc(k
* (m
+ k
));
383 if (matrixtype
== kVandermonde
)
384 gf_gen_rs_matrix(encode_coeff
, k
+ m
, k
);
385 if (matrixtype
== kCauchy
)
386 gf_gen_cauchy1_matrix(encode_coeff
, k
+ m
, k
);
388 // either our new created coefficients are stored or if they have been
389 // created in the meanwhile the locally allocated coefficients will be
390 // freed by setEncodingCoefficient
391 encode_coeff
= tcache
.setEncodingCoefficient(matrixtype
, k
, m
, encode_coeff
);
393 encode_coeff
= *p_enc_coeff
;
397 dout(10) << "[ cache tables ] creating tables for k=" <<
398 k
<< " m=" << m
<< dendl
;
399 // build encoding table which needs to be computed once for each (k,m)
400 encode_tbls
= (unsigned char*) malloc(k
* (m
+ k
)*32);
401 ec_init_tables(k
, m
, &encode_coeff
[k
* k
], encode_tbls
);
403 // either our new created table is stored or if it has been
404 // created in the meanwhile the locally allocated table will be
405 // freed by setEncodingTable
406 encode_tbls
= tcache
.setEncodingTable(matrixtype
, k
, m
, encode_tbls
);
408 encode_tbls
= *p_enc_table
;
411 unsigned memory_lru_cache
=
412 k
* (m
+ k
) * 32 * tcache
.decoding_tables_lru_length
;
414 dout(10) << "[ cache memory ] = " << memory_lru_cache
<< " bytes" <<
416 ((matrixtype
== kVandermonde
) ? "Vandermonde" : "Cauchy") << dendl
;
418 assert((matrixtype
== kVandermonde
) || (matrixtype
== kCauchy
));
421 // -----------------------------------------------------------------------------