]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Ceph - scalable distributed file system | |
3 | * | |
4 | * Copyright (C) 2014 CERN (Switzerland) | |
5 | * | |
6 | * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch> | |
7 | * | |
8 | * This library is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | */ | |
14 | ||
15 | // ----------------------------------------------------------------------------- | |
16 | #include <algorithm> | |
7c673cae FG |
17 | #include <errno.h> |
18 | // ----------------------------------------------------------------------------- | |
19 | #include "common/debug.h" | |
20 | #include "ErasureCodeIsa.h" | |
21 | #include "xor_op.h" | |
224ce89b | 22 | #include "include/assert.h" |
31f18b77 FG |
23 | using namespace std; |
24 | ||
7c673cae FG |
25 | // ----------------------------------------------------------------------------- |
26 | extern "C" { | |
27 | #include "isa-l/include/erasure_code.h" | |
28 | } | |
29 | // ----------------------------------------------------------------------------- | |
30 | #define dout_context g_ceph_context | |
31 | #define dout_subsys ceph_subsys_osd | |
32 | #undef dout_prefix | |
33 | #define dout_prefix _prefix(_dout) | |
34 | // ----------------------------------------------------------------------------- | |
35 | ||
36 | // ----------------------------------------------------------------------------- | |
37 | ||
38 | static ostream& | |
39 | _prefix(std::ostream* _dout) | |
40 | { | |
41 | return *_dout << "ErasureCodeIsa: "; | |
42 | } | |
43 | // ----------------------------------------------------------------------------- | |
44 | ||
45 | const std::string ErasureCodeIsaDefault::DEFAULT_K("7"); | |
46 | const std::string ErasureCodeIsaDefault::DEFAULT_M("3"); | |
47 | ||
7c673cae FG |
48 | |
49 | // ----------------------------------------------------------------------------- | |
50 | ||
51 | int | |
52 | ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss) | |
53 | { | |
54 | int err = 0; | |
7c673cae FG |
55 | err |= parse(profile, ss); |
56 | if (err) | |
57 | return err; | |
58 | prepare(); | |
224ce89b | 59 | return ErasureCode::init(profile, ss); |
7c673cae FG |
60 | } |
61 | ||
62 | // ----------------------------------------------------------------------------- | |
63 | ||
64 | unsigned int | |
65 | ErasureCodeIsa::get_chunk_size(unsigned int object_size) const | |
66 | { | |
67 | unsigned alignment = get_alignment(); | |
68 | unsigned chunk_size = ( object_size + k - 1 ) / k; | |
69 | dout(20) << "get_chunk_size: chunk_size " << chunk_size | |
70 | << " must be modulo " << alignment << dendl; | |
71 | unsigned modulo = chunk_size % alignment; | |
72 | if (modulo) { | |
73 | dout(10) << "get_chunk_size: " << chunk_size | |
74 | << " padded to " << chunk_size + alignment - modulo << dendl; | |
75 | chunk_size += alignment - modulo; | |
76 | } | |
77 | return chunk_size; | |
78 | } | |
79 | ||
80 | // ----------------------------------------------------------------------------- | |
81 | ||
82 | int ErasureCodeIsa::encode_chunks(const set<int> &want_to_encode, | |
83 | map<int, bufferlist> *encoded) | |
84 | { | |
85 | char *chunks[k + m]; | |
86 | for (int i = 0; i < k + m; i++) | |
87 | chunks[i] = (*encoded)[i].c_str(); | |
88 | isa_encode(&chunks[0], &chunks[k], (*encoded)[0].length()); | |
89 | return 0; | |
90 | } | |
91 | ||
92 | int ErasureCodeIsa::decode_chunks(const set<int> &want_to_read, | |
93 | const map<int, bufferlist> &chunks, | |
94 | map<int, bufferlist> *decoded) | |
95 | { | |
96 | unsigned blocksize = (*chunks.begin()).second.length(); | |
97 | int erasures[k + m + 1]; | |
98 | int erasures_count = 0; | |
99 | char *data[k]; | |
100 | char *coding[m]; | |
101 | for (int i = 0; i < k + m; i++) { | |
102 | if (chunks.find(i) == chunks.end()) { | |
103 | erasures[erasures_count] = i; | |
104 | erasures_count++; | |
105 | } | |
106 | if (i < k) | |
107 | data[i] = (*decoded)[i].c_str(); | |
108 | else | |
109 | coding[i - k] = (*decoded)[i].c_str(); | |
110 | } | |
111 | erasures[erasures_count] = -1; | |
112 | assert(erasures_count > 0); | |
113 | return isa_decode(erasures, data, coding, blocksize); | |
114 | } | |
115 | ||
116 | // ----------------------------------------------------------------------------- | |
117 | ||
118 | void | |
119 | ErasureCodeIsaDefault::isa_encode(char **data, | |
120 | char **coding, | |
121 | int blocksize) | |
122 | { | |
123 | ||
124 | if (m == 1) | |
125 | // single parity stripe | |
126 | region_xor((unsigned char**) data, (unsigned char*) coding[0], k, blocksize); | |
127 | else | |
128 | ec_encode_data(blocksize, k, m, encode_tbls, | |
129 | (unsigned char**) data, (unsigned char**) coding); | |
130 | } | |
131 | ||
132 | // ----------------------------------------------------------------------------- | |
133 | ||
134 | bool | |
135 | ErasureCodeIsaDefault::erasure_contains(int *erasures, int i) | |
136 | { | |
137 | for (int l = 0; erasures[l] != -1; l++) { | |
138 | if (erasures[l] == i) | |
139 | return true; | |
140 | } | |
141 | return false; | |
142 | } | |
143 | ||
144 | // ----------------------------------------------------------------------------- | |
145 | ||
146 | ||
147 | ||
148 | // ----------------------------------------------------------------------------- | |
149 | ||
150 | int | |
151 | ErasureCodeIsaDefault::isa_decode(int *erasures, | |
152 | char **data, | |
153 | char **coding, | |
154 | int blocksize) | |
155 | { | |
156 | int nerrs = 0; | |
157 | int i, r, s; | |
158 | ||
159 | // count the errors | |
160 | for (int l = 0; erasures[l] != -1; l++) { | |
161 | nerrs++; | |
162 | } | |
163 | ||
164 | unsigned char *recover_source[k]; | |
165 | unsigned char *recover_target[m]; | |
166 | ||
167 | memset(recover_source, 0, sizeof (recover_source)); | |
168 | memset(recover_target, 0, sizeof (recover_target)); | |
169 | ||
170 | // --------------------------------------------- | |
171 | // Assign source and target buffers | |
172 | // --------------------------------------------- | |
173 | for (i = 0, s = 0, r = 0; ((r < k) || (s < nerrs)) && (i < (k + m)); i++) { | |
174 | if (!erasure_contains(erasures, i)) { | |
175 | if (r < k) { | |
176 | if (i < k) { | |
177 | recover_source[r] = (unsigned char*) data[i]; | |
178 | } else { | |
179 | recover_source[r] = (unsigned char*) coding[i - k]; | |
180 | } | |
181 | r++; | |
182 | } | |
183 | } else { | |
184 | if (s < m) { | |
185 | if (i < k) { | |
186 | recover_target[s] = (unsigned char*) data[i]; | |
187 | } else { | |
188 | recover_target[s] = (unsigned char*) coding[i - k]; | |
189 | } | |
190 | s++; | |
191 | } | |
192 | } | |
193 | } | |
194 | ||
195 | if (m == 1) { | |
196 | // single parity decoding | |
197 | assert(1 == nerrs); | |
198 | dout(20) << "isa_decode: reconstruct using region xor [" << | |
199 | erasures[0] << "]" << dendl; | |
200 | region_xor(recover_source, recover_target[0], k, blocksize); | |
201 | return 0; | |
202 | } | |
203 | ||
204 | ||
205 | if ((matrixtype == kVandermonde) && | |
206 | (nerrs == 1) && | |
207 | (erasures[0] < (k + 1))) { | |
208 | // use xor decoding if a data chunk is missing or the first coding chunk | |
209 | dout(20) << "isa_decode: reconstruct using region xor [" << | |
210 | erasures[0] << "]" << dendl; | |
211 | assert(1 == s); | |
212 | assert(k == r); | |
213 | region_xor(recover_source, recover_target[0], k, blocksize); | |
214 | return 0; | |
215 | } | |
216 | ||
217 | unsigned char d[k * (m + k)]; | |
218 | unsigned char decode_tbls[k * (m + k)*32]; | |
219 | unsigned char *p_tbls = decode_tbls; | |
220 | ||
221 | int decode_index[k]; | |
222 | ||
223 | if (nerrs > m) | |
224 | return -1; | |
225 | ||
226 | std::string erasure_signature; // describes a matrix configuration for caching | |
227 | ||
228 | // --------------------------------------------- | |
229 | // Construct b by removing error rows | |
230 | // --------------------------------------------- | |
231 | ||
232 | for (i = 0, r = 0; i < k; i++, r++) { | |
233 | char id[128]; | |
234 | while (erasure_contains(erasures, r)) | |
235 | r++; | |
236 | ||
237 | decode_index[i] = r; | |
238 | ||
239 | snprintf(id, sizeof (id), "+%d", r); | |
240 | erasure_signature += id; | |
241 | } | |
242 | ||
243 | for (int p = 0; p < nerrs; p++) { | |
244 | char id[128]; | |
245 | snprintf(id, sizeof (id), "-%d", erasures[p]); | |
246 | erasure_signature += id; | |
247 | } | |
248 | ||
249 | // --------------------------------------------- | |
250 | // Try to get an already computed matrix | |
251 | // --------------------------------------------- | |
252 | if (!tcache.getDecodingTableFromCache(erasure_signature, p_tbls, matrixtype, k, m)) { | |
253 | int j; | |
254 | unsigned char b[k * (m + k)]; | |
255 | unsigned char c[k * (m + k)]; | |
256 | ||
257 | for (i = 0; i < k; i++) { | |
258 | r = decode_index[i]; | |
259 | for (j = 0; j < k; j++) | |
260 | b[k * i + j] = encode_coeff[k * r + j]; | |
261 | } | |
262 | // --------------------------------------------- | |
263 | // Compute inverted matrix | |
264 | // --------------------------------------------- | |
265 | ||
266 | // -------------------------------------------------------- | |
267 | // Remark: this may fail for certain Vandermonde matrices ! | |
268 | // There is an advanced way trying to use different | |
269 | // source chunks to get an invertible matrix, however | |
270 | // there are also (k,m) combinations which cannot be | |
271 | // inverted when m chunks are lost and this optimizations | |
272 | // does not help. Therefor we keep the code simpler. | |
273 | // -------------------------------------------------------- | |
274 | if (gf_invert_matrix(b, d, k) < 0) { | |
275 | dout(0) << "isa_decode: bad matrix" << dendl; | |
276 | return -1; | |
277 | } | |
278 | ||
279 | for (int p = 0; p < nerrs; p++) { | |
280 | if (erasures[p] < k) { | |
281 | // decoding matrix elements for data chunks | |
282 | for (j = 0; j < k; j++) { | |
283 | c[k * p + j] = d[k * erasures[p] + j]; | |
284 | } | |
285 | } else { | |
286 | // decoding matrix element for coding chunks | |
287 | for (i = 0; i < k; i++) { | |
288 | int s = 0; | |
289 | for (j = 0; j < k; j++) | |
290 | s ^= gf_mul(d[j * k + i], | |
291 | encode_coeff[k * erasures[p] + j]); | |
292 | ||
293 | c[k * p + i] = s; | |
294 | } | |
295 | } | |
296 | } | |
297 | ||
298 | // --------------------------------------------- | |
299 | // Initialize Decoding Table | |
300 | // --------------------------------------------- | |
301 | ec_init_tables(k, nerrs, c, decode_tbls); | |
302 | tcache.putDecodingTableToCache(erasure_signature, p_tbls, matrixtype, k, m); | |
303 | } | |
304 | // Recover data sources | |
305 | ec_encode_data(blocksize, | |
306 | k, nerrs, decode_tbls, recover_source, recover_target); | |
307 | ||
308 | ||
309 | return 0; | |
310 | } | |
311 | ||
312 | // ----------------------------------------------------------------------------- | |
313 | ||
314 | unsigned | |
315 | ErasureCodeIsaDefault::get_alignment() const | |
316 | { | |
317 | return EC_ISA_ADDRESS_ALIGNMENT; | |
318 | } | |
319 | ||
320 | // ----------------------------------------------------------------------------- | |
321 | ||
322 | int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile, | |
323 | ostream *ss) | |
324 | { | |
325 | int err = ErasureCode::parse(profile, ss); | |
326 | err |= to_int("k", profile, &k, DEFAULT_K, ss); | |
327 | err |= to_int("m", profile, &m, DEFAULT_M, ss); | |
328 | err |= sanity_check_k(k, ss); | |
329 | ||
330 | if (matrixtype == kVandermonde) { | |
331 | // these are verified safe values evaluated using the | |
332 | // benchmarktool and 10*(combinatoric for maximum loss) random | |
333 | // full erasures | |
334 | if (k > 32) { | |
335 | *ss << "Vandermonde: m=" << m | |
336 | << " should be less/equal than 32 : revert to k=32" << std::endl; | |
337 | k = 32; | |
338 | err = -EINVAL; | |
339 | } | |
340 | ||
341 | if (m > 4) { | |
342 | *ss << "Vandermonde: m=" << m | |
343 | << " should be less than 5 to guarantee an MDS codec:" | |
344 | << " revert to m=4" << std::endl; | |
345 | m = 4; | |
346 | err = -EINVAL; | |
347 | } | |
348 | switch (m) { | |
349 | case 4: | |
350 | if (k > 21) { | |
351 | *ss << "Vandermonde: k=" << k | |
352 | << " should be less than 22 to guarantee an MDS" | |
353 | << " codec with m=4: revert to k=21" << std::endl; | |
354 | k = 21; | |
355 | err = -EINVAL; | |
356 | } | |
357 | break; | |
358 | default: | |
359 | ; | |
360 | } | |
361 | } | |
362 | return err; | |
363 | } | |
364 | ||
365 | // ----------------------------------------------------------------------------- | |
366 | ||
367 | void | |
368 | ErasureCodeIsaDefault::prepare() | |
369 | { | |
370 | // setup shared encoding table and coefficients | |
371 | unsigned char** p_enc_table = | |
372 | tcache.getEncodingTable(matrixtype, k, m); | |
373 | ||
374 | unsigned char** p_enc_coeff = | |
375 | tcache.getEncodingCoefficient(matrixtype, k, m); | |
376 | ||
377 | if (!*p_enc_coeff) { | |
378 | dout(10) << "[ cache tables ] creating coeff for k=" << | |
379 | k << " m=" << m << dendl; | |
380 | // build encoding coefficients which need to be computed once for each (k,m) | |
381 | encode_coeff = (unsigned char*) malloc(k * (m + k)); | |
382 | ||
383 | if (matrixtype == kVandermonde) | |
384 | gf_gen_rs_matrix(encode_coeff, k + m, k); | |
385 | if (matrixtype == kCauchy) | |
386 | gf_gen_cauchy1_matrix(encode_coeff, k + m, k); | |
387 | ||
388 | // either our new created coefficients are stored or if they have been | |
389 | // created in the meanwhile the locally allocated coefficients will be | |
390 | // freed by setEncodingCoefficient | |
391 | encode_coeff = tcache.setEncodingCoefficient(matrixtype, k, m, encode_coeff); | |
392 | } else { | |
393 | encode_coeff = *p_enc_coeff; | |
394 | } | |
395 | ||
396 | if (!*p_enc_table) { | |
397 | dout(10) << "[ cache tables ] creating tables for k=" << | |
398 | k << " m=" << m << dendl; | |
399 | // build encoding table which needs to be computed once for each (k,m) | |
400 | encode_tbls = (unsigned char*) malloc(k * (m + k)*32); | |
401 | ec_init_tables(k, m, &encode_coeff[k * k], encode_tbls); | |
402 | ||
403 | // either our new created table is stored or if it has been | |
404 | // created in the meanwhile the locally allocated table will be | |
405 | // freed by setEncodingTable | |
406 | encode_tbls = tcache.setEncodingTable(matrixtype, k, m, encode_tbls); | |
407 | } else { | |
408 | encode_tbls = *p_enc_table; | |
409 | } | |
410 | ||
411 | unsigned memory_lru_cache = | |
412 | k * (m + k) * 32 * tcache.decoding_tables_lru_length; | |
413 | ||
414 | dout(10) << "[ cache memory ] = " << memory_lru_cache << " bytes" << | |
415 | " [ matrix ] = " << | |
416 | ((matrixtype == kVandermonde) ? "Vandermonde" : "Cauchy") << dendl; | |
417 | ||
418 | assert((matrixtype == kVandermonde) || (matrixtype == kCauchy)); | |
419 | ||
420 | } | |
421 | // ----------------------------------------------------------------------------- |