]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Ceph - scalable distributed file system | |
3 | * | |
4 | * Copyright (C) 2014 CERN (Switzerland) | |
5 | * | |
6 | * Author: Andreas-Joachim Peters <Andreas.Joachim.Peters@cern.ch> | |
7 | * | |
8 | * This library is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | */ | |
14 | ||
15 | // ----------------------------------------------------------------------------- | |
16 | #include <algorithm> | |
9f95a23c | 17 | #include <cerrno> |
7c673cae FG |
18 | // ----------------------------------------------------------------------------- |
19 | #include "common/debug.h" | |
20 | #include "ErasureCodeIsa.h" | |
21 | #include "xor_op.h" | |
11fdf7f2 | 22 | #include "include/ceph_assert.h" |
31f18b77 | 23 | using namespace std; |
9f95a23c | 24 | using namespace ceph; |
31f18b77 | 25 | |
7c673cae FG |
26 | // ----------------------------------------------------------------------------- |
27 | extern "C" { | |
28 | #include "isa-l/include/erasure_code.h" | |
29 | } | |
30 | // ----------------------------------------------------------------------------- | |
31 | #define dout_context g_ceph_context | |
32 | #define dout_subsys ceph_subsys_osd | |
33 | #undef dout_prefix | |
34 | #define dout_prefix _prefix(_dout) | |
35 | // ----------------------------------------------------------------------------- | |
36 | ||
37 | // ----------------------------------------------------------------------------- | |
38 | ||
39 | static ostream& | |
40 | _prefix(std::ostream* _dout) | |
41 | { | |
42 | return *_dout << "ErasureCodeIsa: "; | |
43 | } | |
44 | // ----------------------------------------------------------------------------- | |
45 | ||
46 | const std::string ErasureCodeIsaDefault::DEFAULT_K("7"); | |
47 | const std::string ErasureCodeIsaDefault::DEFAULT_M("3"); | |
48 | ||
7c673cae FG |
49 | |
50 | // ----------------------------------------------------------------------------- | |
51 | ||
52 | int | |
53 | ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss) | |
54 | { | |
55 | int err = 0; | |
7c673cae FG |
56 | err |= parse(profile, ss); |
57 | if (err) | |
58 | return err; | |
59 | prepare(); | |
224ce89b | 60 | return ErasureCode::init(profile, ss); |
7c673cae FG |
61 | } |
62 | ||
63 | // ----------------------------------------------------------------------------- | |
64 | ||
65 | unsigned int | |
66 | ErasureCodeIsa::get_chunk_size(unsigned int object_size) const | |
67 | { | |
68 | unsigned alignment = get_alignment(); | |
69 | unsigned chunk_size = ( object_size + k - 1 ) / k; | |
70 | dout(20) << "get_chunk_size: chunk_size " << chunk_size | |
71 | << " must be modulo " << alignment << dendl; | |
72 | unsigned modulo = chunk_size % alignment; | |
73 | if (modulo) { | |
74 | dout(10) << "get_chunk_size: " << chunk_size | |
75 | << " padded to " << chunk_size + alignment - modulo << dendl; | |
76 | chunk_size += alignment - modulo; | |
77 | } | |
78 | return chunk_size; | |
79 | } | |
80 | ||
81 | // ----------------------------------------------------------------------------- | |
82 | ||
83 | int ErasureCodeIsa::encode_chunks(const set<int> &want_to_encode, | |
84 | map<int, bufferlist> *encoded) | |
85 | { | |
86 | char *chunks[k + m]; | |
87 | for (int i = 0; i < k + m; i++) | |
88 | chunks[i] = (*encoded)[i].c_str(); | |
89 | isa_encode(&chunks[0], &chunks[k], (*encoded)[0].length()); | |
90 | return 0; | |
91 | } | |
92 | ||
93 | int ErasureCodeIsa::decode_chunks(const set<int> &want_to_read, | |
94 | const map<int, bufferlist> &chunks, | |
95 | map<int, bufferlist> *decoded) | |
96 | { | |
97 | unsigned blocksize = (*chunks.begin()).second.length(); | |
98 | int erasures[k + m + 1]; | |
99 | int erasures_count = 0; | |
100 | char *data[k]; | |
101 | char *coding[m]; | |
102 | for (int i = 0; i < k + m; i++) { | |
103 | if (chunks.find(i) == chunks.end()) { | |
104 | erasures[erasures_count] = i; | |
105 | erasures_count++; | |
106 | } | |
107 | if (i < k) | |
108 | data[i] = (*decoded)[i].c_str(); | |
109 | else | |
110 | coding[i - k] = (*decoded)[i].c_str(); | |
111 | } | |
112 | erasures[erasures_count] = -1; | |
11fdf7f2 | 113 | ceph_assert(erasures_count > 0); |
7c673cae FG |
114 | return isa_decode(erasures, data, coding, blocksize); |
115 | } | |
116 | ||
117 | // ----------------------------------------------------------------------------- | |
118 | ||
119 | void | |
120 | ErasureCodeIsaDefault::isa_encode(char **data, | |
121 | char **coding, | |
122 | int blocksize) | |
123 | { | |
124 | ||
125 | if (m == 1) | |
126 | // single parity stripe | |
127 | region_xor((unsigned char**) data, (unsigned char*) coding[0], k, blocksize); | |
128 | else | |
129 | ec_encode_data(blocksize, k, m, encode_tbls, | |
130 | (unsigned char**) data, (unsigned char**) coding); | |
131 | } | |
132 | ||
133 | // ----------------------------------------------------------------------------- | |
134 | ||
135 | bool | |
136 | ErasureCodeIsaDefault::erasure_contains(int *erasures, int i) | |
137 | { | |
138 | for (int l = 0; erasures[l] != -1; l++) { | |
139 | if (erasures[l] == i) | |
140 | return true; | |
141 | } | |
142 | return false; | |
143 | } | |
144 | ||
145 | // ----------------------------------------------------------------------------- | |
146 | ||
147 | ||
148 | ||
149 | // ----------------------------------------------------------------------------- | |
150 | ||
151 | int | |
152 | ErasureCodeIsaDefault::isa_decode(int *erasures, | |
153 | char **data, | |
154 | char **coding, | |
155 | int blocksize) | |
156 | { | |
157 | int nerrs = 0; | |
158 | int i, r, s; | |
159 | ||
160 | // count the errors | |
161 | for (int l = 0; erasures[l] != -1; l++) { | |
162 | nerrs++; | |
163 | } | |
164 | ||
165 | unsigned char *recover_source[k]; | |
166 | unsigned char *recover_target[m]; | |
167 | ||
168 | memset(recover_source, 0, sizeof (recover_source)); | |
169 | memset(recover_target, 0, sizeof (recover_target)); | |
170 | ||
171 | // --------------------------------------------- | |
172 | // Assign source and target buffers | |
173 | // --------------------------------------------- | |
174 | for (i = 0, s = 0, r = 0; ((r < k) || (s < nerrs)) && (i < (k + m)); i++) { | |
175 | if (!erasure_contains(erasures, i)) { | |
176 | if (r < k) { | |
177 | if (i < k) { | |
178 | recover_source[r] = (unsigned char*) data[i]; | |
179 | } else { | |
180 | recover_source[r] = (unsigned char*) coding[i - k]; | |
181 | } | |
182 | r++; | |
183 | } | |
184 | } else { | |
185 | if (s < m) { | |
186 | if (i < k) { | |
187 | recover_target[s] = (unsigned char*) data[i]; | |
188 | } else { | |
189 | recover_target[s] = (unsigned char*) coding[i - k]; | |
190 | } | |
191 | s++; | |
192 | } | |
193 | } | |
194 | } | |
195 | ||
196 | if (m == 1) { | |
197 | // single parity decoding | |
11fdf7f2 | 198 | ceph_assert(1 == nerrs); |
7c673cae FG |
199 | dout(20) << "isa_decode: reconstruct using region xor [" << |
200 | erasures[0] << "]" << dendl; | |
201 | region_xor(recover_source, recover_target[0], k, blocksize); | |
202 | return 0; | |
203 | } | |
204 | ||
205 | ||
206 | if ((matrixtype == kVandermonde) && | |
207 | (nerrs == 1) && | |
208 | (erasures[0] < (k + 1))) { | |
209 | // use xor decoding if a data chunk is missing or the first coding chunk | |
210 | dout(20) << "isa_decode: reconstruct using region xor [" << | |
211 | erasures[0] << "]" << dendl; | |
11fdf7f2 TL |
212 | ceph_assert(1 == s); |
213 | ceph_assert(k == r); | |
7c673cae FG |
214 | region_xor(recover_source, recover_target[0], k, blocksize); |
215 | return 0; | |
216 | } | |
217 | ||
218 | unsigned char d[k * (m + k)]; | |
219 | unsigned char decode_tbls[k * (m + k)*32]; | |
220 | unsigned char *p_tbls = decode_tbls; | |
221 | ||
222 | int decode_index[k]; | |
223 | ||
224 | if (nerrs > m) | |
225 | return -1; | |
226 | ||
227 | std::string erasure_signature; // describes a matrix configuration for caching | |
228 | ||
229 | // --------------------------------------------- | |
230 | // Construct b by removing error rows | |
231 | // --------------------------------------------- | |
232 | ||
233 | for (i = 0, r = 0; i < k; i++, r++) { | |
234 | char id[128]; | |
235 | while (erasure_contains(erasures, r)) | |
236 | r++; | |
237 | ||
238 | decode_index[i] = r; | |
239 | ||
240 | snprintf(id, sizeof (id), "+%d", r); | |
241 | erasure_signature += id; | |
242 | } | |
243 | ||
244 | for (int p = 0; p < nerrs; p++) { | |
245 | char id[128]; | |
246 | snprintf(id, sizeof (id), "-%d", erasures[p]); | |
247 | erasure_signature += id; | |
248 | } | |
249 | ||
250 | // --------------------------------------------- | |
251 | // Try to get an already computed matrix | |
252 | // --------------------------------------------- | |
253 | if (!tcache.getDecodingTableFromCache(erasure_signature, p_tbls, matrixtype, k, m)) { | |
254 | int j; | |
255 | unsigned char b[k * (m + k)]; | |
256 | unsigned char c[k * (m + k)]; | |
257 | ||
258 | for (i = 0; i < k; i++) { | |
259 | r = decode_index[i]; | |
260 | for (j = 0; j < k; j++) | |
261 | b[k * i + j] = encode_coeff[k * r + j]; | |
262 | } | |
263 | // --------------------------------------------- | |
264 | // Compute inverted matrix | |
265 | // --------------------------------------------- | |
266 | ||
267 | // -------------------------------------------------------- | |
268 | // Remark: this may fail for certain Vandermonde matrices ! | |
269 | // There is an advanced way trying to use different | |
270 | // source chunks to get an invertible matrix, however | |
271 | // there are also (k,m) combinations which cannot be | |
272 | // inverted when m chunks are lost and this optimizations | |
273 | // does not help. Therefor we keep the code simpler. | |
274 | // -------------------------------------------------------- | |
275 | if (gf_invert_matrix(b, d, k) < 0) { | |
276 | dout(0) << "isa_decode: bad matrix" << dendl; | |
277 | return -1; | |
278 | } | |
279 | ||
280 | for (int p = 0; p < nerrs; p++) { | |
281 | if (erasures[p] < k) { | |
282 | // decoding matrix elements for data chunks | |
283 | for (j = 0; j < k; j++) { | |
284 | c[k * p + j] = d[k * erasures[p] + j]; | |
285 | } | |
286 | } else { | |
287 | // decoding matrix element for coding chunks | |
288 | for (i = 0; i < k; i++) { | |
289 | int s = 0; | |
290 | for (j = 0; j < k; j++) | |
291 | s ^= gf_mul(d[j * k + i], | |
292 | encode_coeff[k * erasures[p] + j]); | |
293 | ||
294 | c[k * p + i] = s; | |
295 | } | |
296 | } | |
297 | } | |
298 | ||
299 | // --------------------------------------------- | |
300 | // Initialize Decoding Table | |
301 | // --------------------------------------------- | |
302 | ec_init_tables(k, nerrs, c, decode_tbls); | |
303 | tcache.putDecodingTableToCache(erasure_signature, p_tbls, matrixtype, k, m); | |
304 | } | |
305 | // Recover data sources | |
306 | ec_encode_data(blocksize, | |
307 | k, nerrs, decode_tbls, recover_source, recover_target); | |
308 | ||
309 | ||
310 | return 0; | |
311 | } | |
312 | ||
313 | // ----------------------------------------------------------------------------- | |
314 | ||
315 | unsigned | |
316 | ErasureCodeIsaDefault::get_alignment() const | |
317 | { | |
318 | return EC_ISA_ADDRESS_ALIGNMENT; | |
319 | } | |
320 | ||
321 | // ----------------------------------------------------------------------------- | |
322 | ||
323 | int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile, | |
324 | ostream *ss) | |
325 | { | |
326 | int err = ErasureCode::parse(profile, ss); | |
327 | err |= to_int("k", profile, &k, DEFAULT_K, ss); | |
328 | err |= to_int("m", profile, &m, DEFAULT_M, ss); | |
11fdf7f2 | 329 | err |= sanity_check_k_m(k, m, ss); |
7c673cae FG |
330 | |
331 | if (matrixtype == kVandermonde) { | |
332 | // these are verified safe values evaluated using the | |
333 | // benchmarktool and 10*(combinatoric for maximum loss) random | |
334 | // full erasures | |
335 | if (k > 32) { | |
336 | *ss << "Vandermonde: m=" << m | |
337 | << " should be less/equal than 32 : revert to k=32" << std::endl; | |
338 | k = 32; | |
339 | err = -EINVAL; | |
340 | } | |
341 | ||
342 | if (m > 4) { | |
343 | *ss << "Vandermonde: m=" << m | |
344 | << " should be less than 5 to guarantee an MDS codec:" | |
345 | << " revert to m=4" << std::endl; | |
346 | m = 4; | |
347 | err = -EINVAL; | |
348 | } | |
349 | switch (m) { | |
350 | case 4: | |
351 | if (k > 21) { | |
352 | *ss << "Vandermonde: k=" << k | |
353 | << " should be less than 22 to guarantee an MDS" | |
354 | << " codec with m=4: revert to k=21" << std::endl; | |
355 | k = 21; | |
356 | err = -EINVAL; | |
357 | } | |
358 | break; | |
359 | default: | |
360 | ; | |
361 | } | |
362 | } | |
363 | return err; | |
364 | } | |
365 | ||
366 | // ----------------------------------------------------------------------------- | |
367 | ||
368 | void | |
369 | ErasureCodeIsaDefault::prepare() | |
370 | { | |
371 | // setup shared encoding table and coefficients | |
372 | unsigned char** p_enc_table = | |
373 | tcache.getEncodingTable(matrixtype, k, m); | |
374 | ||
375 | unsigned char** p_enc_coeff = | |
376 | tcache.getEncodingCoefficient(matrixtype, k, m); | |
377 | ||
378 | if (!*p_enc_coeff) { | |
379 | dout(10) << "[ cache tables ] creating coeff for k=" << | |
380 | k << " m=" << m << dendl; | |
381 | // build encoding coefficients which need to be computed once for each (k,m) | |
382 | encode_coeff = (unsigned char*) malloc(k * (m + k)); | |
383 | ||
384 | if (matrixtype == kVandermonde) | |
385 | gf_gen_rs_matrix(encode_coeff, k + m, k); | |
386 | if (matrixtype == kCauchy) | |
387 | gf_gen_cauchy1_matrix(encode_coeff, k + m, k); | |
388 | ||
389 | // either our new created coefficients are stored or if they have been | |
390 | // created in the meanwhile the locally allocated coefficients will be | |
391 | // freed by setEncodingCoefficient | |
392 | encode_coeff = tcache.setEncodingCoefficient(matrixtype, k, m, encode_coeff); | |
393 | } else { | |
394 | encode_coeff = *p_enc_coeff; | |
395 | } | |
396 | ||
397 | if (!*p_enc_table) { | |
398 | dout(10) << "[ cache tables ] creating tables for k=" << | |
399 | k << " m=" << m << dendl; | |
400 | // build encoding table which needs to be computed once for each (k,m) | |
401 | encode_tbls = (unsigned char*) malloc(k * (m + k)*32); | |
402 | ec_init_tables(k, m, &encode_coeff[k * k], encode_tbls); | |
403 | ||
404 | // either our new created table is stored or if it has been | |
405 | // created in the meanwhile the locally allocated table will be | |
406 | // freed by setEncodingTable | |
407 | encode_tbls = tcache.setEncodingTable(matrixtype, k, m, encode_tbls); | |
408 | } else { | |
409 | encode_tbls = *p_enc_table; | |
410 | } | |
411 | ||
412 | unsigned memory_lru_cache = | |
413 | k * (m + k) * 32 * tcache.decoding_tables_lru_length; | |
414 | ||
415 | dout(10) << "[ cache memory ] = " << memory_lru_cache << " bytes" << | |
416 | " [ matrix ] = " << | |
417 | ((matrixtype == kVandermonde) ? "Vandermonde" : "Cauchy") << dendl; | |
418 | ||
11fdf7f2 | 419 | ceph_assert((matrixtype == kVandermonde) || (matrixtype == kCauchy)); |
7c673cae FG |
420 | |
421 | } | |
422 | // ----------------------------------------------------------------------------- |