]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com> | |
7 | * Copyright (C) 2014 Red Hat <contact@redhat.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This library is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License as published by the Free Software Foundation; either | |
14 | * version 2.1 of the License, or (at your option) any later version. | |
15 | * | |
16 | */ | |
17 | ||
9f95a23c | 18 | #include <cerrno> |
7c673cae FG |
19 | #include <algorithm> |
20 | ||
21 | #include "include/str_map.h" | |
22 | #include "common/debug.h" | |
23 | #include "crush/CrushWrapper.h" | |
24 | #include "osd/osd_types.h" | |
25 | #include "include/stringify.h" | |
26 | #include "erasure-code/ErasureCodePlugin.h" | |
27 | #include "json_spirit/json_spirit_writer.h" | |
28 | ||
29 | #include "ErasureCodeLrc.h" | |
30 | ||
7c673cae FG |
31 | #define dout_context g_ceph_context |
32 | #define dout_subsys ceph_subsys_osd | |
33 | #undef dout_prefix | |
34 | #define dout_prefix _prefix(_dout) | |
35 | ||
31f18b77 | 36 | using namespace std; |
9f95a23c | 37 | using namespace ceph; |
31f18b77 | 38 | |
7c673cae FG |
39 | static ostream& _prefix(std::ostream* _dout) |
40 | { | |
41 | return *_dout << "ErasureCodeLrc: "; | |
42 | } | |
43 | ||
224ce89b | 44 | int ErasureCodeLrc::create_rule(const string &name, |
7c673cae FG |
45 | CrushWrapper &crush, |
46 | ostream *ss) const | |
47 | { | |
48 | if (crush.rule_exists(name)) { | |
49 | *ss << "rule " << name << " exists"; | |
50 | return -EEXIST; | |
51 | } | |
224ce89b WB |
52 | if (!crush.name_exists(rule_root)) { |
53 | *ss << "root item " << rule_root << " does not exist"; | |
7c673cae FG |
54 | return -ENOENT; |
55 | } | |
224ce89b WB |
56 | int root = crush.get_item_id(rule_root); |
57 | if (rule_device_class.size()) { | |
58 | if (!crush.class_exists(rule_device_class)) { | |
59 | *ss << "device class " << rule_device_class << " does not exist"; | |
60 | return -ENOENT; | |
61 | } | |
62 | int c = crush.get_class_id(rule_device_class); | |
63 | if (crush.class_bucket.count(root) == 0 || | |
64 | crush.class_bucket[root].count(c) == 0) { | |
65 | *ss << "root item " << rule_root << " has no devices with class " | |
66 | << rule_device_class; | |
67 | return -EINVAL; | |
68 | } | |
69 | root = crush.class_bucket[root][c]; | |
70 | } | |
7c673cae | 71 | |
7c673cae FG |
72 | int rno = 0; |
73 | for (rno = 0; rno < crush.get_max_rules(); rno++) { | |
74 | if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) | |
75 | break; | |
76 | } | |
7c673cae | 77 | |
224ce89b | 78 | int steps = 4 + rule_steps.size(); |
7c673cae FG |
79 | int min_rep = 3; |
80 | int max_rep = get_chunk_count(); | |
81 | int ret; | |
c07f9fc5 FG |
82 | ret = crush.add_rule(rno, steps, pg_pool_t::TYPE_ERASURE, |
83 | min_rep, max_rep); | |
11fdf7f2 | 84 | ceph_assert(ret == rno); |
7c673cae FG |
85 | int step = 0; |
86 | ||
87 | ret = crush.set_rule_step(rno, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); | |
11fdf7f2 | 88 | ceph_assert(ret == 0); |
7c673cae | 89 | ret = crush.set_rule_step(rno, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); |
11fdf7f2 | 90 | ceph_assert(ret == 0); |
7c673cae | 91 | ret = crush.set_rule_step(rno, step++, CRUSH_RULE_TAKE, root, 0); |
11fdf7f2 | 92 | ceph_assert(ret == 0); |
7c673cae FG |
93 | // [ [ "choose", "rack", 2 ], |
94 | // [ "chooseleaf", "host", 5 ] ] | |
224ce89b WB |
95 | for (vector<Step>::const_iterator i = rule_steps.begin(); |
96 | i != rule_steps.end(); | |
7c673cae FG |
97 | ++i) { |
98 | int op = i->op == "chooseleaf" ? | |
99 | CRUSH_RULE_CHOOSELEAF_INDEP : CRUSH_RULE_CHOOSE_INDEP; | |
100 | int type = crush.get_type_id(i->type); | |
101 | if (type < 0) { | |
102 | *ss << "unknown crush type " << i->type; | |
103 | return -EINVAL; | |
104 | } | |
105 | ret = crush.set_rule_step(rno, step++, op, i->n, type); | |
11fdf7f2 | 106 | ceph_assert(ret == 0); |
7c673cae FG |
107 | } |
108 | ret = crush.set_rule_step(rno, step++, CRUSH_RULE_EMIT, 0, 0); | |
11fdf7f2 | 109 | ceph_assert(ret == 0); |
7c673cae | 110 | crush.set_rule_name(rno, name); |
c07f9fc5 | 111 | return rno; |
7c673cae FG |
112 | } |
113 | ||
114 | int ErasureCodeLrc::layers_description(const ErasureCodeProfile &profile, | |
115 | json_spirit::mArray *description, | |
116 | ostream *ss) const | |
117 | { | |
118 | if (profile.count("layers") == 0) { | |
119 | *ss << "could not find 'layers' in " << profile << std::endl; | |
120 | return ERROR_LRC_DESCRIPTION; | |
121 | } | |
122 | string str = profile.find("layers")->second; | |
123 | try { | |
124 | json_spirit::mValue json; | |
125 | json_spirit::read_or_throw(str, json); | |
126 | ||
127 | if (json.type() != json_spirit::array_type) { | |
128 | *ss << "layers='" << str | |
129 | << "' must be a JSON array but is of type " | |
130 | << json.type() << " instead" << std::endl; | |
131 | return ERROR_LRC_ARRAY; | |
132 | } | |
133 | *description = json.get_array(); | |
134 | } catch (json_spirit::Error_position &e) { | |
135 | *ss << "failed to parse layers='" << str << "'" | |
136 | << " at line " << e.line_ << ", column " << e.column_ | |
137 | << " : " << e.reason_ << std::endl; | |
138 | return ERROR_LRC_PARSE_JSON; | |
139 | } | |
140 | return 0; | |
141 | } | |
142 | ||
11fdf7f2 | 143 | int ErasureCodeLrc::layers_parse(const string &description_string, |
7c673cae FG |
144 | json_spirit::mArray description, |
145 | ostream *ss) | |
146 | { | |
147 | int position = 0; | |
148 | for (vector<json_spirit::mValue>::iterator i = description.begin(); | |
149 | i != description.end(); | |
150 | ++i, position++) { | |
151 | if (i->type() != json_spirit::array_type) { | |
152 | stringstream json_string; | |
153 | json_spirit::write(*i, json_string); | |
154 | *ss << "each element of the array " | |
155 | << description_string << " must be a JSON array but " | |
156 | << json_string.str() << " at position " << position | |
157 | << " is of type " << i->type() << " instead" << std::endl; | |
158 | return ERROR_LRC_ARRAY; | |
159 | } | |
160 | json_spirit::mArray layer_json = i->get_array(); | |
161 | ErasureCodeProfile profile; | |
162 | int index = 0; | |
163 | for (vector<json_spirit::mValue>::iterator j = layer_json.begin(); | |
164 | j != layer_json.end(); | |
165 | ++j, ++index) { | |
166 | if (index == 0) { | |
167 | if (j->type() != json_spirit::str_type) { | |
168 | stringstream element; | |
169 | json_spirit::write(*j, element); | |
170 | *ss << "the first element of the entry " | |
171 | << element.str() << " (first is zero) " | |
172 | << position << " in " << description_string | |
173 | << " is of type " << (*j).type() << " instead of string" << std::endl; | |
174 | return ERROR_LRC_STR; | |
175 | } | |
176 | layers.push_back(Layer(j->get_str())); | |
177 | Layer &layer = layers.back(); | |
178 | layer.chunks_map = j->get_str(); | |
179 | } else if(index == 1) { | |
180 | Layer &layer = layers.back(); | |
181 | if (j->type() != json_spirit::str_type && | |
182 | j->type() != json_spirit::obj_type) { | |
183 | stringstream element; | |
184 | json_spirit::write(*j, element); | |
185 | *ss << "the second element of the entry " | |
186 | << element.str() << " (first is zero) " | |
187 | << position << " in " << description_string | |
188 | << " is of type " << (*j).type() << " instead of string or object" | |
189 | << std::endl; | |
190 | return ERROR_LRC_CONFIG_OPTIONS; | |
191 | } | |
192 | if (j->type() == json_spirit::str_type) { | |
193 | int err = get_json_str_map(j->get_str(), *ss, &layer.profile); | |
194 | if (err) | |
195 | return err; | |
196 | } else if (j->type() == json_spirit::obj_type) { | |
197 | json_spirit::mObject o = j->get_obj(); | |
198 | ||
199 | for (map<string, json_spirit::mValue>::iterator i = o.begin(); | |
200 | i != o.end(); | |
201 | ++i) { | |
202 | layer.profile[i->first] = i->second.get_str(); | |
203 | } | |
204 | } | |
205 | } else { | |
206 | // ignore trailing elements | |
207 | } | |
208 | } | |
209 | } | |
210 | return 0; | |
211 | } | |
212 | ||
213 | int ErasureCodeLrc::layers_init(ostream *ss) | |
214 | { | |
215 | ErasureCodePluginRegistry ®istry = ErasureCodePluginRegistry::instance(); | |
216 | for (unsigned int i = 0; i < layers.size(); i++) { | |
217 | Layer &layer = layers[i]; | |
218 | int position = 0; | |
219 | for(std::string::iterator it = layer.chunks_map.begin(); | |
220 | it != layer.chunks_map.end(); | |
221 | ++it) { | |
222 | if (*it == 'D') | |
223 | layer.data.push_back(position); | |
224 | if (*it == 'c') | |
225 | layer.coding.push_back(position); | |
226 | if (*it == 'c' || *it == 'D') | |
227 | layer.chunks_as_set.insert(position); | |
228 | position++; | |
229 | } | |
230 | layer.chunks = layer.data; | |
231 | layer.chunks.insert(layer.chunks.end(), | |
232 | layer.coding.begin(), layer.coding.end()); | |
233 | if (layer.profile.find("k") == layer.profile.end()) | |
234 | layer.profile["k"] = stringify(layer.data.size()); | |
235 | if (layer.profile.find("m") == layer.profile.end()) | |
236 | layer.profile["m"] = stringify(layer.coding.size()); | |
237 | if (layer.profile.find("plugin") == layer.profile.end()) | |
238 | layer.profile["plugin"] = "jerasure"; | |
239 | if (layer.profile.find("technique") == layer.profile.end()) | |
240 | layer.profile["technique"] = "reed_sol_van"; | |
241 | int err = registry.factory(layer.profile["plugin"], | |
242 | directory, | |
243 | layer.profile, | |
244 | &layer.erasure_code, | |
245 | ss); | |
246 | if (err) | |
247 | return err; | |
248 | } | |
249 | return 0; | |
250 | } | |
251 | ||
11fdf7f2 | 252 | int ErasureCodeLrc::layers_sanity_checks(const string &description_string, |
7c673cae FG |
253 | ostream *ss) const |
254 | { | |
255 | int position = 0; | |
256 | ||
257 | if (layers.size() < 1) { | |
258 | *ss << "layers parameter has " << layers.size() | |
259 | << " which is less than the minimum of one. " | |
260 | << description_string << std::endl; | |
261 | return ERROR_LRC_LAYERS_COUNT; | |
262 | } | |
263 | for (vector<Layer>::const_iterator layer = layers.begin(); | |
264 | layer != layers.end(); | |
265 | ++layer) { | |
266 | if (chunk_count != layer->chunks_map.length()) { | |
267 | *ss << "the first element of the array at position " | |
268 | << position << " (starting from zero) " | |
269 | << " is the string '" << layer->chunks_map | |
270 | << " found in the layers parameter " | |
271 | << description_string << ". It is expected to be " | |
272 | << chunk_count << " characters long but is " | |
273 | << layer->chunks_map.length() << " characters long instead " | |
274 | << std::endl; | |
275 | return ERROR_LRC_MAPPING_SIZE; | |
276 | } | |
277 | } | |
278 | return 0; | |
279 | } | |
280 | ||
281 | int ErasureCodeLrc::parse(ErasureCodeProfile &profile, | |
282 | ostream *ss) | |
283 | { | |
284 | int r = ErasureCode::parse(profile, ss); | |
285 | if (r) | |
286 | return r; | |
287 | ||
224ce89b | 288 | return parse_rule(profile, ss); |
7c673cae FG |
289 | } |
290 | ||
291 | const string ErasureCodeLrc::DEFAULT_KML("-1"); | |
292 | ||
293 | int ErasureCodeLrc::parse_kml(ErasureCodeProfile &profile, | |
294 | ostream *ss) | |
295 | { | |
296 | int err = ErasureCode::parse(profile, ss); | |
297 | const int DEFAULT_INT = -1; | |
298 | int k, m, l; | |
299 | err |= to_int("k", profile, &k, DEFAULT_KML, ss); | |
300 | err |= to_int("m", profile, &m, DEFAULT_KML, ss); | |
301 | err |= to_int("l", profile, &l, DEFAULT_KML, ss); | |
302 | ||
303 | if (k == DEFAULT_INT && m == DEFAULT_INT && l == DEFAULT_INT) | |
304 | return err; | |
305 | ||
306 | if ((k != DEFAULT_INT || m != DEFAULT_INT || l != DEFAULT_INT) && | |
307 | (k == DEFAULT_INT || m == DEFAULT_INT || l == DEFAULT_INT)) { | |
308 | *ss << "All of k, m, l must be set or none of them in " | |
309 | << profile << std::endl; | |
310 | return ERROR_LRC_ALL_OR_NOTHING; | |
311 | } | |
312 | ||
313 | const char *generated[] = { "mapping", | |
314 | "layers", | |
224ce89b | 315 | "crush-steps" }; |
7c673cae FG |
316 | |
317 | for (int i = 0; i < 3; i++) { | |
318 | if (profile.count(generated[i])) { | |
319 | *ss << "The " << generated[i] << " parameter cannot be set " | |
320 | << "when k, m, l are set in " << profile << std::endl; | |
321 | return ERROR_LRC_GENERATED; | |
322 | } | |
323 | } | |
324 | ||
11fdf7f2 | 325 | if (l == 0 || (k + m) % l) { |
7c673cae FG |
326 | *ss << "k + m must be a multiple of l in " |
327 | << profile << std::endl; | |
328 | return ERROR_LRC_K_M_MODULO; | |
329 | } | |
330 | ||
331 | int local_group_count = (k + m) / l; | |
332 | ||
333 | if (k % local_group_count) { | |
334 | *ss << "k must be a multiple of (k + m) / l in " | |
335 | << profile << std::endl; | |
336 | return ERROR_LRC_K_MODULO; | |
337 | } | |
338 | ||
339 | if (m % local_group_count) { | |
340 | *ss << "m must be a multiple of (k + m) / l in " | |
341 | << profile << std::endl; | |
342 | return ERROR_LRC_M_MODULO; | |
343 | } | |
344 | ||
345 | string mapping; | |
346 | for (int i = 0; i < local_group_count; i++) { | |
347 | mapping += string(k / local_group_count, 'D') + | |
348 | string(m / local_group_count, '_') + "_"; | |
349 | } | |
350 | profile["mapping"] = mapping; | |
351 | ||
352 | string layers = "[ "; | |
353 | ||
354 | // global layer | |
355 | layers += " [ \""; | |
356 | for (int i = 0; i < local_group_count; i++) { | |
357 | layers += string(k / local_group_count, 'D') + | |
358 | string(m / local_group_count, 'c') + "_"; | |
359 | } | |
360 | layers += "\", \"\" ],"; | |
361 | ||
362 | // local layers | |
363 | for (int i = 0; i < local_group_count; i++) { | |
364 | layers += " [ \""; | |
365 | for (int j = 0; j < local_group_count; j++) { | |
366 | if (i == j) | |
367 | layers += string(l, 'D') + "c"; | |
368 | else | |
369 | layers += string(l + 1, '_'); | |
370 | } | |
371 | layers += "\", \"\" ],"; | |
372 | } | |
373 | profile["layers"] = layers + "]"; | |
374 | ||
375 | ErasureCodeProfile::const_iterator parameter; | |
224ce89b WB |
376 | string rule_locality; |
377 | parameter = profile.find("crush-locality"); | |
7c673cae | 378 | if (parameter != profile.end()) |
224ce89b WB |
379 | rule_locality = parameter->second; |
380 | string rule_failure_domain = "host"; | |
381 | parameter = profile.find("crush-failure-domain"); | |
7c673cae | 382 | if (parameter != profile.end()) |
224ce89b | 383 | rule_failure_domain = parameter->second; |
7c673cae | 384 | |
224ce89b WB |
385 | if (rule_locality != "") { |
386 | rule_steps.clear(); | |
387 | rule_steps.push_back(Step("choose", rule_locality, | |
7c673cae | 388 | local_group_count)); |
224ce89b | 389 | rule_steps.push_back(Step("chooseleaf", rule_failure_domain, |
7c673cae | 390 | l + 1)); |
224ce89b WB |
391 | } else if (rule_failure_domain != "") { |
392 | rule_steps.clear(); | |
393 | rule_steps.push_back(Step("chooseleaf", rule_failure_domain, 0)); | |
7c673cae FG |
394 | } |
395 | ||
396 | return err; | |
397 | } | |
398 | ||
224ce89b | 399 | int ErasureCodeLrc::parse_rule(ErasureCodeProfile &profile, |
7c673cae FG |
400 | ostream *ss) |
401 | { | |
402 | int err = 0; | |
224ce89b WB |
403 | err |= to_string("crush-root", profile, |
404 | &rule_root, | |
7c673cae | 405 | "default", ss); |
224ce89b WB |
406 | err |= to_string("crush-device-class", profile, |
407 | &rule_device_class, | |
408 | "", ss); | |
7c673cae | 409 | |
224ce89b WB |
410 | if (profile.count("crush-steps") != 0) { |
411 | rule_steps.clear(); | |
412 | string str = profile.find("crush-steps")->second; | |
7c673cae FG |
413 | json_spirit::mArray description; |
414 | try { | |
415 | json_spirit::mValue json; | |
416 | json_spirit::read_or_throw(str, json); | |
417 | ||
418 | if (json.type() != json_spirit::array_type) { | |
224ce89b | 419 | *ss << "crush-steps='" << str |
7c673cae FG |
420 | << "' must be a JSON array but is of type " |
421 | << json.type() << " instead" << std::endl; | |
422 | return ERROR_LRC_ARRAY; | |
423 | } | |
424 | description = json.get_array(); | |
425 | } catch (json_spirit::Error_position &e) { | |
224ce89b | 426 | *ss << "failed to parse crush-steps='" << str << "'" |
7c673cae FG |
427 | << " at line " << e.line_ << ", column " << e.column_ |
428 | << " : " << e.reason_ << std::endl; | |
429 | return ERROR_LRC_PARSE_JSON; | |
430 | } | |
431 | ||
432 | int position = 0; | |
433 | for (vector<json_spirit::mValue>::iterator i = description.begin(); | |
434 | i != description.end(); | |
435 | ++i, position++) { | |
436 | if (i->type() != json_spirit::array_type) { | |
437 | stringstream json_string; | |
438 | json_spirit::write(*i, json_string); | |
439 | *ss << "element of the array " | |
440 | << str << " must be a JSON array but " | |
441 | << json_string.str() << " at position " << position | |
442 | << " is of type " << i->type() << " instead" << std::endl; | |
443 | return ERROR_LRC_ARRAY; | |
444 | } | |
224ce89b | 445 | int r = parse_rule_step(str, i->get_array(), ss); |
7c673cae FG |
446 | if (r) |
447 | return r; | |
448 | } | |
449 | } | |
450 | return 0; | |
451 | } | |
452 | ||
11fdf7f2 | 453 | int ErasureCodeLrc::parse_rule_step(const string &description_string, |
7c673cae FG |
454 | json_spirit::mArray description, |
455 | ostream *ss) | |
456 | { | |
457 | stringstream json_string; | |
458 | json_spirit::write(description, json_string); | |
459 | string op; | |
460 | string type; | |
461 | int n = 0; | |
462 | int position = 0; | |
463 | for (vector<json_spirit::mValue>::iterator i = description.begin(); | |
464 | i != description.end(); | |
465 | ++i, position++) { | |
466 | if ((position == 0 || position == 1) && | |
467 | i->type() != json_spirit::str_type) { | |
468 | *ss << "element " << position << " of the array " | |
469 | << json_string.str() << " found in " << description_string | |
470 | << " must be a JSON string but is of type " | |
471 | << i->type() << " instead" << std::endl; | |
224ce89b | 472 | return position == 0 ? ERROR_LRC_RULE_OP : ERROR_LRC_RULE_TYPE; |
7c673cae FG |
473 | } |
474 | if (position == 2 && i->type() != json_spirit::int_type) { | |
475 | *ss << "element " << position << " of the array " | |
476 | << json_string.str() << " found in " << description_string | |
477 | << " must be a JSON int but is of type " | |
478 | << i->type() << " instead" << std::endl; | |
224ce89b | 479 | return ERROR_LRC_RULE_N; |
7c673cae FG |
480 | } |
481 | ||
482 | if (position == 0) | |
483 | op = i->get_str(); | |
484 | if (position == 1) | |
485 | type = i->get_str(); | |
486 | if (position == 2) | |
487 | n = i->get_int(); | |
488 | } | |
224ce89b | 489 | rule_steps.push_back(Step(op, type, n)); |
7c673cae FG |
490 | return 0; |
491 | } | |
492 | ||
493 | int ErasureCodeLrc::init(ErasureCodeProfile &profile, | |
494 | ostream *ss) | |
495 | { | |
496 | int r; | |
497 | ||
498 | r = parse_kml(profile, ss); | |
499 | if (r) | |
500 | return r; | |
501 | ||
502 | r = parse(profile, ss); | |
503 | if (r) | |
504 | return r; | |
505 | ||
506 | json_spirit::mArray description; | |
507 | r = layers_description(profile, &description, ss); | |
508 | if (r) | |
509 | return r; | |
510 | ||
511 | string description_string = profile.find("layers")->second; | |
512 | ||
513 | dout(10) << "init(" << description_string << ")" << dendl; | |
514 | ||
515 | r = layers_parse(description_string, description, ss); | |
516 | if (r) | |
517 | return r; | |
518 | ||
519 | r = layers_init(ss); | |
520 | if (r) | |
521 | return r; | |
522 | ||
523 | if (profile.count("mapping") == 0) { | |
524 | *ss << "the 'mapping' profile is missing from " << profile; | |
525 | return ERROR_LRC_MAPPING; | |
526 | } | |
527 | string mapping = profile.find("mapping")->second; | |
11fdf7f2 | 528 | data_chunk_count = count(begin(mapping), end(mapping), 'D'); |
7c673cae FG |
529 | chunk_count = mapping.length(); |
530 | ||
531 | r = layers_sanity_checks(description_string, ss); | |
532 | if (r) | |
533 | return r; | |
534 | ||
535 | // | |
536 | // When initialized with kml, the profile parameters | |
537 | // that were generated should not be stored because | |
538 | // they would otherwise be exposed to the caller. | |
539 | // | |
540 | if (profile.find("l") != profile.end() && | |
541 | profile.find("l")->second != DEFAULT_KML) { | |
542 | profile.erase("mapping"); | |
543 | profile.erase("layers"); | |
544 | } | |
545 | ErasureCode::init(profile, ss); | |
546 | return 0; | |
547 | } | |
548 | ||
549 | set<int> ErasureCodeLrc::get_erasures(const set<int> &want, | |
550 | const set<int> &available) const | |
551 | { | |
552 | set<int> result; | |
553 | set_difference(want.begin(), want.end(), | |
554 | available.begin(), available.end(), | |
555 | inserter(result, result.end())); | |
556 | return result; | |
557 | } | |
558 | ||
559 | unsigned int ErasureCodeLrc::get_chunk_size(unsigned int object_size) const | |
560 | { | |
561 | return layers.front().erasure_code->get_chunk_size(object_size); | |
562 | } | |
563 | ||
564 | void p(const set<int> &s) { cerr << s; } // for gdb | |
565 | ||
11fdf7f2 TL |
566 | int ErasureCodeLrc::_minimum_to_decode(const set<int> &want_to_read, |
567 | const set<int> &available_chunks, | |
568 | set<int> *minimum) | |
7c673cae FG |
569 | { |
570 | dout(20) << __func__ << " want_to_read " << want_to_read | |
571 | << " available_chunks " << available_chunks << dendl; | |
572 | { | |
573 | set<int> erasures_total; | |
574 | set<int> erasures_not_recovered; | |
575 | set<int> erasures_want; | |
576 | for (unsigned int i = 0; i < get_chunk_count(); ++i) { | |
577 | if (available_chunks.count(i) == 0) { | |
578 | erasures_total.insert(i); | |
579 | erasures_not_recovered.insert(i); | |
580 | if (want_to_read.count(i) != 0) | |
581 | erasures_want.insert(i); | |
582 | } | |
583 | } | |
584 | ||
585 | // | |
586 | // Case 1: | |
587 | // | |
588 | // When no chunk is missing there is no need to read more than what | |
589 | // is wanted. | |
590 | // | |
591 | if (erasures_want.empty()) { | |
592 | *minimum = want_to_read; | |
593 | dout(20) << __func__ << " minimum == want_to_read == " | |
594 | << want_to_read << dendl; | |
595 | return 0; | |
596 | } | |
597 | ||
598 | // | |
599 | // Case 2: | |
600 | // | |
601 | // Try to recover erasures with as few chunks as possible. | |
602 | // | |
603 | for (vector<Layer>::reverse_iterator i = layers.rbegin(); | |
604 | i != layers.rend(); | |
605 | ++i) { | |
606 | // | |
607 | // If this layer has no chunk that we want, skip it. | |
608 | // | |
609 | set<int> layer_want; | |
610 | set_intersection(want_to_read.begin(), want_to_read.end(), | |
611 | i->chunks_as_set.begin(), i->chunks_as_set.end(), | |
612 | inserter(layer_want, layer_want.end())); | |
613 | if (layer_want.empty()) | |
614 | continue; | |
615 | // | |
616 | // Are some of the chunks we want missing ? | |
617 | // | |
618 | set<int> layer_erasures; | |
619 | set_intersection(layer_want.begin(), layer_want.end(), | |
620 | erasures_want.begin(), erasures_want.end(), | |
621 | inserter(layer_erasures, layer_erasures.end())); | |
622 | set<int> layer_minimum; | |
623 | if (layer_erasures.empty()) { | |
624 | // | |
625 | // The chunks we want are available, this is the minimum we need | |
626 | // to read. | |
627 | // | |
628 | layer_minimum = layer_want; | |
629 | } else { | |
630 | set<int> erasures; | |
631 | set_intersection(i->chunks_as_set.begin(), i->chunks_as_set.end(), | |
632 | erasures_not_recovered.begin(), erasures_not_recovered.end(), | |
633 | inserter(erasures, erasures.end())); | |
634 | ||
635 | if (erasures.size() > i->erasure_code->get_coding_chunk_count()) { | |
636 | // | |
637 | // There are too many erasures for this layer to recover: skip | |
638 | // it and hope that an upper layer will be do better. | |
639 | // | |
640 | continue; | |
641 | } else { | |
642 | // | |
643 | // Get all available chunks in that layer to recover the | |
644 | // missing one(s). | |
645 | // | |
646 | set_difference(i->chunks_as_set.begin(), i->chunks_as_set.end(), | |
647 | erasures_not_recovered.begin(), erasures_not_recovered.end(), | |
648 | inserter(layer_minimum, layer_minimum.end())); | |
649 | // | |
650 | // Chunks recovered by this layer are removed from the list of | |
651 | // erasures so that upper levels do not attempt to recover | |
652 | // them. | |
653 | // | |
654 | for (set<int>::const_iterator j = erasures.begin(); | |
655 | j != erasures.end(); | |
656 | ++j) { | |
657 | erasures_not_recovered.erase(*j); | |
11fdf7f2 | 658 | erasures_want.erase(*j); |
7c673cae FG |
659 | } |
660 | } | |
661 | } | |
662 | minimum->insert(layer_minimum.begin(), layer_minimum.end()); | |
663 | } | |
664 | if (erasures_want.empty()) { | |
665 | minimum->insert(want_to_read.begin(), want_to_read.end()); | |
666 | for (set<int>::const_iterator i = erasures_total.begin(); | |
667 | i != erasures_total.end(); | |
668 | ++i) { | |
669 | if (minimum->count(*i)) | |
670 | minimum->erase(*i); | |
671 | } | |
672 | dout(20) << __func__ << " minimum = " << *minimum << dendl; | |
673 | return 0; | |
674 | } | |
675 | } | |
676 | ||
677 | { | |
678 | // | |
679 | // Case 3: | |
680 | // | |
681 | // The previous strategy failed to recover from all erasures. | |
682 | // | |
683 | // Try to recover as many chunks as possible, even from layers | |
684 | // that do not contain chunks that we want, in the hope that it | |
685 | // will help the upper layers. | |
686 | // | |
687 | set<int> erasures_total; | |
688 | for (unsigned int i = 0; i < get_chunk_count(); ++i) { | |
689 | if (available_chunks.count(i) == 0) | |
690 | erasures_total.insert(i); | |
691 | } | |
692 | ||
693 | for (vector<Layer>::reverse_iterator i = layers.rbegin(); | |
694 | i != layers.rend(); | |
695 | ++i) { | |
696 | set<int> layer_erasures; | |
697 | set_intersection(i->chunks_as_set.begin(), i->chunks_as_set.end(), | |
698 | erasures_total.begin(), erasures_total.end(), | |
699 | inserter(layer_erasures, layer_erasures.end())); | |
700 | // | |
701 | // If this layer has no erasure, skip it | |
702 | // | |
703 | if (layer_erasures.empty()) | |
704 | continue; | |
705 | ||
706 | if (layer_erasures.size() > 0 && | |
707 | layer_erasures.size() <= i->erasure_code->get_coding_chunk_count()) { | |
708 | // | |
709 | // chunks recovered by this layer are removed from the list of | |
710 | // erasures so that upper levels know they can rely on their | |
711 | // availability | |
712 | // | |
713 | for (set<int>::const_iterator j = layer_erasures.begin(); | |
714 | j != layer_erasures.end(); | |
715 | ++j) { | |
716 | erasures_total.erase(*j); | |
717 | } | |
718 | } | |
719 | } | |
720 | if (erasures_total.empty()) { | |
721 | // | |
722 | // Do not try to be smart about what chunks are necessary to | |
723 | // recover, use all available chunks. | |
724 | // | |
725 | *minimum = available_chunks; | |
726 | dout(20) << __func__ << " minimum == available_chunks == " | |
727 | << available_chunks << dendl; | |
728 | return 0; | |
729 | } | |
730 | } | |
731 | ||
732 | derr << __func__ << " not enough chunks in " << available_chunks | |
733 | << " to read " << want_to_read << dendl; | |
734 | return -EIO; | |
735 | } | |
736 | ||
737 | int ErasureCodeLrc::encode_chunks(const set<int> &want_to_encode, | |
738 | map<int, bufferlist> *encoded) | |
739 | { | |
740 | unsigned int top = layers.size(); | |
741 | for (vector<Layer>::reverse_iterator i = layers.rbegin(); | |
742 | i != layers.rend(); | |
743 | ++i) { | |
744 | --top; | |
745 | if (includes(i->chunks_as_set.begin(), i->chunks_as_set.end(), | |
746 | want_to_encode.begin(), want_to_encode.end())) | |
747 | break; | |
748 | } | |
749 | ||
750 | for (unsigned int i = top; i < layers.size(); ++i) { | |
751 | const Layer &layer = layers[i]; | |
752 | set<int> layer_want_to_encode; | |
753 | map<int, bufferlist> layer_encoded; | |
754 | int j = 0; | |
11fdf7f2 TL |
755 | for (const auto& c : layer.chunks) { |
756 | std::swap(layer_encoded[j], (*encoded)[c]); | |
757 | if (want_to_encode.find(c) != want_to_encode.end()) | |
7c673cae FG |
758 | layer_want_to_encode.insert(j); |
759 | j++; | |
760 | } | |
761 | int err = layer.erasure_code->encode_chunks(layer_want_to_encode, | |
762 | &layer_encoded); | |
11fdf7f2 TL |
763 | j = 0; |
764 | for (const auto& c : layer.chunks) { | |
765 | std::swap(layer_encoded[j++], (*encoded)[c]); | |
766 | } | |
7c673cae FG |
767 | if (err) { |
768 | derr << __func__ << " layer " << layer.chunks_map | |
769 | << " failed with " << err << " trying to encode " | |
770 | << layer_want_to_encode << dendl; | |
771 | return err; | |
772 | } | |
773 | } | |
774 | return 0; | |
775 | } | |
776 | ||
777 | int ErasureCodeLrc::decode_chunks(const set<int> &want_to_read, | |
778 | const map<int, bufferlist> &chunks, | |
779 | map<int, bufferlist> *decoded) | |
780 | { | |
781 | set<int> available_chunks; | |
782 | set<int> erasures; | |
783 | for (unsigned int i = 0; i < get_chunk_count(); ++i) { | |
784 | if (chunks.count(i) != 0) | |
785 | available_chunks.insert(i); | |
786 | else | |
787 | erasures.insert(i); | |
788 | } | |
789 | ||
790 | set<int> want_to_read_erasures; | |
791 | ||
792 | for (vector<Layer>::reverse_iterator layer = layers.rbegin(); | |
793 | layer != layers.rend(); | |
794 | ++layer) { | |
795 | set<int> layer_erasures; | |
796 | set_intersection(layer->chunks_as_set.begin(), layer->chunks_as_set.end(), | |
797 | erasures.begin(), erasures.end(), | |
798 | inserter(layer_erasures, layer_erasures.end())); | |
799 | ||
800 | if (layer_erasures.size() > | |
801 | layer->erasure_code->get_coding_chunk_count()) { | |
802 | // skip because there are too many erasures for this layer to recover | |
803 | } else if(layer_erasures.size() == 0) { | |
804 | // skip because all chunks are already available | |
805 | } else { | |
806 | set<int> layer_want_to_read; | |
807 | map<int, bufferlist> layer_chunks; | |
808 | map<int, bufferlist> layer_decoded; | |
809 | int j = 0; | |
810 | for (vector<int>::const_iterator c = layer->chunks.begin(); | |
811 | c != layer->chunks.end(); | |
812 | ++c) { | |
813 | // | |
814 | // Pick chunks from *decoded* instead of *chunks* to re-use | |
815 | // chunks recovered by previous layers. In other words | |
816 | // *chunks* does not change but *decoded* gradually improves | |
817 | // as more layers recover from erasures. | |
818 | // | |
819 | if (erasures.count(*c) == 0) | |
820 | layer_chunks[j] = (*decoded)[*c]; | |
821 | if (want_to_read.count(*c) != 0) | |
822 | layer_want_to_read.insert(j); | |
823 | layer_decoded[j] = (*decoded)[*c]; | |
824 | ++j; | |
825 | } | |
826 | int err = layer->erasure_code->decode_chunks(layer_want_to_read, | |
827 | layer_chunks, | |
828 | &layer_decoded); | |
829 | if (err) { | |
830 | derr << __func__ << " layer " << layer->chunks_map | |
831 | << " failed with " << err << " trying to decode " | |
832 | << layer_want_to_read << " with " << available_chunks << dendl; | |
833 | return err; | |
834 | } | |
835 | j = 0; | |
836 | for (vector<int>::const_iterator c = layer->chunks.begin(); | |
837 | c != layer->chunks.end(); | |
838 | ++c) { | |
839 | (*decoded)[*c] = layer_decoded[j]; | |
840 | ++j; | |
11fdf7f2 | 841 | erasures.erase(*c); |
7c673cae FG |
842 | } |
843 | want_to_read_erasures.clear(); | |
844 | set_intersection(erasures.begin(), erasures.end(), | |
845 | want_to_read.begin(), want_to_read.end(), | |
846 | inserter(want_to_read_erasures, want_to_read_erasures.end())); | |
847 | if (want_to_read_erasures.size() == 0) | |
848 | break; | |
849 | } | |
850 | } | |
851 | ||
852 | if (want_to_read_erasures.size() > 0) { | |
853 | derr << __func__ << " want to read " << want_to_read | |
854 | << " with available_chunks = " << available_chunks | |
855 | << " end up being unable to read " << want_to_read_erasures << dendl; | |
856 | return -EIO; | |
857 | } else { | |
858 | return 0; | |
859 | } | |
860 | } |