]> git.proxmox.com Git - ceph.git/blame - ceph/src/erasure-code/lrc/ErasureCodeLrc.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / erasure-code / lrc / ErasureCodeLrc.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
7 * Copyright (C) 2014 Red Hat <contact@redhat.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 */
17
9f95a23c 18#include <cerrno>
7c673cae
FG
19#include <algorithm>
20
21#include "include/str_map.h"
22#include "common/debug.h"
23#include "crush/CrushWrapper.h"
24#include "osd/osd_types.h"
25#include "include/stringify.h"
26#include "erasure-code/ErasureCodePlugin.h"
27#include "json_spirit/json_spirit_writer.h"
28
29#include "ErasureCodeLrc.h"
30
7c673cae
FG
31#define dout_context g_ceph_context
32#define dout_subsys ceph_subsys_osd
33#undef dout_prefix
34#define dout_prefix _prefix(_dout)
35
31f18b77 36using namespace std;
9f95a23c 37using namespace ceph;
31f18b77 38
7c673cae
FG
39static ostream& _prefix(std::ostream* _dout)
40{
41 return *_dout << "ErasureCodeLrc: ";
42}
43
224ce89b 44int ErasureCodeLrc::create_rule(const string &name,
7c673cae
FG
45 CrushWrapper &crush,
46 ostream *ss) const
47{
48 if (crush.rule_exists(name)) {
49 *ss << "rule " << name << " exists";
50 return -EEXIST;
51 }
224ce89b
WB
52 if (!crush.name_exists(rule_root)) {
53 *ss << "root item " << rule_root << " does not exist";
7c673cae
FG
54 return -ENOENT;
55 }
224ce89b
WB
56 int root = crush.get_item_id(rule_root);
57 if (rule_device_class.size()) {
58 if (!crush.class_exists(rule_device_class)) {
59 *ss << "device class " << rule_device_class << " does not exist";
60 return -ENOENT;
61 }
62 int c = crush.get_class_id(rule_device_class);
63 if (crush.class_bucket.count(root) == 0 ||
64 crush.class_bucket[root].count(c) == 0) {
65 *ss << "root item " << rule_root << " has no devices with class "
66 << rule_device_class;
67 return -EINVAL;
68 }
69 root = crush.class_bucket[root][c];
70 }
7c673cae 71
7c673cae
FG
72 int rno = 0;
73 for (rno = 0; rno < crush.get_max_rules(); rno++) {
74 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
75 break;
76 }
7c673cae 77
224ce89b 78 int steps = 4 + rule_steps.size();
7c673cae
FG
79 int min_rep = 3;
80 int max_rep = get_chunk_count();
81 int ret;
c07f9fc5
FG
82 ret = crush.add_rule(rno, steps, pg_pool_t::TYPE_ERASURE,
83 min_rep, max_rep);
11fdf7f2 84 ceph_assert(ret == rno);
7c673cae
FG
85 int step = 0;
86
87 ret = crush.set_rule_step(rno, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
11fdf7f2 88 ceph_assert(ret == 0);
7c673cae 89 ret = crush.set_rule_step(rno, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
11fdf7f2 90 ceph_assert(ret == 0);
7c673cae 91 ret = crush.set_rule_step(rno, step++, CRUSH_RULE_TAKE, root, 0);
11fdf7f2 92 ceph_assert(ret == 0);
7c673cae
FG
93 // [ [ "choose", "rack", 2 ],
94 // [ "chooseleaf", "host", 5 ] ]
224ce89b
WB
95 for (vector<Step>::const_iterator i = rule_steps.begin();
96 i != rule_steps.end();
7c673cae
FG
97 ++i) {
98 int op = i->op == "chooseleaf" ?
99 CRUSH_RULE_CHOOSELEAF_INDEP : CRUSH_RULE_CHOOSE_INDEP;
100 int type = crush.get_type_id(i->type);
101 if (type < 0) {
102 *ss << "unknown crush type " << i->type;
103 return -EINVAL;
104 }
105 ret = crush.set_rule_step(rno, step++, op, i->n, type);
11fdf7f2 106 ceph_assert(ret == 0);
7c673cae
FG
107 }
108 ret = crush.set_rule_step(rno, step++, CRUSH_RULE_EMIT, 0, 0);
11fdf7f2 109 ceph_assert(ret == 0);
7c673cae 110 crush.set_rule_name(rno, name);
c07f9fc5 111 return rno;
7c673cae
FG
112}
113
114int ErasureCodeLrc::layers_description(const ErasureCodeProfile &profile,
115 json_spirit::mArray *description,
116 ostream *ss) const
117{
118 if (profile.count("layers") == 0) {
119 *ss << "could not find 'layers' in " << profile << std::endl;
120 return ERROR_LRC_DESCRIPTION;
121 }
122 string str = profile.find("layers")->second;
123 try {
124 json_spirit::mValue json;
125 json_spirit::read_or_throw(str, json);
126
127 if (json.type() != json_spirit::array_type) {
128 *ss << "layers='" << str
129 << "' must be a JSON array but is of type "
130 << json.type() << " instead" << std::endl;
131 return ERROR_LRC_ARRAY;
132 }
133 *description = json.get_array();
134 } catch (json_spirit::Error_position &e) {
135 *ss << "failed to parse layers='" << str << "'"
136 << " at line " << e.line_ << ", column " << e.column_
137 << " : " << e.reason_ << std::endl;
138 return ERROR_LRC_PARSE_JSON;
139 }
140 return 0;
141}
142
11fdf7f2 143int ErasureCodeLrc::layers_parse(const string &description_string,
7c673cae
FG
144 json_spirit::mArray description,
145 ostream *ss)
146{
147 int position = 0;
148 for (vector<json_spirit::mValue>::iterator i = description.begin();
149 i != description.end();
150 ++i, position++) {
151 if (i->type() != json_spirit::array_type) {
152 stringstream json_string;
153 json_spirit::write(*i, json_string);
154 *ss << "each element of the array "
155 << description_string << " must be a JSON array but "
156 << json_string.str() << " at position " << position
157 << " is of type " << i->type() << " instead" << std::endl;
158 return ERROR_LRC_ARRAY;
159 }
160 json_spirit::mArray layer_json = i->get_array();
161 ErasureCodeProfile profile;
162 int index = 0;
163 for (vector<json_spirit::mValue>::iterator j = layer_json.begin();
164 j != layer_json.end();
165 ++j, ++index) {
166 if (index == 0) {
167 if (j->type() != json_spirit::str_type) {
168 stringstream element;
169 json_spirit::write(*j, element);
170 *ss << "the first element of the entry "
171 << element.str() << " (first is zero) "
172 << position << " in " << description_string
173 << " is of type " << (*j).type() << " instead of string" << std::endl;
174 return ERROR_LRC_STR;
175 }
176 layers.push_back(Layer(j->get_str()));
177 Layer &layer = layers.back();
178 layer.chunks_map = j->get_str();
179 } else if(index == 1) {
180 Layer &layer = layers.back();
181 if (j->type() != json_spirit::str_type &&
182 j->type() != json_spirit::obj_type) {
183 stringstream element;
184 json_spirit::write(*j, element);
185 *ss << "the second element of the entry "
186 << element.str() << " (first is zero) "
187 << position << " in " << description_string
188 << " is of type " << (*j).type() << " instead of string or object"
189 << std::endl;
190 return ERROR_LRC_CONFIG_OPTIONS;
191 }
192 if (j->type() == json_spirit::str_type) {
193 int err = get_json_str_map(j->get_str(), *ss, &layer.profile);
194 if (err)
195 return err;
196 } else if (j->type() == json_spirit::obj_type) {
197 json_spirit::mObject o = j->get_obj();
198
199 for (map<string, json_spirit::mValue>::iterator i = o.begin();
200 i != o.end();
201 ++i) {
202 layer.profile[i->first] = i->second.get_str();
203 }
204 }
205 } else {
206 // ignore trailing elements
207 }
208 }
209 }
210 return 0;
211}
212
213int ErasureCodeLrc::layers_init(ostream *ss)
214{
215 ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
216 for (unsigned int i = 0; i < layers.size(); i++) {
217 Layer &layer = layers[i];
218 int position = 0;
219 for(std::string::iterator it = layer.chunks_map.begin();
220 it != layer.chunks_map.end();
221 ++it) {
222 if (*it == 'D')
223 layer.data.push_back(position);
224 if (*it == 'c')
225 layer.coding.push_back(position);
226 if (*it == 'c' || *it == 'D')
227 layer.chunks_as_set.insert(position);
228 position++;
229 }
230 layer.chunks = layer.data;
231 layer.chunks.insert(layer.chunks.end(),
232 layer.coding.begin(), layer.coding.end());
233 if (layer.profile.find("k") == layer.profile.end())
234 layer.profile["k"] = stringify(layer.data.size());
235 if (layer.profile.find("m") == layer.profile.end())
236 layer.profile["m"] = stringify(layer.coding.size());
237 if (layer.profile.find("plugin") == layer.profile.end())
238 layer.profile["plugin"] = "jerasure";
239 if (layer.profile.find("technique") == layer.profile.end())
240 layer.profile["technique"] = "reed_sol_van";
241 int err = registry.factory(layer.profile["plugin"],
242 directory,
243 layer.profile,
244 &layer.erasure_code,
245 ss);
246 if (err)
247 return err;
248 }
249 return 0;
250}
251
11fdf7f2 252int ErasureCodeLrc::layers_sanity_checks(const string &description_string,
7c673cae
FG
253 ostream *ss) const
254{
255 int position = 0;
256
257 if (layers.size() < 1) {
258 *ss << "layers parameter has " << layers.size()
259 << " which is less than the minimum of one. "
260 << description_string << std::endl;
261 return ERROR_LRC_LAYERS_COUNT;
262 }
263 for (vector<Layer>::const_iterator layer = layers.begin();
264 layer != layers.end();
265 ++layer) {
266 if (chunk_count != layer->chunks_map.length()) {
267 *ss << "the first element of the array at position "
268 << position << " (starting from zero) "
269 << " is the string '" << layer->chunks_map
270 << " found in the layers parameter "
271 << description_string << ". It is expected to be "
272 << chunk_count << " characters long but is "
273 << layer->chunks_map.length() << " characters long instead "
274 << std::endl;
275 return ERROR_LRC_MAPPING_SIZE;
276 }
277 }
278 return 0;
279}
280
281int ErasureCodeLrc::parse(ErasureCodeProfile &profile,
282 ostream *ss)
283{
284 int r = ErasureCode::parse(profile, ss);
285 if (r)
286 return r;
287
224ce89b 288 return parse_rule(profile, ss);
7c673cae
FG
289}
290
291const string ErasureCodeLrc::DEFAULT_KML("-1");
292
293int ErasureCodeLrc::parse_kml(ErasureCodeProfile &profile,
294 ostream *ss)
295{
296 int err = ErasureCode::parse(profile, ss);
297 const int DEFAULT_INT = -1;
298 int k, m, l;
299 err |= to_int("k", profile, &k, DEFAULT_KML, ss);
300 err |= to_int("m", profile, &m, DEFAULT_KML, ss);
301 err |= to_int("l", profile, &l, DEFAULT_KML, ss);
302
303 if (k == DEFAULT_INT && m == DEFAULT_INT && l == DEFAULT_INT)
304 return err;
305
306 if ((k != DEFAULT_INT || m != DEFAULT_INT || l != DEFAULT_INT) &&
307 (k == DEFAULT_INT || m == DEFAULT_INT || l == DEFAULT_INT)) {
308 *ss << "All of k, m, l must be set or none of them in "
309 << profile << std::endl;
310 return ERROR_LRC_ALL_OR_NOTHING;
311 }
312
313 const char *generated[] = { "mapping",
314 "layers",
224ce89b 315 "crush-steps" };
7c673cae
FG
316
317 for (int i = 0; i < 3; i++) {
318 if (profile.count(generated[i])) {
319 *ss << "The " << generated[i] << " parameter cannot be set "
320 << "when k, m, l are set in " << profile << std::endl;
321 return ERROR_LRC_GENERATED;
322 }
323 }
324
11fdf7f2 325 if (l == 0 || (k + m) % l) {
7c673cae
FG
326 *ss << "k + m must be a multiple of l in "
327 << profile << std::endl;
328 return ERROR_LRC_K_M_MODULO;
329 }
330
331 int local_group_count = (k + m) / l;
332
333 if (k % local_group_count) {
334 *ss << "k must be a multiple of (k + m) / l in "
335 << profile << std::endl;
336 return ERROR_LRC_K_MODULO;
337 }
338
339 if (m % local_group_count) {
340 *ss << "m must be a multiple of (k + m) / l in "
341 << profile << std::endl;
342 return ERROR_LRC_M_MODULO;
343 }
344
345 string mapping;
346 for (int i = 0; i < local_group_count; i++) {
347 mapping += string(k / local_group_count, 'D') +
348 string(m / local_group_count, '_') + "_";
349 }
350 profile["mapping"] = mapping;
351
352 string layers = "[ ";
353
354 // global layer
355 layers += " [ \"";
356 for (int i = 0; i < local_group_count; i++) {
357 layers += string(k / local_group_count, 'D') +
358 string(m / local_group_count, 'c') + "_";
359 }
360 layers += "\", \"\" ],";
361
362 // local layers
363 for (int i = 0; i < local_group_count; i++) {
364 layers += " [ \"";
365 for (int j = 0; j < local_group_count; j++) {
366 if (i == j)
367 layers += string(l, 'D') + "c";
368 else
369 layers += string(l + 1, '_');
370 }
371 layers += "\", \"\" ],";
372 }
373 profile["layers"] = layers + "]";
374
375 ErasureCodeProfile::const_iterator parameter;
224ce89b
WB
376 string rule_locality;
377 parameter = profile.find("crush-locality");
7c673cae 378 if (parameter != profile.end())
224ce89b
WB
379 rule_locality = parameter->second;
380 string rule_failure_domain = "host";
381 parameter = profile.find("crush-failure-domain");
7c673cae 382 if (parameter != profile.end())
224ce89b 383 rule_failure_domain = parameter->second;
7c673cae 384
224ce89b
WB
385 if (rule_locality != "") {
386 rule_steps.clear();
387 rule_steps.push_back(Step("choose", rule_locality,
7c673cae 388 local_group_count));
224ce89b 389 rule_steps.push_back(Step("chooseleaf", rule_failure_domain,
7c673cae 390 l + 1));
224ce89b
WB
391 } else if (rule_failure_domain != "") {
392 rule_steps.clear();
393 rule_steps.push_back(Step("chooseleaf", rule_failure_domain, 0));
7c673cae
FG
394 }
395
396 return err;
397}
398
224ce89b 399int ErasureCodeLrc::parse_rule(ErasureCodeProfile &profile,
7c673cae
FG
400 ostream *ss)
401{
402 int err = 0;
224ce89b
WB
403 err |= to_string("crush-root", profile,
404 &rule_root,
7c673cae 405 "default", ss);
224ce89b
WB
406 err |= to_string("crush-device-class", profile,
407 &rule_device_class,
408 "", ss);
7c673cae 409
224ce89b
WB
410 if (profile.count("crush-steps") != 0) {
411 rule_steps.clear();
412 string str = profile.find("crush-steps")->second;
7c673cae
FG
413 json_spirit::mArray description;
414 try {
415 json_spirit::mValue json;
416 json_spirit::read_or_throw(str, json);
417
418 if (json.type() != json_spirit::array_type) {
224ce89b 419 *ss << "crush-steps='" << str
7c673cae
FG
420 << "' must be a JSON array but is of type "
421 << json.type() << " instead" << std::endl;
422 return ERROR_LRC_ARRAY;
423 }
424 description = json.get_array();
425 } catch (json_spirit::Error_position &e) {
224ce89b 426 *ss << "failed to parse crush-steps='" << str << "'"
7c673cae
FG
427 << " at line " << e.line_ << ", column " << e.column_
428 << " : " << e.reason_ << std::endl;
429 return ERROR_LRC_PARSE_JSON;
430 }
431
432 int position = 0;
433 for (vector<json_spirit::mValue>::iterator i = description.begin();
434 i != description.end();
435 ++i, position++) {
436 if (i->type() != json_spirit::array_type) {
437 stringstream json_string;
438 json_spirit::write(*i, json_string);
439 *ss << "element of the array "
440 << str << " must be a JSON array but "
441 << json_string.str() << " at position " << position
442 << " is of type " << i->type() << " instead" << std::endl;
443 return ERROR_LRC_ARRAY;
444 }
224ce89b 445 int r = parse_rule_step(str, i->get_array(), ss);
7c673cae
FG
446 if (r)
447 return r;
448 }
449 }
450 return 0;
451}
452
11fdf7f2 453int ErasureCodeLrc::parse_rule_step(const string &description_string,
7c673cae
FG
454 json_spirit::mArray description,
455 ostream *ss)
456{
457 stringstream json_string;
458 json_spirit::write(description, json_string);
459 string op;
460 string type;
461 int n = 0;
462 int position = 0;
463 for (vector<json_spirit::mValue>::iterator i = description.begin();
464 i != description.end();
465 ++i, position++) {
466 if ((position == 0 || position == 1) &&
467 i->type() != json_spirit::str_type) {
468 *ss << "element " << position << " of the array "
469 << json_string.str() << " found in " << description_string
470 << " must be a JSON string but is of type "
471 << i->type() << " instead" << std::endl;
224ce89b 472 return position == 0 ? ERROR_LRC_RULE_OP : ERROR_LRC_RULE_TYPE;
7c673cae
FG
473 }
474 if (position == 2 && i->type() != json_spirit::int_type) {
475 *ss << "element " << position << " of the array "
476 << json_string.str() << " found in " << description_string
477 << " must be a JSON int but is of type "
478 << i->type() << " instead" << std::endl;
224ce89b 479 return ERROR_LRC_RULE_N;
7c673cae
FG
480 }
481
482 if (position == 0)
483 op = i->get_str();
484 if (position == 1)
485 type = i->get_str();
486 if (position == 2)
487 n = i->get_int();
488 }
224ce89b 489 rule_steps.push_back(Step(op, type, n));
7c673cae
FG
490 return 0;
491}
492
493int ErasureCodeLrc::init(ErasureCodeProfile &profile,
494 ostream *ss)
495{
496 int r;
497
498 r = parse_kml(profile, ss);
499 if (r)
500 return r;
501
502 r = parse(profile, ss);
503 if (r)
504 return r;
505
506 json_spirit::mArray description;
507 r = layers_description(profile, &description, ss);
508 if (r)
509 return r;
510
511 string description_string = profile.find("layers")->second;
512
513 dout(10) << "init(" << description_string << ")" << dendl;
514
515 r = layers_parse(description_string, description, ss);
516 if (r)
517 return r;
518
519 r = layers_init(ss);
520 if (r)
521 return r;
522
523 if (profile.count("mapping") == 0) {
524 *ss << "the 'mapping' profile is missing from " << profile;
525 return ERROR_LRC_MAPPING;
526 }
527 string mapping = profile.find("mapping")->second;
11fdf7f2 528 data_chunk_count = count(begin(mapping), end(mapping), 'D');
7c673cae
FG
529 chunk_count = mapping.length();
530
531 r = layers_sanity_checks(description_string, ss);
532 if (r)
533 return r;
534
535 //
536 // When initialized with kml, the profile parameters
537 // that were generated should not be stored because
538 // they would otherwise be exposed to the caller.
539 //
540 if (profile.find("l") != profile.end() &&
541 profile.find("l")->second != DEFAULT_KML) {
542 profile.erase("mapping");
543 profile.erase("layers");
544 }
545 ErasureCode::init(profile, ss);
546 return 0;
547}
548
549set<int> ErasureCodeLrc::get_erasures(const set<int> &want,
550 const set<int> &available) const
551{
552 set<int> result;
553 set_difference(want.begin(), want.end(),
554 available.begin(), available.end(),
555 inserter(result, result.end()));
556 return result;
557}
558
559unsigned int ErasureCodeLrc::get_chunk_size(unsigned int object_size) const
560{
561 return layers.front().erasure_code->get_chunk_size(object_size);
562}
563
564void p(const set<int> &s) { cerr << s; } // for gdb
565
11fdf7f2
TL
566int ErasureCodeLrc::_minimum_to_decode(const set<int> &want_to_read,
567 const set<int> &available_chunks,
568 set<int> *minimum)
7c673cae
FG
569{
570 dout(20) << __func__ << " want_to_read " << want_to_read
571 << " available_chunks " << available_chunks << dendl;
572 {
573 set<int> erasures_total;
574 set<int> erasures_not_recovered;
575 set<int> erasures_want;
576 for (unsigned int i = 0; i < get_chunk_count(); ++i) {
577 if (available_chunks.count(i) == 0) {
578 erasures_total.insert(i);
579 erasures_not_recovered.insert(i);
580 if (want_to_read.count(i) != 0)
581 erasures_want.insert(i);
582 }
583 }
584
585 //
586 // Case 1:
587 //
588 // When no chunk is missing there is no need to read more than what
589 // is wanted.
590 //
591 if (erasures_want.empty()) {
592 *minimum = want_to_read;
593 dout(20) << __func__ << " minimum == want_to_read == "
594 << want_to_read << dendl;
595 return 0;
596 }
597
598 //
599 // Case 2:
600 //
601 // Try to recover erasures with as few chunks as possible.
602 //
603 for (vector<Layer>::reverse_iterator i = layers.rbegin();
604 i != layers.rend();
605 ++i) {
606 //
607 // If this layer has no chunk that we want, skip it.
608 //
609 set<int> layer_want;
610 set_intersection(want_to_read.begin(), want_to_read.end(),
611 i->chunks_as_set.begin(), i->chunks_as_set.end(),
612 inserter(layer_want, layer_want.end()));
613 if (layer_want.empty())
614 continue;
615 //
616 // Are some of the chunks we want missing ?
617 //
618 set<int> layer_erasures;
619 set_intersection(layer_want.begin(), layer_want.end(),
620 erasures_want.begin(), erasures_want.end(),
621 inserter(layer_erasures, layer_erasures.end()));
622 set<int> layer_minimum;
623 if (layer_erasures.empty()) {
624 //
625 // The chunks we want are available, this is the minimum we need
626 // to read.
627 //
628 layer_minimum = layer_want;
629 } else {
630 set<int> erasures;
631 set_intersection(i->chunks_as_set.begin(), i->chunks_as_set.end(),
632 erasures_not_recovered.begin(), erasures_not_recovered.end(),
633 inserter(erasures, erasures.end()));
634
635 if (erasures.size() > i->erasure_code->get_coding_chunk_count()) {
636 //
637 // There are too many erasures for this layer to recover: skip
638 // it and hope that an upper layer will be do better.
639 //
640 continue;
641 } else {
642 //
643 // Get all available chunks in that layer to recover the
644 // missing one(s).
645 //
646 set_difference(i->chunks_as_set.begin(), i->chunks_as_set.end(),
647 erasures_not_recovered.begin(), erasures_not_recovered.end(),
648 inserter(layer_minimum, layer_minimum.end()));
649 //
650 // Chunks recovered by this layer are removed from the list of
651 // erasures so that upper levels do not attempt to recover
652 // them.
653 //
654 for (set<int>::const_iterator j = erasures.begin();
655 j != erasures.end();
656 ++j) {
657 erasures_not_recovered.erase(*j);
11fdf7f2 658 erasures_want.erase(*j);
7c673cae
FG
659 }
660 }
661 }
662 minimum->insert(layer_minimum.begin(), layer_minimum.end());
663 }
664 if (erasures_want.empty()) {
665 minimum->insert(want_to_read.begin(), want_to_read.end());
666 for (set<int>::const_iterator i = erasures_total.begin();
667 i != erasures_total.end();
668 ++i) {
669 if (minimum->count(*i))
670 minimum->erase(*i);
671 }
672 dout(20) << __func__ << " minimum = " << *minimum << dendl;
673 return 0;
674 }
675 }
676
677 {
678 //
679 // Case 3:
680 //
681 // The previous strategy failed to recover from all erasures.
682 //
683 // Try to recover as many chunks as possible, even from layers
684 // that do not contain chunks that we want, in the hope that it
685 // will help the upper layers.
686 //
687 set<int> erasures_total;
688 for (unsigned int i = 0; i < get_chunk_count(); ++i) {
689 if (available_chunks.count(i) == 0)
690 erasures_total.insert(i);
691 }
692
693 for (vector<Layer>::reverse_iterator i = layers.rbegin();
694 i != layers.rend();
695 ++i) {
696 set<int> layer_erasures;
697 set_intersection(i->chunks_as_set.begin(), i->chunks_as_set.end(),
698 erasures_total.begin(), erasures_total.end(),
699 inserter(layer_erasures, layer_erasures.end()));
700 //
701 // If this layer has no erasure, skip it
702 //
703 if (layer_erasures.empty())
704 continue;
705
706 if (layer_erasures.size() > 0 &&
707 layer_erasures.size() <= i->erasure_code->get_coding_chunk_count()) {
708 //
709 // chunks recovered by this layer are removed from the list of
710 // erasures so that upper levels know they can rely on their
711 // availability
712 //
713 for (set<int>::const_iterator j = layer_erasures.begin();
714 j != layer_erasures.end();
715 ++j) {
716 erasures_total.erase(*j);
717 }
718 }
719 }
720 if (erasures_total.empty()) {
721 //
722 // Do not try to be smart about what chunks are necessary to
723 // recover, use all available chunks.
724 //
725 *minimum = available_chunks;
726 dout(20) << __func__ << " minimum == available_chunks == "
727 << available_chunks << dendl;
728 return 0;
729 }
730 }
731
732 derr << __func__ << " not enough chunks in " << available_chunks
733 << " to read " << want_to_read << dendl;
734 return -EIO;
735}
736
737int ErasureCodeLrc::encode_chunks(const set<int> &want_to_encode,
738 map<int, bufferlist> *encoded)
739{
740 unsigned int top = layers.size();
741 for (vector<Layer>::reverse_iterator i = layers.rbegin();
742 i != layers.rend();
743 ++i) {
744 --top;
745 if (includes(i->chunks_as_set.begin(), i->chunks_as_set.end(),
746 want_to_encode.begin(), want_to_encode.end()))
747 break;
748 }
749
750 for (unsigned int i = top; i < layers.size(); ++i) {
751 const Layer &layer = layers[i];
752 set<int> layer_want_to_encode;
753 map<int, bufferlist> layer_encoded;
754 int j = 0;
11fdf7f2
TL
755 for (const auto& c : layer.chunks) {
756 std::swap(layer_encoded[j], (*encoded)[c]);
757 if (want_to_encode.find(c) != want_to_encode.end())
7c673cae
FG
758 layer_want_to_encode.insert(j);
759 j++;
760 }
761 int err = layer.erasure_code->encode_chunks(layer_want_to_encode,
762 &layer_encoded);
11fdf7f2
TL
763 j = 0;
764 for (const auto& c : layer.chunks) {
765 std::swap(layer_encoded[j++], (*encoded)[c]);
766 }
7c673cae
FG
767 if (err) {
768 derr << __func__ << " layer " << layer.chunks_map
769 << " failed with " << err << " trying to encode "
770 << layer_want_to_encode << dendl;
771 return err;
772 }
773 }
774 return 0;
775}
776
777int ErasureCodeLrc::decode_chunks(const set<int> &want_to_read,
778 const map<int, bufferlist> &chunks,
779 map<int, bufferlist> *decoded)
780{
781 set<int> available_chunks;
782 set<int> erasures;
783 for (unsigned int i = 0; i < get_chunk_count(); ++i) {
784 if (chunks.count(i) != 0)
785 available_chunks.insert(i);
786 else
787 erasures.insert(i);
788 }
789
790 set<int> want_to_read_erasures;
791
792 for (vector<Layer>::reverse_iterator layer = layers.rbegin();
793 layer != layers.rend();
794 ++layer) {
795 set<int> layer_erasures;
796 set_intersection(layer->chunks_as_set.begin(), layer->chunks_as_set.end(),
797 erasures.begin(), erasures.end(),
798 inserter(layer_erasures, layer_erasures.end()));
799
800 if (layer_erasures.size() >
801 layer->erasure_code->get_coding_chunk_count()) {
802 // skip because there are too many erasures for this layer to recover
803 } else if(layer_erasures.size() == 0) {
804 // skip because all chunks are already available
805 } else {
806 set<int> layer_want_to_read;
807 map<int, bufferlist> layer_chunks;
808 map<int, bufferlist> layer_decoded;
809 int j = 0;
810 for (vector<int>::const_iterator c = layer->chunks.begin();
811 c != layer->chunks.end();
812 ++c) {
813 //
814 // Pick chunks from *decoded* instead of *chunks* to re-use
815 // chunks recovered by previous layers. In other words
816 // *chunks* does not change but *decoded* gradually improves
817 // as more layers recover from erasures.
818 //
819 if (erasures.count(*c) == 0)
820 layer_chunks[j] = (*decoded)[*c];
821 if (want_to_read.count(*c) != 0)
822 layer_want_to_read.insert(j);
823 layer_decoded[j] = (*decoded)[*c];
824 ++j;
825 }
826 int err = layer->erasure_code->decode_chunks(layer_want_to_read,
827 layer_chunks,
828 &layer_decoded);
829 if (err) {
830 derr << __func__ << " layer " << layer->chunks_map
831 << " failed with " << err << " trying to decode "
832 << layer_want_to_read << " with " << available_chunks << dendl;
833 return err;
834 }
835 j = 0;
836 for (vector<int>::const_iterator c = layer->chunks.begin();
837 c != layer->chunks.end();
838 ++c) {
839 (*decoded)[*c] = layer_decoded[j];
840 ++j;
11fdf7f2 841 erasures.erase(*c);
7c673cae
FG
842 }
843 want_to_read_erasures.clear();
844 set_intersection(erasures.begin(), erasures.end(),
845 want_to_read.begin(), want_to_read.end(),
846 inserter(want_to_read_erasures, want_to_read_erasures.end()));
847 if (want_to_read_erasures.size() == 0)
848 break;
849 }
850 }
851
852 if (want_to_read_erasures.size() > 0) {
853 derr << __func__ << " want to read " << want_to_read
854 << " with available_chunks = " << available_chunks
855 << " end up being unable to read " << want_to_read_erasures << dendl;
856 return -EIO;
857 } else {
858 return 0;
859 }
860}