]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/CrushWrapper.cc
2891056c90564c9e431b7d35ca44d14a68f5fc62
[ceph.git] / ceph / src / crush / CrushWrapper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
10
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
13
14 #define dout_subsys ceph_subsys_crush
15
16 bool CrushWrapper::has_legacy_rulesets() const
17 {
18 for (unsigned i=0; i<crush->max_rules; i++) {
19 crush_rule *r = crush->rules[i];
20 if (r &&
21 r->mask.ruleset != i) {
22 return true;
23 }
24 }
25 return false;
26 }
27
28 int CrushWrapper::renumber_rules_by_ruleset()
29 {
30 int max_ruleset = 0;
31 for (unsigned i=0; i<crush->max_rules; i++) {
32 crush_rule *r = crush->rules[i];
33 if (r && r->mask.ruleset >= max_ruleset) {
34 max_ruleset = r->mask.ruleset + 1;
35 }
36 }
37 struct crush_rule **newrules =
38 (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
39 for (unsigned i=0; i<crush->max_rules; i++) {
40 crush_rule *r = crush->rules[i];
41 if (!r)
42 continue;
43 if (newrules[r->mask.ruleset]) {
44 // collision, we can't do it.
45 free(newrules);
46 return -EINVAL;
47 }
48 newrules[r->mask.ruleset] = r;
49 }
50
51 // success, swap!
52 free(crush->rules);
53 crush->rules = newrules;
54 crush->max_rules = max_ruleset;
55 return 0;
56 }
57
58 bool CrushWrapper::has_multirule_rulesets() const
59 {
60 for (unsigned i=0; i<crush->max_rules; i++) {
61 crush_rule *r = crush->rules[i];
62 if (!r)
63 continue;
64 for (unsigned j=i+1; j<crush->max_rules; j++) {
65 crush_rule *s = crush->rules[j];
66 if (!s)
67 continue;
68 if (r->mask.ruleset == s->mask.ruleset)
69 return true;
70 }
71 }
72 return false;
73 }
74
75 bool CrushWrapper::has_non_straw2_buckets() const
76 {
77 for (int i=0; i<crush->max_buckets; ++i) {
78 crush_bucket *b = crush->buckets[i];
79 if (!b)
80 continue;
81 if (b->alg != CRUSH_BUCKET_STRAW2)
82 return true;
83 }
84 return false;
85 }
86
87 bool CrushWrapper::has_v2_rules() const
88 {
89 for (unsigned i=0; i<crush->max_rules; i++) {
90 if (is_v2_rule(i)) {
91 return true;
92 }
93 }
94 return false;
95 }
96
97 bool CrushWrapper::is_v2_rule(unsigned ruleid) const
98 {
99 // check rule for use of indep or new SET_* rule steps
100 if (ruleid >= crush->max_rules)
101 return false;
102 crush_rule *r = crush->rules[ruleid];
103 if (!r)
104 return false;
105 for (unsigned j=0; j<r->len; j++) {
106 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
107 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
108 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
109 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
110 return true;
111 }
112 }
113 return false;
114 }
115
116 bool CrushWrapper::has_v3_rules() const
117 {
118 for (unsigned i=0; i<crush->max_rules; i++) {
119 if (is_v3_rule(i)) {
120 return true;
121 }
122 }
123 return false;
124 }
125
126 bool CrushWrapper::is_v3_rule(unsigned ruleid) const
127 {
128 // check rule for use of SET_CHOOSELEAF_VARY_R step
129 if (ruleid >= crush->max_rules)
130 return false;
131 crush_rule *r = crush->rules[ruleid];
132 if (!r)
133 return false;
134 for (unsigned j=0; j<r->len; j++) {
135 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
136 return true;
137 }
138 }
139 return false;
140 }
141
142 bool CrushWrapper::has_v4_buckets() const
143 {
144 for (int i=0; i<crush->max_buckets; ++i) {
145 crush_bucket *b = crush->buckets[i];
146 if (!b)
147 continue;
148 if (b->alg == CRUSH_BUCKET_STRAW2)
149 return true;
150 }
151 return false;
152 }
153
154 bool CrushWrapper::has_v5_rules() const
155 {
156 for (unsigned i=0; i<crush->max_rules; i++) {
157 if (is_v5_rule(i)) {
158 return true;
159 }
160 }
161 return false;
162 }
163
164 bool CrushWrapper::is_v5_rule(unsigned ruleid) const
165 {
166 // check rule for use of SET_CHOOSELEAF_STABLE step
167 if (ruleid >= crush->max_rules)
168 return false;
169 crush_rule *r = crush->rules[ruleid];
170 if (!r)
171 return false;
172 for (unsigned j=0; j<r->len; j++) {
173 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
174 return true;
175 }
176 }
177 return false;
178 }
179
180 bool CrushWrapper::has_choose_args() const
181 {
182 return !choose_args.empty();
183 }
184
185 bool CrushWrapper::has_incompat_choose_args() const
186 {
187 if (choose_args.empty())
188 return false;
189 if (choose_args.size() > 1)
190 return true;
191 if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
192 return true;
193 crush_choose_arg_map arg_map = choose_args.begin()->second;
194 for (__u32 i = 0; i < arg_map.size; i++) {
195 crush_choose_arg *arg = &arg_map.args[i];
196 if (arg->weight_set_size == 0 &&
197 arg->ids_size == 0)
198 continue;
199 if (arg->weight_set_size != 1)
200 return true;
201 if (arg->ids_size != 0)
202 return true;
203 }
204 return false;
205 }
206
207 int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
208 {
209 if (!item_exists(i))
210 return -EINVAL;
211 string name = get_item_name(i);
212 size_t pos = name.find("~");
213 if (pos == string::npos) {
214 *idout = i;
215 *classout = -1;
216 return 0;
217 }
218 string name_no_class = name.substr(0, pos);
219 if (!name_exists(name_no_class))
220 return -ENOENT;
221 string class_name = name.substr(pos + 1);
222 if (!class_exists(class_name))
223 return -ENOENT;
224 *idout = get_item_id(name_no_class);
225 *classout = get_class_id(class_name);
226 return 0;
227 }
228
229 int CrushWrapper::can_rename_item(const string& srcname,
230 const string& dstname,
231 ostream *ss) const
232 {
233 if (name_exists(srcname)) {
234 if (name_exists(dstname)) {
235 *ss << "dstname = '" << dstname << "' already exists";
236 return -EEXIST;
237 }
238 if (is_valid_crush_name(dstname)) {
239 return 0;
240 } else {
241 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
242 return -EINVAL;
243 }
244 } else {
245 if (name_exists(dstname)) {
246 *ss << "srcname = '" << srcname << "' does not exist "
247 << "and dstname = '" << dstname << "' already exists";
248 return -EALREADY;
249 } else {
250 *ss << "srcname = '" << srcname << "' does not exist";
251 return -ENOENT;
252 }
253 }
254 }
255
256 int CrushWrapper::rename_item(const string& srcname,
257 const string& dstname,
258 ostream *ss)
259 {
260 int ret = can_rename_item(srcname, dstname, ss);
261 if (ret < 0)
262 return ret;
263 int oldid = get_item_id(srcname);
264 return set_item_name(oldid, dstname);
265 }
266
267 int CrushWrapper::can_rename_bucket(const string& srcname,
268 const string& dstname,
269 ostream *ss) const
270 {
271 int ret = can_rename_item(srcname, dstname, ss);
272 if (ret)
273 return ret;
274 int srcid = get_item_id(srcname);
275 if (srcid >= 0) {
276 *ss << "srcname = '" << srcname << "' is not a bucket "
277 << "because its id = " << srcid << " is >= 0";
278 return -ENOTDIR;
279 }
280 return 0;
281 }
282
283 int CrushWrapper::rename_bucket(const string& srcname,
284 const string& dstname,
285 ostream *ss)
286 {
287 int ret = can_rename_bucket(srcname, dstname, ss);
288 if (ret < 0)
289 return ret;
290 int oldid = get_item_id(srcname);
291 return set_item_name(oldid, dstname);
292 }
293
294 void CrushWrapper::find_takes(set<int>& roots) const
295 {
296 for (unsigned i=0; i<crush->max_rules; i++) {
297 crush_rule *r = crush->rules[i];
298 if (!r)
299 continue;
300 for (unsigned j=0; j<r->len; j++) {
301 if (r->steps[j].op == CRUSH_RULE_TAKE)
302 roots.insert(r->steps[j].arg1);
303 }
304 }
305 }
306
307 void CrushWrapper::find_roots(set<int>& roots) const
308 {
309 for (int i = 0; i < crush->max_buckets; i++) {
310 if (!crush->buckets[i])
311 continue;
312 crush_bucket *b = crush->buckets[i];
313 if (!_search_item_exists(b->id))
314 roots.insert(b->id);
315 }
316 }
317
318 bool CrushWrapper::subtree_contains(int root, int item) const
319 {
320 if (root == item)
321 return true;
322
323 if (root >= 0)
324 return false; // root is a leaf
325
326 const crush_bucket *b = get_bucket(root);
327 if (IS_ERR(b))
328 return false;
329
330 for (unsigned j=0; j<b->size; j++) {
331 if (subtree_contains(b->items[j], item))
332 return true;
333 }
334 return false;
335 }
336
337 bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
338 {
339 // last instance?
340 if (_search_item_exists(item)) {
341 return false;
342 }
343 if (item < 0 && _bucket_is_in_use(item)) {
344 return false;
345 }
346
347 if (item < 0 && !unlink_only) {
348 crush_bucket *t = get_bucket(item);
349 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
350 crush_remove_bucket(crush, t);
351 if (class_bucket.count(item) != 0)
352 class_bucket.erase(item);
353 class_remove_item(item);
354 }
355 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
356 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
357 name_map.erase(item);
358 have_rmaps = false;
359 if (item >= 0 && !unlink_only) {
360 class_remove_item(item);
361 }
362 }
363 return true;
364 }
365
366 int CrushWrapper::remove_root(int item, bool unused)
367 {
368 if (unused && _bucket_is_in_use(item))
369 return 0;
370
371 crush_bucket *b = get_bucket(item);
372 if (IS_ERR(b))
373 return -ENOENT;
374
375 for (unsigned n = 0; n < b->size; n++) {
376 if (b->items[n] >= 0)
377 continue;
378 int r = remove_root(b->items[n], unused);
379 if (r < 0)
380 return r;
381 }
382
383 crush_remove_bucket(crush, b);
384 if (name_map.count(item) != 0) {
385 name_map.erase(item);
386 have_rmaps = false;
387 }
388 if (class_bucket.count(item) != 0)
389 class_bucket.erase(item);
390 class_remove_item(item);
391 return 0;
392 }
393
394 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
395 {
396 ldout(cct, 5) << "remove_item " << item
397 << (unlink_only ? " unlink_only":"") << dendl;
398
399 int ret = -ENOENT;
400
401 if (item < 0 && !unlink_only) {
402 crush_bucket *t = get_bucket(item);
403 if (IS_ERR(t)) {
404 ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
405 << dendl;
406 return -ENOENT;
407 }
408
409 if (t->size) {
410 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
411 << " items, not empty" << dendl;
412 return -ENOTEMPTY;
413 }
414 if (_bucket_is_in_use(item)) {
415 return -EBUSY;
416 }
417 }
418
419 for (int i = 0; i < crush->max_buckets; i++) {
420 if (!crush->buckets[i])
421 continue;
422 crush_bucket *b = crush->buckets[i];
423
424 for (unsigned i=0; i<b->size; ++i) {
425 int id = b->items[i];
426 if (id == item) {
427 ldout(cct, 5) << "remove_item removing item " << item
428 << " from bucket " << b->id << dendl;
429 for (auto& p : choose_args) {
430 // weight down each weight-set to 0 before we remove the item
431 vector<int> weightv(get_choose_args_positions(p.second), 0);
432 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
433 }
434 bucket_remove_item(b, item);
435 adjust_item_weight(cct, b->id, b->weight);
436 ret = 0;
437 }
438 }
439 }
440
441 if (_maybe_remove_last_instance(cct, item, unlink_only))
442 ret = 0;
443
444 return ret;
445 }
446
447 bool CrushWrapper::_search_item_exists(int item) const
448 {
449 for (int i = 0; i < crush->max_buckets; i++) {
450 if (!crush->buckets[i])
451 continue;
452 crush_bucket *b = crush->buckets[i];
453 for (unsigned j=0; j<b->size; ++j) {
454 if (b->items[j] == item)
455 return true;
456 }
457 }
458 return false;
459 }
460
461 bool CrushWrapper::_bucket_is_in_use(int item)
462 {
463 for (auto &i : class_bucket)
464 for (auto &j : i.second)
465 if (j.second == item)
466 return true;
467 for (unsigned i = 0; i < crush->max_rules; ++i) {
468 crush_rule *r = crush->rules[i];
469 if (!r)
470 continue;
471 for (unsigned j = 0; j < r->len; ++j) {
472 if (r->steps[j].op == CRUSH_RULE_TAKE) {
473 int step_item = r->steps[j].arg1;
474 int original_item;
475 int c;
476 int res = split_id_class(step_item, &original_item, &c);
477 if (res < 0)
478 return false;
479 if (step_item == item || original_item == item)
480 return true;
481 }
482 }
483 }
484 return false;
485 }
486
487 int CrushWrapper::_remove_item_under(
488 CephContext *cct, int item, int ancestor, bool unlink_only)
489 {
490 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
491 << (unlink_only ? " unlink_only":"") << dendl;
492
493 if (ancestor >= 0) {
494 return -EINVAL;
495 }
496
497 if (!bucket_exists(ancestor))
498 return -EINVAL;
499
500 int ret = -ENOENT;
501
502 crush_bucket *b = get_bucket(ancestor);
503 for (unsigned i=0; i<b->size; ++i) {
504 int id = b->items[i];
505 if (id == item) {
506 ldout(cct, 5) << "_remove_item_under removing item " << item
507 << " from bucket " << b->id << dendl;
508 bucket_remove_item(b, item);
509 for (auto& p : choose_args) {
510 // weight down each weight-set to 0 before we remove the item
511 vector<int> weightv(get_choose_args_positions(p.second), 0);
512 _choose_args_adjust_item_weight_in_bucket(
513 cct, p.second, b->id, item, weightv, nullptr);
514 }
515 adjust_item_weight(cct, b->id, b->weight);
516 ret = 0;
517 } else if (id < 0) {
518 int r = remove_item_under(cct, item, id, unlink_only);
519 if (r == 0)
520 ret = 0;
521 }
522 }
523 return ret;
524 }
525
526 int CrushWrapper::remove_item_under(
527 CephContext *cct, int item, int ancestor, bool unlink_only)
528 {
529 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
530 << (unlink_only ? " unlink_only":"") << dendl;
531
532 if (!unlink_only && _bucket_is_in_use(item)) {
533 return -EBUSY;
534 }
535
536 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
537 if (ret < 0)
538 return ret;
539
540 if (item < 0 && !unlink_only) {
541 crush_bucket *t = get_bucket(item);
542 if (IS_ERR(t)) {
543 ldout(cct, 1) << "remove_item_under bucket " << item
544 << " does not exist" << dendl;
545 return -ENOENT;
546 }
547
548 if (t->size) {
549 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
550 << " items, not empty" << dendl;
551 return -ENOTEMPTY;
552 }
553 }
554
555 if (_maybe_remove_last_instance(cct, item, unlink_only))
556 ret = 0;
557
558 return ret;
559 }
560
561 int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
562 const std::multimap<string,string>& loc)
563 {
564 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
565 if (!item_exists(id))
566 return -ENOENT;
567 map<string,string> id_loc = get_full_location(id);
568 ldout(cct, 20) << " id is at " << id_loc << dendl;
569
570 for (map<int,string>::const_iterator p = type_map.begin();
571 p != type_map.end();
572 ++p) {
573 map<string,string>::iterator ip = id_loc.find(p->second);
574 if (ip == id_loc.end())
575 continue;
576 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
577 q != loc.end();
578 ++q) {
579 if (q->first != p->second)
580 break;
581 if (q->second == ip->second)
582 return p->first;
583 }
584 }
585 return -ERANGE;
586 }
587
588 int CrushWrapper::parse_loc_map(const std::vector<string>& args,
589 std::map<string,string> *ploc)
590 {
591 ploc->clear();
592 for (unsigned i = 0; i < args.size(); ++i) {
593 const char *s = args[i].c_str();
594 const char *pos = strchr(s, '=');
595 if (!pos)
596 return -EINVAL;
597 string key(s, 0, pos-s);
598 string value(pos+1);
599 if (value.length())
600 (*ploc)[key] = value;
601 else
602 return -EINVAL;
603 }
604 return 0;
605 }
606
607 int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
608 std::multimap<string,string> *ploc)
609 {
610 ploc->clear();
611 for (unsigned i = 0; i < args.size(); ++i) {
612 const char *s = args[i].c_str();
613 const char *pos = strchr(s, '=');
614 if (!pos)
615 return -EINVAL;
616 string key(s, 0, pos-s);
617 string value(pos+1);
618 if (value.length())
619 ploc->insert(make_pair(key, value));
620 else
621 return -EINVAL;
622 }
623 return 0;
624 }
625
626 bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
627 int *weight)
628 {
629 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
630
631 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
632 // ignore device
633 if (p->first == 0)
634 continue;
635
636 // ignore types that aren't specified in loc
637 map<string,string>::const_iterator q = loc.find(p->second);
638 if (q == loc.end()) {
639 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
640 << type_map << ")" << dendl;
641 continue;
642 }
643
644 if (!name_exists(q->second)) {
645 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
646 return false;
647 }
648
649 int id = get_item_id(q->second);
650 if (id >= 0) {
651 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
652 << " is a device, not bucket" << dendl;
653 return false;
654 }
655
656 assert(bucket_exists(id));
657 crush_bucket *b = get_bucket(id);
658
659 // see if item exists in this bucket
660 for (unsigned j=0; j<b->size; j++) {
661 if (b->items[j] == item) {
662 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
663 if (weight)
664 *weight = crush_get_bucket_item_weight(b, j);
665 return true;
666 }
667 }
668 return false;
669 }
670
671 ldout(cct, 1) << "check_item_loc item " << item << " loc " << loc << dendl;
672 return false;
673 }
674
675 map<string, string> CrushWrapper::get_full_location(int id)
676 {
677 vector<pair<string, string> > full_location_ordered;
678 map<string,string> full_location;
679
680 get_full_location_ordered(id, full_location_ordered);
681
682 std::copy(full_location_ordered.begin(),
683 full_location_ordered.end(),
684 std::inserter(full_location, full_location.begin()));
685
686 return full_location;
687 }
688
689 int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path)
690 {
691 if (!item_exists(id))
692 return -ENOENT;
693 int cur = id;
694 int ret;
695 while (true) {
696 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
697 if (ret != 0)
698 break;
699 path.push_back(parent_coord);
700 cur = get_item_id(parent_coord.second);
701 }
702 return 0;
703 }
704
705 string CrushWrapper::get_full_location_ordered_string(int id)
706 {
707 vector<pair<string, string> > full_location_ordered;
708 string full_location;
709 get_full_location_ordered(id, full_location_ordered);
710 reverse(begin(full_location_ordered), end(full_location_ordered));
711 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
712 full_location = full_location + i->first + "=" + i->second;
713 if (i != full_location_ordered.end() - 1) {
714 full_location = full_location + ",";
715 }
716 }
717 return full_location;
718 }
719
720 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
721 {
722 map<int,string> parent_hierarchy;
723 pair<string, string> parent_coord = get_immediate_parent(id);
724 int parent_id;
725
726 // get the integer type for id and create a counter from there
727 int type_counter = get_bucket_type(id);
728
729 // if we get a negative type then we can assume that we have an OSD
730 // change behavior in get_item_type FIXME
731 if (type_counter < 0)
732 type_counter = 0;
733
734 // read the type map and get the name of the type with the largest ID
735 int high_type = 0;
736 for (map<int, string>::iterator it = type_map.begin(); it != type_map.end(); ++it){
737 if ( (*it).first > high_type )
738 high_type = (*it).first;
739 }
740
741 parent_id = get_item_id(parent_coord.second);
742
743 while (type_counter < high_type) {
744 type_counter++;
745 parent_hierarchy[ type_counter ] = parent_coord.first;
746
747 if (type_counter < high_type){
748 // get the coordinate information for the next parent
749 parent_coord = get_immediate_parent(parent_id);
750 parent_id = get_item_id(parent_coord.second);
751 }
752 }
753
754 return parent_hierarchy;
755 }
756
757 int CrushWrapper::get_children(int id, list<int> *children)
758 {
759 // leaf?
760 if (id >= 0) {
761 return 0;
762 }
763
764 crush_bucket *b = get_bucket(id);
765 if (IS_ERR(b)) {
766 return -ENOENT;
767 }
768
769 for (unsigned n=0; n<b->size; n++) {
770 children->push_back(b->items[n]);
771 }
772 return b->size;
773 }
774
775 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
776 {
777 assert(leaves);
778
779 // Already leaf?
780 if (id >= 0) {
781 leaves->push_back(id);
782 return 0;
783 }
784
785 crush_bucket *b = get_bucket(id);
786 if (IS_ERR(b)) {
787 return -ENOENT;
788 }
789
790 for (unsigned n = 0; n < b->size; n++) {
791 if (b->items[n] >= 0) {
792 leaves->push_back(b->items[n]);
793 } else {
794 // is a bucket, do recursive call
795 int r = _get_leaves(b->items[n], leaves);
796 if (r < 0) {
797 return r;
798 }
799 }
800 }
801
802 return 0; // all is well
803 }
804
805 int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
806 {
807 assert(leaves);
808 leaves->clear();
809
810 if (!name_exists(name)) {
811 return -ENOENT;
812 }
813
814 int id = get_item_id(name);
815 if (id >= 0) {
816 // already leaf
817 leaves->insert(id);
818 return 0;
819 }
820
821 list<int> unordered;
822 int r = _get_leaves(id, &unordered);
823 if (r < 0) {
824 return r;
825 }
826
827 for (auto &p : unordered) {
828 leaves->insert(p);
829 }
830
831 return 0;
832 }
833
834 int CrushWrapper::insert_item(
835 CephContext *cct, int item, float weight, string name,
836 const map<string,string>& loc) // typename -> bucketname
837 {
838 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
839 << " name " << name << " loc " << loc << dendl;
840
841 if (!is_valid_crush_name(name))
842 return -EINVAL;
843
844 if (!is_valid_crush_loc(cct, loc))
845 return -EINVAL;
846
847 int r = validate_weightf(weight);
848 if (r < 0) {
849 return r;
850 }
851
852 if (name_exists(name)) {
853 if (get_item_id(name) != item) {
854 ldout(cct, 10) << "device name '" << name << "' already exists as id "
855 << get_item_id(name) << dendl;
856 return -EEXIST;
857 }
858 } else {
859 set_item_name(item, name);
860 }
861
862 int cur = item;
863
864 // create locations if locations don't exist and add child in
865 // location with 0 weight the more detail in the insert_item method
866 // declaration in CrushWrapper.h
867 for (auto p = type_map.begin(); p != type_map.end(); ++p) {
868 // ignore device type
869 if (p->first == 0)
870 continue;
871
872 // skip types that are unspecified
873 map<string,string>::const_iterator q = loc.find(p->second);
874 if (q == loc.end()) {
875 ldout(cct, 2) << "warning: did not specify location for '"
876 << p->second << "' level (levels are "
877 << type_map << ")" << dendl;
878 continue;
879 }
880
881 if (!name_exists(q->second)) {
882 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
883 int empty = 0, newid;
884 int r = add_bucket(0, 0,
885 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
886 if (r < 0) {
887 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
888 << dendl;
889 return r;
890 }
891 set_item_name(newid, q->second);
892
893 cur = newid;
894 continue;
895 }
896
897 // add to an existing bucket
898 int id = get_item_id(q->second);
899 if (!bucket_exists(id)) {
900 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
901 return -EINVAL;
902 }
903
904 // check that we aren't creating a cycle.
905 if (subtree_contains(id, cur)) {
906 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
907 << id << dendl;
908 return -EINVAL;
909 }
910
911 // we have done sanity check above
912 crush_bucket *b = get_bucket(id);
913
914 if (p->first != b->type) {
915 ldout(cct, 1) << "insert_item existing bucket has type "
916 << "'" << type_map[b->type] << "' != "
917 << "'" << type_map[p->first] << "'" << dendl;
918 return -EINVAL;
919 }
920
921 // are we forming a loop?
922 if (subtree_contains(cur, b->id)) {
923 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
924 << "; cannot form loop" << dendl;
925 return -ELOOP;
926 }
927
928 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
929 << " to bucket " << id << dendl;
930 int r = bucket_add_item(b, cur, 0);
931 assert (!r);
932 break;
933 }
934
935 // adjust the item's weight in location
936 if (adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
937 if (item >= crush->max_devices) {
938 crush->max_devices = item + 1;
939 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
940 << dendl;
941 }
942 r = rebuild_roots_with_classes();
943 if (r < 0) {
944 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
945 << cpp_strerror(r) << dendl;
946 return r;
947 }
948 return 0;
949 }
950
951 ldout(cct, 1) << "error: didn't find anywhere to add item " << item
952 << " in " << loc << dendl;
953 return -EINVAL;
954 }
955
956
957 int CrushWrapper::move_bucket(
958 CephContext *cct, int id, const map<string,string>& loc)
959 {
960 // sorry this only works for buckets
961 if (id >= 0)
962 return -EINVAL;
963
964 if (!item_exists(id))
965 return -ENOENT;
966
967 // get the name of the bucket we are trying to move for later
968 string id_name = get_item_name(id);
969
970 // detach the bucket
971 int bucket_weight = detach_bucket(cct, id);
972
973 // insert the bucket back into the hierarchy
974 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
975 }
976
977 int CrushWrapper::detach_bucket(CephContext *cct, int item)
978 {
979 if (!crush)
980 return (-EINVAL);
981
982 if (item >= 0)
983 return (-EINVAL);
984
985 // check that the bucket that we want to detach exists
986 assert(bucket_exists(item));
987
988 // get the bucket's weight
989 crush_bucket *b = get_bucket(item);
990 unsigned bucket_weight = b->weight;
991
992 // get where the bucket is located
993 pair<string, string> bucket_location = get_immediate_parent(item);
994
995 // get the id of the parent bucket
996 int parent_id = get_item_id(bucket_location.second);
997
998 // get the parent bucket
999 crush_bucket *parent_bucket = get_bucket(parent_id);
1000
1001 if (!IS_ERR(parent_bucket)) {
1002 // zero out the bucket weight
1003 bucket_adjust_item_weight(cct, parent_bucket, item, 0);
1004 adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
1005 for (auto& p : choose_args) {
1006 // weight down each weight-set to 0 before we remove the item
1007 vector<int> weightv(get_choose_args_positions(p.second), 0);
1008 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
1009 }
1010
1011 // remove the bucket from the parent
1012 bucket_remove_item(parent_bucket, item);
1013 } else if (PTR_ERR(parent_bucket) != -ENOENT) {
1014 return PTR_ERR(parent_bucket);
1015 }
1016
1017 // check that we're happy
1018 int test_weight = 0;
1019 map<string,string> test_location;
1020 test_location[ bucket_location.first ] = (bucket_location.second);
1021
1022 bool successful_detach = !(check_item_loc(cct, item, test_location,
1023 &test_weight));
1024 assert(successful_detach);
1025 assert(test_weight == 0);
1026
1027 return bucket_weight;
1028 }
1029
1030 int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
1031 {
1032 if (src >= 0 || dst >= 0)
1033 return -EINVAL;
1034 if (!item_exists(src) || !item_exists(dst))
1035 return -EINVAL;
1036 crush_bucket *a = get_bucket(src);
1037 crush_bucket *b = get_bucket(dst);
1038 unsigned aw = a->weight;
1039 unsigned bw = b->weight;
1040
1041 // swap weights
1042 adjust_item_weight(cct, a->id, bw);
1043 adjust_item_weight(cct, b->id, aw);
1044
1045 // swap items
1046 map<int,unsigned> tmp;
1047 unsigned as = a->size;
1048 unsigned bs = b->size;
1049 for (unsigned i = 0; i < as; ++i) {
1050 int item = a->items[0];
1051 int itemw = crush_get_bucket_item_weight(a, 0);
1052 tmp[item] = itemw;
1053 bucket_remove_item(a, item);
1054 }
1055 assert(a->size == 0);
1056 assert(b->size == bs);
1057 for (unsigned i = 0; i < bs; ++i) {
1058 int item = b->items[0];
1059 int itemw = crush_get_bucket_item_weight(b, 0);
1060 bucket_remove_item(b, item);
1061 bucket_add_item(a, item, itemw);
1062 }
1063 assert(a->size == bs);
1064 assert(b->size == 0);
1065 for (auto t : tmp) {
1066 bucket_add_item(b, t.first, t.second);
1067 }
1068 assert(a->size == bs);
1069 assert(b->size == as);
1070
1071 // swap names
1072 swap_names(src, dst);
1073 return 0;
1074 }
1075
1076 int CrushWrapper::link_bucket(
1077 CephContext *cct, int id, const map<string,string>& loc)
1078 {
1079 // sorry this only works for buckets
1080 if (id >= 0)
1081 return -EINVAL;
1082
1083 if (!item_exists(id))
1084 return -ENOENT;
1085
1086 // get the name of the bucket we are trying to move for later
1087 string id_name = get_item_name(id);
1088
1089 crush_bucket *b = get_bucket(id);
1090 unsigned bucket_weight = b->weight;
1091
1092 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1093 }
1094
1095 int CrushWrapper::create_or_move_item(
1096 CephContext *cct, int item, float weight, string name,
1097 const map<string,string>& loc) // typename -> bucketname
1098 {
1099 int ret = 0;
1100 int old_iweight;
1101
1102 if (!is_valid_crush_name(name))
1103 return -EINVAL;
1104
1105 if (check_item_loc(cct, item, loc, &old_iweight)) {
1106 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
1107 << dendl;
1108 } else {
1109 if (_search_item_exists(item)) {
1110 weight = get_item_weightf(item);
1111 ldout(cct, 10) << "create_or_move_item " << item
1112 << " exists with weight " << weight << dendl;
1113 remove_item(cct, item, true);
1114 }
1115 ldout(cct, 5) << "create_or_move_item adding " << item
1116 << " weight " << weight
1117 << " at " << loc << dendl;
1118 ret = insert_item(cct, item, weight, name, loc);
1119 if (ret == 0)
1120 ret = 1; // changed
1121 }
1122 return ret;
1123 }
1124
1125 int CrushWrapper::update_item(
1126 CephContext *cct, int item, float weight, string name,
1127 const map<string,string>& loc) // typename -> bucketname
1128 {
1129 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1130 << " name " << name << " loc " << loc << dendl;
1131 int ret = 0;
1132
1133 if (!is_valid_crush_name(name))
1134 return -EINVAL;
1135
1136 if (!is_valid_crush_loc(cct, loc))
1137 return -EINVAL;
1138
1139 ret = validate_weightf(weight);
1140 if (ret < 0) {
1141 return ret;
1142 }
1143
1144 // compare quantized (fixed-point integer) weights!
1145 int iweight = (int)(weight * (float)0x10000);
1146 int old_iweight;
1147 if (check_item_loc(cct, item, loc, &old_iweight)) {
1148 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1149 if (old_iweight != iweight) {
1150 ldout(cct, 5) << "update_item " << item << " adjusting weight "
1151 << ((float)old_iweight/(float)0x10000) << " -> " << weight
1152 << dendl;
1153 adjust_item_weight_in_loc(cct, item, iweight, loc);
1154 ret = 1;
1155 }
1156 if (get_item_name(item) != name) {
1157 ldout(cct, 5) << "update_item setting " << item << " name to " << name
1158 << dendl;
1159 set_item_name(item, name);
1160 ret = 1;
1161 }
1162 } else {
1163 if (item_exists(item)) {
1164 remove_item(cct, item, true);
1165 }
1166 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1167 << " at " << loc << dendl;
1168 ret = insert_item(cct, item, weight, name, loc);
1169 if (ret == 0)
1170 ret = 1; // changed
1171 }
1172 return ret;
1173 }
1174
1175 int CrushWrapper::get_item_weight(int id) const
1176 {
1177 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1178 crush_bucket *b = crush->buckets[bidx];
1179 if (b == NULL)
1180 continue;
1181 if (b->id == id)
1182 return b->weight;
1183 for (unsigned i = 0; i < b->size; i++)
1184 if (b->items[i] == id)
1185 return crush_get_bucket_item_weight(b, i);
1186 }
1187 return -ENOENT;
1188 }
1189
1190 int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1191 {
1192 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1193
1194 int bid = get_item_id(l->second);
1195 if (!bucket_exists(bid))
1196 continue;
1197 crush_bucket *b = get_bucket(bid);
1198 for (unsigned int i = 0; i < b->size; i++) {
1199 if (b->items[i] == id) {
1200 return crush_get_bucket_item_weight(b, i);
1201 }
1202 }
1203 }
1204 return -ENOENT;
1205 }
1206
1207 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
1208 {
1209 ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
1210 int changed = 0;
1211 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1212 crush_bucket *b = crush->buckets[bidx];
1213 if (b == 0)
1214 continue;
1215 for (unsigned i = 0; i < b->size; i++) {
1216 if (b->items[i] == id) {
1217 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1218 ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff
1219 << " in bucket " << bidx << dendl;
1220 adjust_item_weight(cct, -1 - bidx, b->weight);
1221 changed++;
1222 }
1223 }
1224 }
1225 if (!changed)
1226 return -ENOENT;
1227 return changed;
1228 }
1229
1230 int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
1231 {
1232 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
1233 << " in " << loc << dendl;
1234 int changed = 0;
1235
1236 for (auto l = loc.begin(); l != loc.end(); ++l) {
1237 int bid = get_item_id(l->second);
1238 if (!bucket_exists(bid))
1239 continue;
1240 crush_bucket *b = get_bucket(bid);
1241 for (unsigned int i = 0; i < b->size; i++) {
1242 if (b->items[i] == id) {
1243 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1244 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff
1245 << " in bucket " << bid << dendl;
1246 adjust_item_weight(cct, bid, b->weight);
1247 changed++;
1248 }
1249 }
1250 }
1251 if (!changed)
1252 return -ENOENT;
1253 return changed;
1254 }
1255
1256 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
1257 {
1258 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1259 crush_bucket *b = get_bucket(id);
1260 if (IS_ERR(b))
1261 return PTR_ERR(b);
1262 int changed = 0;
1263 list<crush_bucket*> q;
1264 q.push_back(b);
1265 while (!q.empty()) {
1266 b = q.front();
1267 q.pop_front();
1268 int local_changed = 0;
1269 for (unsigned i=0; i<b->size; ++i) {
1270 int n = b->items[i];
1271 if (n >= 0) {
1272 bucket_adjust_item_weight(cct, b, n, weight);
1273 ++changed;
1274 ++local_changed;
1275 } else {
1276 crush_bucket *sub = get_bucket(n);
1277 if (IS_ERR(sub))
1278 continue;
1279 q.push_back(sub);
1280 }
1281 }
1282 if (local_changed) {
1283 adjust_item_weight(cct, b->id, b->weight);
1284 }
1285 }
1286 return changed;
1287 }
1288
1289 bool CrushWrapper::check_item_present(int id) const
1290 {
1291 bool found = false;
1292
1293 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1294 crush_bucket *b = crush->buckets[bidx];
1295 if (b == 0)
1296 continue;
1297 for (unsigned i = 0; i < b->size; i++)
1298 if (b->items[i] == id)
1299 found = true;
1300 }
1301 return found;
1302 }
1303
1304
1305 pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret)
1306 {
1307
1308 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1309 crush_bucket *b = crush->buckets[bidx];
1310 if (b == 0)
1311 continue;
1312 if (is_shadow_item(b->id))
1313 continue;
1314 for (unsigned i = 0; i < b->size; i++)
1315 if (b->items[i] == id) {
1316 string parent_id = name_map[b->id];
1317 string parent_bucket_type = type_map[b->type];
1318 if (_ret)
1319 *_ret = 0;
1320 return make_pair(parent_bucket_type, parent_id);
1321 }
1322 }
1323
1324 if (_ret)
1325 *_ret = -ENOENT;
1326
1327 return pair<string, string>();
1328 }
1329
1330 int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1331 {
1332 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1333 crush_bucket *b = crush->buckets[bidx];
1334 if (b == 0)
1335 continue;
1336 if (is_shadow_item(b->id))
1337 continue;
1338 for (unsigned i = 0; i < b->size; i++) {
1339 if (b->items[i] == id) {
1340 *parent = b->id;
1341 return 0;
1342 }
1343 }
1344 }
1345 return -ENOENT;
1346 }
1347
1348 int CrushWrapper::get_parent_of_type(int item, int type) const
1349 {
1350 do {
1351 int r = get_immediate_parent_id(item, &item);
1352 if (r < 0) {
1353 return 0;
1354 }
1355 } while (get_bucket_type(item) != type);
1356 return item;
1357 }
1358
1359
1360 bool CrushWrapper::class_is_in_use(int class_id, ostream *ss)
1361 {
1362 list<unsigned> rules;
1363 for (unsigned i = 0; i < crush->max_rules; ++i) {
1364 crush_rule *r = crush->rules[i];
1365 if (!r)
1366 continue;
1367 for (unsigned j = 0; j < r->len; ++j) {
1368 if (r->steps[j].op == CRUSH_RULE_TAKE) {
1369 int root = r->steps[j].arg1;
1370 for (auto &p : class_bucket) {
1371 auto& q = p.second;
1372 if (q.count(class_id) && q[class_id] == root) {
1373 rules.push_back(i);
1374 }
1375 }
1376 }
1377 }
1378 }
1379 if (rules.empty()) {
1380 return false;
1381 }
1382 if (ss) {
1383 ostringstream os;
1384 for (auto &p: rules) {
1385 os << "'" << get_rule_name(p) <<"',";
1386 }
1387 string out(os.str());
1388 out.resize(out.size() - 1); // drop last ','
1389 *ss << "still referenced by crush_rule(s): " << out;
1390 }
1391 return true;
1392 }
1393
1394 int CrushWrapper::populate_classes()
1395 {
1396 set<int> roots;
1397 find_nonshadow_roots(roots);
1398 for (auto &r : roots) {
1399 if (r >= 0)
1400 continue;
1401 for (auto &c : class_name) {
1402 int clone;
1403 int res = device_class_clone(r, c.first, &clone);
1404 if (res < 0)
1405 return res;
1406 }
1407 }
1408 return 0;
1409 }
1410
1411 int CrushWrapper::cleanup_classes()
1412 {
1413 return trim_roots_with_class(true);
1414 }
1415
1416 int CrushWrapper::trim_roots_with_class(bool unused)
1417 {
1418 set<int> roots;
1419 find_shadow_roots(roots);
1420 for (auto &r : roots) {
1421 if (r >= 0)
1422 continue;
1423 int res = remove_root(r, unused);
1424 if (res)
1425 return res;
1426 }
1427 // there is no need to reweight because we only remove from the
1428 // root and down
1429 return 0;
1430 }
1431
1432 int32_t CrushWrapper::_alloc_class_id() const {
1433 if (class_name.empty()) {
1434 return 0;
1435 }
1436 int32_t class_id = class_name.rbegin()->first + 1;
1437 if (class_id >= 0) {
1438 return class_id;
1439 }
1440 // wrapped, pick a random start and do exhaustive search
1441 uint32_t upperlimit = numeric_limits<int32_t>::max();
1442 upperlimit++;
1443 class_id = rand() % upperlimit;
1444 const auto start = class_id;
1445 do {
1446 if (!class_name.count(class_id)) {
1447 return class_id;
1448 } else {
1449 class_id++;
1450 if (class_id < 0) {
1451 class_id = 0;
1452 }
1453 }
1454 } while (class_id != start);
1455 assert(0 == "no available class id");
1456 }
1457
1458 void CrushWrapper::reweight(CephContext *cct)
1459 {
1460 set<int> roots;
1461 find_roots(roots);
1462 for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
1463 if (*p >= 0)
1464 continue;
1465 crush_bucket *b = get_bucket(*p);
1466 ldout(cct, 5) << "reweight bucket " << *p << dendl;
1467 int r = crush_reweight_bucket(crush, b);
1468 assert(r == 0);
1469 }
1470 }
1471
1472 int CrushWrapper::add_simple_rule_at(
1473 string name, string root_name,
1474 string failure_domain_name,
1475 string device_class,
1476 string mode, int rule_type,
1477 int rno,
1478 ostream *err)
1479 {
1480 if (rule_exists(name)) {
1481 if (err)
1482 *err << "rule " << name << " exists";
1483 return -EEXIST;
1484 }
1485 if (rno >= 0) {
1486 if (rule_exists(rno)) {
1487 if (err)
1488 *err << "rule with ruleno " << rno << " exists";
1489 return -EEXIST;
1490 }
1491 if (ruleset_exists(rno)) {
1492 if (err)
1493 *err << "ruleset " << rno << " exists";
1494 return -EEXIST;
1495 }
1496 } else {
1497 for (rno = 0; rno < get_max_rules(); rno++) {
1498 if (!rule_exists(rno) && !ruleset_exists(rno))
1499 break;
1500 }
1501 }
1502 if (!name_exists(root_name)) {
1503 if (err)
1504 *err << "root item " << root_name << " does not exist";
1505 return -ENOENT;
1506 }
1507 int root = get_item_id(root_name);
1508 int type = 0;
1509 if (failure_domain_name.length()) {
1510 type = get_type_id(failure_domain_name);
1511 if (type < 0) {
1512 if (err)
1513 *err << "unknown type " << failure_domain_name;
1514 return -EINVAL;
1515 }
1516 }
1517 if (device_class.size()) {
1518 if (!class_exists(device_class)) {
1519 if (err)
1520 *err << "device class " << device_class << " does not exist";
1521 return -EINVAL;
1522 }
1523 int c = get_class_id(device_class);
1524 if (class_bucket.count(root) == 0 ||
1525 class_bucket[root].count(c) == 0) {
1526 if (err)
1527 *err << "root " << root_name << " has no devices with class "
1528 << device_class;
1529 return -EINVAL;
1530 }
1531 root = class_bucket[root][c];
1532 }
1533 if (mode != "firstn" && mode != "indep") {
1534 if (err)
1535 *err << "unknown mode " << mode;
1536 return -EINVAL;
1537 }
1538
1539 int steps = 3;
1540 if (mode == "indep")
1541 steps = 5;
1542 int min_rep = mode == "firstn" ? 1 : 3;
1543 int max_rep = mode == "firstn" ? 10 : 20;
1544 //set the ruleset the same as rule_id(rno)
1545 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
1546 assert(rule);
1547 int step = 0;
1548 if (mode == "indep") {
1549 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1550 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1551 }
1552 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1553 if (type)
1554 crush_rule_set_step(rule, step++,
1555 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
1556 CRUSH_RULE_CHOOSELEAF_INDEP,
1557 CRUSH_CHOOSE_N,
1558 type);
1559 else
1560 crush_rule_set_step(rule, step++,
1561 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
1562 CRUSH_RULE_CHOOSE_INDEP,
1563 CRUSH_CHOOSE_N,
1564 0);
1565 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1566
1567 int ret = crush_add_rule(crush, rule, rno);
1568 if(ret < 0) {
1569 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
1570 return ret;
1571 }
1572 set_rule_name(rno, name);
1573 have_rmaps = false;
1574 return rno;
1575 }
1576
1577 int CrushWrapper::add_simple_rule(
1578 string name, string root_name,
1579 string failure_domain_name,
1580 string device_class,
1581 string mode, int rule_type,
1582 ostream *err)
1583 {
1584 return add_simple_rule_at(name, root_name, failure_domain_name, device_class,
1585 mode,
1586 rule_type, -1, err);
1587 }
1588
1589 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
1590 {
1591 if (ruleno >= crush->max_rules)
1592 return -ENOENT;
1593 if (crush->rules[ruleno] == NULL)
1594 return -ENOENT;
1595 crush_rule *rule = crush->rules[ruleno];
1596
1597 // build a weight map for each TAKE in the rule, and then merge them
1598
1599 // FIXME: if there are multiple takes that place a different number of
1600 // objects we do not take that into account. (Also, note that doing this
1601 // right is also a function of the pool, since the crush rule
1602 // might choose 2 + choose 2 but pool size may only be 3.)
1603 for (unsigned i=0; i<rule->len; ++i) {
1604 map<int,float> m;
1605 float sum = 0;
1606 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
1607 int n = rule->steps[i].arg1;
1608 if (n >= 0) {
1609 m[n] = 1.0;
1610 sum = 1.0;
1611 } else {
1612 list<int> q;
1613 q.push_back(n);
1614 //breadth first iterate the OSD tree
1615 while (!q.empty()) {
1616 int bno = q.front();
1617 q.pop_front();
1618 crush_bucket *b = crush->buckets[-1-bno];
1619 assert(b);
1620 for (unsigned j=0; j<b->size; ++j) {
1621 int item_id = b->items[j];
1622 if (item_id >= 0) { //it's an OSD
1623 float w = crush_get_bucket_item_weight(b, j);
1624 m[item_id] = w;
1625 sum += w;
1626 } else { //not an OSD, expand the child later
1627 q.push_back(item_id);
1628 }
1629 }
1630 }
1631 }
1632 }
1633 for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
1634 map<int,float>::iterator q = pmap->find(p->first);
1635 if (q == pmap->end()) {
1636 (*pmap)[p->first] = p->second / sum;
1637 } else {
1638 q->second += p->second / sum;
1639 }
1640 }
1641 }
1642
1643 return 0;
1644 }
1645
1646 int CrushWrapper::remove_rule(int ruleno)
1647 {
1648 if (ruleno >= (int)crush->max_rules)
1649 return -ENOENT;
1650 if (crush->rules[ruleno] == NULL)
1651 return -ENOENT;
1652 crush_destroy_rule(crush->rules[ruleno]);
1653 crush->rules[ruleno] = NULL;
1654 rule_name_map.erase(ruleno);
1655 have_rmaps = false;
1656 return 0;
1657 }
1658
1659 int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
1660 {
1661 if (cct->_conf->osd_crush_update_weight_set) {
1662 unsigned position;
1663 for (position = 0; position < bucket->size; position++)
1664 if (bucket->items[position] == item)
1665 break;
1666 assert(position != bucket->size);
1667 for (auto w : choose_args) {
1668 crush_choose_arg_map arg_map = w.second;
1669 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1670 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1671 crush_weight_set *weight_set = &arg->weight_set[j];
1672 weight_set->weights[position] = weight;
1673 }
1674 }
1675 }
1676 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
1677 }
1678
1679 int CrushWrapper::add_bucket(
1680 int bucketno, int alg, int hash, int type, int size,
1681 int *items, int *weights, int *idout)
1682 {
1683 if (alg == 0) {
1684 alg = get_default_bucket_alg();
1685 if (alg == 0)
1686 return -EINVAL;
1687 }
1688 crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
1689 weights);
1690 assert(b);
1691 int r = crush_add_bucket(crush, bucketno, b, idout);
1692 for (auto& p : choose_args) {
1693 crush_choose_arg_map& cmap = p.second;
1694 if (cmap.args) {
1695 if ((int)cmap.size <= *idout) {
1696 cmap.args = (crush_choose_arg*)realloc(
1697 cmap.args,
1698 sizeof(crush_choose_arg) * (*idout + 1));
1699 memset(&cmap.args[cmap.size], 0,
1700 sizeof(crush_choose_arg) * (*idout + 1 - cmap.size));
1701 cmap.size = *idout + 1;
1702 }
1703 } else {
1704 cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
1705 *idout + 1);
1706 cmap.size = *idout + 1;
1707 }
1708 if (size > 0) {
1709 int positions = get_choose_args_positions(cmap);
1710 crush_choose_arg& carg = cmap.args[*idout];
1711 carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
1712 size);
1713 carg.weight_set_size = positions;
1714 for (int ppos = 0; ppos < positions; ++ppos) {
1715 carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
1716 carg.weight_set[ppos].size = size;
1717 for (int bpos = 0; bpos < size; ++bpos) {
1718 carg.weight_set[ppos].weights[bpos] = weights[bpos];
1719 }
1720 }
1721 }
1722 }
1723 return r;
1724 }
1725
1726 int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
1727 {
1728 __u32 new_size = bucket->size + 1;
1729 for (auto w : choose_args) {
1730 crush_choose_arg_map arg_map = w.second;
1731 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1732 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1733 crush_weight_set *weight_set = &arg->weight_set[j];
1734 weight_set->weights = (__u32*)realloc(weight_set->weights,
1735 new_size * sizeof(__u32));
1736 assert(weight_set->size + 1 == new_size);
1737 weight_set->weights[weight_set->size] = weight;
1738 weight_set->size = new_size;
1739 }
1740 if (arg->ids_size) {
1741 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1742 assert(arg->ids_size + 1 == new_size);
1743 arg->ids[arg->ids_size] = item;
1744 arg->ids_size = new_size;
1745 }
1746 }
1747 return crush_bucket_add_item(crush, bucket, item, weight);
1748 }
1749
1750 int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
1751 {
1752 __u32 new_size = bucket->size - 1;
1753 unsigned position;
1754 for (position = 0; position < bucket->size; position++)
1755 if (bucket->items[position] == item)
1756 break;
1757 assert(position != bucket->size);
1758 for (auto w : choose_args) {
1759 crush_choose_arg_map arg_map = w.second;
1760 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1761 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1762 crush_weight_set *weight_set = &arg->weight_set[j];
1763 assert(weight_set->size - 1 == new_size);
1764 for (__u32 k = position; k < new_size; k++)
1765 weight_set->weights[k] = weight_set->weights[k+1];
1766 weight_set->weights = (__u32*)realloc(weight_set->weights,
1767 new_size * sizeof(__u32));
1768 weight_set->size = new_size;
1769 }
1770 if (arg->ids_size) {
1771 assert(arg->ids_size - 1 == new_size);
1772 for (__u32 k = position; k < new_size; k++)
1773 arg->ids[k] = arg->ids[k+1];
1774 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1775 arg->ids_size = new_size;
1776 }
1777 }
1778 return crush_bucket_remove_item(crush, bucket, item);
1779 }
1780
1781 int CrushWrapper::update_device_class(int id,
1782 const string& class_name,
1783 const string& name,
1784 ostream *ss)
1785 {
1786 assert(item_exists(id));
1787 auto old_class_name = get_item_class(id);
1788 if (old_class_name && old_class_name != class_name) {
1789 *ss << "osd." << id << " has already bound to class '" << old_class_name
1790 << "', can not reset class to '" << class_name << "'; "
1791 << "use 'ceph osd crush rm-device-class <osd>' to "
1792 << "remove old class first";
1793 return -EBUSY;
1794 }
1795
1796 int class_id = get_or_create_class_id(class_name);
1797 if (id < 0) {
1798 *ss << name << " id " << id << " is negative";
1799 return -EINVAL;
1800 }
1801
1802 if (class_map.count(id) != 0 && class_map[id] == class_id) {
1803 *ss << name << " already set to class " << class_name;
1804 return 0;
1805 }
1806
1807 set_item_class(id, class_id);
1808
1809 int r = rebuild_roots_with_classes();
1810 if (r < 0)
1811 return r;
1812 return 1;
1813 }
1814
1815 int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
1816 {
1817 assert(ss);
1818 const char *name = get_item_name(id);
1819 if (!name) {
1820 *ss << "osd." << id << " does not have a name";
1821 return -ENOENT;
1822 }
1823
1824 const char *class_name = get_item_class(id);
1825 if (!class_name) {
1826 *ss << "osd." << id << " has not been bound to a specific class yet";
1827 return 0;
1828 }
1829 class_remove_item(id);
1830
1831 // note that there is no need to remove ourselves from shadow parent
1832 // and reweight because we are going to destroy all shadow trees
1833 // rebuild them all (if necessary) later.
1834
1835 // see if there is any osds that still reference this class
1836 set<int> devices;
1837 get_devices_by_class(class_name, &devices);
1838 if (devices.empty()) {
1839 // class has no more devices
1840 remove_class_name(class_name);
1841 }
1842
1843 int r = rebuild_roots_with_classes();
1844 if (r < 0) {
1845 *ss << "unable to rebuild roots with class '" << class_name << "' "
1846 << "of osd." << id << ": " << cpp_strerror(r);
1847 return r;
1848 }
1849 return 0;
1850 }
1851
1852 int CrushWrapper::device_class_clone(int original_id, int device_class, int *clone)
1853 {
1854 const char *item_name = get_item_name(original_id);
1855 if (item_name == NULL)
1856 return -ECHILD;
1857 const char *class_name = get_class_name(device_class);
1858 if (class_name == NULL)
1859 return -EBADF;
1860 string copy_name = item_name + string("~") + class_name;
1861 if (name_exists(copy_name)) {
1862 *clone = get_item_id(copy_name);
1863 return 0;
1864 }
1865 crush_bucket *original = get_bucket(original_id);
1866 if (IS_ERR(original))
1867 return -ENOENT;
1868 crush_bucket *copy = crush_make_bucket(crush,
1869 original->alg,
1870 original->hash,
1871 original->type,
1872 0, NULL, NULL);
1873 if(copy == NULL)
1874 return -ENOMEM;
1875 for (unsigned i = 0; i < original->size; i++) {
1876 int item = original->items[i];
1877 int weight = crush_get_bucket_item_weight(original, i);
1878 if (item >= 0) {
1879 if (class_map.count(item) != 0 && class_map[item] == device_class) {
1880 int res = bucket_add_item(copy, item, weight);
1881 if (res)
1882 return res;
1883 }
1884 } else {
1885 int child_copy_id;
1886 int res = device_class_clone(item, device_class, &child_copy_id);
1887 if (res < 0)
1888 return res;
1889 crush_bucket *child_copy = get_bucket(child_copy_id);
1890 if (IS_ERR(child_copy))
1891 return -ENOENT;
1892 res = bucket_add_item(copy, child_copy_id, child_copy->weight);
1893 if (res)
1894 return res;
1895 }
1896 }
1897 int res = crush_add_bucket(crush, 0, copy, clone);
1898 if (res)
1899 return res;
1900 res = set_item_class(*clone, device_class);
1901 if (res < 0)
1902 return res;
1903 // we do not use set_item_name because the name is intentionally invalid
1904 name_map[*clone] = copy_name;
1905 if (have_rmaps)
1906 name_rmap[copy_name] = *clone;
1907 class_bucket[original_id][device_class] = *clone;
1908 return 0;
1909 }
1910
1911 int CrushWrapper::rebuild_roots_with_classes()
1912 {
1913 int r = trim_roots_with_class(false);
1914 if (r < 0)
1915 return r;
1916 r = populate_classes();
1917 if (r < 0)
1918 return r;
1919 return trim_roots_with_class(true);
1920 }
1921
1922 void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
1923 {
1924 assert(crush);
1925
1926 __u32 magic = CRUSH_MAGIC;
1927 ::encode(magic, bl);
1928
1929 ::encode(crush->max_buckets, bl);
1930 ::encode(crush->max_rules, bl);
1931 ::encode(crush->max_devices, bl);
1932
1933 bool encode_compat_choose_args = false;
1934 crush_choose_arg_map arg_map;
1935 memset(&arg_map, '\0', sizeof(arg_map));
1936 if (has_choose_args() &&
1937 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
1938 assert(!has_incompat_choose_args());
1939 encode_compat_choose_args = true;
1940 arg_map = choose_args.begin()->second;
1941 }
1942
1943 // buckets
1944 for (int i=0; i<crush->max_buckets; i++) {
1945 __u32 alg = 0;
1946 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
1947 ::encode(alg, bl);
1948 if (!alg)
1949 continue;
1950
1951 ::encode(crush->buckets[i]->id, bl);
1952 ::encode(crush->buckets[i]->type, bl);
1953 ::encode(crush->buckets[i]->alg, bl);
1954 ::encode(crush->buckets[i]->hash, bl);
1955 ::encode(crush->buckets[i]->weight, bl);
1956 ::encode(crush->buckets[i]->size, bl);
1957 for (unsigned j=0; j<crush->buckets[i]->size; j++)
1958 ::encode(crush->buckets[i]->items[j], bl);
1959
1960 switch (crush->buckets[i]->alg) {
1961 case CRUSH_BUCKET_UNIFORM:
1962 ::encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
1963 break;
1964
1965 case CRUSH_BUCKET_LIST:
1966 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1967 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
1968 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
1969 }
1970 break;
1971
1972 case CRUSH_BUCKET_TREE:
1973 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
1974 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
1975 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
1976 break;
1977
1978 case CRUSH_BUCKET_STRAW:
1979 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1980 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
1981 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
1982 }
1983 break;
1984
1985 case CRUSH_BUCKET_STRAW2:
1986 {
1987 __u32 *weights;
1988 if (encode_compat_choose_args &&
1989 arg_map.args[i].weight_set_size > 0) {
1990 weights = arg_map.args[i].weight_set[0].weights;
1991 } else {
1992 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
1993 }
1994 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1995 ::encode(weights[j], bl);
1996 }
1997 }
1998 break;
1999
2000 default:
2001 ceph_abort();
2002 break;
2003 }
2004 }
2005
2006 // rules
2007 for (unsigned i=0; i<crush->max_rules; i++) {
2008 __u32 yes = crush->rules[i] ? 1:0;
2009 ::encode(yes, bl);
2010 if (!yes)
2011 continue;
2012
2013 ::encode(crush->rules[i]->len, bl);
2014 ::encode(crush->rules[i]->mask, bl);
2015 for (unsigned j=0; j<crush->rules[i]->len; j++)
2016 ::encode(crush->rules[i]->steps[j], bl);
2017 }
2018
2019 // name info
2020 ::encode(type_map, bl);
2021 ::encode(name_map, bl);
2022 ::encode(rule_name_map, bl);
2023
2024 // tunables
2025 ::encode(crush->choose_local_tries, bl);
2026 ::encode(crush->choose_local_fallback_tries, bl);
2027 ::encode(crush->choose_total_tries, bl);
2028 ::encode(crush->chooseleaf_descend_once, bl);
2029 ::encode(crush->chooseleaf_vary_r, bl);
2030 ::encode(crush->straw_calc_version, bl);
2031 ::encode(crush->allowed_bucket_algs, bl);
2032 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
2033 ::encode(crush->chooseleaf_stable, bl);
2034 }
2035
2036 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2037 // device classes
2038 ::encode(class_map, bl);
2039 ::encode(class_name, bl);
2040 ::encode(class_bucket, bl);
2041
2042 // choose args
2043 __u32 size = (__u32)choose_args.size();
2044 ::encode(size, bl);
2045 for (auto c : choose_args) {
2046 ::encode(c.first, bl);
2047 crush_choose_arg_map arg_map = c.second;
2048 size = 0;
2049 for (__u32 i = 0; i < arg_map.size; i++) {
2050 crush_choose_arg *arg = &arg_map.args[i];
2051 if (arg->weight_set_size == 0 &&
2052 arg->ids_size == 0)
2053 continue;
2054 size++;
2055 }
2056 ::encode(size, bl);
2057 for (__u32 i = 0; i < arg_map.size; i++) {
2058 crush_choose_arg *arg = &arg_map.args[i];
2059 if (arg->weight_set_size == 0 &&
2060 arg->ids_size == 0)
2061 continue;
2062 ::encode(i, bl);
2063 ::encode(arg->weight_set_size, bl);
2064 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2065 crush_weight_set *weight_set = &arg->weight_set[j];
2066 ::encode(weight_set->size, bl);
2067 for (__u32 k = 0; k < weight_set->size; k++)
2068 ::encode(weight_set->weights[k], bl);
2069 }
2070 ::encode(arg->ids_size, bl);
2071 for (__u32 j = 0; j < arg->ids_size; j++)
2072 ::encode(arg->ids[j], bl);
2073 }
2074 }
2075 }
2076 }
2077
2078 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
2079 {
2080 m.clear();
2081 __u32 n;
2082 ::decode(n, blp);
2083 while (n--) {
2084 __s32 key;
2085 ::decode(key, blp);
2086
2087 __u32 strlen;
2088 ::decode(strlen, blp);
2089 if (strlen == 0) {
2090 // der, key was actually 64-bits!
2091 ::decode(strlen, blp);
2092 }
2093 ::decode_nohead(strlen, m[key], blp);
2094 }
2095 }
2096
2097 void CrushWrapper::decode(bufferlist::iterator& blp)
2098 {
2099 create();
2100
2101 __u32 magic;
2102 ::decode(magic, blp);
2103 if (magic != CRUSH_MAGIC)
2104 throw buffer::malformed_input("bad magic number");
2105
2106 ::decode(crush->max_buckets, blp);
2107 ::decode(crush->max_rules, blp);
2108 ::decode(crush->max_devices, blp);
2109
2110 // legacy tunables, unless we decode something newer
2111 set_tunables_legacy();
2112
2113 try {
2114 // buckets
2115 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
2116 for (int i=0; i<crush->max_buckets; i++) {
2117 decode_crush_bucket(&crush->buckets[i], blp);
2118 }
2119
2120 // rules
2121 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
2122 for (unsigned i = 0; i < crush->max_rules; ++i) {
2123 __u32 yes;
2124 ::decode(yes, blp);
2125 if (!yes) {
2126 crush->rules[i] = NULL;
2127 continue;
2128 }
2129
2130 __u32 len;
2131 ::decode(len, blp);
2132 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
2133 crush->rules[i]->len = len;
2134 ::decode(crush->rules[i]->mask, blp);
2135 for (unsigned j=0; j<crush->rules[i]->len; j++)
2136 ::decode(crush->rules[i]->steps[j], blp);
2137 }
2138
2139 // name info
2140 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2141 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2142 // both by assuming the string is always non-empty.
2143 decode_32_or_64_string_map(type_map, blp);
2144 decode_32_or_64_string_map(name_map, blp);
2145 decode_32_or_64_string_map(rule_name_map, blp);
2146
2147 // tunables
2148 if (!blp.end()) {
2149 ::decode(crush->choose_local_tries, blp);
2150 ::decode(crush->choose_local_fallback_tries, blp);
2151 ::decode(crush->choose_total_tries, blp);
2152 }
2153 if (!blp.end()) {
2154 ::decode(crush->chooseleaf_descend_once, blp);
2155 }
2156 if (!blp.end()) {
2157 ::decode(crush->chooseleaf_vary_r, blp);
2158 }
2159 if (!blp.end()) {
2160 ::decode(crush->straw_calc_version, blp);
2161 }
2162 if (!blp.end()) {
2163 ::decode(crush->allowed_bucket_algs, blp);
2164 }
2165 if (!blp.end()) {
2166 ::decode(crush->chooseleaf_stable, blp);
2167 }
2168 if (!blp.end()) {
2169 ::decode(class_map, blp);
2170 ::decode(class_name, blp);
2171 for (auto &c : class_name)
2172 class_rname[c.second] = c.first;
2173 ::decode(class_bucket, blp);
2174 cleanup_classes();
2175 }
2176 if (!blp.end()) {
2177 __u32 choose_args_size;
2178 ::decode(choose_args_size, blp);
2179 for (__u32 i = 0; i < choose_args_size; i++) {
2180 uint64_t choose_args_index;
2181 ::decode(choose_args_index, blp);
2182 crush_choose_arg_map arg_map;
2183 arg_map.size = crush->max_buckets;
2184 arg_map.args = (crush_choose_arg*)calloc(
2185 arg_map.size, sizeof(crush_choose_arg));
2186 __u32 size;
2187 ::decode(size, blp);
2188 for (__u32 j = 0; j < size; j++) {
2189 __u32 bucket_index;
2190 ::decode(bucket_index, blp);
2191 assert(bucket_index < arg_map.size);
2192 crush_choose_arg *arg = &arg_map.args[bucket_index];
2193 ::decode(arg->weight_set_size, blp);
2194 if (arg->weight_set_size) {
2195 arg->weight_set = (crush_weight_set*)calloc(
2196 arg->weight_set_size, sizeof(crush_weight_set));
2197 for (__u32 k = 0; k < arg->weight_set_size; k++) {
2198 crush_weight_set *weight_set = &arg->weight_set[k];
2199 ::decode(weight_set->size, blp);
2200 weight_set->weights = (__u32*)calloc(
2201 weight_set->size, sizeof(__u32));
2202 for (__u32 l = 0; l < weight_set->size; l++)
2203 ::decode(weight_set->weights[l], blp);
2204 }
2205 }
2206 ::decode(arg->ids_size, blp);
2207 if (arg->ids_size) {
2208 assert(arg->ids_size == crush->buckets[bucket_index]->size);
2209 arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
2210 for (__u32 k = 0; k < arg->ids_size; k++)
2211 ::decode(arg->ids[k], blp);
2212 }
2213 }
2214 choose_args[choose_args_index] = arg_map;
2215 }
2216 }
2217 finalize();
2218 }
2219 catch (...) {
2220 crush_destroy(crush);
2221 throw;
2222 }
2223 }
2224
2225 void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp)
2226 {
2227 __u32 alg;
2228 ::decode(alg, blp);
2229 if (!alg) {
2230 *bptr = NULL;
2231 return;
2232 }
2233
2234 int size = 0;
2235 switch (alg) {
2236 case CRUSH_BUCKET_UNIFORM:
2237 size = sizeof(crush_bucket_uniform);
2238 break;
2239 case CRUSH_BUCKET_LIST:
2240 size = sizeof(crush_bucket_list);
2241 break;
2242 case CRUSH_BUCKET_TREE:
2243 size = sizeof(crush_bucket_tree);
2244 break;
2245 case CRUSH_BUCKET_STRAW:
2246 size = sizeof(crush_bucket_straw);
2247 break;
2248 case CRUSH_BUCKET_STRAW2:
2249 size = sizeof(crush_bucket_straw2);
2250 break;
2251 default:
2252 {
2253 char str[128];
2254 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
2255 throw buffer::malformed_input(str);
2256 }
2257 }
2258 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
2259 *bptr = bucket;
2260
2261 ::decode(bucket->id, blp);
2262 ::decode(bucket->type, blp);
2263 ::decode(bucket->alg, blp);
2264 ::decode(bucket->hash, blp);
2265 ::decode(bucket->weight, blp);
2266 ::decode(bucket->size, blp);
2267
2268 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
2269 for (unsigned j = 0; j < bucket->size; ++j) {
2270 ::decode(bucket->items[j], blp);
2271 }
2272
2273 switch (bucket->alg) {
2274 case CRUSH_BUCKET_UNIFORM:
2275 ::decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
2276 break;
2277
2278 case CRUSH_BUCKET_LIST: {
2279 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
2280 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2281 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2282
2283 for (unsigned j = 0; j < bucket->size; ++j) {
2284 ::decode(cbl->item_weights[j], blp);
2285 ::decode(cbl->sum_weights[j], blp);
2286 }
2287 break;
2288 }
2289
2290 case CRUSH_BUCKET_TREE: {
2291 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
2292 ::decode(cbt->num_nodes, blp);
2293 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
2294 for (unsigned j=0; j<cbt->num_nodes; j++) {
2295 ::decode(cbt->node_weights[j], blp);
2296 }
2297 break;
2298 }
2299
2300 case CRUSH_BUCKET_STRAW: {
2301 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
2302 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2303 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2304 for (unsigned j = 0; j < bucket->size; ++j) {
2305 ::decode(cbs->item_weights[j], blp);
2306 ::decode(cbs->straws[j], blp);
2307 }
2308 break;
2309 }
2310
2311 case CRUSH_BUCKET_STRAW2: {
2312 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
2313 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2314 for (unsigned j = 0; j < bucket->size; ++j) {
2315 ::decode(cbs->item_weights[j], blp);
2316 }
2317 break;
2318 }
2319
2320 default:
2321 // We should have handled this case in the first switch statement
2322 ceph_abort();
2323 break;
2324 }
2325 }
2326
2327
2328 void CrushWrapper::dump(Formatter *f) const
2329 {
2330 f->open_array_section("devices");
2331 for (int i=0; i<get_max_devices(); i++) {
2332 f->open_object_section("device");
2333 f->dump_int("id", i);
2334 const char *n = get_item_name(i);
2335 if (n) {
2336 f->dump_string("name", n);
2337 } else {
2338 char name[20];
2339 sprintf(name, "device%d", i);
2340 f->dump_string("name", name);
2341 }
2342 const char *device_class = get_item_class(i);
2343 if (device_class != NULL)
2344 f->dump_string("class", device_class);
2345 f->close_section();
2346 }
2347 f->close_section();
2348
2349 f->open_array_section("types");
2350 int n = get_num_type_names();
2351 for (int i=0; n; i++) {
2352 const char *name = get_type_name(i);
2353 if (!name) {
2354 if (i == 0) {
2355 f->open_object_section("type");
2356 f->dump_int("type_id", 0);
2357 f->dump_string("name", "device");
2358 f->close_section();
2359 }
2360 continue;
2361 }
2362 n--;
2363 f->open_object_section("type");
2364 f->dump_int("type_id", i);
2365 f->dump_string("name", name);
2366 f->close_section();
2367 }
2368 f->close_section();
2369
2370 f->open_array_section("buckets");
2371 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
2372 if (!bucket_exists(bucket))
2373 continue;
2374 f->open_object_section("bucket");
2375 f->dump_int("id", bucket);
2376 if (get_item_name(bucket))
2377 f->dump_string("name", get_item_name(bucket));
2378 f->dump_int("type_id", get_bucket_type(bucket));
2379 if (get_type_name(get_bucket_type(bucket)))
2380 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
2381 f->dump_int("weight", get_bucket_weight(bucket));
2382 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
2383 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
2384 f->open_array_section("items");
2385 for (int j=0; j<get_bucket_size(bucket); j++) {
2386 f->open_object_section("item");
2387 f->dump_int("id", get_bucket_item(bucket, j));
2388 f->dump_int("weight", get_bucket_item_weight(bucket, j));
2389 f->dump_int("pos", j);
2390 f->close_section();
2391 }
2392 f->close_section();
2393 f->close_section();
2394 }
2395 f->close_section();
2396
2397 f->open_array_section("rules");
2398 dump_rules(f);
2399 f->close_section();
2400
2401 f->open_object_section("tunables");
2402 dump_tunables(f);
2403 f->close_section();
2404
2405 dump_choose_args(f);
2406 }
2407
2408 namespace {
2409 // depth first walker
2410 class TreeDumper {
2411 typedef CrushTreeDumper::Item Item;
2412 const CrushWrapper *crush;
2413 const CrushTreeDumper::name_map_t& weight_set_names;
2414 public:
2415 explicit TreeDumper(const CrushWrapper *crush,
2416 const CrushTreeDumper::name_map_t& wsnames)
2417 : crush(crush), weight_set_names(wsnames) {}
2418
2419 void dump(Formatter *f) {
2420 set<int> roots;
2421 crush->find_roots(roots);
2422 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
2423 dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
2424 }
2425 }
2426
2427 private:
2428 void dump_item(const Item& qi, Formatter* f) {
2429 if (qi.is_bucket()) {
2430 f->open_object_section("bucket");
2431 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2432 dump_bucket_children(qi, f);
2433 f->close_section();
2434 } else {
2435 f->open_object_section("device");
2436 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2437 f->close_section();
2438 }
2439 }
2440
2441 void dump_bucket_children(const Item& parent, Formatter* f) {
2442 f->open_array_section("items");
2443 const int max_pos = crush->get_bucket_size(parent.id);
2444 for (int pos = 0; pos < max_pos; pos++) {
2445 int id = crush->get_bucket_item(parent.id, pos);
2446 float weight = crush->get_bucket_item_weightf(parent.id, pos);
2447 dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
2448 }
2449 f->close_section();
2450 }
2451 };
2452 }
2453
2454 void CrushWrapper::dump_tree(
2455 Formatter *f,
2456 const CrushTreeDumper::name_map_t& weight_set_names) const
2457 {
2458 assert(f);
2459 TreeDumper(this, weight_set_names).dump(f);
2460 }
2461
2462 void CrushWrapper::dump_tunables(Formatter *f) const
2463 {
2464 f->dump_int("choose_local_tries", get_choose_local_tries());
2465 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2466 f->dump_int("choose_total_tries", get_choose_total_tries());
2467 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2468 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2469 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2470 f->dump_int("straw_calc_version", get_straw_calc_version());
2471 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2472
2473 // be helpful about it
2474 if (has_jewel_tunables())
2475 f->dump_string("profile", "jewel");
2476 else if (has_hammer_tunables())
2477 f->dump_string("profile", "hammer");
2478 else if (has_firefly_tunables())
2479 f->dump_string("profile", "firefly");
2480 else if (has_bobtail_tunables())
2481 f->dump_string("profile", "bobtail");
2482 else if (has_argonaut_tunables())
2483 f->dump_string("profile", "argonaut");
2484 else
2485 f->dump_string("profile", "unknown");
2486 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
2487 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
2488
2489 // be helpful about minimum version required
2490 f->dump_string("minimum_required_version", get_min_required_version());
2491
2492 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2493 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2494 f->dump_int("has_v2_rules", (int)has_v2_rules());
2495 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2496 f->dump_int("has_v3_rules", (int)has_v3_rules());
2497 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
2498 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2499 f->dump_int("has_v5_rules", (int)has_v5_rules());
2500 }
2501
2502 void CrushWrapper::dump_choose_args(Formatter *f) const
2503 {
2504 f->open_object_section("choose_args");
2505 for (auto c : choose_args) {
2506 crush_choose_arg_map arg_map = c.second;
2507 f->open_array_section(stringify(c.first).c_str());
2508 for (__u32 i = 0; i < arg_map.size; i++) {
2509 crush_choose_arg *arg = &arg_map.args[i];
2510 if (arg->weight_set_size == 0 &&
2511 arg->ids_size == 0)
2512 continue;
2513 f->open_object_section("choose_args");
2514 int bucket_index = i;
2515 f->dump_int("bucket_id", -1-bucket_index);
2516 if (arg->weight_set_size > 0) {
2517 f->open_array_section("weight_set");
2518 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2519 f->open_array_section("weights");
2520 __u32 *weights = arg->weight_set[j].weights;
2521 __u32 size = arg->weight_set[j].size;
2522 for (__u32 k = 0; k < size; k++) {
2523 f->dump_float("weight", (float)weights[k]/(float)0x10000);
2524 }
2525 f->close_section();
2526 }
2527 f->close_section();
2528 }
2529 if (arg->ids_size > 0) {
2530 f->open_array_section("ids");
2531 for (__u32 j = 0; j < arg->ids_size; j++)
2532 f->dump_int("id", arg->ids[j]);
2533 f->close_section();
2534 }
2535 f->close_section();
2536 }
2537 f->close_section();
2538 }
2539 f->close_section();
2540 }
2541
2542 void CrushWrapper::dump_rules(Formatter *f) const
2543 {
2544 for (int i=0; i<get_max_rules(); i++) {
2545 if (!rule_exists(i))
2546 continue;
2547 dump_rule(i, f);
2548 }
2549 }
2550
2551 void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
2552 {
2553 f->open_object_section("rule");
2554 f->dump_int("rule_id", ruleset);
2555 if (get_rule_name(ruleset))
2556 f->dump_string("rule_name", get_rule_name(ruleset));
2557 f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
2558 f->dump_int("type", get_rule_mask_type(ruleset));
2559 f->dump_int("min_size", get_rule_mask_min_size(ruleset));
2560 f->dump_int("max_size", get_rule_mask_max_size(ruleset));
2561 f->open_array_section("steps");
2562 for (int j=0; j<get_rule_len(ruleset); j++) {
2563 f->open_object_section("step");
2564 switch (get_rule_op(ruleset, j)) {
2565 case CRUSH_RULE_NOOP:
2566 f->dump_string("op", "noop");
2567 break;
2568 case CRUSH_RULE_TAKE:
2569 f->dump_string("op", "take");
2570 {
2571 int item = get_rule_arg1(ruleset, j);
2572 f->dump_int("item", item);
2573
2574 const char *name = get_item_name(item);
2575 f->dump_string("item_name", name ? name : "");
2576 }
2577 break;
2578 case CRUSH_RULE_EMIT:
2579 f->dump_string("op", "emit");
2580 break;
2581 case CRUSH_RULE_CHOOSE_FIRSTN:
2582 f->dump_string("op", "choose_firstn");
2583 f->dump_int("num", get_rule_arg1(ruleset, j));
2584 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2585 break;
2586 case CRUSH_RULE_CHOOSE_INDEP:
2587 f->dump_string("op", "choose_indep");
2588 f->dump_int("num", get_rule_arg1(ruleset, j));
2589 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2590 break;
2591 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
2592 f->dump_string("op", "chooseleaf_firstn");
2593 f->dump_int("num", get_rule_arg1(ruleset, j));
2594 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2595 break;
2596 case CRUSH_RULE_CHOOSELEAF_INDEP:
2597 f->dump_string("op", "chooseleaf_indep");
2598 f->dump_int("num", get_rule_arg1(ruleset, j));
2599 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2600 break;
2601 case CRUSH_RULE_SET_CHOOSE_TRIES:
2602 f->dump_string("op", "set_choose_tries");
2603 f->dump_int("num", get_rule_arg1(ruleset, j));
2604 break;
2605 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
2606 f->dump_string("op", "set_chooseleaf_tries");
2607 f->dump_int("num", get_rule_arg1(ruleset, j));
2608 break;
2609 default:
2610 f->dump_int("opcode", get_rule_op(ruleset, j));
2611 f->dump_int("arg1", get_rule_arg1(ruleset, j));
2612 f->dump_int("arg2", get_rule_arg2(ruleset, j));
2613 }
2614 f->close_section();
2615 }
2616 f->close_section();
2617 f->close_section();
2618 }
2619
2620 void CrushWrapper::list_rules(Formatter *f) const
2621 {
2622 for (int rule = 0; rule < get_max_rules(); rule++) {
2623 if (!rule_exists(rule))
2624 continue;
2625 f->dump_string("name", get_rule_name(rule));
2626 }
2627 }
2628
2629 void CrushWrapper::list_rules(ostream *ss) const
2630 {
2631 for (int rule = 0; rule < get_max_rules(); rule++) {
2632 if (!rule_exists(rule))
2633 continue;
2634 *ss << get_rule_name(rule) << "\n";
2635 }
2636 }
2637
2638 class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
2639 public:
2640 typedef CrushTreeDumper::Dumper<TextTable> Parent;
2641
2642 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2643 const CrushTreeDumper::name_map_t& wsnames)
2644 : Parent(crush, wsnames) {}
2645 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2646 const CrushTreeDumper::name_map_t& wsnames,
2647 bool show_shadow)
2648 : Parent(crush, wsnames, show_shadow) {}
2649
2650
2651 void dump(TextTable *tbl) {
2652 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
2653 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
2654 for (auto& p : crush->choose_args) {
2655 if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
2656 tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
2657 } else {
2658 string name;
2659 auto q = weight_set_names.find(p.first);
2660 name = q != weight_set_names.end() ? q->second :
2661 stringify(p.first);
2662 tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
2663 }
2664 }
2665 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
2666 Parent::dump(tbl);
2667 }
2668
2669 protected:
2670 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
2671 *tbl << qi.id
2672 << weightf_t(qi.weight);
2673 for (auto& p : crush->choose_args) {
2674 if (qi.parent < 0) {
2675 const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
2676 int bidx = -1 - qi.parent;
2677 const crush_bucket *b = crush->get_bucket(qi.parent);
2678 if (b &&
2679 bidx < (int)cmap.size &&
2680 cmap.args[bidx].weight_set &&
2681 cmap.args[bidx].weight_set_size >= 1) {
2682 int pos;
2683 for (pos = 0;
2684 pos < (int)cmap.args[bidx].weight_set[0].size &&
2685 b->items[pos] != qi.id;
2686 ++pos) ;
2687 *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] /
2688 (float)0x10000);
2689 continue;
2690 }
2691 }
2692 *tbl << "";
2693 }
2694 ostringstream ss;
2695 for (int k=0; k < qi.depth; k++) {
2696 ss << " ";
2697 }
2698 if (qi.is_bucket()) {
2699 ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
2700 << crush->get_item_name(qi.id);
2701 } else {
2702 ss << "osd." << qi.id;
2703 }
2704 *tbl << ss.str();
2705 *tbl << TextTable::endrow;
2706 }
2707 };
2708
2709
2710 class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
2711 public:
2712 typedef CrushTreeDumper::FormattingDumper Parent;
2713
2714 explicit CrushTreeFormattingDumper(
2715 const CrushWrapper *crush,
2716 const CrushTreeDumper::name_map_t& wsnames)
2717 : Parent(crush, wsnames) {}
2718
2719 explicit CrushTreeFormattingDumper(
2720 const CrushWrapper *crush,
2721 const CrushTreeDumper::name_map_t& wsnames,
2722 bool show_shadow)
2723 : Parent(crush, wsnames, show_shadow) {}
2724
2725 void dump(Formatter *f) {
2726 f->open_array_section("nodes");
2727 Parent::dump(f);
2728 f->close_section();
2729 f->open_array_section("stray");
2730 f->close_section();
2731 }
2732 };
2733
2734
2735 void CrushWrapper::dump_tree(
2736 ostream *out,
2737 Formatter *f,
2738 const CrushTreeDumper::name_map_t& weight_set_names,
2739 bool show_shadow) const
2740 {
2741 if (out) {
2742 TextTable tbl;
2743 CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
2744 *out << tbl;
2745 }
2746 if (f) {
2747 CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
2748 }
2749 }
2750
2751 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
2752 {
2753 o.push_back(new CrushWrapper);
2754 // fixme
2755 }
2756
2757 /**
2758 * Determine the default CRUSH ruleset ID to be used with
2759 * newly created replicated pools.
2760 *
2761 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
2762 */
2763 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
2764 {
2765 int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
2766 if (crush_ruleset < 0) {
2767 crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
2768 } else if (!ruleset_exists(crush_ruleset)) {
2769 crush_ruleset = -1; // match find_first_ruleset() retval
2770 }
2771 return crush_ruleset;
2772 }
2773
2774 bool CrushWrapper::is_valid_crush_name(const string& s)
2775 {
2776 if (s.empty())
2777 return false;
2778 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
2779 if (!(*p == '-') &&
2780 !(*p == '_') &&
2781 !(*p == '.') &&
2782 !(*p >= '0' && *p <= '9') &&
2783 !(*p >= 'A' && *p <= 'Z') &&
2784 !(*p >= 'a' && *p <= 'z'))
2785 return false;
2786 }
2787 return true;
2788 }
2789
2790 bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
2791 const map<string,string>& loc)
2792 {
2793 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
2794 if (!is_valid_crush_name(l->first) ||
2795 !is_valid_crush_name(l->second)) {
2796 ldout(cct, 1) << "loc["
2797 << l->first << "] = '"
2798 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
2799 << dendl;
2800 return false;
2801 }
2802 }
2803 return true;
2804 }
2805
2806 int CrushWrapper::_choose_type_stack(
2807 CephContext *cct,
2808 const vector<pair<int,int>>& stack,
2809 const set<int>& overfull,
2810 const vector<int>& underfull,
2811 const vector<int>& orig,
2812 vector<int>::const_iterator& i,
2813 set<int>& used,
2814 vector<int> *pw) const
2815 {
2816 vector<int> w = *pw;
2817 vector<int> o;
2818
2819 ldout(cct, 10) << __func__ << " stack " << stack
2820 << " orig " << orig
2821 << " at " << *i
2822 << " pw " << *pw
2823 << dendl;
2824
2825 vector<int> cumulative_fanout(stack.size());
2826 int f = 1;
2827 for (int j = (int)stack.size() - 1; j >= 0; --j) {
2828 cumulative_fanout[j] = f;
2829 f *= stack[j].second;
2830 }
2831 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
2832 << dendl;
2833
2834 // identify underful targets for each intermediate level.
2835 // this serves two purposes:
2836 // 1. we can tell when we are selecting a bucket that does not have any underfull
2837 // devices beneath it. that means that if the current input includes an overfull
2838 // device, we won't be able to find an underfull device with this parent to
2839 // swap for it.
2840 // 2. when we decide we should reject a bucket due to the above, this list gives us
2841 // a list of peers to consider that *do* have underfull devices available.. (we
2842 // are careful to pick one that has the same parent.)
2843 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
2844 underfull_buckets.resize(stack.size() - 1);
2845 for (auto osd : underfull) {
2846 int item = osd;
2847 for (int j = (int)stack.size() - 2; j >= 0; --j) {
2848 int type = stack[j].first;
2849 item = get_parent_of_type(item, type);
2850 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
2851 << " is " << item << dendl;
2852 underfull_buckets[j].insert(item);
2853 }
2854 }
2855 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
2856
2857 for (unsigned j = 0; j < stack.size(); ++j) {
2858 int type = stack[j].first;
2859 int fanout = stack[j].second;
2860 int cum_fanout = cumulative_fanout[j];
2861 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
2862 << " cumulative " << cum_fanout
2863 << " w " << w << dendl;
2864 vector<int> o;
2865 auto tmpi = i;
2866 for (auto from : w) {
2867 ldout(cct, 10) << " from " << from << dendl;
2868 // identify leaves under each choice. we use this to check whether any of these
2869 // leaves are overfull. (if so, we need to make sure there are underfull candidates
2870 // to swap for them.)
2871 vector<set<int>> leaves;
2872 leaves.resize(fanout);
2873 for (int pos = 0; pos < fanout; ++pos) {
2874 if (type > 0) {
2875 // non-leaf
2876 int item = get_parent_of_type(*tmpi, type);
2877 o.push_back(item);
2878 int n = cum_fanout;
2879 while (n-- && tmpi != orig.end()) {
2880 leaves[pos].insert(*tmpi++);
2881 }
2882 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
2883 << " of type " << type << " over leaves " << leaves[pos] << dendl;
2884 } else {
2885 // leaf
2886 bool replaced = false;
2887 if (overfull.count(*i)) {
2888 for (auto item : underfull) {
2889 ldout(cct, 10) << __func__ << " pos " << pos
2890 << " was " << *i << " considering " << item
2891 << dendl;
2892 if (used.count(item)) {
2893 ldout(cct, 20) << __func__ << " in used " << used << dendl;
2894 continue;
2895 }
2896 if (!subtree_contains(from, item)) {
2897 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
2898 continue;
2899 }
2900 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
2901 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
2902 continue;
2903 }
2904 o.push_back(item);
2905 used.insert(item);
2906 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
2907 << *i << " -> " << item << dendl;
2908 replaced = true;
2909 ++i;
2910 break;
2911 }
2912 }
2913 if (!replaced) {
2914 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
2915 << dendl;
2916 o.push_back(*i);
2917 ++i;
2918 }
2919 if (i == orig.end()) {
2920 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
2921 break;
2922 }
2923 }
2924 }
2925 if (j + 1 < stack.size()) {
2926 // check if any buckets have overfull leaves but no underfull candidates
2927 for (int pos = 0; pos < fanout; ++pos) {
2928 if (underfull_buckets[j].count(o[pos]) == 0) {
2929 // are any leaves overfull?
2930 bool any_overfull = false;
2931 for (auto osd : leaves[pos]) {
2932 if (overfull.count(osd)) {
2933 any_overfull = true;
2934 }
2935 }
2936 if (any_overfull) {
2937 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
2938 << ">0 leaves " << leaves[pos] << " is overfull; alts "
2939 << underfull_buckets[j]
2940 << dendl;
2941 for (auto alt : underfull_buckets[j]) {
2942 if (std::find(o.begin(), o.end(), alt) == o.end()) {
2943 // see if alt has the same parent
2944 if (j == 0 ||
2945 get_parent_of_type(o[pos], stack[j-1].first) ==
2946 get_parent_of_type(alt, stack[j-1].first)) {
2947 if (j)
2948 ldout(cct, 10) << " replacing " << o[pos]
2949 << " (which has no underfull leaves) with " << alt
2950 << " (same parent "
2951 << get_parent_of_type(alt, stack[j-1].first) << " type "
2952 << type << ")" << dendl;
2953 else
2954 ldout(cct, 10) << " replacing " << o[pos]
2955 << " (which has no underfull leaves) with " << alt
2956 << " (first level)" << dendl;
2957 o[pos] = alt;
2958 break;
2959 } else {
2960 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
2961 << " has different parent, skipping" << dendl;
2962 }
2963 }
2964 }
2965 }
2966 }
2967 }
2968 }
2969 if (i == orig.end()) {
2970 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
2971 break;
2972 }
2973 }
2974 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
2975 w.swap(o);
2976 }
2977 *pw = w;
2978 return 0;
2979 }
2980
2981 int CrushWrapper::try_remap_rule(
2982 CephContext *cct,
2983 int ruleno,
2984 int maxout,
2985 const set<int>& overfull,
2986 const vector<int>& underfull,
2987 const vector<int>& orig,
2988 vector<int> *out) const
2989 {
2990 const crush_map *map = crush;
2991 const crush_rule *rule = get_rule(ruleno);
2992 assert(rule);
2993
2994 ldout(cct, 10) << __func__ << " ruleno " << ruleno
2995 << " numrep " << maxout << " overfull " << overfull
2996 << " underfull " << underfull << " orig " << orig
2997 << dendl;
2998 vector<int> w; // working set
2999 out->clear();
3000
3001 auto i = orig.begin();
3002 set<int> used;
3003
3004 vector<pair<int,int>> type_stack; // (type, fan-out)
3005
3006 for (unsigned step = 0; step < rule->len; ++step) {
3007 const crush_rule_step *curstep = &rule->steps[step];
3008 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
3009 switch (curstep->op) {
3010 case CRUSH_RULE_TAKE:
3011 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
3012 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
3013 map->buckets[-1-curstep->arg1])) {
3014 w.clear();
3015 w.push_back(curstep->arg1);
3016 ldout(cct, 10) << __func__ << " take " << w << dendl;
3017 } else {
3018 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
3019 }
3020 break;
3021
3022 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3023 case CRUSH_RULE_CHOOSELEAF_INDEP:
3024 {
3025 int numrep = curstep->arg1;
3026 int type = curstep->arg2;
3027 if (numrep <= 0)
3028 numrep += maxout;
3029 type_stack.push_back(make_pair(type, numrep));
3030 type_stack.push_back(make_pair(0, 1));
3031 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3032 i, used, &w);
3033 if (r < 0)
3034 return r;
3035 type_stack.clear();
3036 }
3037 break;
3038
3039 case CRUSH_RULE_CHOOSE_FIRSTN:
3040 case CRUSH_RULE_CHOOSE_INDEP:
3041 {
3042 int numrep = curstep->arg1;
3043 int type = curstep->arg2;
3044 if (numrep <= 0)
3045 numrep += maxout;
3046 type_stack.push_back(make_pair(type, numrep));
3047 }
3048 break;
3049
3050 case CRUSH_RULE_EMIT:
3051 ldout(cct, 10) << " emit " << w << dendl;
3052 if (!type_stack.empty()) {
3053 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3054 i, used, &w);
3055 if (r < 0)
3056 return r;
3057 type_stack.clear();
3058 }
3059 for (auto item : w) {
3060 out->push_back(item);
3061 }
3062 w.clear();
3063 break;
3064
3065 default:
3066 // ignore
3067 break;
3068 }
3069 }
3070
3071 return 0;
3072 }
3073
3074
3075 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3076 CephContext *cct,
3077 crush_choose_arg_map cmap,
3078 int bucketid,
3079 int id,
3080 const vector<int>& weight,
3081 ostream *ss)
3082 {
3083 int changed = 0;
3084 int bidx = -1 - bucketid;
3085 crush_bucket *b = crush->buckets[bidx];
3086 if (bidx >= (int)cmap.size) {
3087 if (ss)
3088 *ss << "no weight-set for bucket " << b->id;
3089 ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
3090 << dendl;
3091 return 0;
3092 }
3093 crush_choose_arg *carg = &cmap.args[bidx];
3094 if (carg->weight_set == NULL) {
3095 if (ss)
3096 *ss << "no weight-set for bucket " << b->id;
3097 ldout(cct, 10) << __func__ << " no weight_set for bucket " << b->id
3098 << dendl;
3099 return 0;
3100 }
3101 if (carg->weight_set_size != weight.size()) {
3102 if (ss)
3103 *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
3104 ldout(cct, 10) << __func__ << " weight_set_size != " << weight.size()
3105 << " for bucket " << b->id << dendl;
3106 return 0;
3107 }
3108 for (unsigned i = 0; i < b->size; i++) {
3109 if (b->items[i] == id) {
3110 for (unsigned j = 0; j < weight.size(); ++j) {
3111 carg->weight_set[j].weights[i] = weight[j];
3112 }
3113 ldout(cct, 5) << __func__ << " set " << id << " to " << weight
3114 << " in bucket " << b->id << dendl;
3115 changed++;
3116 }
3117 }
3118 if (changed) {
3119 vector<int> bucket_weight(weight.size(), 0);
3120 for (unsigned i = 0; i < b->size; i++) {
3121 for (unsigned j = 0; j < weight.size(); ++j) {
3122 bucket_weight[j] += carg->weight_set[j].weights[i];
3123 }
3124 }
3125 choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
3126 }
3127 return changed;
3128 }
3129
3130 int CrushWrapper::choose_args_adjust_item_weight(
3131 CephContext *cct,
3132 crush_choose_arg_map cmap,
3133 int id,
3134 const vector<int>& weight,
3135 ostream *ss)
3136 {
3137 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
3138 int changed = 0;
3139 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
3140 crush_bucket *b = crush->buckets[bidx];
3141 if (b == nullptr) {
3142 continue;
3143 }
3144 changed += _choose_args_adjust_item_weight_in_bucket(
3145 cct, cmap, b->id, id, weight, ss);
3146 }
3147 if (!changed) {
3148 if (ss)
3149 *ss << "item " << id << " not found in crush map";
3150 return -ENOENT;
3151 }
3152 return changed;
3153 }