]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/CrushWrapper.cc
update sources to v12.1.3
[ceph.git] / ceph / src / crush / CrushWrapper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
10
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
13
14 #define dout_subsys ceph_subsys_crush
15
16 bool CrushWrapper::has_legacy_rulesets() const
17 {
18 for (unsigned i=0; i<crush->max_rules; i++) {
19 crush_rule *r = crush->rules[i];
20 if (r &&
21 r->mask.ruleset != i) {
22 return true;
23 }
24 }
25 return false;
26 }
27
28 int CrushWrapper::renumber_rules_by_ruleset()
29 {
30 int max_ruleset = 0;
31 for (unsigned i=0; i<crush->max_rules; i++) {
32 crush_rule *r = crush->rules[i];
33 if (r && r->mask.ruleset >= max_ruleset) {
34 max_ruleset = r->mask.ruleset + 1;
35 }
36 }
37 struct crush_rule **newrules =
38 (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
39 for (unsigned i=0; i<crush->max_rules; i++) {
40 crush_rule *r = crush->rules[i];
41 if (!r)
42 continue;
43 if (newrules[r->mask.ruleset]) {
44 // collision, we can't do it.
45 free(newrules);
46 return -EINVAL;
47 }
48 newrules[r->mask.ruleset] = r;
49 }
50
51 // success, swap!
52 free(crush->rules);
53 crush->rules = newrules;
54 crush->max_rules = max_ruleset;
55 return 0;
56 }
57
58 bool CrushWrapper::has_multirule_rulesets() const
59 {
60 for (unsigned i=0; i<crush->max_rules; i++) {
61 crush_rule *r = crush->rules[i];
62 if (!r)
63 continue;
64 for (unsigned j=i+1; j<crush->max_rules; j++) {
65 crush_rule *s = crush->rules[j];
66 if (!s)
67 continue;
68 if (r->mask.ruleset == s->mask.ruleset)
69 return true;
70 }
71 }
72 return false;
73 }
74
75 bool CrushWrapper::has_non_straw2_buckets() const
76 {
77 for (int i=0; i<crush->max_buckets; ++i) {
78 crush_bucket *b = crush->buckets[i];
79 if (!b)
80 continue;
81 if (b->alg != CRUSH_BUCKET_STRAW2)
82 return true;
83 }
84 return false;
85 }
86
87 bool CrushWrapper::has_v2_rules() const
88 {
89 for (unsigned i=0; i<crush->max_rules; i++) {
90 if (is_v2_rule(i)) {
91 return true;
92 }
93 }
94 return false;
95 }
96
97 bool CrushWrapper::is_v2_rule(unsigned ruleid) const
98 {
99 // check rule for use of indep or new SET_* rule steps
100 if (ruleid >= crush->max_rules)
101 return false;
102 crush_rule *r = crush->rules[ruleid];
103 if (!r)
104 return false;
105 for (unsigned j=0; j<r->len; j++) {
106 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
107 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
108 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
109 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
110 return true;
111 }
112 }
113 return false;
114 }
115
116 bool CrushWrapper::has_v3_rules() const
117 {
118 for (unsigned i=0; i<crush->max_rules; i++) {
119 if (is_v3_rule(i)) {
120 return true;
121 }
122 }
123 return false;
124 }
125
126 bool CrushWrapper::is_v3_rule(unsigned ruleid) const
127 {
128 // check rule for use of SET_CHOOSELEAF_VARY_R step
129 if (ruleid >= crush->max_rules)
130 return false;
131 crush_rule *r = crush->rules[ruleid];
132 if (!r)
133 return false;
134 for (unsigned j=0; j<r->len; j++) {
135 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
136 return true;
137 }
138 }
139 return false;
140 }
141
142 bool CrushWrapper::has_v4_buckets() const
143 {
144 for (int i=0; i<crush->max_buckets; ++i) {
145 crush_bucket *b = crush->buckets[i];
146 if (!b)
147 continue;
148 if (b->alg == CRUSH_BUCKET_STRAW2)
149 return true;
150 }
151 return false;
152 }
153
154 bool CrushWrapper::has_v5_rules() const
155 {
156 for (unsigned i=0; i<crush->max_rules; i++) {
157 if (is_v5_rule(i)) {
158 return true;
159 }
160 }
161 return false;
162 }
163
164 bool CrushWrapper::is_v5_rule(unsigned ruleid) const
165 {
166 // check rule for use of SET_CHOOSELEAF_STABLE step
167 if (ruleid >= crush->max_rules)
168 return false;
169 crush_rule *r = crush->rules[ruleid];
170 if (!r)
171 return false;
172 for (unsigned j=0; j<r->len; j++) {
173 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
174 return true;
175 }
176 }
177 return false;
178 }
179
180 bool CrushWrapper::has_choose_args() const
181 {
182 return !choose_args.empty();
183 }
184
185 bool CrushWrapper::has_incompat_choose_args() const
186 {
187 if (choose_args.empty())
188 return false;
189 if (choose_args.size() > 1)
190 return true;
191 if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
192 return true;
193 crush_choose_arg_map arg_map = choose_args.begin()->second;
194 for (__u32 i = 0; i < arg_map.size; i++) {
195 crush_choose_arg *arg = &arg_map.args[i];
196 if (arg->weight_set_size == 0 &&
197 arg->ids_size == 0)
198 continue;
199 if (arg->weight_set_size != 1)
200 return true;
201 if (arg->ids_size != 0)
202 return true;
203 }
204 return false;
205 }
206
207 int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
208 {
209 if (!item_exists(i))
210 return -EINVAL;
211 string name = get_item_name(i);
212 size_t pos = name.find("~");
213 if (pos == string::npos) {
214 *idout = i;
215 *classout = -1;
216 return 0;
217 }
218 string name_no_class = name.substr(0, pos);
219 if (!name_exists(name_no_class))
220 return -ENOENT;
221 string class_name = name.substr(pos + 1);
222 if (!class_exists(class_name))
223 return -ENOENT;
224 *idout = get_item_id(name_no_class);
225 *classout = get_class_id(class_name);
226 return 0;
227 }
228
229 int CrushWrapper::can_rename_item(const string& srcname,
230 const string& dstname,
231 ostream *ss) const
232 {
233 if (name_exists(srcname)) {
234 if (name_exists(dstname)) {
235 *ss << "dstname = '" << dstname << "' already exists";
236 return -EEXIST;
237 }
238 if (is_valid_crush_name(dstname)) {
239 return 0;
240 } else {
241 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
242 return -EINVAL;
243 }
244 } else {
245 if (name_exists(dstname)) {
246 *ss << "srcname = '" << srcname << "' does not exist "
247 << "and dstname = '" << dstname << "' already exists";
248 return -EALREADY;
249 } else {
250 *ss << "srcname = '" << srcname << "' does not exist";
251 return -ENOENT;
252 }
253 }
254 }
255
256 int CrushWrapper::rename_item(const string& srcname,
257 const string& dstname,
258 ostream *ss)
259 {
260 int ret = can_rename_item(srcname, dstname, ss);
261 if (ret < 0)
262 return ret;
263 int oldid = get_item_id(srcname);
264 return set_item_name(oldid, dstname);
265 }
266
267 int CrushWrapper::can_rename_bucket(const string& srcname,
268 const string& dstname,
269 ostream *ss) const
270 {
271 int ret = can_rename_item(srcname, dstname, ss);
272 if (ret)
273 return ret;
274 int srcid = get_item_id(srcname);
275 if (srcid >= 0) {
276 *ss << "srcname = '" << srcname << "' is not a bucket "
277 << "because its id = " << srcid << " is >= 0";
278 return -ENOTDIR;
279 }
280 return 0;
281 }
282
283 int CrushWrapper::rename_bucket(const string& srcname,
284 const string& dstname,
285 ostream *ss)
286 {
287 int ret = can_rename_bucket(srcname, dstname, ss);
288 if (ret < 0)
289 return ret;
290 int oldid = get_item_id(srcname);
291 return set_item_name(oldid, dstname);
292 }
293
294 void CrushWrapper::find_takes(set<int>& roots) const
295 {
296 for (unsigned i=0; i<crush->max_rules; i++) {
297 crush_rule *r = crush->rules[i];
298 if (!r)
299 continue;
300 for (unsigned j=0; j<r->len; j++) {
301 if (r->steps[j].op == CRUSH_RULE_TAKE)
302 roots.insert(r->steps[j].arg1);
303 }
304 }
305 }
306
307 void CrushWrapper::find_roots(set<int>& roots) const
308 {
309 for (int i = 0; i < crush->max_buckets; i++) {
310 if (!crush->buckets[i])
311 continue;
312 crush_bucket *b = crush->buckets[i];
313 if (!_search_item_exists(b->id))
314 roots.insert(b->id);
315 }
316 }
317
318 bool CrushWrapper::subtree_contains(int root, int item) const
319 {
320 if (root == item)
321 return true;
322
323 if (root >= 0)
324 return false; // root is a leaf
325
326 const crush_bucket *b = get_bucket(root);
327 if (IS_ERR(b))
328 return false;
329
330 for (unsigned j=0; j<b->size; j++) {
331 if (subtree_contains(b->items[j], item))
332 return true;
333 }
334 return false;
335 }
336
337 bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
338 {
339 // last instance?
340 if (_search_item_exists(item)) {
341 return false;
342 }
343 if (item < 0 && _bucket_is_in_use(item)) {
344 return false;
345 }
346
347 if (item < 0 && !unlink_only) {
348 crush_bucket *t = get_bucket(item);
349 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
350 crush_remove_bucket(crush, t);
351 if (class_bucket.count(item) != 0)
352 class_bucket.erase(item);
353 class_remove_item(item);
354 }
355 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
356 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
357 name_map.erase(item);
358 have_rmaps = false;
359 if (item >= 0 && !unlink_only) {
360 class_remove_item(item);
361 }
362 }
363 rebuild_roots_with_classes();
364 return true;
365 }
366
367 int CrushWrapper::remove_root(int item, bool unused)
368 {
369 if (unused && _bucket_is_in_use(item))
370 return 0;
371
372 crush_bucket *b = get_bucket(item);
373 if (IS_ERR(b)) {
374 // should be idempotent
375 // e.g.: we use 'crush link' to link same host into
376 // different roots, which as a result can cause different
377 // shadow trees reference same hosts too. This means
378 // we may need to destory the same buckets(hosts, racks, etc.)
379 // multiple times during rebuilding all shadow trees.
380 return 0;
381 }
382
383 for (unsigned n = 0; n < b->size; n++) {
384 if (b->items[n] >= 0)
385 continue;
386 int r = remove_root(b->items[n], unused);
387 if (r < 0)
388 return r;
389 }
390
391 crush_remove_bucket(crush, b);
392 if (name_map.count(item) != 0) {
393 name_map.erase(item);
394 have_rmaps = false;
395 }
396 if (class_bucket.count(item) != 0)
397 class_bucket.erase(item);
398 class_remove_item(item);
399 return 0;
400 }
401
402 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
403 {
404 ldout(cct, 5) << "remove_item " << item
405 << (unlink_only ? " unlink_only":"") << dendl;
406
407 int ret = -ENOENT;
408
409 if (item < 0 && !unlink_only) {
410 crush_bucket *t = get_bucket(item);
411 if (IS_ERR(t)) {
412 ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
413 << dendl;
414 return -ENOENT;
415 }
416
417 if (t->size) {
418 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
419 << " items, not empty" << dendl;
420 return -ENOTEMPTY;
421 }
422 if (_bucket_is_in_use(item)) {
423 return -EBUSY;
424 }
425 }
426
427 for (int i = 0; i < crush->max_buckets; i++) {
428 if (!crush->buckets[i])
429 continue;
430 crush_bucket *b = crush->buckets[i];
431
432 for (unsigned i=0; i<b->size; ++i) {
433 int id = b->items[i];
434 if (id == item) {
435 ldout(cct, 5) << "remove_item removing item " << item
436 << " from bucket " << b->id << dendl;
437 for (auto& p : choose_args) {
438 // weight down each weight-set to 0 before we remove the item
439 vector<int> weightv(get_choose_args_positions(p.second), 0);
440 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
441 }
442 bucket_remove_item(b, item);
443 adjust_item_weight(cct, b->id, b->weight);
444 ret = 0;
445 }
446 }
447 }
448
449 if (_maybe_remove_last_instance(cct, item, unlink_only))
450 ret = 0;
451
452 return ret;
453 }
454
455 bool CrushWrapper::_search_item_exists(int item) const
456 {
457 for (int i = 0; i < crush->max_buckets; i++) {
458 if (!crush->buckets[i])
459 continue;
460 crush_bucket *b = crush->buckets[i];
461 for (unsigned j=0; j<b->size; ++j) {
462 if (b->items[j] == item)
463 return true;
464 }
465 }
466 return false;
467 }
468
469 bool CrushWrapper::_bucket_is_in_use(int item)
470 {
471 for (auto &i : class_bucket)
472 for (auto &j : i.second)
473 if (j.second == item)
474 return true;
475 for (unsigned i = 0; i < crush->max_rules; ++i) {
476 crush_rule *r = crush->rules[i];
477 if (!r)
478 continue;
479 for (unsigned j = 0; j < r->len; ++j) {
480 if (r->steps[j].op == CRUSH_RULE_TAKE) {
481 int step_item = r->steps[j].arg1;
482 int original_item;
483 int c;
484 int res = split_id_class(step_item, &original_item, &c);
485 if (res < 0)
486 return false;
487 if (step_item == item || original_item == item)
488 return true;
489 }
490 }
491 }
492 return false;
493 }
494
495 int CrushWrapper::_remove_item_under(
496 CephContext *cct, int item, int ancestor, bool unlink_only)
497 {
498 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
499 << (unlink_only ? " unlink_only":"") << dendl;
500
501 if (ancestor >= 0) {
502 return -EINVAL;
503 }
504
505 if (!bucket_exists(ancestor))
506 return -EINVAL;
507
508 int ret = -ENOENT;
509
510 crush_bucket *b = get_bucket(ancestor);
511 for (unsigned i=0; i<b->size; ++i) {
512 int id = b->items[i];
513 if (id == item) {
514 ldout(cct, 5) << "_remove_item_under removing item " << item
515 << " from bucket " << b->id << dendl;
516 bucket_remove_item(b, item);
517 for (auto& p : choose_args) {
518 // weight down each weight-set to 0 before we remove the item
519 vector<int> weightv(get_choose_args_positions(p.second), 0);
520 _choose_args_adjust_item_weight_in_bucket(
521 cct, p.second, b->id, item, weightv, nullptr);
522 }
523 adjust_item_weight(cct, b->id, b->weight);
524 ret = 0;
525 } else if (id < 0) {
526 int r = remove_item_under(cct, item, id, unlink_only);
527 if (r == 0)
528 ret = 0;
529 }
530 }
531 return ret;
532 }
533
534 int CrushWrapper::remove_item_under(
535 CephContext *cct, int item, int ancestor, bool unlink_only)
536 {
537 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
538 << (unlink_only ? " unlink_only":"") << dendl;
539
540 if (!unlink_only && _bucket_is_in_use(item)) {
541 return -EBUSY;
542 }
543
544 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
545 if (ret < 0)
546 return ret;
547
548 if (item < 0 && !unlink_only) {
549 crush_bucket *t = get_bucket(item);
550 if (IS_ERR(t)) {
551 ldout(cct, 1) << "remove_item_under bucket " << item
552 << " does not exist" << dendl;
553 return -ENOENT;
554 }
555
556 if (t->size) {
557 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
558 << " items, not empty" << dendl;
559 return -ENOTEMPTY;
560 }
561 }
562
563 if (_maybe_remove_last_instance(cct, item, unlink_only))
564 ret = 0;
565
566 return ret;
567 }
568
569 int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
570 const std::multimap<string,string>& loc)
571 {
572 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
573 if (!item_exists(id))
574 return -ENOENT;
575 map<string,string> id_loc = get_full_location(id);
576 ldout(cct, 20) << " id is at " << id_loc << dendl;
577
578 for (map<int,string>::const_iterator p = type_map.begin();
579 p != type_map.end();
580 ++p) {
581 map<string,string>::iterator ip = id_loc.find(p->second);
582 if (ip == id_loc.end())
583 continue;
584 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
585 q != loc.end();
586 ++q) {
587 if (q->first != p->second)
588 break;
589 if (q->second == ip->second)
590 return p->first;
591 }
592 }
593 return -ERANGE;
594 }
595
596 int CrushWrapper::parse_loc_map(const std::vector<string>& args,
597 std::map<string,string> *ploc)
598 {
599 ploc->clear();
600 for (unsigned i = 0; i < args.size(); ++i) {
601 const char *s = args[i].c_str();
602 const char *pos = strchr(s, '=');
603 if (!pos)
604 return -EINVAL;
605 string key(s, 0, pos-s);
606 string value(pos+1);
607 if (value.length())
608 (*ploc)[key] = value;
609 else
610 return -EINVAL;
611 }
612 return 0;
613 }
614
615 int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
616 std::multimap<string,string> *ploc)
617 {
618 ploc->clear();
619 for (unsigned i = 0; i < args.size(); ++i) {
620 const char *s = args[i].c_str();
621 const char *pos = strchr(s, '=');
622 if (!pos)
623 return -EINVAL;
624 string key(s, 0, pos-s);
625 string value(pos+1);
626 if (value.length())
627 ploc->insert(make_pair(key, value));
628 else
629 return -EINVAL;
630 }
631 return 0;
632 }
633
634 bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
635 int *weight)
636 {
637 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
638
639 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
640 // ignore device
641 if (p->first == 0)
642 continue;
643
644 // ignore types that aren't specified in loc
645 map<string,string>::const_iterator q = loc.find(p->second);
646 if (q == loc.end()) {
647 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
648 << type_map << ")" << dendl;
649 continue;
650 }
651
652 if (!name_exists(q->second)) {
653 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
654 return false;
655 }
656
657 int id = get_item_id(q->second);
658 if (id >= 0) {
659 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
660 << " is a device, not bucket" << dendl;
661 return false;
662 }
663
664 assert(bucket_exists(id));
665 crush_bucket *b = get_bucket(id);
666
667 // see if item exists in this bucket
668 for (unsigned j=0; j<b->size; j++) {
669 if (b->items[j] == item) {
670 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
671 if (weight)
672 *weight = crush_get_bucket_item_weight(b, j);
673 return true;
674 }
675 }
676 return false;
677 }
678
679 ldout(cct, 1) << "check_item_loc item " << item << " loc " << loc << dendl;
680 return false;
681 }
682
683 map<string, string> CrushWrapper::get_full_location(int id)
684 {
685 vector<pair<string, string> > full_location_ordered;
686 map<string,string> full_location;
687
688 get_full_location_ordered(id, full_location_ordered);
689
690 std::copy(full_location_ordered.begin(),
691 full_location_ordered.end(),
692 std::inserter(full_location, full_location.begin()));
693
694 return full_location;
695 }
696
697 int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path)
698 {
699 if (!item_exists(id))
700 return -ENOENT;
701 int cur = id;
702 int ret;
703 while (true) {
704 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
705 if (ret != 0)
706 break;
707 path.push_back(parent_coord);
708 cur = get_item_id(parent_coord.second);
709 }
710 return 0;
711 }
712
713 string CrushWrapper::get_full_location_ordered_string(int id)
714 {
715 vector<pair<string, string> > full_location_ordered;
716 string full_location;
717 get_full_location_ordered(id, full_location_ordered);
718 reverse(begin(full_location_ordered), end(full_location_ordered));
719 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
720 full_location = full_location + i->first + "=" + i->second;
721 if (i != full_location_ordered.end() - 1) {
722 full_location = full_location + ",";
723 }
724 }
725 return full_location;
726 }
727
728 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
729 {
730 map<int,string> parent_hierarchy;
731 pair<string, string> parent_coord = get_immediate_parent(id);
732 int parent_id;
733
734 // get the integer type for id and create a counter from there
735 int type_counter = get_bucket_type(id);
736
737 // if we get a negative type then we can assume that we have an OSD
738 // change behavior in get_item_type FIXME
739 if (type_counter < 0)
740 type_counter = 0;
741
742 // read the type map and get the name of the type with the largest ID
743 int high_type = 0;
744 for (map<int, string>::iterator it = type_map.begin(); it != type_map.end(); ++it){
745 if ( (*it).first > high_type )
746 high_type = (*it).first;
747 }
748
749 parent_id = get_item_id(parent_coord.second);
750
751 while (type_counter < high_type) {
752 type_counter++;
753 parent_hierarchy[ type_counter ] = parent_coord.first;
754
755 if (type_counter < high_type){
756 // get the coordinate information for the next parent
757 parent_coord = get_immediate_parent(parent_id);
758 parent_id = get_item_id(parent_coord.second);
759 }
760 }
761
762 return parent_hierarchy;
763 }
764
765 int CrushWrapper::get_children(int id, list<int> *children)
766 {
767 // leaf?
768 if (id >= 0) {
769 return 0;
770 }
771
772 crush_bucket *b = get_bucket(id);
773 if (IS_ERR(b)) {
774 return -ENOENT;
775 }
776
777 for (unsigned n=0; n<b->size; n++) {
778 children->push_back(b->items[n]);
779 }
780 return b->size;
781 }
782
783 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
784 {
785 assert(leaves);
786
787 // Already leaf?
788 if (id >= 0) {
789 leaves->push_back(id);
790 return 0;
791 }
792
793 crush_bucket *b = get_bucket(id);
794 if (IS_ERR(b)) {
795 return -ENOENT;
796 }
797
798 for (unsigned n = 0; n < b->size; n++) {
799 if (b->items[n] >= 0) {
800 leaves->push_back(b->items[n]);
801 } else {
802 // is a bucket, do recursive call
803 int r = _get_leaves(b->items[n], leaves);
804 if (r < 0) {
805 return r;
806 }
807 }
808 }
809
810 return 0; // all is well
811 }
812
813 int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
814 {
815 assert(leaves);
816 leaves->clear();
817
818 if (!name_exists(name)) {
819 return -ENOENT;
820 }
821
822 int id = get_item_id(name);
823 if (id >= 0) {
824 // already leaf
825 leaves->insert(id);
826 return 0;
827 }
828
829 list<int> unordered;
830 int r = _get_leaves(id, &unordered);
831 if (r < 0) {
832 return r;
833 }
834
835 for (auto &p : unordered) {
836 leaves->insert(p);
837 }
838
839 return 0;
840 }
841
842 int CrushWrapper::insert_item(
843 CephContext *cct, int item, float weight, string name,
844 const map<string,string>& loc) // typename -> bucketname
845 {
846 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
847 << " name " << name << " loc " << loc << dendl;
848
849 if (!is_valid_crush_name(name))
850 return -EINVAL;
851
852 if (!is_valid_crush_loc(cct, loc))
853 return -EINVAL;
854
855 int r = validate_weightf(weight);
856 if (r < 0) {
857 return r;
858 }
859
860 if (name_exists(name)) {
861 if (get_item_id(name) != item) {
862 ldout(cct, 10) << "device name '" << name << "' already exists as id "
863 << get_item_id(name) << dendl;
864 return -EEXIST;
865 }
866 } else {
867 set_item_name(item, name);
868 }
869
870 int cur = item;
871
872 // create locations if locations don't exist and add child in
873 // location with 0 weight the more detail in the insert_item method
874 // declaration in CrushWrapper.h
875 for (auto p = type_map.begin(); p != type_map.end(); ++p) {
876 // ignore device type
877 if (p->first == 0)
878 continue;
879
880 // skip types that are unspecified
881 map<string,string>::const_iterator q = loc.find(p->second);
882 if (q == loc.end()) {
883 ldout(cct, 2) << "warning: did not specify location for '"
884 << p->second << "' level (levels are "
885 << type_map << ")" << dendl;
886 continue;
887 }
888
889 if (!name_exists(q->second)) {
890 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
891 int empty = 0, newid;
892 int r = add_bucket(0, 0,
893 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
894 if (r < 0) {
895 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
896 << dendl;
897 return r;
898 }
899 set_item_name(newid, q->second);
900
901 cur = newid;
902 continue;
903 }
904
905 // add to an existing bucket
906 int id = get_item_id(q->second);
907 if (!bucket_exists(id)) {
908 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
909 return -EINVAL;
910 }
911
912 // check that we aren't creating a cycle.
913 if (subtree_contains(id, cur)) {
914 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
915 << id << dendl;
916 return -EINVAL;
917 }
918
919 // we have done sanity check above
920 crush_bucket *b = get_bucket(id);
921
922 if (p->first != b->type) {
923 ldout(cct, 1) << "insert_item existing bucket has type "
924 << "'" << type_map[b->type] << "' != "
925 << "'" << type_map[p->first] << "'" << dendl;
926 return -EINVAL;
927 }
928
929 // are we forming a loop?
930 if (subtree_contains(cur, b->id)) {
931 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
932 << "; cannot form loop" << dendl;
933 return -ELOOP;
934 }
935
936 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
937 << " to bucket " << id << dendl;
938 int r = bucket_add_item(b, cur, 0);
939 assert (!r);
940 break;
941 }
942
943 // adjust the item's weight in location
944 if (adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
945 if (item >= crush->max_devices) {
946 crush->max_devices = item + 1;
947 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
948 << dendl;
949 }
950 r = rebuild_roots_with_classes();
951 if (r < 0) {
952 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
953 << cpp_strerror(r) << dendl;
954 return r;
955 }
956 return 0;
957 }
958
959 ldout(cct, 1) << "error: didn't find anywhere to add item " << item
960 << " in " << loc << dendl;
961 return -EINVAL;
962 }
963
964
965 int CrushWrapper::move_bucket(
966 CephContext *cct, int id, const map<string,string>& loc)
967 {
968 // sorry this only works for buckets
969 if (id >= 0)
970 return -EINVAL;
971
972 if (!item_exists(id))
973 return -ENOENT;
974
975 // get the name of the bucket we are trying to move for later
976 string id_name = get_item_name(id);
977
978 // detach the bucket
979 int bucket_weight = detach_bucket(cct, id);
980
981 // insert the bucket back into the hierarchy
982 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
983 }
984
985 int CrushWrapper::detach_bucket(CephContext *cct, int item)
986 {
987 if (!crush)
988 return (-EINVAL);
989
990 if (item >= 0)
991 return (-EINVAL);
992
993 // check that the bucket that we want to detach exists
994 assert(bucket_exists(item));
995
996 // get the bucket's weight
997 crush_bucket *b = get_bucket(item);
998 unsigned bucket_weight = b->weight;
999
1000 // get where the bucket is located
1001 pair<string, string> bucket_location = get_immediate_parent(item);
1002
1003 // get the id of the parent bucket
1004 int parent_id = get_item_id(bucket_location.second);
1005
1006 // get the parent bucket
1007 crush_bucket *parent_bucket = get_bucket(parent_id);
1008
1009 if (!IS_ERR(parent_bucket)) {
1010 // zero out the bucket weight
1011 bucket_adjust_item_weight(cct, parent_bucket, item, 0);
1012 adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
1013 for (auto& p : choose_args) {
1014 // weight down each weight-set to 0 before we remove the item
1015 vector<int> weightv(get_choose_args_positions(p.second), 0);
1016 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
1017 }
1018
1019 // remove the bucket from the parent
1020 bucket_remove_item(parent_bucket, item);
1021 } else if (PTR_ERR(parent_bucket) != -ENOENT) {
1022 return PTR_ERR(parent_bucket);
1023 }
1024
1025 // check that we're happy
1026 int test_weight = 0;
1027 map<string,string> test_location;
1028 test_location[ bucket_location.first ] = (bucket_location.second);
1029
1030 bool successful_detach = !(check_item_loc(cct, item, test_location,
1031 &test_weight));
1032 assert(successful_detach);
1033 assert(test_weight == 0);
1034
1035 return bucket_weight;
1036 }
1037
1038 int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
1039 {
1040 if (src >= 0 || dst >= 0)
1041 return -EINVAL;
1042 if (!item_exists(src) || !item_exists(dst))
1043 return -EINVAL;
1044 crush_bucket *a = get_bucket(src);
1045 crush_bucket *b = get_bucket(dst);
1046 unsigned aw = a->weight;
1047 unsigned bw = b->weight;
1048
1049 // swap weights
1050 adjust_item_weight(cct, a->id, bw);
1051 adjust_item_weight(cct, b->id, aw);
1052
1053 // swap items
1054 map<int,unsigned> tmp;
1055 unsigned as = a->size;
1056 unsigned bs = b->size;
1057 for (unsigned i = 0; i < as; ++i) {
1058 int item = a->items[0];
1059 int itemw = crush_get_bucket_item_weight(a, 0);
1060 tmp[item] = itemw;
1061 bucket_remove_item(a, item);
1062 }
1063 assert(a->size == 0);
1064 assert(b->size == bs);
1065 for (unsigned i = 0; i < bs; ++i) {
1066 int item = b->items[0];
1067 int itemw = crush_get_bucket_item_weight(b, 0);
1068 bucket_remove_item(b, item);
1069 bucket_add_item(a, item, itemw);
1070 }
1071 assert(a->size == bs);
1072 assert(b->size == 0);
1073 for (auto t : tmp) {
1074 bucket_add_item(b, t.first, t.second);
1075 }
1076 assert(a->size == bs);
1077 assert(b->size == as);
1078
1079 // swap names
1080 swap_names(src, dst);
1081 return 0;
1082 }
1083
1084 int CrushWrapper::link_bucket(
1085 CephContext *cct, int id, const map<string,string>& loc)
1086 {
1087 // sorry this only works for buckets
1088 if (id >= 0)
1089 return -EINVAL;
1090
1091 if (!item_exists(id))
1092 return -ENOENT;
1093
1094 // get the name of the bucket we are trying to move for later
1095 string id_name = get_item_name(id);
1096
1097 crush_bucket *b = get_bucket(id);
1098 unsigned bucket_weight = b->weight;
1099
1100 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1101 }
1102
1103 int CrushWrapper::create_or_move_item(
1104 CephContext *cct, int item, float weight, string name,
1105 const map<string,string>& loc) // typename -> bucketname
1106 {
1107 int ret = 0;
1108 int old_iweight;
1109
1110 if (!is_valid_crush_name(name))
1111 return -EINVAL;
1112
1113 if (check_item_loc(cct, item, loc, &old_iweight)) {
1114 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
1115 << dendl;
1116 } else {
1117 if (_search_item_exists(item)) {
1118 weight = get_item_weightf(item);
1119 ldout(cct, 10) << "create_or_move_item " << item
1120 << " exists with weight " << weight << dendl;
1121 remove_item(cct, item, true);
1122 }
1123 ldout(cct, 5) << "create_or_move_item adding " << item
1124 << " weight " << weight
1125 << " at " << loc << dendl;
1126 ret = insert_item(cct, item, weight, name, loc);
1127 if (ret == 0)
1128 ret = 1; // changed
1129 }
1130 return ret;
1131 }
1132
1133 int CrushWrapper::update_item(
1134 CephContext *cct, int item, float weight, string name,
1135 const map<string,string>& loc) // typename -> bucketname
1136 {
1137 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1138 << " name " << name << " loc " << loc << dendl;
1139 int ret = 0;
1140
1141 if (!is_valid_crush_name(name))
1142 return -EINVAL;
1143
1144 if (!is_valid_crush_loc(cct, loc))
1145 return -EINVAL;
1146
1147 ret = validate_weightf(weight);
1148 if (ret < 0) {
1149 return ret;
1150 }
1151
1152 // compare quantized (fixed-point integer) weights!
1153 int iweight = (int)(weight * (float)0x10000);
1154 int old_iweight;
1155 if (check_item_loc(cct, item, loc, &old_iweight)) {
1156 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1157 if (old_iweight != iweight) {
1158 ldout(cct, 5) << "update_item " << item << " adjusting weight "
1159 << ((float)old_iweight/(float)0x10000) << " -> " << weight
1160 << dendl;
1161 adjust_item_weight_in_loc(cct, item, iweight, loc);
1162 ret = 1;
1163 }
1164 if (get_item_name(item) != name) {
1165 ldout(cct, 5) << "update_item setting " << item << " name to " << name
1166 << dendl;
1167 set_item_name(item, name);
1168 ret = 1;
1169 }
1170 } else {
1171 if (item_exists(item)) {
1172 remove_item(cct, item, true);
1173 }
1174 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1175 << " at " << loc << dendl;
1176 ret = insert_item(cct, item, weight, name, loc);
1177 if (ret == 0)
1178 ret = 1; // changed
1179 }
1180 return ret;
1181 }
1182
1183 int CrushWrapper::get_item_weight(int id) const
1184 {
1185 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1186 crush_bucket *b = crush->buckets[bidx];
1187 if (b == NULL)
1188 continue;
1189 if (b->id == id)
1190 return b->weight;
1191 for (unsigned i = 0; i < b->size; i++)
1192 if (b->items[i] == id)
1193 return crush_get_bucket_item_weight(b, i);
1194 }
1195 return -ENOENT;
1196 }
1197
1198 int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1199 {
1200 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1201
1202 int bid = get_item_id(l->second);
1203 if (!bucket_exists(bid))
1204 continue;
1205 crush_bucket *b = get_bucket(bid);
1206 for (unsigned int i = 0; i < b->size; i++) {
1207 if (b->items[i] == id) {
1208 return crush_get_bucket_item_weight(b, i);
1209 }
1210 }
1211 }
1212 return -ENOENT;
1213 }
1214
1215 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
1216 {
1217 ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
1218 int changed = 0;
1219 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1220 crush_bucket *b = crush->buckets[bidx];
1221 if (b == 0)
1222 continue;
1223 for (unsigned i = 0; i < b->size; i++) {
1224 if (b->items[i] == id) {
1225 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1226 ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff
1227 << " in bucket " << bidx << dendl;
1228 adjust_item_weight(cct, -1 - bidx, b->weight);
1229 changed++;
1230 }
1231 }
1232 }
1233 if (!changed)
1234 return -ENOENT;
1235 return changed;
1236 }
1237
1238 int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
1239 {
1240 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
1241 << " in " << loc << dendl;
1242 int changed = 0;
1243
1244 for (auto l = loc.begin(); l != loc.end(); ++l) {
1245 int bid = get_item_id(l->second);
1246 if (!bucket_exists(bid))
1247 continue;
1248 crush_bucket *b = get_bucket(bid);
1249 for (unsigned int i = 0; i < b->size; i++) {
1250 if (b->items[i] == id) {
1251 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1252 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff
1253 << " in bucket " << bid << dendl;
1254 adjust_item_weight(cct, bid, b->weight);
1255 changed++;
1256 }
1257 }
1258 }
1259 if (!changed)
1260 return -ENOENT;
1261 return changed;
1262 }
1263
1264 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
1265 {
1266 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1267 crush_bucket *b = get_bucket(id);
1268 if (IS_ERR(b))
1269 return PTR_ERR(b);
1270 int changed = 0;
1271 list<crush_bucket*> q;
1272 q.push_back(b);
1273 while (!q.empty()) {
1274 b = q.front();
1275 q.pop_front();
1276 int local_changed = 0;
1277 for (unsigned i=0; i<b->size; ++i) {
1278 int n = b->items[i];
1279 if (n >= 0) {
1280 bucket_adjust_item_weight(cct, b, n, weight);
1281 ++changed;
1282 ++local_changed;
1283 } else {
1284 crush_bucket *sub = get_bucket(n);
1285 if (IS_ERR(sub))
1286 continue;
1287 q.push_back(sub);
1288 }
1289 }
1290 if (local_changed) {
1291 adjust_item_weight(cct, b->id, b->weight);
1292 }
1293 }
1294 return changed;
1295 }
1296
1297 bool CrushWrapper::check_item_present(int id) const
1298 {
1299 bool found = false;
1300
1301 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1302 crush_bucket *b = crush->buckets[bidx];
1303 if (b == 0)
1304 continue;
1305 for (unsigned i = 0; i < b->size; i++)
1306 if (b->items[i] == id)
1307 found = true;
1308 }
1309 return found;
1310 }
1311
1312
1313 pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret)
1314 {
1315
1316 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1317 crush_bucket *b = crush->buckets[bidx];
1318 if (b == 0)
1319 continue;
1320 if (is_shadow_item(b->id))
1321 continue;
1322 for (unsigned i = 0; i < b->size; i++)
1323 if (b->items[i] == id) {
1324 string parent_id = name_map[b->id];
1325 string parent_bucket_type = type_map[b->type];
1326 if (_ret)
1327 *_ret = 0;
1328 return make_pair(parent_bucket_type, parent_id);
1329 }
1330 }
1331
1332 if (_ret)
1333 *_ret = -ENOENT;
1334
1335 return pair<string, string>();
1336 }
1337
1338 int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1339 {
1340 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1341 crush_bucket *b = crush->buckets[bidx];
1342 if (b == 0)
1343 continue;
1344 if (is_shadow_item(b->id))
1345 continue;
1346 for (unsigned i = 0; i < b->size; i++) {
1347 if (b->items[i] == id) {
1348 *parent = b->id;
1349 return 0;
1350 }
1351 }
1352 }
1353 return -ENOENT;
1354 }
1355
1356 int CrushWrapper::get_parent_of_type(int item, int type) const
1357 {
1358 do {
1359 int r = get_immediate_parent_id(item, &item);
1360 if (r < 0) {
1361 return 0;
1362 }
1363 } while (get_bucket_type(item) != type);
1364 return item;
1365 }
1366
1367 int CrushWrapper::populate_classes(
1368 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
1369 {
1370 // build set of previous used shadow ids
1371 set<int32_t> used_ids;
1372 for (auto& p : old_class_bucket) {
1373 for (auto& q : p.second) {
1374 used_ids.insert(q.second);
1375 }
1376 }
1377 set<int> roots;
1378 find_nonshadow_roots(roots);
1379 for (auto &r : roots) {
1380 if (r >= 0)
1381 continue;
1382 for (auto &c : class_name) {
1383 int clone;
1384 int res = device_class_clone(r, c.first, old_class_bucket, used_ids,
1385 &clone);
1386 if (res < 0)
1387 return res;
1388 }
1389 }
1390 return 0;
1391 }
1392
1393 int CrushWrapper::trim_roots_with_class(bool unused)
1394 {
1395 set<int> roots;
1396 find_shadow_roots(roots);
1397 for (auto &r : roots) {
1398 if (r >= 0)
1399 continue;
1400 int res = remove_root(r, unused);
1401 if (res)
1402 return res;
1403 }
1404 // there is no need to reweight because we only remove from the
1405 // root and down
1406 return 0;
1407 }
1408
1409 int32_t CrushWrapper::_alloc_class_id() const {
1410 if (class_name.empty()) {
1411 return 0;
1412 }
1413 int32_t class_id = class_name.rbegin()->first + 1;
1414 if (class_id >= 0) {
1415 return class_id;
1416 }
1417 // wrapped, pick a random start and do exhaustive search
1418 uint32_t upperlimit = numeric_limits<int32_t>::max();
1419 upperlimit++;
1420 class_id = rand() % upperlimit;
1421 const auto start = class_id;
1422 do {
1423 if (!class_name.count(class_id)) {
1424 return class_id;
1425 } else {
1426 class_id++;
1427 if (class_id < 0) {
1428 class_id = 0;
1429 }
1430 }
1431 } while (class_id != start);
1432 assert(0 == "no available class id");
1433 }
1434
1435 void CrushWrapper::reweight(CephContext *cct)
1436 {
1437 set<int> roots;
1438 find_roots(roots);
1439 for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
1440 if (*p >= 0)
1441 continue;
1442 crush_bucket *b = get_bucket(*p);
1443 ldout(cct, 5) << "reweight bucket " << *p << dendl;
1444 int r = crush_reweight_bucket(crush, b);
1445 assert(r == 0);
1446 }
1447 }
1448
1449 int CrushWrapper::add_simple_rule_at(
1450 string name, string root_name,
1451 string failure_domain_name,
1452 string device_class,
1453 string mode, int rule_type,
1454 int rno,
1455 ostream *err)
1456 {
1457 if (rule_exists(name)) {
1458 if (err)
1459 *err << "rule " << name << " exists";
1460 return -EEXIST;
1461 }
1462 if (rno >= 0) {
1463 if (rule_exists(rno)) {
1464 if (err)
1465 *err << "rule with ruleno " << rno << " exists";
1466 return -EEXIST;
1467 }
1468 if (ruleset_exists(rno)) {
1469 if (err)
1470 *err << "ruleset " << rno << " exists";
1471 return -EEXIST;
1472 }
1473 } else {
1474 for (rno = 0; rno < get_max_rules(); rno++) {
1475 if (!rule_exists(rno) && !ruleset_exists(rno))
1476 break;
1477 }
1478 }
1479 if (!name_exists(root_name)) {
1480 if (err)
1481 *err << "root item " << root_name << " does not exist";
1482 return -ENOENT;
1483 }
1484 int root = get_item_id(root_name);
1485 int type = 0;
1486 if (failure_domain_name.length()) {
1487 type = get_type_id(failure_domain_name);
1488 if (type < 0) {
1489 if (err)
1490 *err << "unknown type " << failure_domain_name;
1491 return -EINVAL;
1492 }
1493 }
1494 if (device_class.size()) {
1495 if (!class_exists(device_class)) {
1496 if (err)
1497 *err << "device class " << device_class << " does not exist";
1498 return -EINVAL;
1499 }
1500 int c = get_class_id(device_class);
1501 if (class_bucket.count(root) == 0 ||
1502 class_bucket[root].count(c) == 0) {
1503 if (err)
1504 *err << "root " << root_name << " has no devices with class "
1505 << device_class;
1506 return -EINVAL;
1507 }
1508 root = class_bucket[root][c];
1509 }
1510 if (mode != "firstn" && mode != "indep") {
1511 if (err)
1512 *err << "unknown mode " << mode;
1513 return -EINVAL;
1514 }
1515
1516 int steps = 3;
1517 if (mode == "indep")
1518 steps = 5;
1519 int min_rep = mode == "firstn" ? 1 : 3;
1520 int max_rep = mode == "firstn" ? 10 : 20;
1521 //set the ruleset the same as rule_id(rno)
1522 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
1523 assert(rule);
1524 int step = 0;
1525 if (mode == "indep") {
1526 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1527 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1528 }
1529 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1530 if (type)
1531 crush_rule_set_step(rule, step++,
1532 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
1533 CRUSH_RULE_CHOOSELEAF_INDEP,
1534 CRUSH_CHOOSE_N,
1535 type);
1536 else
1537 crush_rule_set_step(rule, step++,
1538 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
1539 CRUSH_RULE_CHOOSE_INDEP,
1540 CRUSH_CHOOSE_N,
1541 0);
1542 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1543
1544 int ret = crush_add_rule(crush, rule, rno);
1545 if(ret < 0) {
1546 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
1547 return ret;
1548 }
1549 set_rule_name(rno, name);
1550 have_rmaps = false;
1551 return rno;
1552 }
1553
1554 int CrushWrapper::add_simple_rule(
1555 string name, string root_name,
1556 string failure_domain_name,
1557 string device_class,
1558 string mode, int rule_type,
1559 ostream *err)
1560 {
1561 return add_simple_rule_at(name, root_name, failure_domain_name, device_class,
1562 mode,
1563 rule_type, -1, err);
1564 }
1565
1566 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
1567 {
1568 if (ruleno >= crush->max_rules)
1569 return -ENOENT;
1570 if (crush->rules[ruleno] == NULL)
1571 return -ENOENT;
1572 crush_rule *rule = crush->rules[ruleno];
1573
1574 // build a weight map for each TAKE in the rule, and then merge them
1575
1576 // FIXME: if there are multiple takes that place a different number of
1577 // objects we do not take that into account. (Also, note that doing this
1578 // right is also a function of the pool, since the crush rule
1579 // might choose 2 + choose 2 but pool size may only be 3.)
1580 for (unsigned i=0; i<rule->len; ++i) {
1581 map<int,float> m;
1582 float sum = 0;
1583 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
1584 int n = rule->steps[i].arg1;
1585 if (n >= 0) {
1586 m[n] = 1.0;
1587 sum = 1.0;
1588 } else {
1589 list<int> q;
1590 q.push_back(n);
1591 //breadth first iterate the OSD tree
1592 while (!q.empty()) {
1593 int bno = q.front();
1594 q.pop_front();
1595 crush_bucket *b = crush->buckets[-1-bno];
1596 assert(b);
1597 for (unsigned j=0; j<b->size; ++j) {
1598 int item_id = b->items[j];
1599 if (item_id >= 0) { //it's an OSD
1600 float w = crush_get_bucket_item_weight(b, j);
1601 m[item_id] = w;
1602 sum += w;
1603 } else { //not an OSD, expand the child later
1604 q.push_back(item_id);
1605 }
1606 }
1607 }
1608 }
1609 }
1610 for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
1611 map<int,float>::iterator q = pmap->find(p->first);
1612 if (q == pmap->end()) {
1613 (*pmap)[p->first] = p->second / sum;
1614 } else {
1615 q->second += p->second / sum;
1616 }
1617 }
1618 }
1619
1620 return 0;
1621 }
1622
1623 int CrushWrapper::remove_rule(int ruleno)
1624 {
1625 if (ruleno >= (int)crush->max_rules)
1626 return -ENOENT;
1627 if (crush->rules[ruleno] == NULL)
1628 return -ENOENT;
1629 crush_destroy_rule(crush->rules[ruleno]);
1630 crush->rules[ruleno] = NULL;
1631 rule_name_map.erase(ruleno);
1632 have_rmaps = false;
1633 return 0;
1634 }
1635
1636 int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
1637 {
1638 if (cct->_conf->osd_crush_update_weight_set) {
1639 unsigned position;
1640 for (position = 0; position < bucket->size; position++)
1641 if (bucket->items[position] == item)
1642 break;
1643 assert(position != bucket->size);
1644 for (auto w : choose_args) {
1645 crush_choose_arg_map arg_map = w.second;
1646 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1647 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1648 crush_weight_set *weight_set = &arg->weight_set[j];
1649 weight_set->weights[position] = weight;
1650 }
1651 }
1652 }
1653 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
1654 }
1655
1656 int CrushWrapper::add_bucket(
1657 int bucketno, int alg, int hash, int type, int size,
1658 int *items, int *weights, int *idout)
1659 {
1660 if (alg == 0) {
1661 alg = get_default_bucket_alg();
1662 if (alg == 0)
1663 return -EINVAL;
1664 }
1665 crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
1666 weights);
1667 assert(b);
1668 int r = crush_add_bucket(crush, bucketno, b, idout);
1669 for (auto& p : choose_args) {
1670 crush_choose_arg_map& cmap = p.second;
1671 if (cmap.args) {
1672 if ((int)cmap.size <= *idout) {
1673 cmap.args = (crush_choose_arg*)realloc(
1674 cmap.args,
1675 sizeof(crush_choose_arg) * (*idout + 1));
1676 memset(&cmap.args[cmap.size], 0,
1677 sizeof(crush_choose_arg) * (*idout + 1 - cmap.size));
1678 cmap.size = *idout + 1;
1679 }
1680 } else {
1681 cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
1682 *idout + 1);
1683 cmap.size = *idout + 1;
1684 }
1685 if (size > 0) {
1686 int positions = get_choose_args_positions(cmap);
1687 crush_choose_arg& carg = cmap.args[*idout];
1688 carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
1689 size);
1690 carg.weight_set_size = positions;
1691 for (int ppos = 0; ppos < positions; ++ppos) {
1692 carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
1693 carg.weight_set[ppos].size = size;
1694 for (int bpos = 0; bpos < size; ++bpos) {
1695 carg.weight_set[ppos].weights[bpos] = weights[bpos];
1696 }
1697 }
1698 }
1699 }
1700 return r;
1701 }
1702
1703 int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
1704 {
1705 __u32 new_size = bucket->size + 1;
1706 for (auto w : choose_args) {
1707 crush_choose_arg_map arg_map = w.second;
1708 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1709 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1710 crush_weight_set *weight_set = &arg->weight_set[j];
1711 weight_set->weights = (__u32*)realloc(weight_set->weights,
1712 new_size * sizeof(__u32));
1713 assert(weight_set->size + 1 == new_size);
1714 weight_set->weights[weight_set->size] = weight;
1715 weight_set->size = new_size;
1716 }
1717 if (arg->ids_size) {
1718 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1719 assert(arg->ids_size + 1 == new_size);
1720 arg->ids[arg->ids_size] = item;
1721 arg->ids_size = new_size;
1722 }
1723 }
1724 return crush_bucket_add_item(crush, bucket, item, weight);
1725 }
1726
1727 int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
1728 {
1729 __u32 new_size = bucket->size - 1;
1730 unsigned position;
1731 for (position = 0; position < bucket->size; position++)
1732 if (bucket->items[position] == item)
1733 break;
1734 assert(position != bucket->size);
1735 for (auto w : choose_args) {
1736 crush_choose_arg_map arg_map = w.second;
1737 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1738 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1739 crush_weight_set *weight_set = &arg->weight_set[j];
1740 assert(weight_set->size - 1 == new_size);
1741 for (__u32 k = position; k < new_size; k++)
1742 weight_set->weights[k] = weight_set->weights[k+1];
1743 weight_set->weights = (__u32*)realloc(weight_set->weights,
1744 new_size * sizeof(__u32));
1745 weight_set->size = new_size;
1746 }
1747 if (arg->ids_size) {
1748 assert(arg->ids_size - 1 == new_size);
1749 for (__u32 k = position; k < new_size; k++)
1750 arg->ids[k] = arg->ids[k+1];
1751 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1752 arg->ids_size = new_size;
1753 }
1754 }
1755 return crush_bucket_remove_item(crush, bucket, item);
1756 }
1757
1758 int CrushWrapper::update_device_class(int id,
1759 const string& class_name,
1760 const string& name,
1761 ostream *ss)
1762 {
1763 assert(item_exists(id));
1764 auto old_class_name = get_item_class(id);
1765 if (old_class_name && old_class_name != class_name) {
1766 *ss << "osd." << id << " has already bound to class '" << old_class_name
1767 << "', can not reset class to '" << class_name << "'; "
1768 << "use 'ceph osd crush rm-device-class <osd>' to "
1769 << "remove old class first";
1770 return -EBUSY;
1771 }
1772
1773 int class_id = get_or_create_class_id(class_name);
1774 if (id < 0) {
1775 *ss << name << " id " << id << " is negative";
1776 return -EINVAL;
1777 }
1778
1779 if (class_map.count(id) != 0 && class_map[id] == class_id) {
1780 *ss << name << " already set to class " << class_name;
1781 return 0;
1782 }
1783
1784 set_item_class(id, class_id);
1785
1786 int r = rebuild_roots_with_classes();
1787 if (r < 0)
1788 return r;
1789 return 1;
1790 }
1791
1792 int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
1793 {
1794 assert(ss);
1795 const char *name = get_item_name(id);
1796 if (!name) {
1797 *ss << "osd." << id << " does not have a name";
1798 return -ENOENT;
1799 }
1800
1801 const char *class_name = get_item_class(id);
1802 if (!class_name) {
1803 *ss << "osd." << id << " has not been bound to a specific class yet";
1804 return 0;
1805 }
1806 class_remove_item(id);
1807
1808 int r = rebuild_roots_with_classes();
1809 if (r < 0) {
1810 *ss << "unable to rebuild roots with class '" << class_name << "' "
1811 << "of osd." << id << ": " << cpp_strerror(r);
1812 return r;
1813 }
1814 return 0;
1815 }
1816
1817 int CrushWrapper::device_class_clone(
1818 int original_id, int device_class,
1819 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
1820 const std::set<int32_t>& used_ids,
1821 int *clone)
1822 {
1823 const char *item_name = get_item_name(original_id);
1824 if (item_name == NULL)
1825 return -ECHILD;
1826 const char *class_name = get_class_name(device_class);
1827 if (class_name == NULL)
1828 return -EBADF;
1829 string copy_name = item_name + string("~") + class_name;
1830 if (name_exists(copy_name)) {
1831 *clone = get_item_id(copy_name);
1832 return 0;
1833 }
1834 crush_bucket *original = get_bucket(original_id);
1835 assert(!IS_ERR(original));
1836 crush_bucket *copy = crush_make_bucket(crush,
1837 original->alg,
1838 original->hash,
1839 original->type,
1840 0, NULL, NULL);
1841 assert(copy);
1842 for (unsigned i = 0; i < original->size; i++) {
1843 int item = original->items[i];
1844 int weight = crush_get_bucket_item_weight(original, i);
1845 if (item >= 0) {
1846 if (class_map.count(item) != 0 && class_map[item] == device_class) {
1847 int res = bucket_add_item(copy, item, weight);
1848 if (res)
1849 return res;
1850 }
1851 } else {
1852 int child_copy_id;
1853 int res = device_class_clone(item, device_class, old_class_bucket,
1854 used_ids, &child_copy_id);
1855 if (res < 0)
1856 return res;
1857 crush_bucket *child_copy = get_bucket(child_copy_id);
1858 assert(!IS_ERR(child_copy));
1859 res = bucket_add_item(copy, child_copy_id, child_copy->weight);
1860 if (res)
1861 return res;
1862 }
1863 }
1864 int bno = 0;
1865 if (old_class_bucket.count(original_id) &&
1866 old_class_bucket.at(original_id).count(device_class)) {
1867 bno = old_class_bucket.at(original_id).at(device_class);
1868 } else {
1869 // pick a new shadow bucket id that is not used by the current map
1870 // *or* any previous shadow buckets.
1871 bno = -1;
1872 while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
1873 used_ids.count(bno)) {
1874 --bno;
1875 }
1876 }
1877 int res = crush_add_bucket(crush, bno, copy, clone);
1878 if (res)
1879 return res;
1880 assert(!bno || bno == *clone);
1881 res = set_item_class(*clone, device_class);
1882 if (res < 0)
1883 return res;
1884 // we do not use set_item_name because the name is intentionally invalid
1885 name_map[*clone] = copy_name;
1886 if (have_rmaps)
1887 name_rmap[copy_name] = *clone;
1888 class_bucket[original_id][device_class] = *clone;
1889 return 0;
1890 }
1891
1892 bool CrushWrapper::_class_is_dead(int class_id)
1893 {
1894 for (auto &p: class_map) {
1895 if (p.first >= 0 && p.second == class_id) {
1896 return false;
1897 }
1898 }
1899 for (unsigned i = 0; i < crush->max_rules; ++i) {
1900 crush_rule *r = crush->rules[i];
1901 if (!r)
1902 continue;
1903 for (unsigned j = 0; j < r->len; ++j) {
1904 if (r->steps[j].op == CRUSH_RULE_TAKE) {
1905 int root = r->steps[j].arg1;
1906 for (auto &p : class_bucket) {
1907 auto& q = p.second;
1908 if (q.count(class_id) && q[class_id] == root) {
1909 return false;
1910 }
1911 }
1912 }
1913 }
1914 }
1915 // no more referenced by any devices or crush rules
1916 return true;
1917 }
1918
1919 void CrushWrapper::cleanup_dead_classes()
1920 {
1921 for (auto &c: class_name) {
1922 if (_class_is_dead(c.first))
1923 remove_class_name(c.second);
1924 }
1925 }
1926
1927 int CrushWrapper::rebuild_roots_with_classes()
1928 {
1929 std::map<int32_t, map<int32_t, int32_t> > old_class_bucket = class_bucket;
1930 cleanup_dead_classes();
1931 int r = trim_roots_with_class(false);
1932 if (r < 0)
1933 return r;
1934 class_bucket.clear();
1935 return populate_classes(old_class_bucket);
1936 }
1937
1938 void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
1939 {
1940 assert(crush);
1941
1942 __u32 magic = CRUSH_MAGIC;
1943 ::encode(magic, bl);
1944
1945 ::encode(crush->max_buckets, bl);
1946 ::encode(crush->max_rules, bl);
1947 ::encode(crush->max_devices, bl);
1948
1949 bool encode_compat_choose_args = false;
1950 crush_choose_arg_map arg_map;
1951 memset(&arg_map, '\0', sizeof(arg_map));
1952 if (has_choose_args() &&
1953 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
1954 assert(!has_incompat_choose_args());
1955 encode_compat_choose_args = true;
1956 arg_map = choose_args.begin()->second;
1957 }
1958
1959 // buckets
1960 for (int i=0; i<crush->max_buckets; i++) {
1961 __u32 alg = 0;
1962 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
1963 ::encode(alg, bl);
1964 if (!alg)
1965 continue;
1966
1967 ::encode(crush->buckets[i]->id, bl);
1968 ::encode(crush->buckets[i]->type, bl);
1969 ::encode(crush->buckets[i]->alg, bl);
1970 ::encode(crush->buckets[i]->hash, bl);
1971 ::encode(crush->buckets[i]->weight, bl);
1972 ::encode(crush->buckets[i]->size, bl);
1973 for (unsigned j=0; j<crush->buckets[i]->size; j++)
1974 ::encode(crush->buckets[i]->items[j], bl);
1975
1976 switch (crush->buckets[i]->alg) {
1977 case CRUSH_BUCKET_UNIFORM:
1978 ::encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
1979 break;
1980
1981 case CRUSH_BUCKET_LIST:
1982 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1983 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
1984 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
1985 }
1986 break;
1987
1988 case CRUSH_BUCKET_TREE:
1989 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
1990 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
1991 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
1992 break;
1993
1994 case CRUSH_BUCKET_STRAW:
1995 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1996 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
1997 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
1998 }
1999 break;
2000
2001 case CRUSH_BUCKET_STRAW2:
2002 {
2003 __u32 *weights;
2004 if (encode_compat_choose_args &&
2005 arg_map.args[i].weight_set_size > 0) {
2006 weights = arg_map.args[i].weight_set[0].weights;
2007 } else {
2008 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
2009 }
2010 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2011 ::encode(weights[j], bl);
2012 }
2013 }
2014 break;
2015
2016 default:
2017 ceph_abort();
2018 break;
2019 }
2020 }
2021
2022 // rules
2023 for (unsigned i=0; i<crush->max_rules; i++) {
2024 __u32 yes = crush->rules[i] ? 1:0;
2025 ::encode(yes, bl);
2026 if (!yes)
2027 continue;
2028
2029 ::encode(crush->rules[i]->len, bl);
2030 ::encode(crush->rules[i]->mask, bl);
2031 for (unsigned j=0; j<crush->rules[i]->len; j++)
2032 ::encode(crush->rules[i]->steps[j], bl);
2033 }
2034
2035 // name info
2036 ::encode(type_map, bl);
2037 ::encode(name_map, bl);
2038 ::encode(rule_name_map, bl);
2039
2040 // tunables
2041 ::encode(crush->choose_local_tries, bl);
2042 ::encode(crush->choose_local_fallback_tries, bl);
2043 ::encode(crush->choose_total_tries, bl);
2044 ::encode(crush->chooseleaf_descend_once, bl);
2045 ::encode(crush->chooseleaf_vary_r, bl);
2046 ::encode(crush->straw_calc_version, bl);
2047 ::encode(crush->allowed_bucket_algs, bl);
2048 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
2049 ::encode(crush->chooseleaf_stable, bl);
2050 }
2051
2052 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2053 // device classes
2054 ::encode(class_map, bl);
2055 ::encode(class_name, bl);
2056 ::encode(class_bucket, bl);
2057
2058 // choose args
2059 __u32 size = (__u32)choose_args.size();
2060 ::encode(size, bl);
2061 for (auto c : choose_args) {
2062 ::encode(c.first, bl);
2063 crush_choose_arg_map arg_map = c.second;
2064 size = 0;
2065 for (__u32 i = 0; i < arg_map.size; i++) {
2066 crush_choose_arg *arg = &arg_map.args[i];
2067 if (arg->weight_set_size == 0 &&
2068 arg->ids_size == 0)
2069 continue;
2070 size++;
2071 }
2072 ::encode(size, bl);
2073 for (__u32 i = 0; i < arg_map.size; i++) {
2074 crush_choose_arg *arg = &arg_map.args[i];
2075 if (arg->weight_set_size == 0 &&
2076 arg->ids_size == 0)
2077 continue;
2078 ::encode(i, bl);
2079 ::encode(arg->weight_set_size, bl);
2080 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2081 crush_weight_set *weight_set = &arg->weight_set[j];
2082 ::encode(weight_set->size, bl);
2083 for (__u32 k = 0; k < weight_set->size; k++)
2084 ::encode(weight_set->weights[k], bl);
2085 }
2086 ::encode(arg->ids_size, bl);
2087 for (__u32 j = 0; j < arg->ids_size; j++)
2088 ::encode(arg->ids[j], bl);
2089 }
2090 }
2091 }
2092 }
2093
2094 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
2095 {
2096 m.clear();
2097 __u32 n;
2098 ::decode(n, blp);
2099 while (n--) {
2100 __s32 key;
2101 ::decode(key, blp);
2102
2103 __u32 strlen;
2104 ::decode(strlen, blp);
2105 if (strlen == 0) {
2106 // der, key was actually 64-bits!
2107 ::decode(strlen, blp);
2108 }
2109 ::decode_nohead(strlen, m[key], blp);
2110 }
2111 }
2112
2113 void CrushWrapper::decode(bufferlist::iterator& blp)
2114 {
2115 create();
2116
2117 __u32 magic;
2118 ::decode(magic, blp);
2119 if (magic != CRUSH_MAGIC)
2120 throw buffer::malformed_input("bad magic number");
2121
2122 ::decode(crush->max_buckets, blp);
2123 ::decode(crush->max_rules, blp);
2124 ::decode(crush->max_devices, blp);
2125
2126 // legacy tunables, unless we decode something newer
2127 set_tunables_legacy();
2128
2129 try {
2130 // buckets
2131 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
2132 for (int i=0; i<crush->max_buckets; i++) {
2133 decode_crush_bucket(&crush->buckets[i], blp);
2134 }
2135
2136 // rules
2137 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
2138 for (unsigned i = 0; i < crush->max_rules; ++i) {
2139 __u32 yes;
2140 ::decode(yes, blp);
2141 if (!yes) {
2142 crush->rules[i] = NULL;
2143 continue;
2144 }
2145
2146 __u32 len;
2147 ::decode(len, blp);
2148 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
2149 crush->rules[i]->len = len;
2150 ::decode(crush->rules[i]->mask, blp);
2151 for (unsigned j=0; j<crush->rules[i]->len; j++)
2152 ::decode(crush->rules[i]->steps[j], blp);
2153 }
2154
2155 // name info
2156 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2157 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2158 // both by assuming the string is always non-empty.
2159 decode_32_or_64_string_map(type_map, blp);
2160 decode_32_or_64_string_map(name_map, blp);
2161 decode_32_or_64_string_map(rule_name_map, blp);
2162
2163 // tunables
2164 if (!blp.end()) {
2165 ::decode(crush->choose_local_tries, blp);
2166 ::decode(crush->choose_local_fallback_tries, blp);
2167 ::decode(crush->choose_total_tries, blp);
2168 }
2169 if (!blp.end()) {
2170 ::decode(crush->chooseleaf_descend_once, blp);
2171 }
2172 if (!blp.end()) {
2173 ::decode(crush->chooseleaf_vary_r, blp);
2174 }
2175 if (!blp.end()) {
2176 ::decode(crush->straw_calc_version, blp);
2177 }
2178 if (!blp.end()) {
2179 ::decode(crush->allowed_bucket_algs, blp);
2180 }
2181 if (!blp.end()) {
2182 ::decode(crush->chooseleaf_stable, blp);
2183 }
2184 if (!blp.end()) {
2185 ::decode(class_map, blp);
2186 ::decode(class_name, blp);
2187 for (auto &c : class_name)
2188 class_rname[c.second] = c.first;
2189 ::decode(class_bucket, blp);
2190 }
2191 if (!blp.end()) {
2192 __u32 choose_args_size;
2193 ::decode(choose_args_size, blp);
2194 for (__u32 i = 0; i < choose_args_size; i++) {
2195 uint64_t choose_args_index;
2196 ::decode(choose_args_index, blp);
2197 crush_choose_arg_map arg_map;
2198 arg_map.size = crush->max_buckets;
2199 arg_map.args = (crush_choose_arg*)calloc(
2200 arg_map.size, sizeof(crush_choose_arg));
2201 __u32 size;
2202 ::decode(size, blp);
2203 for (__u32 j = 0; j < size; j++) {
2204 __u32 bucket_index;
2205 ::decode(bucket_index, blp);
2206 assert(bucket_index < arg_map.size);
2207 crush_choose_arg *arg = &arg_map.args[bucket_index];
2208 ::decode(arg->weight_set_size, blp);
2209 if (arg->weight_set_size) {
2210 arg->weight_set = (crush_weight_set*)calloc(
2211 arg->weight_set_size, sizeof(crush_weight_set));
2212 for (__u32 k = 0; k < arg->weight_set_size; k++) {
2213 crush_weight_set *weight_set = &arg->weight_set[k];
2214 ::decode(weight_set->size, blp);
2215 weight_set->weights = (__u32*)calloc(
2216 weight_set->size, sizeof(__u32));
2217 for (__u32 l = 0; l < weight_set->size; l++)
2218 ::decode(weight_set->weights[l], blp);
2219 }
2220 }
2221 ::decode(arg->ids_size, blp);
2222 if (arg->ids_size) {
2223 assert(arg->ids_size == crush->buckets[bucket_index]->size);
2224 arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
2225 for (__u32 k = 0; k < arg->ids_size; k++)
2226 ::decode(arg->ids[k], blp);
2227 }
2228 }
2229 choose_args[choose_args_index] = arg_map;
2230 }
2231 }
2232 finalize();
2233 }
2234 catch (...) {
2235 crush_destroy(crush);
2236 throw;
2237 }
2238 }
2239
2240 void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp)
2241 {
2242 __u32 alg;
2243 ::decode(alg, blp);
2244 if (!alg) {
2245 *bptr = NULL;
2246 return;
2247 }
2248
2249 int size = 0;
2250 switch (alg) {
2251 case CRUSH_BUCKET_UNIFORM:
2252 size = sizeof(crush_bucket_uniform);
2253 break;
2254 case CRUSH_BUCKET_LIST:
2255 size = sizeof(crush_bucket_list);
2256 break;
2257 case CRUSH_BUCKET_TREE:
2258 size = sizeof(crush_bucket_tree);
2259 break;
2260 case CRUSH_BUCKET_STRAW:
2261 size = sizeof(crush_bucket_straw);
2262 break;
2263 case CRUSH_BUCKET_STRAW2:
2264 size = sizeof(crush_bucket_straw2);
2265 break;
2266 default:
2267 {
2268 char str[128];
2269 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
2270 throw buffer::malformed_input(str);
2271 }
2272 }
2273 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
2274 *bptr = bucket;
2275
2276 ::decode(bucket->id, blp);
2277 ::decode(bucket->type, blp);
2278 ::decode(bucket->alg, blp);
2279 ::decode(bucket->hash, blp);
2280 ::decode(bucket->weight, blp);
2281 ::decode(bucket->size, blp);
2282
2283 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
2284 for (unsigned j = 0; j < bucket->size; ++j) {
2285 ::decode(bucket->items[j], blp);
2286 }
2287
2288 switch (bucket->alg) {
2289 case CRUSH_BUCKET_UNIFORM:
2290 ::decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
2291 break;
2292
2293 case CRUSH_BUCKET_LIST: {
2294 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
2295 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2296 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2297
2298 for (unsigned j = 0; j < bucket->size; ++j) {
2299 ::decode(cbl->item_weights[j], blp);
2300 ::decode(cbl->sum_weights[j], blp);
2301 }
2302 break;
2303 }
2304
2305 case CRUSH_BUCKET_TREE: {
2306 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
2307 ::decode(cbt->num_nodes, blp);
2308 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
2309 for (unsigned j=0; j<cbt->num_nodes; j++) {
2310 ::decode(cbt->node_weights[j], blp);
2311 }
2312 break;
2313 }
2314
2315 case CRUSH_BUCKET_STRAW: {
2316 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
2317 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2318 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2319 for (unsigned j = 0; j < bucket->size; ++j) {
2320 ::decode(cbs->item_weights[j], blp);
2321 ::decode(cbs->straws[j], blp);
2322 }
2323 break;
2324 }
2325
2326 case CRUSH_BUCKET_STRAW2: {
2327 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
2328 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2329 for (unsigned j = 0; j < bucket->size; ++j) {
2330 ::decode(cbs->item_weights[j], blp);
2331 }
2332 break;
2333 }
2334
2335 default:
2336 // We should have handled this case in the first switch statement
2337 ceph_abort();
2338 break;
2339 }
2340 }
2341
2342
2343 void CrushWrapper::dump(Formatter *f) const
2344 {
2345 f->open_array_section("devices");
2346 for (int i=0; i<get_max_devices(); i++) {
2347 f->open_object_section("device");
2348 f->dump_int("id", i);
2349 const char *n = get_item_name(i);
2350 if (n) {
2351 f->dump_string("name", n);
2352 } else {
2353 char name[20];
2354 sprintf(name, "device%d", i);
2355 f->dump_string("name", name);
2356 }
2357 const char *device_class = get_item_class(i);
2358 if (device_class != NULL)
2359 f->dump_string("class", device_class);
2360 f->close_section();
2361 }
2362 f->close_section();
2363
2364 f->open_array_section("types");
2365 int n = get_num_type_names();
2366 for (int i=0; n; i++) {
2367 const char *name = get_type_name(i);
2368 if (!name) {
2369 if (i == 0) {
2370 f->open_object_section("type");
2371 f->dump_int("type_id", 0);
2372 f->dump_string("name", "device");
2373 f->close_section();
2374 }
2375 continue;
2376 }
2377 n--;
2378 f->open_object_section("type");
2379 f->dump_int("type_id", i);
2380 f->dump_string("name", name);
2381 f->close_section();
2382 }
2383 f->close_section();
2384
2385 f->open_array_section("buckets");
2386 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
2387 if (!bucket_exists(bucket))
2388 continue;
2389 f->open_object_section("bucket");
2390 f->dump_int("id", bucket);
2391 if (get_item_name(bucket))
2392 f->dump_string("name", get_item_name(bucket));
2393 f->dump_int("type_id", get_bucket_type(bucket));
2394 if (get_type_name(get_bucket_type(bucket)))
2395 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
2396 f->dump_int("weight", get_bucket_weight(bucket));
2397 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
2398 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
2399 f->open_array_section("items");
2400 for (int j=0; j<get_bucket_size(bucket); j++) {
2401 f->open_object_section("item");
2402 f->dump_int("id", get_bucket_item(bucket, j));
2403 f->dump_int("weight", get_bucket_item_weight(bucket, j));
2404 f->dump_int("pos", j);
2405 f->close_section();
2406 }
2407 f->close_section();
2408 f->close_section();
2409 }
2410 f->close_section();
2411
2412 f->open_array_section("rules");
2413 dump_rules(f);
2414 f->close_section();
2415
2416 f->open_object_section("tunables");
2417 dump_tunables(f);
2418 f->close_section();
2419
2420 dump_choose_args(f);
2421 }
2422
2423 namespace {
2424 // depth first walker
2425 class TreeDumper {
2426 typedef CrushTreeDumper::Item Item;
2427 const CrushWrapper *crush;
2428 const CrushTreeDumper::name_map_t& weight_set_names;
2429 public:
2430 explicit TreeDumper(const CrushWrapper *crush,
2431 const CrushTreeDumper::name_map_t& wsnames)
2432 : crush(crush), weight_set_names(wsnames) {}
2433
2434 void dump(Formatter *f) {
2435 set<int> roots;
2436 crush->find_roots(roots);
2437 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
2438 dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
2439 }
2440 }
2441
2442 private:
2443 void dump_item(const Item& qi, Formatter* f) {
2444 if (qi.is_bucket()) {
2445 f->open_object_section("bucket");
2446 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2447 dump_bucket_children(qi, f);
2448 f->close_section();
2449 } else {
2450 f->open_object_section("device");
2451 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2452 f->close_section();
2453 }
2454 }
2455
2456 void dump_bucket_children(const Item& parent, Formatter* f) {
2457 f->open_array_section("items");
2458 const int max_pos = crush->get_bucket_size(parent.id);
2459 for (int pos = 0; pos < max_pos; pos++) {
2460 int id = crush->get_bucket_item(parent.id, pos);
2461 float weight = crush->get_bucket_item_weightf(parent.id, pos);
2462 dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
2463 }
2464 f->close_section();
2465 }
2466 };
2467 }
2468
2469 void CrushWrapper::dump_tree(
2470 Formatter *f,
2471 const CrushTreeDumper::name_map_t& weight_set_names) const
2472 {
2473 assert(f);
2474 TreeDumper(this, weight_set_names).dump(f);
2475 }
2476
2477 void CrushWrapper::dump_tunables(Formatter *f) const
2478 {
2479 f->dump_int("choose_local_tries", get_choose_local_tries());
2480 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2481 f->dump_int("choose_total_tries", get_choose_total_tries());
2482 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2483 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2484 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2485 f->dump_int("straw_calc_version", get_straw_calc_version());
2486 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2487
2488 // be helpful about it
2489 if (has_jewel_tunables())
2490 f->dump_string("profile", "jewel");
2491 else if (has_hammer_tunables())
2492 f->dump_string("profile", "hammer");
2493 else if (has_firefly_tunables())
2494 f->dump_string("profile", "firefly");
2495 else if (has_bobtail_tunables())
2496 f->dump_string("profile", "bobtail");
2497 else if (has_argonaut_tunables())
2498 f->dump_string("profile", "argonaut");
2499 else
2500 f->dump_string("profile", "unknown");
2501 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
2502 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
2503
2504 // be helpful about minimum version required
2505 f->dump_string("minimum_required_version", get_min_required_version());
2506
2507 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2508 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2509 f->dump_int("has_v2_rules", (int)has_v2_rules());
2510 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2511 f->dump_int("has_v3_rules", (int)has_v3_rules());
2512 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
2513 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2514 f->dump_int("has_v5_rules", (int)has_v5_rules());
2515 }
2516
2517 void CrushWrapper::dump_choose_args(Formatter *f) const
2518 {
2519 f->open_object_section("choose_args");
2520 for (auto c : choose_args) {
2521 crush_choose_arg_map arg_map = c.second;
2522 f->open_array_section(stringify(c.first).c_str());
2523 for (__u32 i = 0; i < arg_map.size; i++) {
2524 crush_choose_arg *arg = &arg_map.args[i];
2525 if (arg->weight_set_size == 0 &&
2526 arg->ids_size == 0)
2527 continue;
2528 f->open_object_section("choose_args");
2529 int bucket_index = i;
2530 f->dump_int("bucket_id", -1-bucket_index);
2531 if (arg->weight_set_size > 0) {
2532 f->open_array_section("weight_set");
2533 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2534 f->open_array_section("weights");
2535 __u32 *weights = arg->weight_set[j].weights;
2536 __u32 size = arg->weight_set[j].size;
2537 for (__u32 k = 0; k < size; k++) {
2538 f->dump_float("weight", (float)weights[k]/(float)0x10000);
2539 }
2540 f->close_section();
2541 }
2542 f->close_section();
2543 }
2544 if (arg->ids_size > 0) {
2545 f->open_array_section("ids");
2546 for (__u32 j = 0; j < arg->ids_size; j++)
2547 f->dump_int("id", arg->ids[j]);
2548 f->close_section();
2549 }
2550 f->close_section();
2551 }
2552 f->close_section();
2553 }
2554 f->close_section();
2555 }
2556
2557 void CrushWrapper::dump_rules(Formatter *f) const
2558 {
2559 for (int i=0; i<get_max_rules(); i++) {
2560 if (!rule_exists(i))
2561 continue;
2562 dump_rule(i, f);
2563 }
2564 }
2565
2566 void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
2567 {
2568 f->open_object_section("rule");
2569 f->dump_int("rule_id", ruleset);
2570 if (get_rule_name(ruleset))
2571 f->dump_string("rule_name", get_rule_name(ruleset));
2572 f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
2573 f->dump_int("type", get_rule_mask_type(ruleset));
2574 f->dump_int("min_size", get_rule_mask_min_size(ruleset));
2575 f->dump_int("max_size", get_rule_mask_max_size(ruleset));
2576 f->open_array_section("steps");
2577 for (int j=0; j<get_rule_len(ruleset); j++) {
2578 f->open_object_section("step");
2579 switch (get_rule_op(ruleset, j)) {
2580 case CRUSH_RULE_NOOP:
2581 f->dump_string("op", "noop");
2582 break;
2583 case CRUSH_RULE_TAKE:
2584 f->dump_string("op", "take");
2585 {
2586 int item = get_rule_arg1(ruleset, j);
2587 f->dump_int("item", item);
2588
2589 const char *name = get_item_name(item);
2590 f->dump_string("item_name", name ? name : "");
2591 }
2592 break;
2593 case CRUSH_RULE_EMIT:
2594 f->dump_string("op", "emit");
2595 break;
2596 case CRUSH_RULE_CHOOSE_FIRSTN:
2597 f->dump_string("op", "choose_firstn");
2598 f->dump_int("num", get_rule_arg1(ruleset, j));
2599 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2600 break;
2601 case CRUSH_RULE_CHOOSE_INDEP:
2602 f->dump_string("op", "choose_indep");
2603 f->dump_int("num", get_rule_arg1(ruleset, j));
2604 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2605 break;
2606 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
2607 f->dump_string("op", "chooseleaf_firstn");
2608 f->dump_int("num", get_rule_arg1(ruleset, j));
2609 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2610 break;
2611 case CRUSH_RULE_CHOOSELEAF_INDEP:
2612 f->dump_string("op", "chooseleaf_indep");
2613 f->dump_int("num", get_rule_arg1(ruleset, j));
2614 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2615 break;
2616 case CRUSH_RULE_SET_CHOOSE_TRIES:
2617 f->dump_string("op", "set_choose_tries");
2618 f->dump_int("num", get_rule_arg1(ruleset, j));
2619 break;
2620 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
2621 f->dump_string("op", "set_chooseleaf_tries");
2622 f->dump_int("num", get_rule_arg1(ruleset, j));
2623 break;
2624 default:
2625 f->dump_int("opcode", get_rule_op(ruleset, j));
2626 f->dump_int("arg1", get_rule_arg1(ruleset, j));
2627 f->dump_int("arg2", get_rule_arg2(ruleset, j));
2628 }
2629 f->close_section();
2630 }
2631 f->close_section();
2632 f->close_section();
2633 }
2634
2635 void CrushWrapper::list_rules(Formatter *f) const
2636 {
2637 for (int rule = 0; rule < get_max_rules(); rule++) {
2638 if (!rule_exists(rule))
2639 continue;
2640 f->dump_string("name", get_rule_name(rule));
2641 }
2642 }
2643
2644 void CrushWrapper::list_rules(ostream *ss) const
2645 {
2646 for (int rule = 0; rule < get_max_rules(); rule++) {
2647 if (!rule_exists(rule))
2648 continue;
2649 *ss << get_rule_name(rule) << "\n";
2650 }
2651 }
2652
2653 class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
2654 public:
2655 typedef CrushTreeDumper::Dumper<TextTable> Parent;
2656
2657 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2658 const CrushTreeDumper::name_map_t& wsnames)
2659 : Parent(crush, wsnames) {}
2660 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2661 const CrushTreeDumper::name_map_t& wsnames,
2662 bool show_shadow)
2663 : Parent(crush, wsnames, show_shadow) {}
2664
2665
2666 void dump(TextTable *tbl) {
2667 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
2668 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
2669 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
2670 for (auto& p : crush->choose_args) {
2671 if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
2672 tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
2673 } else {
2674 string name;
2675 auto q = weight_set_names.find(p.first);
2676 name = q != weight_set_names.end() ? q->second :
2677 stringify(p.first);
2678 tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
2679 }
2680 }
2681 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
2682 Parent::dump(tbl);
2683 }
2684
2685 protected:
2686 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
2687 const char *c = crush->get_item_class(qi.id);
2688 if (!c)
2689 c = "";
2690 *tbl << qi.id
2691 << c
2692 << weightf_t(qi.weight);
2693 for (auto& p : crush->choose_args) {
2694 if (qi.parent < 0) {
2695 const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
2696 int bidx = -1 - qi.parent;
2697 const crush_bucket *b = crush->get_bucket(qi.parent);
2698 if (b &&
2699 bidx < (int)cmap.size &&
2700 cmap.args[bidx].weight_set &&
2701 cmap.args[bidx].weight_set_size >= 1) {
2702 int pos;
2703 for (pos = 0;
2704 pos < (int)cmap.args[bidx].weight_set[0].size &&
2705 b->items[pos] != qi.id;
2706 ++pos) ;
2707 *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] /
2708 (float)0x10000);
2709 continue;
2710 }
2711 }
2712 *tbl << "";
2713 }
2714 ostringstream ss;
2715 for (int k=0; k < qi.depth; k++) {
2716 ss << " ";
2717 }
2718 if (qi.is_bucket()) {
2719 ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
2720 << crush->get_item_name(qi.id);
2721 } else {
2722 ss << "osd." << qi.id;
2723 }
2724 *tbl << ss.str();
2725 *tbl << TextTable::endrow;
2726 }
2727 };
2728
2729
2730 class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
2731 public:
2732 typedef CrushTreeDumper::FormattingDumper Parent;
2733
2734 explicit CrushTreeFormattingDumper(
2735 const CrushWrapper *crush,
2736 const CrushTreeDumper::name_map_t& wsnames)
2737 : Parent(crush, wsnames) {}
2738
2739 explicit CrushTreeFormattingDumper(
2740 const CrushWrapper *crush,
2741 const CrushTreeDumper::name_map_t& wsnames,
2742 bool show_shadow)
2743 : Parent(crush, wsnames, show_shadow) {}
2744
2745 void dump(Formatter *f) {
2746 f->open_array_section("nodes");
2747 Parent::dump(f);
2748 f->close_section();
2749 f->open_array_section("stray");
2750 f->close_section();
2751 }
2752 };
2753
2754
2755 void CrushWrapper::dump_tree(
2756 ostream *out,
2757 Formatter *f,
2758 const CrushTreeDumper::name_map_t& weight_set_names,
2759 bool show_shadow) const
2760 {
2761 if (out) {
2762 TextTable tbl;
2763 CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
2764 *out << tbl;
2765 }
2766 if (f) {
2767 CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
2768 }
2769 }
2770
2771 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
2772 {
2773 o.push_back(new CrushWrapper);
2774 // fixme
2775 }
2776
2777 /**
2778 * Determine the default CRUSH ruleset ID to be used with
2779 * newly created replicated pools.
2780 *
2781 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
2782 */
2783 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
2784 {
2785 int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
2786 if (crush_ruleset < 0) {
2787 crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
2788 } else if (!ruleset_exists(crush_ruleset)) {
2789 crush_ruleset = -1; // match find_first_ruleset() retval
2790 }
2791 return crush_ruleset;
2792 }
2793
2794 bool CrushWrapper::is_valid_crush_name(const string& s)
2795 {
2796 if (s.empty())
2797 return false;
2798 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
2799 if (!(*p == '-') &&
2800 !(*p == '_') &&
2801 !(*p == '.') &&
2802 !(*p >= '0' && *p <= '9') &&
2803 !(*p >= 'A' && *p <= 'Z') &&
2804 !(*p >= 'a' && *p <= 'z'))
2805 return false;
2806 }
2807 return true;
2808 }
2809
2810 bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
2811 const map<string,string>& loc)
2812 {
2813 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
2814 if (!is_valid_crush_name(l->first) ||
2815 !is_valid_crush_name(l->second)) {
2816 ldout(cct, 1) << "loc["
2817 << l->first << "] = '"
2818 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
2819 << dendl;
2820 return false;
2821 }
2822 }
2823 return true;
2824 }
2825
2826 int CrushWrapper::_choose_type_stack(
2827 CephContext *cct,
2828 const vector<pair<int,int>>& stack,
2829 const set<int>& overfull,
2830 const vector<int>& underfull,
2831 const vector<int>& orig,
2832 vector<int>::const_iterator& i,
2833 set<int>& used,
2834 vector<int> *pw) const
2835 {
2836 vector<int> w = *pw;
2837 vector<int> o;
2838
2839 ldout(cct, 10) << __func__ << " stack " << stack
2840 << " orig " << orig
2841 << " at " << *i
2842 << " pw " << *pw
2843 << dendl;
2844
2845 vector<int> cumulative_fanout(stack.size());
2846 int f = 1;
2847 for (int j = (int)stack.size() - 1; j >= 0; --j) {
2848 cumulative_fanout[j] = f;
2849 f *= stack[j].second;
2850 }
2851 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
2852 << dendl;
2853
2854 // identify underful targets for each intermediate level.
2855 // this serves two purposes:
2856 // 1. we can tell when we are selecting a bucket that does not have any underfull
2857 // devices beneath it. that means that if the current input includes an overfull
2858 // device, we won't be able to find an underfull device with this parent to
2859 // swap for it.
2860 // 2. when we decide we should reject a bucket due to the above, this list gives us
2861 // a list of peers to consider that *do* have underfull devices available.. (we
2862 // are careful to pick one that has the same parent.)
2863 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
2864 underfull_buckets.resize(stack.size() - 1);
2865 for (auto osd : underfull) {
2866 int item = osd;
2867 for (int j = (int)stack.size() - 2; j >= 0; --j) {
2868 int type = stack[j].first;
2869 item = get_parent_of_type(item, type);
2870 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
2871 << " is " << item << dendl;
2872 underfull_buckets[j].insert(item);
2873 }
2874 }
2875 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
2876
2877 for (unsigned j = 0; j < stack.size(); ++j) {
2878 int type = stack[j].first;
2879 int fanout = stack[j].second;
2880 int cum_fanout = cumulative_fanout[j];
2881 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
2882 << " cumulative " << cum_fanout
2883 << " w " << w << dendl;
2884 vector<int> o;
2885 auto tmpi = i;
2886 for (auto from : w) {
2887 ldout(cct, 10) << " from " << from << dendl;
2888 // identify leaves under each choice. we use this to check whether any of these
2889 // leaves are overfull. (if so, we need to make sure there are underfull candidates
2890 // to swap for them.)
2891 vector<set<int>> leaves;
2892 leaves.resize(fanout);
2893 for (int pos = 0; pos < fanout; ++pos) {
2894 if (type > 0) {
2895 // non-leaf
2896 int item = get_parent_of_type(*tmpi, type);
2897 o.push_back(item);
2898 int n = cum_fanout;
2899 while (n-- && tmpi != orig.end()) {
2900 leaves[pos].insert(*tmpi++);
2901 }
2902 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
2903 << " of type " << type << " over leaves " << leaves[pos] << dendl;
2904 } else {
2905 // leaf
2906 bool replaced = false;
2907 if (overfull.count(*i)) {
2908 for (auto item : underfull) {
2909 ldout(cct, 10) << __func__ << " pos " << pos
2910 << " was " << *i << " considering " << item
2911 << dendl;
2912 if (used.count(item)) {
2913 ldout(cct, 20) << __func__ << " in used " << used << dendl;
2914 continue;
2915 }
2916 if (!subtree_contains(from, item)) {
2917 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
2918 continue;
2919 }
2920 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
2921 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
2922 continue;
2923 }
2924 o.push_back(item);
2925 used.insert(item);
2926 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
2927 << *i << " -> " << item << dendl;
2928 replaced = true;
2929 ++i;
2930 break;
2931 }
2932 }
2933 if (!replaced) {
2934 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
2935 << dendl;
2936 o.push_back(*i);
2937 ++i;
2938 }
2939 if (i == orig.end()) {
2940 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
2941 break;
2942 }
2943 }
2944 }
2945 if (j + 1 < stack.size()) {
2946 // check if any buckets have overfull leaves but no underfull candidates
2947 for (int pos = 0; pos < fanout; ++pos) {
2948 if (underfull_buckets[j].count(o[pos]) == 0) {
2949 // are any leaves overfull?
2950 bool any_overfull = false;
2951 for (auto osd : leaves[pos]) {
2952 if (overfull.count(osd)) {
2953 any_overfull = true;
2954 }
2955 }
2956 if (any_overfull) {
2957 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
2958 << ">0 leaves " << leaves[pos] << " is overfull; alts "
2959 << underfull_buckets[j]
2960 << dendl;
2961 for (auto alt : underfull_buckets[j]) {
2962 if (std::find(o.begin(), o.end(), alt) == o.end()) {
2963 // see if alt has the same parent
2964 if (j == 0 ||
2965 get_parent_of_type(o[pos], stack[j-1].first) ==
2966 get_parent_of_type(alt, stack[j-1].first)) {
2967 if (j)
2968 ldout(cct, 10) << " replacing " << o[pos]
2969 << " (which has no underfull leaves) with " << alt
2970 << " (same parent "
2971 << get_parent_of_type(alt, stack[j-1].first) << " type "
2972 << type << ")" << dendl;
2973 else
2974 ldout(cct, 10) << " replacing " << o[pos]
2975 << " (which has no underfull leaves) with " << alt
2976 << " (first level)" << dendl;
2977 o[pos] = alt;
2978 break;
2979 } else {
2980 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
2981 << " has different parent, skipping" << dendl;
2982 }
2983 }
2984 }
2985 }
2986 }
2987 }
2988 }
2989 if (i == orig.end()) {
2990 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
2991 break;
2992 }
2993 }
2994 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
2995 w.swap(o);
2996 }
2997 *pw = w;
2998 return 0;
2999 }
3000
3001 int CrushWrapper::try_remap_rule(
3002 CephContext *cct,
3003 int ruleno,
3004 int maxout,
3005 const set<int>& overfull,
3006 const vector<int>& underfull,
3007 const vector<int>& orig,
3008 vector<int> *out) const
3009 {
3010 const crush_map *map = crush;
3011 const crush_rule *rule = get_rule(ruleno);
3012 assert(rule);
3013
3014 ldout(cct, 10) << __func__ << " ruleno " << ruleno
3015 << " numrep " << maxout << " overfull " << overfull
3016 << " underfull " << underfull << " orig " << orig
3017 << dendl;
3018 vector<int> w; // working set
3019 out->clear();
3020
3021 auto i = orig.begin();
3022 set<int> used;
3023
3024 vector<pair<int,int>> type_stack; // (type, fan-out)
3025
3026 for (unsigned step = 0; step < rule->len; ++step) {
3027 const crush_rule_step *curstep = &rule->steps[step];
3028 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
3029 switch (curstep->op) {
3030 case CRUSH_RULE_TAKE:
3031 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
3032 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
3033 map->buckets[-1-curstep->arg1])) {
3034 w.clear();
3035 w.push_back(curstep->arg1);
3036 ldout(cct, 10) << __func__ << " take " << w << dendl;
3037 } else {
3038 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
3039 }
3040 break;
3041
3042 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3043 case CRUSH_RULE_CHOOSELEAF_INDEP:
3044 {
3045 int numrep = curstep->arg1;
3046 int type = curstep->arg2;
3047 if (numrep <= 0)
3048 numrep += maxout;
3049 type_stack.push_back(make_pair(type, numrep));
3050 type_stack.push_back(make_pair(0, 1));
3051 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3052 i, used, &w);
3053 if (r < 0)
3054 return r;
3055 type_stack.clear();
3056 }
3057 break;
3058
3059 case CRUSH_RULE_CHOOSE_FIRSTN:
3060 case CRUSH_RULE_CHOOSE_INDEP:
3061 {
3062 int numrep = curstep->arg1;
3063 int type = curstep->arg2;
3064 if (numrep <= 0)
3065 numrep += maxout;
3066 type_stack.push_back(make_pair(type, numrep));
3067 }
3068 break;
3069
3070 case CRUSH_RULE_EMIT:
3071 ldout(cct, 10) << " emit " << w << dendl;
3072 if (!type_stack.empty()) {
3073 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3074 i, used, &w);
3075 if (r < 0)
3076 return r;
3077 type_stack.clear();
3078 }
3079 for (auto item : w) {
3080 out->push_back(item);
3081 }
3082 w.clear();
3083 break;
3084
3085 default:
3086 // ignore
3087 break;
3088 }
3089 }
3090
3091 return 0;
3092 }
3093
3094
3095 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3096 CephContext *cct,
3097 crush_choose_arg_map cmap,
3098 int bucketid,
3099 int id,
3100 const vector<int>& weight,
3101 ostream *ss)
3102 {
3103 int changed = 0;
3104 int bidx = -1 - bucketid;
3105 crush_bucket *b = crush->buckets[bidx];
3106 if (bidx >= (int)cmap.size) {
3107 if (ss)
3108 *ss << "no weight-set for bucket " << b->id;
3109 ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
3110 << dendl;
3111 return 0;
3112 }
3113 crush_choose_arg *carg = &cmap.args[bidx];
3114 if (carg->weight_set == NULL) {
3115 if (ss)
3116 *ss << "no weight-set for bucket " << b->id;
3117 ldout(cct, 10) << __func__ << " no weight_set for bucket " << b->id
3118 << dendl;
3119 return 0;
3120 }
3121 if (carg->weight_set_size != weight.size()) {
3122 if (ss)
3123 *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
3124 ldout(cct, 10) << __func__ << " weight_set_size != " << weight.size()
3125 << " for bucket " << b->id << dendl;
3126 return 0;
3127 }
3128 for (unsigned i = 0; i < b->size; i++) {
3129 if (b->items[i] == id) {
3130 for (unsigned j = 0; j < weight.size(); ++j) {
3131 carg->weight_set[j].weights[i] = weight[j];
3132 }
3133 ldout(cct, 5) << __func__ << " set " << id << " to " << weight
3134 << " in bucket " << b->id << dendl;
3135 changed++;
3136 }
3137 }
3138 if (changed) {
3139 vector<int> bucket_weight(weight.size(), 0);
3140 for (unsigned i = 0; i < b->size; i++) {
3141 for (unsigned j = 0; j < weight.size(); ++j) {
3142 bucket_weight[j] += carg->weight_set[j].weights[i];
3143 }
3144 }
3145 choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
3146 }
3147 return changed;
3148 }
3149
3150 int CrushWrapper::choose_args_adjust_item_weight(
3151 CephContext *cct,
3152 crush_choose_arg_map cmap,
3153 int id,
3154 const vector<int>& weight,
3155 ostream *ss)
3156 {
3157 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
3158 int changed = 0;
3159 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
3160 crush_bucket *b = crush->buckets[bidx];
3161 if (b == nullptr) {
3162 continue;
3163 }
3164 changed += _choose_args_adjust_item_weight_in_bucket(
3165 cct, cmap, b->id, id, weight, ss);
3166 }
3167 if (!changed) {
3168 if (ss)
3169 *ss << "item " << id << " not found in crush map";
3170 return -ENOENT;
3171 }
3172 return changed;
3173 }