]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/CrushWrapper.cc
822f1d9c183be517da956c739048fe4ac2139f3b
[ceph.git] / ceph / src / crush / CrushWrapper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
10
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
13
14 #define dout_subsys ceph_subsys_crush
15
16 bool CrushWrapper::has_legacy_rule_ids() const
17 {
18 for (unsigned i=0; i<crush->max_rules; i++) {
19 crush_rule *r = crush->rules[i];
20 if (r &&
21 r->mask.ruleset != i) {
22 return true;
23 }
24 }
25 return false;
26 }
27
28 std::map<int, int> CrushWrapper::renumber_rules()
29 {
30 std::map<int, int> result;
31 for (unsigned i=0; i<crush->max_rules; i++) {
32 crush_rule *r = crush->rules[i];
33 if (r && r->mask.ruleset != i) {
34 result[r->mask.ruleset] = i;
35 r->mask.ruleset = i;
36 }
37 }
38 return result;
39 }
40
41 bool CrushWrapper::has_non_straw2_buckets() const
42 {
43 for (int i=0; i<crush->max_buckets; ++i) {
44 crush_bucket *b = crush->buckets[i];
45 if (!b)
46 continue;
47 if (b->alg != CRUSH_BUCKET_STRAW2)
48 return true;
49 }
50 return false;
51 }
52
53 bool CrushWrapper::has_v2_rules() const
54 {
55 for (unsigned i=0; i<crush->max_rules; i++) {
56 if (is_v2_rule(i)) {
57 return true;
58 }
59 }
60 return false;
61 }
62
63 bool CrushWrapper::is_v2_rule(unsigned ruleid) const
64 {
65 // check rule for use of indep or new SET_* rule steps
66 if (ruleid >= crush->max_rules)
67 return false;
68 crush_rule *r = crush->rules[ruleid];
69 if (!r)
70 return false;
71 for (unsigned j=0; j<r->len; j++) {
72 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
73 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
74 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
75 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
76 return true;
77 }
78 }
79 return false;
80 }
81
82 bool CrushWrapper::has_v3_rules() const
83 {
84 for (unsigned i=0; i<crush->max_rules; i++) {
85 if (is_v3_rule(i)) {
86 return true;
87 }
88 }
89 return false;
90 }
91
92 bool CrushWrapper::is_v3_rule(unsigned ruleid) const
93 {
94 // check rule for use of SET_CHOOSELEAF_VARY_R step
95 if (ruleid >= crush->max_rules)
96 return false;
97 crush_rule *r = crush->rules[ruleid];
98 if (!r)
99 return false;
100 for (unsigned j=0; j<r->len; j++) {
101 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
102 return true;
103 }
104 }
105 return false;
106 }
107
108 bool CrushWrapper::has_v4_buckets() const
109 {
110 for (int i=0; i<crush->max_buckets; ++i) {
111 crush_bucket *b = crush->buckets[i];
112 if (!b)
113 continue;
114 if (b->alg == CRUSH_BUCKET_STRAW2)
115 return true;
116 }
117 return false;
118 }
119
120 bool CrushWrapper::has_v5_rules() const
121 {
122 for (unsigned i=0; i<crush->max_rules; i++) {
123 if (is_v5_rule(i)) {
124 return true;
125 }
126 }
127 return false;
128 }
129
130 bool CrushWrapper::is_v5_rule(unsigned ruleid) const
131 {
132 // check rule for use of SET_CHOOSELEAF_STABLE step
133 if (ruleid >= crush->max_rules)
134 return false;
135 crush_rule *r = crush->rules[ruleid];
136 if (!r)
137 return false;
138 for (unsigned j=0; j<r->len; j++) {
139 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
140 return true;
141 }
142 }
143 return false;
144 }
145
146 bool CrushWrapper::has_choose_args() const
147 {
148 return !choose_args.empty();
149 }
150
151 bool CrushWrapper::has_incompat_choose_args() const
152 {
153 if (choose_args.empty())
154 return false;
155 if (choose_args.size() > 1)
156 return true;
157 if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
158 return true;
159 crush_choose_arg_map arg_map = choose_args.begin()->second;
160 for (__u32 i = 0; i < arg_map.size; i++) {
161 crush_choose_arg *arg = &arg_map.args[i];
162 if (arg->weight_set_size == 0 &&
163 arg->ids_size == 0)
164 continue;
165 if (arg->weight_set_size != 1)
166 return true;
167 if (arg->ids_size != 0)
168 return true;
169 }
170 return false;
171 }
172
173 int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
174 {
175 if (!item_exists(i))
176 return -EINVAL;
177 string name = get_item_name(i);
178 size_t pos = name.find("~");
179 if (pos == string::npos) {
180 *idout = i;
181 *classout = -1;
182 return 0;
183 }
184 string name_no_class = name.substr(0, pos);
185 if (!name_exists(name_no_class))
186 return -ENOENT;
187 string class_name = name.substr(pos + 1);
188 if (!class_exists(class_name))
189 return -ENOENT;
190 *idout = get_item_id(name_no_class);
191 *classout = get_class_id(class_name);
192 return 0;
193 }
194
195 int CrushWrapper::can_rename_item(const string& srcname,
196 const string& dstname,
197 ostream *ss) const
198 {
199 if (name_exists(srcname)) {
200 if (name_exists(dstname)) {
201 *ss << "dstname = '" << dstname << "' already exists";
202 return -EEXIST;
203 }
204 if (is_valid_crush_name(dstname)) {
205 return 0;
206 } else {
207 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
208 return -EINVAL;
209 }
210 } else {
211 if (name_exists(dstname)) {
212 *ss << "srcname = '" << srcname << "' does not exist "
213 << "and dstname = '" << dstname << "' already exists";
214 return -EALREADY;
215 } else {
216 *ss << "srcname = '" << srcname << "' does not exist";
217 return -ENOENT;
218 }
219 }
220 }
221
222 int CrushWrapper::rename_item(const string& srcname,
223 const string& dstname,
224 ostream *ss)
225 {
226 int ret = can_rename_item(srcname, dstname, ss);
227 if (ret < 0)
228 return ret;
229 int oldid = get_item_id(srcname);
230 return set_item_name(oldid, dstname);
231 }
232
233 int CrushWrapper::can_rename_bucket(const string& srcname,
234 const string& dstname,
235 ostream *ss) const
236 {
237 int ret = can_rename_item(srcname, dstname, ss);
238 if (ret)
239 return ret;
240 int srcid = get_item_id(srcname);
241 if (srcid >= 0) {
242 *ss << "srcname = '" << srcname << "' is not a bucket "
243 << "because its id = " << srcid << " is >= 0";
244 return -ENOTDIR;
245 }
246 return 0;
247 }
248
249 int CrushWrapper::rename_bucket(const string& srcname,
250 const string& dstname,
251 ostream *ss)
252 {
253 int ret = can_rename_bucket(srcname, dstname, ss);
254 if (ret < 0)
255 return ret;
256 int oldid = get_item_id(srcname);
257 return set_item_name(oldid, dstname);
258 }
259
260 int CrushWrapper::rename_rule(const string& srcname,
261 const string& dstname,
262 ostream *ss)
263 {
264 if (!rule_exists(srcname)) {
265 if (ss) {
266 *ss << "source rule name '" << srcname << "' does not exist";
267 }
268 return -ENOENT;
269 }
270 if (rule_exists(dstname)) {
271 if (ss) {
272 *ss << "destination rule name '" << dstname << "' already exists";
273 }
274 return -EEXIST;
275 }
276 int rule_id = get_rule_id(srcname);
277 auto it = rule_name_map.find(rule_id);
278 assert(it != rule_name_map.end());
279 it->second = dstname;
280 if (have_rmaps) {
281 rule_name_rmap.erase(srcname);
282 rule_name_rmap[dstname] = rule_id;
283 }
284 return 0;
285 }
286
287 void CrushWrapper::find_takes(set<int> *roots) const
288 {
289 for (unsigned i=0; i<crush->max_rules; i++) {
290 crush_rule *r = crush->rules[i];
291 if (!r)
292 continue;
293 for (unsigned j=0; j<r->len; j++) {
294 if (r->steps[j].op == CRUSH_RULE_TAKE)
295 roots->insert(r->steps[j].arg1);
296 }
297 }
298 }
299
300 void CrushWrapper::find_roots(set<int> *roots) const
301 {
302 for (int i = 0; i < crush->max_buckets; i++) {
303 if (!crush->buckets[i])
304 continue;
305 crush_bucket *b = crush->buckets[i];
306 if (!_search_item_exists(b->id))
307 roots->insert(b->id);
308 }
309 }
310
311 bool CrushWrapper::subtree_contains(int root, int item) const
312 {
313 if (root == item)
314 return true;
315
316 if (root >= 0)
317 return false; // root is a leaf
318
319 const crush_bucket *b = get_bucket(root);
320 if (IS_ERR(b))
321 return false;
322
323 for (unsigned j=0; j<b->size; j++) {
324 if (subtree_contains(b->items[j], item))
325 return true;
326 }
327 return false;
328 }
329
330 bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
331 {
332 // last instance?
333 if (_search_item_exists(item)) {
334 return false;
335 }
336 if (item < 0 && _bucket_is_in_use(item)) {
337 return false;
338 }
339
340 if (item < 0 && !unlink_only) {
341 crush_bucket *t = get_bucket(item);
342 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
343 crush_remove_bucket(crush, t);
344 if (class_bucket.count(item) != 0)
345 class_bucket.erase(item);
346 class_remove_item(item);
347 }
348 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
349 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
350 name_map.erase(item);
351 have_rmaps = false;
352 if (item >= 0 && !unlink_only) {
353 class_remove_item(item);
354 }
355 }
356 rebuild_roots_with_classes();
357 return true;
358 }
359
360 int CrushWrapper::remove_root(int item)
361 {
362 crush_bucket *b = get_bucket(item);
363 if (IS_ERR(b)) {
364 // should be idempotent
365 // e.g.: we use 'crush link' to link same host into
366 // different roots, which as a result can cause different
367 // shadow trees reference same hosts too. This means
368 // we may need to destory the same buckets(hosts, racks, etc.)
369 // multiple times during rebuilding all shadow trees.
370 return 0;
371 }
372
373 for (unsigned n = 0; n < b->size; n++) {
374 if (b->items[n] >= 0)
375 continue;
376 int r = remove_root(b->items[n]);
377 if (r < 0)
378 return r;
379 }
380
381 crush_remove_bucket(crush, b);
382 if (name_map.count(item) != 0) {
383 name_map.erase(item);
384 have_rmaps = false;
385 }
386 if (class_bucket.count(item) != 0)
387 class_bucket.erase(item);
388 class_remove_item(item);
389 return 0;
390 }
391
392 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
393 {
394 ldout(cct, 5) << "remove_item " << item
395 << (unlink_only ? " unlink_only":"") << dendl;
396
397 int ret = -ENOENT;
398
399 if (item < 0 && !unlink_only) {
400 crush_bucket *t = get_bucket(item);
401 if (IS_ERR(t)) {
402 ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
403 << dendl;
404 return -ENOENT;
405 }
406
407 if (t->size) {
408 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
409 << " items, not empty" << dendl;
410 return -ENOTEMPTY;
411 }
412 if (_bucket_is_in_use(item)) {
413 return -EBUSY;
414 }
415 }
416
417 for (int i = 0; i < crush->max_buckets; i++) {
418 if (!crush->buckets[i])
419 continue;
420 crush_bucket *b = crush->buckets[i];
421
422 for (unsigned i=0; i<b->size; ++i) {
423 int id = b->items[i];
424 if (id == item) {
425 ldout(cct, 5) << "remove_item removing item " << item
426 << " from bucket " << b->id << dendl;
427 for (auto& p : choose_args) {
428 // weight down each weight-set to 0 before we remove the item
429 vector<int> weightv(get_choose_args_positions(p.second), 0);
430 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
431 }
432 bucket_remove_item(b, item);
433 adjust_item_weight(cct, b->id, b->weight);
434 ret = 0;
435 }
436 }
437 }
438
439 if (_maybe_remove_last_instance(cct, item, unlink_only))
440 ret = 0;
441
442 return ret;
443 }
444
445 bool CrushWrapper::_search_item_exists(int item) const
446 {
447 for (int i = 0; i < crush->max_buckets; i++) {
448 if (!crush->buckets[i])
449 continue;
450 crush_bucket *b = crush->buckets[i];
451 for (unsigned j=0; j<b->size; ++j) {
452 if (b->items[j] == item)
453 return true;
454 }
455 }
456 return false;
457 }
458
459 bool CrushWrapper::_bucket_is_in_use(int item)
460 {
461 for (auto &i : class_bucket)
462 for (auto &j : i.second)
463 if (j.second == item)
464 return true;
465 for (unsigned i = 0; i < crush->max_rules; ++i) {
466 crush_rule *r = crush->rules[i];
467 if (!r)
468 continue;
469 for (unsigned j = 0; j < r->len; ++j) {
470 if (r->steps[j].op == CRUSH_RULE_TAKE) {
471 int step_item = r->steps[j].arg1;
472 int original_item;
473 int c;
474 int res = split_id_class(step_item, &original_item, &c);
475 if (res < 0)
476 return false;
477 if (step_item == item || original_item == item)
478 return true;
479 }
480 }
481 }
482 return false;
483 }
484
485 int CrushWrapper::_remove_item_under(
486 CephContext *cct, int item, int ancestor, bool unlink_only)
487 {
488 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
489 << (unlink_only ? " unlink_only":"") << dendl;
490
491 if (ancestor >= 0) {
492 return -EINVAL;
493 }
494
495 if (!bucket_exists(ancestor))
496 return -EINVAL;
497
498 int ret = -ENOENT;
499
500 crush_bucket *b = get_bucket(ancestor);
501 for (unsigned i=0; i<b->size; ++i) {
502 int id = b->items[i];
503 if (id == item) {
504 ldout(cct, 5) << "_remove_item_under removing item " << item
505 << " from bucket " << b->id << dendl;
506 for (auto& p : choose_args) {
507 // weight down each weight-set to 0 before we remove the item
508 vector<int> weightv(get_choose_args_positions(p.second), 0);
509 _choose_args_adjust_item_weight_in_bucket(
510 cct, p.second, b->id, item, weightv, nullptr);
511 }
512 bucket_remove_item(b, item);
513 adjust_item_weight(cct, b->id, b->weight);
514 ret = 0;
515 } else if (id < 0) {
516 int r = remove_item_under(cct, item, id, unlink_only);
517 if (r == 0)
518 ret = 0;
519 }
520 }
521 return ret;
522 }
523
524 int CrushWrapper::remove_item_under(
525 CephContext *cct, int item, int ancestor, bool unlink_only)
526 {
527 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
528 << (unlink_only ? " unlink_only":"") << dendl;
529
530 if (!unlink_only && _bucket_is_in_use(item)) {
531 return -EBUSY;
532 }
533
534 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
535 if (ret < 0)
536 return ret;
537
538 if (item < 0 && !unlink_only) {
539 crush_bucket *t = get_bucket(item);
540 if (IS_ERR(t)) {
541 ldout(cct, 1) << "remove_item_under bucket " << item
542 << " does not exist" << dendl;
543 return -ENOENT;
544 }
545
546 if (t->size) {
547 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
548 << " items, not empty" << dendl;
549 return -ENOTEMPTY;
550 }
551 }
552
553 if (_maybe_remove_last_instance(cct, item, unlink_only))
554 ret = 0;
555
556 return ret;
557 }
558
559 int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
560 const std::multimap<string,string>& loc)
561 {
562 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
563 if (!item_exists(id))
564 return -ENOENT;
565 map<string,string> id_loc = get_full_location(id);
566 ldout(cct, 20) << " id is at " << id_loc << dendl;
567
568 for (map<int,string>::const_iterator p = type_map.begin();
569 p != type_map.end();
570 ++p) {
571 map<string,string>::iterator ip = id_loc.find(p->second);
572 if (ip == id_loc.end())
573 continue;
574 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
575 q != loc.end();
576 ++q) {
577 if (q->first != p->second)
578 break;
579 if (q->second == ip->second)
580 return p->first;
581 }
582 }
583 return -ERANGE;
584 }
585
586 int CrushWrapper::parse_loc_map(const std::vector<string>& args,
587 std::map<string,string> *ploc)
588 {
589 ploc->clear();
590 for (unsigned i = 0; i < args.size(); ++i) {
591 const char *s = args[i].c_str();
592 const char *pos = strchr(s, '=');
593 if (!pos)
594 return -EINVAL;
595 string key(s, 0, pos-s);
596 string value(pos+1);
597 if (value.length())
598 (*ploc)[key] = value;
599 else
600 return -EINVAL;
601 }
602 return 0;
603 }
604
605 int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
606 std::multimap<string,string> *ploc)
607 {
608 ploc->clear();
609 for (unsigned i = 0; i < args.size(); ++i) {
610 const char *s = args[i].c_str();
611 const char *pos = strchr(s, '=');
612 if (!pos)
613 return -EINVAL;
614 string key(s, 0, pos-s);
615 string value(pos+1);
616 if (value.length())
617 ploc->insert(make_pair(key, value));
618 else
619 return -EINVAL;
620 }
621 return 0;
622 }
623
624 bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
625 int *weight)
626 {
627 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
628
629 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
630 // ignore device
631 if (p->first == 0)
632 continue;
633
634 // ignore types that aren't specified in loc
635 map<string,string>::const_iterator q = loc.find(p->second);
636 if (q == loc.end()) {
637 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
638 << type_map << ")" << dendl;
639 continue;
640 }
641
642 if (!name_exists(q->second)) {
643 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
644 return false;
645 }
646
647 int id = get_item_id(q->second);
648 if (id >= 0) {
649 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
650 << " is a device, not bucket" << dendl;
651 return false;
652 }
653
654 assert(bucket_exists(id));
655 crush_bucket *b = get_bucket(id);
656
657 // see if item exists in this bucket
658 for (unsigned j=0; j<b->size; j++) {
659 if (b->items[j] == item) {
660 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
661 if (weight)
662 *weight = crush_get_bucket_item_weight(b, j);
663 return true;
664 }
665 }
666 return false;
667 }
668
669 ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl;
670 return false;
671 }
672
673 map<string, string> CrushWrapper::get_full_location(int id)
674 {
675 vector<pair<string, string> > full_location_ordered;
676 map<string,string> full_location;
677
678 get_full_location_ordered(id, full_location_ordered);
679
680 std::copy(full_location_ordered.begin(),
681 full_location_ordered.end(),
682 std::inserter(full_location, full_location.begin()));
683
684 return full_location;
685 }
686
687 int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path)
688 {
689 if (!item_exists(id))
690 return -ENOENT;
691 int cur = id;
692 int ret;
693 while (true) {
694 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
695 if (ret != 0)
696 break;
697 path.push_back(parent_coord);
698 cur = get_item_id(parent_coord.second);
699 }
700 return 0;
701 }
702
703 string CrushWrapper::get_full_location_ordered_string(int id)
704 {
705 vector<pair<string, string> > full_location_ordered;
706 string full_location;
707 get_full_location_ordered(id, full_location_ordered);
708 reverse(begin(full_location_ordered), end(full_location_ordered));
709 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
710 full_location = full_location + i->first + "=" + i->second;
711 if (i != full_location_ordered.end() - 1) {
712 full_location = full_location + ",";
713 }
714 }
715 return full_location;
716 }
717
718 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
719 {
720 map<int,string> parent_hierarchy;
721 pair<string, string> parent_coord = get_immediate_parent(id);
722 int parent_id;
723
724 // get the integer type for id and create a counter from there
725 int type_counter = get_bucket_type(id);
726
727 // if we get a negative type then we can assume that we have an OSD
728 // change behavior in get_item_type FIXME
729 if (type_counter < 0)
730 type_counter = 0;
731
732 // read the type map and get the name of the type with the largest ID
733 int high_type = 0;
734 for (map<int, string>::iterator it = type_map.begin(); it != type_map.end(); ++it){
735 if ( (*it).first > high_type )
736 high_type = (*it).first;
737 }
738
739 parent_id = get_item_id(parent_coord.second);
740
741 while (type_counter < high_type) {
742 type_counter++;
743 parent_hierarchy[ type_counter ] = parent_coord.first;
744
745 if (type_counter < high_type){
746 // get the coordinate information for the next parent
747 parent_coord = get_immediate_parent(parent_id);
748 parent_id = get_item_id(parent_coord.second);
749 }
750 }
751
752 return parent_hierarchy;
753 }
754
755 int CrushWrapper::get_children(int id, list<int> *children)
756 {
757 // leaf?
758 if (id >= 0) {
759 return 0;
760 }
761
762 crush_bucket *b = get_bucket(id);
763 if (IS_ERR(b)) {
764 return -ENOENT;
765 }
766
767 for (unsigned n=0; n<b->size; n++) {
768 children->push_back(b->items[n]);
769 }
770 return b->size;
771 }
772
773 int CrushWrapper::get_rule_failure_domain(int rule_id)
774 {
775 crush_rule *rule = get_rule(rule_id);
776 if (IS_ERR(rule)) {
777 return -ENOENT;
778 }
779 int type = 0; // default to osd-level
780 for (unsigned s = 0; s < rule->len; ++s) {
781 if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
782 rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
783 rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
784 rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
785 rule->steps[s].arg2 > type) {
786 type = rule->steps[s].arg2;
787 }
788 }
789 return type;
790 }
791
792 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
793 {
794 assert(leaves);
795
796 // Already leaf?
797 if (id >= 0) {
798 leaves->push_back(id);
799 return 0;
800 }
801
802 crush_bucket *b = get_bucket(id);
803 if (IS_ERR(b)) {
804 return -ENOENT;
805 }
806
807 for (unsigned n = 0; n < b->size; n++) {
808 if (b->items[n] >= 0) {
809 leaves->push_back(b->items[n]);
810 } else {
811 // is a bucket, do recursive call
812 int r = _get_leaves(b->items[n], leaves);
813 if (r < 0) {
814 return r;
815 }
816 }
817 }
818
819 return 0; // all is well
820 }
821
822 int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
823 {
824 assert(leaves);
825 leaves->clear();
826
827 if (!name_exists(name)) {
828 return -ENOENT;
829 }
830
831 int id = get_item_id(name);
832 if (id >= 0) {
833 // already leaf
834 leaves->insert(id);
835 return 0;
836 }
837
838 list<int> unordered;
839 int r = _get_leaves(id, &unordered);
840 if (r < 0) {
841 return r;
842 }
843
844 for (auto &p : unordered) {
845 leaves->insert(p);
846 }
847
848 return 0;
849 }
850
851 int CrushWrapper::insert_item(
852 CephContext *cct, int item, float weight, string name,
853 const map<string,string>& loc) // typename -> bucketname
854 {
855 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
856 << " name " << name << " loc " << loc << dendl;
857
858 if (!is_valid_crush_name(name))
859 return -EINVAL;
860
861 if (!is_valid_crush_loc(cct, loc))
862 return -EINVAL;
863
864 int r = validate_weightf(weight);
865 if (r < 0) {
866 return r;
867 }
868
869 if (name_exists(name)) {
870 if (get_item_id(name) != item) {
871 ldout(cct, 10) << "device name '" << name << "' already exists as id "
872 << get_item_id(name) << dendl;
873 return -EEXIST;
874 }
875 } else {
876 set_item_name(item, name);
877 }
878
879 int cur = item;
880
881 // create locations if locations don't exist and add child in
882 // location with 0 weight the more detail in the insert_item method
883 // declaration in CrushWrapper.h
884 for (auto p = type_map.begin(); p != type_map.end(); ++p) {
885 // ignore device type
886 if (p->first == 0)
887 continue;
888
889 // skip types that are unspecified
890 map<string,string>::const_iterator q = loc.find(p->second);
891 if (q == loc.end()) {
892 ldout(cct, 2) << "warning: did not specify location for '"
893 << p->second << "' level (levels are "
894 << type_map << ")" << dendl;
895 continue;
896 }
897
898 if (!name_exists(q->second)) {
899 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
900 int empty = 0, newid;
901 int r = add_bucket(0, 0,
902 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
903 if (r < 0) {
904 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
905 << dendl;
906 return r;
907 }
908 set_item_name(newid, q->second);
909
910 cur = newid;
911 continue;
912 }
913
914 // add to an existing bucket
915 int id = get_item_id(q->second);
916 if (!bucket_exists(id)) {
917 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
918 return -EINVAL;
919 }
920
921 // check that we aren't creating a cycle.
922 if (subtree_contains(id, cur)) {
923 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
924 << id << dendl;
925 return -EINVAL;
926 }
927
928 // we have done sanity check above
929 crush_bucket *b = get_bucket(id);
930
931 if (p->first != b->type) {
932 ldout(cct, 1) << "insert_item existing bucket has type "
933 << "'" << type_map[b->type] << "' != "
934 << "'" << type_map[p->first] << "'" << dendl;
935 return -EINVAL;
936 }
937
938 // are we forming a loop?
939 if (subtree_contains(cur, b->id)) {
940 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
941 << "; cannot form loop" << dendl;
942 return -ELOOP;
943 }
944
945 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
946 << " to bucket " << id << dendl;
947 int r = bucket_add_item(b, cur, 0);
948 assert (!r);
949 break;
950 }
951
952 // adjust the item's weight in location
953 if (adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
954 if (item >= crush->max_devices) {
955 crush->max_devices = item + 1;
956 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
957 << dendl;
958 }
959 r = rebuild_roots_with_classes();
960 if (r < 0) {
961 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
962 << cpp_strerror(r) << dendl;
963 return r;
964 }
965 return 0;
966 }
967
968 ldout(cct, 1) << "error: didn't find anywhere to add item " << item
969 << " in " << loc << dendl;
970 return -EINVAL;
971 }
972
973
974 int CrushWrapper::move_bucket(
975 CephContext *cct, int id, const map<string,string>& loc)
976 {
977 // sorry this only works for buckets
978 if (id >= 0)
979 return -EINVAL;
980
981 if (!item_exists(id))
982 return -ENOENT;
983
984 // get the name of the bucket we are trying to move for later
985 string id_name = get_item_name(id);
986
987 // detach the bucket
988 int bucket_weight = detach_bucket(cct, id);
989
990 // insert the bucket back into the hierarchy
991 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
992 }
993
994 int CrushWrapper::detach_bucket(CephContext *cct, int item)
995 {
996 if (!crush)
997 return (-EINVAL);
998
999 if (item >= 0)
1000 return (-EINVAL);
1001
1002 // check that the bucket that we want to detach exists
1003 assert(bucket_exists(item));
1004
1005 // get the bucket's weight
1006 crush_bucket *b = get_bucket(item);
1007 unsigned bucket_weight = b->weight;
1008
1009 // get where the bucket is located
1010 pair<string, string> bucket_location = get_immediate_parent(item);
1011
1012 // get the id of the parent bucket
1013 int parent_id = get_item_id(bucket_location.second);
1014
1015 // get the parent bucket
1016 crush_bucket *parent_bucket = get_bucket(parent_id);
1017
1018 if (!IS_ERR(parent_bucket)) {
1019 // zero out the bucket weight
1020 bucket_adjust_item_weight(cct, parent_bucket, item, 0);
1021 adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
1022 for (auto& p : choose_args) {
1023 // weight down each weight-set to 0 before we remove the item
1024 vector<int> weightv(get_choose_args_positions(p.second), 0);
1025 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
1026 }
1027
1028 // remove the bucket from the parent
1029 bucket_remove_item(parent_bucket, item);
1030 } else if (PTR_ERR(parent_bucket) != -ENOENT) {
1031 return PTR_ERR(parent_bucket);
1032 }
1033
1034 // check that we're happy
1035 int test_weight = 0;
1036 map<string,string> test_location;
1037 test_location[ bucket_location.first ] = (bucket_location.second);
1038
1039 bool successful_detach = !(check_item_loc(cct, item, test_location,
1040 &test_weight));
1041 assert(successful_detach);
1042 assert(test_weight == 0);
1043
1044 return bucket_weight;
1045 }
1046
1047 int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
1048 {
1049 if (src >= 0 || dst >= 0)
1050 return -EINVAL;
1051 if (!item_exists(src) || !item_exists(dst))
1052 return -EINVAL;
1053 crush_bucket *a = get_bucket(src);
1054 crush_bucket *b = get_bucket(dst);
1055 unsigned aw = a->weight;
1056 unsigned bw = b->weight;
1057
1058 // swap weights
1059 adjust_item_weight(cct, a->id, bw);
1060 adjust_item_weight(cct, b->id, aw);
1061
1062 // swap items
1063 map<int,unsigned> tmp;
1064 unsigned as = a->size;
1065 unsigned bs = b->size;
1066 for (unsigned i = 0; i < as; ++i) {
1067 int item = a->items[0];
1068 int itemw = crush_get_bucket_item_weight(a, 0);
1069 tmp[item] = itemw;
1070 bucket_remove_item(a, item);
1071 }
1072 assert(a->size == 0);
1073 assert(b->size == bs);
1074 for (unsigned i = 0; i < bs; ++i) {
1075 int item = b->items[0];
1076 int itemw = crush_get_bucket_item_weight(b, 0);
1077 bucket_remove_item(b, item);
1078 bucket_add_item(a, item, itemw);
1079 }
1080 assert(a->size == bs);
1081 assert(b->size == 0);
1082 for (auto t : tmp) {
1083 bucket_add_item(b, t.first, t.second);
1084 }
1085 assert(a->size == bs);
1086 assert(b->size == as);
1087
1088 // swap names
1089 swap_names(src, dst);
1090 return rebuild_roots_with_classes();
1091 }
1092
1093 int CrushWrapper::link_bucket(
1094 CephContext *cct, int id, const map<string,string>& loc)
1095 {
1096 // sorry this only works for buckets
1097 if (id >= 0)
1098 return -EINVAL;
1099
1100 if (!item_exists(id))
1101 return -ENOENT;
1102
1103 // get the name of the bucket we are trying to move for later
1104 string id_name = get_item_name(id);
1105
1106 crush_bucket *b = get_bucket(id);
1107 unsigned bucket_weight = b->weight;
1108
1109 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1110 }
1111
1112 int CrushWrapper::create_or_move_item(
1113 CephContext *cct, int item, float weight, string name,
1114 const map<string,string>& loc) // typename -> bucketname
1115 {
1116 int ret = 0;
1117 int old_iweight;
1118
1119 if (!is_valid_crush_name(name))
1120 return -EINVAL;
1121
1122 if (check_item_loc(cct, item, loc, &old_iweight)) {
1123 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
1124 << dendl;
1125 } else {
1126 if (_search_item_exists(item)) {
1127 weight = get_item_weightf(item);
1128 ldout(cct, 10) << "create_or_move_item " << item
1129 << " exists with weight " << weight << dendl;
1130 remove_item(cct, item, true);
1131 }
1132 ldout(cct, 5) << "create_or_move_item adding " << item
1133 << " weight " << weight
1134 << " at " << loc << dendl;
1135 ret = insert_item(cct, item, weight, name, loc);
1136 if (ret == 0)
1137 ret = 1; // changed
1138 }
1139 return ret;
1140 }
1141
1142 int CrushWrapper::update_item(
1143 CephContext *cct, int item, float weight, string name,
1144 const map<string,string>& loc) // typename -> bucketname
1145 {
1146 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1147 << " name " << name << " loc " << loc << dendl;
1148 int ret = 0;
1149
1150 if (!is_valid_crush_name(name))
1151 return -EINVAL;
1152
1153 if (!is_valid_crush_loc(cct, loc))
1154 return -EINVAL;
1155
1156 ret = validate_weightf(weight);
1157 if (ret < 0) {
1158 return ret;
1159 }
1160
1161 // compare quantized (fixed-point integer) weights!
1162 int iweight = (int)(weight * (float)0x10000);
1163 int old_iweight;
1164 if (check_item_loc(cct, item, loc, &old_iweight)) {
1165 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1166 if (old_iweight != iweight) {
1167 ldout(cct, 5) << "update_item " << item << " adjusting weight "
1168 << ((float)old_iweight/(float)0x10000) << " -> " << weight
1169 << dendl;
1170 adjust_item_weight_in_loc(cct, item, iweight, loc);
1171 ret = 1;
1172 }
1173 if (get_item_name(item) != name) {
1174 ldout(cct, 5) << "update_item setting " << item << " name to " << name
1175 << dendl;
1176 set_item_name(item, name);
1177 ret = 1;
1178 }
1179 } else {
1180 if (item_exists(item)) {
1181 remove_item(cct, item, true);
1182 }
1183 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1184 << " at " << loc << dendl;
1185 ret = insert_item(cct, item, weight, name, loc);
1186 if (ret == 0)
1187 ret = 1; // changed
1188 }
1189 return ret;
1190 }
1191
1192 int CrushWrapper::get_item_weight(int id) const
1193 {
1194 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1195 crush_bucket *b = crush->buckets[bidx];
1196 if (b == NULL)
1197 continue;
1198 if (b->id == id)
1199 return b->weight;
1200 for (unsigned i = 0; i < b->size; i++)
1201 if (b->items[i] == id)
1202 return crush_get_bucket_item_weight(b, i);
1203 }
1204 return -ENOENT;
1205 }
1206
1207 int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1208 {
1209 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1210
1211 int bid = get_item_id(l->second);
1212 if (!bucket_exists(bid))
1213 continue;
1214 crush_bucket *b = get_bucket(bid);
1215 for (unsigned int i = 0; i < b->size; i++) {
1216 if (b->items[i] == id) {
1217 return crush_get_bucket_item_weight(b, i);
1218 }
1219 }
1220 }
1221 return -ENOENT;
1222 }
1223
1224 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
1225 {
1226 ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
1227 int changed = 0;
1228 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1229 crush_bucket *b = crush->buckets[bidx];
1230 if (b == 0)
1231 continue;
1232 for (unsigned i = 0; i < b->size; i++) {
1233 if (b->items[i] == id) {
1234 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1235 ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff
1236 << " in bucket " << bidx << dendl;
1237 adjust_item_weight(cct, -1 - bidx, b->weight);
1238 changed++;
1239 }
1240 }
1241 }
1242 if (!changed)
1243 return -ENOENT;
1244 return changed;
1245 }
1246
1247 int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
1248 {
1249 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
1250 << " in " << loc << dendl;
1251 int changed = 0;
1252
1253 for (auto l = loc.begin(); l != loc.end(); ++l) {
1254 int bid = get_item_id(l->second);
1255 if (!bucket_exists(bid))
1256 continue;
1257 crush_bucket *b = get_bucket(bid);
1258 for (unsigned int i = 0; i < b->size; i++) {
1259 if (b->items[i] == id) {
1260 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1261 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff
1262 << " in bucket " << bid << dendl;
1263 adjust_item_weight(cct, bid, b->weight);
1264 changed++;
1265 }
1266 }
1267 }
1268 if (!changed)
1269 return -ENOENT;
1270 return changed;
1271 }
1272
1273 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
1274 {
1275 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1276 crush_bucket *b = get_bucket(id);
1277 if (IS_ERR(b))
1278 return PTR_ERR(b);
1279 int changed = 0;
1280 list<crush_bucket*> q;
1281 q.push_back(b);
1282 while (!q.empty()) {
1283 b = q.front();
1284 q.pop_front();
1285 int local_changed = 0;
1286 for (unsigned i=0; i<b->size; ++i) {
1287 int n = b->items[i];
1288 if (n >= 0) {
1289 bucket_adjust_item_weight(cct, b, n, weight);
1290 ++changed;
1291 ++local_changed;
1292 } else {
1293 crush_bucket *sub = get_bucket(n);
1294 if (IS_ERR(sub))
1295 continue;
1296 q.push_back(sub);
1297 }
1298 }
1299 if (local_changed) {
1300 adjust_item_weight(cct, b->id, b->weight);
1301 }
1302 }
1303 return changed;
1304 }
1305
1306 bool CrushWrapper::check_item_present(int id) const
1307 {
1308 bool found = false;
1309
1310 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1311 crush_bucket *b = crush->buckets[bidx];
1312 if (b == 0)
1313 continue;
1314 for (unsigned i = 0; i < b->size; i++)
1315 if (b->items[i] == id)
1316 found = true;
1317 }
1318 return found;
1319 }
1320
1321
1322 pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret)
1323 {
1324
1325 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1326 crush_bucket *b = crush->buckets[bidx];
1327 if (b == 0)
1328 continue;
1329 if (is_shadow_item(b->id))
1330 continue;
1331 for (unsigned i = 0; i < b->size; i++)
1332 if (b->items[i] == id) {
1333 string parent_id = name_map[b->id];
1334 string parent_bucket_type = type_map[b->type];
1335 if (_ret)
1336 *_ret = 0;
1337 return make_pair(parent_bucket_type, parent_id);
1338 }
1339 }
1340
1341 if (_ret)
1342 *_ret = -ENOENT;
1343
1344 return pair<string, string>();
1345 }
1346
1347 int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1348 {
1349 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1350 crush_bucket *b = crush->buckets[bidx];
1351 if (b == 0)
1352 continue;
1353 if (is_shadow_item(b->id))
1354 continue;
1355 for (unsigned i = 0; i < b->size; i++) {
1356 if (b->items[i] == id) {
1357 *parent = b->id;
1358 return 0;
1359 }
1360 }
1361 }
1362 return -ENOENT;
1363 }
1364
1365 int CrushWrapper::get_parent_of_type(int item, int type) const
1366 {
1367 do {
1368 int r = get_immediate_parent_id(item, &item);
1369 if (r < 0) {
1370 return 0;
1371 }
1372 } while (get_bucket_type(item) != type);
1373 return item;
1374 }
1375
1376 int CrushWrapper::rename_class(const string& srcname, const string& dstname)
1377 {
1378 auto i = class_rname.find(srcname);
1379 if (i == class_rname.end())
1380 return -ENOENT;
1381 auto j = class_rname.find(dstname);
1382 if (j != class_rname.end())
1383 return -EEXIST;
1384
1385 int class_id = i->second;
1386 assert(class_name.count(class_id));
1387 // rename any shadow buckets of old class name
1388 for (auto &it: class_map) {
1389 if (it.first < 0 && it.second == class_id) {
1390 string old_name = get_item_name(it.first);
1391 size_t pos = old_name.find("~");
1392 assert(pos != string::npos);
1393 string name_no_class = old_name.substr(0, pos);
1394 string old_class_name = old_name.substr(pos + 1);
1395 assert(old_class_name == srcname);
1396 string new_name = name_no_class + "~" + dstname;
1397 // we do not use set_item_name
1398 // because the name is intentionally invalid
1399 name_map[it.first] = new_name;
1400 have_rmaps = false;
1401 }
1402 }
1403
1404 // rename class
1405 class_rname.erase(srcname);
1406 class_name.erase(class_id);
1407 class_rname[dstname] = class_id;
1408 class_name[class_id] = dstname;
1409 return 0;
1410 }
1411
1412 int CrushWrapper::populate_classes(
1413 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
1414 {
1415 // build set of previous used shadow ids
1416 set<int32_t> used_ids;
1417 for (auto& p : old_class_bucket) {
1418 for (auto& q : p.second) {
1419 used_ids.insert(q.second);
1420 }
1421 }
1422 // accumulate weight values for each carg and bucket as we go. because it is
1423 // depth first, we will have the nested bucket weights we need when we
1424 // finish constructing the containing buckets.
1425 map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> weights
1426 set<int> roots;
1427 find_nonshadow_roots(&roots);
1428 for (auto &r : roots) {
1429 if (r >= 0)
1430 continue;
1431 for (auto &c : class_name) {
1432 int clone;
1433 int res = device_class_clone(r, c.first, old_class_bucket, used_ids,
1434 &clone, &cmap_item_weight);
1435 if (res < 0)
1436 return res;
1437 }
1438 }
1439 return 0;
1440 }
1441
1442 int CrushWrapper::trim_roots_with_class()
1443 {
1444 set<int> roots;
1445 find_shadow_roots(&roots);
1446 for (auto &r : roots) {
1447 if (r >= 0)
1448 continue;
1449 int res = remove_root(r);
1450 if (res)
1451 return res;
1452 }
1453 // there is no need to reweight because we only remove from the
1454 // root and down
1455 return 0;
1456 }
1457
1458 int32_t CrushWrapper::_alloc_class_id() const {
1459 if (class_name.empty()) {
1460 return 0;
1461 }
1462 int32_t class_id = class_name.rbegin()->first + 1;
1463 if (class_id >= 0) {
1464 return class_id;
1465 }
1466 // wrapped, pick a random start and do exhaustive search
1467 uint32_t upperlimit = numeric_limits<int32_t>::max();
1468 upperlimit++;
1469 class_id = rand() % upperlimit;
1470 const auto start = class_id;
1471 do {
1472 if (!class_name.count(class_id)) {
1473 return class_id;
1474 } else {
1475 class_id++;
1476 if (class_id < 0) {
1477 class_id = 0;
1478 }
1479 }
1480 } while (class_id != start);
1481 assert(0 == "no available class id");
1482 }
1483
1484 void CrushWrapper::reweight(CephContext *cct)
1485 {
1486 set<int> roots;
1487 find_roots(&roots);
1488 for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
1489 if (*p >= 0)
1490 continue;
1491 crush_bucket *b = get_bucket(*p);
1492 ldout(cct, 5) << "reweight bucket " << *p << dendl;
1493 int r = crush_reweight_bucket(crush, b);
1494 assert(r == 0);
1495 }
1496 }
1497
1498 int CrushWrapper::add_simple_rule_at(
1499 string name, string root_name,
1500 string failure_domain_name,
1501 string device_class,
1502 string mode, int rule_type,
1503 int rno,
1504 ostream *err)
1505 {
1506 if (rule_exists(name)) {
1507 if (err)
1508 *err << "rule " << name << " exists";
1509 return -EEXIST;
1510 }
1511 if (rno >= 0) {
1512 if (rule_exists(rno)) {
1513 if (err)
1514 *err << "rule with ruleno " << rno << " exists";
1515 return -EEXIST;
1516 }
1517 if (ruleset_exists(rno)) {
1518 if (err)
1519 *err << "ruleset " << rno << " exists";
1520 return -EEXIST;
1521 }
1522 } else {
1523 for (rno = 0; rno < get_max_rules(); rno++) {
1524 if (!rule_exists(rno) && !ruleset_exists(rno))
1525 break;
1526 }
1527 }
1528 if (!name_exists(root_name)) {
1529 if (err)
1530 *err << "root item " << root_name << " does not exist";
1531 return -ENOENT;
1532 }
1533 int root = get_item_id(root_name);
1534 int type = 0;
1535 if (failure_domain_name.length()) {
1536 type = get_type_id(failure_domain_name);
1537 if (type < 0) {
1538 if (err)
1539 *err << "unknown type " << failure_domain_name;
1540 return -EINVAL;
1541 }
1542 }
1543 if (device_class.size()) {
1544 if (!class_exists(device_class)) {
1545 if (err)
1546 *err << "device class " << device_class << " does not exist";
1547 return -EINVAL;
1548 }
1549 int c = get_class_id(device_class);
1550 if (class_bucket.count(root) == 0 ||
1551 class_bucket[root].count(c) == 0) {
1552 if (err)
1553 *err << "root " << root_name << " has no devices with class "
1554 << device_class;
1555 return -EINVAL;
1556 }
1557 root = class_bucket[root][c];
1558 }
1559 if (mode != "firstn" && mode != "indep") {
1560 if (err)
1561 *err << "unknown mode " << mode;
1562 return -EINVAL;
1563 }
1564
1565 int steps = 3;
1566 if (mode == "indep")
1567 steps = 5;
1568 int min_rep = mode == "firstn" ? 1 : 3;
1569 int max_rep = mode == "firstn" ? 10 : 20;
1570 //set the ruleset the same as rule_id(rno)
1571 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
1572 assert(rule);
1573 int step = 0;
1574 if (mode == "indep") {
1575 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1576 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1577 }
1578 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1579 if (type)
1580 crush_rule_set_step(rule, step++,
1581 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
1582 CRUSH_RULE_CHOOSELEAF_INDEP,
1583 CRUSH_CHOOSE_N,
1584 type);
1585 else
1586 crush_rule_set_step(rule, step++,
1587 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
1588 CRUSH_RULE_CHOOSE_INDEP,
1589 CRUSH_CHOOSE_N,
1590 0);
1591 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1592
1593 int ret = crush_add_rule(crush, rule, rno);
1594 if(ret < 0) {
1595 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
1596 return ret;
1597 }
1598 set_rule_name(rno, name);
1599 have_rmaps = false;
1600 return rno;
1601 }
1602
1603 int CrushWrapper::add_simple_rule(
1604 string name, string root_name,
1605 string failure_domain_name,
1606 string device_class,
1607 string mode, int rule_type,
1608 ostream *err)
1609 {
1610 return add_simple_rule_at(name, root_name, failure_domain_name, device_class,
1611 mode,
1612 rule_type, -1, err);
1613 }
1614
1615 float CrushWrapper::_get_take_weight_osd_map(int root,
1616 map<int,float> *pmap) const
1617 {
1618 float sum = 0.0;
1619 list<int> q;
1620 q.push_back(root);
1621 //breadth first iterate the OSD tree
1622 while (!q.empty()) {
1623 int bno = q.front();
1624 q.pop_front();
1625 crush_bucket *b = crush->buckets[-1-bno];
1626 assert(b);
1627 for (unsigned j=0; j<b->size; ++j) {
1628 int item_id = b->items[j];
1629 if (item_id >= 0) { //it's an OSD
1630 float w = crush_get_bucket_item_weight(b, j);
1631 (*pmap)[item_id] = w;
1632 sum += w;
1633 } else { //not an OSD, expand the child later
1634 q.push_back(item_id);
1635 }
1636 }
1637 }
1638 return sum;
1639 }
1640
1641 void CrushWrapper::_normalize_weight_map(float sum,
1642 const map<int,float>& m,
1643 map<int,float> *pmap) const
1644 {
1645 for (auto& p : m) {
1646 map<int,float>::iterator q = pmap->find(p.first);
1647 if (q == pmap->end()) {
1648 (*pmap)[p.first] = p.second / sum;
1649 } else {
1650 q->second += p.second / sum;
1651 }
1652 }
1653 }
1654
1655 int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
1656 {
1657 map<int,float> m;
1658 float sum = _get_take_weight_osd_map(root, &m);
1659 _normalize_weight_map(sum, m, pmap);
1660 return 0;
1661 }
1662
1663 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
1664 map<int,float> *pmap) const
1665 {
1666 if (ruleno >= crush->max_rules)
1667 return -ENOENT;
1668 if (crush->rules[ruleno] == NULL)
1669 return -ENOENT;
1670 crush_rule *rule = crush->rules[ruleno];
1671
1672 // build a weight map for each TAKE in the rule, and then merge them
1673
1674 // FIXME: if there are multiple takes that place a different number of
1675 // objects we do not take that into account. (Also, note that doing this
1676 // right is also a function of the pool, since the crush rule
1677 // might choose 2 + choose 2 but pool size may only be 3.)
1678 for (unsigned i=0; i<rule->len; ++i) {
1679 map<int,float> m;
1680 float sum = 0;
1681 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
1682 int n = rule->steps[i].arg1;
1683 if (n >= 0) {
1684 m[n] = 1.0;
1685 sum = 1.0;
1686 } else {
1687 sum += _get_take_weight_osd_map(n, &m);
1688 }
1689 }
1690 _normalize_weight_map(sum, m, pmap);
1691 }
1692
1693 return 0;
1694 }
1695
1696 int CrushWrapper::remove_rule(int ruleno)
1697 {
1698 if (ruleno >= (int)crush->max_rules)
1699 return -ENOENT;
1700 if (crush->rules[ruleno] == NULL)
1701 return -ENOENT;
1702 crush_destroy_rule(crush->rules[ruleno]);
1703 crush->rules[ruleno] = NULL;
1704 rule_name_map.erase(ruleno);
1705 have_rmaps = false;
1706 return rebuild_roots_with_classes();
1707 }
1708
1709 int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
1710 {
1711 if (cct->_conf->osd_crush_update_weight_set) {
1712 unsigned position;
1713 for (position = 0; position < bucket->size; position++)
1714 if (bucket->items[position] == item)
1715 break;
1716 assert(position != bucket->size);
1717 for (auto &w : choose_args) {
1718 crush_choose_arg_map &arg_map = w.second;
1719 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1720 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1721 crush_weight_set *weight_set = &arg->weight_set[j];
1722 weight_set->weights[position] = weight;
1723 }
1724 }
1725 }
1726 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
1727 }
1728
1729 int CrushWrapper::add_bucket(
1730 int bucketno, int alg, int hash, int type, int size,
1731 int *items, int *weights, int *idout)
1732 {
1733 if (alg == 0) {
1734 alg = get_default_bucket_alg();
1735 if (alg == 0)
1736 return -EINVAL;
1737 }
1738 crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
1739 weights);
1740 assert(b);
1741 assert(idout);
1742 int r = crush_add_bucket(crush, bucketno, b, idout);
1743 int pos = -1 - *idout;
1744 for (auto& p : choose_args) {
1745 crush_choose_arg_map& cmap = p.second;
1746 if (cmap.args) {
1747 if ((int)cmap.size <= pos) {
1748 cmap.args = (crush_choose_arg*)realloc(
1749 cmap.args,
1750 sizeof(crush_choose_arg) * (pos + 1));
1751 assert(cmap.args);
1752 memset(&cmap.args[cmap.size], 0,
1753 sizeof(crush_choose_arg) * (pos + 1 - cmap.size));
1754 cmap.size = pos + 1;
1755 }
1756 } else {
1757 cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
1758 pos + 1);
1759 assert(cmap.args);
1760 cmap.size = pos + 1;
1761 }
1762 if (size > 0) {
1763 int positions = get_choose_args_positions(cmap);
1764 crush_choose_arg& carg = cmap.args[pos];
1765 carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
1766 size);
1767 carg.weight_set_size = positions;
1768 for (int ppos = 0; ppos < positions; ++ppos) {
1769 carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
1770 carg.weight_set[ppos].size = size;
1771 for (int bpos = 0; bpos < size; ++bpos) {
1772 carg.weight_set[ppos].weights[bpos] = weights[bpos];
1773 }
1774 }
1775 }
1776 }
1777 return r;
1778 }
1779
1780 int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
1781 {
1782 __u32 new_size = bucket->size + 1;
1783 int r = crush_bucket_add_item(crush, bucket, item, weight);
1784 if (r < 0) {
1785 return r;
1786 }
1787 for (auto &w : choose_args) {
1788 crush_choose_arg_map &arg_map = w.second;
1789 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1790 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1791 crush_weight_set *weight_set = &arg->weight_set[j];
1792 weight_set->weights = (__u32*)realloc(weight_set->weights,
1793 new_size * sizeof(__u32));
1794 assert(weight_set->size + 1 == new_size);
1795 weight_set->weights[weight_set->size] = weight;
1796 weight_set->size = new_size;
1797 }
1798 if (arg->ids_size) {
1799 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1800 assert(arg->ids_size + 1 == new_size);
1801 arg->ids[arg->ids_size] = item;
1802 arg->ids_size = new_size;
1803 }
1804 }
1805 return 0;
1806 }
1807
1808 int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
1809 {
1810 __u32 new_size = bucket->size - 1;
1811 unsigned position;
1812 for (position = 0; position < bucket->size; position++)
1813 if (bucket->items[position] == item)
1814 break;
1815 assert(position != bucket->size);
1816 int r = crush_bucket_remove_item(crush, bucket, item);
1817 if (r < 0) {
1818 return r;
1819 }
1820 for (auto &w : choose_args) {
1821 crush_choose_arg_map &arg_map = w.second;
1822 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1823 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1824 crush_weight_set *weight_set = &arg->weight_set[j];
1825 assert(weight_set->size - 1 == new_size);
1826 for (__u32 k = position; k < new_size; k++)
1827 weight_set->weights[k] = weight_set->weights[k+1];
1828 if (new_size) {
1829 weight_set->weights = (__u32*)realloc(weight_set->weights,
1830 new_size * sizeof(__u32));
1831 } else {
1832 weight_set->weights = NULL;
1833 }
1834 weight_set->size = new_size;
1835 }
1836 if (arg->ids_size) {
1837 assert(arg->ids_size - 1 == new_size);
1838 for (__u32 k = position; k < new_size; k++)
1839 arg->ids[k] = arg->ids[k+1];
1840 if (new_size) {
1841 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1842 } else {
1843 arg->ids = NULL;
1844 }
1845 arg->ids_size = new_size;
1846 }
1847 }
1848 return 0;
1849 }
1850
1851 int CrushWrapper::bucket_set_alg(int bid, int alg)
1852 {
1853 crush_bucket *b = get_bucket(bid);
1854 if (!b) {
1855 return -ENOENT;
1856 }
1857 b->alg = alg;
1858 return 0;
1859 }
1860
1861 int CrushWrapper::update_device_class(int id,
1862 const string& class_name,
1863 const string& name,
1864 ostream *ss)
1865 {
1866 assert(item_exists(id));
1867 auto old_class_name = get_item_class(id);
1868 if (old_class_name && old_class_name != class_name) {
1869 *ss << "osd." << id << " has already bound to class '" << old_class_name
1870 << "', can not reset class to '" << class_name << "'; "
1871 << "use 'ceph osd crush rm-device-class <osd>' to "
1872 << "remove old class first";
1873 return -EBUSY;
1874 }
1875
1876 int class_id = get_or_create_class_id(class_name);
1877 if (id < 0) {
1878 *ss << name << " id " << id << " is negative";
1879 return -EINVAL;
1880 }
1881
1882 if (class_map.count(id) != 0 && class_map[id] == class_id) {
1883 *ss << name << " already set to class " << class_name;
1884 return 0;
1885 }
1886
1887 set_item_class(id, class_id);
1888
1889 int r = rebuild_roots_with_classes();
1890 if (r < 0)
1891 return r;
1892 return 1;
1893 }
1894
1895 int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
1896 {
1897 assert(ss);
1898 const char *name = get_item_name(id);
1899 if (!name) {
1900 *ss << "osd." << id << " does not have a name";
1901 return -ENOENT;
1902 }
1903
1904 const char *class_name = get_item_class(id);
1905 if (!class_name) {
1906 *ss << "osd." << id << " has not been bound to a specific class yet";
1907 return 0;
1908 }
1909 class_remove_item(id);
1910
1911 int r = rebuild_roots_with_classes();
1912 if (r < 0) {
1913 *ss << "unable to rebuild roots with class '" << class_name << "' "
1914 << "of osd." << id << ": " << cpp_strerror(r);
1915 return r;
1916 }
1917 return 0;
1918 }
1919
1920 int CrushWrapper::device_class_clone(
1921 int original_id, int device_class,
1922 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
1923 const std::set<int32_t>& used_ids,
1924 int *clone,
1925 map<int,map<int,vector<int>>> *cmap_item_weight)
1926 {
1927 const char *item_name = get_item_name(original_id);
1928 if (item_name == NULL)
1929 return -ECHILD;
1930 const char *class_name = get_class_name(device_class);
1931 if (class_name == NULL)
1932 return -EBADF;
1933 string copy_name = item_name + string("~") + class_name;
1934 if (name_exists(copy_name)) {
1935 *clone = get_item_id(copy_name);
1936 return 0;
1937 }
1938
1939 crush_bucket *original = get_bucket(original_id);
1940 assert(!IS_ERR(original));
1941 crush_bucket *copy = crush_make_bucket(crush,
1942 original->alg,
1943 original->hash,
1944 original->type,
1945 0, NULL, NULL);
1946 assert(copy);
1947
1948 vector<unsigned> item_orig_pos; // new item pos -> orig item pos
1949 for (unsigned i = 0; i < original->size; i++) {
1950 int item = original->items[i];
1951 int weight = crush_get_bucket_item_weight(original, i);
1952 if (item >= 0) {
1953 if (class_map.count(item) != 0 && class_map[item] == device_class) {
1954 int res = crush_bucket_add_item(crush, copy, item, weight);
1955 if (res)
1956 return res;
1957 } else {
1958 continue;
1959 }
1960 } else {
1961 int child_copy_id;
1962 int res = device_class_clone(item, device_class, old_class_bucket,
1963 used_ids, &child_copy_id,
1964 cmap_item_weight);
1965 if (res < 0)
1966 return res;
1967 crush_bucket *child_copy = get_bucket(child_copy_id);
1968 assert(!IS_ERR(child_copy));
1969 res = crush_bucket_add_item(crush, copy, child_copy_id,
1970 child_copy->weight);
1971 if (res)
1972 return res;
1973 }
1974 item_orig_pos.push_back(i);
1975 }
1976 assert(item_orig_pos.size() == copy->size);
1977
1978 int bno = 0;
1979 if (old_class_bucket.count(original_id) &&
1980 old_class_bucket.at(original_id).count(device_class)) {
1981 bno = old_class_bucket.at(original_id).at(device_class);
1982 } else {
1983 // pick a new shadow bucket id that is not used by the current map
1984 // *or* any previous shadow buckets.
1985 bno = -1;
1986 while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
1987 used_ids.count(bno)) {
1988 --bno;
1989 }
1990 }
1991 int res = crush_add_bucket(crush, bno, copy, clone);
1992 if (res)
1993 return res;
1994 assert(!bno || bno == *clone);
1995
1996 res = set_item_class(*clone, device_class);
1997 if (res < 0)
1998 return res;
1999
2000 // we do not use set_item_name because the name is intentionally invalid
2001 name_map[*clone] = copy_name;
2002 if (have_rmaps)
2003 name_rmap[copy_name] = *clone;
2004 class_bucket[original_id][device_class] = *clone;
2005
2006 // set up choose_args for the new bucket.
2007 for (auto& w : choose_args) {
2008 crush_choose_arg_map& cmap = w.second;
2009 if (-1-bno >= (int)cmap.size) {
2010 unsigned new_size = -1-bno + 1;
2011 cmap.args = (crush_choose_arg*)realloc(cmap.args,
2012 new_size * sizeof(cmap.args[0]));
2013 assert(cmap.args);
2014 memset(cmap.args + cmap.size, 0,
2015 (new_size - cmap.size) * sizeof(cmap.args[0]));
2016 cmap.size = new_size;
2017 }
2018 auto& o = cmap.args[-1-original_id];
2019 auto& n = cmap.args[-1-bno];
2020 n.ids_size = 0; // FIXME: implement me someday
2021 n.weight_set_size = o.weight_set_size;
2022 n.weight_set = (crush_weight_set*)calloc(
2023 n.weight_set_size, sizeof(crush_weight_set));
2024 for (size_t s = 0; s < n.weight_set_size; ++s) {
2025 n.weight_set[s].size = copy->size;
2026 n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
2027 }
2028 for (size_t s = 0; s < n.weight_set_size; ++s) {
2029 vector<int> bucket_weights(n.weight_set_size);
2030 for (size_t i = 0; i < copy->size; ++i) {
2031 int item = copy->items[i];
2032 if (item >= 0) {
2033 n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
2034 } else {
2035 n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
2036 }
2037 bucket_weights[s] += n.weight_set[s].weights[i];
2038 }
2039 (*cmap_item_weight)[w.first][bno] = bucket_weights;
2040 }
2041 }
2042 return 0;
2043 }
2044
2045 int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
2046 {
2047 assert(rules);
2048 rules->clear();
2049 if (!class_exists(class_name)) {
2050 return -ENOENT;
2051 }
2052 int class_id = get_class_id(class_name);
2053 for (unsigned i = 0; i < crush->max_rules; ++i) {
2054 crush_rule *r = crush->rules[i];
2055 if (!r)
2056 continue;
2057 for (unsigned j = 0; j < r->len; ++j) {
2058 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2059 int step_item = r->steps[j].arg1;
2060 int original_item;
2061 int c;
2062 int res = split_id_class(step_item, &original_item, &c);
2063 if (res < 0) {
2064 return res;
2065 }
2066 if (c != -1 && c == class_id) {
2067 rules->insert(i);
2068 break;
2069 }
2070 }
2071 }
2072 }
2073 return 0;
2074 }
2075
2076 // return rules that might reference the given osd
2077 int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
2078 {
2079 assert(rules);
2080 rules->clear();
2081 if (osd < 0) {
2082 return -EINVAL;
2083 }
2084 for (unsigned i = 0; i < crush->max_rules; ++i) {
2085 crush_rule *r = crush->rules[i];
2086 if (!r)
2087 continue;
2088 for (unsigned j = 0; j < r->len; ++j) {
2089 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2090 int step_item = r->steps[j].arg1;
2091 list<int> unordered;
2092 int rc = _get_leaves(step_item, &unordered);
2093 if (rc < 0) {
2094 return rc; // propagate fatal errors!
2095 }
2096 bool match = false;
2097 for (auto &o: unordered) {
2098 assert(o >= 0);
2099 if (o == osd) {
2100 match = true;
2101 break;
2102 }
2103 }
2104 if (match) {
2105 rules->insert(i);
2106 break;
2107 }
2108 }
2109 }
2110 }
2111 return 0;
2112 }
2113
2114 bool CrushWrapper::_class_is_dead(int class_id)
2115 {
2116 for (auto &p: class_map) {
2117 if (p.first >= 0 && p.second == class_id) {
2118 return false;
2119 }
2120 }
2121 for (unsigned i = 0; i < crush->max_rules; ++i) {
2122 crush_rule *r = crush->rules[i];
2123 if (!r)
2124 continue;
2125 for (unsigned j = 0; j < r->len; ++j) {
2126 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2127 int root = r->steps[j].arg1;
2128 for (auto &p : class_bucket) {
2129 auto& q = p.second;
2130 if (q.count(class_id) && q[class_id] == root) {
2131 return false;
2132 }
2133 }
2134 }
2135 }
2136 }
2137 // no more referenced by any devices or crush rules
2138 return true;
2139 }
2140
2141 void CrushWrapper::cleanup_dead_classes()
2142 {
2143 auto p = class_name.begin();
2144 while (p != class_name.end()) {
2145 if (_class_is_dead(p->first)) {
2146 string n = p->second;
2147 ++p;
2148 remove_class_name(n);
2149 } else {
2150 ++p;
2151 }
2152 }
2153 }
2154
2155 int CrushWrapper::rebuild_roots_with_classes()
2156 {
2157 std::map<int32_t, map<int32_t, int32_t> > old_class_bucket = class_bucket;
2158 cleanup_dead_classes();
2159 int r = trim_roots_with_class();
2160 if (r < 0)
2161 return r;
2162 class_bucket.clear();
2163 return populate_classes(old_class_bucket);
2164 }
2165
2166 void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
2167 {
2168 assert(crush);
2169
2170 __u32 magic = CRUSH_MAGIC;
2171 ::encode(magic, bl);
2172
2173 ::encode(crush->max_buckets, bl);
2174 ::encode(crush->max_rules, bl);
2175 ::encode(crush->max_devices, bl);
2176
2177 bool encode_compat_choose_args = false;
2178 crush_choose_arg_map arg_map;
2179 memset(&arg_map, '\0', sizeof(arg_map));
2180 if (has_choose_args() &&
2181 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
2182 assert(!has_incompat_choose_args());
2183 encode_compat_choose_args = true;
2184 arg_map = choose_args.begin()->second;
2185 }
2186
2187 // buckets
2188 for (int i=0; i<crush->max_buckets; i++) {
2189 __u32 alg = 0;
2190 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
2191 ::encode(alg, bl);
2192 if (!alg)
2193 continue;
2194
2195 ::encode(crush->buckets[i]->id, bl);
2196 ::encode(crush->buckets[i]->type, bl);
2197 ::encode(crush->buckets[i]->alg, bl);
2198 ::encode(crush->buckets[i]->hash, bl);
2199 ::encode(crush->buckets[i]->weight, bl);
2200 ::encode(crush->buckets[i]->size, bl);
2201 for (unsigned j=0; j<crush->buckets[i]->size; j++)
2202 ::encode(crush->buckets[i]->items[j], bl);
2203
2204 switch (crush->buckets[i]->alg) {
2205 case CRUSH_BUCKET_UNIFORM:
2206 ::encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
2207 break;
2208
2209 case CRUSH_BUCKET_LIST:
2210 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2211 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
2212 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
2213 }
2214 break;
2215
2216 case CRUSH_BUCKET_TREE:
2217 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
2218 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
2219 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
2220 break;
2221
2222 case CRUSH_BUCKET_STRAW:
2223 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2224 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
2225 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
2226 }
2227 break;
2228
2229 case CRUSH_BUCKET_STRAW2:
2230 {
2231 __u32 *weights;
2232 if (encode_compat_choose_args &&
2233 arg_map.args[i].weight_set_size > 0) {
2234 weights = arg_map.args[i].weight_set[0].weights;
2235 } else {
2236 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
2237 }
2238 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2239 ::encode(weights[j], bl);
2240 }
2241 }
2242 break;
2243
2244 default:
2245 ceph_abort();
2246 break;
2247 }
2248 }
2249
2250 // rules
2251 for (unsigned i=0; i<crush->max_rules; i++) {
2252 __u32 yes = crush->rules[i] ? 1:0;
2253 ::encode(yes, bl);
2254 if (!yes)
2255 continue;
2256
2257 ::encode(crush->rules[i]->len, bl);
2258 ::encode(crush->rules[i]->mask, bl);
2259 for (unsigned j=0; j<crush->rules[i]->len; j++)
2260 ::encode(crush->rules[i]->steps[j], bl);
2261 }
2262
2263 // name info
2264 ::encode(type_map, bl);
2265 ::encode(name_map, bl);
2266 ::encode(rule_name_map, bl);
2267
2268 // tunables
2269 ::encode(crush->choose_local_tries, bl);
2270 ::encode(crush->choose_local_fallback_tries, bl);
2271 ::encode(crush->choose_total_tries, bl);
2272 ::encode(crush->chooseleaf_descend_once, bl);
2273 ::encode(crush->chooseleaf_vary_r, bl);
2274 ::encode(crush->straw_calc_version, bl);
2275 ::encode(crush->allowed_bucket_algs, bl);
2276 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
2277 ::encode(crush->chooseleaf_stable, bl);
2278 }
2279
2280 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2281 // device classes
2282 ::encode(class_map, bl);
2283 ::encode(class_name, bl);
2284 ::encode(class_bucket, bl);
2285
2286 // choose args
2287 __u32 size = (__u32)choose_args.size();
2288 ::encode(size, bl);
2289 for (auto c : choose_args) {
2290 ::encode(c.first, bl);
2291 crush_choose_arg_map arg_map = c.second;
2292 size = 0;
2293 for (__u32 i = 0; i < arg_map.size; i++) {
2294 crush_choose_arg *arg = &arg_map.args[i];
2295 if (arg->weight_set_size == 0 &&
2296 arg->ids_size == 0)
2297 continue;
2298 size++;
2299 }
2300 ::encode(size, bl);
2301 for (__u32 i = 0; i < arg_map.size; i++) {
2302 crush_choose_arg *arg = &arg_map.args[i];
2303 if (arg->weight_set_size == 0 &&
2304 arg->ids_size == 0)
2305 continue;
2306 ::encode(i, bl);
2307 ::encode(arg->weight_set_size, bl);
2308 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2309 crush_weight_set *weight_set = &arg->weight_set[j];
2310 ::encode(weight_set->size, bl);
2311 for (__u32 k = 0; k < weight_set->size; k++)
2312 ::encode(weight_set->weights[k], bl);
2313 }
2314 ::encode(arg->ids_size, bl);
2315 for (__u32 j = 0; j < arg->ids_size; j++)
2316 ::encode(arg->ids[j], bl);
2317 }
2318 }
2319 }
2320 }
2321
2322 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
2323 {
2324 m.clear();
2325 __u32 n;
2326 ::decode(n, blp);
2327 while (n--) {
2328 __s32 key;
2329 ::decode(key, blp);
2330
2331 __u32 strlen;
2332 ::decode(strlen, blp);
2333 if (strlen == 0) {
2334 // der, key was actually 64-bits!
2335 ::decode(strlen, blp);
2336 }
2337 ::decode_nohead(strlen, m[key], blp);
2338 }
2339 }
2340
2341 void CrushWrapper::decode(bufferlist::iterator& blp)
2342 {
2343 create();
2344
2345 __u32 magic;
2346 ::decode(magic, blp);
2347 if (magic != CRUSH_MAGIC)
2348 throw buffer::malformed_input("bad magic number");
2349
2350 ::decode(crush->max_buckets, blp);
2351 ::decode(crush->max_rules, blp);
2352 ::decode(crush->max_devices, blp);
2353
2354 // legacy tunables, unless we decode something newer
2355 set_tunables_legacy();
2356
2357 try {
2358 // buckets
2359 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
2360 for (int i=0; i<crush->max_buckets; i++) {
2361 decode_crush_bucket(&crush->buckets[i], blp);
2362 }
2363
2364 // rules
2365 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
2366 for (unsigned i = 0; i < crush->max_rules; ++i) {
2367 __u32 yes;
2368 ::decode(yes, blp);
2369 if (!yes) {
2370 crush->rules[i] = NULL;
2371 continue;
2372 }
2373
2374 __u32 len;
2375 ::decode(len, blp);
2376 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
2377 crush->rules[i]->len = len;
2378 ::decode(crush->rules[i]->mask, blp);
2379 for (unsigned j=0; j<crush->rules[i]->len; j++)
2380 ::decode(crush->rules[i]->steps[j], blp);
2381 }
2382
2383 // name info
2384 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2385 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2386 // both by assuming the string is always non-empty.
2387 decode_32_or_64_string_map(type_map, blp);
2388 decode_32_or_64_string_map(name_map, blp);
2389 decode_32_or_64_string_map(rule_name_map, blp);
2390
2391 // tunables
2392 if (!blp.end()) {
2393 ::decode(crush->choose_local_tries, blp);
2394 ::decode(crush->choose_local_fallback_tries, blp);
2395 ::decode(crush->choose_total_tries, blp);
2396 }
2397 if (!blp.end()) {
2398 ::decode(crush->chooseleaf_descend_once, blp);
2399 }
2400 if (!blp.end()) {
2401 ::decode(crush->chooseleaf_vary_r, blp);
2402 }
2403 if (!blp.end()) {
2404 ::decode(crush->straw_calc_version, blp);
2405 }
2406 if (!blp.end()) {
2407 ::decode(crush->allowed_bucket_algs, blp);
2408 }
2409 if (!blp.end()) {
2410 ::decode(crush->chooseleaf_stable, blp);
2411 }
2412 if (!blp.end()) {
2413 ::decode(class_map, blp);
2414 ::decode(class_name, blp);
2415 for (auto &c : class_name)
2416 class_rname[c.second] = c.first;
2417 ::decode(class_bucket, blp);
2418 }
2419 if (!blp.end()) {
2420 __u32 choose_args_size;
2421 ::decode(choose_args_size, blp);
2422 for (__u32 i = 0; i < choose_args_size; i++) {
2423 typename decltype(choose_args)::key_type choose_args_index;
2424 ::decode(choose_args_index, blp);
2425 crush_choose_arg_map arg_map;
2426 arg_map.size = crush->max_buckets;
2427 arg_map.args = (crush_choose_arg*)calloc(
2428 arg_map.size, sizeof(crush_choose_arg));
2429 __u32 size;
2430 ::decode(size, blp);
2431 for (__u32 j = 0; j < size; j++) {
2432 __u32 bucket_index;
2433 ::decode(bucket_index, blp);
2434 assert(bucket_index < arg_map.size);
2435 crush_choose_arg *arg = &arg_map.args[bucket_index];
2436 ::decode(arg->weight_set_size, blp);
2437 if (arg->weight_set_size) {
2438 arg->weight_set = (crush_weight_set*)calloc(
2439 arg->weight_set_size, sizeof(crush_weight_set));
2440 for (__u32 k = 0; k < arg->weight_set_size; k++) {
2441 crush_weight_set *weight_set = &arg->weight_set[k];
2442 ::decode(weight_set->size, blp);
2443 weight_set->weights = (__u32*)calloc(
2444 weight_set->size, sizeof(__u32));
2445 for (__u32 l = 0; l < weight_set->size; l++)
2446 ::decode(weight_set->weights[l], blp);
2447 }
2448 }
2449 ::decode(arg->ids_size, blp);
2450 if (arg->ids_size) {
2451 assert(arg->ids_size == crush->buckets[bucket_index]->size);
2452 arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
2453 for (__u32 k = 0; k < arg->ids_size; k++)
2454 ::decode(arg->ids[k], blp);
2455 }
2456 }
2457 choose_args[choose_args_index] = arg_map;
2458 }
2459 }
2460 finalize();
2461 }
2462 catch (...) {
2463 crush_destroy(crush);
2464 throw;
2465 }
2466 }
2467
2468 void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp)
2469 {
2470 __u32 alg;
2471 ::decode(alg, blp);
2472 if (!alg) {
2473 *bptr = NULL;
2474 return;
2475 }
2476
2477 int size = 0;
2478 switch (alg) {
2479 case CRUSH_BUCKET_UNIFORM:
2480 size = sizeof(crush_bucket_uniform);
2481 break;
2482 case CRUSH_BUCKET_LIST:
2483 size = sizeof(crush_bucket_list);
2484 break;
2485 case CRUSH_BUCKET_TREE:
2486 size = sizeof(crush_bucket_tree);
2487 break;
2488 case CRUSH_BUCKET_STRAW:
2489 size = sizeof(crush_bucket_straw);
2490 break;
2491 case CRUSH_BUCKET_STRAW2:
2492 size = sizeof(crush_bucket_straw2);
2493 break;
2494 default:
2495 {
2496 char str[128];
2497 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
2498 throw buffer::malformed_input(str);
2499 }
2500 }
2501 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
2502 *bptr = bucket;
2503
2504 ::decode(bucket->id, blp);
2505 ::decode(bucket->type, blp);
2506 ::decode(bucket->alg, blp);
2507 ::decode(bucket->hash, blp);
2508 ::decode(bucket->weight, blp);
2509 ::decode(bucket->size, blp);
2510
2511 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
2512 for (unsigned j = 0; j < bucket->size; ++j) {
2513 ::decode(bucket->items[j], blp);
2514 }
2515
2516 switch (bucket->alg) {
2517 case CRUSH_BUCKET_UNIFORM:
2518 ::decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
2519 break;
2520
2521 case CRUSH_BUCKET_LIST: {
2522 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
2523 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2524 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2525
2526 for (unsigned j = 0; j < bucket->size; ++j) {
2527 ::decode(cbl->item_weights[j], blp);
2528 ::decode(cbl->sum_weights[j], blp);
2529 }
2530 break;
2531 }
2532
2533 case CRUSH_BUCKET_TREE: {
2534 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
2535 ::decode(cbt->num_nodes, blp);
2536 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
2537 for (unsigned j=0; j<cbt->num_nodes; j++) {
2538 ::decode(cbt->node_weights[j], blp);
2539 }
2540 break;
2541 }
2542
2543 case CRUSH_BUCKET_STRAW: {
2544 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
2545 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2546 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2547 for (unsigned j = 0; j < bucket->size; ++j) {
2548 ::decode(cbs->item_weights[j], blp);
2549 ::decode(cbs->straws[j], blp);
2550 }
2551 break;
2552 }
2553
2554 case CRUSH_BUCKET_STRAW2: {
2555 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
2556 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2557 for (unsigned j = 0; j < bucket->size; ++j) {
2558 ::decode(cbs->item_weights[j], blp);
2559 }
2560 break;
2561 }
2562
2563 default:
2564 // We should have handled this case in the first switch statement
2565 ceph_abort();
2566 break;
2567 }
2568 }
2569
2570
2571 void CrushWrapper::dump(Formatter *f) const
2572 {
2573 f->open_array_section("devices");
2574 for (int i=0; i<get_max_devices(); i++) {
2575 f->open_object_section("device");
2576 f->dump_int("id", i);
2577 const char *n = get_item_name(i);
2578 if (n) {
2579 f->dump_string("name", n);
2580 } else {
2581 char name[20];
2582 sprintf(name, "device%d", i);
2583 f->dump_string("name", name);
2584 }
2585 const char *device_class = get_item_class(i);
2586 if (device_class != NULL)
2587 f->dump_string("class", device_class);
2588 f->close_section();
2589 }
2590 f->close_section();
2591
2592 f->open_array_section("types");
2593 int n = get_num_type_names();
2594 for (int i=0; n; i++) {
2595 const char *name = get_type_name(i);
2596 if (!name) {
2597 if (i == 0) {
2598 f->open_object_section("type");
2599 f->dump_int("type_id", 0);
2600 f->dump_string("name", "device");
2601 f->close_section();
2602 }
2603 continue;
2604 }
2605 n--;
2606 f->open_object_section("type");
2607 f->dump_int("type_id", i);
2608 f->dump_string("name", name);
2609 f->close_section();
2610 }
2611 f->close_section();
2612
2613 f->open_array_section("buckets");
2614 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
2615 if (!bucket_exists(bucket))
2616 continue;
2617 f->open_object_section("bucket");
2618 f->dump_int("id", bucket);
2619 if (get_item_name(bucket))
2620 f->dump_string("name", get_item_name(bucket));
2621 f->dump_int("type_id", get_bucket_type(bucket));
2622 if (get_type_name(get_bucket_type(bucket)))
2623 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
2624 f->dump_int("weight", get_bucket_weight(bucket));
2625 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
2626 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
2627 f->open_array_section("items");
2628 for (int j=0; j<get_bucket_size(bucket); j++) {
2629 f->open_object_section("item");
2630 f->dump_int("id", get_bucket_item(bucket, j));
2631 f->dump_int("weight", get_bucket_item_weight(bucket, j));
2632 f->dump_int("pos", j);
2633 f->close_section();
2634 }
2635 f->close_section();
2636 f->close_section();
2637 }
2638 f->close_section();
2639
2640 f->open_array_section("rules");
2641 dump_rules(f);
2642 f->close_section();
2643
2644 f->open_object_section("tunables");
2645 dump_tunables(f);
2646 f->close_section();
2647
2648 dump_choose_args(f);
2649 }
2650
2651 namespace {
2652 // depth first walker
2653 class TreeDumper {
2654 typedef CrushTreeDumper::Item Item;
2655 const CrushWrapper *crush;
2656 const CrushTreeDumper::name_map_t& weight_set_names;
2657 public:
2658 explicit TreeDumper(const CrushWrapper *crush,
2659 const CrushTreeDumper::name_map_t& wsnames)
2660 : crush(crush), weight_set_names(wsnames) {}
2661
2662 void dump(Formatter *f) {
2663 set<int> roots;
2664 crush->find_roots(&roots);
2665 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
2666 dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
2667 }
2668 }
2669
2670 private:
2671 void dump_item(const Item& qi, Formatter* f) {
2672 if (qi.is_bucket()) {
2673 f->open_object_section("bucket");
2674 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2675 dump_bucket_children(qi, f);
2676 f->close_section();
2677 } else {
2678 f->open_object_section("device");
2679 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2680 f->close_section();
2681 }
2682 }
2683
2684 void dump_bucket_children(const Item& parent, Formatter* f) {
2685 f->open_array_section("items");
2686 const int max_pos = crush->get_bucket_size(parent.id);
2687 for (int pos = 0; pos < max_pos; pos++) {
2688 int id = crush->get_bucket_item(parent.id, pos);
2689 float weight = crush->get_bucket_item_weightf(parent.id, pos);
2690 dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
2691 }
2692 f->close_section();
2693 }
2694 };
2695 }
2696
2697 void CrushWrapper::dump_tree(
2698 Formatter *f,
2699 const CrushTreeDumper::name_map_t& weight_set_names) const
2700 {
2701 assert(f);
2702 TreeDumper(this, weight_set_names).dump(f);
2703 }
2704
2705 void CrushWrapper::dump_tunables(Formatter *f) const
2706 {
2707 f->dump_int("choose_local_tries", get_choose_local_tries());
2708 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2709 f->dump_int("choose_total_tries", get_choose_total_tries());
2710 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2711 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2712 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2713 f->dump_int("straw_calc_version", get_straw_calc_version());
2714 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2715
2716 // be helpful about it
2717 if (has_jewel_tunables())
2718 f->dump_string("profile", "jewel");
2719 else if (has_hammer_tunables())
2720 f->dump_string("profile", "hammer");
2721 else if (has_firefly_tunables())
2722 f->dump_string("profile", "firefly");
2723 else if (has_bobtail_tunables())
2724 f->dump_string("profile", "bobtail");
2725 else if (has_argonaut_tunables())
2726 f->dump_string("profile", "argonaut");
2727 else
2728 f->dump_string("profile", "unknown");
2729 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
2730 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
2731
2732 // be helpful about minimum version required
2733 f->dump_string("minimum_required_version", get_min_required_version());
2734
2735 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2736 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2737 f->dump_int("has_v2_rules", (int)has_v2_rules());
2738 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2739 f->dump_int("has_v3_rules", (int)has_v3_rules());
2740 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
2741 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2742 f->dump_int("has_v5_rules", (int)has_v5_rules());
2743 }
2744
2745 void CrushWrapper::dump_choose_args(Formatter *f) const
2746 {
2747 f->open_object_section("choose_args");
2748 for (auto c : choose_args) {
2749 crush_choose_arg_map arg_map = c.second;
2750 f->open_array_section(stringify(c.first).c_str());
2751 for (__u32 i = 0; i < arg_map.size; i++) {
2752 crush_choose_arg *arg = &arg_map.args[i];
2753 if (arg->weight_set_size == 0 &&
2754 arg->ids_size == 0)
2755 continue;
2756 f->open_object_section("choose_args");
2757 int bucket_index = i;
2758 f->dump_int("bucket_id", -1-bucket_index);
2759 if (arg->weight_set_size > 0) {
2760 f->open_array_section("weight_set");
2761 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2762 f->open_array_section("weights");
2763 __u32 *weights = arg->weight_set[j].weights;
2764 __u32 size = arg->weight_set[j].size;
2765 for (__u32 k = 0; k < size; k++) {
2766 f->dump_float("weight", (float)weights[k]/(float)0x10000);
2767 }
2768 f->close_section();
2769 }
2770 f->close_section();
2771 }
2772 if (arg->ids_size > 0) {
2773 f->open_array_section("ids");
2774 for (__u32 j = 0; j < arg->ids_size; j++)
2775 f->dump_int("id", arg->ids[j]);
2776 f->close_section();
2777 }
2778 f->close_section();
2779 }
2780 f->close_section();
2781 }
2782 f->close_section();
2783 }
2784
2785 void CrushWrapper::dump_rules(Formatter *f) const
2786 {
2787 for (int i=0; i<get_max_rules(); i++) {
2788 if (!rule_exists(i))
2789 continue;
2790 dump_rule(i, f);
2791 }
2792 }
2793
2794 void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
2795 {
2796 f->open_object_section("rule");
2797 f->dump_int("rule_id", ruleset);
2798 if (get_rule_name(ruleset))
2799 f->dump_string("rule_name", get_rule_name(ruleset));
2800 f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
2801 f->dump_int("type", get_rule_mask_type(ruleset));
2802 f->dump_int("min_size", get_rule_mask_min_size(ruleset));
2803 f->dump_int("max_size", get_rule_mask_max_size(ruleset));
2804 f->open_array_section("steps");
2805 for (int j=0; j<get_rule_len(ruleset); j++) {
2806 f->open_object_section("step");
2807 switch (get_rule_op(ruleset, j)) {
2808 case CRUSH_RULE_NOOP:
2809 f->dump_string("op", "noop");
2810 break;
2811 case CRUSH_RULE_TAKE:
2812 f->dump_string("op", "take");
2813 {
2814 int item = get_rule_arg1(ruleset, j);
2815 f->dump_int("item", item);
2816
2817 const char *name = get_item_name(item);
2818 f->dump_string("item_name", name ? name : "");
2819 }
2820 break;
2821 case CRUSH_RULE_EMIT:
2822 f->dump_string("op", "emit");
2823 break;
2824 case CRUSH_RULE_CHOOSE_FIRSTN:
2825 f->dump_string("op", "choose_firstn");
2826 f->dump_int("num", get_rule_arg1(ruleset, j));
2827 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2828 break;
2829 case CRUSH_RULE_CHOOSE_INDEP:
2830 f->dump_string("op", "choose_indep");
2831 f->dump_int("num", get_rule_arg1(ruleset, j));
2832 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2833 break;
2834 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
2835 f->dump_string("op", "chooseleaf_firstn");
2836 f->dump_int("num", get_rule_arg1(ruleset, j));
2837 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2838 break;
2839 case CRUSH_RULE_CHOOSELEAF_INDEP:
2840 f->dump_string("op", "chooseleaf_indep");
2841 f->dump_int("num", get_rule_arg1(ruleset, j));
2842 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2843 break;
2844 case CRUSH_RULE_SET_CHOOSE_TRIES:
2845 f->dump_string("op", "set_choose_tries");
2846 f->dump_int("num", get_rule_arg1(ruleset, j));
2847 break;
2848 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
2849 f->dump_string("op", "set_chooseleaf_tries");
2850 f->dump_int("num", get_rule_arg1(ruleset, j));
2851 break;
2852 default:
2853 f->dump_int("opcode", get_rule_op(ruleset, j));
2854 f->dump_int("arg1", get_rule_arg1(ruleset, j));
2855 f->dump_int("arg2", get_rule_arg2(ruleset, j));
2856 }
2857 f->close_section();
2858 }
2859 f->close_section();
2860 f->close_section();
2861 }
2862
2863 void CrushWrapper::list_rules(Formatter *f) const
2864 {
2865 for (int rule = 0; rule < get_max_rules(); rule++) {
2866 if (!rule_exists(rule))
2867 continue;
2868 f->dump_string("name", get_rule_name(rule));
2869 }
2870 }
2871
2872 void CrushWrapper::list_rules(ostream *ss) const
2873 {
2874 for (int rule = 0; rule < get_max_rules(); rule++) {
2875 if (!rule_exists(rule))
2876 continue;
2877 *ss << get_rule_name(rule) << "\n";
2878 }
2879 }
2880
2881 class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
2882 public:
2883 typedef CrushTreeDumper::Dumper<TextTable> Parent;
2884
2885 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2886 const CrushTreeDumper::name_map_t& wsnames)
2887 : Parent(crush, wsnames) {}
2888 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2889 const CrushTreeDumper::name_map_t& wsnames,
2890 bool show_shadow)
2891 : Parent(crush, wsnames, show_shadow) {}
2892
2893
2894 void dump(TextTable *tbl) {
2895 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
2896 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
2897 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
2898 for (auto& p : crush->choose_args) {
2899 if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
2900 tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
2901 } else {
2902 string name;
2903 auto q = weight_set_names.find(p.first);
2904 name = q != weight_set_names.end() ? q->second :
2905 stringify(p.first);
2906 tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
2907 }
2908 }
2909 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
2910 Parent::dump(tbl);
2911 }
2912
2913 protected:
2914 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
2915 const char *c = crush->get_item_class(qi.id);
2916 if (!c)
2917 c = "";
2918 *tbl << qi.id
2919 << c
2920 << weightf_t(qi.weight);
2921 for (auto& p : crush->choose_args) {
2922 if (qi.parent < 0) {
2923 const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
2924 int bidx = -1 - qi.parent;
2925 const crush_bucket *b = crush->get_bucket(qi.parent);
2926 if (b &&
2927 bidx < (int)cmap.size &&
2928 cmap.args[bidx].weight_set &&
2929 cmap.args[bidx].weight_set_size >= 1) {
2930 int pos;
2931 for (pos = 0;
2932 pos < (int)cmap.args[bidx].weight_set[0].size &&
2933 b->items[pos] != qi.id;
2934 ++pos) ;
2935 *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] /
2936 (float)0x10000);
2937 continue;
2938 }
2939 }
2940 *tbl << "";
2941 }
2942 ostringstream ss;
2943 for (int k=0; k < qi.depth; k++) {
2944 ss << " ";
2945 }
2946 if (qi.is_bucket()) {
2947 ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
2948 << crush->get_item_name(qi.id);
2949 } else {
2950 ss << "osd." << qi.id;
2951 }
2952 *tbl << ss.str();
2953 *tbl << TextTable::endrow;
2954 }
2955 };
2956
2957
2958 class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
2959 public:
2960 typedef CrushTreeDumper::FormattingDumper Parent;
2961
2962 explicit CrushTreeFormattingDumper(
2963 const CrushWrapper *crush,
2964 const CrushTreeDumper::name_map_t& wsnames)
2965 : Parent(crush, wsnames) {}
2966
2967 explicit CrushTreeFormattingDumper(
2968 const CrushWrapper *crush,
2969 const CrushTreeDumper::name_map_t& wsnames,
2970 bool show_shadow)
2971 : Parent(crush, wsnames, show_shadow) {}
2972
2973 void dump(Formatter *f) {
2974 f->open_array_section("nodes");
2975 Parent::dump(f);
2976 f->close_section();
2977 f->open_array_section("stray");
2978 f->close_section();
2979 }
2980 };
2981
2982
2983 void CrushWrapper::dump_tree(
2984 ostream *out,
2985 Formatter *f,
2986 const CrushTreeDumper::name_map_t& weight_set_names,
2987 bool show_shadow) const
2988 {
2989 if (out) {
2990 TextTable tbl;
2991 CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
2992 *out << tbl;
2993 }
2994 if (f) {
2995 CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
2996 }
2997 }
2998
2999 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
3000 {
3001 o.push_back(new CrushWrapper);
3002 // fixme
3003 }
3004
3005 /**
3006 * Determine the default CRUSH ruleset ID to be used with
3007 * newly created replicated pools.
3008 *
3009 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
3010 */
3011 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
3012 {
3013 int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
3014 if (crush_ruleset < 0) {
3015 crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
3016 } else if (!ruleset_exists(crush_ruleset)) {
3017 crush_ruleset = -1; // match find_first_ruleset() retval
3018 }
3019 return crush_ruleset;
3020 }
3021
3022 bool CrushWrapper::is_valid_crush_name(const string& s)
3023 {
3024 if (s.empty())
3025 return false;
3026 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
3027 if (!(*p == '-') &&
3028 !(*p == '_') &&
3029 !(*p == '.') &&
3030 !(*p >= '0' && *p <= '9') &&
3031 !(*p >= 'A' && *p <= 'Z') &&
3032 !(*p >= 'a' && *p <= 'z'))
3033 return false;
3034 }
3035 return true;
3036 }
3037
3038 bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
3039 const map<string,string>& loc)
3040 {
3041 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
3042 if (!is_valid_crush_name(l->first) ||
3043 !is_valid_crush_name(l->second)) {
3044 ldout(cct, 1) << "loc["
3045 << l->first << "] = '"
3046 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
3047 << dendl;
3048 return false;
3049 }
3050 }
3051 return true;
3052 }
3053
3054 int CrushWrapper::_choose_type_stack(
3055 CephContext *cct,
3056 const vector<pair<int,int>>& stack,
3057 const set<int>& overfull,
3058 const vector<int>& underfull,
3059 const vector<int>& orig,
3060 vector<int>::const_iterator& i,
3061 set<int>& used,
3062 vector<int> *pw) const
3063 {
3064 vector<int> w = *pw;
3065 vector<int> o;
3066
3067 ldout(cct, 10) << __func__ << " stack " << stack
3068 << " orig " << orig
3069 << " at " << *i
3070 << " pw " << *pw
3071 << dendl;
3072
3073 vector<int> cumulative_fanout(stack.size());
3074 int f = 1;
3075 for (int j = (int)stack.size() - 1; j >= 0; --j) {
3076 cumulative_fanout[j] = f;
3077 f *= stack[j].second;
3078 }
3079 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
3080 << dendl;
3081
3082 // identify underful targets for each intermediate level.
3083 // this serves two purposes:
3084 // 1. we can tell when we are selecting a bucket that does not have any underfull
3085 // devices beneath it. that means that if the current input includes an overfull
3086 // device, we won't be able to find an underfull device with this parent to
3087 // swap for it.
3088 // 2. when we decide we should reject a bucket due to the above, this list gives us
3089 // a list of peers to consider that *do* have underfull devices available.. (we
3090 // are careful to pick one that has the same parent.)
3091 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
3092 underfull_buckets.resize(stack.size() - 1);
3093 for (auto osd : underfull) {
3094 int item = osd;
3095 for (int j = (int)stack.size() - 2; j >= 0; --j) {
3096 int type = stack[j].first;
3097 item = get_parent_of_type(item, type);
3098 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
3099 << " is " << item << dendl;
3100 underfull_buckets[j].insert(item);
3101 }
3102 }
3103 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
3104
3105 for (unsigned j = 0; j < stack.size(); ++j) {
3106 int type = stack[j].first;
3107 int fanout = stack[j].second;
3108 int cum_fanout = cumulative_fanout[j];
3109 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
3110 << " cumulative " << cum_fanout
3111 << " w " << w << dendl;
3112 vector<int> o;
3113 auto tmpi = i;
3114 for (auto from : w) {
3115 ldout(cct, 10) << " from " << from << dendl;
3116 // identify leaves under each choice. we use this to check whether any of these
3117 // leaves are overfull. (if so, we need to make sure there are underfull candidates
3118 // to swap for them.)
3119 vector<set<int>> leaves;
3120 leaves.resize(fanout);
3121 for (int pos = 0; pos < fanout; ++pos) {
3122 if (type > 0) {
3123 // non-leaf
3124 int item = get_parent_of_type(*tmpi, type);
3125 o.push_back(item);
3126 int n = cum_fanout;
3127 while (n-- && tmpi != orig.end()) {
3128 leaves[pos].insert(*tmpi++);
3129 }
3130 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
3131 << " of type " << type << " over leaves " << leaves[pos] << dendl;
3132 } else {
3133 // leaf
3134 bool replaced = false;
3135 if (overfull.count(*i)) {
3136 for (auto item : underfull) {
3137 ldout(cct, 10) << __func__ << " pos " << pos
3138 << " was " << *i << " considering " << item
3139 << dendl;
3140 if (used.count(item)) {
3141 ldout(cct, 20) << __func__ << " in used " << used << dendl;
3142 continue;
3143 }
3144 if (!subtree_contains(from, item)) {
3145 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
3146 continue;
3147 }
3148 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
3149 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
3150 continue;
3151 }
3152 o.push_back(item);
3153 used.insert(item);
3154 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
3155 << *i << " -> " << item << dendl;
3156 replaced = true;
3157 ++i;
3158 break;
3159 }
3160 }
3161 if (!replaced) {
3162 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
3163 << dendl;
3164 o.push_back(*i);
3165 ++i;
3166 }
3167 if (i == orig.end()) {
3168 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
3169 break;
3170 }
3171 }
3172 }
3173 if (j + 1 < stack.size()) {
3174 // check if any buckets have overfull leaves but no underfull candidates
3175 for (int pos = 0; pos < fanout; ++pos) {
3176 if (underfull_buckets[j].count(o[pos]) == 0) {
3177 // are any leaves overfull?
3178 bool any_overfull = false;
3179 for (auto osd : leaves[pos]) {
3180 if (overfull.count(osd)) {
3181 any_overfull = true;
3182 }
3183 }
3184 if (any_overfull) {
3185 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
3186 << ">0 leaves " << leaves[pos] << " is overfull; alts "
3187 << underfull_buckets[j]
3188 << dendl;
3189 for (auto alt : underfull_buckets[j]) {
3190 if (std::find(o.begin(), o.end(), alt) == o.end()) {
3191 // see if alt has the same parent
3192 if (j == 0 ||
3193 get_parent_of_type(o[pos], stack[j-1].first) ==
3194 get_parent_of_type(alt, stack[j-1].first)) {
3195 if (j)
3196 ldout(cct, 10) << " replacing " << o[pos]
3197 << " (which has no underfull leaves) with " << alt
3198 << " (same parent "
3199 << get_parent_of_type(alt, stack[j-1].first) << " type "
3200 << type << ")" << dendl;
3201 else
3202 ldout(cct, 10) << " replacing " << o[pos]
3203 << " (which has no underfull leaves) with " << alt
3204 << " (first level)" << dendl;
3205 o[pos] = alt;
3206 break;
3207 } else {
3208 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
3209 << " has different parent, skipping" << dendl;
3210 }
3211 }
3212 }
3213 }
3214 }
3215 }
3216 }
3217 if (i == orig.end()) {
3218 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
3219 break;
3220 }
3221 }
3222 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
3223 w.swap(o);
3224 }
3225 *pw = w;
3226 return 0;
3227 }
3228
3229 int CrushWrapper::try_remap_rule(
3230 CephContext *cct,
3231 int ruleno,
3232 int maxout,
3233 const set<int>& overfull,
3234 const vector<int>& underfull,
3235 const vector<int>& orig,
3236 vector<int> *out) const
3237 {
3238 const crush_map *map = crush;
3239 const crush_rule *rule = get_rule(ruleno);
3240 assert(rule);
3241
3242 ldout(cct, 10) << __func__ << " ruleno " << ruleno
3243 << " numrep " << maxout << " overfull " << overfull
3244 << " underfull " << underfull << " orig " << orig
3245 << dendl;
3246 vector<int> w; // working set
3247 out->clear();
3248
3249 auto i = orig.begin();
3250 set<int> used;
3251
3252 vector<pair<int,int>> type_stack; // (type, fan-out)
3253
3254 for (unsigned step = 0; step < rule->len; ++step) {
3255 const crush_rule_step *curstep = &rule->steps[step];
3256 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
3257 switch (curstep->op) {
3258 case CRUSH_RULE_TAKE:
3259 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
3260 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
3261 map->buckets[-1-curstep->arg1])) {
3262 w.clear();
3263 w.push_back(curstep->arg1);
3264 ldout(cct, 10) << __func__ << " take " << w << dendl;
3265 } else {
3266 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
3267 }
3268 break;
3269
3270 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3271 case CRUSH_RULE_CHOOSELEAF_INDEP:
3272 {
3273 int numrep = curstep->arg1;
3274 int type = curstep->arg2;
3275 if (numrep <= 0)
3276 numrep += maxout;
3277 type_stack.push_back(make_pair(type, numrep));
3278 type_stack.push_back(make_pair(0, 1));
3279 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3280 i, used, &w);
3281 if (r < 0)
3282 return r;
3283 type_stack.clear();
3284 }
3285 break;
3286
3287 case CRUSH_RULE_CHOOSE_FIRSTN:
3288 case CRUSH_RULE_CHOOSE_INDEP:
3289 {
3290 int numrep = curstep->arg1;
3291 int type = curstep->arg2;
3292 if (numrep <= 0)
3293 numrep += maxout;
3294 type_stack.push_back(make_pair(type, numrep));
3295 }
3296 break;
3297
3298 case CRUSH_RULE_EMIT:
3299 ldout(cct, 10) << " emit " << w << dendl;
3300 if (!type_stack.empty()) {
3301 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3302 i, used, &w);
3303 if (r < 0)
3304 return r;
3305 type_stack.clear();
3306 }
3307 for (auto item : w) {
3308 out->push_back(item);
3309 }
3310 w.clear();
3311 break;
3312
3313 default:
3314 // ignore
3315 break;
3316 }
3317 }
3318
3319 return 0;
3320 }
3321
3322
3323 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3324 CephContext *cct,
3325 crush_choose_arg_map cmap,
3326 int bucketid,
3327 int id,
3328 const vector<int>& weight,
3329 ostream *ss)
3330 {
3331 int changed = 0;
3332 int bidx = -1 - bucketid;
3333 crush_bucket *b = crush->buckets[bidx];
3334 if (bidx >= (int)cmap.size) {
3335 if (ss)
3336 *ss << "no weight-set for bucket " << b->id;
3337 ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
3338 << dendl;
3339 return 0;
3340 }
3341 crush_choose_arg *carg = &cmap.args[bidx];
3342 if (carg->weight_set == NULL) {
3343 if (ss)
3344 *ss << "no weight-set for bucket " << b->id;
3345 ldout(cct, 10) << __func__ << " no weight_set for bucket " << b->id
3346 << dendl;
3347 return 0;
3348 }
3349 if (carg->weight_set_size != weight.size()) {
3350 if (ss)
3351 *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
3352 ldout(cct, 10) << __func__ << " weight_set_size != " << weight.size()
3353 << " for bucket " << b->id << dendl;
3354 return 0;
3355 }
3356 for (unsigned i = 0; i < b->size; i++) {
3357 if (b->items[i] == id) {
3358 for (unsigned j = 0; j < weight.size(); ++j) {
3359 carg->weight_set[j].weights[i] = weight[j];
3360 }
3361 ldout(cct, 5) << __func__ << " set " << id << " to " << weight
3362 << " in bucket " << b->id << dendl;
3363 changed++;
3364 }
3365 }
3366 if (changed) {
3367 vector<int> bucket_weight(weight.size(), 0);
3368 for (unsigned i = 0; i < b->size; i++) {
3369 for (unsigned j = 0; j < weight.size(); ++j) {
3370 bucket_weight[j] += carg->weight_set[j].weights[i];
3371 }
3372 }
3373 choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
3374 }
3375 return changed;
3376 }
3377
3378 int CrushWrapper::choose_args_adjust_item_weight(
3379 CephContext *cct,
3380 crush_choose_arg_map cmap,
3381 int id,
3382 const vector<int>& weight,
3383 ostream *ss)
3384 {
3385 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
3386 int changed = 0;
3387 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
3388 crush_bucket *b = crush->buckets[bidx];
3389 if (b == nullptr) {
3390 continue;
3391 }
3392 changed += _choose_args_adjust_item_weight_in_bucket(
3393 cct, cmap, b->id, id, weight, ss);
3394 }
3395 if (!changed) {
3396 if (ss)
3397 *ss << "item " << id << " not found in crush map";
3398 return -ENOENT;
3399 }
3400 return changed;
3401 }