]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/CrushWrapper.cc
3a669e959f0235103c32bb2551e8d8bcdace17e5
[ceph.git] / ceph / src / crush / CrushWrapper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "include/stringify.h"
9
10 #include "CrushWrapper.h"
11 #include "CrushTreeDumper.h"
12
13 #define dout_subsys ceph_subsys_crush
14
15 bool CrushWrapper::has_legacy_rulesets() const
16 {
17 for (unsigned i=0; i<crush->max_rules; i++) {
18 crush_rule *r = crush->rules[i];
19 if (r &&
20 r->mask.ruleset != i) {
21 return true;
22 }
23 }
24 return false;
25 }
26
27 int CrushWrapper::renumber_rules_by_ruleset()
28 {
29 int max_ruleset = 0;
30 for (unsigned i=0; i<crush->max_rules; i++) {
31 crush_rule *r = crush->rules[i];
32 if (r && r->mask.ruleset >= max_ruleset) {
33 max_ruleset = r->mask.ruleset + 1;
34 }
35 }
36 struct crush_rule **newrules =
37 (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
38 for (unsigned i=0; i<crush->max_rules; i++) {
39 crush_rule *r = crush->rules[i];
40 if (!r)
41 continue;
42 if (newrules[r->mask.ruleset]) {
43 // collision, we can't do it.
44 free(newrules);
45 return -EINVAL;
46 }
47 newrules[r->mask.ruleset] = r;
48 }
49
50 // success, swap!
51 free(crush->rules);
52 crush->rules = newrules;
53 crush->max_rules = max_ruleset;
54 return 0;
55 }
56
57 bool CrushWrapper::has_multirule_rulesets() const
58 {
59 for (unsigned i=0; i<crush->max_rules; i++) {
60 crush_rule *r = crush->rules[i];
61 if (!r)
62 continue;
63 for (unsigned j=i+1; j<crush->max_rules; j++) {
64 crush_rule *s = crush->rules[j];
65 if (!s)
66 continue;
67 if (r->mask.ruleset == s->mask.ruleset)
68 return true;
69 }
70 }
71 return false;
72 }
73
74 bool CrushWrapper::has_v2_rules() const
75 {
76 for (unsigned i=0; i<crush->max_rules; i++) {
77 if (is_v2_rule(i)) {
78 return true;
79 }
80 }
81 return false;
82 }
83
84 bool CrushWrapper::is_v2_rule(unsigned ruleid) const
85 {
86 // check rule for use of indep or new SET_* rule steps
87 if (ruleid >= crush->max_rules)
88 return false;
89 crush_rule *r = crush->rules[ruleid];
90 if (!r)
91 return false;
92 for (unsigned j=0; j<r->len; j++) {
93 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
94 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
95 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
96 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
97 return true;
98 }
99 }
100 return false;
101 }
102
103 bool CrushWrapper::has_v3_rules() const
104 {
105 for (unsigned i=0; i<crush->max_rules; i++) {
106 if (is_v3_rule(i)) {
107 return true;
108 }
109 }
110 return false;
111 }
112
113 bool CrushWrapper::is_v3_rule(unsigned ruleid) const
114 {
115 // check rule for use of SET_CHOOSELEAF_VARY_R step
116 if (ruleid >= crush->max_rules)
117 return false;
118 crush_rule *r = crush->rules[ruleid];
119 if (!r)
120 return false;
121 for (unsigned j=0; j<r->len; j++) {
122 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
123 return true;
124 }
125 }
126 return false;
127 }
128
129 bool CrushWrapper::has_v4_buckets() const
130 {
131 for (int i=0; i<crush->max_buckets; ++i) {
132 crush_bucket *b = crush->buckets[i];
133 if (!b)
134 continue;
135 if (b->alg == CRUSH_BUCKET_STRAW2)
136 return true;
137 }
138 return false;
139 }
140
141 bool CrushWrapper::has_v5_rules() const
142 {
143 for (unsigned i=0; i<crush->max_rules; i++) {
144 if (is_v5_rule(i)) {
145 return true;
146 }
147 }
148 return false;
149 }
150
151 bool CrushWrapper::is_v5_rule(unsigned ruleid) const
152 {
153 // check rule for use of SET_CHOOSELEAF_STABLE step
154 if (ruleid >= crush->max_rules)
155 return false;
156 crush_rule *r = crush->rules[ruleid];
157 if (!r)
158 return false;
159 for (unsigned j=0; j<r->len; j++) {
160 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
161 return true;
162 }
163 }
164 return false;
165 }
166
167 bool CrushWrapper::has_choose_args() const
168 {
169 return !choose_args.empty();
170 }
171
172 bool CrushWrapper::has_incompat_choose_args() const
173 {
174 if (choose_args.empty())
175 return false;
176 if (choose_args.size() > 1)
177 return true;
178 crush_choose_arg_map arg_map = choose_args.begin()->second;
179 for (__u32 i = 0; i < arg_map.size; i++) {
180 crush_choose_arg *arg = &arg_map.args[i];
181 if (arg->weight_set_size == 0 &&
182 arg->ids_size == 0)
183 continue;
184 if (arg->weight_set_size != 1)
185 return true;
186 if (arg->ids_size != 0)
187 return true;
188 }
189 return false;
190 }
191
192 int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
193 {
194 if (!item_exists(i))
195 return -EINVAL;
196 string name = get_item_name(i);
197 size_t pos = name.find("~");
198 if (pos == string::npos) {
199 *idout = i;
200 *classout = -1;
201 return 0;
202 }
203 string name_no_class = name.substr(0, pos);
204 if (!name_exists(name_no_class))
205 return -ENOENT;
206 string class_name = name.substr(pos + 1);
207 if (!class_exists(class_name))
208 return -ENOENT;
209 *idout = get_item_id(name_no_class);
210 *classout = get_class_id(class_name);
211 return 0;
212 }
213
214 int CrushWrapper::can_rename_item(const string& srcname,
215 const string& dstname,
216 ostream *ss) const
217 {
218 if (name_exists(srcname)) {
219 if (name_exists(dstname)) {
220 *ss << "dstname = '" << dstname << "' already exists";
221 return -EEXIST;
222 }
223 if (is_valid_crush_name(dstname)) {
224 return 0;
225 } else {
226 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
227 return -EINVAL;
228 }
229 } else {
230 if (name_exists(dstname)) {
231 *ss << "srcname = '" << srcname << "' does not exist "
232 << "and dstname = '" << dstname << "' already exists";
233 return -EALREADY;
234 } else {
235 *ss << "srcname = '" << srcname << "' does not exist";
236 return -ENOENT;
237 }
238 }
239 }
240
241 int CrushWrapper::rename_item(const string& srcname,
242 const string& dstname,
243 ostream *ss)
244 {
245 int ret = can_rename_item(srcname, dstname, ss);
246 if (ret < 0)
247 return ret;
248 int oldid = get_item_id(srcname);
249 return set_item_name(oldid, dstname);
250 }
251
252 int CrushWrapper::can_rename_bucket(const string& srcname,
253 const string& dstname,
254 ostream *ss) const
255 {
256 int ret = can_rename_item(srcname, dstname, ss);
257 if (ret)
258 return ret;
259 int srcid = get_item_id(srcname);
260 if (srcid >= 0) {
261 *ss << "srcname = '" << srcname << "' is not a bucket "
262 << "because its id = " << srcid << " is >= 0";
263 return -ENOTDIR;
264 }
265 return 0;
266 }
267
268 int CrushWrapper::rename_bucket(const string& srcname,
269 const string& dstname,
270 ostream *ss)
271 {
272 int ret = can_rename_bucket(srcname, dstname, ss);
273 if (ret < 0)
274 return ret;
275 int oldid = get_item_id(srcname);
276 return set_item_name(oldid, dstname);
277 }
278
279 void CrushWrapper::find_takes(set<int>& roots) const
280 {
281 for (unsigned i=0; i<crush->max_rules; i++) {
282 crush_rule *r = crush->rules[i];
283 if (!r)
284 continue;
285 for (unsigned j=0; j<r->len; j++) {
286 if (r->steps[j].op == CRUSH_RULE_TAKE)
287 roots.insert(r->steps[j].arg1);
288 }
289 }
290 }
291
292 void CrushWrapper::find_roots(set<int>& roots) const
293 {
294 for (int i = 0; i < crush->max_buckets; i++) {
295 if (!crush->buckets[i])
296 continue;
297 crush_bucket *b = crush->buckets[i];
298 if (!_search_item_exists(b->id))
299 roots.insert(b->id);
300 }
301 }
302
303 bool CrushWrapper::subtree_contains(int root, int item) const
304 {
305 if (root == item)
306 return true;
307
308 if (root >= 0)
309 return false; // root is a leaf
310
311 const crush_bucket *b = get_bucket(root);
312 if (IS_ERR(b))
313 return false;
314
315 for (unsigned j=0; j<b->size; j++) {
316 if (subtree_contains(b->items[j], item))
317 return true;
318 }
319 return false;
320 }
321
322 bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
323 {
324 // last instance?
325 if (_search_item_exists(item)) {
326 return false;
327 }
328 if (item < 0 && _bucket_is_in_use(item)) {
329 return false;
330 }
331
332 if (item < 0 && !unlink_only) {
333 crush_bucket *t = get_bucket(item);
334 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
335 crush_remove_bucket(crush, t);
336 if (class_bucket.count(item) != 0)
337 class_bucket.erase(item);
338 }
339 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
340 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
341 name_map.erase(item);
342 have_rmaps = false;
343 }
344 return true;
345 }
346
347 int CrushWrapper::remove_root(int item, bool unused)
348 {
349 if (unused && _bucket_is_in_use(item))
350 return 0;
351
352 crush_bucket *b = get_bucket(item);
353 if (IS_ERR(b))
354 return -ENOENT;
355
356 for (unsigned n = 0; n < b->size; n++) {
357 if (b->items[n] >= 0)
358 continue;
359 int r = remove_root(b->items[n], unused);
360 if (r < 0)
361 return r;
362 }
363
364 crush_remove_bucket(crush, b);
365 if (name_map.count(item) != 0) {
366 name_map.erase(item);
367 have_rmaps = false;
368 }
369 if (class_bucket.count(item) != 0)
370 class_bucket.erase(item);
371 return 0;
372 }
373
374 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
375 {
376 ldout(cct, 5) << "remove_item " << item << (unlink_only ? " unlink_only":"") << dendl;
377
378 int ret = -ENOENT;
379
380 if (item < 0 && !unlink_only) {
381 crush_bucket *t = get_bucket(item);
382 if (IS_ERR(t)) {
383 ldout(cct, 1) << "remove_item bucket " << item << " does not exist" << dendl;
384 return -ENOENT;
385 }
386
387 if (t->size) {
388 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
389 << " items, not empty" << dendl;
390 return -ENOTEMPTY;
391 }
392 if (_bucket_is_in_use(item)) {
393 return -EBUSY;
394 }
395 }
396
397 for (int i = 0; i < crush->max_buckets; i++) {
398 if (!crush->buckets[i])
399 continue;
400 crush_bucket *b = crush->buckets[i];
401
402 for (unsigned i=0; i<b->size; ++i) {
403 int id = b->items[i];
404 if (id == item) {
405 ldout(cct, 5) << "remove_item removing item " << item
406 << " from bucket " << b->id << dendl;
407 bucket_remove_item(b, item);
408 adjust_item_weight(cct, b->id, b->weight);
409 ret = 0;
410 }
411 }
412 }
413
414 if (_maybe_remove_last_instance(cct, item, unlink_only))
415 ret = 0;
416
417 return ret;
418 }
419
420 bool CrushWrapper::_search_item_exists(int item) const
421 {
422 for (int i = 0; i < crush->max_buckets; i++) {
423 if (!crush->buckets[i])
424 continue;
425 crush_bucket *b = crush->buckets[i];
426 for (unsigned j=0; j<b->size; ++j) {
427 if (b->items[j] == item)
428 return true;
429 }
430 }
431 return false;
432 }
433
434 bool CrushWrapper::_bucket_is_in_use(int item)
435 {
436 for (auto &i : class_bucket)
437 for (auto &j : i.second)
438 if (j.second == item)
439 return true;
440 for (unsigned i = 0; i < crush->max_rules; ++i) {
441 crush_rule *r = crush->rules[i];
442 if (!r)
443 continue;
444 for (unsigned j = 0; j < r->len; ++j) {
445 if (r->steps[j].op == CRUSH_RULE_TAKE) {
446 int step_item = r->steps[j].arg1;
447 int original_item;
448 int c;
449 int res = split_id_class(step_item, &original_item, &c);
450 if (res < 0)
451 return false;
452 if (step_item == item || original_item == item)
453 return true;
454 }
455 }
456 }
457 return false;
458 }
459
460 int CrushWrapper::_remove_item_under(CephContext *cct, int item, int ancestor, bool unlink_only)
461 {
462 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
463 << (unlink_only ? " unlink_only":"") << dendl;
464
465 if (ancestor >= 0) {
466 return -EINVAL;
467 }
468
469 if (!bucket_exists(ancestor))
470 return -EINVAL;
471
472 int ret = -ENOENT;
473
474 crush_bucket *b = get_bucket(ancestor);
475 for (unsigned i=0; i<b->size; ++i) {
476 int id = b->items[i];
477 if (id == item) {
478 ldout(cct, 5) << "_remove_item_under removing item " << item << " from bucket " << b->id << dendl;
479 bucket_remove_item(b, item);
480 adjust_item_weight(cct, b->id, b->weight);
481 ret = 0;
482 } else if (id < 0) {
483 int r = remove_item_under(cct, item, id, unlink_only);
484 if (r == 0)
485 ret = 0;
486 }
487 }
488 return ret;
489 }
490
491 int CrushWrapper::remove_item_under(CephContext *cct, int item, int ancestor, bool unlink_only)
492 {
493 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
494 << (unlink_only ? " unlink_only":"") << dendl;
495
496 if (!unlink_only && _bucket_is_in_use(item)) {
497 return -EBUSY;
498 }
499
500 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
501 if (ret < 0)
502 return ret;
503
504 if (item < 0 && !unlink_only) {
505 crush_bucket *t = get_bucket(item);
506 if (IS_ERR(t)) {
507 ldout(cct, 1) << "remove_item_under bucket " << item
508 << " does not exist" << dendl;
509 return -ENOENT;
510 }
511
512 if (t->size) {
513 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
514 << " items, not empty" << dendl;
515 return -ENOTEMPTY;
516 }
517 }
518
519 if (_maybe_remove_last_instance(cct, item, unlink_only))
520 ret = 0;
521
522 return ret;
523 }
524
525 int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
526 const std::multimap<string,string>& loc)
527 {
528 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
529 if (!item_exists(id))
530 return -ENOENT;
531 map<string,string> id_loc = get_full_location(id);
532 ldout(cct, 20) << " id is at " << id_loc << dendl;
533
534 for (map<int,string>::const_iterator p = type_map.begin();
535 p != type_map.end();
536 ++p) {
537 map<string,string>::iterator ip = id_loc.find(p->second);
538 if (ip == id_loc.end())
539 continue;
540 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
541 q != loc.end();
542 ++q) {
543 if (q->first != p->second)
544 break;
545 if (q->second == ip->second)
546 return p->first;
547 }
548 }
549 return -ERANGE;
550 }
551
552 int CrushWrapper::parse_loc_map(const std::vector<string>& args,
553 std::map<string,string> *ploc)
554 {
555 ploc->clear();
556 for (unsigned i = 0; i < args.size(); ++i) {
557 const char *s = args[i].c_str();
558 const char *pos = strchr(s, '=');
559 if (!pos)
560 return -EINVAL;
561 string key(s, 0, pos-s);
562 string value(pos+1);
563 if (value.length())
564 (*ploc)[key] = value;
565 else
566 return -EINVAL;
567 }
568 return 0;
569 }
570
571 int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
572 std::multimap<string,string> *ploc)
573 {
574 ploc->clear();
575 for (unsigned i = 0; i < args.size(); ++i) {
576 const char *s = args[i].c_str();
577 const char *pos = strchr(s, '=');
578 if (!pos)
579 return -EINVAL;
580 string key(s, 0, pos-s);
581 string value(pos+1);
582 if (value.length())
583 ploc->insert(make_pair(key, value));
584 else
585 return -EINVAL;
586 }
587 return 0;
588 }
589
590 bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
591 int *weight)
592 {
593 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
594
595 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
596 // ignore device
597 if (p->first == 0)
598 continue;
599
600 // ignore types that aren't specified in loc
601 map<string,string>::const_iterator q = loc.find(p->second);
602 if (q == loc.end()) {
603 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
604 << type_map << ")" << dendl;
605 continue;
606 }
607
608 if (!name_exists(q->second)) {
609 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
610 return false;
611 }
612
613 int id = get_item_id(q->second);
614 if (id >= 0) {
615 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
616 << " is a device, not bucket" << dendl;
617 return false;
618 }
619
620 assert(bucket_exists(id));
621 crush_bucket *b = get_bucket(id);
622
623 // see if item exists in this bucket
624 for (unsigned j=0; j<b->size; j++) {
625 if (b->items[j] == item) {
626 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
627 if (weight)
628 *weight = crush_get_bucket_item_weight(b, j);
629 return true;
630 }
631 }
632 return false;
633 }
634
635 ldout(cct, 1) << "check_item_loc item " << item << " loc " << loc << dendl;
636 return false;
637 }
638
639 map<string, string> CrushWrapper::get_full_location(int id)
640 {
641 vector<pair<string, string> > full_location_ordered;
642 map<string,string> full_location;
643
644 get_full_location_ordered(id, full_location_ordered);
645
646 std::copy(full_location_ordered.begin(),
647 full_location_ordered.end(),
648 std::inserter(full_location, full_location.begin()));
649
650 return full_location;
651 }
652
653 int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path)
654 {
655 if (!item_exists(id))
656 return -ENOENT;
657 int cur = id;
658 int ret;
659 while (true) {
660 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
661 if (ret != 0)
662 break;
663 path.push_back(parent_coord);
664 cur = get_item_id(parent_coord.second);
665 }
666 return 0;
667 }
668
669 string CrushWrapper::get_full_location_ordered_string(int id)
670 {
671 vector<pair<string, string> > full_location_ordered;
672 string full_location;
673 get_full_location_ordered(id, full_location_ordered);
674 reverse(begin(full_location_ordered), end(full_location_ordered));
675 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
676 full_location = full_location + i->first + "=" + i->second;
677 if (i != full_location_ordered.end() - 1) {
678 full_location = full_location + ",";
679 }
680 }
681 return full_location;
682 }
683
684 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
685 {
686 map<int,string> parent_hierarchy;
687 pair<string, string> parent_coord = get_immediate_parent(id);
688 int parent_id;
689
690 // get the integer type for id and create a counter from there
691 int type_counter = get_bucket_type(id);
692
693 // if we get a negative type then we can assume that we have an OSD
694 // change behavior in get_item_type FIXME
695 if (type_counter < 0)
696 type_counter = 0;
697
698 // read the type map and get the name of the type with the largest ID
699 int high_type = 0;
700 for (map<int, string>::iterator it = type_map.begin(); it != type_map.end(); ++it){
701 if ( (*it).first > high_type )
702 high_type = (*it).first;
703 }
704
705 parent_id = get_item_id(parent_coord.second);
706
707 while (type_counter < high_type) {
708 type_counter++;
709 parent_hierarchy[ type_counter ] = parent_coord.first;
710
711 if (type_counter < high_type){
712 // get the coordinate information for the next parent
713 parent_coord = get_immediate_parent(parent_id);
714 parent_id = get_item_id(parent_coord.second);
715 }
716 }
717
718 return parent_hierarchy;
719 }
720
721 int CrushWrapper::get_children(int id, list<int> *children)
722 {
723 // leaf?
724 if (id >= 0) {
725 return 0;
726 }
727
728 crush_bucket *b = get_bucket(id);
729 if (IS_ERR(b)) {
730 return -ENOENT;
731 }
732
733 for (unsigned n=0; n<b->size; n++) {
734 children->push_back(b->items[n]);
735 }
736 return b->size;
737 }
738
739 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
740 {
741 assert(leaves);
742
743 // Already leaf?
744 if (id >= 0) {
745 leaves->push_back(id);
746 return 0;
747 }
748
749 crush_bucket *b = get_bucket(id);
750 if (IS_ERR(b)) {
751 return -ENOENT;
752 }
753
754 for (unsigned n = 0; n < b->size; n++) {
755 if (b->items[n] >= 0) {
756 leaves->push_back(b->items[n]);
757 } else {
758 // is a bucket, do recursive call
759 int r = _get_leaves(b->items[n], leaves);
760 if (r < 0) {
761 return r;
762 }
763 }
764 }
765
766 return 0; // all is well
767 }
768
769 int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
770 {
771 assert(leaves);
772 leaves->clear();
773
774 if (!name_exists(name)) {
775 return -ENOENT;
776 }
777
778 int id = get_item_id(name);
779 if (id >= 0) {
780 // already leaf
781 leaves->insert(id);
782 return 0;
783 }
784
785 list<int> unordered;
786 int r = _get_leaves(id, &unordered);
787 if (r < 0) {
788 return r;
789 }
790
791 for (auto &p : unordered) {
792 leaves->insert(p);
793 }
794
795 return 0;
796 }
797
798 int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name,
799 const map<string,string>& loc) // typename -> bucketname
800 {
801 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
802 << " name " << name << " loc " << loc << dendl;
803
804 if (!is_valid_crush_name(name))
805 return -EINVAL;
806
807 if (!is_valid_crush_loc(cct, loc))
808 return -EINVAL;
809
810 if (name_exists(name)) {
811 if (get_item_id(name) != item) {
812 ldout(cct, 10) << "device name '" << name << "' already exists as id "
813 << get_item_id(name) << dendl;
814 return -EEXIST;
815 }
816 } else {
817 set_item_name(item, name);
818 }
819
820 int cur = item;
821
822 // create locations if locations don't exist and add child in location with 0 weight
823 // the more detail in the insert_item method declaration in CrushWrapper.h
824 for (map<int,string>::iterator p = type_map.begin(); p != type_map.end(); ++p) {
825 // ignore device type
826 if (p->first == 0)
827 continue;
828
829 // skip types that are unspecified
830 map<string,string>::const_iterator q = loc.find(p->second);
831 if (q == loc.end()) {
832 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
833 << type_map << ")" << dendl;
834 continue;
835 }
836
837 if (!name_exists(q->second)) {
838 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
839 int empty = 0, newid;
840 int r = add_bucket(0, 0,
841 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
842 if (r < 0) {
843 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r) << dendl;
844 return r;
845 }
846 set_item_name(newid, q->second);
847
848 cur = newid;
849 continue;
850 }
851
852 // add to an existing bucket
853 int id = get_item_id(q->second);
854 if (!bucket_exists(id)) {
855 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
856 return -EINVAL;
857 }
858
859 // check that we aren't creating a cycle.
860 if (subtree_contains(id, cur)) {
861 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath " << id << dendl;
862 return -EINVAL;
863 }
864
865 // we have done sanity check above
866 crush_bucket *b = get_bucket(id);
867
868 if (p->first != b->type) {
869 ldout(cct, 1) << "insert_item existing bucket has type "
870 << "'" << type_map[b->type] << "' != "
871 << "'" << type_map[p->first] << "'" << dendl;
872 return -EINVAL;
873 }
874
875 // are we forming a loop?
876 if (subtree_contains(cur, b->id)) {
877 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
878 << "; cannot form loop" << dendl;
879 return -ELOOP;
880 }
881
882 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
883 << " to bucket " << id << dendl;
884 int r = bucket_add_item(b, cur, 0);
885 assert (!r);
886 break;
887 }
888
889 // adjust the item's weight in location
890 if(adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
891 if (item >= crush->max_devices) {
892 crush->max_devices = item + 1;
893 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices << dendl;
894 }
895 return 0;
896 }
897
898 ldout(cct, 1) << "error: didn't find anywhere to add item " << item << " in " << loc << dendl;
899 return -EINVAL;
900 }
901
902 int CrushWrapper::move_bucket(CephContext *cct, int id, const map<string,string>& loc)
903 {
904 // sorry this only works for buckets
905 if (id >= 0)
906 return -EINVAL;
907
908 if (!item_exists(id))
909 return -ENOENT;
910
911 // get the name of the bucket we are trying to move for later
912 string id_name = get_item_name(id);
913
914 // detach the bucket
915 int bucket_weight = detach_bucket(cct, id);
916
917 // insert the bucket back into the hierarchy
918 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
919 }
920
921 int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
922 {
923 if (src >= 0 || dst >= 0)
924 return -EINVAL;
925 if (!item_exists(src) || !item_exists(dst))
926 return -EINVAL;
927 crush_bucket *a = get_bucket(src);
928 crush_bucket *b = get_bucket(dst);
929 unsigned aw = a->weight;
930 unsigned bw = b->weight;
931
932 // swap weights
933 adjust_item_weight(cct, a->id, bw);
934 adjust_item_weight(cct, b->id, aw);
935
936 // swap items
937 map<int,unsigned> tmp;
938 unsigned as = a->size;
939 unsigned bs = b->size;
940 for (unsigned i = 0; i < as; ++i) {
941 int item = a->items[0];
942 int itemw = crush_get_bucket_item_weight(a, 0);
943 tmp[item] = itemw;
944 bucket_remove_item(a, item);
945 }
946 assert(a->size == 0);
947 assert(b->size == bs);
948 for (unsigned i = 0; i < bs; ++i) {
949 int item = b->items[0];
950 int itemw = crush_get_bucket_item_weight(b, 0);
951 bucket_remove_item(b, item);
952 bucket_add_item(a, item, itemw);
953 }
954 assert(a->size == bs);
955 assert(b->size == 0);
956 for (auto t : tmp) {
957 bucket_add_item(b, t.first, t.second);
958 }
959 assert(a->size == bs);
960 assert(b->size == as);
961
962 // swap names
963 swap_names(src, dst);
964 return 0;
965 }
966
967 int CrushWrapper::link_bucket(CephContext *cct, int id, const map<string,string>& loc)
968 {
969 // sorry this only works for buckets
970 if (id >= 0)
971 return -EINVAL;
972
973 if (!item_exists(id))
974 return -ENOENT;
975
976 // get the name of the bucket we are trying to move for later
977 string id_name = get_item_name(id);
978
979 crush_bucket *b = get_bucket(id);
980 unsigned bucket_weight = b->weight;
981
982 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
983 }
984
985 int CrushWrapper::create_or_move_item(CephContext *cct, int item, float weight, string name,
986 const map<string,string>& loc) // typename -> bucketname
987 {
988 int ret = 0;
989 int old_iweight;
990
991 if (!is_valid_crush_name(name))
992 return -EINVAL;
993
994 if (check_item_loc(cct, item, loc, &old_iweight)) {
995 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc << dendl;
996 } else {
997 if (_search_item_exists(item)) {
998 weight = get_item_weightf(item);
999 ldout(cct, 10) << "create_or_move_item " << item << " exists with weight " << weight << dendl;
1000 remove_item(cct, item, true);
1001 }
1002 ldout(cct, 5) << "create_or_move_item adding " << item << " weight " << weight
1003 << " at " << loc << dendl;
1004 ret = insert_item(cct, item, weight, name, loc);
1005 if (ret == 0)
1006 ret = 1; // changed
1007 }
1008 return ret;
1009 }
1010
1011 int CrushWrapper::update_item(CephContext *cct, int item, float weight, string name,
1012 const map<string,string>& loc) // typename -> bucketname
1013 {
1014 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1015 << " name " << name << " loc " << loc << dendl;
1016 int ret = 0;
1017
1018 if (!is_valid_crush_name(name))
1019 return -EINVAL;
1020
1021 if (!is_valid_crush_loc(cct, loc))
1022 return -EINVAL;
1023
1024 // compare quantized (fixed-point integer) weights!
1025 int iweight = (int)(weight * (float)0x10000);
1026 int old_iweight;
1027 if (check_item_loc(cct, item, loc, &old_iweight)) {
1028 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1029 if (old_iweight != iweight) {
1030 ldout(cct, 5) << "update_item " << item << " adjusting weight "
1031 << ((float)old_iweight/(float)0x10000) << " -> " << weight << dendl;
1032 adjust_item_weight_in_loc(cct, item, iweight, loc);
1033 ret = 1;
1034 }
1035 if (get_item_name(item) != name) {
1036 ldout(cct, 5) << "update_item setting " << item << " name to " << name << dendl;
1037 set_item_name(item, name);
1038 ret = 1;
1039 }
1040 } else {
1041 if (item_exists(item)) {
1042 remove_item(cct, item, true);
1043 }
1044 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1045 << " at " << loc << dendl;
1046 ret = insert_item(cct, item, weight, name, loc);
1047 if (ret == 0)
1048 ret = 1; // changed
1049 }
1050 return ret;
1051 }
1052
1053 int CrushWrapper::get_item_weight(int id) const
1054 {
1055 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1056 crush_bucket *b = crush->buckets[bidx];
1057 if (b == NULL)
1058 continue;
1059 if (b->id == id)
1060 return b->weight;
1061 for (unsigned i = 0; i < b->size; i++)
1062 if (b->items[i] == id)
1063 return crush_get_bucket_item_weight(b, i);
1064 }
1065 return -ENOENT;
1066 }
1067
1068 int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1069 {
1070 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1071
1072 int bid = get_item_id(l->second);
1073 if (!bucket_exists(bid))
1074 continue;
1075 crush_bucket *b = get_bucket(bid);
1076 for (unsigned int i = 0; i < b->size; i++) {
1077 if (b->items[i] == id) {
1078 return crush_get_bucket_item_weight(b, i);
1079 }
1080 }
1081 }
1082 return -ENOENT;
1083 }
1084
1085 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
1086 {
1087 ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
1088 int changed = 0;
1089 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1090 crush_bucket *b = crush->buckets[bidx];
1091 if (b == 0)
1092 continue;
1093 for (unsigned i = 0; i < b->size; i++) {
1094 if (b->items[i] == id) {
1095 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1096 ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff << " in bucket " << bidx << dendl;
1097 adjust_item_weight(cct, -1 - bidx, b->weight);
1098 changed++;
1099 }
1100 }
1101 }
1102 if (!changed)
1103 return -ENOENT;
1104 return changed;
1105 }
1106
1107 int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
1108 {
1109 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight << " in " << loc << dendl;
1110 int changed = 0;
1111
1112 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1113 int bid = get_item_id(l->second);
1114 if (!bucket_exists(bid))
1115 continue;
1116 crush_bucket *b = get_bucket(bid);
1117 for (unsigned int i = 0; i < b->size; i++) {
1118 if (b->items[i] == id) {
1119 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1120 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff << " in bucket " << bid << dendl;
1121 adjust_item_weight(cct, bid, b->weight);
1122 changed++;
1123 }
1124 }
1125 }
1126 if (!changed)
1127 return -ENOENT;
1128 return changed;
1129 }
1130
1131 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
1132 {
1133 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1134 crush_bucket *b = get_bucket(id);
1135 if (IS_ERR(b))
1136 return PTR_ERR(b);
1137 int changed = 0;
1138 list<crush_bucket*> q;
1139 q.push_back(b);
1140 while (!q.empty()) {
1141 b = q.front();
1142 q.pop_front();
1143 int local_changed = 0;
1144 for (unsigned i=0; i<b->size; ++i) {
1145 int n = b->items[i];
1146 if (n >= 0) {
1147 bucket_adjust_item_weight(cct, b, n, weight);
1148 ++changed;
1149 ++local_changed;
1150 } else {
1151 crush_bucket *sub = get_bucket(n);
1152 if (IS_ERR(sub))
1153 continue;
1154 q.push_back(sub);
1155 }
1156 }
1157 if (local_changed) {
1158 adjust_item_weight(cct, b->id, b->weight);
1159 }
1160 }
1161 return changed;
1162 }
1163
1164 bool CrushWrapper::check_item_present(int id) const
1165 {
1166 bool found = false;
1167
1168 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1169 crush_bucket *b = crush->buckets[bidx];
1170 if (b == 0)
1171 continue;
1172 for (unsigned i = 0; i < b->size; i++)
1173 if (b->items[i] == id)
1174 found = true;
1175 }
1176 return found;
1177 }
1178
1179
1180 pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret)
1181 {
1182
1183 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1184 crush_bucket *b = crush->buckets[bidx];
1185 if (b == 0)
1186 continue;
1187 for (unsigned i = 0; i < b->size; i++)
1188 if (b->items[i] == id) {
1189 string parent_id = name_map[b->id];
1190 string parent_bucket_type = type_map[b->type];
1191 if (_ret)
1192 *_ret = 0;
1193 return make_pair(parent_bucket_type, parent_id);
1194 }
1195 }
1196
1197 if (_ret)
1198 *_ret = -ENOENT;
1199
1200 return pair<string, string>();
1201 }
1202
1203 int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1204 {
1205 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1206 crush_bucket *b = crush->buckets[bidx];
1207 if (b == 0)
1208 continue;
1209 for (unsigned i = 0; i < b->size; i++) {
1210 if (b->items[i] == id) {
1211 *parent = b->id;
1212 return 0;
1213 }
1214 }
1215 }
1216 return -ENOENT;
1217 }
1218
1219 int CrushWrapper::get_parent_of_type(int item, int type) const
1220 {
1221 do {
1222 int r = get_immediate_parent_id(item, &item);
1223 if (r < 0) {
1224 return 0;
1225 }
1226 } while (get_bucket_type(item) != type);
1227 return item;
1228 }
1229
1230 bool CrushWrapper::class_is_in_use(int class_id)
1231 {
1232 for (auto &i : class_bucket)
1233 for (auto &j : i.second)
1234 if (j.first == class_id)
1235 return true;
1236
1237 for (auto &i : class_map)
1238 if (i.second == class_id)
1239 return true;
1240
1241 return false;
1242 }
1243
1244 int CrushWrapper::populate_classes()
1245 {
1246 set<int> roots;
1247 find_roots(roots);
1248 for (auto &r : roots) {
1249 if (r >= 0)
1250 continue;
1251 if (id_has_class(r))
1252 continue;
1253 for (auto &c : class_name) {
1254 int clone;
1255 int res = device_class_clone(r, c.first, &clone);
1256 if (res < 0)
1257 return res;
1258 }
1259 }
1260 return 0;
1261 }
1262
1263 int CrushWrapper::cleanup_classes()
1264 {
1265 return trim_roots_with_class(true);
1266 }
1267
1268 int CrushWrapper::trim_roots_with_class(bool unused)
1269 {
1270 set<int> roots;
1271 find_roots(roots);
1272 for (auto &r : roots) {
1273 if (r >= 0)
1274 continue;
1275 if (!id_has_class(r))
1276 continue;
1277 int res = remove_root(r, unused);
1278 if (res)
1279 return res;
1280 }
1281 // there is no need to reweight because we only remove from the
1282 // root and down
1283 return 0;
1284 }
1285
1286 void CrushWrapper::reweight(CephContext *cct)
1287 {
1288 set<int> roots;
1289 find_roots(roots);
1290 for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
1291 if (*p >= 0)
1292 continue;
1293 crush_bucket *b = get_bucket(*p);
1294 ldout(cct, 5) << "reweight bucket " << *p << dendl;
1295 int r = crush_reweight_bucket(crush, b);
1296 assert(r == 0);
1297 }
1298 }
1299
1300 int CrushWrapper::add_simple_rule_at(
1301 string name, string root_name,
1302 string failure_domain_name,
1303 string mode, int rule_type,
1304 int rno, ostream *err)
1305 {
1306 if (rule_exists(name)) {
1307 if (err)
1308 *err << "rule " << name << " exists";
1309 return -EEXIST;
1310 }
1311 if (rno >= 0) {
1312 if (rule_exists(rno)) {
1313 if (err)
1314 *err << "rule with ruleno " << rno << " exists";
1315 return -EEXIST;
1316 }
1317 if (ruleset_exists(rno)) {
1318 if (err)
1319 *err << "ruleset " << rno << " exists";
1320 return -EEXIST;
1321 }
1322 } else {
1323 for (rno = 0; rno < get_max_rules(); rno++) {
1324 if (!rule_exists(rno) && !ruleset_exists(rno))
1325 break;
1326 }
1327 }
1328 if (!name_exists(root_name)) {
1329 if (err)
1330 *err << "root item " << root_name << " does not exist";
1331 return -ENOENT;
1332 }
1333 int root = get_item_id(root_name);
1334 int type = 0;
1335 if (failure_domain_name.length()) {
1336 type = get_type_id(failure_domain_name);
1337 if (type < 0) {
1338 if (err)
1339 *err << "unknown type " << failure_domain_name;
1340 return -EINVAL;
1341 }
1342 }
1343 if (mode != "firstn" && mode != "indep") {
1344 if (err)
1345 *err << "unknown mode " << mode;
1346 return -EINVAL;
1347 }
1348
1349 int steps = 3;
1350 if (mode == "indep")
1351 steps = 5;
1352 int min_rep = mode == "firstn" ? 1 : 3;
1353 int max_rep = mode == "firstn" ? 10 : 20;
1354 //set the ruleset the same as rule_id(rno)
1355 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
1356 assert(rule);
1357 int step = 0;
1358 if (mode == "indep") {
1359 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1360 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1361 }
1362 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1363 if (type)
1364 crush_rule_set_step(rule, step++,
1365 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
1366 CRUSH_RULE_CHOOSELEAF_INDEP,
1367 CRUSH_CHOOSE_N,
1368 type);
1369 else
1370 crush_rule_set_step(rule, step++,
1371 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
1372 CRUSH_RULE_CHOOSE_INDEP,
1373 CRUSH_CHOOSE_N,
1374 0);
1375 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1376
1377 int ret = crush_add_rule(crush, rule, rno);
1378 if(ret < 0) {
1379 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
1380 return ret;
1381 }
1382 set_rule_name(rno, name);
1383 have_rmaps = false;
1384 return rno;
1385 }
1386
1387 int CrushWrapper::add_simple_rule(
1388 string name, string root_name,
1389 string failure_domain_name,
1390 string mode, int rule_type,
1391 ostream *err)
1392 {
1393 return add_simple_rule_at(name, root_name, failure_domain_name, mode,
1394 rule_type, -1, err);
1395 }
1396
1397 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
1398 {
1399 if (ruleno >= crush->max_rules)
1400 return -ENOENT;
1401 if (crush->rules[ruleno] == NULL)
1402 return -ENOENT;
1403 crush_rule *rule = crush->rules[ruleno];
1404
1405 // build a weight map for each TAKE in the rule, and then merge them
1406
1407 // FIXME: if there are multiple takes that place a different number of
1408 // objects we do not take that into account. (Also, note that doing this
1409 // right is also a function of the pool, since the crush rule
1410 // might choose 2 + choose 2 but pool size may only be 3.)
1411 for (unsigned i=0; i<rule->len; ++i) {
1412 map<int,float> m;
1413 float sum = 0;
1414 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
1415 int n = rule->steps[i].arg1;
1416 if (n >= 0) {
1417 m[n] = 1.0;
1418 sum = 1.0;
1419 } else {
1420 list<int> q;
1421 q.push_back(n);
1422 //breadth first iterate the OSD tree
1423 while (!q.empty()) {
1424 int bno = q.front();
1425 q.pop_front();
1426 crush_bucket *b = crush->buckets[-1-bno];
1427 assert(b);
1428 for (unsigned j=0; j<b->size; ++j) {
1429 int item_id = b->items[j];
1430 if (item_id >= 0) { //it's an OSD
1431 float w = crush_get_bucket_item_weight(b, j);
1432 m[item_id] = w;
1433 sum += w;
1434 } else { //not an OSD, expand the child later
1435 q.push_back(item_id);
1436 }
1437 }
1438 }
1439 }
1440 }
1441 for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
1442 map<int,float>::iterator q = pmap->find(p->first);
1443 if (q == pmap->end()) {
1444 (*pmap)[p->first] = p->second / sum;
1445 } else {
1446 q->second += p->second / sum;
1447 }
1448 }
1449 }
1450
1451 return 0;
1452 }
1453
1454 int CrushWrapper::remove_rule(int ruleno)
1455 {
1456 if (ruleno >= (int)crush->max_rules)
1457 return -ENOENT;
1458 if (crush->rules[ruleno] == NULL)
1459 return -ENOENT;
1460 crush_destroy_rule(crush->rules[ruleno]);
1461 crush->rules[ruleno] = NULL;
1462 rule_name_map.erase(ruleno);
1463 have_rmaps = false;
1464 return 0;
1465 }
1466
1467 int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
1468 {
1469 if (cct->_conf->osd_crush_update_weight_set) {
1470 unsigned position;
1471 for (position = 0; position < bucket->size; position++)
1472 if (bucket->items[position] == item)
1473 break;
1474 assert(position != bucket->size);
1475 for (auto w : choose_args) {
1476 crush_choose_arg_map arg_map = w.second;
1477 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1478 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1479 crush_weight_set *weight_set = &arg->weight_set[j];
1480 weight_set->weights[position] = weight;
1481 }
1482 }
1483 }
1484 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
1485 }
1486
1487 int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
1488 {
1489 __u32 new_size = bucket->size + 1;
1490 for (auto w : choose_args) {
1491 crush_choose_arg_map arg_map = w.second;
1492 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1493 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1494 crush_weight_set *weight_set = &arg->weight_set[j];
1495 weight_set->weights = (__u32*)realloc(weight_set->weights, new_size * sizeof(__u32));
1496 assert(weight_set->size + 1 == new_size);
1497 weight_set->weights[weight_set->size] = weight;
1498 weight_set->size = new_size;
1499 }
1500 if (arg->ids_size) {
1501 arg->ids = (int*)realloc(arg->ids, new_size * sizeof(int));
1502 assert(arg->ids_size + 1 == new_size);
1503 arg->ids[arg->ids_size] = item;
1504 arg->ids_size = new_size;
1505 }
1506 }
1507 return crush_bucket_add_item(crush, bucket, item, weight);
1508 }
1509
1510 int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
1511 {
1512 __u32 new_size = bucket->size - 1;
1513 unsigned position;
1514 for (position = 0; position < bucket->size; position++)
1515 if (bucket->items[position] == item)
1516 break;
1517 assert(position != bucket->size);
1518 for (auto w : choose_args) {
1519 crush_choose_arg_map arg_map = w.second;
1520 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1521 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1522 crush_weight_set *weight_set = &arg->weight_set[j];
1523 assert(weight_set->size - 1 == new_size);
1524 for (__u32 k = position; k < new_size; k++)
1525 weight_set->weights[k] = weight_set->weights[k+1];
1526 weight_set->weights = (__u32*)realloc(weight_set->weights, new_size * sizeof(__u32));
1527 weight_set->size = new_size;
1528 }
1529 if (arg->ids_size) {
1530 assert(arg->ids_size - 1 == new_size);
1531 for (__u32 k = position; k < new_size; k++)
1532 arg->ids[k] = arg->ids[k+1];
1533 arg->ids = (int*)realloc(arg->ids, new_size * sizeof(int));
1534 arg->ids_size = new_size;
1535 }
1536 }
1537 return crush_bucket_remove_item(crush, bucket, item);
1538 }
1539
1540 int CrushWrapper::update_device_class(CephContext *cct, int id, const string& class_name, const string& name)
1541 {
1542 int class_id = get_class_id(class_name);
1543 if (class_id < 0) {
1544 ldout(cct, 0) << "update_device_class class " << class_name << " does not exist " << dendl;
1545 return -ENOENT;
1546 }
1547 if (id < 0) {
1548 ldout(cct, 0) << "update_device_class " << name << " id " << id << " is negative " << dendl;
1549 return -EINVAL;
1550 }
1551 assert(item_exists(id));
1552
1553 if (class_map.count(id) != 0 && class_map[id] == class_id) {
1554 ldout(cct, 5) << "update_device_class " << name << " already set to class " << class_name << dendl;
1555 return 0;
1556 }
1557
1558 set_item_class(id, class_id);
1559
1560 int r = rebuild_roots_with_classes();
1561 if (r < 0)
1562 return r;
1563 return 1;
1564 }
1565
1566 int CrushWrapper::device_class_clone(int original_id, int device_class, int *clone)
1567 {
1568 const char *item_name = get_item_name(original_id);
1569 if (item_name == NULL)
1570 return -ECHILD;
1571 const char *class_name = get_class_name(device_class);
1572 if (class_name == NULL)
1573 return -EBADF;
1574 string copy_name = item_name + string("~") + class_name;
1575 if (name_exists(copy_name)) {
1576 *clone = get_item_id(copy_name);
1577 return 0;
1578 }
1579 crush_bucket *original = get_bucket(original_id);
1580 if (original == NULL)
1581 return -ENOENT;
1582 crush_bucket *copy = crush_make_bucket(crush,
1583 original->alg,
1584 original->hash,
1585 original->type,
1586 0, NULL, NULL);
1587 if(copy == NULL)
1588 return -ENOMEM;
1589 for (unsigned i = 0; i < original->size; i++) {
1590 int item = original->items[i];
1591 int weight = crush_get_bucket_item_weight(original, i);
1592 if (item >= 0) {
1593 if (class_map.count(item) != 0 && class_map[item] == device_class) {
1594 int res = bucket_add_item(copy, item, weight);
1595 if (res)
1596 return res;
1597 }
1598 } else {
1599 int child_copy_id;
1600 int res = device_class_clone(item, device_class, &child_copy_id);
1601 if (res < 0)
1602 return res;
1603 crush_bucket *child_copy = get_bucket(child_copy_id);
1604 if (IS_ERR(child_copy))
1605 return -ENOENT;
1606 res = bucket_add_item(copy, child_copy_id, child_copy->weight);
1607 if (res)
1608 return res;
1609 }
1610 }
1611 int res = crush_add_bucket(crush, 0, copy, clone);
1612 if (res)
1613 return res;
1614 res = set_item_class(*clone, device_class);
1615 if (res < 0)
1616 return res;
1617 // we do not use set_item_name because the name is intentionally invalid
1618 name_map[*clone] = copy_name;
1619 if (have_rmaps)
1620 name_rmap[copy_name] = *clone;
1621 class_bucket[original_id][device_class] = *clone;
1622 return 0;
1623 }
1624
1625 int CrushWrapper::rebuild_roots_with_classes()
1626 {
1627 int r = trim_roots_with_class(false);
1628 if (r < 0)
1629 return r;
1630 r = populate_classes();
1631 if (r < 0)
1632 return r;
1633 return trim_roots_with_class(true);
1634 }
1635
1636 void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
1637 {
1638 assert(crush);
1639
1640 __u32 magic = CRUSH_MAGIC;
1641 ::encode(magic, bl);
1642
1643 ::encode(crush->max_buckets, bl);
1644 ::encode(crush->max_rules, bl);
1645 ::encode(crush->max_devices, bl);
1646
1647 bool encode_compat_choose_args = false;
1648 crush_choose_arg_map arg_map;
1649 memset(&arg_map, '\0', sizeof(arg_map));
1650 if (has_choose_args() &&
1651 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
1652 assert(!has_incompat_choose_args());
1653 encode_compat_choose_args = true;
1654 arg_map = choose_args.begin()->second;
1655 }
1656
1657 // buckets
1658 for (int i=0; i<crush->max_buckets; i++) {
1659 __u32 alg = 0;
1660 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
1661 ::encode(alg, bl);
1662 if (!alg)
1663 continue;
1664
1665 ::encode(crush->buckets[i]->id, bl);
1666 ::encode(crush->buckets[i]->type, bl);
1667 ::encode(crush->buckets[i]->alg, bl);
1668 ::encode(crush->buckets[i]->hash, bl);
1669 ::encode(crush->buckets[i]->weight, bl);
1670 ::encode(crush->buckets[i]->size, bl);
1671 for (unsigned j=0; j<crush->buckets[i]->size; j++)
1672 ::encode(crush->buckets[i]->items[j], bl);
1673
1674 switch (crush->buckets[i]->alg) {
1675 case CRUSH_BUCKET_UNIFORM:
1676 ::encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
1677 break;
1678
1679 case CRUSH_BUCKET_LIST:
1680 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1681 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
1682 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
1683 }
1684 break;
1685
1686 case CRUSH_BUCKET_TREE:
1687 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
1688 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
1689 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
1690 break;
1691
1692 case CRUSH_BUCKET_STRAW:
1693 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1694 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
1695 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
1696 }
1697 break;
1698
1699 case CRUSH_BUCKET_STRAW2:
1700 {
1701 __u32 *weights;
1702 if (encode_compat_choose_args &&
1703 arg_map.args[i].weight_set_size > 0) {
1704 weights = arg_map.args[i].weight_set[0].weights;
1705 } else {
1706 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
1707 }
1708 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
1709 ::encode(weights[j], bl);
1710 }
1711 }
1712 break;
1713
1714 default:
1715 ceph_abort();
1716 break;
1717 }
1718 }
1719
1720 // rules
1721 for (unsigned i=0; i<crush->max_rules; i++) {
1722 __u32 yes = crush->rules[i] ? 1:0;
1723 ::encode(yes, bl);
1724 if (!yes)
1725 continue;
1726
1727 ::encode(crush->rules[i]->len, bl);
1728 ::encode(crush->rules[i]->mask, bl);
1729 for (unsigned j=0; j<crush->rules[i]->len; j++)
1730 ::encode(crush->rules[i]->steps[j], bl);
1731 }
1732
1733 // name info
1734 ::encode(type_map, bl);
1735 ::encode(name_map, bl);
1736 ::encode(rule_name_map, bl);
1737
1738 // tunables
1739 ::encode(crush->choose_local_tries, bl);
1740 ::encode(crush->choose_local_fallback_tries, bl);
1741 ::encode(crush->choose_total_tries, bl);
1742 ::encode(crush->chooseleaf_descend_once, bl);
1743 ::encode(crush->chooseleaf_vary_r, bl);
1744 ::encode(crush->straw_calc_version, bl);
1745 ::encode(crush->allowed_bucket_algs, bl);
1746 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
1747 ::encode(crush->chooseleaf_stable, bl);
1748 }
1749
1750 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
1751 // device classes
1752 ::encode(class_map, bl);
1753 ::encode(class_name, bl);
1754 ::encode(class_bucket, bl);
1755
1756 ::encode(choose_args.size(), bl);
1757 for (auto c : choose_args) {
1758 ::encode(c.first, bl);
1759 crush_choose_arg_map arg_map = c.second;
1760 __u32 size = 0;
1761 for (__u32 i = 0; i < arg_map.size; i++) {
1762 crush_choose_arg *arg = &arg_map.args[i];
1763 if (arg->weight_set_size == 0 &&
1764 arg->ids_size == 0)
1765 continue;
1766 size++;
1767 }
1768 ::encode(size, bl);
1769 for (__u32 i = 0; i < arg_map.size; i++) {
1770 crush_choose_arg *arg = &arg_map.args[i];
1771 if (arg->weight_set_size == 0 &&
1772 arg->ids_size == 0)
1773 continue;
1774 ::encode(i, bl);
1775 ::encode(arg->weight_set_size, bl);
1776 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1777 crush_weight_set *weight_set = &arg->weight_set[j];
1778 ::encode(weight_set->size, bl);
1779 for (__u32 k = 0; k < weight_set->size; k++)
1780 ::encode(weight_set->weights[k], bl);
1781 }
1782 ::encode(arg->ids_size, bl);
1783 for (__u32 j = 0; j < arg->ids_size; j++)
1784 ::encode(arg->ids[j], bl);
1785 }
1786 }
1787 }
1788 }
1789
1790 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
1791 {
1792 m.clear();
1793 __u32 n;
1794 ::decode(n, blp);
1795 while (n--) {
1796 __s32 key;
1797 ::decode(key, blp);
1798
1799 __u32 strlen;
1800 ::decode(strlen, blp);
1801 if (strlen == 0) {
1802 // der, key was actually 64-bits!
1803 ::decode(strlen, blp);
1804 }
1805 ::decode_nohead(strlen, m[key], blp);
1806 }
1807 }
1808
1809 void CrushWrapper::decode(bufferlist::iterator& blp)
1810 {
1811 create();
1812
1813 __u32 magic;
1814 ::decode(magic, blp);
1815 if (magic != CRUSH_MAGIC)
1816 throw buffer::malformed_input("bad magic number");
1817
1818 ::decode(crush->max_buckets, blp);
1819 ::decode(crush->max_rules, blp);
1820 ::decode(crush->max_devices, blp);
1821
1822 // legacy tunables, unless we decode something newer
1823 set_tunables_legacy();
1824
1825 try {
1826 // buckets
1827 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
1828 for (int i=0; i<crush->max_buckets; i++) {
1829 decode_crush_bucket(&crush->buckets[i], blp);
1830 }
1831
1832 // rules
1833 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
1834 for (unsigned i = 0; i < crush->max_rules; ++i) {
1835 __u32 yes;
1836 ::decode(yes, blp);
1837 if (!yes) {
1838 crush->rules[i] = NULL;
1839 continue;
1840 }
1841
1842 __u32 len;
1843 ::decode(len, blp);
1844 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
1845 crush->rules[i]->len = len;
1846 ::decode(crush->rules[i]->mask, blp);
1847 for (unsigned j=0; j<crush->rules[i]->len; j++)
1848 ::decode(crush->rules[i]->steps[j], blp);
1849 }
1850
1851 // name info
1852 // NOTE: we had a bug where we were incoding int instead of int32, which means the
1853 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
1854 // both by assuming the string is always non-empty.
1855 decode_32_or_64_string_map(type_map, blp);
1856 decode_32_or_64_string_map(name_map, blp);
1857 decode_32_or_64_string_map(rule_name_map, blp);
1858
1859 // tunables
1860 if (!blp.end()) {
1861 ::decode(crush->choose_local_tries, blp);
1862 ::decode(crush->choose_local_fallback_tries, blp);
1863 ::decode(crush->choose_total_tries, blp);
1864 }
1865 if (!blp.end()) {
1866 ::decode(crush->chooseleaf_descend_once, blp);
1867 }
1868 if (!blp.end()) {
1869 ::decode(crush->chooseleaf_vary_r, blp);
1870 }
1871 if (!blp.end()) {
1872 ::decode(crush->straw_calc_version, blp);
1873 }
1874 if (!blp.end()) {
1875 ::decode(crush->allowed_bucket_algs, blp);
1876 }
1877 if (!blp.end()) {
1878 ::decode(crush->chooseleaf_stable, blp);
1879 }
1880 if (!blp.end()) {
1881 ::decode(class_map, blp);
1882 ::decode(class_name, blp);
1883 for (auto &c : class_name)
1884 class_rname[c.second] = c.first;
1885 ::decode(class_bucket, blp);
1886 cleanup_classes();
1887 }
1888 if (!blp.end()) {
1889 size_t choose_args_size;
1890 ::decode(choose_args_size, blp);
1891 for (size_t i = 0; i < choose_args_size; i++) {
1892 uint64_t choose_args_index;
1893 ::decode(choose_args_index, blp);
1894 crush_choose_arg_map arg_map;
1895 arg_map.size = crush->max_buckets;
1896 arg_map.args = (crush_choose_arg*)calloc(arg_map.size, sizeof(crush_choose_arg));
1897 __u32 size;
1898 ::decode(size, blp);
1899 for (__u32 j = 0; j < size; j++) {
1900 __u32 bucket_index;
1901 ::decode(bucket_index, blp);
1902 assert(bucket_index < arg_map.size);
1903 crush_choose_arg *arg = &arg_map.args[bucket_index];
1904 ::decode(arg->weight_set_size, blp);
1905 arg->weight_set = (crush_weight_set*)calloc(arg->weight_set_size, sizeof(crush_weight_set));
1906 for (__u32 k = 0; k < arg->weight_set_size; k++) {
1907 crush_weight_set *weight_set = &arg->weight_set[k];
1908 ::decode(weight_set->size, blp);
1909 weight_set->weights = (__u32*)calloc(weight_set->size, sizeof(__u32));
1910 for (__u32 l = 0; l < weight_set->size; l++)
1911 ::decode(weight_set->weights[l], blp);
1912 }
1913 ::decode(arg->ids_size, blp);
1914 arg->ids = (int*)calloc(arg->ids_size, sizeof(int));
1915 for (__u32 k = 0; k < arg->ids_size; k++)
1916 ::decode(arg->ids[k], blp);
1917 }
1918 choose_args[choose_args_index] = arg_map;
1919 }
1920 }
1921 finalize();
1922 }
1923 catch (...) {
1924 crush_destroy(crush);
1925 throw;
1926 }
1927 }
1928
1929 void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp)
1930 {
1931 __u32 alg;
1932 ::decode(alg, blp);
1933 if (!alg) {
1934 *bptr = NULL;
1935 return;
1936 }
1937
1938 int size = 0;
1939 switch (alg) {
1940 case CRUSH_BUCKET_UNIFORM:
1941 size = sizeof(crush_bucket_uniform);
1942 break;
1943 case CRUSH_BUCKET_LIST:
1944 size = sizeof(crush_bucket_list);
1945 break;
1946 case CRUSH_BUCKET_TREE:
1947 size = sizeof(crush_bucket_tree);
1948 break;
1949 case CRUSH_BUCKET_STRAW:
1950 size = sizeof(crush_bucket_straw);
1951 break;
1952 case CRUSH_BUCKET_STRAW2:
1953 size = sizeof(crush_bucket_straw2);
1954 break;
1955 default:
1956 {
1957 char str[128];
1958 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
1959 throw buffer::malformed_input(str);
1960 }
1961 }
1962 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
1963 *bptr = bucket;
1964
1965 ::decode(bucket->id, blp);
1966 ::decode(bucket->type, blp);
1967 ::decode(bucket->alg, blp);
1968 ::decode(bucket->hash, blp);
1969 ::decode(bucket->weight, blp);
1970 ::decode(bucket->size, blp);
1971
1972 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
1973 for (unsigned j = 0; j < bucket->size; ++j) {
1974 ::decode(bucket->items[j], blp);
1975 }
1976
1977 switch (bucket->alg) {
1978 case CRUSH_BUCKET_UNIFORM:
1979 ::decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
1980 break;
1981
1982 case CRUSH_BUCKET_LIST: {
1983 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
1984 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
1985 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
1986
1987 for (unsigned j = 0; j < bucket->size; ++j) {
1988 ::decode(cbl->item_weights[j], blp);
1989 ::decode(cbl->sum_weights[j], blp);
1990 }
1991 break;
1992 }
1993
1994 case CRUSH_BUCKET_TREE: {
1995 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
1996 ::decode(cbt->num_nodes, blp);
1997 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
1998 for (unsigned j=0; j<cbt->num_nodes; j++) {
1999 ::decode(cbt->node_weights[j], blp);
2000 }
2001 break;
2002 }
2003
2004 case CRUSH_BUCKET_STRAW: {
2005 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
2006 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2007 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2008 for (unsigned j = 0; j < bucket->size; ++j) {
2009 ::decode(cbs->item_weights[j], blp);
2010 ::decode(cbs->straws[j], blp);
2011 }
2012 break;
2013 }
2014
2015 case CRUSH_BUCKET_STRAW2: {
2016 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
2017 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2018 for (unsigned j = 0; j < bucket->size; ++j) {
2019 ::decode(cbs->item_weights[j], blp);
2020 }
2021 break;
2022 }
2023
2024 default:
2025 // We should have handled this case in the first switch statement
2026 ceph_abort();
2027 break;
2028 }
2029 }
2030
2031
2032 void CrushWrapper::dump(Formatter *f) const
2033 {
2034 f->open_array_section("devices");
2035 for (int i=0; i<get_max_devices(); i++) {
2036 f->open_object_section("device");
2037 f->dump_int("id", i);
2038 const char *n = get_item_name(i);
2039 if (n) {
2040 f->dump_string("name", n);
2041 } else {
2042 char name[20];
2043 sprintf(name, "device%d", i);
2044 f->dump_string("name", name);
2045 }
2046 const char *device_class = get_item_class(i);
2047 if (device_class != NULL)
2048 f->dump_string("class", device_class);
2049 f->close_section();
2050 }
2051 f->close_section();
2052
2053 f->open_array_section("types");
2054 int n = get_num_type_names();
2055 for (int i=0; n; i++) {
2056 const char *name = get_type_name(i);
2057 if (!name) {
2058 if (i == 0) {
2059 f->open_object_section("type");
2060 f->dump_int("type_id", 0);
2061 f->dump_string("name", "device");
2062 f->close_section();
2063 }
2064 continue;
2065 }
2066 n--;
2067 f->open_object_section("type");
2068 f->dump_int("type_id", i);
2069 f->dump_string("name", name);
2070 f->close_section();
2071 }
2072 f->close_section();
2073
2074 f->open_array_section("buckets");
2075 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
2076 if (!bucket_exists(bucket))
2077 continue;
2078 f->open_object_section("bucket");
2079 f->dump_int("id", bucket);
2080 if (get_item_name(bucket))
2081 f->dump_string("name", get_item_name(bucket));
2082 f->dump_int("type_id", get_bucket_type(bucket));
2083 if (get_type_name(get_bucket_type(bucket)))
2084 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
2085 f->dump_int("weight", get_bucket_weight(bucket));
2086 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
2087 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
2088 f->open_array_section("items");
2089 for (int j=0; j<get_bucket_size(bucket); j++) {
2090 f->open_object_section("item");
2091 f->dump_int("id", get_bucket_item(bucket, j));
2092 f->dump_int("weight", get_bucket_item_weight(bucket, j));
2093 f->dump_int("pos", j);
2094 f->close_section();
2095 }
2096 f->close_section();
2097 f->close_section();
2098 }
2099 f->close_section();
2100
2101 f->open_array_section("rules");
2102 dump_rules(f);
2103 f->close_section();
2104
2105 f->open_object_section("tunables");
2106 dump_tunables(f);
2107 f->close_section();
2108
2109 dump_choose_args(f);
2110 }
2111
2112 namespace {
2113 // depth first walker
2114 class TreeDumper {
2115 typedef CrushTreeDumper::Item Item;
2116 const CrushWrapper *crush;
2117 public:
2118 explicit TreeDumper(const CrushWrapper *crush)
2119 : crush(crush) {}
2120
2121 void dump(Formatter *f) {
2122 set<int> roots;
2123 crush->find_roots(roots);
2124 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
2125 dump_item(Item(*root, 0, crush->get_bucket_weightf(*root)), f);
2126 }
2127 }
2128
2129 private:
2130 void dump_item(const Item& qi, Formatter* f) {
2131 if (qi.is_bucket()) {
2132 f->open_object_section("bucket");
2133 CrushTreeDumper::dump_item_fields(crush, qi, f);
2134 dump_bucket_children(qi, f);
2135 f->close_section();
2136 } else {
2137 f->open_object_section("device");
2138 CrushTreeDumper::dump_item_fields(crush, qi, f);
2139 f->close_section();
2140 }
2141 }
2142
2143 void dump_bucket_children(const Item& parent, Formatter* f) {
2144 f->open_array_section("items");
2145 const int max_pos = crush->get_bucket_size(parent.id);
2146 for (int pos = 0; pos < max_pos; pos++) {
2147 int id = crush->get_bucket_item(parent.id, pos);
2148 float weight = crush->get_bucket_item_weightf(parent.id, pos);
2149 dump_item(Item(id, parent.depth + 1, weight), f);
2150 }
2151 f->close_section();
2152 }
2153 };
2154 }
2155
2156 void CrushWrapper::dump_tree(Formatter *f) const
2157 {
2158 assert(f);
2159 TreeDumper(this).dump(f);
2160 }
2161
2162 void CrushWrapper::dump_tunables(Formatter *f) const
2163 {
2164 f->dump_int("choose_local_tries", get_choose_local_tries());
2165 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2166 f->dump_int("choose_total_tries", get_choose_total_tries());
2167 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2168 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2169 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2170 f->dump_int("straw_calc_version", get_straw_calc_version());
2171 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2172
2173 // be helpful about it
2174 if (has_jewel_tunables())
2175 f->dump_string("profile", "jewel");
2176 else if (has_hammer_tunables())
2177 f->dump_string("profile", "hammer");
2178 else if (has_firefly_tunables())
2179 f->dump_string("profile", "firefly");
2180 else if (has_bobtail_tunables())
2181 f->dump_string("profile", "bobtail");
2182 else if (has_argonaut_tunables())
2183 f->dump_string("profile", "argonaut");
2184 else
2185 f->dump_string("profile", "unknown");
2186 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
2187 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
2188
2189 // be helpful about minimum version required
2190 f->dump_string("minimum_required_version", get_min_required_version());
2191
2192 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2193 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2194 f->dump_int("has_v2_rules", (int)has_v2_rules());
2195 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2196 f->dump_int("has_v3_rules", (int)has_v3_rules());
2197 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
2198 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2199 f->dump_int("has_v5_rules", (int)has_v5_rules());
2200 }
2201
2202 void CrushWrapper::dump_choose_args(Formatter *f) const
2203 {
2204 f->open_object_section("choose_args");
2205 for (auto c : choose_args) {
2206 crush_choose_arg_map arg_map = c.second;
2207 f->open_array_section(stringify(c.first).c_str());
2208 for (__u32 i = 0; i < arg_map.size; i++) {
2209 crush_choose_arg *arg = &arg_map.args[i];
2210 if (arg->weight_set_size == 0 &&
2211 arg->ids_size == 0)
2212 continue;
2213 f->open_object_section("choose_args");
2214 int bucket_index = i;
2215 f->dump_int("bucket_id", -1-bucket_index);
2216 if (arg->weight_set_size > 0) {
2217 f->open_array_section("weight_set");
2218 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2219 f->open_array_section("weights");
2220 __u32 *weights = arg->weight_set[j].weights;
2221 __u32 size = arg->weight_set[j].size;
2222 for (__u32 k = 0; k < size; k++) {
2223 f->dump_float("weight", (float)weights[k]/(float)0x10000);
2224 }
2225 f->close_section();
2226 }
2227 f->close_section();
2228 }
2229 if (arg->ids_size > 0) {
2230 f->open_array_section("ids");
2231 for (__u32 j = 0; j < arg->ids_size; j++)
2232 f->dump_int("id", arg->ids[j]);
2233 f->close_section();
2234 }
2235 f->close_section();
2236 }
2237 f->close_section();
2238 }
2239 f->close_section();
2240 }
2241
2242 void CrushWrapper::dump_rules(Formatter *f) const
2243 {
2244 for (int i=0; i<get_max_rules(); i++) {
2245 if (!rule_exists(i))
2246 continue;
2247 dump_rule(i, f);
2248 }
2249 }
2250
2251 void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
2252 {
2253 f->open_object_section("rule");
2254 f->dump_int("rule_id", ruleset);
2255 if (get_rule_name(ruleset))
2256 f->dump_string("rule_name", get_rule_name(ruleset));
2257 f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
2258 f->dump_int("type", get_rule_mask_type(ruleset));
2259 f->dump_int("min_size", get_rule_mask_min_size(ruleset));
2260 f->dump_int("max_size", get_rule_mask_max_size(ruleset));
2261 f->open_array_section("steps");
2262 for (int j=0; j<get_rule_len(ruleset); j++) {
2263 f->open_object_section("step");
2264 switch (get_rule_op(ruleset, j)) {
2265 case CRUSH_RULE_NOOP:
2266 f->dump_string("op", "noop");
2267 break;
2268 case CRUSH_RULE_TAKE:
2269 f->dump_string("op", "take");
2270 {
2271 int item = get_rule_arg1(ruleset, j);
2272 f->dump_int("item", item);
2273
2274 const char *name = get_item_name(item);
2275 f->dump_string("item_name", name ? name : "");
2276 }
2277 break;
2278 case CRUSH_RULE_EMIT:
2279 f->dump_string("op", "emit");
2280 break;
2281 case CRUSH_RULE_CHOOSE_FIRSTN:
2282 f->dump_string("op", "choose_firstn");
2283 f->dump_int("num", get_rule_arg1(ruleset, j));
2284 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2285 break;
2286 case CRUSH_RULE_CHOOSE_INDEP:
2287 f->dump_string("op", "choose_indep");
2288 f->dump_int("num", get_rule_arg1(ruleset, j));
2289 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2290 break;
2291 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
2292 f->dump_string("op", "chooseleaf_firstn");
2293 f->dump_int("num", get_rule_arg1(ruleset, j));
2294 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2295 break;
2296 case CRUSH_RULE_CHOOSELEAF_INDEP:
2297 f->dump_string("op", "chooseleaf_indep");
2298 f->dump_int("num", get_rule_arg1(ruleset, j));
2299 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2300 break;
2301 case CRUSH_RULE_SET_CHOOSE_TRIES:
2302 f->dump_string("op", "set_choose_tries");
2303 f->dump_int("num", get_rule_arg1(ruleset, j));
2304 break;
2305 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
2306 f->dump_string("op", "set_chooseleaf_tries");
2307 f->dump_int("num", get_rule_arg1(ruleset, j));
2308 break;
2309 default:
2310 f->dump_int("opcode", get_rule_op(ruleset, j));
2311 f->dump_int("arg1", get_rule_arg1(ruleset, j));
2312 f->dump_int("arg2", get_rule_arg2(ruleset, j));
2313 }
2314 f->close_section();
2315 }
2316 f->close_section();
2317 f->close_section();
2318 }
2319
2320 void CrushWrapper::list_rules(Formatter *f) const
2321 {
2322 for (int rule = 0; rule < get_max_rules(); rule++) {
2323 if (!rule_exists(rule))
2324 continue;
2325 f->dump_string("name", get_rule_name(rule));
2326 }
2327 }
2328
2329 class CrushTreePlainDumper : public CrushTreeDumper::Dumper<ostream> {
2330 public:
2331 typedef CrushTreeDumper::Dumper<ostream> Parent;
2332
2333 explicit CrushTreePlainDumper(const CrushWrapper *crush)
2334 : Parent(crush) {}
2335
2336 void dump(ostream *out) {
2337 *out << "ID\tWEIGHT\tTYPE NAME\n";
2338 Parent::dump(out);
2339 }
2340
2341 protected:
2342 void dump_item(const CrushTreeDumper::Item &qi, ostream *out) override {
2343 *out << qi.id << "\t"
2344 << weightf_t(qi.weight) << "\t";
2345
2346 for (int k=0; k < qi.depth; k++)
2347 *out << "\t";
2348
2349 if (qi.is_bucket())
2350 {
2351 *out << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
2352 << crush->get_item_name(qi.id);
2353 }
2354 else
2355 {
2356 *out << "osd." << qi.id;
2357 }
2358 *out << "\n";
2359 }
2360 };
2361
2362
2363 class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
2364 public:
2365 typedef CrushTreeDumper::FormattingDumper Parent;
2366
2367 explicit CrushTreeFormattingDumper(const CrushWrapper *crush)
2368 : Parent(crush) {}
2369
2370 void dump(Formatter *f) {
2371 f->open_array_section("nodes");
2372 Parent::dump(f);
2373 f->close_section();
2374 f->open_array_section("stray");
2375 f->close_section();
2376 }
2377 };
2378
2379
2380 void CrushWrapper::dump_tree(ostream *out, Formatter *f) const
2381 {
2382 if (out)
2383 CrushTreePlainDumper(this).dump(out);
2384 if (f)
2385 CrushTreeFormattingDumper(this).dump(f);
2386 }
2387
2388 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
2389 {
2390 o.push_back(new CrushWrapper);
2391 // fixme
2392 }
2393
2394 /**
2395 * Determine the default CRUSH ruleset ID to be used with
2396 * newly created replicated pools.
2397 *
2398 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
2399 */
2400 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
2401 {
2402 int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
2403 if (crush_ruleset < 0) {
2404 crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
2405 } else if (!ruleset_exists(crush_ruleset)) {
2406 crush_ruleset = -1; // match find_first_ruleset() retval
2407 }
2408 return crush_ruleset;
2409 }
2410
2411 bool CrushWrapper::is_valid_crush_name(const string& s)
2412 {
2413 if (s.empty())
2414 return false;
2415 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
2416 if (!(*p == '-') &&
2417 !(*p == '_') &&
2418 !(*p == '.') &&
2419 !(*p >= '0' && *p <= '9') &&
2420 !(*p >= 'A' && *p <= 'Z') &&
2421 !(*p >= 'a' && *p <= 'z'))
2422 return false;
2423 }
2424 return true;
2425 }
2426
2427 bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
2428 const map<string,string>& loc)
2429 {
2430 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
2431 if (!is_valid_crush_name(l->first) ||
2432 !is_valid_crush_name(l->second)) {
2433 ldout(cct, 1) << "loc["
2434 << l->first << "] = '"
2435 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
2436 << dendl;
2437 return false;
2438 }
2439 }
2440 return true;
2441 }
2442
2443 int CrushWrapper::_choose_type_stack(
2444 CephContext *cct,
2445 const vector<pair<int,int>>& stack,
2446 const set<int>& overfull,
2447 const vector<int>& underfull,
2448 const vector<int>& orig,
2449 vector<int>::const_iterator& i,
2450 set<int>& used,
2451 vector<int> *pw) const
2452 {
2453 vector<int> w = *pw;
2454 vector<int> o;
2455
2456 ldout(cct, 10) << __func__ << " stack " << stack
2457 << " orig " << orig
2458 << " at " << *i
2459 << " pw " << *pw
2460 << dendl;
2461
2462 vector<int> cumulative_fanout(stack.size());
2463 int f = 1;
2464 for (int j = (int)stack.size() - 1; j >= 0; --j) {
2465 cumulative_fanout[j] = f;
2466 f *= stack[j].second;
2467 }
2468 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
2469 << dendl;
2470
2471 // identify underful targets for each intermediate level.
2472 // this serves two purposes:
2473 // 1. we can tell when we are selecting a bucket that does not have any underfull
2474 // devices beneath it. that means that if the current input includes an overfull
2475 // device, we won't be able to find an underfull device with this parent to
2476 // swap for it.
2477 // 2. when we decide we should reject a bucket due to the above, this list gives us
2478 // a list of peers to consider that *do* have underfull devices available.. (we
2479 // are careful to pick one that has the same parent.)
2480 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
2481 underfull_buckets.resize(stack.size() - 1);
2482 for (auto osd : underfull) {
2483 int item = osd;
2484 for (int j = (int)stack.size() - 2; j >= 0; --j) {
2485 int type = stack[j].first;
2486 item = get_parent_of_type(item, type);
2487 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
2488 << " is " << item << dendl;
2489 underfull_buckets[j].insert(item);
2490 }
2491 }
2492 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
2493
2494 for (unsigned j = 0; j < stack.size(); ++j) {
2495 int type = stack[j].first;
2496 int fanout = stack[j].second;
2497 int cum_fanout = cumulative_fanout[j];
2498 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
2499 << " cumulative " << cum_fanout
2500 << " w " << w << dendl;
2501 vector<int> o;
2502 auto tmpi = i;
2503 for (auto from : w) {
2504 ldout(cct, 10) << " from " << from << dendl;
2505 // identify leaves under each choice. we use this to check whether any of these
2506 // leaves are overfull. (if so, we need to make sure there are underfull candidates
2507 // to swap for them.)
2508 vector<set<int>> leaves;
2509 leaves.resize(fanout);
2510 for (int pos = 0; pos < fanout; ++pos) {
2511 if (type > 0) {
2512 // non-leaf
2513 int item = get_parent_of_type(*tmpi, type);
2514 o.push_back(item);
2515 int n = cum_fanout;
2516 while (n-- && tmpi != orig.end()) {
2517 leaves[pos].insert(*tmpi++);
2518 }
2519 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
2520 << " of type " << type << " over leaves " << leaves[pos] << dendl;
2521 } else {
2522 // leaf
2523 bool replaced = false;
2524 if (overfull.count(*i)) {
2525 for (auto item : underfull) {
2526 ldout(cct, 10) << __func__ << " pos " << pos
2527 << " was " << *i << " considering " << item
2528 << dendl;
2529 if (used.count(item)) {
2530 ldout(cct, 20) << __func__ << " in used " << used << dendl;
2531 continue;
2532 }
2533 if (!subtree_contains(from, item)) {
2534 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
2535 continue;
2536 }
2537 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
2538 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
2539 continue;
2540 }
2541 o.push_back(item);
2542 used.insert(item);
2543 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
2544 << *i << " -> " << item << dendl;
2545 replaced = true;
2546 ++i;
2547 break;
2548 }
2549 }
2550 if (!replaced) {
2551 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
2552 << dendl;
2553 o.push_back(*i);
2554 ++i;
2555 }
2556 if (i == orig.end()) {
2557 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
2558 break;
2559 }
2560 }
2561 }
2562 if (j + 1 < stack.size()) {
2563 // check if any buckets have overfull leaves but no underfull candidates
2564 for (int pos = 0; pos < fanout; ++pos) {
2565 if (underfull_buckets[j].count(o[pos]) == 0) {
2566 // are any leaves overfull?
2567 bool any_overfull = false;
2568 for (auto osd : leaves[pos]) {
2569 if (overfull.count(osd)) {
2570 any_overfull = true;
2571 }
2572 }
2573 if (any_overfull) {
2574 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
2575 << ">0 leaves " << leaves[pos] << " is overfull; alts "
2576 << underfull_buckets[j]
2577 << dendl;
2578 for (auto alt : underfull_buckets[j]) {
2579 if (std::find(o.begin(), o.end(), alt) == o.end()) {
2580 // see if alt has the same parent
2581 if (j == 0 ||
2582 get_parent_of_type(o[pos], stack[j-1].first) ==
2583 get_parent_of_type(alt, stack[j-1].first)) {
2584 if (j)
2585 ldout(cct, 10) << " replacing " << o[pos]
2586 << " (which has no underfull leaves) with " << alt
2587 << " (same parent "
2588 << get_parent_of_type(alt, stack[j-1].first) << " type "
2589 << type << ")" << dendl;
2590 else
2591 ldout(cct, 10) << " replacing " << o[pos]
2592 << " (which has no underfull leaves) with " << alt
2593 << " (first level)" << dendl;
2594 o[pos] = alt;
2595 break;
2596 } else {
2597 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
2598 << " has different parent, skipping" << dendl;
2599 }
2600 }
2601 }
2602 }
2603 }
2604 }
2605 }
2606 if (i == orig.end()) {
2607 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
2608 break;
2609 }
2610 }
2611 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
2612 w.swap(o);
2613 }
2614 *pw = w;
2615 return 0;
2616 }
2617
2618 int CrushWrapper::try_remap_rule(
2619 CephContext *cct,
2620 int ruleno,
2621 int maxout,
2622 const set<int>& overfull,
2623 const vector<int>& underfull,
2624 const vector<int>& orig,
2625 vector<int> *out) const
2626 {
2627 const crush_map *map = crush;
2628 const crush_rule *rule = get_rule(ruleno);
2629 assert(rule);
2630
2631 ldout(cct, 10) << __func__ << " ruleno " << ruleno
2632 << " numrep " << maxout << " overfull " << overfull
2633 << " underfull " << underfull << " orig " << orig
2634 << dendl;
2635 vector<int> w; // working set
2636 out->clear();
2637
2638 auto i = orig.begin();
2639 set<int> used;
2640
2641 vector<pair<int,int>> type_stack; // (type, fan-out)
2642
2643 for (unsigned step = 0; step < rule->len; ++step) {
2644 const crush_rule_step *curstep = &rule->steps[step];
2645 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
2646 switch (curstep->op) {
2647 case CRUSH_RULE_TAKE:
2648 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
2649 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
2650 map->buckets[-1-curstep->arg1])) {
2651 w.clear();
2652 w.push_back(curstep->arg1);
2653 ldout(cct, 10) << __func__ << " take " << w << dendl;
2654 } else {
2655 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
2656 }
2657 break;
2658
2659 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
2660 case CRUSH_RULE_CHOOSELEAF_INDEP:
2661 {
2662 int numrep = curstep->arg1;
2663 int type = curstep->arg2;
2664 if (numrep <= 0)
2665 numrep += maxout;
2666 type_stack.push_back(make_pair(type, numrep));
2667 type_stack.push_back(make_pair(0, 1));
2668 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
2669 i, used, &w);
2670 if (r < 0)
2671 return r;
2672 type_stack.clear();
2673 }
2674 break;
2675
2676 case CRUSH_RULE_CHOOSE_FIRSTN:
2677 case CRUSH_RULE_CHOOSE_INDEP:
2678 {
2679 int numrep = curstep->arg1;
2680 int type = curstep->arg2;
2681 if (numrep <= 0)
2682 numrep += maxout;
2683 type_stack.push_back(make_pair(type, numrep));
2684 }
2685 break;
2686
2687 case CRUSH_RULE_EMIT:
2688 ldout(cct, 10) << " emit " << w << dendl;
2689 if (!type_stack.empty()) {
2690 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
2691 i, used, &w);
2692 if (r < 0)
2693 return r;
2694 type_stack.clear();
2695 }
2696 for (auto item : w) {
2697 out->push_back(item);
2698 }
2699 w.clear();
2700 break;
2701
2702 default:
2703 // ignore
2704 break;
2705 }
2706 }
2707
2708 return 0;
2709 }