]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/CrushWrapper.cc
update source to 12.2.11
[ceph.git] / ceph / src / crush / CrushWrapper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
10
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
13
14 #define dout_subsys ceph_subsys_crush
15
16 bool CrushWrapper::has_legacy_rule_ids() const
17 {
18 for (unsigned i=0; i<crush->max_rules; i++) {
19 crush_rule *r = crush->rules[i];
20 if (r &&
21 r->mask.ruleset != i) {
22 return true;
23 }
24 }
25 return false;
26 }
27
28 std::map<int, int> CrushWrapper::renumber_rules()
29 {
30 std::map<int, int> result;
31 for (unsigned i=0; i<crush->max_rules; i++) {
32 crush_rule *r = crush->rules[i];
33 if (r && r->mask.ruleset != i) {
34 result[r->mask.ruleset] = i;
35 r->mask.ruleset = i;
36 }
37 }
38 return result;
39 }
40
41 bool CrushWrapper::has_non_straw2_buckets() const
42 {
43 for (int i=0; i<crush->max_buckets; ++i) {
44 crush_bucket *b = crush->buckets[i];
45 if (!b)
46 continue;
47 if (b->alg != CRUSH_BUCKET_STRAW2)
48 return true;
49 }
50 return false;
51 }
52
53 bool CrushWrapper::has_v2_rules() const
54 {
55 for (unsigned i=0; i<crush->max_rules; i++) {
56 if (is_v2_rule(i)) {
57 return true;
58 }
59 }
60 return false;
61 }
62
63 bool CrushWrapper::is_v2_rule(unsigned ruleid) const
64 {
65 // check rule for use of indep or new SET_* rule steps
66 if (ruleid >= crush->max_rules)
67 return false;
68 crush_rule *r = crush->rules[ruleid];
69 if (!r)
70 return false;
71 for (unsigned j=0; j<r->len; j++) {
72 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
73 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
74 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
75 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
76 return true;
77 }
78 }
79 return false;
80 }
81
82 bool CrushWrapper::has_v3_rules() const
83 {
84 for (unsigned i=0; i<crush->max_rules; i++) {
85 if (is_v3_rule(i)) {
86 return true;
87 }
88 }
89 return false;
90 }
91
92 bool CrushWrapper::is_v3_rule(unsigned ruleid) const
93 {
94 // check rule for use of SET_CHOOSELEAF_VARY_R step
95 if (ruleid >= crush->max_rules)
96 return false;
97 crush_rule *r = crush->rules[ruleid];
98 if (!r)
99 return false;
100 for (unsigned j=0; j<r->len; j++) {
101 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
102 return true;
103 }
104 }
105 return false;
106 }
107
108 bool CrushWrapper::has_v4_buckets() const
109 {
110 for (int i=0; i<crush->max_buckets; ++i) {
111 crush_bucket *b = crush->buckets[i];
112 if (!b)
113 continue;
114 if (b->alg == CRUSH_BUCKET_STRAW2)
115 return true;
116 }
117 return false;
118 }
119
120 bool CrushWrapper::has_v5_rules() const
121 {
122 for (unsigned i=0; i<crush->max_rules; i++) {
123 if (is_v5_rule(i)) {
124 return true;
125 }
126 }
127 return false;
128 }
129
130 bool CrushWrapper::is_v5_rule(unsigned ruleid) const
131 {
132 // check rule for use of SET_CHOOSELEAF_STABLE step
133 if (ruleid >= crush->max_rules)
134 return false;
135 crush_rule *r = crush->rules[ruleid];
136 if (!r)
137 return false;
138 for (unsigned j=0; j<r->len; j++) {
139 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
140 return true;
141 }
142 }
143 return false;
144 }
145
146 bool CrushWrapper::has_choose_args() const
147 {
148 return !choose_args.empty();
149 }
150
151 bool CrushWrapper::has_incompat_choose_args() const
152 {
153 if (choose_args.empty())
154 return false;
155 if (choose_args.size() > 1)
156 return true;
157 if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
158 return true;
159 crush_choose_arg_map arg_map = choose_args.begin()->second;
160 for (__u32 i = 0; i < arg_map.size; i++) {
161 crush_choose_arg *arg = &arg_map.args[i];
162 if (arg->weight_set_positions == 0 &&
163 arg->ids_size == 0)
164 continue;
165 if (arg->weight_set_positions != 1)
166 return true;
167 if (arg->ids_size != 0)
168 return true;
169 }
170 return false;
171 }
172
173 int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
174 {
175 if (!item_exists(i))
176 return -EINVAL;
177 string name = get_item_name(i);
178 size_t pos = name.find("~");
179 if (pos == string::npos) {
180 *idout = i;
181 *classout = -1;
182 return 0;
183 }
184 string name_no_class = name.substr(0, pos);
185 if (!name_exists(name_no_class))
186 return -ENOENT;
187 string class_name = name.substr(pos + 1);
188 if (!class_exists(class_name))
189 return -ENOENT;
190 *idout = get_item_id(name_no_class);
191 *classout = get_class_id(class_name);
192 return 0;
193 }
194
195 int CrushWrapper::can_rename_item(const string& srcname,
196 const string& dstname,
197 ostream *ss) const
198 {
199 if (name_exists(srcname)) {
200 if (name_exists(dstname)) {
201 *ss << "dstname = '" << dstname << "' already exists";
202 return -EEXIST;
203 }
204 if (is_valid_crush_name(dstname)) {
205 return 0;
206 } else {
207 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
208 return -EINVAL;
209 }
210 } else {
211 if (name_exists(dstname)) {
212 *ss << "srcname = '" << srcname << "' does not exist "
213 << "and dstname = '" << dstname << "' already exists";
214 return -EALREADY;
215 } else {
216 *ss << "srcname = '" << srcname << "' does not exist";
217 return -ENOENT;
218 }
219 }
220 }
221
222 int CrushWrapper::rename_item(const string& srcname,
223 const string& dstname,
224 ostream *ss)
225 {
226 int ret = can_rename_item(srcname, dstname, ss);
227 if (ret < 0)
228 return ret;
229 int oldid = get_item_id(srcname);
230 return set_item_name(oldid, dstname);
231 }
232
233 int CrushWrapper::can_rename_bucket(const string& srcname,
234 const string& dstname,
235 ostream *ss) const
236 {
237 int ret = can_rename_item(srcname, dstname, ss);
238 if (ret)
239 return ret;
240 int srcid = get_item_id(srcname);
241 if (srcid >= 0) {
242 *ss << "srcname = '" << srcname << "' is not a bucket "
243 << "because its id = " << srcid << " is >= 0";
244 return -ENOTDIR;
245 }
246 return 0;
247 }
248
249 int CrushWrapper::rename_bucket(const string& srcname,
250 const string& dstname,
251 ostream *ss)
252 {
253 int ret = can_rename_bucket(srcname, dstname, ss);
254 if (ret < 0)
255 return ret;
256 int oldid = get_item_id(srcname);
257 return set_item_name(oldid, dstname);
258 }
259
260 int CrushWrapper::rename_rule(const string& srcname,
261 const string& dstname,
262 ostream *ss)
263 {
264 if (!rule_exists(srcname)) {
265 if (ss) {
266 *ss << "source rule name '" << srcname << "' does not exist";
267 }
268 return -ENOENT;
269 }
270 if (rule_exists(dstname)) {
271 if (ss) {
272 *ss << "destination rule name '" << dstname << "' already exists";
273 }
274 return -EEXIST;
275 }
276 int rule_id = get_rule_id(srcname);
277 auto it = rule_name_map.find(rule_id);
278 assert(it != rule_name_map.end());
279 it->second = dstname;
280 if (have_rmaps) {
281 rule_name_rmap.erase(srcname);
282 rule_name_rmap[dstname] = rule_id;
283 }
284 return 0;
285 }
286
287 void CrushWrapper::find_takes(set<int> *roots) const
288 {
289 for (unsigned i=0; i<crush->max_rules; i++) {
290 crush_rule *r = crush->rules[i];
291 if (!r)
292 continue;
293 for (unsigned j=0; j<r->len; j++) {
294 if (r->steps[j].op == CRUSH_RULE_TAKE)
295 roots->insert(r->steps[j].arg1);
296 }
297 }
298 }
299
300 void CrushWrapper::find_takes_by_rule(int rule, set<int> *roots) const
301 {
302 if (rule < 0 || rule >= (int)crush->max_rules)
303 return;
304 crush_rule *r = crush->rules[rule];
305 if (!r)
306 return;
307 for (unsigned i = 0; i < r->len; i++) {
308 if (r->steps[i].op == CRUSH_RULE_TAKE)
309 roots->insert(r->steps[i].arg1);
310 }
311 }
312
313 void CrushWrapper::find_roots(set<int> *roots) const
314 {
315 for (int i = 0; i < crush->max_buckets; i++) {
316 if (!crush->buckets[i])
317 continue;
318 crush_bucket *b = crush->buckets[i];
319 if (!_search_item_exists(b->id))
320 roots->insert(b->id);
321 }
322 }
323
324 bool CrushWrapper::subtree_contains(int root, int item) const
325 {
326 if (root == item)
327 return true;
328
329 if (root >= 0)
330 return false; // root is a leaf
331
332 const crush_bucket *b = get_bucket(root);
333 if (IS_ERR(b))
334 return false;
335
336 for (unsigned j=0; j<b->size; j++) {
337 if (subtree_contains(b->items[j], item))
338 return true;
339 }
340 return false;
341 }
342
343 bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
344 {
345 // last instance?
346 if (_search_item_exists(item)) {
347 return false;
348 }
349 if (item < 0 && _bucket_is_in_use(item)) {
350 return false;
351 }
352
353 if (item < 0 && !unlink_only) {
354 crush_bucket *t = get_bucket(item);
355 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
356 crush_remove_bucket(crush, t);
357 if (class_bucket.count(item) != 0)
358 class_bucket.erase(item);
359 class_remove_item(item);
360 update_choose_args(cct);
361 }
362 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
363 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
364 name_map.erase(item);
365 have_rmaps = false;
366 if (item >= 0 && !unlink_only) {
367 class_remove_item(item);
368 }
369 }
370 rebuild_roots_with_classes();
371 return true;
372 }
373
374 int CrushWrapper::remove_root(int item)
375 {
376 crush_bucket *b = get_bucket(item);
377 if (IS_ERR(b)) {
378 // should be idempotent
379 // e.g.: we use 'crush link' to link same host into
380 // different roots, which as a result can cause different
381 // shadow trees reference same hosts too. This means
382 // we may need to destory the same buckets(hosts, racks, etc.)
383 // multiple times during rebuilding all shadow trees.
384 return 0;
385 }
386
387 for (unsigned n = 0; n < b->size; n++) {
388 if (b->items[n] >= 0)
389 continue;
390 int r = remove_root(b->items[n]);
391 if (r < 0)
392 return r;
393 }
394
395 crush_remove_bucket(crush, b);
396 if (name_map.count(item) != 0) {
397 name_map.erase(item);
398 have_rmaps = false;
399 }
400 if (class_bucket.count(item) != 0)
401 class_bucket.erase(item);
402 class_remove_item(item);
403 update_choose_args(nullptr);
404 return 0;
405 }
406
407 void CrushWrapper::update_choose_args(CephContext *cct)
408 {
409 for (auto& i : choose_args) {
410 crush_choose_arg_map &arg_map = i.second;
411 unsigned positions = get_choose_args_positions(arg_map);
412 for (int j = 0; j < crush->max_buckets; ++j) {
413 crush_bucket *b = crush->buckets[j];
414 auto& carg = arg_map.args[j];
415 // strip out choose_args for any buckets that no longer exist
416 if (!b || b->alg != CRUSH_BUCKET_STRAW2) {
417 if (carg.ids) {
418 if (cct)
419 ldout(cct,0) << __func__ << " removing " << i.first << " bucket "
420 << (-1-j) << " ids" << dendl;
421 free(carg.ids);
422 carg.ids = 0;
423 carg.ids_size = 0;
424 }
425 if (carg.weight_set) {
426 if (cct)
427 ldout(cct,0) << __func__ << " removing " << i.first << " bucket "
428 << (-1-j) << " weight_sets" << dendl;
429 for (unsigned p = 0; p < carg.weight_set_positions; ++p) {
430 free(carg.weight_set[p].weights);
431 }
432 free(carg.weight_set);
433 carg.weight_set = 0;
434 carg.weight_set_positions = 0;
435 }
436 continue;
437 }
438 if (carg.weight_set_positions == 0) {
439 continue; // skip it
440 }
441 if (carg.weight_set_positions != positions) {
442 if (cct)
443 lderr(cct) << __func__ << " " << i.first << " bucket "
444 << (-1-j) << " positions " << carg.weight_set_positions
445 << " -> " << positions << dendl;
446 continue; // wth... skip!
447 }
448 // mis-sized weight_sets? this shouldn't ever happen.
449 for (unsigned p = 0; p < positions; ++p) {
450 if (carg.weight_set[p].size != b->size) {
451 if (cct)
452 lderr(cct) << __func__ << " fixing " << i.first << " bucket "
453 << (-1-j) << " position " << p
454 << " size " << carg.weight_set[p].size << " -> "
455 << b->size << dendl;
456 auto old_ws = carg.weight_set[p];
457 carg.weight_set[p].size = b->size;
458 carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
459 auto max = std::min<unsigned>(old_ws.size, b->size);
460 for (unsigned k = 0; k < max; ++k) {
461 carg.weight_set[p].weights[k] = old_ws.weights[k];
462 }
463 free(old_ws.weights);
464 }
465 }
466 }
467 }
468 }
469
470 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
471 {
472 ldout(cct, 5) << "remove_item " << item
473 << (unlink_only ? " unlink_only":"") << dendl;
474
475 int ret = -ENOENT;
476
477 if (item < 0 && !unlink_only) {
478 crush_bucket *t = get_bucket(item);
479 if (IS_ERR(t)) {
480 ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
481 << dendl;
482 return -ENOENT;
483 }
484
485 if (t->size) {
486 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
487 << " items, not empty" << dendl;
488 return -ENOTEMPTY;
489 }
490 if (_bucket_is_in_use(item)) {
491 return -EBUSY;
492 }
493 }
494
495 for (int i = 0; i < crush->max_buckets; i++) {
496 if (!crush->buckets[i])
497 continue;
498 crush_bucket *b = crush->buckets[i];
499
500 for (unsigned i=0; i<b->size; ++i) {
501 int id = b->items[i];
502 if (id == item) {
503 ldout(cct, 5) << "remove_item removing item " << item
504 << " from bucket " << b->id << dendl;
505 for (auto& p : choose_args) {
506 // weight down each weight-set to 0 before we remove the item
507 vector<int> weightv(get_choose_args_positions(p.second), 0);
508 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
509 }
510 bucket_remove_item(b, item);
511 adjust_item_weight(cct, b->id, b->weight);
512 ret = 0;
513 }
514 }
515 }
516
517 if (_maybe_remove_last_instance(cct, item, unlink_only))
518 ret = 0;
519
520 return ret;
521 }
522
523 bool CrushWrapper::_search_item_exists(int item) const
524 {
525 for (int i = 0; i < crush->max_buckets; i++) {
526 if (!crush->buckets[i])
527 continue;
528 crush_bucket *b = crush->buckets[i];
529 for (unsigned j=0; j<b->size; ++j) {
530 if (b->items[j] == item)
531 return true;
532 }
533 }
534 return false;
535 }
536
537 bool CrushWrapper::_bucket_is_in_use(int item)
538 {
539 for (auto &i : class_bucket)
540 for (auto &j : i.second)
541 if (j.second == item)
542 return true;
543 for (unsigned i = 0; i < crush->max_rules; ++i) {
544 crush_rule *r = crush->rules[i];
545 if (!r)
546 continue;
547 for (unsigned j = 0; j < r->len; ++j) {
548 if (r->steps[j].op == CRUSH_RULE_TAKE) {
549 int step_item = r->steps[j].arg1;
550 int original_item;
551 int c;
552 int res = split_id_class(step_item, &original_item, &c);
553 if (res < 0)
554 return false;
555 if (step_item == item || original_item == item)
556 return true;
557 }
558 }
559 }
560 return false;
561 }
562
563 int CrushWrapper::_remove_item_under(
564 CephContext *cct, int item, int ancestor, bool unlink_only)
565 {
566 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
567 << (unlink_only ? " unlink_only":"") << dendl;
568
569 if (ancestor >= 0) {
570 return -EINVAL;
571 }
572
573 if (!bucket_exists(ancestor))
574 return -EINVAL;
575
576 int ret = -ENOENT;
577
578 crush_bucket *b = get_bucket(ancestor);
579 for (unsigned i=0; i<b->size; ++i) {
580 int id = b->items[i];
581 if (id == item) {
582 ldout(cct, 5) << "_remove_item_under removing item " << item
583 << " from bucket " << b->id << dendl;
584 for (auto& p : choose_args) {
585 // weight down each weight-set to 0 before we remove the item
586 vector<int> weightv(get_choose_args_positions(p.second), 0);
587 _choose_args_adjust_item_weight_in_bucket(
588 cct, p.second, b->id, item, weightv, nullptr);
589 }
590 bucket_remove_item(b, item);
591 adjust_item_weight(cct, b->id, b->weight);
592 ret = 0;
593 } else if (id < 0) {
594 int r = remove_item_under(cct, item, id, unlink_only);
595 if (r == 0)
596 ret = 0;
597 }
598 }
599 return ret;
600 }
601
602 int CrushWrapper::remove_item_under(
603 CephContext *cct, int item, int ancestor, bool unlink_only)
604 {
605 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
606 << (unlink_only ? " unlink_only":"") << dendl;
607
608 if (!unlink_only && _bucket_is_in_use(item)) {
609 return -EBUSY;
610 }
611
612 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
613 if (ret < 0)
614 return ret;
615
616 if (item < 0 && !unlink_only) {
617 crush_bucket *t = get_bucket(item);
618 if (IS_ERR(t)) {
619 ldout(cct, 1) << "remove_item_under bucket " << item
620 << " does not exist" << dendl;
621 return -ENOENT;
622 }
623
624 if (t->size) {
625 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
626 << " items, not empty" << dendl;
627 return -ENOTEMPTY;
628 }
629 }
630
631 if (_maybe_remove_last_instance(cct, item, unlink_only))
632 ret = 0;
633
634 return ret;
635 }
636
637 int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
638 const std::multimap<string,string>& loc)
639 {
640 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
641 if (!item_exists(id))
642 return -ENOENT;
643 map<string,string> id_loc = get_full_location(id);
644 ldout(cct, 20) << " id is at " << id_loc << dendl;
645
646 for (map<int,string>::const_iterator p = type_map.begin();
647 p != type_map.end();
648 ++p) {
649 map<string,string>::iterator ip = id_loc.find(p->second);
650 if (ip == id_loc.end())
651 continue;
652 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
653 q != loc.end();
654 ++q) {
655 if (q->first != p->second)
656 break;
657 if (q->second == ip->second)
658 return p->first;
659 }
660 }
661 return -ERANGE;
662 }
663
664 int CrushWrapper::parse_loc_map(const std::vector<string>& args,
665 std::map<string,string> *ploc)
666 {
667 ploc->clear();
668 for (unsigned i = 0; i < args.size(); ++i) {
669 const char *s = args[i].c_str();
670 const char *pos = strchr(s, '=');
671 if (!pos)
672 return -EINVAL;
673 string key(s, 0, pos-s);
674 string value(pos+1);
675 if (value.length())
676 (*ploc)[key] = value;
677 else
678 return -EINVAL;
679 }
680 return 0;
681 }
682
683 int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
684 std::multimap<string,string> *ploc)
685 {
686 ploc->clear();
687 for (unsigned i = 0; i < args.size(); ++i) {
688 const char *s = args[i].c_str();
689 const char *pos = strchr(s, '=');
690 if (!pos)
691 return -EINVAL;
692 string key(s, 0, pos-s);
693 string value(pos+1);
694 if (value.length())
695 ploc->insert(make_pair(key, value));
696 else
697 return -EINVAL;
698 }
699 return 0;
700 }
701
702 bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
703 int *weight)
704 {
705 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
706
707 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
708 // ignore device
709 if (p->first == 0)
710 continue;
711
712 // ignore types that aren't specified in loc
713 map<string,string>::const_iterator q = loc.find(p->second);
714 if (q == loc.end()) {
715 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
716 << type_map << ")" << dendl;
717 continue;
718 }
719
720 if (!name_exists(q->second)) {
721 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
722 return false;
723 }
724
725 int id = get_item_id(q->second);
726 if (id >= 0) {
727 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
728 << " is a device, not bucket" << dendl;
729 return false;
730 }
731
732 assert(bucket_exists(id));
733 crush_bucket *b = get_bucket(id);
734
735 // see if item exists in this bucket
736 for (unsigned j=0; j<b->size; j++) {
737 if (b->items[j] == item) {
738 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
739 if (weight)
740 *weight = crush_get_bucket_item_weight(b, j);
741 return true;
742 }
743 }
744 return false;
745 }
746
747 ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl;
748 return false;
749 }
750
751 map<string, string> CrushWrapper::get_full_location(int id)
752 {
753 vector<pair<string, string> > full_location_ordered;
754 map<string,string> full_location;
755
756 get_full_location_ordered(id, full_location_ordered);
757
758 std::copy(full_location_ordered.begin(),
759 full_location_ordered.end(),
760 std::inserter(full_location, full_location.begin()));
761
762 return full_location;
763 }
764
765 int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path)
766 {
767 if (!item_exists(id))
768 return -ENOENT;
769 int cur = id;
770 int ret;
771 while (true) {
772 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
773 if (ret != 0)
774 break;
775 path.push_back(parent_coord);
776 cur = get_item_id(parent_coord.second);
777 }
778 return 0;
779 }
780
781 string CrushWrapper::get_full_location_ordered_string(int id)
782 {
783 vector<pair<string, string> > full_location_ordered;
784 string full_location;
785 get_full_location_ordered(id, full_location_ordered);
786 reverse(begin(full_location_ordered), end(full_location_ordered));
787 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
788 full_location = full_location + i->first + "=" + i->second;
789 if (i != full_location_ordered.end() - 1) {
790 full_location = full_location + ",";
791 }
792 }
793 return full_location;
794 }
795
796 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
797 {
798 map<int,string> parent_hierarchy;
799 pair<string, string> parent_coord = get_immediate_parent(id);
800 int parent_id;
801
802 // get the integer type for id and create a counter from there
803 int type_counter = get_bucket_type(id);
804
805 // if we get a negative type then we can assume that we have an OSD
806 // change behavior in get_item_type FIXME
807 if (type_counter < 0)
808 type_counter = 0;
809
810 // read the type map and get the name of the type with the largest ID
811 int high_type = 0;
812 for (map<int, string>::iterator it = type_map.begin(); it != type_map.end(); ++it){
813 if ( (*it).first > high_type )
814 high_type = (*it).first;
815 }
816
817 parent_id = get_item_id(parent_coord.second);
818
819 while (type_counter < high_type) {
820 type_counter++;
821 parent_hierarchy[ type_counter ] = parent_coord.first;
822
823 if (type_counter < high_type){
824 // get the coordinate information for the next parent
825 parent_coord = get_immediate_parent(parent_id);
826 parent_id = get_item_id(parent_coord.second);
827 }
828 }
829
830 return parent_hierarchy;
831 }
832
833 int CrushWrapper::get_children(int id, list<int> *children)
834 {
835 // leaf?
836 if (id >= 0) {
837 return 0;
838 }
839
840 crush_bucket *b = get_bucket(id);
841 if (IS_ERR(b)) {
842 return -ENOENT;
843 }
844
845 for (unsigned n=0; n<b->size; n++) {
846 children->push_back(b->items[n]);
847 }
848 return b->size;
849 }
850
851 void CrushWrapper::get_children_of_type(int id,
852 int type,
853 set<int> *children,
854 bool exclude_shadow) const
855 {
856 if (id >= 0) {
857 if (type == 0) {
858 // want leaf?
859 children->insert(id);
860 }
861 return;
862 }
863 auto b = get_bucket(id);
864 if (IS_ERR(b)) {
865 return;
866 }
867 if (b->type < type) {
868 // give up
869 return;
870 } else if (b->type == type) {
871 if (!is_shadow_item(b->id) || !exclude_shadow) {
872 children->insert(b->id);
873 }
874 return;
875 }
876 for (unsigned n = 0; n < b->size; n++) {
877 get_children_of_type(b->items[n], type, children, exclude_shadow);
878 }
879 }
880
881 int CrushWrapper::get_rule_failure_domain(int rule_id)
882 {
883 crush_rule *rule = get_rule(rule_id);
884 if (IS_ERR(rule)) {
885 return -ENOENT;
886 }
887 int type = 0; // default to osd-level
888 for (unsigned s = 0; s < rule->len; ++s) {
889 if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
890 rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
891 rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
892 rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
893 rule->steps[s].arg2 > type) {
894 type = rule->steps[s].arg2;
895 }
896 }
897 return type;
898 }
899
900 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
901 {
902 assert(leaves);
903
904 // Already leaf?
905 if (id >= 0) {
906 leaves->push_back(id);
907 return 0;
908 }
909
910 crush_bucket *b = get_bucket(id);
911 if (IS_ERR(b)) {
912 return -ENOENT;
913 }
914
915 for (unsigned n = 0; n < b->size; n++) {
916 if (b->items[n] >= 0) {
917 leaves->push_back(b->items[n]);
918 } else {
919 // is a bucket, do recursive call
920 int r = _get_leaves(b->items[n], leaves);
921 if (r < 0) {
922 return r;
923 }
924 }
925 }
926
927 return 0; // all is well
928 }
929
930 int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
931 {
932 assert(leaves);
933 leaves->clear();
934
935 if (!name_exists(name)) {
936 return -ENOENT;
937 }
938
939 int id = get_item_id(name);
940 if (id >= 0) {
941 // already leaf
942 leaves->insert(id);
943 return 0;
944 }
945
946 list<int> unordered;
947 int r = _get_leaves(id, &unordered);
948 if (r < 0) {
949 return r;
950 }
951
952 for (auto &p : unordered) {
953 leaves->insert(p);
954 }
955
956 return 0;
957 }
958
959 int CrushWrapper::insert_item(
960 CephContext *cct, int item, float weight, string name,
961 const map<string,string>& loc) // typename -> bucketname
962 {
963 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
964 << " name " << name << " loc " << loc << dendl;
965
966 if (!is_valid_crush_name(name))
967 return -EINVAL;
968
969 if (!is_valid_crush_loc(cct, loc))
970 return -EINVAL;
971
972 int r = validate_weightf(weight);
973 if (r < 0) {
974 return r;
975 }
976
977 if (name_exists(name)) {
978 if (get_item_id(name) != item) {
979 ldout(cct, 10) << "device name '" << name << "' already exists as id "
980 << get_item_id(name) << dendl;
981 return -EEXIST;
982 }
983 } else {
984 set_item_name(item, name);
985 }
986
987 int cur = item;
988
989 // create locations if locations don't exist and add child in
990 // location with 0 weight the more detail in the insert_item method
991 // declaration in CrushWrapper.h
992 for (auto p = type_map.begin(); p != type_map.end(); ++p) {
993 // ignore device type
994 if (p->first == 0)
995 continue;
996
997 // skip types that are unspecified
998 map<string,string>::const_iterator q = loc.find(p->second);
999 if (q == loc.end()) {
1000 ldout(cct, 2) << "warning: did not specify location for '"
1001 << p->second << "' level (levels are "
1002 << type_map << ")" << dendl;
1003 continue;
1004 }
1005
1006 if (!name_exists(q->second)) {
1007 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
1008 int empty = 0, newid;
1009 int r = add_bucket(0, 0,
1010 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
1011 if (r < 0) {
1012 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
1013 << dendl;
1014 return r;
1015 }
1016 set_item_name(newid, q->second);
1017
1018 cur = newid;
1019 continue;
1020 }
1021
1022 // add to an existing bucket
1023 int id = get_item_id(q->second);
1024 if (!bucket_exists(id)) {
1025 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
1026 return -EINVAL;
1027 }
1028
1029 // check that we aren't creating a cycle.
1030 if (subtree_contains(id, cur)) {
1031 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
1032 << id << dendl;
1033 return -EINVAL;
1034 }
1035
1036 // we have done sanity check above
1037 crush_bucket *b = get_bucket(id);
1038
1039 if (p->first != b->type) {
1040 ldout(cct, 1) << "insert_item existing bucket has type "
1041 << "'" << type_map[b->type] << "' != "
1042 << "'" << type_map[p->first] << "'" << dendl;
1043 return -EINVAL;
1044 }
1045
1046 // are we forming a loop?
1047 if (subtree_contains(cur, b->id)) {
1048 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
1049 << "; cannot form loop" << dendl;
1050 return -ELOOP;
1051 }
1052
1053 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
1054 << " to bucket " << id << dendl;
1055 int r = bucket_add_item(b, cur, 0);
1056 assert (!r);
1057 break;
1058 }
1059
1060 // adjust the item's weight in location
1061 if (adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
1062 if (item >= crush->max_devices) {
1063 crush->max_devices = item + 1;
1064 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
1065 << dendl;
1066 }
1067 r = rebuild_roots_with_classes();
1068 if (r < 0) {
1069 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
1070 << cpp_strerror(r) << dendl;
1071 return r;
1072 }
1073 return 0;
1074 }
1075
1076 ldout(cct, 1) << "error: didn't find anywhere to add item " << item
1077 << " in " << loc << dendl;
1078 return -EINVAL;
1079 }
1080
1081
1082 int CrushWrapper::move_bucket(
1083 CephContext *cct, int id, const map<string,string>& loc)
1084 {
1085 // sorry this only works for buckets
1086 if (id >= 0)
1087 return -EINVAL;
1088
1089 if (!item_exists(id))
1090 return -ENOENT;
1091
1092 // get the name of the bucket we are trying to move for later
1093 string id_name = get_item_name(id);
1094
1095 // detach the bucket
1096 int bucket_weight = detach_bucket(cct, id);
1097
1098 // insert the bucket back into the hierarchy
1099 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1100 }
1101
1102 int CrushWrapper::detach_bucket(CephContext *cct, int item)
1103 {
1104 if (!crush)
1105 return (-EINVAL);
1106
1107 if (item >= 0)
1108 return (-EINVAL);
1109
1110 // check that the bucket that we want to detach exists
1111 assert(bucket_exists(item));
1112
1113 // get the bucket's weight
1114 crush_bucket *b = get_bucket(item);
1115 unsigned bucket_weight = b->weight;
1116
1117 // get where the bucket is located
1118 pair<string, string> bucket_location = get_immediate_parent(item);
1119
1120 // get the id of the parent bucket
1121 int parent_id = get_item_id(bucket_location.second);
1122
1123 // get the parent bucket
1124 crush_bucket *parent_bucket = get_bucket(parent_id);
1125
1126 if (!IS_ERR(parent_bucket)) {
1127 // zero out the bucket weight
1128 bucket_adjust_item_weight(cct, parent_bucket, item, 0);
1129 adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
1130 for (auto& p : choose_args) {
1131 // weight down each weight-set to 0 before we remove the item
1132 vector<int> weightv(get_choose_args_positions(p.second), 0);
1133 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
1134 }
1135
1136 // remove the bucket from the parent
1137 bucket_remove_item(parent_bucket, item);
1138 } else if (PTR_ERR(parent_bucket) != -ENOENT) {
1139 return PTR_ERR(parent_bucket);
1140 }
1141
1142 // check that we're happy
1143 int test_weight = 0;
1144 map<string,string> test_location;
1145 test_location[ bucket_location.first ] = (bucket_location.second);
1146
1147 bool successful_detach = !(check_item_loc(cct, item, test_location,
1148 &test_weight));
1149 assert(successful_detach);
1150 assert(test_weight == 0);
1151
1152 return bucket_weight;
1153 }
1154
1155 int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
1156 {
1157 if (src >= 0 || dst >= 0)
1158 return -EINVAL;
1159 if (!item_exists(src) || !item_exists(dst))
1160 return -EINVAL;
1161 crush_bucket *a = get_bucket(src);
1162 crush_bucket *b = get_bucket(dst);
1163 unsigned aw = a->weight;
1164 unsigned bw = b->weight;
1165
1166 // swap weights
1167 adjust_item_weight(cct, a->id, bw);
1168 adjust_item_weight(cct, b->id, aw);
1169
1170 // swap items
1171 map<int,unsigned> tmp;
1172 unsigned as = a->size;
1173 unsigned bs = b->size;
1174 for (unsigned i = 0; i < as; ++i) {
1175 int item = a->items[0];
1176 int itemw = crush_get_bucket_item_weight(a, 0);
1177 tmp[item] = itemw;
1178 bucket_remove_item(a, item);
1179 }
1180 assert(a->size == 0);
1181 assert(b->size == bs);
1182 for (unsigned i = 0; i < bs; ++i) {
1183 int item = b->items[0];
1184 int itemw = crush_get_bucket_item_weight(b, 0);
1185 bucket_remove_item(b, item);
1186 bucket_add_item(a, item, itemw);
1187 }
1188 assert(a->size == bs);
1189 assert(b->size == 0);
1190 for (auto t : tmp) {
1191 bucket_add_item(b, t.first, t.second);
1192 }
1193 assert(a->size == bs);
1194 assert(b->size == as);
1195
1196 // swap names
1197 swap_names(src, dst);
1198 return rebuild_roots_with_classes();
1199 }
1200
1201 int CrushWrapper::link_bucket(
1202 CephContext *cct, int id, const map<string,string>& loc)
1203 {
1204 // sorry this only works for buckets
1205 if (id >= 0)
1206 return -EINVAL;
1207
1208 if (!item_exists(id))
1209 return -ENOENT;
1210
1211 // get the name of the bucket we are trying to move for later
1212 string id_name = get_item_name(id);
1213
1214 crush_bucket *b = get_bucket(id);
1215 unsigned bucket_weight = b->weight;
1216
1217 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1218 }
1219
1220 int CrushWrapper::create_or_move_item(
1221 CephContext *cct, int item, float weight, string name,
1222 const map<string,string>& loc) // typename -> bucketname
1223 {
1224 int ret = 0;
1225 int old_iweight;
1226
1227 if (!is_valid_crush_name(name))
1228 return -EINVAL;
1229
1230 if (check_item_loc(cct, item, loc, &old_iweight)) {
1231 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
1232 << dendl;
1233 } else {
1234 if (_search_item_exists(item)) {
1235 weight = get_item_weightf(item);
1236 ldout(cct, 10) << "create_or_move_item " << item
1237 << " exists with weight " << weight << dendl;
1238 remove_item(cct, item, true);
1239 }
1240 ldout(cct, 5) << "create_or_move_item adding " << item
1241 << " weight " << weight
1242 << " at " << loc << dendl;
1243 ret = insert_item(cct, item, weight, name, loc);
1244 if (ret == 0)
1245 ret = 1; // changed
1246 }
1247 return ret;
1248 }
1249
1250 int CrushWrapper::update_item(
1251 CephContext *cct, int item, float weight, string name,
1252 const map<string,string>& loc) // typename -> bucketname
1253 {
1254 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1255 << " name " << name << " loc " << loc << dendl;
1256 int ret = 0;
1257
1258 if (!is_valid_crush_name(name))
1259 return -EINVAL;
1260
1261 if (!is_valid_crush_loc(cct, loc))
1262 return -EINVAL;
1263
1264 ret = validate_weightf(weight);
1265 if (ret < 0) {
1266 return ret;
1267 }
1268
1269 // compare quantized (fixed-point integer) weights!
1270 int iweight = (int)(weight * (float)0x10000);
1271 int old_iweight;
1272 if (check_item_loc(cct, item, loc, &old_iweight)) {
1273 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1274 if (old_iweight != iweight) {
1275 ldout(cct, 5) << "update_item " << item << " adjusting weight "
1276 << ((float)old_iweight/(float)0x10000) << " -> " << weight
1277 << dendl;
1278 adjust_item_weight_in_loc(cct, item, iweight, loc);
1279 ret = 1;
1280 }
1281 if (get_item_name(item) != name) {
1282 ldout(cct, 5) << "update_item setting " << item << " name to " << name
1283 << dendl;
1284 set_item_name(item, name);
1285 ret = 1;
1286 }
1287 } else {
1288 if (item_exists(item)) {
1289 remove_item(cct, item, true);
1290 }
1291 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1292 << " at " << loc << dendl;
1293 ret = insert_item(cct, item, weight, name, loc);
1294 if (ret == 0)
1295 ret = 1; // changed
1296 }
1297 return ret;
1298 }
1299
1300 int CrushWrapper::get_item_weight(int id) const
1301 {
1302 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1303 crush_bucket *b = crush->buckets[bidx];
1304 if (b == NULL)
1305 continue;
1306 if (b->id == id)
1307 return b->weight;
1308 for (unsigned i = 0; i < b->size; i++)
1309 if (b->items[i] == id)
1310 return crush_get_bucket_item_weight(b, i);
1311 }
1312 return -ENOENT;
1313 }
1314
1315 int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1316 {
1317 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1318
1319 int bid = get_item_id(l->second);
1320 if (!bucket_exists(bid))
1321 continue;
1322 crush_bucket *b = get_bucket(bid);
1323 for (unsigned int i = 0; i < b->size; i++) {
1324 if (b->items[i] == id) {
1325 return crush_get_bucket_item_weight(b, i);
1326 }
1327 }
1328 }
1329 return -ENOENT;
1330 }
1331
1332 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
1333 {
1334 ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
1335 int changed = 0;
1336 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1337 crush_bucket *b = crush->buckets[bidx];
1338 if (b == 0)
1339 continue;
1340 for (unsigned i = 0; i < b->size; i++) {
1341 if (b->items[i] == id) {
1342 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1343 ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff
1344 << " in bucket " << bidx << dendl;
1345 adjust_item_weight(cct, -1 - bidx, b->weight);
1346 changed++;
1347 }
1348 }
1349 }
1350 if (!changed)
1351 return -ENOENT;
1352 return changed;
1353 }
1354
1355 int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
1356 {
1357 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
1358 << " in " << loc << dendl;
1359 int changed = 0;
1360
1361 for (auto l = loc.begin(); l != loc.end(); ++l) {
1362 int bid = get_item_id(l->second);
1363 if (!bucket_exists(bid))
1364 continue;
1365 crush_bucket *b = get_bucket(bid);
1366 for (unsigned int i = 0; i < b->size; i++) {
1367 if (b->items[i] == id) {
1368 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1369 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff
1370 << " in bucket " << bid << dendl;
1371 adjust_item_weight(cct, bid, b->weight);
1372 changed++;
1373 }
1374 }
1375 }
1376 if (!changed)
1377 return -ENOENT;
1378 return changed;
1379 }
1380
1381 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
1382 {
1383 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1384 crush_bucket *b = get_bucket(id);
1385 if (IS_ERR(b))
1386 return PTR_ERR(b);
1387 int changed = 0;
1388 list<crush_bucket*> q;
1389 q.push_back(b);
1390 while (!q.empty()) {
1391 b = q.front();
1392 q.pop_front();
1393 int local_changed = 0;
1394 for (unsigned i=0; i<b->size; ++i) {
1395 int n = b->items[i];
1396 if (n >= 0) {
1397 bucket_adjust_item_weight(cct, b, n, weight);
1398 ++changed;
1399 ++local_changed;
1400 } else {
1401 crush_bucket *sub = get_bucket(n);
1402 if (IS_ERR(sub))
1403 continue;
1404 q.push_back(sub);
1405 }
1406 }
1407 if (local_changed) {
1408 adjust_item_weight(cct, b->id, b->weight);
1409 }
1410 }
1411 return changed;
1412 }
1413
1414 bool CrushWrapper::check_item_present(int id) const
1415 {
1416 bool found = false;
1417
1418 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1419 crush_bucket *b = crush->buckets[bidx];
1420 if (b == 0)
1421 continue;
1422 for (unsigned i = 0; i < b->size; i++)
1423 if (b->items[i] == id)
1424 found = true;
1425 }
1426 return found;
1427 }
1428
1429
1430 pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret)
1431 {
1432
1433 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1434 crush_bucket *b = crush->buckets[bidx];
1435 if (b == 0)
1436 continue;
1437 if (is_shadow_item(b->id))
1438 continue;
1439 for (unsigned i = 0; i < b->size; i++)
1440 if (b->items[i] == id) {
1441 string parent_id = name_map[b->id];
1442 string parent_bucket_type = type_map[b->type];
1443 if (_ret)
1444 *_ret = 0;
1445 return make_pair(parent_bucket_type, parent_id);
1446 }
1447 }
1448
1449 if (_ret)
1450 *_ret = -ENOENT;
1451
1452 return pair<string, string>();
1453 }
1454
1455 int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1456 {
1457 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1458 crush_bucket *b = crush->buckets[bidx];
1459 if (b == 0)
1460 continue;
1461 if (is_shadow_item(b->id))
1462 continue;
1463 for (unsigned i = 0; i < b->size; i++) {
1464 if (b->items[i] == id) {
1465 *parent = b->id;
1466 return 0;
1467 }
1468 }
1469 }
1470 return -ENOENT;
1471 }
1472
1473 int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
1474 {
1475 if (rule < 0) {
1476 // no rule specified
1477 do {
1478 int r = get_immediate_parent_id(item, &item);
1479 if (r < 0) {
1480 return 0;
1481 }
1482 } while (get_bucket_type(item) != type);
1483 return item;
1484 }
1485 set<int> roots;
1486 find_takes_by_rule(rule, &roots);
1487 for (auto root : roots) {
1488 set<int> candidates;
1489 get_children_of_type(root, type, &candidates, false);
1490 for (auto candidate : candidates) {
1491 if (subtree_contains(candidate, item)) {
1492 // note that here we assure that no two different buckets
1493 // from a single crush rule will share a same device,
1494 // which should generally be true.
1495 return candidate;
1496 }
1497 }
1498 }
1499 return 0; // not found
1500 }
1501
1502 int CrushWrapper::rename_class(const string& srcname, const string& dstname)
1503 {
1504 auto i = class_rname.find(srcname);
1505 if (i == class_rname.end())
1506 return -ENOENT;
1507 auto j = class_rname.find(dstname);
1508 if (j != class_rname.end())
1509 return -EEXIST;
1510
1511 int class_id = i->second;
1512 assert(class_name.count(class_id));
1513 // rename any shadow buckets of old class name
1514 for (auto &it: class_map) {
1515 if (it.first < 0 && it.second == class_id) {
1516 string old_name = get_item_name(it.first);
1517 size_t pos = old_name.find("~");
1518 assert(pos != string::npos);
1519 string name_no_class = old_name.substr(0, pos);
1520 string old_class_name = old_name.substr(pos + 1);
1521 assert(old_class_name == srcname);
1522 string new_name = name_no_class + "~" + dstname;
1523 // we do not use set_item_name
1524 // because the name is intentionally invalid
1525 name_map[it.first] = new_name;
1526 have_rmaps = false;
1527 }
1528 }
1529
1530 // rename class
1531 class_rname.erase(srcname);
1532 class_name.erase(class_id);
1533 class_rname[dstname] = class_id;
1534 class_name[class_id] = dstname;
1535 return 0;
1536 }
1537
1538 int CrushWrapper::populate_classes(
1539 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
1540 {
1541 // build set of previous used shadow ids
1542 set<int32_t> used_ids;
1543 for (auto& p : old_class_bucket) {
1544 for (auto& q : p.second) {
1545 used_ids.insert(q.second);
1546 }
1547 }
1548 // accumulate weight values for each carg and bucket as we go. because it is
1549 // depth first, we will have the nested bucket weights we need when we
1550 // finish constructing the containing buckets.
1551 map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> [bucket weight for each position]
1552 set<int> roots;
1553 find_nonshadow_roots(&roots);
1554 for (auto &r : roots) {
1555 if (r >= 0)
1556 continue;
1557 for (auto &c : class_name) {
1558 int clone;
1559 int res = device_class_clone(r, c.first, old_class_bucket, used_ids,
1560 &clone, &cmap_item_weight);
1561 if (res < 0)
1562 return res;
1563 }
1564 }
1565 return 0;
1566 }
1567
1568 int CrushWrapper::trim_roots_with_class()
1569 {
1570 set<int> roots;
1571 find_shadow_roots(&roots);
1572 for (auto &r : roots) {
1573 if (r >= 0)
1574 continue;
1575 int res = remove_root(r);
1576 if (res)
1577 return res;
1578 }
1579 // there is no need to reweight because we only remove from the
1580 // root and down
1581 return 0;
1582 }
1583
1584 int32_t CrushWrapper::_alloc_class_id() const {
1585 if (class_name.empty()) {
1586 return 0;
1587 }
1588 int32_t class_id = class_name.rbegin()->first + 1;
1589 if (class_id >= 0) {
1590 return class_id;
1591 }
1592 // wrapped, pick a random start and do exhaustive search
1593 uint32_t upperlimit = numeric_limits<int32_t>::max();
1594 upperlimit++;
1595 class_id = rand() % upperlimit;
1596 const auto start = class_id;
1597 do {
1598 if (!class_name.count(class_id)) {
1599 return class_id;
1600 } else {
1601 class_id++;
1602 if (class_id < 0) {
1603 class_id = 0;
1604 }
1605 }
1606 } while (class_id != start);
1607 assert(0 == "no available class id");
1608 }
1609
1610 int CrushWrapper::set_subtree_class(
1611 const string& subtree,
1612 const string& new_class)
1613 {
1614 if (!name_exists(subtree)) {
1615 return -ENOENT;
1616 }
1617
1618 int new_class_id = -1;
1619 if (class_exists(new_class)) {
1620 new_class_id = get_class_id(new_class);
1621 } else {
1622 for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) ;
1623 class_name[new_class_id] = new_class;
1624 class_rname[new_class] = new_class_id;
1625 }
1626
1627 int id = get_item_id(subtree);
1628 list<int> q = { id };
1629 while (!q.empty()) {
1630 int id = q.front();
1631 q.pop_front();
1632 crush_bucket *b = get_bucket(id);
1633 if (IS_ERR(b)) {
1634 return PTR_ERR(b);
1635 }
1636 for (unsigned i = 0; i < b->size; ++i) {
1637 int item = b->items[i];
1638 if (item >= 0) {
1639 class_map[item] = new_class_id;
1640 } else {
1641 q.push_back(item);
1642 }
1643 }
1644 }
1645 return 0;
1646 }
1647
1648 int CrushWrapper::reclassify(
1649 CephContext *cct,
1650 ostream& out,
1651 const map<string,string>& classify_root,
1652 const map<string,pair<string,string>>& classify_bucket
1653 )
1654 {
1655 map<int,string> reclassified_bucket; // orig_id -> class
1656
1657 // classify_root
1658 for (auto& i : classify_root) {
1659 string root = i.first;
1660 if (!name_exists(root)) {
1661 out << "root " << root << " does not exist" << std::endl;
1662 return -EINVAL;
1663 }
1664 int root_id = get_item_id(root);
1665 string new_class = i.second;
1666 int new_class_id = -1;
1667 out << "classify_root " << root << " (" << root_id
1668 << ") as " << new_class << std::endl;
1669 if (class_exists(new_class)) {
1670 new_class_id = get_class_id(new_class);
1671 out << " new class " << new_class << " exists as " << new_class_id
1672 << std::endl;
1673 } else {
1674 for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) ;
1675 class_name[new_class_id] = new_class;
1676 class_rname[new_class] = new_class_id;
1677 out << " created new class " << new_class << " as " << new_class_id
1678 << std::endl;
1679 }
1680
1681 // validate rules
1682 for (unsigned j = 0; j < crush->max_rules; j++) {
1683 if (crush->rules[j]) {
1684 auto rule = crush->rules[j];
1685 for (unsigned k = 0; k < rule->len; ++k) {
1686 if (rule->steps[k].op == CRUSH_RULE_TAKE) {
1687 int step_item = get_rule_arg1(j, k);
1688 int original_item;
1689 int c;
1690 int res = split_id_class(step_item, &original_item, &c);
1691 if (res < 0)
1692 return res;
1693 if (c >= 0) {
1694 if (original_item == root_id) {
1695 out << " rule " << j << " includes take on root "
1696 << root << " class " << c << std::endl;
1697 return -EINVAL;
1698 }
1699 }
1700 }
1701 }
1702 }
1703 }
1704
1705 // rebuild new buckets for root
1706 //cout << "before class_bucket: " << class_bucket << std::endl;
1707 map<int,int> renumber;
1708 list<int> q;
1709 q.push_back(root_id);
1710 while (!q.empty()) {
1711 int id = q.front();
1712 q.pop_front();
1713 crush_bucket *bucket = get_bucket(id);
1714 if (IS_ERR(bucket)) {
1715 out << "cannot find bucket " << id
1716 << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl;
1717 return PTR_ERR(bucket);
1718 }
1719
1720 // move bucket
1721 int new_id = get_new_bucket_id();
1722 out << " renumbering bucket " << id << " -> " << new_id << std::endl;
1723 renumber[id] = new_id;
1724 crush->buckets[-1-new_id] = bucket;
1725 bucket->id = new_id;
1726 crush->buckets[-1-id] = crush_make_bucket(crush,
1727 bucket->alg,
1728 bucket->hash,
1729 bucket->type,
1730 0, NULL, NULL);
1731 crush->buckets[-1-id]->id = id;
1732 for (auto& i : choose_args) {
1733 i.second.args[-1-new_id] = i.second.args[-1-id];
1734 memset(&i.second.args[-1-id], 0, sizeof(i.second.args[0]));
1735 }
1736 class_bucket.erase(id);
1737 class_bucket[new_id][new_class_id] = id;
1738 name_map[new_id] = string(get_item_name(id));
1739 name_map[id] = string(get_item_name(id)) + "~" + new_class;
1740
1741 for (unsigned j = 0; j < bucket->size; ++j) {
1742 if (bucket->items[j] < 0) {
1743 q.push_front(bucket->items[j]);
1744 } else {
1745 // we don't reclassify the device here; if the users wants that,
1746 // they can pass --set-subtree-class separately.
1747 }
1748 }
1749 }
1750 //cout << "mid class_bucket: " << class_bucket << std::endl;
1751
1752 for (int i = 0; i < crush->max_buckets; ++i) {
1753 crush_bucket *b = crush->buckets[i];
1754 if (!b) {
1755 continue;
1756 }
1757 for (unsigned j = 0; j < b->size; ++j) {
1758 if (renumber.count(b->items[j])) {
1759 b->items[j] = renumber[b->items[j]];
1760 }
1761 }
1762 }
1763
1764 int r = rebuild_roots_with_classes();
1765 if (r < 0) {
1766 out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
1767 << std::endl;
1768 return r;
1769 }
1770 //cout << "final class_bucket: " << class_bucket << std::endl;
1771 }
1772
1773 // classify_bucket
1774 map<int,int> send_to; // source bucket -> dest bucket
1775 map<int,map<int,int>> new_class_bucket;
1776 map<int,string> new_bucket_names;
1777 map<int,map<string,string>> new_buckets;
1778 map<string,int> new_bucket_by_name;
1779 for (auto& i : classify_bucket) {
1780 const string& match = i.first; // prefix% or %suffix
1781 const string& new_class = i.second.first;
1782 const string& default_parent = i.second.second;
1783 if (!name_exists(default_parent)) {
1784 out << "default parent " << default_parent << " does not exist"
1785 << std::endl;
1786 return -EINVAL;
1787 }
1788 int default_parent_id = get_item_id(default_parent);
1789 crush_bucket *default_parent_bucket = get_bucket(default_parent_id);
1790 assert(default_parent_bucket);
1791 string default_parent_type_name = get_type_name(default_parent_bucket->type);
1792
1793 out << "classify_bucket " << match << " as " << new_class
1794 << " default bucket " << default_parent
1795 << " (" << default_parent_type_name << ")" << std::endl;
1796
1797 int new_class_id = -1;
1798 if (class_exists(new_class)) {
1799 new_class_id = get_class_id(new_class);
1800 out << " new class " << new_class << " exists as " << new_class_id
1801 << std::endl;
1802 } else {
1803 for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) {
1804 }
1805 class_name[new_class_id] = new_class;
1806 class_rname[new_class] = new_class_id;
1807 out << " created new class " << new_class << " as " << new_class_id
1808 << std::endl;
1809 }
1810
1811 for (int j = 0; j < crush->max_buckets; ++j) {
1812 crush_bucket *b = crush->buckets[j];
1813 if (!b || is_shadow_item(b->id)) {
1814 continue;
1815 }
1816 string name = get_item_name(b->id);
1817 if (name.length() < match.length()) {
1818 continue;
1819 }
1820 string basename;
1821 if (match[0] == '%') {
1822 if (match.substr(1) != name.substr(name.size() - match.size() + 1)) {
1823 continue;
1824 }
1825 basename = name.substr(0, name.size() - match.size() + 1);
1826 } else if (match[match.size() - 1] == '%') {
1827 if (match.substr(0, match.size() - 1) !=
1828 name.substr(0, match.size() - 1)) {
1829 continue;
1830 }
1831 basename = name.substr(match.size() - 1);
1832 } else if (match == name) {
1833 basename = default_parent;
1834 } else {
1835 continue;
1836 }
1837 cout << "match " << match << " to " << name << " basename " << basename
1838 << std::endl;
1839 // look up or create basename bucket
1840 int base_id;
1841 if (name_exists(basename)) {
1842 base_id = get_item_id(basename);
1843 cout << " have base " << base_id << std::endl;
1844 } else if (new_bucket_by_name.count(basename)) {
1845 base_id = new_bucket_by_name[basename];
1846 cout << " already creating base " << base_id << std::endl;
1847 } else {
1848 base_id = get_new_bucket_id();
1849 crush->buckets[-1-base_id] = crush_make_bucket(crush,
1850 b->alg,
1851 b->hash,
1852 b->type,
1853 0, NULL, NULL);
1854 crush->buckets[-1-base_id]->id = base_id;
1855 name_map[base_id] = basename;
1856 new_bucket_by_name[basename] = base_id;
1857 cout << " created base " << base_id << std::endl;
1858
1859 new_buckets[base_id][default_parent_type_name] = default_parent;
1860 }
1861 send_to[b->id] = base_id;
1862 new_class_bucket[base_id][new_class_id] = b->id;
1863 new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id);
1864
1865 // make sure devices are classified
1866 for (unsigned i = 0; i < b->size; ++i) {
1867 int item = b->items[i];
1868 if (item >= 0) {
1869 class_map[item] = new_class_id;
1870 }
1871 }
1872 }
1873 }
1874
1875 // no name_exists() works below,
1876 have_rmaps = false;
1877
1878 // copy items around
1879 //cout << "send_to " << send_to << std::endl;
1880 set<int> roots;
1881 find_roots(&roots);
1882 for (auto& i : send_to) {
1883 crush_bucket *from = get_bucket(i.first);
1884 crush_bucket *to = get_bucket(i.second);
1885 cout << "moving items from " << from->id << " (" << get_item_name(from->id)
1886 << ") to " << to->id << " (" << get_item_name(to->id) << ")"
1887 << std::endl;
1888 for (unsigned j = 0; j < from->size; ++j) {
1889 int item = from->items[j];
1890 int r;
1891 map<string,string> to_loc;
1892 to_loc[get_type_name(to->type)] = get_item_name(to->id);
1893 if (item >= 0) {
1894 if (subtree_contains(to->id, item)) {
1895 continue;
1896 }
1897 map<string,string> from_loc;
1898 from_loc[get_type_name(from->type)] = get_item_name(from->id);
1899 auto w = get_item_weightf_in_loc(item, from_loc);
1900 r = insert_item(cct, item,
1901 w,
1902 get_item_name(item),
1903 to_loc);
1904 } else {
1905 if (!send_to.count(item)) {
1906 lderr(cct) << "item " << item << " in bucket " << from->id
1907 << " is not also a reclassified bucket" << dendl;
1908 return -EINVAL;
1909 }
1910 int newitem = send_to[item];
1911 if (subtree_contains(to->id, newitem)) {
1912 continue;
1913 }
1914 r = link_bucket(cct, newitem, to_loc);
1915 }
1916 if (r != 0) {
1917 cout << __func__ << " err from insert_item: " << cpp_strerror(r)
1918 << std::endl;
1919 return r;
1920 }
1921 }
1922 }
1923
1924 // make sure new buckets have parents
1925 for (auto& i : new_buckets) {
1926 int parent;
1927 if (get_immediate_parent_id(i.first, &parent) < 0) {
1928 cout << "new bucket " << i.first << " missing parent, adding at "
1929 << i.second << std::endl;
1930 int r = link_bucket(cct, i.first, i.second);
1931 if (r != 0) {
1932 cout << __func__ << " err from insert_item: " << cpp_strerror(r)
1933 << std::endl;
1934 return r;
1935 }
1936 }
1937 }
1938
1939 // set class mappings
1940 //cout << "pre class_bucket: " << class_bucket << std::endl;
1941 for (auto& i : new_class_bucket) {
1942 for (auto& j : i.second) {
1943 class_bucket[i.first][j.first] = j.second;
1944 }
1945
1946 }
1947 //cout << "post class_bucket: " << class_bucket << std::endl;
1948 for (auto& i : new_bucket_names) {
1949 name_map[i.first] = i.second;
1950 }
1951
1952 int r = rebuild_roots_with_classes();
1953 if (r < 0) {
1954 out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
1955 << std::endl;
1956 return r;
1957 }
1958 //cout << "final class_bucket: " << class_bucket << std::endl;
1959
1960 return 0;
1961 }
1962
1963 int CrushWrapper::get_new_bucket_id()
1964 {
1965 int id = -1;
1966 while (crush->buckets[-1-id] &&
1967 -1-id < crush->max_buckets) {
1968 id--;
1969 }
1970 if (-1-id == crush->max_buckets) {
1971 ++crush->max_buckets;
1972 crush->buckets = (struct crush_bucket**)realloc(
1973 crush->buckets,
1974 sizeof(crush->buckets[0]) * crush->max_buckets);
1975 for (auto& i : choose_args) {
1976 assert(i.second.size == crush->max_buckets - 1);
1977 ++i.second.size;
1978 i.second.args = (struct crush_choose_arg*)realloc(
1979 i.second.args,
1980 sizeof(i.second.args[0]) * i.second.size);
1981 }
1982 }
1983 return id;
1984 }
1985
1986 void CrushWrapper::reweight(CephContext *cct)
1987 {
1988 set<int> roots;
1989 find_nonshadow_roots(&roots);
1990 for (auto id : roots) {
1991 if (id >= 0)
1992 continue;
1993 crush_bucket *b = get_bucket(id);
1994 ldout(cct, 5) << "reweight root bucket " << id << dendl;
1995 int r = crush_reweight_bucket(crush, b);
1996 assert(r == 0);
1997
1998 for (auto& i : choose_args) {
1999 //cout << "carg " << i.first << std::endl;
2000 vector<uint32_t> w; // discard top-level weights
2001 reweight_bucket(b, i.second, &w);
2002 }
2003 }
2004 int r = rebuild_roots_with_classes();
2005 ceph_assert(r == 0);
2006 }
2007
2008 void CrushWrapper::reweight_bucket(
2009 crush_bucket *b,
2010 crush_choose_arg_map& arg_map,
2011 vector<uint32_t> *weightv)
2012 {
2013 int idx = -1 - b->id;
2014 unsigned npos = arg_map.args[idx].weight_set_positions;
2015 //cout << __func__ << " " << b->id << " npos " << npos << std::endl;
2016 weightv->resize(npos);
2017 for (unsigned i = 0; i < b->size; ++i) {
2018 int item = b->items[i];
2019 if (item >= 0) {
2020 for (unsigned pos = 0; pos < npos; ++pos) {
2021 (*weightv)[pos] += arg_map.args[idx].weight_set->weights[i];
2022 }
2023 } else {
2024 vector<uint32_t> subw(npos);
2025 crush_bucket *sub = get_bucket(item);
2026 assert(sub);
2027 reweight_bucket(sub, arg_map, &subw);
2028 for (unsigned pos = 0; pos < npos; ++pos) {
2029 (*weightv)[pos] += subw[pos];
2030 // strash the real bucket weight as the weights for this reference
2031 arg_map.args[idx].weight_set->weights[i] = subw[pos];
2032 }
2033 }
2034 }
2035 //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl;
2036 }
2037
2038 int CrushWrapper::add_simple_rule_at(
2039 string name, string root_name,
2040 string failure_domain_name,
2041 string device_class,
2042 string mode, int rule_type,
2043 int rno,
2044 ostream *err)
2045 {
2046 if (rule_exists(name)) {
2047 if (err)
2048 *err << "rule " << name << " exists";
2049 return -EEXIST;
2050 }
2051 if (rno >= 0) {
2052 if (rule_exists(rno)) {
2053 if (err)
2054 *err << "rule with ruleno " << rno << " exists";
2055 return -EEXIST;
2056 }
2057 if (ruleset_exists(rno)) {
2058 if (err)
2059 *err << "ruleset " << rno << " exists";
2060 return -EEXIST;
2061 }
2062 } else {
2063 for (rno = 0; rno < get_max_rules(); rno++) {
2064 if (!rule_exists(rno) && !ruleset_exists(rno))
2065 break;
2066 }
2067 }
2068 if (!name_exists(root_name)) {
2069 if (err)
2070 *err << "root item " << root_name << " does not exist";
2071 return -ENOENT;
2072 }
2073 int root = get_item_id(root_name);
2074 int type = 0;
2075 if (failure_domain_name.length()) {
2076 type = get_type_id(failure_domain_name);
2077 if (type < 0) {
2078 if (err)
2079 *err << "unknown type " << failure_domain_name;
2080 return -EINVAL;
2081 }
2082 }
2083 if (device_class.size()) {
2084 if (!class_exists(device_class)) {
2085 if (err)
2086 *err << "device class " << device_class << " does not exist";
2087 return -EINVAL;
2088 }
2089 int c = get_class_id(device_class);
2090 if (class_bucket.count(root) == 0 ||
2091 class_bucket[root].count(c) == 0) {
2092 if (err)
2093 *err << "root " << root_name << " has no devices with class "
2094 << device_class;
2095 return -EINVAL;
2096 }
2097 root = class_bucket[root][c];
2098 }
2099 if (mode != "firstn" && mode != "indep") {
2100 if (err)
2101 *err << "unknown mode " << mode;
2102 return -EINVAL;
2103 }
2104
2105 int steps = 3;
2106 if (mode == "indep")
2107 steps = 5;
2108 int min_rep = mode == "firstn" ? 1 : 3;
2109 int max_rep = mode == "firstn" ? 10 : 20;
2110 //set the ruleset the same as rule_id(rno)
2111 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
2112 assert(rule);
2113 int step = 0;
2114 if (mode == "indep") {
2115 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
2116 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
2117 }
2118 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
2119 if (type)
2120 crush_rule_set_step(rule, step++,
2121 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
2122 CRUSH_RULE_CHOOSELEAF_INDEP,
2123 CRUSH_CHOOSE_N,
2124 type);
2125 else
2126 crush_rule_set_step(rule, step++,
2127 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
2128 CRUSH_RULE_CHOOSE_INDEP,
2129 CRUSH_CHOOSE_N,
2130 0);
2131 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
2132
2133 int ret = crush_add_rule(crush, rule, rno);
2134 if(ret < 0) {
2135 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
2136 return ret;
2137 }
2138 set_rule_name(rno, name);
2139 have_rmaps = false;
2140 return rno;
2141 }
2142
2143 int CrushWrapper::add_simple_rule(
2144 string name, string root_name,
2145 string failure_domain_name,
2146 string device_class,
2147 string mode, int rule_type,
2148 ostream *err)
2149 {
2150 return add_simple_rule_at(name, root_name, failure_domain_name, device_class,
2151 mode,
2152 rule_type, -1, err);
2153 }
2154
2155 float CrushWrapper::_get_take_weight_osd_map(int root,
2156 map<int,float> *pmap) const
2157 {
2158 float sum = 0.0;
2159 list<int> q;
2160 q.push_back(root);
2161 //breadth first iterate the OSD tree
2162 while (!q.empty()) {
2163 int bno = q.front();
2164 q.pop_front();
2165 crush_bucket *b = crush->buckets[-1-bno];
2166 assert(b);
2167 for (unsigned j=0; j<b->size; ++j) {
2168 int item_id = b->items[j];
2169 if (item_id >= 0) { //it's an OSD
2170 float w = crush_get_bucket_item_weight(b, j);
2171 (*pmap)[item_id] = w;
2172 sum += w;
2173 } else { //not an OSD, expand the child later
2174 q.push_back(item_id);
2175 }
2176 }
2177 }
2178 return sum;
2179 }
2180
2181 void CrushWrapper::_normalize_weight_map(float sum,
2182 const map<int,float>& m,
2183 map<int,float> *pmap) const
2184 {
2185 for (auto& p : m) {
2186 map<int,float>::iterator q = pmap->find(p.first);
2187 if (q == pmap->end()) {
2188 (*pmap)[p.first] = p.second / sum;
2189 } else {
2190 q->second += p.second / sum;
2191 }
2192 }
2193 }
2194
2195 int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
2196 {
2197 map<int,float> m;
2198 float sum = _get_take_weight_osd_map(root, &m);
2199 _normalize_weight_map(sum, m, pmap);
2200 return 0;
2201 }
2202
2203 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
2204 map<int,float> *pmap) const
2205 {
2206 if (ruleno >= crush->max_rules)
2207 return -ENOENT;
2208 if (crush->rules[ruleno] == NULL)
2209 return -ENOENT;
2210 crush_rule *rule = crush->rules[ruleno];
2211
2212 // build a weight map for each TAKE in the rule, and then merge them
2213
2214 // FIXME: if there are multiple takes that place a different number of
2215 // objects we do not take that into account. (Also, note that doing this
2216 // right is also a function of the pool, since the crush rule
2217 // might choose 2 + choose 2 but pool size may only be 3.)
2218 for (unsigned i=0; i<rule->len; ++i) {
2219 map<int,float> m;
2220 float sum = 0;
2221 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
2222 int n = rule->steps[i].arg1;
2223 if (n >= 0) {
2224 m[n] = 1.0;
2225 sum = 1.0;
2226 } else {
2227 sum += _get_take_weight_osd_map(n, &m);
2228 }
2229 }
2230 _normalize_weight_map(sum, m, pmap);
2231 }
2232
2233 return 0;
2234 }
2235
2236 int CrushWrapper::remove_rule(int ruleno)
2237 {
2238 if (ruleno >= (int)crush->max_rules)
2239 return -ENOENT;
2240 if (crush->rules[ruleno] == NULL)
2241 return -ENOENT;
2242 crush_destroy_rule(crush->rules[ruleno]);
2243 crush->rules[ruleno] = NULL;
2244 rule_name_map.erase(ruleno);
2245 have_rmaps = false;
2246 return rebuild_roots_with_classes();
2247 }
2248
2249 int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
2250 {
2251 if (cct->_conf->osd_crush_update_weight_set) {
2252 unsigned position;
2253 for (position = 0; position < bucket->size; position++)
2254 if (bucket->items[position] == item)
2255 break;
2256 assert(position != bucket->size);
2257 for (auto &w : choose_args) {
2258 crush_choose_arg_map &arg_map = w.second;
2259 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
2260 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
2261 crush_weight_set *weight_set = &arg->weight_set[j];
2262 weight_set->weights[position] = weight;
2263 }
2264 }
2265 }
2266 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
2267 }
2268
2269 int CrushWrapper::add_bucket(
2270 int bucketno, int alg, int hash, int type, int size,
2271 int *items, int *weights, int *idout)
2272 {
2273 if (alg == 0) {
2274 alg = get_default_bucket_alg();
2275 if (alg == 0)
2276 return -EINVAL;
2277 }
2278 crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
2279 weights);
2280 assert(b);
2281 assert(idout);
2282 int r = crush_add_bucket(crush, bucketno, b, idout);
2283 int pos = -1 - *idout;
2284 for (auto& p : choose_args) {
2285 crush_choose_arg_map& cmap = p.second;
2286 if (cmap.args) {
2287 if ((int)cmap.size <= pos) {
2288 cmap.args = (crush_choose_arg*)realloc(
2289 cmap.args,
2290 sizeof(crush_choose_arg) * (pos + 1));
2291 assert(cmap.args);
2292 memset(&cmap.args[cmap.size], 0,
2293 sizeof(crush_choose_arg) * (pos + 1 - cmap.size));
2294 cmap.size = pos + 1;
2295 }
2296 } else {
2297 cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
2298 pos + 1);
2299 assert(cmap.args);
2300 cmap.size = pos + 1;
2301 }
2302 if (size > 0) {
2303 int positions = get_choose_args_positions(cmap);
2304 crush_choose_arg& carg = cmap.args[pos];
2305 carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
2306 size);
2307 carg.weight_set_positions = positions;
2308 for (int ppos = 0; ppos < positions; ++ppos) {
2309 carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
2310 carg.weight_set[ppos].size = size;
2311 for (int bpos = 0; bpos < size; ++bpos) {
2312 carg.weight_set[ppos].weights[bpos] = weights[bpos];
2313 }
2314 }
2315 }
2316 }
2317 return r;
2318 }
2319
2320 int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
2321 {
2322 __u32 new_size = bucket->size + 1;
2323 int r = crush_bucket_add_item(crush, bucket, item, weight);
2324 if (r < 0) {
2325 return r;
2326 }
2327 for (auto &w : choose_args) {
2328 crush_choose_arg_map &arg_map = w.second;
2329 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
2330 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
2331 crush_weight_set *weight_set = &arg->weight_set[j];
2332 weight_set->weights = (__u32*)realloc(weight_set->weights,
2333 new_size * sizeof(__u32));
2334 assert(weight_set->size + 1 == new_size);
2335 weight_set->weights[weight_set->size] = weight;
2336 weight_set->size = new_size;
2337 }
2338 if (arg->ids_size) {
2339 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
2340 assert(arg->ids_size + 1 == new_size);
2341 arg->ids[arg->ids_size] = item;
2342 arg->ids_size = new_size;
2343 }
2344 }
2345 return 0;
2346 }
2347
2348 int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
2349 {
2350 __u32 new_size = bucket->size - 1;
2351 unsigned position;
2352 for (position = 0; position < bucket->size; position++)
2353 if (bucket->items[position] == item)
2354 break;
2355 assert(position != bucket->size);
2356 int r = crush_bucket_remove_item(crush, bucket, item);
2357 if (r < 0) {
2358 return r;
2359 }
2360 for (auto &w : choose_args) {
2361 crush_choose_arg_map &arg_map = w.second;
2362 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
2363 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
2364 crush_weight_set *weight_set = &arg->weight_set[j];
2365 assert(weight_set->size - 1 == new_size);
2366 for (__u32 k = position; k < new_size; k++)
2367 weight_set->weights[k] = weight_set->weights[k+1];
2368 if (new_size) {
2369 weight_set->weights = (__u32*)realloc(weight_set->weights,
2370 new_size * sizeof(__u32));
2371 } else {
2372 weight_set->weights = NULL;
2373 }
2374 weight_set->size = new_size;
2375 }
2376 if (arg->ids_size) {
2377 assert(arg->ids_size - 1 == new_size);
2378 for (__u32 k = position; k < new_size; k++)
2379 arg->ids[k] = arg->ids[k+1];
2380 if (new_size) {
2381 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
2382 } else {
2383 arg->ids = NULL;
2384 }
2385 arg->ids_size = new_size;
2386 }
2387 }
2388 return 0;
2389 }
2390
2391 int CrushWrapper::bucket_set_alg(int bid, int alg)
2392 {
2393 crush_bucket *b = get_bucket(bid);
2394 if (!b) {
2395 return -ENOENT;
2396 }
2397 b->alg = alg;
2398 return 0;
2399 }
2400
2401 int CrushWrapper::update_device_class(int id,
2402 const string& class_name,
2403 const string& name,
2404 ostream *ss)
2405 {
2406 assert(item_exists(id));
2407 auto old_class_name = get_item_class(id);
2408 if (old_class_name && old_class_name != class_name) {
2409 *ss << "osd." << id << " has already bound to class '" << old_class_name
2410 << "', can not reset class to '" << class_name << "'; "
2411 << "use 'ceph osd crush rm-device-class <osd>' to "
2412 << "remove old class first";
2413 return -EBUSY;
2414 }
2415
2416 int class_id = get_or_create_class_id(class_name);
2417 if (id < 0) {
2418 *ss << name << " id " << id << " is negative";
2419 return -EINVAL;
2420 }
2421
2422 if (class_map.count(id) != 0 && class_map[id] == class_id) {
2423 *ss << name << " already set to class " << class_name;
2424 return 0;
2425 }
2426
2427 set_item_class(id, class_id);
2428
2429 int r = rebuild_roots_with_classes();
2430 if (r < 0)
2431 return r;
2432 return 1;
2433 }
2434
2435 int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
2436 {
2437 assert(ss);
2438 const char *name = get_item_name(id);
2439 if (!name) {
2440 *ss << "osd." << id << " does not have a name";
2441 return -ENOENT;
2442 }
2443
2444 const char *class_name = get_item_class(id);
2445 if (!class_name) {
2446 *ss << "osd." << id << " has not been bound to a specific class yet";
2447 return 0;
2448 }
2449 class_remove_item(id);
2450
2451 int r = rebuild_roots_with_classes();
2452 if (r < 0) {
2453 *ss << "unable to rebuild roots with class '" << class_name << "' "
2454 << "of osd." << id << ": " << cpp_strerror(r);
2455 return r;
2456 }
2457 return 0;
2458 }
2459
2460 int CrushWrapper::device_class_clone(
2461 int original_id, int device_class,
2462 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
2463 const std::set<int32_t>& used_ids,
2464 int *clone,
2465 map<int,map<int,vector<int>>> *cmap_item_weight)
2466 {
2467 const char *item_name = get_item_name(original_id);
2468 if (item_name == NULL)
2469 return -ECHILD;
2470 const char *class_name = get_class_name(device_class);
2471 if (class_name == NULL)
2472 return -EBADF;
2473 string copy_name = item_name + string("~") + class_name;
2474 if (name_exists(copy_name)) {
2475 *clone = get_item_id(copy_name);
2476 return 0;
2477 }
2478
2479 crush_bucket *original = get_bucket(original_id);
2480 assert(!IS_ERR(original));
2481 crush_bucket *copy = crush_make_bucket(crush,
2482 original->alg,
2483 original->hash,
2484 original->type,
2485 0, NULL, NULL);
2486 assert(copy);
2487
2488 vector<unsigned> item_orig_pos; // new item pos -> orig item pos
2489 for (unsigned i = 0; i < original->size; i++) {
2490 int item = original->items[i];
2491 int weight = crush_get_bucket_item_weight(original, i);
2492 if (item >= 0) {
2493 if (class_map.count(item) != 0 && class_map[item] == device_class) {
2494 int res = crush_bucket_add_item(crush, copy, item, weight);
2495 if (res)
2496 return res;
2497 } else {
2498 continue;
2499 }
2500 } else {
2501 int child_copy_id;
2502 int res = device_class_clone(item, device_class, old_class_bucket,
2503 used_ids, &child_copy_id,
2504 cmap_item_weight);
2505 if (res < 0)
2506 return res;
2507 crush_bucket *child_copy = get_bucket(child_copy_id);
2508 assert(!IS_ERR(child_copy));
2509 res = crush_bucket_add_item(crush, copy, child_copy_id,
2510 child_copy->weight);
2511 if (res)
2512 return res;
2513 }
2514 item_orig_pos.push_back(i);
2515 }
2516 assert(item_orig_pos.size() == copy->size);
2517
2518 int bno = 0;
2519 if (old_class_bucket.count(original_id) &&
2520 old_class_bucket.at(original_id).count(device_class)) {
2521 bno = old_class_bucket.at(original_id).at(device_class);
2522 } else {
2523 // pick a new shadow bucket id that is not used by the current map
2524 // *or* any previous shadow buckets.
2525 bno = -1;
2526 while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
2527 used_ids.count(bno)) {
2528 --bno;
2529 }
2530 }
2531 int res = crush_add_bucket(crush, bno, copy, clone);
2532 if (res)
2533 return res;
2534 assert(!bno || bno == *clone);
2535
2536 res = set_item_class(*clone, device_class);
2537 if (res < 0)
2538 return res;
2539
2540 // we do not use set_item_name because the name is intentionally invalid
2541 name_map[*clone] = copy_name;
2542 if (have_rmaps)
2543 name_rmap[copy_name] = *clone;
2544 class_bucket[original_id][device_class] = *clone;
2545
2546 // set up choose_args for the new bucket.
2547 for (auto& w : choose_args) {
2548 crush_choose_arg_map& cmap = w.second;
2549 if (-1-bno >= (int)cmap.size) {
2550 unsigned new_size = -1-bno + 1;
2551 cmap.args = (crush_choose_arg*)realloc(cmap.args,
2552 new_size * sizeof(cmap.args[0]));
2553 assert(cmap.args);
2554 memset(cmap.args + cmap.size, 0,
2555 (new_size - cmap.size) * sizeof(cmap.args[0]));
2556 cmap.size = new_size;
2557 }
2558 auto& o = cmap.args[-1-original_id];
2559 auto& n = cmap.args[-1-bno];
2560 n.ids_size = 0; // FIXME: implement me someday
2561 n.weight_set_positions = o.weight_set_positions;
2562 n.weight_set = (crush_weight_set*)calloc(
2563 n.weight_set_positions, sizeof(crush_weight_set));
2564 for (size_t s = 0; s < n.weight_set_positions; ++s) {
2565 n.weight_set[s].size = copy->size;
2566 n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
2567 }
2568 for (size_t s = 0; s < n.weight_set_positions; ++s) {
2569 vector<int> bucket_weights(n.weight_set_positions);
2570 for (size_t i = 0; i < copy->size; ++i) {
2571 int item = copy->items[i];
2572 if (item >= 0) {
2573 n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
2574 } else if ((*cmap_item_weight)[w.first].count(item)) {
2575 n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
2576 } else {
2577 n.weight_set[s].weights[i] = 0;
2578 }
2579 bucket_weights[s] += n.weight_set[s].weights[i];
2580 }
2581 (*cmap_item_weight)[w.first][bno] = bucket_weights;
2582 }
2583 }
2584 return 0;
2585 }
2586
2587 int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
2588 {
2589 assert(rules);
2590 rules->clear();
2591 if (!class_exists(class_name)) {
2592 return -ENOENT;
2593 }
2594 int class_id = get_class_id(class_name);
2595 for (unsigned i = 0; i < crush->max_rules; ++i) {
2596 crush_rule *r = crush->rules[i];
2597 if (!r)
2598 continue;
2599 for (unsigned j = 0; j < r->len; ++j) {
2600 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2601 int step_item = r->steps[j].arg1;
2602 int original_item;
2603 int c;
2604 int res = split_id_class(step_item, &original_item, &c);
2605 if (res < 0) {
2606 return res;
2607 }
2608 if (c != -1 && c == class_id) {
2609 rules->insert(i);
2610 break;
2611 }
2612 }
2613 }
2614 }
2615 return 0;
2616 }
2617
2618 // return rules that might reference the given osd
2619 int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
2620 {
2621 assert(rules);
2622 rules->clear();
2623 if (osd < 0) {
2624 return -EINVAL;
2625 }
2626 for (unsigned i = 0; i < crush->max_rules; ++i) {
2627 crush_rule *r = crush->rules[i];
2628 if (!r)
2629 continue;
2630 for (unsigned j = 0; j < r->len; ++j) {
2631 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2632 int step_item = r->steps[j].arg1;
2633 list<int> unordered;
2634 int rc = _get_leaves(step_item, &unordered);
2635 if (rc < 0) {
2636 return rc; // propagate fatal errors!
2637 }
2638 bool match = false;
2639 for (auto &o: unordered) {
2640 assert(o >= 0);
2641 if (o == osd) {
2642 match = true;
2643 break;
2644 }
2645 }
2646 if (match) {
2647 rules->insert(i);
2648 break;
2649 }
2650 }
2651 }
2652 }
2653 return 0;
2654 }
2655
2656 bool CrushWrapper::_class_is_dead(int class_id)
2657 {
2658 for (auto &p: class_map) {
2659 if (p.first >= 0 && p.second == class_id) {
2660 return false;
2661 }
2662 }
2663 for (unsigned i = 0; i < crush->max_rules; ++i) {
2664 crush_rule *r = crush->rules[i];
2665 if (!r)
2666 continue;
2667 for (unsigned j = 0; j < r->len; ++j) {
2668 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2669 int root = r->steps[j].arg1;
2670 for (auto &p : class_bucket) {
2671 auto& q = p.second;
2672 if (q.count(class_id) && q[class_id] == root) {
2673 return false;
2674 }
2675 }
2676 }
2677 }
2678 }
2679 // no more referenced by any devices or crush rules
2680 return true;
2681 }
2682
2683 void CrushWrapper::cleanup_dead_classes()
2684 {
2685 auto p = class_name.begin();
2686 while (p != class_name.end()) {
2687 if (_class_is_dead(p->first)) {
2688 string n = p->second;
2689 ++p;
2690 remove_class_name(n);
2691 } else {
2692 ++p;
2693 }
2694 }
2695 }
2696
2697 int CrushWrapper::rebuild_roots_with_classes()
2698 {
2699 std::map<int32_t, map<int32_t, int32_t> > old_class_bucket = class_bucket;
2700 cleanup_dead_classes();
2701 int r = trim_roots_with_class();
2702 if (r < 0)
2703 return r;
2704 class_bucket.clear();
2705 return populate_classes(old_class_bucket);
2706 }
2707
2708 void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
2709 {
2710 assert(crush);
2711
2712 __u32 magic = CRUSH_MAGIC;
2713 ::encode(magic, bl);
2714
2715 ::encode(crush->max_buckets, bl);
2716 ::encode(crush->max_rules, bl);
2717 ::encode(crush->max_devices, bl);
2718
2719 bool encode_compat_choose_args = false;
2720 crush_choose_arg_map arg_map;
2721 memset(&arg_map, '\0', sizeof(arg_map));
2722 if (has_choose_args() &&
2723 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
2724 assert(!has_incompat_choose_args());
2725 encode_compat_choose_args = true;
2726 arg_map = choose_args.begin()->second;
2727 }
2728
2729 // buckets
2730 for (int i=0; i<crush->max_buckets; i++) {
2731 __u32 alg = 0;
2732 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
2733 ::encode(alg, bl);
2734 if (!alg)
2735 continue;
2736
2737 ::encode(crush->buckets[i]->id, bl);
2738 ::encode(crush->buckets[i]->type, bl);
2739 ::encode(crush->buckets[i]->alg, bl);
2740 ::encode(crush->buckets[i]->hash, bl);
2741 ::encode(crush->buckets[i]->weight, bl);
2742 ::encode(crush->buckets[i]->size, bl);
2743 for (unsigned j=0; j<crush->buckets[i]->size; j++)
2744 ::encode(crush->buckets[i]->items[j], bl);
2745
2746 switch (crush->buckets[i]->alg) {
2747 case CRUSH_BUCKET_UNIFORM:
2748 ::encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
2749 break;
2750
2751 case CRUSH_BUCKET_LIST:
2752 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2753 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
2754 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
2755 }
2756 break;
2757
2758 case CRUSH_BUCKET_TREE:
2759 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
2760 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
2761 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
2762 break;
2763
2764 case CRUSH_BUCKET_STRAW:
2765 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2766 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
2767 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
2768 }
2769 break;
2770
2771 case CRUSH_BUCKET_STRAW2:
2772 {
2773 __u32 *weights;
2774 if (encode_compat_choose_args &&
2775 arg_map.args[i].weight_set_positions > 0) {
2776 weights = arg_map.args[i].weight_set[0].weights;
2777 } else {
2778 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
2779 }
2780 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2781 ::encode(weights[j], bl);
2782 }
2783 }
2784 break;
2785
2786 default:
2787 ceph_abort();
2788 break;
2789 }
2790 }
2791
2792 // rules
2793 for (unsigned i=0; i<crush->max_rules; i++) {
2794 __u32 yes = crush->rules[i] ? 1:0;
2795 ::encode(yes, bl);
2796 if (!yes)
2797 continue;
2798
2799 ::encode(crush->rules[i]->len, bl);
2800 ::encode(crush->rules[i]->mask, bl);
2801 for (unsigned j=0; j<crush->rules[i]->len; j++)
2802 ::encode(crush->rules[i]->steps[j], bl);
2803 }
2804
2805 // name info
2806 ::encode(type_map, bl);
2807 ::encode(name_map, bl);
2808 ::encode(rule_name_map, bl);
2809
2810 // tunables
2811 ::encode(crush->choose_local_tries, bl);
2812 ::encode(crush->choose_local_fallback_tries, bl);
2813 ::encode(crush->choose_total_tries, bl);
2814 ::encode(crush->chooseleaf_descend_once, bl);
2815 ::encode(crush->chooseleaf_vary_r, bl);
2816 ::encode(crush->straw_calc_version, bl);
2817 ::encode(crush->allowed_bucket_algs, bl);
2818 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
2819 ::encode(crush->chooseleaf_stable, bl);
2820 }
2821
2822 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2823 // device classes
2824 ::encode(class_map, bl);
2825 ::encode(class_name, bl);
2826 ::encode(class_bucket, bl);
2827
2828 // choose args
2829 __u32 size = (__u32)choose_args.size();
2830 ::encode(size, bl);
2831 for (auto c : choose_args) {
2832 ::encode(c.first, bl);
2833 crush_choose_arg_map arg_map = c.second;
2834 size = 0;
2835 for (__u32 i = 0; i < arg_map.size; i++) {
2836 crush_choose_arg *arg = &arg_map.args[i];
2837 if (arg->weight_set_positions == 0 &&
2838 arg->ids_size == 0)
2839 continue;
2840 size++;
2841 }
2842 ::encode(size, bl);
2843 for (__u32 i = 0; i < arg_map.size; i++) {
2844 crush_choose_arg *arg = &arg_map.args[i];
2845 if (arg->weight_set_positions == 0 &&
2846 arg->ids_size == 0)
2847 continue;
2848 ::encode(i, bl);
2849 ::encode(arg->weight_set_positions, bl);
2850 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
2851 crush_weight_set *weight_set = &arg->weight_set[j];
2852 ::encode(weight_set->size, bl);
2853 for (__u32 k = 0; k < weight_set->size; k++)
2854 ::encode(weight_set->weights[k], bl);
2855 }
2856 ::encode(arg->ids_size, bl);
2857 for (__u32 j = 0; j < arg->ids_size; j++)
2858 ::encode(arg->ids[j], bl);
2859 }
2860 }
2861 }
2862 }
2863
2864 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
2865 {
2866 m.clear();
2867 __u32 n;
2868 ::decode(n, blp);
2869 while (n--) {
2870 __s32 key;
2871 ::decode(key, blp);
2872
2873 __u32 strlen;
2874 ::decode(strlen, blp);
2875 if (strlen == 0) {
2876 // der, key was actually 64-bits!
2877 ::decode(strlen, blp);
2878 }
2879 ::decode_nohead(strlen, m[key], blp);
2880 }
2881 }
2882
2883 void CrushWrapper::decode(bufferlist::iterator& blp)
2884 {
2885 create();
2886
2887 __u32 magic;
2888 ::decode(magic, blp);
2889 if (magic != CRUSH_MAGIC)
2890 throw buffer::malformed_input("bad magic number");
2891
2892 ::decode(crush->max_buckets, blp);
2893 ::decode(crush->max_rules, blp);
2894 ::decode(crush->max_devices, blp);
2895
2896 // legacy tunables, unless we decode something newer
2897 set_tunables_legacy();
2898
2899 try {
2900 // buckets
2901 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
2902 for (int i=0; i<crush->max_buckets; i++) {
2903 decode_crush_bucket(&crush->buckets[i], blp);
2904 }
2905
2906 // rules
2907 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
2908 for (unsigned i = 0; i < crush->max_rules; ++i) {
2909 __u32 yes;
2910 ::decode(yes, blp);
2911 if (!yes) {
2912 crush->rules[i] = NULL;
2913 continue;
2914 }
2915
2916 __u32 len;
2917 ::decode(len, blp);
2918 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
2919 crush->rules[i]->len = len;
2920 ::decode(crush->rules[i]->mask, blp);
2921 for (unsigned j=0; j<crush->rules[i]->len; j++)
2922 ::decode(crush->rules[i]->steps[j], blp);
2923 }
2924
2925 // name info
2926 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2927 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2928 // both by assuming the string is always non-empty.
2929 decode_32_or_64_string_map(type_map, blp);
2930 decode_32_or_64_string_map(name_map, blp);
2931 decode_32_or_64_string_map(rule_name_map, blp);
2932
2933 // tunables
2934 if (!blp.end()) {
2935 ::decode(crush->choose_local_tries, blp);
2936 ::decode(crush->choose_local_fallback_tries, blp);
2937 ::decode(crush->choose_total_tries, blp);
2938 }
2939 if (!blp.end()) {
2940 ::decode(crush->chooseleaf_descend_once, blp);
2941 }
2942 if (!blp.end()) {
2943 ::decode(crush->chooseleaf_vary_r, blp);
2944 }
2945 if (!blp.end()) {
2946 ::decode(crush->straw_calc_version, blp);
2947 }
2948 if (!blp.end()) {
2949 ::decode(crush->allowed_bucket_algs, blp);
2950 }
2951 if (!blp.end()) {
2952 ::decode(crush->chooseleaf_stable, blp);
2953 }
2954 if (!blp.end()) {
2955 ::decode(class_map, blp);
2956 ::decode(class_name, blp);
2957 for (auto &c : class_name)
2958 class_rname[c.second] = c.first;
2959 ::decode(class_bucket, blp);
2960 }
2961 if (!blp.end()) {
2962 __u32 choose_args_size;
2963 ::decode(choose_args_size, blp);
2964 for (__u32 i = 0; i < choose_args_size; i++) {
2965 typename decltype(choose_args)::key_type choose_args_index;
2966 ::decode(choose_args_index, blp);
2967 crush_choose_arg_map arg_map;
2968 arg_map.size = crush->max_buckets;
2969 arg_map.args = (crush_choose_arg*)calloc(
2970 arg_map.size, sizeof(crush_choose_arg));
2971 __u32 size;
2972 ::decode(size, blp);
2973 for (__u32 j = 0; j < size; j++) {
2974 __u32 bucket_index;
2975 ::decode(bucket_index, blp);
2976 assert(bucket_index < arg_map.size);
2977 crush_choose_arg *arg = &arg_map.args[bucket_index];
2978 ::decode(arg->weight_set_positions, blp);
2979 if (arg->weight_set_positions) {
2980 arg->weight_set = (crush_weight_set*)calloc(
2981 arg->weight_set_positions, sizeof(crush_weight_set));
2982 for (__u32 k = 0; k < arg->weight_set_positions; k++) {
2983 crush_weight_set *weight_set = &arg->weight_set[k];
2984 ::decode(weight_set->size, blp);
2985 weight_set->weights = (__u32*)calloc(
2986 weight_set->size, sizeof(__u32));
2987 for (__u32 l = 0; l < weight_set->size; l++)
2988 ::decode(weight_set->weights[l], blp);
2989 }
2990 }
2991 ::decode(arg->ids_size, blp);
2992 if (arg->ids_size) {
2993 assert(arg->ids_size == crush->buckets[bucket_index]->size);
2994 arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
2995 for (__u32 k = 0; k < arg->ids_size; k++)
2996 ::decode(arg->ids[k], blp);
2997 }
2998 }
2999 choose_args[choose_args_index] = arg_map;
3000 }
3001 }
3002 update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
3003 finalize();
3004 }
3005 catch (...) {
3006 crush_destroy(crush);
3007 throw;
3008 }
3009 }
3010
3011 void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp)
3012 {
3013 __u32 alg;
3014 ::decode(alg, blp);
3015 if (!alg) {
3016 *bptr = NULL;
3017 return;
3018 }
3019
3020 int size = 0;
3021 switch (alg) {
3022 case CRUSH_BUCKET_UNIFORM:
3023 size = sizeof(crush_bucket_uniform);
3024 break;
3025 case CRUSH_BUCKET_LIST:
3026 size = sizeof(crush_bucket_list);
3027 break;
3028 case CRUSH_BUCKET_TREE:
3029 size = sizeof(crush_bucket_tree);
3030 break;
3031 case CRUSH_BUCKET_STRAW:
3032 size = sizeof(crush_bucket_straw);
3033 break;
3034 case CRUSH_BUCKET_STRAW2:
3035 size = sizeof(crush_bucket_straw2);
3036 break;
3037 default:
3038 {
3039 char str[128];
3040 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
3041 throw buffer::malformed_input(str);
3042 }
3043 }
3044 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
3045 *bptr = bucket;
3046
3047 ::decode(bucket->id, blp);
3048 ::decode(bucket->type, blp);
3049 ::decode(bucket->alg, blp);
3050 ::decode(bucket->hash, blp);
3051 ::decode(bucket->weight, blp);
3052 ::decode(bucket->size, blp);
3053
3054 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
3055 for (unsigned j = 0; j < bucket->size; ++j) {
3056 ::decode(bucket->items[j], blp);
3057 }
3058
3059 switch (bucket->alg) {
3060 case CRUSH_BUCKET_UNIFORM:
3061 ::decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
3062 break;
3063
3064 case CRUSH_BUCKET_LIST: {
3065 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
3066 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3067 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3068
3069 for (unsigned j = 0; j < bucket->size; ++j) {
3070 ::decode(cbl->item_weights[j], blp);
3071 ::decode(cbl->sum_weights[j], blp);
3072 }
3073 break;
3074 }
3075
3076 case CRUSH_BUCKET_TREE: {
3077 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
3078 ::decode(cbt->num_nodes, blp);
3079 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
3080 for (unsigned j=0; j<cbt->num_nodes; j++) {
3081 ::decode(cbt->node_weights[j], blp);
3082 }
3083 break;
3084 }
3085
3086 case CRUSH_BUCKET_STRAW: {
3087 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
3088 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3089 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3090 for (unsigned j = 0; j < bucket->size; ++j) {
3091 ::decode(cbs->item_weights[j], blp);
3092 ::decode(cbs->straws[j], blp);
3093 }
3094 break;
3095 }
3096
3097 case CRUSH_BUCKET_STRAW2: {
3098 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
3099 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3100 for (unsigned j = 0; j < bucket->size; ++j) {
3101 ::decode(cbs->item_weights[j], blp);
3102 }
3103 break;
3104 }
3105
3106 default:
3107 // We should have handled this case in the first switch statement
3108 ceph_abort();
3109 break;
3110 }
3111 }
3112
3113
3114 void CrushWrapper::dump(Formatter *f) const
3115 {
3116 f->open_array_section("devices");
3117 for (int i=0; i<get_max_devices(); i++) {
3118 f->open_object_section("device");
3119 f->dump_int("id", i);
3120 const char *n = get_item_name(i);
3121 if (n) {
3122 f->dump_string("name", n);
3123 } else {
3124 char name[20];
3125 sprintf(name, "device%d", i);
3126 f->dump_string("name", name);
3127 }
3128 const char *device_class = get_item_class(i);
3129 if (device_class != NULL)
3130 f->dump_string("class", device_class);
3131 f->close_section();
3132 }
3133 f->close_section();
3134
3135 f->open_array_section("types");
3136 int n = get_num_type_names();
3137 for (int i=0; n; i++) {
3138 const char *name = get_type_name(i);
3139 if (!name) {
3140 if (i == 0) {
3141 f->open_object_section("type");
3142 f->dump_int("type_id", 0);
3143 f->dump_string("name", "device");
3144 f->close_section();
3145 }
3146 continue;
3147 }
3148 n--;
3149 f->open_object_section("type");
3150 f->dump_int("type_id", i);
3151 f->dump_string("name", name);
3152 f->close_section();
3153 }
3154 f->close_section();
3155
3156 f->open_array_section("buckets");
3157 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
3158 if (!bucket_exists(bucket))
3159 continue;
3160 f->open_object_section("bucket");
3161 f->dump_int("id", bucket);
3162 if (get_item_name(bucket))
3163 f->dump_string("name", get_item_name(bucket));
3164 f->dump_int("type_id", get_bucket_type(bucket));
3165 if (get_type_name(get_bucket_type(bucket)))
3166 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
3167 f->dump_int("weight", get_bucket_weight(bucket));
3168 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
3169 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
3170 f->open_array_section("items");
3171 for (int j=0; j<get_bucket_size(bucket); j++) {
3172 f->open_object_section("item");
3173 f->dump_int("id", get_bucket_item(bucket, j));
3174 f->dump_int("weight", get_bucket_item_weight(bucket, j));
3175 f->dump_int("pos", j);
3176 f->close_section();
3177 }
3178 f->close_section();
3179 f->close_section();
3180 }
3181 f->close_section();
3182
3183 f->open_array_section("rules");
3184 dump_rules(f);
3185 f->close_section();
3186
3187 f->open_object_section("tunables");
3188 dump_tunables(f);
3189 f->close_section();
3190
3191 dump_choose_args(f);
3192 }
3193
3194 namespace {
3195 // depth first walker
3196 class TreeDumper {
3197 typedef CrushTreeDumper::Item Item;
3198 const CrushWrapper *crush;
3199 const CrushTreeDumper::name_map_t& weight_set_names;
3200 public:
3201 explicit TreeDumper(const CrushWrapper *crush,
3202 const CrushTreeDumper::name_map_t& wsnames)
3203 : crush(crush), weight_set_names(wsnames) {}
3204
3205 void dump(Formatter *f) {
3206 set<int> roots;
3207 crush->find_roots(&roots);
3208 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
3209 dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
3210 }
3211 }
3212
3213 private:
3214 void dump_item(const Item& qi, Formatter* f) {
3215 if (qi.is_bucket()) {
3216 f->open_object_section("bucket");
3217 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
3218 dump_bucket_children(qi, f);
3219 f->close_section();
3220 } else {
3221 f->open_object_section("device");
3222 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
3223 f->close_section();
3224 }
3225 }
3226
3227 void dump_bucket_children(const Item& parent, Formatter* f) {
3228 f->open_array_section("items");
3229 const int max_pos = crush->get_bucket_size(parent.id);
3230 for (int pos = 0; pos < max_pos; pos++) {
3231 int id = crush->get_bucket_item(parent.id, pos);
3232 float weight = crush->get_bucket_item_weightf(parent.id, pos);
3233 dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
3234 }
3235 f->close_section();
3236 }
3237 };
3238 }
3239
3240 void CrushWrapper::dump_tree(
3241 Formatter *f,
3242 const CrushTreeDumper::name_map_t& weight_set_names) const
3243 {
3244 assert(f);
3245 TreeDumper(this, weight_set_names).dump(f);
3246 }
3247
3248 void CrushWrapper::dump_tunables(Formatter *f) const
3249 {
3250 f->dump_int("choose_local_tries", get_choose_local_tries());
3251 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
3252 f->dump_int("choose_total_tries", get_choose_total_tries());
3253 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
3254 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
3255 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
3256 f->dump_int("straw_calc_version", get_straw_calc_version());
3257 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
3258
3259 // be helpful about it
3260 if (has_jewel_tunables())
3261 f->dump_string("profile", "jewel");
3262 else if (has_hammer_tunables())
3263 f->dump_string("profile", "hammer");
3264 else if (has_firefly_tunables())
3265 f->dump_string("profile", "firefly");
3266 else if (has_bobtail_tunables())
3267 f->dump_string("profile", "bobtail");
3268 else if (has_argonaut_tunables())
3269 f->dump_string("profile", "argonaut");
3270 else
3271 f->dump_string("profile", "unknown");
3272 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
3273 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
3274
3275 // be helpful about minimum version required
3276 f->dump_string("minimum_required_version", get_min_required_version());
3277
3278 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
3279 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
3280 f->dump_int("has_v2_rules", (int)has_v2_rules());
3281 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
3282 f->dump_int("has_v3_rules", (int)has_v3_rules());
3283 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
3284 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
3285 f->dump_int("has_v5_rules", (int)has_v5_rules());
3286 }
3287
3288 void CrushWrapper::dump_choose_args(Formatter *f) const
3289 {
3290 f->open_object_section("choose_args");
3291 for (auto c : choose_args) {
3292 crush_choose_arg_map arg_map = c.second;
3293 f->open_array_section(stringify(c.first).c_str());
3294 for (__u32 i = 0; i < arg_map.size; i++) {
3295 crush_choose_arg *arg = &arg_map.args[i];
3296 if (arg->weight_set_positions == 0 &&
3297 arg->ids_size == 0)
3298 continue;
3299 f->open_object_section("choose_args");
3300 int bucket_index = i;
3301 f->dump_int("bucket_id", -1-bucket_index);
3302 if (arg->weight_set_positions > 0) {
3303 f->open_array_section("weight_set");
3304 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
3305 f->open_array_section("weights");
3306 __u32 *weights = arg->weight_set[j].weights;
3307 __u32 size = arg->weight_set[j].size;
3308 for (__u32 k = 0; k < size; k++) {
3309 f->dump_float("weight", (float)weights[k]/(float)0x10000);
3310 }
3311 f->close_section();
3312 }
3313 f->close_section();
3314 }
3315 if (arg->ids_size > 0) {
3316 f->open_array_section("ids");
3317 for (__u32 j = 0; j < arg->ids_size; j++)
3318 f->dump_int("id", arg->ids[j]);
3319 f->close_section();
3320 }
3321 f->close_section();
3322 }
3323 f->close_section();
3324 }
3325 f->close_section();
3326 }
3327
3328 void CrushWrapper::dump_rules(Formatter *f) const
3329 {
3330 for (int i=0; i<get_max_rules(); i++) {
3331 if (!rule_exists(i))
3332 continue;
3333 dump_rule(i, f);
3334 }
3335 }
3336
3337 void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
3338 {
3339 f->open_object_section("rule");
3340 f->dump_int("rule_id", ruleset);
3341 if (get_rule_name(ruleset))
3342 f->dump_string("rule_name", get_rule_name(ruleset));
3343 f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
3344 f->dump_int("type", get_rule_mask_type(ruleset));
3345 f->dump_int("min_size", get_rule_mask_min_size(ruleset));
3346 f->dump_int("max_size", get_rule_mask_max_size(ruleset));
3347 f->open_array_section("steps");
3348 for (int j=0; j<get_rule_len(ruleset); j++) {
3349 f->open_object_section("step");
3350 switch (get_rule_op(ruleset, j)) {
3351 case CRUSH_RULE_NOOP:
3352 f->dump_string("op", "noop");
3353 break;
3354 case CRUSH_RULE_TAKE:
3355 f->dump_string("op", "take");
3356 {
3357 int item = get_rule_arg1(ruleset, j);
3358 f->dump_int("item", item);
3359
3360 const char *name = get_item_name(item);
3361 f->dump_string("item_name", name ? name : "");
3362 }
3363 break;
3364 case CRUSH_RULE_EMIT:
3365 f->dump_string("op", "emit");
3366 break;
3367 case CRUSH_RULE_CHOOSE_FIRSTN:
3368 f->dump_string("op", "choose_firstn");
3369 f->dump_int("num", get_rule_arg1(ruleset, j));
3370 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
3371 break;
3372 case CRUSH_RULE_CHOOSE_INDEP:
3373 f->dump_string("op", "choose_indep");
3374 f->dump_int("num", get_rule_arg1(ruleset, j));
3375 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
3376 break;
3377 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3378 f->dump_string("op", "chooseleaf_firstn");
3379 f->dump_int("num", get_rule_arg1(ruleset, j));
3380 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
3381 break;
3382 case CRUSH_RULE_CHOOSELEAF_INDEP:
3383 f->dump_string("op", "chooseleaf_indep");
3384 f->dump_int("num", get_rule_arg1(ruleset, j));
3385 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
3386 break;
3387 case CRUSH_RULE_SET_CHOOSE_TRIES:
3388 f->dump_string("op", "set_choose_tries");
3389 f->dump_int("num", get_rule_arg1(ruleset, j));
3390 break;
3391 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
3392 f->dump_string("op", "set_chooseleaf_tries");
3393 f->dump_int("num", get_rule_arg1(ruleset, j));
3394 break;
3395 default:
3396 f->dump_int("opcode", get_rule_op(ruleset, j));
3397 f->dump_int("arg1", get_rule_arg1(ruleset, j));
3398 f->dump_int("arg2", get_rule_arg2(ruleset, j));
3399 }
3400 f->close_section();
3401 }
3402 f->close_section();
3403 f->close_section();
3404 }
3405
3406 void CrushWrapper::list_rules(Formatter *f) const
3407 {
3408 for (int rule = 0; rule < get_max_rules(); rule++) {
3409 if (!rule_exists(rule))
3410 continue;
3411 f->dump_string("name", get_rule_name(rule));
3412 }
3413 }
3414
3415 void CrushWrapper::list_rules(ostream *ss) const
3416 {
3417 for (int rule = 0; rule < get_max_rules(); rule++) {
3418 if (!rule_exists(rule))
3419 continue;
3420 *ss << get_rule_name(rule) << "\n";
3421 }
3422 }
3423
3424 class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3425 public:
3426 typedef CrushTreeDumper::Dumper<TextTable> Parent;
3427
3428 explicit CrushTreePlainDumper(const CrushWrapper *crush,
3429 const CrushTreeDumper::name_map_t& wsnames)
3430 : Parent(crush, wsnames) {}
3431 explicit CrushTreePlainDumper(const CrushWrapper *crush,
3432 const CrushTreeDumper::name_map_t& wsnames,
3433 bool show_shadow)
3434 : Parent(crush, wsnames, show_shadow) {}
3435
3436
3437 void dump(TextTable *tbl) {
3438 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
3439 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
3440 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3441 for (auto& p : crush->choose_args) {
3442 if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
3443 tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
3444 } else {
3445 string name;
3446 auto q = weight_set_names.find(p.first);
3447 name = q != weight_set_names.end() ? q->second :
3448 stringify(p.first);
3449 tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
3450 }
3451 }
3452 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
3453 Parent::dump(tbl);
3454 }
3455
3456 protected:
3457 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
3458 const char *c = crush->get_item_class(qi.id);
3459 if (!c)
3460 c = "";
3461 *tbl << qi.id
3462 << c
3463 << weightf_t(qi.weight);
3464 for (auto& p : crush->choose_args) {
3465 if (qi.parent < 0) {
3466 const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
3467 int bidx = -1 - qi.parent;
3468 const crush_bucket *b = crush->get_bucket(qi.parent);
3469 if (b &&
3470 bidx < (int)cmap.size &&
3471 cmap.args[bidx].weight_set &&
3472 cmap.args[bidx].weight_set_positions >= 1) {
3473 int pos;
3474 for (pos = 0;
3475 pos < (int)cmap.args[bidx].weight_set[0].size &&
3476 b->items[pos] != qi.id;
3477 ++pos) ;
3478 *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] /
3479 (float)0x10000);
3480 continue;
3481 }
3482 }
3483 *tbl << "";
3484 }
3485 ostringstream ss;
3486 for (int k=0; k < qi.depth; k++) {
3487 ss << " ";
3488 }
3489 if (qi.is_bucket()) {
3490 ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3491 << crush->get_item_name(qi.id);
3492 } else {
3493 ss << "osd." << qi.id;
3494 }
3495 *tbl << ss.str();
3496 *tbl << TextTable::endrow;
3497 }
3498 };
3499
3500
3501 class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3502 public:
3503 typedef CrushTreeDumper::FormattingDumper Parent;
3504
3505 explicit CrushTreeFormattingDumper(
3506 const CrushWrapper *crush,
3507 const CrushTreeDumper::name_map_t& wsnames)
3508 : Parent(crush, wsnames) {}
3509
3510 explicit CrushTreeFormattingDumper(
3511 const CrushWrapper *crush,
3512 const CrushTreeDumper::name_map_t& wsnames,
3513 bool show_shadow)
3514 : Parent(crush, wsnames, show_shadow) {}
3515
3516 void dump(Formatter *f) {
3517 f->open_array_section("nodes");
3518 Parent::dump(f);
3519 f->close_section();
3520
3521 // There is no stray bucket whose id is a negative number, so just get
3522 // the max_id and iterate from 0 to max_id to dump stray osds.
3523 f->open_array_section("stray");
3524 int32_t max_id = -1;
3525 if (!crush->name_map.empty()) {
3526 max_id = crush->name_map.rbegin()->first;
3527 }
3528 for (int32_t i = 0; i <= max_id; i++) {
3529 if (crush->item_exists(i) && !is_touched(i) && should_dump(i)) {
3530 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3531 }
3532 }
3533 f->close_section();
3534 }
3535 };
3536
3537
3538 void CrushWrapper::dump_tree(
3539 ostream *out,
3540 Formatter *f,
3541 const CrushTreeDumper::name_map_t& weight_set_names,
3542 bool show_shadow) const
3543 {
3544 if (out) {
3545 TextTable tbl;
3546 CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
3547 *out << tbl;
3548 }
3549 if (f) {
3550 CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
3551 }
3552 }
3553
3554 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
3555 {
3556 o.push_back(new CrushWrapper);
3557 // fixme
3558 }
3559
3560 /**
3561 * Determine the default CRUSH ruleset ID to be used with
3562 * newly created replicated pools.
3563 *
3564 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
3565 */
3566 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
3567 {
3568 int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
3569 if (crush_ruleset < 0) {
3570 crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
3571 } else if (!ruleset_exists(crush_ruleset)) {
3572 crush_ruleset = -1; // match find_first_ruleset() retval
3573 }
3574 return crush_ruleset;
3575 }
3576
3577 bool CrushWrapper::is_valid_crush_name(const string& s)
3578 {
3579 if (s.empty())
3580 return false;
3581 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
3582 if (!(*p == '-') &&
3583 !(*p == '_') &&
3584 !(*p == '.') &&
3585 !(*p >= '0' && *p <= '9') &&
3586 !(*p >= 'A' && *p <= 'Z') &&
3587 !(*p >= 'a' && *p <= 'z'))
3588 return false;
3589 }
3590 return true;
3591 }
3592
3593 bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
3594 const map<string,string>& loc)
3595 {
3596 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
3597 if (!is_valid_crush_name(l->first) ||
3598 !is_valid_crush_name(l->second)) {
3599 ldout(cct, 1) << "loc["
3600 << l->first << "] = '"
3601 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
3602 << dendl;
3603 return false;
3604 }
3605 }
3606 return true;
3607 }
3608
3609 int CrushWrapper::_choose_type_stack(
3610 CephContext *cct,
3611 const vector<pair<int,int>>& stack,
3612 const set<int>& overfull,
3613 const vector<int>& underfull,
3614 const vector<int>& orig,
3615 vector<int>::const_iterator& i,
3616 set<int>& used,
3617 vector<int> *pw) const
3618 {
3619 vector<int> w = *pw;
3620 vector<int> o;
3621
3622 ldout(cct, 10) << __func__ << " stack " << stack
3623 << " orig " << orig
3624 << " at " << *i
3625 << " pw " << *pw
3626 << dendl;
3627
3628 vector<int> cumulative_fanout(stack.size());
3629 int f = 1;
3630 for (int j = (int)stack.size() - 1; j >= 0; --j) {
3631 cumulative_fanout[j] = f;
3632 f *= stack[j].second;
3633 }
3634 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
3635 << dendl;
3636
3637 // identify underful targets for each intermediate level.
3638 // this serves two purposes:
3639 // 1. we can tell when we are selecting a bucket that does not have any underfull
3640 // devices beneath it. that means that if the current input includes an overfull
3641 // device, we won't be able to find an underfull device with this parent to
3642 // swap for it.
3643 // 2. when we decide we should reject a bucket due to the above, this list gives us
3644 // a list of peers to consider that *do* have underfull devices available.. (we
3645 // are careful to pick one that has the same parent.)
3646 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
3647 underfull_buckets.resize(stack.size() - 1);
3648 for (auto osd : underfull) {
3649 int item = osd;
3650 for (int j = (int)stack.size() - 2; j >= 0; --j) {
3651 int type = stack[j].first;
3652 item = get_parent_of_type(item, type);
3653 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
3654 << " is " << item << dendl;
3655 underfull_buckets[j].insert(item);
3656 }
3657 }
3658 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
3659
3660 for (unsigned j = 0; j < stack.size(); ++j) {
3661 int type = stack[j].first;
3662 int fanout = stack[j].second;
3663 int cum_fanout = cumulative_fanout[j];
3664 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
3665 << " cumulative " << cum_fanout
3666 << " w " << w << dendl;
3667 vector<int> o;
3668 auto tmpi = i;
3669 if (i == orig.end()) {
3670 ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl;
3671 break;
3672 }
3673 for (auto from : w) {
3674 ldout(cct, 10) << " from " << from << dendl;
3675 // identify leaves under each choice. we use this to check whether any of these
3676 // leaves are overfull. (if so, we need to make sure there are underfull candidates
3677 // to swap for them.)
3678 vector<set<int>> leaves;
3679 leaves.resize(fanout);
3680 for (int pos = 0; pos < fanout; ++pos) {
3681 if (type > 0) {
3682 // non-leaf
3683 int item = get_parent_of_type(*tmpi, type);
3684 o.push_back(item);
3685 int n = cum_fanout;
3686 while (n-- && tmpi != orig.end()) {
3687 leaves[pos].insert(*tmpi++);
3688 }
3689 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
3690 << " of type " << type << " over leaves " << leaves[pos] << dendl;
3691 } else {
3692 // leaf
3693 bool replaced = false;
3694 if (overfull.count(*i)) {
3695 for (auto item : underfull) {
3696 ldout(cct, 10) << __func__ << " pos " << pos
3697 << " was " << *i << " considering " << item
3698 << dendl;
3699 if (used.count(item)) {
3700 ldout(cct, 20) << __func__ << " in used " << used << dendl;
3701 continue;
3702 }
3703 if (!subtree_contains(from, item)) {
3704 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
3705 continue;
3706 }
3707 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
3708 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
3709 continue;
3710 }
3711 o.push_back(item);
3712 used.insert(item);
3713 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
3714 << *i << " -> " << item << dendl;
3715 replaced = true;
3716 assert(i != orig.end());
3717 ++i;
3718 break;
3719 }
3720 }
3721 if (!replaced) {
3722 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
3723 << dendl;
3724 assert(i != orig.end());
3725 o.push_back(*i);
3726 ++i;
3727 }
3728 if (i == orig.end()) {
3729 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
3730 break;
3731 }
3732 }
3733 }
3734 if (j + 1 < stack.size()) {
3735 // check if any buckets have overfull leaves but no underfull candidates
3736 for (int pos = 0; pos < fanout; ++pos) {
3737 if (underfull_buckets[j].count(o[pos]) == 0) {
3738 // are any leaves overfull?
3739 bool any_overfull = false;
3740 for (auto osd : leaves[pos]) {
3741 if (overfull.count(osd)) {
3742 any_overfull = true;
3743 }
3744 }
3745 if (any_overfull) {
3746 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
3747 << ">0 leaves " << leaves[pos] << " is overfull; alts "
3748 << underfull_buckets[j]
3749 << dendl;
3750 for (auto alt : underfull_buckets[j]) {
3751 if (std::find(o.begin(), o.end(), alt) == o.end()) {
3752 // see if alt has the same parent
3753 if (j == 0 ||
3754 get_parent_of_type(o[pos], stack[j-1].first) ==
3755 get_parent_of_type(alt, stack[j-1].first)) {
3756 if (j)
3757 ldout(cct, 10) << " replacing " << o[pos]
3758 << " (which has no underfull leaves) with " << alt
3759 << " (same parent "
3760 << get_parent_of_type(alt, stack[j-1].first) << " type "
3761 << type << ")" << dendl;
3762 else
3763 ldout(cct, 10) << " replacing " << o[pos]
3764 << " (which has no underfull leaves) with " << alt
3765 << " (first level)" << dendl;
3766 o[pos] = alt;
3767 break;
3768 } else {
3769 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
3770 << " has different parent, skipping" << dendl;
3771 }
3772 }
3773 }
3774 }
3775 }
3776 }
3777 }
3778 if (i == orig.end()) {
3779 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
3780 break;
3781 }
3782 }
3783 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
3784 w.swap(o);
3785 }
3786 *pw = w;
3787 return 0;
3788 }
3789
3790 int CrushWrapper::try_remap_rule(
3791 CephContext *cct,
3792 int ruleno,
3793 int maxout,
3794 const set<int>& overfull,
3795 const vector<int>& underfull,
3796 const vector<int>& orig,
3797 vector<int> *out) const
3798 {
3799 const crush_map *map = crush;
3800 const crush_rule *rule = get_rule(ruleno);
3801 assert(rule);
3802
3803 ldout(cct, 10) << __func__ << " ruleno " << ruleno
3804 << " numrep " << maxout << " overfull " << overfull
3805 << " underfull " << underfull << " orig " << orig
3806 << dendl;
3807 vector<int> w; // working set
3808 out->clear();
3809
3810 auto i = orig.begin();
3811 set<int> used;
3812
3813 vector<pair<int,int>> type_stack; // (type, fan-out)
3814
3815 for (unsigned step = 0; step < rule->len; ++step) {
3816 const crush_rule_step *curstep = &rule->steps[step];
3817 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
3818 switch (curstep->op) {
3819 case CRUSH_RULE_TAKE:
3820 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
3821 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
3822 map->buckets[-1-curstep->arg1])) {
3823 w.clear();
3824 w.push_back(curstep->arg1);
3825 ldout(cct, 10) << __func__ << " take " << w << dendl;
3826 } else {
3827 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
3828 }
3829 break;
3830
3831 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3832 case CRUSH_RULE_CHOOSELEAF_INDEP:
3833 {
3834 int numrep = curstep->arg1;
3835 int type = curstep->arg2;
3836 if (numrep <= 0)
3837 numrep += maxout;
3838 type_stack.push_back(make_pair(type, numrep));
3839 if (type > 0)
3840 type_stack.push_back(make_pair(0, 1));
3841 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3842 i, used, &w);
3843 if (r < 0)
3844 return r;
3845 type_stack.clear();
3846 }
3847 break;
3848
3849 case CRUSH_RULE_CHOOSE_FIRSTN:
3850 case CRUSH_RULE_CHOOSE_INDEP:
3851 {
3852 int numrep = curstep->arg1;
3853 int type = curstep->arg2;
3854 if (numrep <= 0)
3855 numrep += maxout;
3856 type_stack.push_back(make_pair(type, numrep));
3857 }
3858 break;
3859
3860 case CRUSH_RULE_EMIT:
3861 ldout(cct, 10) << " emit " << w << dendl;
3862 if (!type_stack.empty()) {
3863 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3864 i, used, &w);
3865 if (r < 0)
3866 return r;
3867 type_stack.clear();
3868 }
3869 for (auto item : w) {
3870 out->push_back(item);
3871 }
3872 w.clear();
3873 break;
3874
3875 default:
3876 // ignore
3877 break;
3878 }
3879 }
3880
3881 return 0;
3882 }
3883
3884
3885 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3886 CephContext *cct,
3887 crush_choose_arg_map cmap,
3888 int bucketid,
3889 int id,
3890 const vector<int>& weight,
3891 ostream *ss)
3892 {
3893 int changed = 0;
3894 int bidx = -1 - bucketid;
3895 crush_bucket *b = crush->buckets[bidx];
3896 if (bidx >= (int)cmap.size) {
3897 if (ss)
3898 *ss << "no weight-set for bucket " << b->id;
3899 ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
3900 << dendl;
3901 return 0;
3902 }
3903 crush_choose_arg *carg = &cmap.args[bidx];
3904 if (carg->weight_set == NULL) {
3905 // create a weight-set for this bucket and populate it with the
3906 // bucket weights
3907 unsigned positions = get_choose_args_positions(cmap);
3908 carg->weight_set_positions = positions;
3909 carg->weight_set = static_cast<crush_weight_set*>(
3910 calloc(sizeof(crush_weight_set), positions));
3911 for (unsigned p = 0; p < positions; ++p) {
3912 carg->weight_set[p].size = b->size;
3913 carg->weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
3914 for (unsigned i = 0; i < b->size; ++i) {
3915 carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i);
3916 }
3917 }
3918 changed++;
3919 }
3920 if (carg->weight_set_positions != weight.size()) {
3921 if (ss)
3922 *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id;
3923 ldout(cct, 10) << __func__ << " weight_set_positions != " << weight.size()
3924 << " for bucket " << b->id << dendl;
3925 return 0;
3926 }
3927 for (unsigned i = 0; i < b->size; i++) {
3928 if (b->items[i] == id) {
3929 for (unsigned j = 0; j < weight.size(); ++j) {
3930 carg->weight_set[j].weights[i] = weight[j];
3931 }
3932 ldout(cct, 5) << __func__ << " set " << id << " to " << weight
3933 << " in bucket " << b->id << dendl;
3934 changed++;
3935 }
3936 }
3937 if (changed) {
3938 vector<int> bucket_weight(weight.size(), 0);
3939 for (unsigned i = 0; i < b->size; i++) {
3940 for (unsigned j = 0; j < weight.size(); ++j) {
3941 bucket_weight[j] += carg->weight_set[j].weights[i];
3942 }
3943 }
3944 choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
3945 }
3946 return changed;
3947 }
3948
3949 int CrushWrapper::choose_args_adjust_item_weight(
3950 CephContext *cct,
3951 crush_choose_arg_map cmap,
3952 int id,
3953 const vector<int>& weight,
3954 ostream *ss)
3955 {
3956 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
3957 int changed = 0;
3958 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
3959 crush_bucket *b = crush->buckets[bidx];
3960 if (b == nullptr) {
3961 continue;
3962 }
3963 changed += _choose_args_adjust_item_weight_in_bucket(
3964 cct, cmap, b->id, id, weight, ss);
3965 }
3966 if (!changed) {
3967 if (ss)
3968 *ss << "item " << id << " not found in crush map";
3969 return -ENOENT;
3970 }
3971 return changed;
3972 }