]> git.proxmox.com Git - ceph.git/blame - ceph/src/crush/CrushWrapper.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / crush / CrushWrapper.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "osd/osd_types.h"
5#include "common/debug.h"
6#include "common/Formatter.h"
7#include "common/errno.h"
c07f9fc5 8#include "common/TextTable.h"
7c673cae
FG
9#include "include/stringify.h"
10
11#include "CrushWrapper.h"
12#include "CrushTreeDumper.h"
13
14#define dout_subsys ceph_subsys_crush
15
9f95a23c
TL
16using std::cout;
17using std::list;
18using std::map;
19using std::make_pair;
20using std::ostream;
21using std::ostringstream;
22using std::pair;
23using std::set;
24using std::string;
25using std::vector;
26
27using ceph::bufferlist;
28using ceph::decode;
29using ceph::decode_nohead;
30using ceph::encode;
31using ceph::Formatter;
32
c07f9fc5
FG
33bool CrushWrapper::has_non_straw2_buckets() const
34{
35 for (int i=0; i<crush->max_buckets; ++i) {
36 crush_bucket *b = crush->buckets[i];
37 if (!b)
38 continue;
39 if (b->alg != CRUSH_BUCKET_STRAW2)
40 return true;
41 }
42 return false;
43}
44
7c673cae
FG
45bool CrushWrapper::has_v2_rules() const
46{
47 for (unsigned i=0; i<crush->max_rules; i++) {
48 if (is_v2_rule(i)) {
49 return true;
50 }
51 }
52 return false;
53}
54
55bool CrushWrapper::is_v2_rule(unsigned ruleid) const
56{
57 // check rule for use of indep or new SET_* rule steps
58 if (ruleid >= crush->max_rules)
59 return false;
60 crush_rule *r = crush->rules[ruleid];
61 if (!r)
62 return false;
63 for (unsigned j=0; j<r->len; j++) {
64 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
65 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
66 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
67 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
68 return true;
69 }
70 }
71 return false;
72}
73
74bool CrushWrapper::has_v3_rules() const
75{
76 for (unsigned i=0; i<crush->max_rules; i++) {
77 if (is_v3_rule(i)) {
78 return true;
79 }
80 }
81 return false;
82}
83
84bool CrushWrapper::is_v3_rule(unsigned ruleid) const
85{
86 // check rule for use of SET_CHOOSELEAF_VARY_R step
87 if (ruleid >= crush->max_rules)
88 return false;
89 crush_rule *r = crush->rules[ruleid];
90 if (!r)
91 return false;
92 for (unsigned j=0; j<r->len; j++) {
93 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
94 return true;
95 }
96 }
97 return false;
98}
99
100bool CrushWrapper::has_v4_buckets() const
101{
102 for (int i=0; i<crush->max_buckets; ++i) {
103 crush_bucket *b = crush->buckets[i];
104 if (!b)
105 continue;
106 if (b->alg == CRUSH_BUCKET_STRAW2)
107 return true;
108 }
109 return false;
110}
111
112bool CrushWrapper::has_v5_rules() const
113{
114 for (unsigned i=0; i<crush->max_rules; i++) {
115 if (is_v5_rule(i)) {
116 return true;
117 }
118 }
119 return false;
120}
121
122bool CrushWrapper::is_v5_rule(unsigned ruleid) const
123{
124 // check rule for use of SET_CHOOSELEAF_STABLE step
125 if (ruleid >= crush->max_rules)
126 return false;
127 crush_rule *r = crush->rules[ruleid];
128 if (!r)
129 return false;
130 for (unsigned j=0; j<r->len; j++) {
131 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
132 return true;
133 }
134 }
135 return false;
136}
137
31f18b77 138bool CrushWrapper::has_choose_args() const
7c673cae
FG
139{
140 return !choose_args.empty();
141}
142
31f18b77 143bool CrushWrapper::has_incompat_choose_args() const
7c673cae 144{
31f18b77
FG
145 if (choose_args.empty())
146 return false;
147 if (choose_args.size() > 1)
148 return true;
c07f9fc5
FG
149 if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
150 return true;
31f18b77
FG
151 crush_choose_arg_map arg_map = choose_args.begin()->second;
152 for (__u32 i = 0; i < arg_map.size; i++) {
153 crush_choose_arg *arg = &arg_map.args[i];
28e407b8 154 if (arg->weight_set_positions == 0 &&
31f18b77
FG
155 arg->ids_size == 0)
156 continue;
28e407b8 157 if (arg->weight_set_positions != 1)
31f18b77
FG
158 return true;
159 if (arg->ids_size != 0)
160 return true;
161 }
162 return false;
7c673cae
FG
163}
164
165int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
166{
167 if (!item_exists(i))
168 return -EINVAL;
169 string name = get_item_name(i);
170 size_t pos = name.find("~");
171 if (pos == string::npos) {
172 *idout = i;
173 *classout = -1;
174 return 0;
175 }
176 string name_no_class = name.substr(0, pos);
177 if (!name_exists(name_no_class))
178 return -ENOENT;
179 string class_name = name.substr(pos + 1);
180 if (!class_exists(class_name))
181 return -ENOENT;
182 *idout = get_item_id(name_no_class);
183 *classout = get_class_id(class_name);
184 return 0;
185}
186
187int CrushWrapper::can_rename_item(const string& srcname,
188 const string& dstname,
189 ostream *ss) const
190{
191 if (name_exists(srcname)) {
192 if (name_exists(dstname)) {
193 *ss << "dstname = '" << dstname << "' already exists";
194 return -EEXIST;
195 }
196 if (is_valid_crush_name(dstname)) {
197 return 0;
198 } else {
199 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
200 return -EINVAL;
201 }
202 } else {
203 if (name_exists(dstname)) {
204 *ss << "srcname = '" << srcname << "' does not exist "
205 << "and dstname = '" << dstname << "' already exists";
206 return -EALREADY;
207 } else {
208 *ss << "srcname = '" << srcname << "' does not exist";
209 return -ENOENT;
210 }
211 }
212}
213
214int CrushWrapper::rename_item(const string& srcname,
215 const string& dstname,
216 ostream *ss)
217{
218 int ret = can_rename_item(srcname, dstname, ss);
219 if (ret < 0)
220 return ret;
221 int oldid = get_item_id(srcname);
222 return set_item_name(oldid, dstname);
223}
224
225int CrushWrapper::can_rename_bucket(const string& srcname,
226 const string& dstname,
227 ostream *ss) const
228{
229 int ret = can_rename_item(srcname, dstname, ss);
230 if (ret)
231 return ret;
232 int srcid = get_item_id(srcname);
233 if (srcid >= 0) {
234 *ss << "srcname = '" << srcname << "' is not a bucket "
235 << "because its id = " << srcid << " is >= 0";
236 return -ENOTDIR;
237 }
238 return 0;
239}
240
241int CrushWrapper::rename_bucket(const string& srcname,
242 const string& dstname,
243 ostream *ss)
244{
245 int ret = can_rename_bucket(srcname, dstname, ss);
246 if (ret < 0)
247 return ret;
248 int oldid = get_item_id(srcname);
249 return set_item_name(oldid, dstname);
250}
251
b5b8bbf5
FG
252int CrushWrapper::rename_rule(const string& srcname,
253 const string& dstname,
254 ostream *ss)
255{
256 if (!rule_exists(srcname)) {
257 if (ss) {
258 *ss << "source rule name '" << srcname << "' does not exist";
259 }
260 return -ENOENT;
261 }
262 if (rule_exists(dstname)) {
263 if (ss) {
264 *ss << "destination rule name '" << dstname << "' already exists";
265 }
266 return -EEXIST;
267 }
268 int rule_id = get_rule_id(srcname);
269 auto it = rule_name_map.find(rule_id);
11fdf7f2 270 ceph_assert(it != rule_name_map.end());
b5b8bbf5
FG
271 it->second = dstname;
272 if (have_rmaps) {
273 rule_name_rmap.erase(srcname);
274 rule_name_rmap[dstname] = rule_id;
275 }
276 return 0;
277}
278
3efd9988 279void CrushWrapper::find_takes(set<int> *roots) const
7c673cae
FG
280{
281 for (unsigned i=0; i<crush->max_rules; i++) {
282 crush_rule *r = crush->rules[i];
283 if (!r)
284 continue;
285 for (unsigned j=0; j<r->len; j++) {
286 if (r->steps[j].op == CRUSH_RULE_TAKE)
3efd9988 287 roots->insert(r->steps[j].arg1);
7c673cae
FG
288 }
289 }
290}
291
28e407b8
AA
292void CrushWrapper::find_takes_by_rule(int rule, set<int> *roots) const
293{
294 if (rule < 0 || rule >= (int)crush->max_rules)
295 return;
296 crush_rule *r = crush->rules[rule];
297 if (!r)
298 return;
299 for (unsigned i = 0; i < r->len; i++) {
300 if (r->steps[i].op == CRUSH_RULE_TAKE)
301 roots->insert(r->steps[i].arg1);
302 }
303}
304
3efd9988 305void CrushWrapper::find_roots(set<int> *roots) const
7c673cae
FG
306{
307 for (int i = 0; i < crush->max_buckets; i++) {
308 if (!crush->buckets[i])
309 continue;
310 crush_bucket *b = crush->buckets[i];
311 if (!_search_item_exists(b->id))
3efd9988 312 roots->insert(b->id);
7c673cae
FG
313 }
314}
315
316bool CrushWrapper::subtree_contains(int root, int item) const
317{
318 if (root == item)
319 return true;
320
321 if (root >= 0)
322 return false; // root is a leaf
323
324 const crush_bucket *b = get_bucket(root);
325 if (IS_ERR(b))
326 return false;
327
328 for (unsigned j=0; j<b->size; j++) {
329 if (subtree_contains(b->items[j], item))
330 return true;
331 }
332 return false;
333}
334
335bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
336{
337 // last instance?
338 if (_search_item_exists(item)) {
339 return false;
340 }
341 if (item < 0 && _bucket_is_in_use(item)) {
342 return false;
343 }
344
345 if (item < 0 && !unlink_only) {
346 crush_bucket *t = get_bucket(item);
347 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
348 crush_remove_bucket(crush, t);
349 if (class_bucket.count(item) != 0)
350 class_bucket.erase(item);
224ce89b 351 class_remove_item(item);
28e407b8 352 update_choose_args(cct);
7c673cae
FG
353 }
354 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
355 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
356 name_map.erase(item);
357 have_rmaps = false;
c07f9fc5
FG
358 if (item >= 0 && !unlink_only) {
359 class_remove_item(item);
360 }
7c673cae 361 }
11fdf7f2 362 rebuild_roots_with_classes(cct);
7c673cae
FG
363 return true;
364}
365
11fdf7f2 366int CrushWrapper::remove_root(CephContext *cct, int item)
7c673cae 367{
7c673cae 368 crush_bucket *b = get_bucket(item);
d2e6a577
FG
369 if (IS_ERR(b)) {
370 // should be idempotent
371 // e.g.: we use 'crush link' to link same host into
372 // different roots, which as a result can cause different
373 // shadow trees reference same hosts too. This means
374 // we may need to destory the same buckets(hosts, racks, etc.)
375 // multiple times during rebuilding all shadow trees.
376 return 0;
377 }
7c673cae
FG
378
379 for (unsigned n = 0; n < b->size; n++) {
380 if (b->items[n] >= 0)
381 continue;
11fdf7f2 382 int r = remove_root(cct, b->items[n]);
7c673cae
FG
383 if (r < 0)
384 return r;
385 }
386
387 crush_remove_bucket(crush, b);
388 if (name_map.count(item) != 0) {
389 name_map.erase(item);
390 have_rmaps = false;
391 }
392 if (class_bucket.count(item) != 0)
393 class_bucket.erase(item);
c07f9fc5 394 class_remove_item(item);
11fdf7f2 395 update_choose_args(cct);
7c673cae
FG
396 return 0;
397}
398
28e407b8
AA
399void CrushWrapper::update_choose_args(CephContext *cct)
400{
401 for (auto& i : choose_args) {
402 crush_choose_arg_map &arg_map = i.second;
11fdf7f2 403 assert(arg_map.size == (unsigned)crush->max_buckets);
28e407b8
AA
404 unsigned positions = get_choose_args_positions(arg_map);
405 for (int j = 0; j < crush->max_buckets; ++j) {
406 crush_bucket *b = crush->buckets[j];
11fdf7f2 407 assert(j < (int)arg_map.size);
28e407b8
AA
408 auto& carg = arg_map.args[j];
409 // strip out choose_args for any buckets that no longer exist
410 if (!b || b->alg != CRUSH_BUCKET_STRAW2) {
411 if (carg.ids) {
412 if (cct)
11fdf7f2
TL
413 ldout(cct,10) << __func__ << " removing " << i.first << " bucket "
414 << (-1-j) << " ids" << dendl;
28e407b8
AA
415 free(carg.ids);
416 carg.ids = 0;
417 carg.ids_size = 0;
418 }
419 if (carg.weight_set) {
420 if (cct)
11fdf7f2
TL
421 ldout(cct,10) << __func__ << " removing " << i.first << " bucket "
422 << (-1-j) << " weight_sets" << dendl;
28e407b8
AA
423 for (unsigned p = 0; p < carg.weight_set_positions; ++p) {
424 free(carg.weight_set[p].weights);
425 }
426 free(carg.weight_set);
427 carg.weight_set = 0;
428 carg.weight_set_positions = 0;
429 }
430 continue;
431 }
432 if (carg.weight_set_positions == 0) {
433 continue; // skip it
434 }
435 if (carg.weight_set_positions != positions) {
436 if (cct)
437 lderr(cct) << __func__ << " " << i.first << " bucket "
438 << (-1-j) << " positions " << carg.weight_set_positions
439 << " -> " << positions << dendl;
440 continue; // wth... skip!
441 }
442 // mis-sized weight_sets? this shouldn't ever happen.
443 for (unsigned p = 0; p < positions; ++p) {
444 if (carg.weight_set[p].size != b->size) {
445 if (cct)
446 lderr(cct) << __func__ << " fixing " << i.first << " bucket "
447 << (-1-j) << " position " << p
448 << " size " << carg.weight_set[p].size << " -> "
449 << b->size << dendl;
450 auto old_ws = carg.weight_set[p];
451 carg.weight_set[p].size = b->size;
452 carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
453 auto max = std::min<unsigned>(old_ws.size, b->size);
454 for (unsigned k = 0; k < max; ++k) {
455 carg.weight_set[p].weights[k] = old_ws.weights[k];
456 }
457 free(old_ws.weights);
458 }
459 }
460 }
461 }
462}
463
7c673cae
FG
464int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
465{
c07f9fc5
FG
466 ldout(cct, 5) << "remove_item " << item
467 << (unlink_only ? " unlink_only":"") << dendl;
7c673cae
FG
468
469 int ret = -ENOENT;
470
471 if (item < 0 && !unlink_only) {
472 crush_bucket *t = get_bucket(item);
473 if (IS_ERR(t)) {
c07f9fc5
FG
474 ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
475 << dendl;
7c673cae
FG
476 return -ENOENT;
477 }
478
479 if (t->size) {
480 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
481 << " items, not empty" << dendl;
482 return -ENOTEMPTY;
483 }
484 if (_bucket_is_in_use(item)) {
485 return -EBUSY;
486 }
487 }
488
489 for (int i = 0; i < crush->max_buckets; i++) {
490 if (!crush->buckets[i])
491 continue;
492 crush_bucket *b = crush->buckets[i];
493
494 for (unsigned i=0; i<b->size; ++i) {
495 int id = b->items[i];
496 if (id == item) {
497 ldout(cct, 5) << "remove_item removing item " << item
498 << " from bucket " << b->id << dendl;
11fdf7f2 499 adjust_item_weight_in_bucket(cct, item, 0, b->id, true);
31f18b77 500 bucket_remove_item(b, item);
7c673cae
FG
501 ret = 0;
502 }
503 }
504 }
505
506 if (_maybe_remove_last_instance(cct, item, unlink_only))
507 ret = 0;
508
509 return ret;
510}
511
512bool CrushWrapper::_search_item_exists(int item) const
513{
514 for (int i = 0; i < crush->max_buckets; i++) {
515 if (!crush->buckets[i])
516 continue;
517 crush_bucket *b = crush->buckets[i];
518 for (unsigned j=0; j<b->size; ++j) {
519 if (b->items[j] == item)
520 return true;
521 }
522 }
523 return false;
524}
525
526bool CrushWrapper::_bucket_is_in_use(int item)
527{
528 for (auto &i : class_bucket)
529 for (auto &j : i.second)
530 if (j.second == item)
531 return true;
532 for (unsigned i = 0; i < crush->max_rules; ++i) {
533 crush_rule *r = crush->rules[i];
534 if (!r)
535 continue;
536 for (unsigned j = 0; j < r->len; ++j) {
537 if (r->steps[j].op == CRUSH_RULE_TAKE) {
538 int step_item = r->steps[j].arg1;
539 int original_item;
540 int c;
541 int res = split_id_class(step_item, &original_item, &c);
542 if (res < 0)
543 return false;
544 if (step_item == item || original_item == item)
545 return true;
546 }
547 }
548 }
549 return false;
550}
551
c07f9fc5
FG
552int CrushWrapper::_remove_item_under(
553 CephContext *cct, int item, int ancestor, bool unlink_only)
7c673cae
FG
554{
555 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
556 << (unlink_only ? " unlink_only":"") << dendl;
557
558 if (ancestor >= 0) {
559 return -EINVAL;
560 }
561
562 if (!bucket_exists(ancestor))
563 return -EINVAL;
564
565 int ret = -ENOENT;
566
567 crush_bucket *b = get_bucket(ancestor);
568 for (unsigned i=0; i<b->size; ++i) {
569 int id = b->items[i];
570 if (id == item) {
c07f9fc5
FG
571 ldout(cct, 5) << "_remove_item_under removing item " << item
572 << " from bucket " << b->id << dendl;
11fdf7f2 573 adjust_item_weight_in_bucket(cct, item, 0, b->id, true);
35e4c445 574 bucket_remove_item(b, item);
7c673cae
FG
575 ret = 0;
576 } else if (id < 0) {
577 int r = remove_item_under(cct, item, id, unlink_only);
578 if (r == 0)
579 ret = 0;
580 }
581 }
582 return ret;
583}
584
c07f9fc5
FG
585int CrushWrapper::remove_item_under(
586 CephContext *cct, int item, int ancestor, bool unlink_only)
7c673cae
FG
587{
588 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
589 << (unlink_only ? " unlink_only":"") << dendl;
590
591 if (!unlink_only && _bucket_is_in_use(item)) {
592 return -EBUSY;
593 }
594
595 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
596 if (ret < 0)
597 return ret;
598
599 if (item < 0 && !unlink_only) {
600 crush_bucket *t = get_bucket(item);
601 if (IS_ERR(t)) {
602 ldout(cct, 1) << "remove_item_under bucket " << item
603 << " does not exist" << dendl;
604 return -ENOENT;
605 }
606
607 if (t->size) {
608 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
609 << " items, not empty" << dendl;
610 return -ENOTEMPTY;
611 }
612 }
613
614 if (_maybe_remove_last_instance(cct, item, unlink_only))
615 ret = 0;
616
617 return ret;
618}
619
620int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
11fdf7f2 621 const std::multimap<string,string>& loc) const
7c673cae
FG
622{
623 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
624 if (!item_exists(id))
625 return -ENOENT;
626 map<string,string> id_loc = get_full_location(id);
627 ldout(cct, 20) << " id is at " << id_loc << dendl;
628
629 for (map<int,string>::const_iterator p = type_map.begin();
630 p != type_map.end();
631 ++p) {
632 map<string,string>::iterator ip = id_loc.find(p->second);
633 if (ip == id_loc.end())
634 continue;
635 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
636 q != loc.end();
637 ++q) {
638 if (q->first != p->second)
639 break;
640 if (q->second == ip->second)
641 return p->first;
642 }
643 }
644 return -ERANGE;
645}
646
647int CrushWrapper::parse_loc_map(const std::vector<string>& args,
648 std::map<string,string> *ploc)
649{
650 ploc->clear();
651 for (unsigned i = 0; i < args.size(); ++i) {
652 const char *s = args[i].c_str();
653 const char *pos = strchr(s, '=');
654 if (!pos)
655 return -EINVAL;
656 string key(s, 0, pos-s);
657 string value(pos+1);
658 if (value.length())
659 (*ploc)[key] = value;
660 else
661 return -EINVAL;
662 }
663 return 0;
664}
665
666int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
667 std::multimap<string,string> *ploc)
668{
669 ploc->clear();
670 for (unsigned i = 0; i < args.size(); ++i) {
671 const char *s = args[i].c_str();
672 const char *pos = strchr(s, '=');
673 if (!pos)
674 return -EINVAL;
675 string key(s, 0, pos-s);
676 string value(pos+1);
677 if (value.length())
678 ploc->insert(make_pair(key, value));
679 else
680 return -EINVAL;
681 }
682 return 0;
683}
684
685bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
686 int *weight)
687{
688 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
689
690 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
691 // ignore device
692 if (p->first == 0)
693 continue;
694
695 // ignore types that aren't specified in loc
696 map<string,string>::const_iterator q = loc.find(p->second);
697 if (q == loc.end()) {
698 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
699 << type_map << ")" << dendl;
700 continue;
701 }
702
703 if (!name_exists(q->second)) {
704 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
705 return false;
706 }
707
708 int id = get_item_id(q->second);
709 if (id >= 0) {
710 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
711 << " is a device, not bucket" << dendl;
712 return false;
713 }
714
11fdf7f2 715 ceph_assert(bucket_exists(id));
7c673cae
FG
716 crush_bucket *b = get_bucket(id);
717
718 // see if item exists in this bucket
719 for (unsigned j=0; j<b->size; j++) {
720 if (b->items[j] == item) {
721 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
722 if (weight)
723 *weight = crush_get_bucket_item_weight(b, j);
724 return true;
725 }
726 }
727 return false;
728 }
729
94b18763 730 ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl;
7c673cae
FG
731 return false;
732}
733
11fdf7f2 734map<string, string> CrushWrapper::get_full_location(int id) const
7c673cae
FG
735{
736 vector<pair<string, string> > full_location_ordered;
737 map<string,string> full_location;
738
739 get_full_location_ordered(id, full_location_ordered);
740
741 std::copy(full_location_ordered.begin(),
742 full_location_ordered.end(),
743 std::inserter(full_location, full_location.begin()));
744
745 return full_location;
746}
747
11fdf7f2
TL
748int CrushWrapper::get_full_location(const string& name,
749 map<string,string> *ploc)
750{
751 build_rmaps();
752 auto p = name_rmap.find(name);
753 if (p == name_rmap.end()) {
754 return -ENOENT;
755 }
756 *ploc = get_full_location(p->second);
757 return 0;
758}
759
760int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path) const
7c673cae
FG
761{
762 if (!item_exists(id))
763 return -ENOENT;
764 int cur = id;
765 int ret;
766 while (true) {
767 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
768 if (ret != 0)
769 break;
770 path.push_back(parent_coord);
771 cur = get_item_id(parent_coord.second);
772 }
773 return 0;
774}
775
11fdf7f2 776string CrushWrapper::get_full_location_ordered_string(int id) const
31f18b77
FG
777{
778 vector<pair<string, string> > full_location_ordered;
779 string full_location;
780 get_full_location_ordered(id, full_location_ordered);
781 reverse(begin(full_location_ordered), end(full_location_ordered));
782 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
783 full_location = full_location + i->first + "=" + i->second;
784 if (i != full_location_ordered.end() - 1) {
785 full_location = full_location + ",";
786 }
787 }
788 return full_location;
789}
7c673cae 790
11fdf7f2 791map<int, string> CrushWrapper::get_parent_hierarchy(int id) const
7c673cae
FG
792{
793 map<int,string> parent_hierarchy;
794 pair<string, string> parent_coord = get_immediate_parent(id);
795 int parent_id;
796
797 // get the integer type for id and create a counter from there
798 int type_counter = get_bucket_type(id);
799
800 // if we get a negative type then we can assume that we have an OSD
801 // change behavior in get_item_type FIXME
802 if (type_counter < 0)
803 type_counter = 0;
804
805 // read the type map and get the name of the type with the largest ID
806 int high_type = 0;
11fdf7f2
TL
807 if (!type_map.empty())
808 high_type = type_map.rbegin()->first;
7c673cae
FG
809
810 parent_id = get_item_id(parent_coord.second);
811
812 while (type_counter < high_type) {
813 type_counter++;
814 parent_hierarchy[ type_counter ] = parent_coord.first;
815
816 if (type_counter < high_type){
817 // get the coordinate information for the next parent
818 parent_coord = get_immediate_parent(parent_id);
819 parent_id = get_item_id(parent_coord.second);
820 }
821 }
822
823 return parent_hierarchy;
824}
825
11fdf7f2 826int CrushWrapper::get_children(int id, list<int> *children) const
7c673cae
FG
827{
828 // leaf?
829 if (id >= 0) {
830 return 0;
831 }
832
11fdf7f2 833 auto *b = get_bucket(id);
7c673cae
FG
834 if (IS_ERR(b)) {
835 return -ENOENT;
836 }
837
838 for (unsigned n=0; n<b->size; n++) {
839 children->push_back(b->items[n]);
840 }
841 return b->size;
842}
843
11fdf7f2
TL
844int CrushWrapper::get_all_children(int id, set<int> *children) const
845{
846 // leaf?
847 if (id >= 0) {
848 return 0;
849 }
850
851 auto *b = get_bucket(id);
852 if (IS_ERR(b)) {
853 return -ENOENT;
854 }
855
856 int c = 0;
857 for (unsigned n = 0; n < b->size; n++) {
858 children->insert(b->items[n]);
859 c++;
860 auto r = get_all_children(b->items[n], children);
861 if (r < 0)
862 return r;
863 c += r;
864 }
865 return c;
866}
867
28e407b8
AA
868void CrushWrapper::get_children_of_type(int id,
869 int type,
11fdf7f2 870 vector<int> *children,
28e407b8
AA
871 bool exclude_shadow) const
872{
873 if (id >= 0) {
874 if (type == 0) {
875 // want leaf?
11fdf7f2 876 children->push_back(id);
28e407b8
AA
877 }
878 return;
879 }
880 auto b = get_bucket(id);
881 if (IS_ERR(b)) {
882 return;
883 }
884 if (b->type < type) {
885 // give up
886 return;
887 } else if (b->type == type) {
888 if (!is_shadow_item(b->id) || !exclude_shadow) {
11fdf7f2 889 children->push_back(b->id);
28e407b8
AA
890 }
891 return;
892 }
893 for (unsigned n = 0; n < b->size; n++) {
894 get_children_of_type(b->items[n], type, children, exclude_shadow);
895 }
896}
897
a8e16298
TL
898int CrushWrapper::verify_upmap(CephContext *cct,
899 int rule_id,
900 int pool_size,
901 const vector<int>& up)
902{
903 auto rule = get_rule(rule_id);
904 if (IS_ERR(rule) || !rule) {
905 lderr(cct) << __func__ << " rule " << rule_id << " does not exist"
906 << dendl;
94b18763
FG
907 return -ENOENT;
908 }
9f95a23c
TL
909 int root_bucket = 0;
910 int cursor = 0;
911 std::map<int, int> type_stack;
a8e16298
TL
912 for (unsigned step = 0; step < rule->len; ++step) {
913 auto curstep = &rule->steps[step];
914 ldout(cct, 10) << __func__ << " step " << step << dendl;
915 switch (curstep->op) {
9f95a23c
TL
916 case CRUSH_RULE_TAKE:
917 {
918 root_bucket = curstep->arg1;
919 }
920 break;
a8e16298
TL
921 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
922 case CRUSH_RULE_CHOOSELEAF_INDEP:
923 {
9f95a23c 924 int numrep = curstep->arg1;
a8e16298 925 int type = curstep->arg2;
9f95a23c
TL
926 if (numrep <= 0)
927 numrep += pool_size;
928 type_stack.emplace(type, numrep);
a8e16298
TL
929 if (type == 0) // osd
930 break;
931 map<int, set<int>> osds_by_parent; // parent_of_desired_type -> osds
932 for (auto osd : up) {
933 auto parent = get_parent_of_type(osd, type, rule_id);
934 if (parent < 0) {
935 osds_by_parent[parent].insert(osd);
936 } else {
937 ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
938 << ", skipping for now"
939 << dendl;
940 }
941 }
942 for (auto i : osds_by_parent) {
943 if (i.second.size() > 1) {
944 lderr(cct) << __func__ << " multiple osds " << i.second
945 << " come from same failure domain " << i.first
946 << dendl;
947 return -EINVAL;
948 }
949 }
950 }
951 break;
952
953 case CRUSH_RULE_CHOOSE_FIRSTN:
954 case CRUSH_RULE_CHOOSE_INDEP:
955 {
956 int numrep = curstep->arg1;
957 int type = curstep->arg2;
a8e16298
TL
958 if (numrep <= 0)
959 numrep += pool_size;
9f95a23c
TL
960 type_stack.emplace(type, numrep);
961 if (type == 0) // osd
962 break;
a8e16298
TL
963 set<int> parents_of_type;
964 for (auto osd : up) {
965 auto parent = get_parent_of_type(osd, type, rule_id);
966 if (parent < 0) {
967 parents_of_type.insert(parent);
968 } else {
969 ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
970 << ", skipping for now"
971 << dendl;
972 }
973 }
974 if ((int)parents_of_type.size() > numrep) {
975 lderr(cct) << __func__ << " number of buckets "
976 << parents_of_type.size() << " exceeds desired " << numrep
977 << dendl;
978 return -EINVAL;
979 }
980 }
981 break;
982
9f95a23c
TL
983 case CRUSH_RULE_EMIT:
984 {
985 if (root_bucket < 0) {
986 int num_osds = 1;
987 for (auto &item : type_stack) {
988 num_osds *= item.second;
989 }
990 // validate the osd's in subtree
991 for (int c = 0; cursor < (int)up.size() && c < num_osds; ++cursor, ++c) {
992 int osd = up[cursor];
993 if (!subtree_contains(root_bucket, osd)) {
994 lderr(cct) << __func__ << " osd " << osd << " not in bucket " << root_bucket << dendl;
995 return -EINVAL;
996 }
997 }
998 }
999 type_stack.clear();
1000 root_bucket = 0;
1001 }
1002 break;
a8e16298
TL
1003 default:
1004 // ignore
1005 break;
94b18763
FG
1006 }
1007 }
a8e16298 1008 return 0;
94b18763
FG
1009}
1010
11fdf7f2 1011int CrushWrapper::_get_leaves(int id, list<int> *leaves) const
31f18b77 1012{
11fdf7f2 1013 ceph_assert(leaves);
7c673cae 1014
31f18b77
FG
1015 // Already leaf?
1016 if (id >= 0) {
1017 leaves->push_back(id);
1018 return 0;
1019 }
1020
11fdf7f2 1021 auto b = get_bucket(id);
31f18b77
FG
1022 if (IS_ERR(b)) {
1023 return -ENOENT;
1024 }
1025
1026 for (unsigned n = 0; n < b->size; n++) {
1027 if (b->items[n] >= 0) {
1028 leaves->push_back(b->items[n]);
1029 } else {
1030 // is a bucket, do recursive call
1031 int r = _get_leaves(b->items[n], leaves);
1032 if (r < 0) {
1033 return r;
1034 }
1035 }
1036 }
1037
1038 return 0; // all is well
1039}
1040
11fdf7f2 1041int CrushWrapper::get_leaves(const string &name, set<int> *leaves) const
7c673cae 1042{
11fdf7f2 1043 ceph_assert(leaves);
31f18b77
FG
1044 leaves->clear();
1045
1046 if (!name_exists(name)) {
1047 return -ENOENT;
1048 }
1049
1050 int id = get_item_id(name);
1051 if (id >= 0) {
1052 // already leaf
1053 leaves->insert(id);
1054 return 0;
7c673cae
FG
1055 }
1056
31f18b77
FG
1057 list<int> unordered;
1058 int r = _get_leaves(id, &unordered);
1059 if (r < 0) {
1060 return r;
1061 }
1062
1063 for (auto &p : unordered) {
1064 leaves->insert(p);
1065 }
1066
1067 return 0;
1068}
1069
c07f9fc5
FG
1070int CrushWrapper::insert_item(
1071 CephContext *cct, int item, float weight, string name,
11fdf7f2
TL
1072 const map<string,string>& loc, // typename -> bucketname
1073 bool init_weight_sets)
31f18b77 1074{
7c673cae
FG
1075 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
1076 << " name " << name << " loc " << loc << dendl;
1077
1078 if (!is_valid_crush_name(name))
1079 return -EINVAL;
1080
1081 if (!is_valid_crush_loc(cct, loc))
1082 return -EINVAL;
1083
224ce89b
WB
1084 int r = validate_weightf(weight);
1085 if (r < 0) {
1086 return r;
1087 }
1088
7c673cae
FG
1089 if (name_exists(name)) {
1090 if (get_item_id(name) != item) {
1091 ldout(cct, 10) << "device name '" << name << "' already exists as id "
1092 << get_item_id(name) << dendl;
1093 return -EEXIST;
1094 }
1095 } else {
1096 set_item_name(item, name);
1097 }
1098
1099 int cur = item;
1100
f67539c2
TL
1101 // 1. create locations if locations don't exist
1102 // 2. add child in the location with 0 weight.
1103 // Check more detail of insert_item method declared in
1104 // CrushWrapper.h
c07f9fc5 1105 for (auto p = type_map.begin(); p != type_map.end(); ++p) {
7c673cae
FG
1106 // ignore device type
1107 if (p->first == 0)
1108 continue;
1109
1110 // skip types that are unspecified
1111 map<string,string>::const_iterator q = loc.find(p->second);
1112 if (q == loc.end()) {
c07f9fc5
FG
1113 ldout(cct, 2) << "warning: did not specify location for '"
1114 << p->second << "' level (levels are "
7c673cae
FG
1115 << type_map << ")" << dendl;
1116 continue;
1117 }
1118
1119 if (!name_exists(q->second)) {
1120 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
f67539c2 1121 int zero_weight = 0, new_bucket_id;
7c673cae 1122 int r = add_bucket(0, 0,
f67539c2 1123 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &zero_weight, &new_bucket_id);
7c673cae 1124 if (r < 0) {
c07f9fc5
FG
1125 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
1126 << dendl;
7c673cae
FG
1127 return r;
1128 }
f67539c2 1129 set_item_name(new_bucket_id, q->second);
7c673cae 1130
f67539c2 1131 cur = new_bucket_id;
7c673cae
FG
1132 continue;
1133 }
1134
1135 // add to an existing bucket
1136 int id = get_item_id(q->second);
1137 if (!bucket_exists(id)) {
1138 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
1139 return -EINVAL;
1140 }
1141
1142 // check that we aren't creating a cycle.
1143 if (subtree_contains(id, cur)) {
c07f9fc5
FG
1144 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
1145 << id << dendl;
7c673cae
FG
1146 return -EINVAL;
1147 }
1148
1149 // we have done sanity check above
1150 crush_bucket *b = get_bucket(id);
1151
1152 if (p->first != b->type) {
1153 ldout(cct, 1) << "insert_item existing bucket has type "
1154 << "'" << type_map[b->type] << "' != "
1155 << "'" << type_map[p->first] << "'" << dendl;
1156 return -EINVAL;
1157 }
1158
1159 // are we forming a loop?
1160 if (subtree_contains(cur, b->id)) {
1161 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
1162 << "; cannot form loop" << dendl;
1163 return -ELOOP;
1164 }
1165
1166 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
1167 << " to bucket " << id << dendl;
11fdf7f2
TL
1168 [[maybe_unused]] int r = bucket_add_item(b, cur, 0);
1169 ceph_assert(!r);
7c673cae
FG
1170 break;
1171 }
1172
1173 // adjust the item's weight in location
11fdf7f2
TL
1174 if (adjust_item_weightf_in_loc(cct, item, weight, loc,
1175 item >= 0 && init_weight_sets) > 0) {
7c673cae
FG
1176 if (item >= crush->max_devices) {
1177 crush->max_devices = item + 1;
c07f9fc5
FG
1178 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
1179 << dendl;
1180 }
11fdf7f2 1181 r = rebuild_roots_with_classes(cct);
c07f9fc5
FG
1182 if (r < 0) {
1183 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
1184 << cpp_strerror(r) << dendl;
1185 return r;
7c673cae
FG
1186 }
1187 return 0;
1188 }
1189
c07f9fc5
FG
1190 ldout(cct, 1) << "error: didn't find anywhere to add item " << item
1191 << " in " << loc << dendl;
7c673cae
FG
1192 return -EINVAL;
1193}
1194
c07f9fc5
FG
1195
1196int CrushWrapper::move_bucket(
1197 CephContext *cct, int id, const map<string,string>& loc)
7c673cae 1198{
7c673cae
FG
1199 // sorry this only works for buckets
1200 if (id >= 0)
1201 return -EINVAL;
1202
1203 if (!item_exists(id))
1204 return -ENOENT;
1205
1206 // get the name of the bucket we are trying to move for later
1207 string id_name = get_item_name(id);
1208
1209 // detach the bucket
1210 int bucket_weight = detach_bucket(cct, id);
1211
1212 // insert the bucket back into the hierarchy
11fdf7f2
TL
1213 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc,
1214 false);
7c673cae
FG
1215}
1216
c07f9fc5
FG
1217int CrushWrapper::detach_bucket(CephContext *cct, int item)
1218{
1219 if (!crush)
1220 return (-EINVAL);
1221
1222 if (item >= 0)
1223 return (-EINVAL);
1224
1225 // check that the bucket that we want to detach exists
11fdf7f2 1226 ceph_assert(bucket_exists(item));
c07f9fc5
FG
1227
1228 // get the bucket's weight
1229 crush_bucket *b = get_bucket(item);
1230 unsigned bucket_weight = b->weight;
1231
1232 // get where the bucket is located
1233 pair<string, string> bucket_location = get_immediate_parent(item);
1234
1235 // get the id of the parent bucket
1236 int parent_id = get_item_id(bucket_location.second);
1237
1238 // get the parent bucket
1239 crush_bucket *parent_bucket = get_bucket(parent_id);
1240
1241 if (!IS_ERR(parent_bucket)) {
1242 // zero out the bucket weight
11fdf7f2 1243 adjust_item_weight_in_bucket(cct, item, 0, parent_bucket->id, true);
c07f9fc5
FG
1244
1245 // remove the bucket from the parent
1246 bucket_remove_item(parent_bucket, item);
1247 } else if (PTR_ERR(parent_bucket) != -ENOENT) {
1248 return PTR_ERR(parent_bucket);
1249 }
1250
1251 // check that we're happy
1252 int test_weight = 0;
1253 map<string,string> test_location;
1254 test_location[ bucket_location.first ] = (bucket_location.second);
1255
1256 bool successful_detach = !(check_item_loc(cct, item, test_location,
1257 &test_weight));
11fdf7f2
TL
1258 ceph_assert(successful_detach);
1259 ceph_assert(test_weight == 0);
c07f9fc5
FG
1260
1261 return bucket_weight;
1262}
1263
11fdf7f2
TL
1264bool CrushWrapper::is_parent_of(int child, int p) const
1265{
1266 int parent = 0;
1267 while (!get_immediate_parent_id(child, &parent)) {
1268 if (parent == p) {
1269 return true;
1270 }
1271 child = parent;
1272 }
1273 return false;
1274}
1275
31f18b77 1276int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
7c673cae 1277{
31f18b77
FG
1278 if (src >= 0 || dst >= 0)
1279 return -EINVAL;
1280 if (!item_exists(src) || !item_exists(dst))
1281 return -EINVAL;
1282 crush_bucket *a = get_bucket(src);
1283 crush_bucket *b = get_bucket(dst);
11fdf7f2
TL
1284 if (is_parent_of(a->id, b->id) || is_parent_of(b->id, a->id)) {
1285 return -EINVAL;
1286 }
31f18b77
FG
1287 unsigned aw = a->weight;
1288 unsigned bw = b->weight;
1289
1290 // swap weights
1291 adjust_item_weight(cct, a->id, bw);
1292 adjust_item_weight(cct, b->id, aw);
1293
1294 // swap items
1295 map<int,unsigned> tmp;
1296 unsigned as = a->size;
1297 unsigned bs = b->size;
1298 for (unsigned i = 0; i < as; ++i) {
1299 int item = a->items[0];
1300 int itemw = crush_get_bucket_item_weight(a, 0);
1301 tmp[item] = itemw;
1302 bucket_remove_item(a, item);
1303 }
11fdf7f2
TL
1304 ceph_assert(a->size == 0);
1305 ceph_assert(b->size == bs);
31f18b77
FG
1306 for (unsigned i = 0; i < bs; ++i) {
1307 int item = b->items[0];
1308 int itemw = crush_get_bucket_item_weight(b, 0);
1309 bucket_remove_item(b, item);
1310 bucket_add_item(a, item, itemw);
1311 }
11fdf7f2
TL
1312 ceph_assert(a->size == bs);
1313 ceph_assert(b->size == 0);
31f18b77
FG
1314 for (auto t : tmp) {
1315 bucket_add_item(b, t.first, t.second);
1316 }
11fdf7f2
TL
1317 ceph_assert(a->size == bs);
1318 ceph_assert(b->size == as);
31f18b77
FG
1319
1320 // swap names
1321 swap_names(src, dst);
11fdf7f2 1322 return rebuild_roots_with_classes(cct);
31f18b77 1323}
7c673cae 1324
c07f9fc5
FG
1325int CrushWrapper::link_bucket(
1326 CephContext *cct, int id, const map<string,string>& loc)
31f18b77 1327{
7c673cae
FG
1328 // sorry this only works for buckets
1329 if (id >= 0)
1330 return -EINVAL;
1331
1332 if (!item_exists(id))
1333 return -ENOENT;
1334
1335 // get the name of the bucket we are trying to move for later
1336 string id_name = get_item_name(id);
1337
1338 crush_bucket *b = get_bucket(id);
1339 unsigned bucket_weight = b->weight;
1340
1341 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1342}
1343
c07f9fc5
FG
1344int CrushWrapper::create_or_move_item(
1345 CephContext *cct, int item, float weight, string name,
11fdf7f2
TL
1346 const map<string,string>& loc, // typename -> bucketname
1347 bool init_weight_sets)
7c673cae 1348{
7c673cae
FG
1349 int ret = 0;
1350 int old_iweight;
1351
1352 if (!is_valid_crush_name(name))
1353 return -EINVAL;
1354
1355 if (check_item_loc(cct, item, loc, &old_iweight)) {
c07f9fc5
FG
1356 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
1357 << dendl;
7c673cae
FG
1358 } else {
1359 if (_search_item_exists(item)) {
1360 weight = get_item_weightf(item);
c07f9fc5
FG
1361 ldout(cct, 10) << "create_or_move_item " << item
1362 << " exists with weight " << weight << dendl;
7c673cae
FG
1363 remove_item(cct, item, true);
1364 }
c07f9fc5
FG
1365 ldout(cct, 5) << "create_or_move_item adding " << item
1366 << " weight " << weight
7c673cae 1367 << " at " << loc << dendl;
11fdf7f2
TL
1368 ret = insert_item(cct, item, weight, name, loc,
1369 item >= 0 && init_weight_sets);
7c673cae
FG
1370 if (ret == 0)
1371 ret = 1; // changed
1372 }
1373 return ret;
1374}
1375
c07f9fc5
FG
1376int CrushWrapper::update_item(
1377 CephContext *cct, int item, float weight, string name,
1378 const map<string,string>& loc) // typename -> bucketname
7c673cae 1379{
7c673cae
FG
1380 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1381 << " name " << name << " loc " << loc << dendl;
1382 int ret = 0;
1383
1384 if (!is_valid_crush_name(name))
1385 return -EINVAL;
1386
1387 if (!is_valid_crush_loc(cct, loc))
1388 return -EINVAL;
1389
224ce89b
WB
1390 ret = validate_weightf(weight);
1391 if (ret < 0) {
1392 return ret;
1393 }
1394
7c673cae
FG
1395 // compare quantized (fixed-point integer) weights!
1396 int iweight = (int)(weight * (float)0x10000);
1397 int old_iweight;
1398 if (check_item_loc(cct, item, loc, &old_iweight)) {
1399 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1400 if (old_iweight != iweight) {
1401 ldout(cct, 5) << "update_item " << item << " adjusting weight "
c07f9fc5
FG
1402 << ((float)old_iweight/(float)0x10000) << " -> " << weight
1403 << dendl;
7c673cae 1404 adjust_item_weight_in_loc(cct, item, iweight, loc);
f67539c2
TL
1405 ret = rebuild_roots_with_classes(cct);
1406 if (ret < 0) {
1407 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
1408 << cpp_strerror(ret) << dendl;
1409 return ret;
1410 }
7c673cae
FG
1411 ret = 1;
1412 }
1413 if (get_item_name(item) != name) {
c07f9fc5
FG
1414 ldout(cct, 5) << "update_item setting " << item << " name to " << name
1415 << dendl;
7c673cae
FG
1416 set_item_name(item, name);
1417 ret = 1;
1418 }
1419 } else {
1420 if (item_exists(item)) {
1421 remove_item(cct, item, true);
1422 }
1423 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1424 << " at " << loc << dendl;
1425 ret = insert_item(cct, item, weight, name, loc);
1426 if (ret == 0)
1427 ret = 1; // changed
1428 }
1429 return ret;
1430}
1431
1432int CrushWrapper::get_item_weight(int id) const
1433{
1434 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1435 crush_bucket *b = crush->buckets[bidx];
1436 if (b == NULL)
1437 continue;
1438 if (b->id == id)
1439 return b->weight;
1440 for (unsigned i = 0; i < b->size; i++)
1441 if (b->items[i] == id)
1442 return crush_get_bucket_item_weight(b, i);
1443 }
1444 return -ENOENT;
1445}
1446
1447int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1448{
1449 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1450
1451 int bid = get_item_id(l->second);
1452 if (!bucket_exists(bid))
1453 continue;
1454 crush_bucket *b = get_bucket(bid);
1455 for (unsigned int i = 0; i < b->size; i++) {
1456 if (b->items[i] == id) {
1457 return crush_get_bucket_item_weight(b, i);
1458 }
1459 }
1460 }
1461 return -ENOENT;
1462}
1463
11fdf7f2
TL
1464int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight,
1465 bool update_weight_sets)
7c673cae 1466{
11fdf7f2
TL
1467 ldout(cct, 5) << __func__ << " " << id << " weight " << weight
1468 << " update_weight_sets=" << (int)update_weight_sets
1469 << dendl;
7c673cae
FG
1470 int changed = 0;
1471 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
11fdf7f2 1472 if (!crush->buckets[bidx]) {
7c673cae 1473 continue;
11fdf7f2
TL
1474 }
1475 int r = adjust_item_weight_in_bucket(cct, id, weight, -1-bidx,
1476 update_weight_sets);
1477 if (r > 0) {
1478 ++changed;
1479 }
1480 }
1481 if (!changed) {
1482 return -ENOENT;
1483 }
1484 return changed;
1485}
1486
1487int CrushWrapper::adjust_item_weight_in_bucket(
1488 CephContext *cct, int id, int weight,
1489 int bucket_id,
1490 bool update_weight_sets)
1491{
1492 ldout(cct, 5) << __func__ << " " << id << " weight " << weight
1493 << " in bucket " << bucket_id
1494 << " update_weight_sets=" << (int)update_weight_sets
1495 << dendl;
1496 int changed = 0;
1497 if (!bucket_exists(bucket_id)) {
1498 return -ENOENT;
1499 }
1500 crush_bucket *b = get_bucket(bucket_id);
1501 for (unsigned int i = 0; i < b->size; i++) {
1502 if (b->items[i] == id) {
1503 int diff = bucket_adjust_item_weight(cct, b, id, weight,
1504 update_weight_sets);
1505 ldout(cct, 5) << __func__ << " " << id << " diff " << diff
1506 << " in bucket " << bucket_id << dendl;
1507 adjust_item_weight(cct, bucket_id, b->weight, false);
1508 changed++;
1509 }
1510 }
1511 // update weight-sets so they continue to sum
1512 for (auto& p : choose_args) {
1513 auto &cmap = p.second;
1514 if (!cmap.args) {
1515 continue;
1516 }
1517 crush_choose_arg *arg = &cmap.args[-1 - bucket_id];
1518 if (!arg->weight_set) {
1519 continue;
1520 }
1521 ceph_assert(arg->weight_set_positions > 0);
1522 vector<int> w(arg->weight_set_positions);
1523 for (unsigned i = 0; i < b->size; ++i) {
1524 for (unsigned j = 0; j < arg->weight_set_positions; ++j) {
1525 crush_weight_set *weight_set = &arg->weight_set[j];
1526 w[j] += weight_set->weights[i];
7c673cae
FG
1527 }
1528 }
11fdf7f2
TL
1529 ldout(cct,5) << __func__ << " adjusting bucket " << bucket_id
1530 << " cmap " << p.first << " weights to " << w << dendl;
1531 ostringstream ss;
1532 choose_args_adjust_item_weight(cct, cmap, bucket_id, w, &ss);
7c673cae 1533 }
11fdf7f2 1534 if (!changed) {
7c673cae 1535 return -ENOENT;
11fdf7f2 1536 }
7c673cae
FG
1537 return changed;
1538}
1539
11fdf7f2
TL
1540int CrushWrapper::adjust_item_weight_in_loc(
1541 CephContext *cct, int id, int weight,
1542 const map<string,string>& loc,
1543 bool update_weight_sets)
7c673cae 1544{
c07f9fc5 1545 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
11fdf7f2
TL
1546 << " in " << loc
1547 << " update_weight_sets=" << (int)update_weight_sets
1548 << dendl;
7c673cae 1549 int changed = 0;
c07f9fc5 1550 for (auto l = loc.begin(); l != loc.end(); ++l) {
7c673cae
FG
1551 int bid = get_item_id(l->second);
1552 if (!bucket_exists(bid))
1553 continue;
11fdf7f2
TL
1554 int r = adjust_item_weight_in_bucket(cct, id, weight, bid,
1555 update_weight_sets);
1556 if (r > 0) {
1557 ++changed;
7c673cae
FG
1558 }
1559 }
11fdf7f2 1560 if (!changed) {
7c673cae 1561 return -ENOENT;
11fdf7f2 1562 }
7c673cae
FG
1563 return changed;
1564}
1565
11fdf7f2
TL
1566int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight,
1567 bool update_weight_sets)
7c673cae
FG
1568{
1569 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1570 crush_bucket *b = get_bucket(id);
1571 if (IS_ERR(b))
1572 return PTR_ERR(b);
1573 int changed = 0;
1574 list<crush_bucket*> q;
1575 q.push_back(b);
1576 while (!q.empty()) {
1577 b = q.front();
1578 q.pop_front();
1579 int local_changed = 0;
1580 for (unsigned i=0; i<b->size; ++i) {
1581 int n = b->items[i];
1582 if (n >= 0) {
11fdf7f2 1583 adjust_item_weight_in_bucket(cct, n, weight, b->id, update_weight_sets);
7c673cae
FG
1584 ++changed;
1585 ++local_changed;
1586 } else {
1587 crush_bucket *sub = get_bucket(n);
1588 if (IS_ERR(sub))
1589 continue;
1590 q.push_back(sub);
1591 }
1592 }
7c673cae 1593 }
f67539c2
TL
1594 int ret = rebuild_roots_with_classes(cct);
1595 if (ret < 0) {
1596 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
1597 << cpp_strerror(ret) << dendl;
1598 return ret;
1599 }
7c673cae
FG
1600 return changed;
1601}
1602
1603bool CrushWrapper::check_item_present(int id) const
1604{
1605 bool found = false;
1606
1607 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1608 crush_bucket *b = crush->buckets[bidx];
1609 if (b == 0)
1610 continue;
1611 for (unsigned i = 0; i < b->size; i++)
1612 if (b->items[i] == id)
1613 found = true;
1614 }
1615 return found;
1616}
1617
1618
11fdf7f2 1619pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret) const
7c673cae
FG
1620{
1621
1622 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1623 crush_bucket *b = crush->buckets[bidx];
1624 if (b == 0)
1625 continue;
c07f9fc5 1626 if (is_shadow_item(b->id))
224ce89b 1627 continue;
7c673cae
FG
1628 for (unsigned i = 0; i < b->size; i++)
1629 if (b->items[i] == id) {
11fdf7f2
TL
1630 string parent_id = name_map.at(b->id);
1631 string parent_bucket_type = type_map.at(b->type);
7c673cae
FG
1632 if (_ret)
1633 *_ret = 0;
1634 return make_pair(parent_bucket_type, parent_id);
1635 }
1636 }
1637
1638 if (_ret)
1639 *_ret = -ENOENT;
1640
1641 return pair<string, string>();
1642}
1643
1644int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1645{
1646 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1647 crush_bucket *b = crush->buckets[bidx];
1648 if (b == 0)
1649 continue;
c07f9fc5 1650 if (is_shadow_item(b->id))
224ce89b 1651 continue;
7c673cae
FG
1652 for (unsigned i = 0; i < b->size; i++) {
1653 if (b->items[i] == id) {
1654 *parent = b->id;
1655 return 0;
1656 }
1657 }
1658 }
1659 return -ENOENT;
1660}
1661
28e407b8 1662int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
31f18b77 1663{
28e407b8
AA
1664 if (rule < 0) {
1665 // no rule specified
1666 do {
1667 int r = get_immediate_parent_id(item, &item);
1668 if (r < 0) {
1669 return 0;
1670 }
1671 } while (get_bucket_type(item) != type);
1672 return item;
1673 }
1674 set<int> roots;
1675 find_takes_by_rule(rule, &roots);
1676 for (auto root : roots) {
11fdf7f2 1677 vector<int> candidates;
28e407b8
AA
1678 get_children_of_type(root, type, &candidates, false);
1679 for (auto candidate : candidates) {
1680 if (subtree_contains(candidate, item)) {
1681 // note that here we assure that no two different buckets
1682 // from a single crush rule will share a same device,
1683 // which should generally be true.
1684 return candidate;
1685 }
31f18b77 1686 }
28e407b8
AA
1687 }
1688 return 0; // not found
31f18b77
FG
1689}
1690
11fdf7f2
TL
1691void CrushWrapper::get_subtree_of_type(int type, vector<int> *subtrees)
1692{
1693 set<int> roots;
1694 find_roots(&roots);
1695 for (auto r: roots) {
1696 crush_bucket *b = get_bucket(r);
1697 if (IS_ERR(b))
1698 continue;
1699 get_children_of_type(b->id, type, subtrees);
1700 }
1701}
1702
1703bool CrushWrapper::class_is_in_use(int class_id, ostream *ss)
1704{
1705 list<unsigned> rules;
1706 for (unsigned i = 0; i < crush->max_rules; ++i) {
1707 crush_rule *r = crush->rules[i];
1708 if (!r)
1709 continue;
1710 for (unsigned j = 0; j < r->len; ++j) {
1711 if (r->steps[j].op == CRUSH_RULE_TAKE) {
1712 int root = r->steps[j].arg1;
1713 for (auto &p : class_bucket) {
1714 auto& q = p.second;
1715 if (q.count(class_id) && q[class_id] == root) {
1716 rules.push_back(i);
1717 }
1718 }
1719 }
1720 }
1721 }
1722 if (rules.empty()) {
1723 return false;
1724 }
1725 if (ss) {
1726 ostringstream os;
1727 for (auto &p: rules) {
1728 os << "'" << get_rule_name(p) <<"',";
1729 }
1730 string out(os.str());
1731 out.resize(out.size() - 1); // drop last ','
1732 *ss << "still referenced by crush_rule(s): " << out;
1733 }
1734 return true;
1735}
1736
35e4c445
FG
1737int CrushWrapper::rename_class(const string& srcname, const string& dstname)
1738{
1739 auto i = class_rname.find(srcname);
1740 if (i == class_rname.end())
1741 return -ENOENT;
1742 auto j = class_rname.find(dstname);
1743 if (j != class_rname.end())
1744 return -EEXIST;
1745
1746 int class_id = i->second;
11fdf7f2 1747 ceph_assert(class_name.count(class_id));
35e4c445
FG
1748 // rename any shadow buckets of old class name
1749 for (auto &it: class_map) {
1750 if (it.first < 0 && it.second == class_id) {
1751 string old_name = get_item_name(it.first);
1752 size_t pos = old_name.find("~");
11fdf7f2 1753 ceph_assert(pos != string::npos);
35e4c445
FG
1754 string name_no_class = old_name.substr(0, pos);
1755 string old_class_name = old_name.substr(pos + 1);
11fdf7f2 1756 ceph_assert(old_class_name == srcname);
35e4c445
FG
1757 string new_name = name_no_class + "~" + dstname;
1758 // we do not use set_item_name
1759 // because the name is intentionally invalid
1760 name_map[it.first] = new_name;
1761 have_rmaps = false;
1762 }
1763 }
1764
1765 // rename class
1766 class_rname.erase(srcname);
1767 class_name.erase(class_id);
1768 class_rname[dstname] = class_id;
1769 class_name[class_id] = dstname;
1770 return 0;
1771}
1772
d2e6a577
FG
1773int CrushWrapper::populate_classes(
1774 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
c07f9fc5 1775{
d2e6a577
FG
1776 // build set of previous used shadow ids
1777 set<int32_t> used_ids;
1778 for (auto& p : old_class_bucket) {
1779 for (auto& q : p.second) {
1780 used_ids.insert(q.second);
c07f9fc5 1781 }
c07f9fc5 1782 }
35e4c445
FG
1783 // accumulate weight values for each carg and bucket as we go. because it is
1784 // depth first, we will have the nested bucket weights we need when we
1785 // finish constructing the containing buckets.
28e407b8 1786 map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> [bucket weight for each position]
7c673cae 1787 set<int> roots;
3efd9988 1788 find_nonshadow_roots(&roots);
7c673cae 1789 for (auto &r : roots) {
f67539c2 1790 assert(r < 0);
7c673cae
FG
1791 for (auto &c : class_name) {
1792 int clone;
d2e6a577 1793 int res = device_class_clone(r, c.first, old_class_bucket, used_ids,
35e4c445 1794 &clone, &cmap_item_weight);
7c673cae
FG
1795 if (res < 0)
1796 return res;
1797 }
1798 }
1799 return 0;
1800}
1801
11fdf7f2 1802int CrushWrapper::trim_roots_with_class(CephContext *cct)
7c673cae
FG
1803{
1804 set<int> roots;
3efd9988 1805 find_shadow_roots(&roots);
7c673cae
FG
1806 for (auto &r : roots) {
1807 if (r >= 0)
1808 continue;
11fdf7f2 1809 int res = remove_root(cct, r);
7c673cae
FG
1810 if (res)
1811 return res;
1812 }
1813 // there is no need to reweight because we only remove from the
1814 // root and down
1815 return 0;
1816}
1817
224ce89b
WB
1818int32_t CrushWrapper::_alloc_class_id() const {
1819 if (class_name.empty()) {
1820 return 0;
1821 }
1822 int32_t class_id = class_name.rbegin()->first + 1;
1823 if (class_id >= 0) {
1824 return class_id;
1825 }
1826 // wrapped, pick a random start and do exhaustive search
9f95a23c 1827 uint32_t upperlimit = std::numeric_limits<int32_t>::max();
224ce89b
WB
1828 upperlimit++;
1829 class_id = rand() % upperlimit;
1830 const auto start = class_id;
1831 do {
1832 if (!class_name.count(class_id)) {
1833 return class_id;
1834 } else {
1835 class_id++;
1836 if (class_id < 0) {
1837 class_id = 0;
1838 }
1839 }
1840 } while (class_id != start);
11fdf7f2 1841 ceph_abort_msg("no available class id");
224ce89b
WB
1842}
1843
f64942e4
AA
1844int CrushWrapper::set_subtree_class(
1845 const string& subtree,
1846 const string& new_class)
7c673cae 1847{
f64942e4
AA
1848 if (!name_exists(subtree)) {
1849 return -ENOENT;
1850 }
1851
11fdf7f2 1852 int new_class_id = get_or_create_class_id(new_class);
f64942e4
AA
1853 int id = get_item_id(subtree);
1854 list<int> q = { id };
1855 while (!q.empty()) {
1856 int id = q.front();
1857 q.pop_front();
1858 crush_bucket *b = get_bucket(id);
1859 if (IS_ERR(b)) {
1860 return PTR_ERR(b);
1861 }
1862 for (unsigned i = 0; i < b->size; ++i) {
1863 int item = b->items[i];
1864 if (item >= 0) {
1865 class_map[item] = new_class_id;
1866 } else {
1867 q.push_back(item);
1868 }
1869 }
1870 }
1871 return 0;
1872}
1873
1874int CrushWrapper::reclassify(
1875 CephContext *cct,
1876 ostream& out,
1877 const map<string,string>& classify_root,
1878 const map<string,pair<string,string>>& classify_bucket
1879 )
1880{
1881 map<int,string> reclassified_bucket; // orig_id -> class
1882
1883 // classify_root
1884 for (auto& i : classify_root) {
1885 string root = i.first;
1886 if (!name_exists(root)) {
1887 out << "root " << root << " does not exist" << std::endl;
1888 return -EINVAL;
1889 }
1890 int root_id = get_item_id(root);
1891 string new_class = i.second;
11fdf7f2 1892 int new_class_id = get_or_create_class_id(new_class);
f64942e4
AA
1893 out << "classify_root " << root << " (" << root_id
1894 << ") as " << new_class << std::endl;
f64942e4
AA
1895
1896 // validate rules
1897 for (unsigned j = 0; j < crush->max_rules; j++) {
1898 if (crush->rules[j]) {
1899 auto rule = crush->rules[j];
1900 for (unsigned k = 0; k < rule->len; ++k) {
1901 if (rule->steps[k].op == CRUSH_RULE_TAKE) {
1902 int step_item = get_rule_arg1(j, k);
1903 int original_item;
1904 int c;
1905 int res = split_id_class(step_item, &original_item, &c);
1906 if (res < 0)
1907 return res;
1908 if (c >= 0) {
1909 if (original_item == root_id) {
1910 out << " rule " << j << " includes take on root "
1911 << root << " class " << c << std::endl;
1912 return -EINVAL;
1913 }
1914 }
1915 }
1916 }
1917 }
1918 }
1919
1920 // rebuild new buckets for root
1921 //cout << "before class_bucket: " << class_bucket << std::endl;
1922 map<int,int> renumber;
1923 list<int> q;
1924 q.push_back(root_id);
1925 while (!q.empty()) {
1926 int id = q.front();
1927 q.pop_front();
1928 crush_bucket *bucket = get_bucket(id);
1929 if (IS_ERR(bucket)) {
1930 out << "cannot find bucket " << id
1931 << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl;
1932 return PTR_ERR(bucket);
1933 }
1934
1935 // move bucket
1936 int new_id = get_new_bucket_id();
1937 out << " renumbering bucket " << id << " -> " << new_id << std::endl;
1938 renumber[id] = new_id;
1939 crush->buckets[-1-new_id] = bucket;
1940 bucket->id = new_id;
1941 crush->buckets[-1-id] = crush_make_bucket(crush,
1942 bucket->alg,
1943 bucket->hash,
1944 bucket->type,
1945 0, NULL, NULL);
1946 crush->buckets[-1-id]->id = id;
1947 for (auto& i : choose_args) {
1948 i.second.args[-1-new_id] = i.second.args[-1-id];
1949 memset(&i.second.args[-1-id], 0, sizeof(i.second.args[0]));
1950 }
1951 class_bucket.erase(id);
1952 class_bucket[new_id][new_class_id] = id;
1953 name_map[new_id] = string(get_item_name(id));
1954 name_map[id] = string(get_item_name(id)) + "~" + new_class;
1955
1956 for (unsigned j = 0; j < bucket->size; ++j) {
1957 if (bucket->items[j] < 0) {
1958 q.push_front(bucket->items[j]);
1959 } else {
1960 // we don't reclassify the device here; if the users wants that,
1961 // they can pass --set-subtree-class separately.
1962 }
1963 }
1964 }
1965 //cout << "mid class_bucket: " << class_bucket << std::endl;
1966
1967 for (int i = 0; i < crush->max_buckets; ++i) {
1968 crush_bucket *b = crush->buckets[i];
1969 if (!b) {
1970 continue;
1971 }
1972 for (unsigned j = 0; j < b->size; ++j) {
1973 if (renumber.count(b->items[j])) {
1974 b->items[j] = renumber[b->items[j]];
1975 }
1976 }
1977 }
1978
11fdf7f2 1979 int r = rebuild_roots_with_classes(cct);
f64942e4
AA
1980 if (r < 0) {
1981 out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
1982 << std::endl;
1983 return r;
1984 }
1985 //cout << "final class_bucket: " << class_bucket << std::endl;
1986 }
1987
1988 // classify_bucket
1989 map<int,int> send_to; // source bucket -> dest bucket
1990 map<int,map<int,int>> new_class_bucket;
1991 map<int,string> new_bucket_names;
1992 map<int,map<string,string>> new_buckets;
1993 map<string,int> new_bucket_by_name;
1994 for (auto& i : classify_bucket) {
1995 const string& match = i.first; // prefix% or %suffix
1996 const string& new_class = i.second.first;
1997 const string& default_parent = i.second.second;
1998 if (!name_exists(default_parent)) {
1999 out << "default parent " << default_parent << " does not exist"
2000 << std::endl;
2001 return -EINVAL;
2002 }
2003 int default_parent_id = get_item_id(default_parent);
2004 crush_bucket *default_parent_bucket = get_bucket(default_parent_id);
2005 assert(default_parent_bucket);
2006 string default_parent_type_name = get_type_name(default_parent_bucket->type);
2007
2008 out << "classify_bucket " << match << " as " << new_class
2009 << " default bucket " << default_parent
2010 << " (" << default_parent_type_name << ")" << std::endl;
2011
11fdf7f2 2012 int new_class_id = get_or_create_class_id(new_class);
f64942e4
AA
2013 for (int j = 0; j < crush->max_buckets; ++j) {
2014 crush_bucket *b = crush->buckets[j];
2015 if (!b || is_shadow_item(b->id)) {
2016 continue;
2017 }
2018 string name = get_item_name(b->id);
2019 if (name.length() < match.length()) {
2020 continue;
2021 }
2022 string basename;
2023 if (match[0] == '%') {
2024 if (match.substr(1) != name.substr(name.size() - match.size() + 1)) {
2025 continue;
2026 }
2027 basename = name.substr(0, name.size() - match.size() + 1);
2028 } else if (match[match.size() - 1] == '%') {
2029 if (match.substr(0, match.size() - 1) !=
2030 name.substr(0, match.size() - 1)) {
2031 continue;
2032 }
2033 basename = name.substr(match.size() - 1);
2034 } else if (match == name) {
2035 basename = default_parent;
2036 } else {
2037 continue;
2038 }
2039 cout << "match " << match << " to " << name << " basename " << basename
2040 << std::endl;
2041 // look up or create basename bucket
2042 int base_id;
2043 if (name_exists(basename)) {
2044 base_id = get_item_id(basename);
2045 cout << " have base " << base_id << std::endl;
2046 } else if (new_bucket_by_name.count(basename)) {
2047 base_id = new_bucket_by_name[basename];
2048 cout << " already creating base " << base_id << std::endl;
2049 } else {
2050 base_id = get_new_bucket_id();
2051 crush->buckets[-1-base_id] = crush_make_bucket(crush,
2052 b->alg,
2053 b->hash,
2054 b->type,
2055 0, NULL, NULL);
2056 crush->buckets[-1-base_id]->id = base_id;
2057 name_map[base_id] = basename;
2058 new_bucket_by_name[basename] = base_id;
2059 cout << " created base " << base_id << std::endl;
2060
2061 new_buckets[base_id][default_parent_type_name] = default_parent;
2062 }
2063 send_to[b->id] = base_id;
2064 new_class_bucket[base_id][new_class_id] = b->id;
2065 new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id);
2066
2067 // make sure devices are classified
2068 for (unsigned i = 0; i < b->size; ++i) {
2069 int item = b->items[i];
2070 if (item >= 0) {
2071 class_map[item] = new_class_id;
2072 }
2073 }
2074 }
2075 }
2076
2077 // no name_exists() works below,
2078 have_rmaps = false;
2079
2080 // copy items around
2081 //cout << "send_to " << send_to << std::endl;
7c673cae 2082 set<int> roots;
3efd9988 2083 find_roots(&roots);
f64942e4
AA
2084 for (auto& i : send_to) {
2085 crush_bucket *from = get_bucket(i.first);
2086 crush_bucket *to = get_bucket(i.second);
2087 cout << "moving items from " << from->id << " (" << get_item_name(from->id)
2088 << ") to " << to->id << " (" << get_item_name(to->id) << ")"
2089 << std::endl;
2090 for (unsigned j = 0; j < from->size; ++j) {
2091 int item = from->items[j];
2092 int r;
2093 map<string,string> to_loc;
2094 to_loc[get_type_name(to->type)] = get_item_name(to->id);
2095 if (item >= 0) {
2096 if (subtree_contains(to->id, item)) {
2097 continue;
2098 }
2099 map<string,string> from_loc;
2100 from_loc[get_type_name(from->type)] = get_item_name(from->id);
2101 auto w = get_item_weightf_in_loc(item, from_loc);
2102 r = insert_item(cct, item,
2103 w,
2104 get_item_name(item),
2105 to_loc);
2106 } else {
2107 if (!send_to.count(item)) {
2108 lderr(cct) << "item " << item << " in bucket " << from->id
2109 << " is not also a reclassified bucket" << dendl;
2110 return -EINVAL;
2111 }
2112 int newitem = send_to[item];
2113 if (subtree_contains(to->id, newitem)) {
2114 continue;
2115 }
2116 r = link_bucket(cct, newitem, to_loc);
2117 }
2118 if (r != 0) {
2119 cout << __func__ << " err from insert_item: " << cpp_strerror(r)
2120 << std::endl;
2121 return r;
2122 }
2123 }
2124 }
2125
2126 // make sure new buckets have parents
2127 for (auto& i : new_buckets) {
2128 int parent;
2129 if (get_immediate_parent_id(i.first, &parent) < 0) {
2130 cout << "new bucket " << i.first << " missing parent, adding at "
2131 << i.second << std::endl;
2132 int r = link_bucket(cct, i.first, i.second);
2133 if (r != 0) {
2134 cout << __func__ << " err from insert_item: " << cpp_strerror(r)
2135 << std::endl;
2136 return r;
2137 }
2138 }
2139 }
2140
2141 // set class mappings
2142 //cout << "pre class_bucket: " << class_bucket << std::endl;
2143 for (auto& i : new_class_bucket) {
2144 for (auto& j : i.second) {
2145 class_bucket[i.first][j.first] = j.second;
2146 }
2147
2148 }
2149 //cout << "post class_bucket: " << class_bucket << std::endl;
2150 for (auto& i : new_bucket_names) {
2151 name_map[i.first] = i.second;
2152 }
2153
11fdf7f2 2154 int r = rebuild_roots_with_classes(cct);
f64942e4
AA
2155 if (r < 0) {
2156 out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
2157 << std::endl;
2158 return r;
2159 }
2160 //cout << "final class_bucket: " << class_bucket << std::endl;
2161
2162 return 0;
2163}
2164
2165int CrushWrapper::get_new_bucket_id()
2166{
2167 int id = -1;
2168 while (crush->buckets[-1-id] &&
2169 -1-id < crush->max_buckets) {
2170 id--;
2171 }
2172 if (-1-id == crush->max_buckets) {
2173 ++crush->max_buckets;
2174 crush->buckets = (struct crush_bucket**)realloc(
2175 crush->buckets,
2176 sizeof(crush->buckets[0]) * crush->max_buckets);
2177 for (auto& i : choose_args) {
11fdf7f2 2178 assert(i.second.size == (__u32)crush->max_buckets - 1);
f64942e4
AA
2179 ++i.second.size;
2180 i.second.args = (struct crush_choose_arg*)realloc(
2181 i.second.args,
2182 sizeof(i.second.args[0]) * i.second.size);
2183 }
2184 }
2185 return id;
2186}
2187
2188void CrushWrapper::reweight(CephContext *cct)
2189{
2190 set<int> roots;
2191 find_nonshadow_roots(&roots);
2192 for (auto id : roots) {
2193 if (id >= 0)
7c673cae 2194 continue;
f64942e4
AA
2195 crush_bucket *b = get_bucket(id);
2196 ldout(cct, 5) << "reweight root bucket " << id << dendl;
7c673cae 2197 int r = crush_reweight_bucket(crush, b);
11fdf7f2 2198 ceph_assert(r == 0);
f64942e4
AA
2199
2200 for (auto& i : choose_args) {
2201 //cout << "carg " << i.first << std::endl;
2202 vector<uint32_t> w; // discard top-level weights
2203 reweight_bucket(b, i.second, &w);
2204 }
2205 }
11fdf7f2 2206 int r = rebuild_roots_with_classes(cct);
f64942e4
AA
2207 ceph_assert(r == 0);
2208}
2209
2210void CrushWrapper::reweight_bucket(
2211 crush_bucket *b,
2212 crush_choose_arg_map& arg_map,
2213 vector<uint32_t> *weightv)
2214{
2215 int idx = -1 - b->id;
2216 unsigned npos = arg_map.args[idx].weight_set_positions;
2217 //cout << __func__ << " " << b->id << " npos " << npos << std::endl;
2218 weightv->resize(npos);
2219 for (unsigned i = 0; i < b->size; ++i) {
2220 int item = b->items[i];
2221 if (item >= 0) {
2222 for (unsigned pos = 0; pos < npos; ++pos) {
2223 (*weightv)[pos] += arg_map.args[idx].weight_set->weights[i];
2224 }
2225 } else {
2226 vector<uint32_t> subw(npos);
2227 crush_bucket *sub = get_bucket(item);
2228 assert(sub);
2229 reweight_bucket(sub, arg_map, &subw);
2230 for (unsigned pos = 0; pos < npos; ++pos) {
2231 (*weightv)[pos] += subw[pos];
2232 // strash the real bucket weight as the weights for this reference
2233 arg_map.args[idx].weight_set->weights[i] = subw[pos];
2234 }
2235 }
7c673cae 2236 }
f64942e4 2237 //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl;
7c673cae
FG
2238}
2239
31f18b77
FG
2240int CrushWrapper::add_simple_rule_at(
2241 string name, string root_name,
2242 string failure_domain_name,
224ce89b 2243 string device_class,
31f18b77 2244 string mode, int rule_type,
224ce89b
WB
2245 int rno,
2246 ostream *err)
7c673cae
FG
2247{
2248 if (rule_exists(name)) {
2249 if (err)
2250 *err << "rule " << name << " exists";
2251 return -EEXIST;
2252 }
2253 if (rno >= 0) {
2254 if (rule_exists(rno)) {
2255 if (err)
2256 *err << "rule with ruleno " << rno << " exists";
2257 return -EEXIST;
2258 }
7c673cae
FG
2259 } else {
2260 for (rno = 0; rno < get_max_rules(); rno++) {
20effc67 2261 if (!rule_exists(rno))
7c673cae
FG
2262 break;
2263 }
2264 }
2265 if (!name_exists(root_name)) {
2266 if (err)
2267 *err << "root item " << root_name << " does not exist";
2268 return -ENOENT;
2269 }
2270 int root = get_item_id(root_name);
2271 int type = 0;
2272 if (failure_domain_name.length()) {
2273 type = get_type_id(failure_domain_name);
2274 if (type < 0) {
2275 if (err)
2276 *err << "unknown type " << failure_domain_name;
2277 return -EINVAL;
2278 }
2279 }
224ce89b
WB
2280 if (device_class.size()) {
2281 if (!class_exists(device_class)) {
2282 if (err)
2283 *err << "device class " << device_class << " does not exist";
2284 return -EINVAL;
2285 }
2286 int c = get_class_id(device_class);
2287 if (class_bucket.count(root) == 0 ||
2288 class_bucket[root].count(c) == 0) {
2289 if (err)
2290 *err << "root " << root_name << " has no devices with class "
2291 << device_class;
2292 return -EINVAL;
2293 }
2294 root = class_bucket[root][c];
2295 }
7c673cae
FG
2296 if (mode != "firstn" && mode != "indep") {
2297 if (err)
2298 *err << "unknown mode " << mode;
2299 return -EINVAL;
2300 }
2301
2302 int steps = 3;
2303 if (mode == "indep")
2304 steps = 5;
20effc67 2305 crush_rule *rule = crush_make_rule(steps, rule_type);
11fdf7f2 2306 ceph_assert(rule);
7c673cae
FG
2307 int step = 0;
2308 if (mode == "indep") {
2309 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
2310 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
2311 }
2312 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
2313 if (type)
2314 crush_rule_set_step(rule, step++,
2315 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
2316 CRUSH_RULE_CHOOSELEAF_INDEP,
2317 CRUSH_CHOOSE_N,
2318 type);
2319 else
2320 crush_rule_set_step(rule, step++,
2321 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
2322 CRUSH_RULE_CHOOSE_INDEP,
2323 CRUSH_CHOOSE_N,
2324 0);
2325 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
2326
2327 int ret = crush_add_rule(crush, rule, rno);
2328 if(ret < 0) {
2329 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
2330 return ret;
2331 }
2332 set_rule_name(rno, name);
2333 have_rmaps = false;
2334 return rno;
2335}
2336
31f18b77
FG
2337int CrushWrapper::add_simple_rule(
2338 string name, string root_name,
2339 string failure_domain_name,
224ce89b 2340 string device_class,
31f18b77
FG
2341 string mode, int rule_type,
2342 ostream *err)
7c673cae 2343{
224ce89b
WB
2344 return add_simple_rule_at(name, root_name, failure_domain_name, device_class,
2345 mode,
31f18b77 2346 rule_type, -1, err);
7c673cae
FG
2347}
2348
3efd9988
FG
2349float CrushWrapper::_get_take_weight_osd_map(int root,
2350 map<int,float> *pmap) const
2351{
2352 float sum = 0.0;
2353 list<int> q;
2354 q.push_back(root);
2355 //breadth first iterate the OSD tree
2356 while (!q.empty()) {
2357 int bno = q.front();
2358 q.pop_front();
2359 crush_bucket *b = crush->buckets[-1-bno];
11fdf7f2 2360 ceph_assert(b);
3efd9988
FG
2361 for (unsigned j=0; j<b->size; ++j) {
2362 int item_id = b->items[j];
2363 if (item_id >= 0) { //it's an OSD
2364 float w = crush_get_bucket_item_weight(b, j);
2365 (*pmap)[item_id] = w;
2366 sum += w;
2367 } else { //not an OSD, expand the child later
2368 q.push_back(item_id);
2369 }
2370 }
2371 }
2372 return sum;
2373}
2374
2375void CrushWrapper::_normalize_weight_map(float sum,
2376 const map<int,float>& m,
2377 map<int,float> *pmap) const
2378{
2379 for (auto& p : m) {
2380 map<int,float>::iterator q = pmap->find(p.first);
2381 if (q == pmap->end()) {
2382 (*pmap)[p.first] = p.second / sum;
2383 } else {
2384 q->second += p.second / sum;
2385 }
2386 }
2387}
2388
2389int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
2390{
2391 map<int,float> m;
2392 float sum = _get_take_weight_osd_map(root, &m);
2393 _normalize_weight_map(sum, m, pmap);
2394 return 0;
2395}
2396
2397int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
2398 map<int,float> *pmap) const
7c673cae
FG
2399{
2400 if (ruleno >= crush->max_rules)
2401 return -ENOENT;
2402 if (crush->rules[ruleno] == NULL)
2403 return -ENOENT;
2404 crush_rule *rule = crush->rules[ruleno];
2405
2406 // build a weight map for each TAKE in the rule, and then merge them
31f18b77
FG
2407
2408 // FIXME: if there are multiple takes that place a different number of
2409 // objects we do not take that into account. (Also, note that doing this
2410 // right is also a function of the pool, since the crush rule
2411 // might choose 2 + choose 2 but pool size may only be 3.)
7c673cae
FG
2412 for (unsigned i=0; i<rule->len; ++i) {
2413 map<int,float> m;
2414 float sum = 0;
2415 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
2416 int n = rule->steps[i].arg1;
2417 if (n >= 0) {
2418 m[n] = 1.0;
2419 sum = 1.0;
2420 } else {
3efd9988 2421 sum += _get_take_weight_osd_map(n, &m);
7c673cae
FG
2422 }
2423 }
3efd9988 2424 _normalize_weight_map(sum, m, pmap);
7c673cae
FG
2425 }
2426
2427 return 0;
2428}
2429
2430int CrushWrapper::remove_rule(int ruleno)
2431{
2432 if (ruleno >= (int)crush->max_rules)
2433 return -ENOENT;
2434 if (crush->rules[ruleno] == NULL)
2435 return -ENOENT;
2436 crush_destroy_rule(crush->rules[ruleno]);
2437 crush->rules[ruleno] = NULL;
2438 rule_name_map.erase(ruleno);
2439 have_rmaps = false;
11fdf7f2 2440 return rebuild_roots_with_classes(nullptr);
7c673cae
FG
2441}
2442
11fdf7f2
TL
2443int CrushWrapper::bucket_adjust_item_weight(
2444 CephContext *cct, crush_bucket *bucket, int item, int weight,
2445 bool adjust_weight_sets)
31f18b77 2446{
11fdf7f2 2447 if (adjust_weight_sets) {
31f18b77
FG
2448 unsigned position;
2449 for (position = 0; position < bucket->size; position++)
2450 if (bucket->items[position] == item)
2451 break;
11fdf7f2 2452 ceph_assert(position != bucket->size);
b5b8bbf5
FG
2453 for (auto &w : choose_args) {
2454 crush_choose_arg_map &arg_map = w.second;
31f18b77 2455 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
28e407b8 2456 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
31f18b77
FG
2457 crush_weight_set *weight_set = &arg->weight_set[j];
2458 weight_set->weights[position] = weight;
2459 }
2460 }
2461 }
2462 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
2463}
2464
c07f9fc5
FG
2465int CrushWrapper::add_bucket(
2466 int bucketno, int alg, int hash, int type, int size,
2467 int *items, int *weights, int *idout)
2468{
2469 if (alg == 0) {
2470 alg = get_default_bucket_alg();
2471 if (alg == 0)
2472 return -EINVAL;
2473 }
2474 crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
2475 weights);
11fdf7f2
TL
2476 ceph_assert(b);
2477 ceph_assert(idout);
c07f9fc5 2478 int r = crush_add_bucket(crush, bucketno, b, idout);
b5b8bbf5 2479 int pos = -1 - *idout;
c07f9fc5
FG
2480 for (auto& p : choose_args) {
2481 crush_choose_arg_map& cmap = p.second;
11fdf7f2 2482 unsigned new_size = crush->max_buckets;
c07f9fc5 2483 if (cmap.args) {
11fdf7f2
TL
2484 if ((int)cmap.size < crush->max_buckets) {
2485 cmap.args = static_cast<crush_choose_arg*>(realloc(
c07f9fc5 2486 cmap.args,
11fdf7f2
TL
2487 sizeof(crush_choose_arg) * new_size));
2488 ceph_assert(cmap.args);
c07f9fc5 2489 memset(&cmap.args[cmap.size], 0,
11fdf7f2
TL
2490 sizeof(crush_choose_arg) * (new_size - cmap.size));
2491 cmap.size = new_size;
c07f9fc5
FG
2492 }
2493 } else {
11fdf7f2
TL
2494 cmap.args = static_cast<crush_choose_arg*>(calloc(sizeof(crush_choose_arg),
2495 new_size));
2496 ceph_assert(cmap.args);
2497 cmap.size = new_size;
c07f9fc5
FG
2498 }
2499 if (size > 0) {
2500 int positions = get_choose_args_positions(cmap);
b5b8bbf5 2501 crush_choose_arg& carg = cmap.args[pos];
11fdf7f2
TL
2502 carg.weight_set = static_cast<crush_weight_set*>(calloc(sizeof(crush_weight_set),
2503 size));
28e407b8 2504 carg.weight_set_positions = positions;
c07f9fc5
FG
2505 for (int ppos = 0; ppos < positions; ++ppos) {
2506 carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
2507 carg.weight_set[ppos].size = size;
2508 for (int bpos = 0; bpos < size; ++bpos) {
2509 carg.weight_set[ppos].weights[bpos] = weights[bpos];
2510 }
2511 }
2512 }
11fdf7f2 2513 assert(crush->max_buckets == (int)cmap.size);
c07f9fc5
FG
2514 }
2515 return r;
2516}
2517
31f18b77
FG
2518int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
2519{
2520 __u32 new_size = bucket->size + 1;
35e4c445
FG
2521 int r = crush_bucket_add_item(crush, bucket, item, weight);
2522 if (r < 0) {
2523 return r;
2524 }
b5b8bbf5
FG
2525 for (auto &w : choose_args) {
2526 crush_choose_arg_map &arg_map = w.second;
31f18b77 2527 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
28e407b8 2528 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
31f18b77 2529 crush_weight_set *weight_set = &arg->weight_set[j];
c07f9fc5
FG
2530 weight_set->weights = (__u32*)realloc(weight_set->weights,
2531 new_size * sizeof(__u32));
11fdf7f2 2532 ceph_assert(weight_set->size + 1 == new_size);
31f18b77
FG
2533 weight_set->weights[weight_set->size] = weight;
2534 weight_set->size = new_size;
2535 }
2536 if (arg->ids_size) {
224ce89b 2537 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
11fdf7f2 2538 ceph_assert(arg->ids_size + 1 == new_size);
31f18b77
FG
2539 arg->ids[arg->ids_size] = item;
2540 arg->ids_size = new_size;
2541 }
2542 }
35e4c445 2543 return 0;
31f18b77
FG
2544}
2545
2546int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
2547{
2548 __u32 new_size = bucket->size - 1;
2549 unsigned position;
2550 for (position = 0; position < bucket->size; position++)
2551 if (bucket->items[position] == item)
2552 break;
11fdf7f2 2553 ceph_assert(position != bucket->size);
35e4c445
FG
2554 int r = crush_bucket_remove_item(crush, bucket, item);
2555 if (r < 0) {
2556 return r;
2557 }
b5b8bbf5
FG
2558 for (auto &w : choose_args) {
2559 crush_choose_arg_map &arg_map = w.second;
31f18b77 2560 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
28e407b8 2561 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
31f18b77 2562 crush_weight_set *weight_set = &arg->weight_set[j];
11fdf7f2 2563 ceph_assert(weight_set->size - 1 == new_size);
31f18b77
FG
2564 for (__u32 k = position; k < new_size; k++)
2565 weight_set->weights[k] = weight_set->weights[k+1];
35e4c445
FG
2566 if (new_size) {
2567 weight_set->weights = (__u32*)realloc(weight_set->weights,
2568 new_size * sizeof(__u32));
2569 } else {
11fdf7f2 2570 free(weight_set->weights);
35e4c445
FG
2571 weight_set->weights = NULL;
2572 }
31f18b77
FG
2573 weight_set->size = new_size;
2574 }
2575 if (arg->ids_size) {
11fdf7f2 2576 ceph_assert(arg->ids_size - 1 == new_size);
31f18b77
FG
2577 for (__u32 k = position; k < new_size; k++)
2578 arg->ids[k] = arg->ids[k+1];
35e4c445
FG
2579 if (new_size) {
2580 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
2581 } else {
11fdf7f2 2582 free(arg->ids);
35e4c445
FG
2583 arg->ids = NULL;
2584 }
31f18b77
FG
2585 arg->ids_size = new_size;
2586 }
2587 }
35e4c445 2588 return 0;
31f18b77
FG
2589}
2590
3efd9988
FG
2591int CrushWrapper::bucket_set_alg(int bid, int alg)
2592{
2593 crush_bucket *b = get_bucket(bid);
2594 if (!b) {
2595 return -ENOENT;
2596 }
2597 b->alg = alg;
2598 return 0;
2599}
2600
224ce89b
WB
2601int CrushWrapper::update_device_class(int id,
2602 const string& class_name,
2603 const string& name,
2604 ostream *ss)
7c673cae 2605{
11fdf7f2 2606 ceph_assert(item_exists(id));
c07f9fc5
FG
2607 auto old_class_name = get_item_class(id);
2608 if (old_class_name && old_class_name != class_name) {
2609 *ss << "osd." << id << " has already bound to class '" << old_class_name
2610 << "', can not reset class to '" << class_name << "'; "
11fdf7f2 2611 << "use 'ceph osd crush rm-device-class <id>' to "
c07f9fc5
FG
2612 << "remove old class first";
2613 return -EBUSY;
2614 }
2615
224ce89b 2616 int class_id = get_or_create_class_id(class_name);
7c673cae 2617 if (id < 0) {
224ce89b 2618 *ss << name << " id " << id << " is negative";
7c673cae
FG
2619 return -EINVAL;
2620 }
7c673cae
FG
2621
2622 if (class_map.count(id) != 0 && class_map[id] == class_id) {
11fdf7f2 2623 *ss << name << " already set to class " << class_name << ". ";
7c673cae
FG
2624 return 0;
2625 }
2626
2627 set_item_class(id, class_id);
2628
11fdf7f2 2629 int r = rebuild_roots_with_classes(nullptr);
7c673cae
FG
2630 if (r < 0)
2631 return r;
2632 return 1;
2633}
2634
c07f9fc5
FG
2635int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
2636{
11fdf7f2 2637 ceph_assert(ss);
c07f9fc5
FG
2638 const char *name = get_item_name(id);
2639 if (!name) {
2640 *ss << "osd." << id << " does not have a name";
2641 return -ENOENT;
2642 }
2643
2644 const char *class_name = get_item_class(id);
2645 if (!class_name) {
2646 *ss << "osd." << id << " has not been bound to a specific class yet";
2647 return 0;
2648 }
2649 class_remove_item(id);
2650
11fdf7f2 2651 int r = rebuild_roots_with_classes(cct);
c07f9fc5
FG
2652 if (r < 0) {
2653 *ss << "unable to rebuild roots with class '" << class_name << "' "
2654 << "of osd." << id << ": " << cpp_strerror(r);
2655 return r;
2656 }
2657 return 0;
2658}
2659
d2e6a577
FG
2660int CrushWrapper::device_class_clone(
2661 int original_id, int device_class,
2662 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
2663 const std::set<int32_t>& used_ids,
35e4c445
FG
2664 int *clone,
2665 map<int,map<int,vector<int>>> *cmap_item_weight)
7c673cae
FG
2666{
2667 const char *item_name = get_item_name(original_id);
2668 if (item_name == NULL)
2669 return -ECHILD;
2670 const char *class_name = get_class_name(device_class);
2671 if (class_name == NULL)
2672 return -EBADF;
2673 string copy_name = item_name + string("~") + class_name;
2674 if (name_exists(copy_name)) {
2675 *clone = get_item_id(copy_name);
2676 return 0;
2677 }
35e4c445 2678
7c673cae 2679 crush_bucket *original = get_bucket(original_id);
11fdf7f2 2680 ceph_assert(!IS_ERR(original));
7c673cae
FG
2681 crush_bucket *copy = crush_make_bucket(crush,
2682 original->alg,
2683 original->hash,
2684 original->type,
2685 0, NULL, NULL);
11fdf7f2 2686 ceph_assert(copy);
35e4c445
FG
2687
2688 vector<unsigned> item_orig_pos; // new item pos -> orig item pos
7c673cae
FG
2689 for (unsigned i = 0; i < original->size; i++) {
2690 int item = original->items[i];
2691 int weight = crush_get_bucket_item_weight(original, i);
2692 if (item >= 0) {
2693 if (class_map.count(item) != 0 && class_map[item] == device_class) {
35e4c445 2694 int res = crush_bucket_add_item(crush, copy, item, weight);
7c673cae
FG
2695 if (res)
2696 return res;
35e4c445
FG
2697 } else {
2698 continue;
7c673cae
FG
2699 }
2700 } else {
2701 int child_copy_id;
d2e6a577 2702 int res = device_class_clone(item, device_class, old_class_bucket,
35e4c445
FG
2703 used_ids, &child_copy_id,
2704 cmap_item_weight);
7c673cae
FG
2705 if (res < 0)
2706 return res;
2707 crush_bucket *child_copy = get_bucket(child_copy_id);
11fdf7f2 2708 ceph_assert(!IS_ERR(child_copy));
35e4c445
FG
2709 res = crush_bucket_add_item(crush, copy, child_copy_id,
2710 child_copy->weight);
7c673cae
FG
2711 if (res)
2712 return res;
2713 }
35e4c445 2714 item_orig_pos.push_back(i);
7c673cae 2715 }
11fdf7f2 2716 ceph_assert(item_orig_pos.size() == copy->size);
35e4c445 2717
d2e6a577
FG
2718 int bno = 0;
2719 if (old_class_bucket.count(original_id) &&
2720 old_class_bucket.at(original_id).count(device_class)) {
2721 bno = old_class_bucket.at(original_id).at(device_class);
2722 } else {
2723 // pick a new shadow bucket id that is not used by the current map
2724 // *or* any previous shadow buckets.
2725 bno = -1;
2726 while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
2727 used_ids.count(bno)) {
2728 --bno;
2729 }
2730 }
2731 int res = crush_add_bucket(crush, bno, copy, clone);
7c673cae
FG
2732 if (res)
2733 return res;
11fdf7f2 2734 ceph_assert(!bno || bno == *clone);
35e4c445 2735
7c673cae
FG
2736 res = set_item_class(*clone, device_class);
2737 if (res < 0)
2738 return res;
35e4c445 2739
7c673cae
FG
2740 // we do not use set_item_name because the name is intentionally invalid
2741 name_map[*clone] = copy_name;
2742 if (have_rmaps)
2743 name_rmap[copy_name] = *clone;
2744 class_bucket[original_id][device_class] = *clone;
35e4c445
FG
2745
2746 // set up choose_args for the new bucket.
2747 for (auto& w : choose_args) {
2748 crush_choose_arg_map& cmap = w.second;
11fdf7f2
TL
2749 if (crush->max_buckets > (int)cmap.size) {
2750 unsigned new_size = crush->max_buckets;
2751 cmap.args = static_cast<crush_choose_arg*>(realloc(cmap.args,
2752 new_size * sizeof(cmap.args[0])));
2753 ceph_assert(cmap.args);
35e4c445
FG
2754 memset(cmap.args + cmap.size, 0,
2755 (new_size - cmap.size) * sizeof(cmap.args[0]));
b5b8bbf5 2756 cmap.size = new_size;
35e4c445
FG
2757 }
2758 auto& o = cmap.args[-1-original_id];
2759 auto& n = cmap.args[-1-bno];
2760 n.ids_size = 0; // FIXME: implement me someday
28e407b8 2761 n.weight_set_positions = o.weight_set_positions;
11fdf7f2
TL
2762 n.weight_set = static_cast<crush_weight_set*>(calloc(
2763 n.weight_set_positions, sizeof(crush_weight_set)));
28e407b8 2764 for (size_t s = 0; s < n.weight_set_positions; ++s) {
35e4c445
FG
2765 n.weight_set[s].size = copy->size;
2766 n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
2767 }
28e407b8
AA
2768 for (size_t s = 0; s < n.weight_set_positions; ++s) {
2769 vector<int> bucket_weights(n.weight_set_positions);
35e4c445
FG
2770 for (size_t i = 0; i < copy->size; ++i) {
2771 int item = copy->items[i];
2772 if (item >= 0) {
2773 n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
28e407b8 2774 } else if ((*cmap_item_weight)[w.first].count(item)) {
35e4c445 2775 n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
28e407b8
AA
2776 } else {
2777 n.weight_set[s].weights[i] = 0;
35e4c445
FG
2778 }
2779 bucket_weights[s] += n.weight_set[s].weights[i];
2780 }
2781 (*cmap_item_weight)[w.first][bno] = bucket_weights;
2782 }
2783 }
7c673cae
FG
2784 return 0;
2785}
2786
b5b8bbf5
FG
2787int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
2788{
11fdf7f2 2789 ceph_assert(rules);
b5b8bbf5
FG
2790 rules->clear();
2791 if (!class_exists(class_name)) {
2792 return -ENOENT;
2793 }
2794 int class_id = get_class_id(class_name);
2795 for (unsigned i = 0; i < crush->max_rules; ++i) {
2796 crush_rule *r = crush->rules[i];
2797 if (!r)
2798 continue;
2799 for (unsigned j = 0; j < r->len; ++j) {
2800 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2801 int step_item = r->steps[j].arg1;
2802 int original_item;
2803 int c;
2804 int res = split_id_class(step_item, &original_item, &c);
2805 if (res < 0) {
2806 return res;
2807 }
2808 if (c != -1 && c == class_id) {
2809 rules->insert(i);
2810 break;
2811 }
2812 }
2813 }
2814 }
2815 return 0;
2816}
2817
3efd9988
FG
2818// return rules that might reference the given osd
2819int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
2820{
11fdf7f2 2821 ceph_assert(rules);
3efd9988
FG
2822 rules->clear();
2823 if (osd < 0) {
2824 return -EINVAL;
2825 }
2826 for (unsigned i = 0; i < crush->max_rules; ++i) {
2827 crush_rule *r = crush->rules[i];
2828 if (!r)
2829 continue;
2830 for (unsigned j = 0; j < r->len; ++j) {
2831 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2832 int step_item = r->steps[j].arg1;
2833 list<int> unordered;
2834 int rc = _get_leaves(step_item, &unordered);
2835 if (rc < 0) {
2836 return rc; // propagate fatal errors!
2837 }
2838 bool match = false;
2839 for (auto &o: unordered) {
11fdf7f2 2840 ceph_assert(o >= 0);
3efd9988
FG
2841 if (o == osd) {
2842 match = true;
2843 break;
2844 }
2845 }
2846 if (match) {
2847 rules->insert(i);
2848 break;
2849 }
2850 }
2851 }
2852 }
2853 return 0;
2854}
2855
d2e6a577
FG
2856bool CrushWrapper::_class_is_dead(int class_id)
2857{
2858 for (auto &p: class_map) {
2859 if (p.first >= 0 && p.second == class_id) {
2860 return false;
2861 }
2862 }
2863 for (unsigned i = 0; i < crush->max_rules; ++i) {
2864 crush_rule *r = crush->rules[i];
2865 if (!r)
2866 continue;
2867 for (unsigned j = 0; j < r->len; ++j) {
2868 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2869 int root = r->steps[j].arg1;
2870 for (auto &p : class_bucket) {
2871 auto& q = p.second;
2872 if (q.count(class_id) && q[class_id] == root) {
2873 return false;
2874 }
2875 }
2876 }
2877 }
2878 }
2879 // no more referenced by any devices or crush rules
2880 return true;
2881}
2882
2883void CrushWrapper::cleanup_dead_classes()
2884{
35e4c445
FG
2885 auto p = class_name.begin();
2886 while (p != class_name.end()) {
2887 if (_class_is_dead(p->first)) {
2888 string n = p->second;
2889 ++p;
2890 remove_class_name(n);
2891 } else {
2892 ++p;
2893 }
d2e6a577
FG
2894 }
2895}
2896
11fdf7f2 2897int CrushWrapper::rebuild_roots_with_classes(CephContext *cct)
7c673cae 2898{
d2e6a577
FG
2899 std::map<int32_t, map<int32_t, int32_t> > old_class_bucket = class_bucket;
2900 cleanup_dead_classes();
11fdf7f2 2901 int r = trim_roots_with_class(cct);
7c673cae
FG
2902 if (r < 0)
2903 return r;
d2e6a577
FG
2904 class_bucket.clear();
2905 return populate_classes(old_class_bucket);
7c673cae
FG
2906}
2907
2908void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
2909{
11fdf7f2
TL
2910 using ceph::encode;
2911 ceph_assert(crush);
7c673cae
FG
2912
2913 __u32 magic = CRUSH_MAGIC;
11fdf7f2 2914 encode(magic, bl);
7c673cae 2915
11fdf7f2
TL
2916 encode(crush->max_buckets, bl);
2917 encode(crush->max_rules, bl);
2918 encode(crush->max_devices, bl);
7c673cae 2919
31f18b77
FG
2920 bool encode_compat_choose_args = false;
2921 crush_choose_arg_map arg_map;
2922 memset(&arg_map, '\0', sizeof(arg_map));
2923 if (has_choose_args() &&
2924 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
11fdf7f2 2925 ceph_assert(!has_incompat_choose_args());
31f18b77
FG
2926 encode_compat_choose_args = true;
2927 arg_map = choose_args.begin()->second;
2928 }
2929
7c673cae
FG
2930 // buckets
2931 for (int i=0; i<crush->max_buckets; i++) {
2932 __u32 alg = 0;
2933 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
11fdf7f2 2934 encode(alg, bl);
7c673cae
FG
2935 if (!alg)
2936 continue;
2937
11fdf7f2
TL
2938 encode(crush->buckets[i]->id, bl);
2939 encode(crush->buckets[i]->type, bl);
2940 encode(crush->buckets[i]->alg, bl);
2941 encode(crush->buckets[i]->hash, bl);
2942 encode(crush->buckets[i]->weight, bl);
2943 encode(crush->buckets[i]->size, bl);
7c673cae 2944 for (unsigned j=0; j<crush->buckets[i]->size; j++)
11fdf7f2 2945 encode(crush->buckets[i]->items[j], bl);
7c673cae
FG
2946
2947 switch (crush->buckets[i]->alg) {
2948 case CRUSH_BUCKET_UNIFORM:
11fdf7f2 2949 encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
7c673cae
FG
2950 break;
2951
2952 case CRUSH_BUCKET_LIST:
2953 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
11fdf7f2
TL
2954 encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
2955 encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
7c673cae
FG
2956 }
2957 break;
2958
2959 case CRUSH_BUCKET_TREE:
11fdf7f2 2960 encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
7c673cae 2961 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
11fdf7f2 2962 encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
7c673cae
FG
2963 break;
2964
2965 case CRUSH_BUCKET_STRAW:
2966 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
11fdf7f2
TL
2967 encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
2968 encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
7c673cae
FG
2969 }
2970 break;
2971
2972 case CRUSH_BUCKET_STRAW2:
31f18b77
FG
2973 {
2974 __u32 *weights;
2975 if (encode_compat_choose_args &&
28e407b8 2976 arg_map.args[i].weight_set_positions > 0) {
31f18b77
FG
2977 weights = arg_map.args[i].weight_set[0].weights;
2978 } else {
2979 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
2980 }
2981 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
11fdf7f2 2982 encode(weights[j], bl);
31f18b77 2983 }
7c673cae
FG
2984 }
2985 break;
2986
2987 default:
2988 ceph_abort();
2989 break;
2990 }
2991 }
2992
2993 // rules
2994 for (unsigned i=0; i<crush->max_rules; i++) {
2995 __u32 yes = crush->rules[i] ? 1:0;
11fdf7f2 2996 encode(yes, bl);
7c673cae
FG
2997 if (!yes)
2998 continue;
2999
11fdf7f2 3000 encode(crush->rules[i]->len, bl);
20effc67
TL
3001
3002 /*
3003 * legacy crush_rule_mask was
3004 *
3005 * struct crush_rule_mask {
3006 * __u8 ruleset;
3007 * __u8 type;
3008 * __u8 min_size;
3009 * __u8 max_size;
3010 * };
3011 *
3012 * encode ruleset=ruleid, and min/max of 1/100
3013 */
3014 encode((__u8)i, bl); // ruleset == ruleid
3015 encode(crush->rules[i]->type, bl);
3016 if (HAVE_FEATURE(features, SERVER_QUINCY)) {
3017 encode((__u8)1, bl); // min_size = 1
3018 encode((__u8)100, bl); // max_size = 100
3019 } else {
3020 encode(crush->rules[i]->deprecated_min_size, bl);
3021 encode(crush->rules[i]->deprecated_max_size, bl);
3022 }
3023
7c673cae 3024 for (unsigned j=0; j<crush->rules[i]->len; j++)
11fdf7f2 3025 encode(crush->rules[i]->steps[j], bl);
7c673cae
FG
3026 }
3027
3028 // name info
11fdf7f2
TL
3029 encode(type_map, bl);
3030 encode(name_map, bl);
3031 encode(rule_name_map, bl);
7c673cae
FG
3032
3033 // tunables
11fdf7f2
TL
3034 encode(crush->choose_local_tries, bl);
3035 encode(crush->choose_local_fallback_tries, bl);
3036 encode(crush->choose_total_tries, bl);
3037 encode(crush->chooseleaf_descend_once, bl);
3038 encode(crush->chooseleaf_vary_r, bl);
3039 encode(crush->straw_calc_version, bl);
3040 encode(crush->allowed_bucket_algs, bl);
7c673cae 3041 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
11fdf7f2 3042 encode(crush->chooseleaf_stable, bl);
7c673cae
FG
3043 }
3044
3045 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3046 // device classes
11fdf7f2
TL
3047 encode(class_map, bl);
3048 encode(class_name, bl);
3049 encode(class_bucket, bl);
7c673cae 3050
c07f9fc5 3051 // choose args
224ce89b 3052 __u32 size = (__u32)choose_args.size();
11fdf7f2 3053 encode(size, bl);
7c673cae 3054 for (auto c : choose_args) {
11fdf7f2 3055 encode(c.first, bl);
7c673cae 3056 crush_choose_arg_map arg_map = c.second;
224ce89b 3057 size = 0;
7c673cae
FG
3058 for (__u32 i = 0; i < arg_map.size; i++) {
3059 crush_choose_arg *arg = &arg_map.args[i];
28e407b8 3060 if (arg->weight_set_positions == 0 &&
7c673cae
FG
3061 arg->ids_size == 0)
3062 continue;
3063 size++;
3064 }
11fdf7f2 3065 encode(size, bl);
7c673cae
FG
3066 for (__u32 i = 0; i < arg_map.size; i++) {
3067 crush_choose_arg *arg = &arg_map.args[i];
28e407b8 3068 if (arg->weight_set_positions == 0 &&
7c673cae
FG
3069 arg->ids_size == 0)
3070 continue;
11fdf7f2
TL
3071 encode(i, bl);
3072 encode(arg->weight_set_positions, bl);
28e407b8 3073 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
7c673cae 3074 crush_weight_set *weight_set = &arg->weight_set[j];
11fdf7f2 3075 encode(weight_set->size, bl);
7c673cae 3076 for (__u32 k = 0; k < weight_set->size; k++)
11fdf7f2 3077 encode(weight_set->weights[k], bl);
7c673cae 3078 }
11fdf7f2 3079 encode(arg->ids_size, bl);
7c673cae 3080 for (__u32 j = 0; j < arg->ids_size; j++)
11fdf7f2 3081 encode(arg->ids[j], bl);
7c673cae
FG
3082 }
3083 }
3084 }
3085}
3086
11fdf7f2 3087static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::const_iterator& blp)
7c673cae
FG
3088{
3089 m.clear();
3090 __u32 n;
11fdf7f2 3091 decode(n, blp);
7c673cae
FG
3092 while (n--) {
3093 __s32 key;
11fdf7f2 3094 decode(key, blp);
7c673cae
FG
3095
3096 __u32 strlen;
11fdf7f2 3097 decode(strlen, blp);
7c673cae
FG
3098 if (strlen == 0) {
3099 // der, key was actually 64-bits!
11fdf7f2 3100 decode(strlen, blp);
7c673cae 3101 }
11fdf7f2 3102 decode_nohead(strlen, m[key], blp);
7c673cae
FG
3103 }
3104}
3105
11fdf7f2 3106void CrushWrapper::decode(bufferlist::const_iterator& blp)
7c673cae 3107{
11fdf7f2 3108 using ceph::decode;
7c673cae
FG
3109 create();
3110
3111 __u32 magic;
11fdf7f2 3112 decode(magic, blp);
7c673cae 3113 if (magic != CRUSH_MAGIC)
9f95a23c 3114 throw ceph::buffer::malformed_input("bad magic number");
7c673cae 3115
11fdf7f2
TL
3116 decode(crush->max_buckets, blp);
3117 decode(crush->max_rules, blp);
3118 decode(crush->max_devices, blp);
7c673cae
FG
3119
3120 // legacy tunables, unless we decode something newer
3121 set_tunables_legacy();
3122
3123 try {
3124 // buckets
3125 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
3126 for (int i=0; i<crush->max_buckets; i++) {
3127 decode_crush_bucket(&crush->buckets[i], blp);
3128 }
3129
3130 // rules
3131 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
3132 for (unsigned i = 0; i < crush->max_rules; ++i) {
3133 __u32 yes;
11fdf7f2 3134 decode(yes, blp);
7c673cae
FG
3135 if (!yes) {
3136 crush->rules[i] = NULL;
3137 continue;
3138 }
3139
3140 __u32 len;
11fdf7f2 3141 decode(len, blp);
7c673cae
FG
3142 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
3143 crush->rules[i]->len = len;
20effc67
TL
3144
3145 __u8 ruleset; // ignore + discard
3146 decode(ruleset, blp);
3147 if (ruleset != i) {
3148 throw ::ceph::buffer::malformed_input("crush ruleset_id != rule_id; encoding is too old");
3149 }
3150 decode(crush->rules[i]->type, blp);
3151 decode(crush->rules[i]->deprecated_min_size, blp);
3152 decode(crush->rules[i]->deprecated_max_size, blp);
3153
7c673cae 3154 for (unsigned j=0; j<crush->rules[i]->len; j++)
11fdf7f2 3155 decode(crush->rules[i]->steps[j], blp);
7c673cae
FG
3156 }
3157
3158 // name info
3159 // NOTE: we had a bug where we were incoding int instead of int32, which means the
3160 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
3161 // both by assuming the string is always non-empty.
3162 decode_32_or_64_string_map(type_map, blp);
3163 decode_32_or_64_string_map(name_map, blp);
3164 decode_32_or_64_string_map(rule_name_map, blp);
3165
3166 // tunables
3167 if (!blp.end()) {
11fdf7f2
TL
3168 decode(crush->choose_local_tries, blp);
3169 decode(crush->choose_local_fallback_tries, blp);
3170 decode(crush->choose_total_tries, blp);
7c673cae
FG
3171 }
3172 if (!blp.end()) {
11fdf7f2 3173 decode(crush->chooseleaf_descend_once, blp);
7c673cae
FG
3174 }
3175 if (!blp.end()) {
11fdf7f2 3176 decode(crush->chooseleaf_vary_r, blp);
7c673cae
FG
3177 }
3178 if (!blp.end()) {
11fdf7f2 3179 decode(crush->straw_calc_version, blp);
7c673cae
FG
3180 }
3181 if (!blp.end()) {
11fdf7f2 3182 decode(crush->allowed_bucket_algs, blp);
7c673cae
FG
3183 }
3184 if (!blp.end()) {
11fdf7f2 3185 decode(crush->chooseleaf_stable, blp);
7c673cae
FG
3186 }
3187 if (!blp.end()) {
11fdf7f2
TL
3188 decode(class_map, blp);
3189 decode(class_name, blp);
7c673cae
FG
3190 for (auto &c : class_name)
3191 class_rname[c.second] = c.first;
11fdf7f2 3192 decode(class_bucket, blp);
7c673cae
FG
3193 }
3194 if (!blp.end()) {
224ce89b 3195 __u32 choose_args_size;
11fdf7f2 3196 decode(choose_args_size, blp);
224ce89b 3197 for (__u32 i = 0; i < choose_args_size; i++) {
b5b8bbf5 3198 typename decltype(choose_args)::key_type choose_args_index;
11fdf7f2 3199 decode(choose_args_index, blp);
7c673cae
FG
3200 crush_choose_arg_map arg_map;
3201 arg_map.size = crush->max_buckets;
11fdf7f2
TL
3202 arg_map.args = static_cast<crush_choose_arg*>(calloc(
3203 arg_map.size, sizeof(crush_choose_arg)));
7c673cae 3204 __u32 size;
11fdf7f2 3205 decode(size, blp);
7c673cae
FG
3206 for (__u32 j = 0; j < size; j++) {
3207 __u32 bucket_index;
11fdf7f2
TL
3208 decode(bucket_index, blp);
3209 ceph_assert(bucket_index < arg_map.size);
7c673cae 3210 crush_choose_arg *arg = &arg_map.args[bucket_index];
11fdf7f2 3211 decode(arg->weight_set_positions, blp);
28e407b8 3212 if (arg->weight_set_positions) {
11fdf7f2
TL
3213 arg->weight_set = static_cast<crush_weight_set*>(calloc(
3214 arg->weight_set_positions, sizeof(crush_weight_set)));
28e407b8 3215 for (__u32 k = 0; k < arg->weight_set_positions; k++) {
c07f9fc5 3216 crush_weight_set *weight_set = &arg->weight_set[k];
11fdf7f2 3217 decode(weight_set->size, blp);
c07f9fc5
FG
3218 weight_set->weights = (__u32*)calloc(
3219 weight_set->size, sizeof(__u32));
3220 for (__u32 l = 0; l < weight_set->size; l++)
11fdf7f2 3221 decode(weight_set->weights[l], blp);
c07f9fc5 3222 }
7c673cae 3223 }
11fdf7f2 3224 decode(arg->ids_size, blp);
c07f9fc5 3225 if (arg->ids_size) {
11fdf7f2 3226 ceph_assert(arg->ids_size == crush->buckets[bucket_index]->size);
c07f9fc5
FG
3227 arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
3228 for (__u32 k = 0; k < arg->ids_size; k++)
11fdf7f2 3229 decode(arg->ids[k], blp);
c07f9fc5 3230 }
7c673cae
FG
3231 }
3232 choose_args[choose_args_index] = arg_map;
3233 }
3234 }
28e407b8 3235 update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
7c673cae
FG
3236 finalize();
3237 }
3238 catch (...) {
3239 crush_destroy(crush);
3240 throw;
3241 }
3242}
3243
11fdf7f2 3244void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::const_iterator &blp)
7c673cae 3245{
11fdf7f2 3246 using ceph::decode;
7c673cae 3247 __u32 alg;
11fdf7f2 3248 decode(alg, blp);
7c673cae
FG
3249 if (!alg) {
3250 *bptr = NULL;
3251 return;
3252 }
3253
3254 int size = 0;
3255 switch (alg) {
3256 case CRUSH_BUCKET_UNIFORM:
3257 size = sizeof(crush_bucket_uniform);
3258 break;
3259 case CRUSH_BUCKET_LIST:
3260 size = sizeof(crush_bucket_list);
3261 break;
3262 case CRUSH_BUCKET_TREE:
3263 size = sizeof(crush_bucket_tree);
3264 break;
3265 case CRUSH_BUCKET_STRAW:
3266 size = sizeof(crush_bucket_straw);
3267 break;
3268 case CRUSH_BUCKET_STRAW2:
3269 size = sizeof(crush_bucket_straw2);
3270 break;
3271 default:
3272 {
3273 char str[128];
3274 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
9f95a23c 3275 throw ceph::buffer::malformed_input(str);
7c673cae
FG
3276 }
3277 }
3278 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
3279 *bptr = bucket;
3280
11fdf7f2
TL
3281 decode(bucket->id, blp);
3282 decode(bucket->type, blp);
3283 decode(bucket->alg, blp);
3284 decode(bucket->hash, blp);
3285 decode(bucket->weight, blp);
3286 decode(bucket->size, blp);
7c673cae
FG
3287
3288 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
3289 for (unsigned j = 0; j < bucket->size; ++j) {
11fdf7f2 3290 decode(bucket->items[j], blp);
7c673cae
FG
3291 }
3292
3293 switch (bucket->alg) {
3294 case CRUSH_BUCKET_UNIFORM:
11fdf7f2 3295 decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
7c673cae
FG
3296 break;
3297
3298 case CRUSH_BUCKET_LIST: {
3299 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
3300 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3301 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3302
3303 for (unsigned j = 0; j < bucket->size; ++j) {
11fdf7f2
TL
3304 decode(cbl->item_weights[j], blp);
3305 decode(cbl->sum_weights[j], blp);
7c673cae
FG
3306 }
3307 break;
3308 }
3309
3310 case CRUSH_BUCKET_TREE: {
3311 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
11fdf7f2 3312 decode(cbt->num_nodes, blp);
7c673cae
FG
3313 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
3314 for (unsigned j=0; j<cbt->num_nodes; j++) {
11fdf7f2 3315 decode(cbt->node_weights[j], blp);
7c673cae
FG
3316 }
3317 break;
3318 }
3319
3320 case CRUSH_BUCKET_STRAW: {
3321 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
3322 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3323 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3324 for (unsigned j = 0; j < bucket->size; ++j) {
11fdf7f2
TL
3325 decode(cbs->item_weights[j], blp);
3326 decode(cbs->straws[j], blp);
7c673cae
FG
3327 }
3328 break;
3329 }
3330
3331 case CRUSH_BUCKET_STRAW2: {
3332 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
3333 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
3334 for (unsigned j = 0; j < bucket->size; ++j) {
11fdf7f2 3335 decode(cbs->item_weights[j], blp);
7c673cae
FG
3336 }
3337 break;
3338 }
3339
3340 default:
3341 // We should have handled this case in the first switch statement
3342 ceph_abort();
3343 break;
3344 }
3345}
3346
3347
3348void CrushWrapper::dump(Formatter *f) const
3349{
3350 f->open_array_section("devices");
3351 for (int i=0; i<get_max_devices(); i++) {
3352 f->open_object_section("device");
3353 f->dump_int("id", i);
3354 const char *n = get_item_name(i);
3355 if (n) {
3356 f->dump_string("name", n);
3357 } else {
3358 char name[20];
3359 sprintf(name, "device%d", i);
3360 f->dump_string("name", name);
3361 }
3362 const char *device_class = get_item_class(i);
3363 if (device_class != NULL)
3364 f->dump_string("class", device_class);
3365 f->close_section();
3366 }
3367 f->close_section();
3368
3369 f->open_array_section("types");
3370 int n = get_num_type_names();
3371 for (int i=0; n; i++) {
3372 const char *name = get_type_name(i);
3373 if (!name) {
3374 if (i == 0) {
3375 f->open_object_section("type");
3376 f->dump_int("type_id", 0);
3377 f->dump_string("name", "device");
3378 f->close_section();
3379 }
3380 continue;
3381 }
3382 n--;
3383 f->open_object_section("type");
3384 f->dump_int("type_id", i);
3385 f->dump_string("name", name);
3386 f->close_section();
3387 }
3388 f->close_section();
3389
3390 f->open_array_section("buckets");
3391 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
3392 if (!bucket_exists(bucket))
3393 continue;
3394 f->open_object_section("bucket");
3395 f->dump_int("id", bucket);
3396 if (get_item_name(bucket))
3397 f->dump_string("name", get_item_name(bucket));
3398 f->dump_int("type_id", get_bucket_type(bucket));
3399 if (get_type_name(get_bucket_type(bucket)))
3400 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
3401 f->dump_int("weight", get_bucket_weight(bucket));
3402 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
3403 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
3404 f->open_array_section("items");
3405 for (int j=0; j<get_bucket_size(bucket); j++) {
3406 f->open_object_section("item");
3407 f->dump_int("id", get_bucket_item(bucket, j));
3408 f->dump_int("weight", get_bucket_item_weight(bucket, j));
3409 f->dump_int("pos", j);
3410 f->close_section();
3411 }
3412 f->close_section();
3413 f->close_section();
3414 }
3415 f->close_section();
3416
3417 f->open_array_section("rules");
3418 dump_rules(f);
3419 f->close_section();
3420
3421 f->open_object_section("tunables");
3422 dump_tunables(f);
3423 f->close_section();
3424
3425 dump_choose_args(f);
3426}
3427
3428namespace {
3429 // depth first walker
3430 class TreeDumper {
3431 typedef CrushTreeDumper::Item Item;
3432 const CrushWrapper *crush;
c07f9fc5 3433 const CrushTreeDumper::name_map_t& weight_set_names;
7c673cae 3434 public:
c07f9fc5
FG
3435 explicit TreeDumper(const CrushWrapper *crush,
3436 const CrushTreeDumper::name_map_t& wsnames)
3437 : crush(crush), weight_set_names(wsnames) {}
7c673cae
FG
3438
3439 void dump(Formatter *f) {
3440 set<int> roots;
3efd9988 3441 crush->find_roots(&roots);
7c673cae 3442 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
c07f9fc5 3443 dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
7c673cae
FG
3444 }
3445 }
3446
3447 private:
3448 void dump_item(const Item& qi, Formatter* f) {
3449 if (qi.is_bucket()) {
3450 f->open_object_section("bucket");
c07f9fc5 3451 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
7c673cae
FG
3452 dump_bucket_children(qi, f);
3453 f->close_section();
3454 } else {
3455 f->open_object_section("device");
c07f9fc5 3456 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
7c673cae
FG
3457 f->close_section();
3458 }
3459 }
3460
3461 void dump_bucket_children(const Item& parent, Formatter* f) {
3462 f->open_array_section("items");
3463 const int max_pos = crush->get_bucket_size(parent.id);
3464 for (int pos = 0; pos < max_pos; pos++) {
3465 int id = crush->get_bucket_item(parent.id, pos);
3466 float weight = crush->get_bucket_item_weightf(parent.id, pos);
c07f9fc5 3467 dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
7c673cae
FG
3468 }
3469 f->close_section();
3470 }
3471 };
3472}
3473
c07f9fc5
FG
3474void CrushWrapper::dump_tree(
3475 Formatter *f,
3476 const CrushTreeDumper::name_map_t& weight_set_names) const
7c673cae 3477{
11fdf7f2 3478 ceph_assert(f);
c07f9fc5 3479 TreeDumper(this, weight_set_names).dump(f);
7c673cae
FG
3480}
3481
3482void CrushWrapper::dump_tunables(Formatter *f) const
3483{
3484 f->dump_int("choose_local_tries", get_choose_local_tries());
3485 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
3486 f->dump_int("choose_total_tries", get_choose_total_tries());
3487 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
3488 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
3489 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
3490 f->dump_int("straw_calc_version", get_straw_calc_version());
3491 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
3492
3493 // be helpful about it
3494 if (has_jewel_tunables())
3495 f->dump_string("profile", "jewel");
3496 else if (has_hammer_tunables())
3497 f->dump_string("profile", "hammer");
3498 else if (has_firefly_tunables())
3499 f->dump_string("profile", "firefly");
3500 else if (has_bobtail_tunables())
3501 f->dump_string("profile", "bobtail");
3502 else if (has_argonaut_tunables())
3503 f->dump_string("profile", "argonaut");
3504 else
3505 f->dump_string("profile", "unknown");
3506 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
3507 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
3508
3509 // be helpful about minimum version required
3510 f->dump_string("minimum_required_version", get_min_required_version());
3511
3512 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
3513 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
3514 f->dump_int("has_v2_rules", (int)has_v2_rules());
3515 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
3516 f->dump_int("has_v3_rules", (int)has_v3_rules());
3517 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
3518 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
3519 f->dump_int("has_v5_rules", (int)has_v5_rules());
3520}
3521
3522void CrushWrapper::dump_choose_args(Formatter *f) const
3523{
3524 f->open_object_section("choose_args");
3525 for (auto c : choose_args) {
3526 crush_choose_arg_map arg_map = c.second;
3527 f->open_array_section(stringify(c.first).c_str());
3528 for (__u32 i = 0; i < arg_map.size; i++) {
3529 crush_choose_arg *arg = &arg_map.args[i];
28e407b8 3530 if (arg->weight_set_positions == 0 &&
7c673cae
FG
3531 arg->ids_size == 0)
3532 continue;
3533 f->open_object_section("choose_args");
3534 int bucket_index = i;
3535 f->dump_int("bucket_id", -1-bucket_index);
28e407b8 3536 if (arg->weight_set_positions > 0) {
7c673cae 3537 f->open_array_section("weight_set");
28e407b8 3538 for (__u32 j = 0; j < arg->weight_set_positions; j++) {
7c673cae
FG
3539 f->open_array_section("weights");
3540 __u32 *weights = arg->weight_set[j].weights;
3541 __u32 size = arg->weight_set[j].size;
3542 for (__u32 k = 0; k < size; k++) {
3543 f->dump_float("weight", (float)weights[k]/(float)0x10000);
3544 }
3545 f->close_section();
3546 }
3547 f->close_section();
3548 }
3549 if (arg->ids_size > 0) {
3550 f->open_array_section("ids");
3551 for (__u32 j = 0; j < arg->ids_size; j++)
3552 f->dump_int("id", arg->ids[j]);
3553 f->close_section();
3554 }
3555 f->close_section();
3556 }
3557 f->close_section();
3558 }
3559 f->close_section();
3560}
3561
3562void CrushWrapper::dump_rules(Formatter *f) const
3563{
3564 for (int i=0; i<get_max_rules(); i++) {
3565 if (!rule_exists(i))
3566 continue;
3567 dump_rule(i, f);
3568 }
3569}
3570
20effc67 3571void CrushWrapper::dump_rule(int rule_id, Formatter *f) const
7c673cae
FG
3572{
3573 f->open_object_section("rule");
20effc67
TL
3574 f->dump_int("rule_id", rule_id);
3575 if (get_rule_name(rule_id))
3576 f->dump_string("rule_name", get_rule_name(rule_id));
3577 f->dump_int("type", get_rule_type(rule_id));
7c673cae 3578 f->open_array_section("steps");
20effc67 3579 for (int j=0; j<get_rule_len(rule_id); j++) {
7c673cae 3580 f->open_object_section("step");
20effc67 3581 switch (get_rule_op(rule_id, j)) {
7c673cae
FG
3582 case CRUSH_RULE_NOOP:
3583 f->dump_string("op", "noop");
3584 break;
3585 case CRUSH_RULE_TAKE:
3586 f->dump_string("op", "take");
3587 {
20effc67 3588 int item = get_rule_arg1(rule_id, j);
7c673cae
FG
3589 f->dump_int("item", item);
3590
3591 const char *name = get_item_name(item);
3592 f->dump_string("item_name", name ? name : "");
3593 }
3594 break;
3595 case CRUSH_RULE_EMIT:
3596 f->dump_string("op", "emit");
3597 break;
3598 case CRUSH_RULE_CHOOSE_FIRSTN:
3599 f->dump_string("op", "choose_firstn");
20effc67
TL
3600 f->dump_int("num", get_rule_arg1(rule_id, j));
3601 f->dump_string("type", get_type_name(get_rule_arg2(rule_id, j)));
7c673cae
FG
3602 break;
3603 case CRUSH_RULE_CHOOSE_INDEP:
3604 f->dump_string("op", "choose_indep");
20effc67
TL
3605 f->dump_int("num", get_rule_arg1(rule_id, j));
3606 f->dump_string("type", get_type_name(get_rule_arg2(rule_id, j)));
7c673cae
FG
3607 break;
3608 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3609 f->dump_string("op", "chooseleaf_firstn");
20effc67
TL
3610 f->dump_int("num", get_rule_arg1(rule_id, j));
3611 f->dump_string("type", get_type_name(get_rule_arg2(rule_id, j)));
7c673cae
FG
3612 break;
3613 case CRUSH_RULE_CHOOSELEAF_INDEP:
3614 f->dump_string("op", "chooseleaf_indep");
20effc67
TL
3615 f->dump_int("num", get_rule_arg1(rule_id, j));
3616 f->dump_string("type", get_type_name(get_rule_arg2(rule_id, j)));
7c673cae
FG
3617 break;
3618 case CRUSH_RULE_SET_CHOOSE_TRIES:
3619 f->dump_string("op", "set_choose_tries");
20effc67 3620 f->dump_int("num", get_rule_arg1(rule_id, j));
7c673cae
FG
3621 break;
3622 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
3623 f->dump_string("op", "set_chooseleaf_tries");
20effc67 3624 f->dump_int("num", get_rule_arg1(rule_id, j));
7c673cae
FG
3625 break;
3626 default:
20effc67
TL
3627 f->dump_int("opcode", get_rule_op(rule_id, j));
3628 f->dump_int("arg1", get_rule_arg1(rule_id, j));
3629 f->dump_int("arg2", get_rule_arg2(rule_id, j));
7c673cae
FG
3630 }
3631 f->close_section();
3632 }
3633 f->close_section();
3634 f->close_section();
3635}
3636
3637void CrushWrapper::list_rules(Formatter *f) const
3638{
3639 for (int rule = 0; rule < get_max_rules(); rule++) {
3640 if (!rule_exists(rule))
3641 continue;
3642 f->dump_string("name", get_rule_name(rule));
3643 }
3644}
3645
c07f9fc5
FG
3646void CrushWrapper::list_rules(ostream *ss) const
3647{
3648 for (int rule = 0; rule < get_max_rules(); rule++) {
3649 if (!rule_exists(rule))
3650 continue;
3651 *ss << get_rule_name(rule) << "\n";
3652 }
3653}
7c673cae 3654
c07f9fc5
FG
3655class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3656public:
3657 typedef CrushTreeDumper::Dumper<TextTable> Parent;
3658
3659 explicit CrushTreePlainDumper(const CrushWrapper *crush,
3660 const CrushTreeDumper::name_map_t& wsnames)
3661 : Parent(crush, wsnames) {}
3662 explicit CrushTreePlainDumper(const CrushWrapper *crush,
3663 const CrushTreeDumper::name_map_t& wsnames,
3664 bool show_shadow)
3665 : Parent(crush, wsnames, show_shadow) {}
3666
3667
3668 void dump(TextTable *tbl) {
3669 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
d2e6a577 3670 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
c07f9fc5
FG
3671 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3672 for (auto& p : crush->choose_args) {
3673 if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
3674 tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
3675 } else {
3676 string name;
3677 auto q = weight_set_names.find(p.first);
3678 name = q != weight_set_names.end() ? q->second :
3679 stringify(p.first);
3680 tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
3681 }
3682 }
3683 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
3684 Parent::dump(tbl);
7c673cae
FG
3685 }
3686
3687protected:
c07f9fc5 3688 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
d2e6a577
FG
3689 const char *c = crush->get_item_class(qi.id);
3690 if (!c)
3691 c = "";
c07f9fc5 3692 *tbl << qi.id
d2e6a577 3693 << c
c07f9fc5
FG
3694 << weightf_t(qi.weight);
3695 for (auto& p : crush->choose_args) {
3696 if (qi.parent < 0) {
3697 const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
3698 int bidx = -1 - qi.parent;
3699 const crush_bucket *b = crush->get_bucket(qi.parent);
3700 if (b &&
3701 bidx < (int)cmap.size &&
3702 cmap.args[bidx].weight_set &&
28e407b8 3703 cmap.args[bidx].weight_set_positions >= 1) {
c07f9fc5
FG
3704 int pos;
3705 for (pos = 0;
3706 pos < (int)cmap.args[bidx].weight_set[0].size &&
3707 b->items[pos] != qi.id;
3708 ++pos) ;
3709 *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] /
3710 (float)0x10000);
3711 continue;
3712 }
3713 }
3714 *tbl << "";
7c673cae 3715 }
c07f9fc5
FG
3716 ostringstream ss;
3717 for (int k=0; k < qi.depth; k++) {
3718 ss << " ";
7c673cae 3719 }
c07f9fc5
FG
3720 if (qi.is_bucket()) {
3721 ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3722 << crush->get_item_name(qi.id);
3723 } else {
3724 ss << "osd." << qi.id;
3725 }
3726 *tbl << ss.str();
3727 *tbl << TextTable::endrow;
7c673cae
FG
3728 }
3729};
3730
3731
3732class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3733public:
3734 typedef CrushTreeDumper::FormattingDumper Parent;
3735
c07f9fc5
FG
3736 explicit CrushTreeFormattingDumper(
3737 const CrushWrapper *crush,
3738 const CrushTreeDumper::name_map_t& wsnames)
3739 : Parent(crush, wsnames) {}
3740
3741 explicit CrushTreeFormattingDumper(
3742 const CrushWrapper *crush,
3743 const CrushTreeDumper::name_map_t& wsnames,
3744 bool show_shadow)
3745 : Parent(crush, wsnames, show_shadow) {}
7c673cae
FG
3746
3747 void dump(Formatter *f) {
3748 f->open_array_section("nodes");
3749 Parent::dump(f);
3750 f->close_section();
91327a77
AA
3751
3752 // There is no stray bucket whose id is a negative number, so just get
3753 // the max_id and iterate from 0 to max_id to dump stray osds.
7c673cae 3754 f->open_array_section("stray");
91327a77
AA
3755 int32_t max_id = -1;
3756 if (!crush->name_map.empty()) {
3757 max_id = crush->name_map.rbegin()->first;
3758 }
3759 for (int32_t i = 0; i <= max_id; i++) {
3760 if (crush->item_exists(i) && !is_touched(i) && should_dump(i)) {
3761 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3762 }
3763 }
7c673cae
FG
3764 f->close_section();
3765 }
3766};
3767
3768
c07f9fc5
FG
3769void CrushWrapper::dump_tree(
3770 ostream *out,
3771 Formatter *f,
3772 const CrushTreeDumper::name_map_t& weight_set_names,
3773 bool show_shadow) const
7c673cae 3774{
c07f9fc5
FG
3775 if (out) {
3776 TextTable tbl;
3777 CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
3778 *out << tbl;
3779 }
3780 if (f) {
3781 CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
3782 }
7c673cae
FG
3783}
3784
3785void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
3786{
3787 o.push_back(new CrushWrapper);
3788 // fixme
3789}
3790
7c673cae 3791/**
20effc67 3792 * Determine the default CRUSH rule ID to be used with
7c673cae
FG
3793 * newly created replicated pools.
3794 *
20effc67 3795 * @returns a rule ID (>=0) or -1 if no suitable rule found
7c673cae 3796 */
20effc67
TL
3797int CrushWrapper::get_osd_pool_default_crush_replicated_rule(
3798 CephContext *cct)
7c673cae 3799{
20effc67
TL
3800 int crush_rule = cct->_conf.get_val<int64_t>("osd_pool_default_crush_rule");
3801 if (crush_rule < 0) {
3802 crush_rule = find_first_rule(pg_pool_t::TYPE_REPLICATED);
3803 } else if (!rule_exists(crush_rule)) {
3804 crush_rule = -1; // match find_first_rule() retval
7c673cae 3805 }
20effc67 3806 return crush_rule;
7c673cae
FG
3807}
3808
3809bool CrushWrapper::is_valid_crush_name(const string& s)
3810{
3811 if (s.empty())
3812 return false;
3813 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
3814 if (!(*p == '-') &&
3815 !(*p == '_') &&
3816 !(*p == '.') &&
3817 !(*p >= '0' && *p <= '9') &&
3818 !(*p >= 'A' && *p <= 'Z') &&
3819 !(*p >= 'a' && *p <= 'z'))
3820 return false;
3821 }
3822 return true;
3823}
3824
3825bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
3826 const map<string,string>& loc)
3827{
3828 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
3829 if (!is_valid_crush_name(l->first) ||
3830 !is_valid_crush_name(l->second)) {
3831 ldout(cct, 1) << "loc["
3832 << l->first << "] = '"
3833 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
3834 << dendl;
3835 return false;
3836 }
3837 }
3838 return true;
3839}
3840
3841int CrushWrapper::_choose_type_stack(
3842 CephContext *cct,
3843 const vector<pair<int,int>>& stack,
3844 const set<int>& overfull,
3845 const vector<int>& underfull,
92f5a8d4 3846 const vector<int>& more_underfull,
7c673cae
FG
3847 const vector<int>& orig,
3848 vector<int>::const_iterator& i,
3849 set<int>& used,
a8e16298 3850 vector<int> *pw,
92f5a8d4
TL
3851 int root_bucket,
3852 int rule) const
7c673cae
FG
3853{
3854 vector<int> w = *pw;
3855 vector<int> o;
3856
3857 ldout(cct, 10) << __func__ << " stack " << stack
3858 << " orig " << orig
3859 << " at " << *i
3860 << " pw " << *pw
3861 << dendl;
a8e16298 3862 ceph_assert(root_bucket < 0);
7c673cae
FG
3863 vector<int> cumulative_fanout(stack.size());
3864 int f = 1;
3865 for (int j = (int)stack.size() - 1; j >= 0; --j) {
3866 cumulative_fanout[j] = f;
3867 f *= stack[j].second;
3868 }
3869 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
3870 << dendl;
3871
11fdf7f2 3872 // identify underfull targets for each intermediate level.
31f18b77
FG
3873 // this serves two purposes:
3874 // 1. we can tell when we are selecting a bucket that does not have any underfull
3875 // devices beneath it. that means that if the current input includes an overfull
3876 // device, we won't be able to find an underfull device with this parent to
3877 // swap for it.
3878 // 2. when we decide we should reject a bucket due to the above, this list gives us
3879 // a list of peers to consider that *do* have underfull devices available.. (we
3880 // are careful to pick one that has the same parent.)
3881 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
3882 underfull_buckets.resize(stack.size() - 1);
3883 for (auto osd : underfull) {
3884 int item = osd;
3885 for (int j = (int)stack.size() - 2; j >= 0; --j) {
3886 int type = stack[j].first;
92f5a8d4 3887 item = get_parent_of_type(item, type, rule);
31f18b77
FG
3888 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
3889 << " is " << item << dendl;
a8e16298
TL
3890 if (!subtree_contains(root_bucket, item)) {
3891 ldout(cct, 20) << __func__ << " not in root subtree " << root_bucket << dendl;
3892 continue;
3893 }
31f18b77
FG
3894 underfull_buckets[j].insert(item);
3895 }
3896 }
3897 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
3898
7c673cae
FG
3899 for (unsigned j = 0; j < stack.size(); ++j) {
3900 int type = stack[j].first;
3901 int fanout = stack[j].second;
3902 int cum_fanout = cumulative_fanout[j];
3903 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
3904 << " cumulative " << cum_fanout
3905 << " w " << w << dendl;
3906 vector<int> o;
3907 auto tmpi = i;
28e407b8
AA
3908 if (i == orig.end()) {
3909 ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl;
3910 break;
3911 }
7c673cae
FG
3912 for (auto from : w) {
3913 ldout(cct, 10) << " from " << from << dendl;
31f18b77
FG
3914 // identify leaves under each choice. we use this to check whether any of these
3915 // leaves are overfull. (if so, we need to make sure there are underfull candidates
3916 // to swap for them.)
3917 vector<set<int>> leaves;
3918 leaves.resize(fanout);
7c673cae
FG
3919 for (int pos = 0; pos < fanout; ++pos) {
3920 if (type > 0) {
3921 // non-leaf
92f5a8d4 3922 int item = get_parent_of_type(*tmpi, type, rule);
7c673cae 3923 o.push_back(item);
7c673cae 3924 int n = cum_fanout;
31f18b77
FG
3925 while (n-- && tmpi != orig.end()) {
3926 leaves[pos].insert(*tmpi++);
3927 }
3928 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
3929 << " of type " << type << " over leaves " << leaves[pos] << dendl;
7c673cae
FG
3930 } else {
3931 // leaf
3932 bool replaced = false;
3933 if (overfull.count(*i)) {
3934 for (auto item : underfull) {
3935 ldout(cct, 10) << __func__ << " pos " << pos
3936 << " was " << *i << " considering " << item
3937 << dendl;
3938 if (used.count(item)) {
3939 ldout(cct, 20) << __func__ << " in used " << used << dendl;
3940 continue;
3941 }
3942 if (!subtree_contains(from, item)) {
3943 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
3944 continue;
3945 }
3946 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
3947 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
3948 continue;
3949 }
3950 o.push_back(item);
3951 used.insert(item);
3952 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
3953 << *i << " -> " << item << dendl;
3954 replaced = true;
11fdf7f2 3955 ceph_assert(i != orig.end());
7c673cae
FG
3956 ++i;
3957 break;
3958 }
92f5a8d4
TL
3959 if (!replaced) {
3960 for (auto item : more_underfull) {
3961 ldout(cct, 10) << __func__ << " more underfull pos " << pos
3962 << " was " << *i << " considering " << item
3963 << dendl;
3964 if (used.count(item)) {
3965 ldout(cct, 20) << __func__ << " in used " << used << dendl;
3966 continue;
3967 }
3968 if (!subtree_contains(from, item)) {
3969 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
3970 continue;
3971 }
3972 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
3973 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
3974 continue;
3975 }
3976 o.push_back(item);
3977 used.insert(item);
3978 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
3979 << *i << " -> " << item << dendl;
3980 replaced = true;
3981 assert(i != orig.end());
3982 ++i;
3983 break;
3984 }
3985 }
7c673cae
FG
3986 }
3987 if (!replaced) {
3988 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
3989 << dendl;
11fdf7f2 3990 ceph_assert(i != orig.end());
7c673cae
FG
3991 o.push_back(*i);
3992 ++i;
3993 }
3994 if (i == orig.end()) {
3995 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
3996 break;
3997 }
3998 }
3999 }
31f18b77
FG
4000 if (j + 1 < stack.size()) {
4001 // check if any buckets have overfull leaves but no underfull candidates
4002 for (int pos = 0; pos < fanout; ++pos) {
4003 if (underfull_buckets[j].count(o[pos]) == 0) {
4004 // are any leaves overfull?
4005 bool any_overfull = false;
4006 for (auto osd : leaves[pos]) {
4007 if (overfull.count(osd)) {
4008 any_overfull = true;
11fdf7f2 4009 break;
31f18b77
FG
4010 }
4011 }
4012 if (any_overfull) {
4013 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
4014 << ">0 leaves " << leaves[pos] << " is overfull; alts "
4015 << underfull_buckets[j]
4016 << dendl;
4017 for (auto alt : underfull_buckets[j]) {
4018 if (std::find(o.begin(), o.end(), alt) == o.end()) {
4019 // see if alt has the same parent
4020 if (j == 0 ||
92f5a8d4
TL
4021 get_parent_of_type(o[pos], stack[j-1].first, rule) ==
4022 get_parent_of_type(alt, stack[j-1].first, rule)) {
31f18b77
FG
4023 if (j)
4024 ldout(cct, 10) << " replacing " << o[pos]
4025 << " (which has no underfull leaves) with " << alt
4026 << " (same parent "
92f5a8d4 4027 << get_parent_of_type(alt, stack[j-1].first, rule) << " type "
31f18b77
FG
4028 << type << ")" << dendl;
4029 else
4030 ldout(cct, 10) << " replacing " << o[pos]
4031 << " (which has no underfull leaves) with " << alt
4032 << " (first level)" << dendl;
4033 o[pos] = alt;
4034 break;
4035 } else {
4036 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
4037 << " has different parent, skipping" << dendl;
4038 }
4039 }
4040 }
4041 }
4042 }
4043 }
4044 }
7c673cae
FG
4045 if (i == orig.end()) {
4046 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
4047 break;
4048 }
4049 }
4050 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
4051 w.swap(o);
4052 }
4053 *pw = w;
4054 return 0;
4055}
4056
4057int CrushWrapper::try_remap_rule(
4058 CephContext *cct,
4059 int ruleno,
4060 int maxout,
4061 const set<int>& overfull,
4062 const vector<int>& underfull,
92f5a8d4 4063 const vector<int>& more_underfull,
7c673cae
FG
4064 const vector<int>& orig,
4065 vector<int> *out) const
4066{
4067 const crush_map *map = crush;
4068 const crush_rule *rule = get_rule(ruleno);
11fdf7f2 4069 ceph_assert(rule);
7c673cae
FG
4070
4071 ldout(cct, 10) << __func__ << " ruleno " << ruleno
4072 << " numrep " << maxout << " overfull " << overfull
92f5a8d4
TL
4073 << " underfull " << underfull
4074 << " more_underfull " << more_underfull
4075 << " orig " << orig
7c673cae
FG
4076 << dendl;
4077 vector<int> w; // working set
4078 out->clear();
4079
4080 auto i = orig.begin();
4081 set<int> used;
4082
4083 vector<pair<int,int>> type_stack; // (type, fan-out)
a8e16298 4084 int root_bucket = 0;
7c673cae
FG
4085 for (unsigned step = 0; step < rule->len; ++step) {
4086 const crush_rule_step *curstep = &rule->steps[step];
4087 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
4088 switch (curstep->op) {
4089 case CRUSH_RULE_TAKE:
4090 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
4091 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
4092 map->buckets[-1-curstep->arg1])) {
4093 w.clear();
4094 w.push_back(curstep->arg1);
a8e16298 4095 root_bucket = curstep->arg1;
7c673cae
FG
4096 ldout(cct, 10) << __func__ << " take " << w << dendl;
4097 } else {
4098 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
4099 }
4100 break;
4101
4102 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
4103 case CRUSH_RULE_CHOOSELEAF_INDEP:
4104 {
4105 int numrep = curstep->arg1;
4106 int type = curstep->arg2;
4107 if (numrep <= 0)
4108 numrep += maxout;
4109 type_stack.push_back(make_pair(type, numrep));
28e407b8
AA
4110 if (type > 0)
4111 type_stack.push_back(make_pair(0, 1));
92f5a8d4
TL
4112 int r = _choose_type_stack(cct, type_stack, overfull, underfull, more_underfull, orig,
4113 i, used, &w, root_bucket, ruleno);
7c673cae
FG
4114 if (r < 0)
4115 return r;
4116 type_stack.clear();
4117 }
4118 break;
4119
4120 case CRUSH_RULE_CHOOSE_FIRSTN:
4121 case CRUSH_RULE_CHOOSE_INDEP:
4122 {
4123 int numrep = curstep->arg1;
4124 int type = curstep->arg2;
4125 if (numrep <= 0)
4126 numrep += maxout;
4127 type_stack.push_back(make_pair(type, numrep));
4128 }
4129 break;
4130
4131 case CRUSH_RULE_EMIT:
4132 ldout(cct, 10) << " emit " << w << dendl;
4133 if (!type_stack.empty()) {
92f5a8d4
TL
4134 int r = _choose_type_stack(cct, type_stack, overfull, underfull, more_underfull, orig,
4135 i, used, &w, root_bucket, ruleno);
7c673cae
FG
4136 if (r < 0)
4137 return r;
4138 type_stack.clear();
4139 }
4140 for (auto item : w) {
4141 out->push_back(item);
4142 }
4143 w.clear();
4144 break;
4145
4146 default:
4147 // ignore
4148 break;
4149 }
4150 }
4151
4152 return 0;
4153}
c07f9fc5
FG
4154
4155
4156int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
4157 CephContext *cct,
4158 crush_choose_arg_map cmap,
4159 int bucketid,
4160 int id,
4161 const vector<int>& weight,
4162 ostream *ss)
4163{
4164 int changed = 0;
4165 int bidx = -1 - bucketid;
4166 crush_bucket *b = crush->buckets[bidx];
4167 if (bidx >= (int)cmap.size) {
4168 if (ss)
4169 *ss << "no weight-set for bucket " << b->id;
4170 ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
4171 << dendl;
4172 return 0;
4173 }
4174 crush_choose_arg *carg = &cmap.args[bidx];
4175 if (carg->weight_set == NULL) {
28e407b8
AA
4176 // create a weight-set for this bucket and populate it with the
4177 // bucket weights
4178 unsigned positions = get_choose_args_positions(cmap);
4179 carg->weight_set_positions = positions;
4180 carg->weight_set = static_cast<crush_weight_set*>(
4181 calloc(sizeof(crush_weight_set), positions));
4182 for (unsigned p = 0; p < positions; ++p) {
4183 carg->weight_set[p].size = b->size;
4184 carg->weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
4185 for (unsigned i = 0; i < b->size; ++i) {
4186 carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i);
4187 }
4188 }
4189 changed++;
c07f9fc5 4190 }
28e407b8 4191 if (carg->weight_set_positions != weight.size()) {
c07f9fc5 4192 if (ss)
28e407b8
AA
4193 *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id;
4194 ldout(cct, 10) << __func__ << " weight_set_positions != " << weight.size()
c07f9fc5
FG
4195 << " for bucket " << b->id << dendl;
4196 return 0;
4197 }
4198 for (unsigned i = 0; i < b->size; i++) {
4199 if (b->items[i] == id) {
4200 for (unsigned j = 0; j < weight.size(); ++j) {
4201 carg->weight_set[j].weights[i] = weight[j];
4202 }
4203 ldout(cct, 5) << __func__ << " set " << id << " to " << weight
4204 << " in bucket " << b->id << dendl;
4205 changed++;
4206 }
4207 }
4208 if (changed) {
4209 vector<int> bucket_weight(weight.size(), 0);
4210 for (unsigned i = 0; i < b->size; i++) {
4211 for (unsigned j = 0; j < weight.size(); ++j) {
4212 bucket_weight[j] += carg->weight_set[j].weights[i];
4213 }
4214 }
4215 choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
4216 }
4217 return changed;
4218}
4219
4220int CrushWrapper::choose_args_adjust_item_weight(
4221 CephContext *cct,
4222 crush_choose_arg_map cmap,
4223 int id,
4224 const vector<int>& weight,
4225 ostream *ss)
4226{
4227 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
4228 int changed = 0;
4229 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
4230 crush_bucket *b = crush->buckets[bidx];
4231 if (b == nullptr) {
4232 continue;
4233 }
4234 changed += _choose_args_adjust_item_weight_in_bucket(
4235 cct, cmap, b->id, id, weight, ss);
4236 }
4237 if (!changed) {
4238 if (ss)
4239 *ss << "item " << id << " not found in crush map";
4240 return -ENOENT;
4241 }
4242 return changed;
4243}