]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSDMap.cc
update sources to v12.2.1
[ceph.git] / ceph / src / osd / OSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/algorithm/string.hpp>
19
20 #include "OSDMap.h"
21 #include <algorithm>
22 #include "common/config.h"
23 #include "common/Formatter.h"
24 #include "common/TextTable.h"
25 #include "include/ceph_features.h"
26 #include "include/str_map.h"
27
28 #include "common/code_environment.h"
29 #include "mon/health_check.h"
30
31 #include "crush/CrushTreeDumper.h"
32 #include "common/Clock.h"
33 #include "mon/PGStatService.h"
34
35 #define dout_subsys ceph_subsys_osd
36
37 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
38 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
39
40
41 // ----------------------------------
42 // osd_info_t
43
44 void osd_info_t::dump(Formatter *f) const
45 {
46 f->dump_int("last_clean_begin", last_clean_begin);
47 f->dump_int("last_clean_end", last_clean_end);
48 f->dump_int("up_from", up_from);
49 f->dump_int("up_thru", up_thru);
50 f->dump_int("down_at", down_at);
51 f->dump_int("lost_at", lost_at);
52 }
53
54 void osd_info_t::encode(bufferlist& bl) const
55 {
56 __u8 struct_v = 1;
57 ::encode(struct_v, bl);
58 ::encode(last_clean_begin, bl);
59 ::encode(last_clean_end, bl);
60 ::encode(up_from, bl);
61 ::encode(up_thru, bl);
62 ::encode(down_at, bl);
63 ::encode(lost_at, bl);
64 }
65
66 void osd_info_t::decode(bufferlist::iterator& bl)
67 {
68 __u8 struct_v;
69 ::decode(struct_v, bl);
70 ::decode(last_clean_begin, bl);
71 ::decode(last_clean_end, bl);
72 ::decode(up_from, bl);
73 ::decode(up_thru, bl);
74 ::decode(down_at, bl);
75 ::decode(lost_at, bl);
76 }
77
78 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
79 {
80 o.push_back(new osd_info_t);
81 o.push_back(new osd_info_t);
82 o.back()->last_clean_begin = 1;
83 o.back()->last_clean_end = 2;
84 o.back()->up_from = 30;
85 o.back()->up_thru = 40;
86 o.back()->down_at = 5;
87 o.back()->lost_at = 6;
88 }
89
90 ostream& operator<<(ostream& out, const osd_info_t& info)
91 {
92 out << "up_from " << info.up_from
93 << " up_thru " << info.up_thru
94 << " down_at " << info.down_at
95 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
96 if (info.lost_at)
97 out << " lost_at " << info.lost_at;
98 return out;
99 }
100
101 // ----------------------------------
102 // osd_xinfo_t
103
104 void osd_xinfo_t::dump(Formatter *f) const
105 {
106 f->dump_stream("down_stamp") << down_stamp;
107 f->dump_float("laggy_probability", laggy_probability);
108 f->dump_int("laggy_interval", laggy_interval);
109 f->dump_int("features", features);
110 f->dump_unsigned("old_weight", old_weight);
111 }
112
113 void osd_xinfo_t::encode(bufferlist& bl) const
114 {
115 ENCODE_START(3, 1, bl);
116 ::encode(down_stamp, bl);
117 __u32 lp = laggy_probability * 0xfffffffful;
118 ::encode(lp, bl);
119 ::encode(laggy_interval, bl);
120 ::encode(features, bl);
121 ::encode(old_weight, bl);
122 ENCODE_FINISH(bl);
123 }
124
125 void osd_xinfo_t::decode(bufferlist::iterator& bl)
126 {
127 DECODE_START(3, bl);
128 ::decode(down_stamp, bl);
129 __u32 lp;
130 ::decode(lp, bl);
131 laggy_probability = (float)lp / (float)0xffffffff;
132 ::decode(laggy_interval, bl);
133 if (struct_v >= 2)
134 ::decode(features, bl);
135 else
136 features = 0;
137 if (struct_v >= 3)
138 ::decode(old_weight, bl);
139 else
140 old_weight = 0;
141 DECODE_FINISH(bl);
142 }
143
144 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
145 {
146 o.push_back(new osd_xinfo_t);
147 o.push_back(new osd_xinfo_t);
148 o.back()->down_stamp = utime_t(2, 3);
149 o.back()->laggy_probability = .123;
150 o.back()->laggy_interval = 123456;
151 o.back()->old_weight = 0x7fff;
152 }
153
154 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
155 {
156 return out << "down_stamp " << xi.down_stamp
157 << " laggy_probability " << xi.laggy_probability
158 << " laggy_interval " << xi.laggy_interval
159 << " old_weight " << xi.old_weight;
160 }
161
162 // ----------------------------------
163 // OSDMap::Incremental
164
165 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
166 {
167 int n = 0;
168 for (auto &weight : new_weight) {
169 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
170 n++; // marked out
171 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
172 n--; // marked in
173 }
174 return n;
175 }
176
177 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
178 {
179 int n = 0;
180 for (auto &state : new_state) { //
181 if (state.second & CEPH_OSD_UP) {
182 if (previous->is_up(state.first))
183 n++; // marked down
184 else
185 n--; // marked up
186 }
187 }
188 return n;
189 }
190
191 int OSDMap::Incremental::identify_osd(uuid_d u) const
192 {
193 for (auto &uuid : new_uuid)
194 if (uuid.second == u)
195 return uuid.first;
196 return -1;
197 }
198
199 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
200 const OSDMap& osdmap)
201 {
202 assert(epoch == osdmap.get_epoch() + 1);
203
204 for (auto &new_pool : new_pools) {
205 if (!new_pool.second.tiers.empty()) {
206 pg_pool_t& base = new_pool.second;
207
208 for (const auto &tier_pool : base.tiers) {
209 const auto &r = new_pools.find(tier_pool);
210 pg_pool_t *tier = 0;
211 if (r == new_pools.end()) {
212 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
213 if (!orig) {
214 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
215 return -EIO;
216 }
217 tier = get_new_pool(tier_pool, orig);
218 } else {
219 tier = &r->second;
220 }
221 if (tier->tier_of != new_pool.first) {
222 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
223 return -EIO;
224 }
225
226 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
227 << tier_pool << dendl;
228 tier->snap_seq = base.snap_seq;
229 tier->snap_epoch = base.snap_epoch;
230 tier->snaps = base.snaps;
231 tier->removed_snaps = base.removed_snaps;
232 }
233 }
234 }
235 return 0;
236 }
237
238
239 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
240 {
241 if (id >= 0)
242 return is_down(id);
243
244 if (down_cache &&
245 down_cache->count(id)) {
246 return true;
247 }
248
249 list<int> children;
250 crush->get_children(id, &children);
251 for (const auto &child : children) {
252 if (!subtree_is_down(child, down_cache)) {
253 return false;
254 }
255 }
256 if (down_cache) {
257 down_cache->insert(id);
258 }
259 return true;
260 }
261
262 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
263 {
264 // use a stack-local down_cache if we didn't get one from the
265 // caller. then at least this particular call will avoid duplicated
266 // work.
267 set<int> local_down_cache;
268 if (!down_cache) {
269 down_cache = &local_down_cache;
270 }
271
272 int current = id;
273 while (true) {
274 int type;
275 if (current >= 0) {
276 type = 0;
277 } else {
278 type = crush->get_bucket_type(current);
279 }
280 assert(type >= 0);
281
282 if (!subtree_is_down(current, down_cache)) {
283 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
284 return false;
285 }
286
287 // is this a big enough subtree to be marked as down?
288 if (type >= subtree_type) {
289 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
290 return true;
291 }
292
293 int r = crush->get_immediate_parent_id(current, &current);
294 if (r < 0) {
295 return false;
296 }
297 }
298 }
299
300 bool OSDMap::subtree_type_is_down(
301 CephContext *cct,
302 int id,
303 int subtree_type,
304 set<int> *down_in_osds,
305 set<int> *up_in_osds,
306 set<int> *subtree_up,
307 unordered_map<int, set<int> > *subtree_type_down) const
308 {
309 if (id >= 0) {
310 bool is_down_ret = is_down(id);
311 if (!is_out(id)) {
312 if (is_down_ret) {
313 down_in_osds->insert(id);
314 } else {
315 up_in_osds->insert(id);
316 }
317 }
318 return is_down_ret;
319 }
320
321 if (subtree_type_down &&
322 (*subtree_type_down)[subtree_type].count(id)) {
323 return true;
324 }
325
326 list<int> children;
327 crush->get_children(id, &children);
328 for (const auto &child : children) {
329 if (!subtree_type_is_down(
330 cct, child, crush->get_bucket_type(child),
331 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
332 subtree_up->insert(id);
333 return false;
334 }
335 }
336 if (subtree_type_down) {
337 (*subtree_type_down)[subtree_type].insert(id);
338 }
339 return true;
340 }
341
342 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
343 {
344 __u16 v = 5;
345 ::encode(v, bl);
346 ::encode(fsid, bl);
347 ::encode(epoch, bl);
348 ::encode(modified, bl);
349 int32_t new_t = new_pool_max;
350 ::encode(new_t, bl);
351 ::encode(new_flags, bl);
352 ::encode(fullmap, bl);
353 ::encode(crush, bl);
354
355 ::encode(new_max_osd, bl);
356 // for ::encode(new_pools, bl);
357 __u32 n = new_pools.size();
358 ::encode(n, bl);
359 for (const auto &new_pool : new_pools) {
360 n = new_pool.first;
361 ::encode(n, bl);
362 ::encode(new_pool.second, bl, 0);
363 }
364 // for ::encode(new_pool_names, bl);
365 n = new_pool_names.size();
366 ::encode(n, bl);
367
368 for (const auto &new_pool_name : new_pool_names) {
369 n = new_pool_name.first;
370 ::encode(n, bl);
371 ::encode(new_pool_name.second, bl);
372 }
373 // for ::encode(old_pools, bl);
374 n = old_pools.size();
375 ::encode(n, bl);
376 for (auto &old_pool : old_pools) {
377 n = old_pool;
378 ::encode(n, bl);
379 }
380 ::encode(new_up_client, bl, 0);
381 {
382 // legacy is map<int32_t,uint8_t>
383 uint32_t n = new_state.size();
384 ::encode(n, bl);
385 for (auto p : new_state) {
386 ::encode(p.first, bl);
387 ::encode((uint8_t)p.second, bl);
388 }
389 }
390 ::encode(new_weight, bl);
391 // for ::encode(new_pg_temp, bl);
392 n = new_pg_temp.size();
393 ::encode(n, bl);
394
395 for (const auto &pg_temp : new_pg_temp) {
396 old_pg_t opg = pg_temp.first.get_old_pg();
397 ::encode(opg, bl);
398 ::encode(pg_temp.second, bl);
399 }
400 }
401
402 void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
403 {
404 if ((features & CEPH_FEATURE_PGID64) == 0) {
405 encode_client_old(bl);
406 return;
407 }
408
409 // base
410 __u16 v = 6;
411 ::encode(v, bl);
412 ::encode(fsid, bl);
413 ::encode(epoch, bl);
414 ::encode(modified, bl);
415 ::encode(new_pool_max, bl);
416 ::encode(new_flags, bl);
417 ::encode(fullmap, bl);
418 ::encode(crush, bl);
419
420 ::encode(new_max_osd, bl);
421 ::encode(new_pools, bl, features);
422 ::encode(new_pool_names, bl);
423 ::encode(old_pools, bl);
424 ::encode(new_up_client, bl, features);
425 {
426 uint32_t n = new_state.size();
427 ::encode(n, bl);
428 for (auto p : new_state) {
429 ::encode(p.first, bl);
430 ::encode((uint8_t)p.second, bl);
431 }
432 }
433 ::encode(new_weight, bl);
434 ::encode(new_pg_temp, bl);
435
436 // extended
437 __u16 ev = 10;
438 ::encode(ev, bl);
439 ::encode(new_hb_back_up, bl, features);
440 ::encode(new_up_thru, bl);
441 ::encode(new_last_clean_interval, bl);
442 ::encode(new_lost, bl);
443 ::encode(new_blacklist, bl, features);
444 ::encode(old_blacklist, bl, features);
445 ::encode(new_up_cluster, bl, features);
446 ::encode(cluster_snapshot, bl);
447 ::encode(new_uuid, bl);
448 ::encode(new_xinfo, bl);
449 ::encode(new_hb_front_up, bl, features);
450 }
451
452 void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
453 {
454 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
455 encode_classic(bl, features);
456 return;
457 }
458
459 // only a select set of callers should *ever* be encoding new
460 // OSDMaps. others should be passing around the canonical encoded
461 // buffers from on high. select out those callers by passing in an
462 // "impossible" feature bit.
463 assert(features & CEPH_FEATURE_RESERVED);
464 features &= ~CEPH_FEATURE_RESERVED;
465
466 size_t start_offset = bl.length();
467 size_t tail_offset;
468 buffer::list::iterator crc_it;
469
470 // meta-encoding: how we include client-used and osd-specific data
471 ENCODE_START(8, 7, bl);
472
473 {
474 uint8_t v = 5;
475 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
476 v = 3;
477 }
478 ENCODE_START(v, 1, bl); // client-usable data
479 ::encode(fsid, bl);
480 ::encode(epoch, bl);
481 ::encode(modified, bl);
482 ::encode(new_pool_max, bl);
483 ::encode(new_flags, bl);
484 ::encode(fullmap, bl);
485 ::encode(crush, bl);
486
487 ::encode(new_max_osd, bl);
488 ::encode(new_pools, bl, features);
489 ::encode(new_pool_names, bl);
490 ::encode(old_pools, bl);
491 ::encode(new_up_client, bl, features);
492 if (v >= 5) {
493 ::encode(new_state, bl);
494 } else {
495 uint32_t n = new_state.size();
496 ::encode(n, bl);
497 for (auto p : new_state) {
498 ::encode(p.first, bl);
499 ::encode((uint8_t)p.second, bl);
500 }
501 }
502 ::encode(new_weight, bl);
503 ::encode(new_pg_temp, bl);
504 ::encode(new_primary_temp, bl);
505 ::encode(new_primary_affinity, bl);
506 ::encode(new_erasure_code_profiles, bl);
507 ::encode(old_erasure_code_profiles, bl);
508 if (v >= 4) {
509 ::encode(new_pg_upmap, bl);
510 ::encode(old_pg_upmap, bl);
511 ::encode(new_pg_upmap_items, bl);
512 ::encode(old_pg_upmap_items, bl);
513 }
514 ENCODE_FINISH(bl); // client-usable data
515 }
516
517 {
518 uint8_t target_v = 6;
519 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
520 target_v = 2;
521 }
522 ENCODE_START(target_v, 1, bl); // extended, osd-only data
523 ::encode(new_hb_back_up, bl, features);
524 ::encode(new_up_thru, bl);
525 ::encode(new_last_clean_interval, bl);
526 ::encode(new_lost, bl);
527 ::encode(new_blacklist, bl, features);
528 ::encode(old_blacklist, bl, features);
529 ::encode(new_up_cluster, bl, features);
530 ::encode(cluster_snapshot, bl);
531 ::encode(new_uuid, bl);
532 ::encode(new_xinfo, bl);
533 ::encode(new_hb_front_up, bl, features);
534 ::encode(features, bl); // NOTE: features arg, not the member
535 if (target_v >= 3) {
536 ::encode(new_nearfull_ratio, bl);
537 ::encode(new_full_ratio, bl);
538 ::encode(new_backfillfull_ratio, bl);
539 }
540 // 5 was string-based new_require_min_compat_client
541 if (target_v >= 6) {
542 ::encode(new_require_min_compat_client, bl);
543 ::encode(new_require_osd_release, bl);
544 }
545 ENCODE_FINISH(bl); // osd-only data
546 }
547
548 ::encode((uint32_t)0, bl); // dummy inc_crc
549 crc_it = bl.end();
550 crc_it.advance(-4);
551 tail_offset = bl.length();
552
553 ::encode(full_crc, bl);
554
555 ENCODE_FINISH(bl); // meta-encoding wrapper
556
557 // fill in crc
558 bufferlist front;
559 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
560 inc_crc = front.crc32c(-1);
561 bufferlist tail;
562 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
563 inc_crc = tail.crc32c(inc_crc);
564 ceph_le32 crc_le;
565 crc_le = inc_crc;
566 crc_it.copy_in(4, (char*)&crc_le);
567 have_crc = true;
568 }
569
570 void OSDMap::Incremental::decode_classic(bufferlist::iterator &p)
571 {
572 __u32 n, t;
573 // base
574 __u16 v;
575 ::decode(v, p);
576 ::decode(fsid, p);
577 ::decode(epoch, p);
578 ::decode(modified, p);
579 if (v == 4 || v == 5) {
580 ::decode(n, p);
581 new_pool_max = n;
582 } else if (v >= 6)
583 ::decode(new_pool_max, p);
584 ::decode(new_flags, p);
585 ::decode(fullmap, p);
586 ::decode(crush, p);
587
588 ::decode(new_max_osd, p);
589 if (v < 6) {
590 new_pools.clear();
591 ::decode(n, p);
592 while (n--) {
593 ::decode(t, p);
594 ::decode(new_pools[t], p);
595 }
596 } else {
597 ::decode(new_pools, p);
598 }
599 if (v == 5) {
600 new_pool_names.clear();
601 ::decode(n, p);
602 while (n--) {
603 ::decode(t, p);
604 ::decode(new_pool_names[t], p);
605 }
606 } else if (v >= 6) {
607 ::decode(new_pool_names, p);
608 }
609 if (v < 6) {
610 old_pools.clear();
611 ::decode(n, p);
612 while (n--) {
613 ::decode(t, p);
614 old_pools.insert(t);
615 }
616 } else {
617 ::decode(old_pools, p);
618 }
619 ::decode(new_up_client, p);
620 {
621 map<int32_t,uint8_t> ns;
622 ::decode(ns, p);
623 for (auto q : ns) {
624 new_state[q.first] = q.second;
625 }
626 }
627 ::decode(new_weight, p);
628
629 if (v < 6) {
630 new_pg_temp.clear();
631 ::decode(n, p);
632 while (n--) {
633 old_pg_t opg;
634 ::decode_raw(opg, p);
635 ::decode(new_pg_temp[pg_t(opg)], p);
636 }
637 } else {
638 ::decode(new_pg_temp, p);
639 }
640
641 // decode short map, too.
642 if (v == 5 && p.end())
643 return;
644
645 // extended
646 __u16 ev = 0;
647 if (v >= 5)
648 ::decode(ev, p);
649 ::decode(new_hb_back_up, p);
650 if (v < 5)
651 ::decode(new_pool_names, p);
652 ::decode(new_up_thru, p);
653 ::decode(new_last_clean_interval, p);
654 ::decode(new_lost, p);
655 ::decode(new_blacklist, p);
656 ::decode(old_blacklist, p);
657 if (ev >= 6)
658 ::decode(new_up_cluster, p);
659 if (ev >= 7)
660 ::decode(cluster_snapshot, p);
661 if (ev >= 8)
662 ::decode(new_uuid, p);
663 if (ev >= 9)
664 ::decode(new_xinfo, p);
665 if (ev >= 10)
666 ::decode(new_hb_front_up, p);
667 }
668
669 void OSDMap::Incremental::decode(bufferlist::iterator& bl)
670 {
671 /**
672 * Older encodings of the Incremental had a single struct_v which
673 * covered the whole encoding, and was prior to our modern
674 * stuff which includes a compatv and a size. So if we see
675 * a struct_v < 7, we must rewind to the beginning and use our
676 * classic decoder.
677 */
678 size_t start_offset = bl.get_off();
679 size_t tail_offset = 0;
680 bufferlist crc_front, crc_tail;
681
682 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
683 if (struct_v < 7) {
684 int struct_v_size = sizeof(struct_v);
685 bl.advance(-struct_v_size);
686 decode_classic(bl);
687 encode_features = 0;
688 if (struct_v >= 6)
689 encode_features = CEPH_FEATURE_PGID64;
690 else
691 encode_features = 0;
692 return;
693 }
694 {
695 DECODE_START(5, bl); // client-usable data
696 ::decode(fsid, bl);
697 ::decode(epoch, bl);
698 ::decode(modified, bl);
699 ::decode(new_pool_max, bl);
700 ::decode(new_flags, bl);
701 ::decode(fullmap, bl);
702 ::decode(crush, bl);
703
704 ::decode(new_max_osd, bl);
705 ::decode(new_pools, bl);
706 ::decode(new_pool_names, bl);
707 ::decode(old_pools, bl);
708 ::decode(new_up_client, bl);
709 if (struct_v >= 5) {
710 ::decode(new_state, bl);
711 } else {
712 map<int32_t,uint8_t> ns;
713 ::decode(ns, bl);
714 for (auto q : ns) {
715 new_state[q.first] = q.second;
716 }
717 }
718 ::decode(new_weight, bl);
719 ::decode(new_pg_temp, bl);
720 ::decode(new_primary_temp, bl);
721 if (struct_v >= 2)
722 ::decode(new_primary_affinity, bl);
723 else
724 new_primary_affinity.clear();
725 if (struct_v >= 3) {
726 ::decode(new_erasure_code_profiles, bl);
727 ::decode(old_erasure_code_profiles, bl);
728 } else {
729 new_erasure_code_profiles.clear();
730 old_erasure_code_profiles.clear();
731 }
732 if (struct_v >= 4) {
733 ::decode(new_pg_upmap, bl);
734 ::decode(old_pg_upmap, bl);
735 ::decode(new_pg_upmap_items, bl);
736 ::decode(old_pg_upmap_items, bl);
737 }
738 DECODE_FINISH(bl); // client-usable data
739 }
740
741 {
742 DECODE_START(6, bl); // extended, osd-only data
743 ::decode(new_hb_back_up, bl);
744 ::decode(new_up_thru, bl);
745 ::decode(new_last_clean_interval, bl);
746 ::decode(new_lost, bl);
747 ::decode(new_blacklist, bl);
748 ::decode(old_blacklist, bl);
749 ::decode(new_up_cluster, bl);
750 ::decode(cluster_snapshot, bl);
751 ::decode(new_uuid, bl);
752 ::decode(new_xinfo, bl);
753 ::decode(new_hb_front_up, bl);
754 if (struct_v >= 2)
755 ::decode(encode_features, bl);
756 else
757 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
758 if (struct_v >= 3) {
759 ::decode(new_nearfull_ratio, bl);
760 ::decode(new_full_ratio, bl);
761 } else {
762 new_nearfull_ratio = -1;
763 new_full_ratio = -1;
764 }
765 if (struct_v >= 4) {
766 ::decode(new_backfillfull_ratio, bl);
767 } else {
768 new_backfillfull_ratio = -1;
769 }
770 if (struct_v == 5) {
771 string r;
772 ::decode(r, bl);
773 if (r.length()) {
774 new_require_min_compat_client = ceph_release_from_name(r.c_str());
775 }
776 }
777 if (struct_v >= 6) {
778 ::decode(new_require_min_compat_client, bl);
779 ::decode(new_require_osd_release, bl);
780 } else {
781 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
782 // only for compat with post-kraken pre-luminous test clusters
783 new_require_osd_release = CEPH_RELEASE_LUMINOUS;
784 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
785 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
786 new_require_osd_release = CEPH_RELEASE_KRAKEN;
787 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
788 new_require_osd_release = CEPH_RELEASE_JEWEL;
789 } else {
790 new_require_osd_release = -1;
791 }
792 }
793 DECODE_FINISH(bl); // osd-only data
794 }
795
796 if (struct_v >= 8) {
797 have_crc = true;
798 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
799 ::decode(inc_crc, bl);
800 tail_offset = bl.get_off();
801 ::decode(full_crc, bl);
802 } else {
803 have_crc = false;
804 full_crc = 0;
805 inc_crc = 0;
806 }
807
808 DECODE_FINISH(bl); // wrapper
809
810 if (have_crc) {
811 // verify crc
812 uint32_t actual = crc_front.crc32c(-1);
813 if (tail_offset < bl.get_off()) {
814 bufferlist tail;
815 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
816 actual = tail.crc32c(actual);
817 }
818 if (inc_crc != actual) {
819 ostringstream ss;
820 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
821 string s = ss.str();
822 throw buffer::malformed_input(s.c_str());
823 }
824 }
825 }
826
827 void OSDMap::Incremental::dump(Formatter *f) const
828 {
829 f->dump_int("epoch", epoch);
830 f->dump_stream("fsid") << fsid;
831 f->dump_stream("modified") << modified;
832 f->dump_int("new_pool_max", new_pool_max);
833 f->dump_int("new_flags", new_flags);
834 f->dump_float("new_full_ratio", new_full_ratio);
835 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
836 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
837 f->dump_int("new_require_min_compat_client", new_require_min_compat_client);
838 f->dump_int("new_require_osd_release", new_require_osd_release);
839
840 if (fullmap.length()) {
841 f->open_object_section("full_map");
842 OSDMap full;
843 bufferlist fbl = fullmap; // kludge around constness.
844 auto p = fbl.begin();
845 full.decode(p);
846 full.dump(f);
847 f->close_section();
848 }
849 if (crush.length()) {
850 f->open_object_section("crush");
851 CrushWrapper c;
852 bufferlist tbl = crush; // kludge around constness.
853 auto p = tbl.begin();
854 c.decode(p);
855 c.dump(f);
856 f->close_section();
857 }
858
859 f->dump_int("new_max_osd", new_max_osd);
860
861 f->open_array_section("new_pools");
862
863 for (const auto &new_pool : new_pools) {
864 f->open_object_section("pool");
865 f->dump_int("pool", new_pool.first);
866 new_pool.second.dump(f);
867 f->close_section();
868 }
869 f->close_section();
870 f->open_array_section("new_pool_names");
871
872 for (const auto &new_pool_name : new_pool_names) {
873 f->open_object_section("pool_name");
874 f->dump_int("pool", new_pool_name.first);
875 f->dump_string("name", new_pool_name.second);
876 f->close_section();
877 }
878 f->close_section();
879 f->open_array_section("old_pools");
880
881 for (const auto &old_pool : old_pools)
882 f->dump_int("pool", old_pool);
883 f->close_section();
884
885 f->open_array_section("new_up_osds");
886
887 for (const auto &upclient : new_up_client) {
888 f->open_object_section("osd");
889 f->dump_int("osd", upclient.first);
890 f->dump_stream("public_addr") << upclient.second;
891 f->dump_stream("cluster_addr") << new_up_cluster.find(upclient.first)->second;
892 f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(upclient.first)->second;
893 map<int32_t, entity_addr_t>::const_iterator q;
894 if ((q = new_hb_front_up.find(upclient.first)) != new_hb_front_up.end())
895 f->dump_stream("heartbeat_front_addr") << q->second;
896 f->close_section();
897 }
898 f->close_section();
899
900 f->open_array_section("new_weight");
901
902 for (const auto &weight : new_weight) {
903 f->open_object_section("osd");
904 f->dump_int("osd", weight.first);
905 f->dump_int("weight", weight.second);
906 f->close_section();
907 }
908 f->close_section();
909
910 f->open_array_section("osd_state_xor");
911 for (const auto &ns : new_state) {
912 f->open_object_section("osd");
913 f->dump_int("osd", ns.first);
914 set<string> st;
915 calc_state_set(new_state.find(ns.first)->second, st);
916 f->open_array_section("state_xor");
917 for (auto &state : st)
918 f->dump_string("state", state);
919 f->close_section();
920 f->close_section();
921 }
922 f->close_section();
923
924 f->open_array_section("new_pg_temp");
925
926 for (const auto &pg_temp : new_pg_temp) {
927 f->open_object_section("pg");
928 f->dump_stream("pgid") << pg_temp.first;
929 f->open_array_section("osds");
930
931 for (const auto &osd : pg_temp.second)
932 f->dump_int("osd", osd);
933 f->close_section();
934 f->close_section();
935 }
936 f->close_section();
937
938 f->open_array_section("primary_temp");
939
940 for (const auto &primary_temp : new_primary_temp) {
941 f->dump_stream("pgid") << primary_temp.first;
942 f->dump_int("osd", primary_temp.second);
943 }
944 f->close_section(); // primary_temp
945
946 f->open_array_section("new_pg_upmap");
947 for (auto& i : new_pg_upmap) {
948 f->open_object_section("mapping");
949 f->dump_stream("pgid") << i.first;
950 f->open_array_section("osds");
951 for (auto osd : i.second) {
952 f->dump_int("osd", osd);
953 }
954 f->close_section();
955 f->close_section();
956 }
957 f->close_section();
958 f->open_array_section("old_pg_upmap");
959 for (auto& i : old_pg_upmap) {
960 f->dump_stream("pgid") << i;
961 }
962 f->close_section();
963
964 f->open_array_section("new_pg_upmap_items");
965 for (auto& i : new_pg_upmap_items) {
966 f->open_object_section("mapping");
967 f->dump_stream("pgid") << i.first;
968 f->open_array_section("mappings");
969 for (auto& p : i.second) {
970 f->open_object_section("mapping");
971 f->dump_int("from", p.first);
972 f->dump_int("to", p.second);
973 f->close_section();
974 }
975 f->close_section();
976 f->close_section();
977 }
978 f->close_section();
979 f->open_array_section("old_pg_upmap_items");
980 for (auto& i : old_pg_upmap_items) {
981 f->dump_stream("pgid") << i;
982 }
983 f->close_section();
984
985 f->open_array_section("new_up_thru");
986
987 for (const auto &up_thru : new_up_thru) {
988 f->open_object_section("osd");
989 f->dump_int("osd", up_thru.first);
990 f->dump_int("up_thru", up_thru.second);
991 f->close_section();
992 }
993 f->close_section();
994
995 f->open_array_section("new_lost");
996
997 for (const auto &lost : new_lost) {
998 f->open_object_section("osd");
999 f->dump_int("osd", lost.first);
1000 f->dump_int("epoch_lost", lost.second);
1001 f->close_section();
1002 }
1003 f->close_section();
1004
1005 f->open_array_section("new_last_clean_interval");
1006
1007 for (const auto &last_clean_interval : new_last_clean_interval) {
1008 f->open_object_section("osd");
1009 f->dump_int("osd", last_clean_interval.first);
1010 f->dump_int("first", last_clean_interval.second.first);
1011 f->dump_int("last", last_clean_interval.second.second);
1012 f->close_section();
1013 }
1014 f->close_section();
1015
1016 f->open_array_section("new_blacklist");
1017 for (const auto &blist : new_blacklist) {
1018 stringstream ss;
1019 ss << blist.first;
1020 f->dump_stream(ss.str().c_str()) << blist.second;
1021 }
1022 f->close_section();
1023 f->open_array_section("old_blacklist");
1024 for (const auto &blist : old_blacklist)
1025 f->dump_stream("addr") << blist;
1026 f->close_section();
1027
1028 f->open_array_section("new_xinfo");
1029 for (const auto &xinfo : new_xinfo) {
1030 f->open_object_section("xinfo");
1031 f->dump_int("osd", xinfo.first);
1032 xinfo.second.dump(f);
1033 f->close_section();
1034 }
1035 f->close_section();
1036
1037 if (cluster_snapshot.size())
1038 f->dump_string("cluster_snapshot", cluster_snapshot);
1039
1040 f->open_array_section("new_uuid");
1041 for (const auto &uuid : new_uuid) {
1042 f->open_object_section("osd");
1043 f->dump_int("osd", uuid.first);
1044 f->dump_stream("uuid") << uuid.second;
1045 f->close_section();
1046 }
1047 f->close_section();
1048
1049 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1050 f->open_array_section("old_erasure_code_profiles");
1051 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1052 f->dump_string("old", erasure_code_profile.c_str());
1053 }
1054 f->close_section();
1055 }
1056
1057 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1058 {
1059 o.push_back(new Incremental);
1060 }
1061
1062 // ----------------------------------
1063 // OSDMap
1064
1065 void OSDMap::set_epoch(epoch_t e)
1066 {
1067 epoch = e;
1068 for (auto &pool : pools)
1069 pool.second.last_change = e;
1070 }
1071
1072 bool OSDMap::is_blacklisted(const entity_addr_t& a) const
1073 {
1074 if (blacklist.empty())
1075 return false;
1076
1077 // this specific instance?
1078 if (blacklist.count(a))
1079 return true;
1080
1081 // is entire ip blacklisted?
1082 if (a.is_ip()) {
1083 entity_addr_t b = a;
1084 b.set_port(0);
1085 b.set_nonce(0);
1086 if (blacklist.count(b)) {
1087 return true;
1088 }
1089 }
1090
1091 return false;
1092 }
1093
1094 void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1095 {
1096 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1097 }
1098
1099 void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1100 {
1101 for (const auto &i : blacklist) {
1102 bl->insert(i.first);
1103 }
1104 }
1105
1106 void OSDMap::set_max_osd(int m)
1107 {
1108 int o = max_osd;
1109 max_osd = m;
1110 osd_state.resize(m);
1111 osd_weight.resize(m);
1112 for (; o<max_osd; o++) {
1113 osd_state[o] = 0;
1114 osd_weight[o] = CEPH_OSD_OUT;
1115 }
1116 osd_info.resize(m);
1117 osd_xinfo.resize(m);
1118 osd_addrs->client_addr.resize(m);
1119 osd_addrs->cluster_addr.resize(m);
1120 osd_addrs->hb_back_addr.resize(m);
1121 osd_addrs->hb_front_addr.resize(m);
1122 osd_uuid->resize(m);
1123 if (osd_primary_affinity)
1124 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1125
1126 calc_num_osds();
1127 }
1128
1129 int OSDMap::calc_num_osds()
1130 {
1131 num_osd = 0;
1132 num_up_osd = 0;
1133 num_in_osd = 0;
1134 for (int i=0; i<max_osd; i++) {
1135 if (osd_state[i] & CEPH_OSD_EXISTS) {
1136 ++num_osd;
1137 if (osd_state[i] & CEPH_OSD_UP) {
1138 ++num_up_osd;
1139 }
1140 if (get_weight(i) != CEPH_OSD_OUT) {
1141 ++num_in_osd;
1142 }
1143 }
1144 }
1145 return num_osd;
1146 }
1147
1148 void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
1149 {
1150 *full = 0;
1151 *backfill = 0;
1152 *nearfull = 0;
1153 for (int i = 0; i < max_osd; ++i) {
1154 if (exists(i) && is_up(i) && is_in(i)) {
1155 if (osd_state[i] & CEPH_OSD_FULL)
1156 ++(*full);
1157 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1158 ++(*backfill);
1159 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1160 ++(*nearfull);
1161 }
1162 }
1163 }
1164
1165 static bool get_osd_utilization(
1166 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1167 int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail)
1168 {
1169 auto p = osd_stat.find(id);
1170 if (p == osd_stat.end())
1171 return false;
1172 *kb = p->second.kb;
1173 *kb_used = p->second.kb_used;
1174 *kb_avail = p->second.kb_avail;
1175 return *kb > 0;
1176 }
1177
1178 void OSDMap::get_full_osd_util(
1179 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1180 map<int, float> *full, map<int, float> *backfill, map<int, float> *nearfull) const
1181 {
1182 full->clear();
1183 backfill->clear();
1184 nearfull->clear();
1185 for (int i = 0; i < max_osd; ++i) {
1186 if (exists(i) && is_up(i) && is_in(i)) {
1187 int64_t kb, kb_used, kb_avail;
1188 if (osd_state[i] & CEPH_OSD_FULL) {
1189 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1190 full->emplace(i, (float)kb_used / (float)kb);
1191 } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) {
1192 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1193 backfill->emplace(i, (float)kb_used / (float)kb);
1194 } else if (osd_state[i] & CEPH_OSD_NEARFULL) {
1195 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1196 nearfull->emplace(i, (float)kb_used / (float)kb);
1197 }
1198 }
1199 }
1200 }
1201
1202 void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1203 set<int> *nearfull) const
1204 {
1205 full->clear();
1206 backfill->clear();
1207 nearfull->clear();
1208 for (int i = 0; i < max_osd; ++i) {
1209 if (exists(i) && is_up(i) && is_in(i)) {
1210 if (osd_state[i] & CEPH_OSD_FULL)
1211 full->emplace(i);
1212 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1213 backfill->emplace(i);
1214 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1215 nearfull->emplace(i);
1216 }
1217 }
1218 }
1219
1220 void OSDMap::get_all_osds(set<int32_t>& ls) const
1221 {
1222 for (int i=0; i<max_osd; i++)
1223 if (exists(i))
1224 ls.insert(i);
1225 }
1226
1227 void OSDMap::get_up_osds(set<int32_t>& ls) const
1228 {
1229 for (int i = 0; i < max_osd; i++) {
1230 if (is_up(i))
1231 ls.insert(i);
1232 }
1233 }
1234
1235 void OSDMap::get_out_osds(set<int32_t>& ls) const
1236 {
1237 for (int i = 0; i < max_osd; i++) {
1238 if (is_out(i))
1239 ls.insert(i);
1240 }
1241 }
1242
1243 void OSDMap::calc_state_set(int state, set<string>& st)
1244 {
1245 unsigned t = state;
1246 for (unsigned s = 1; t; s <<= 1) {
1247 if (t & s) {
1248 t &= ~s;
1249 st.insert(ceph_osd_state_name(s));
1250 }
1251 }
1252 }
1253
1254 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1255 {
1256 float max = 0;
1257 for (const auto &weight : weights) {
1258 if (weight.second > max)
1259 max = weight.second;
1260 }
1261
1262 for (const auto &weight : weights) {
1263 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1264 }
1265 }
1266
1267 int OSDMap::identify_osd(const entity_addr_t& addr) const
1268 {
1269 for (int i=0; i<max_osd; i++)
1270 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr))
1271 return i;
1272 return -1;
1273 }
1274
1275 int OSDMap::identify_osd(const uuid_d& u) const
1276 {
1277 for (int i=0; i<max_osd; i++)
1278 if (exists(i) && get_uuid(i) == u)
1279 return i;
1280 return -1;
1281 }
1282
1283 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1284 {
1285 for (int i=0; i<max_osd; i++)
1286 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr ||
1287 get_hb_back_addr(i) == addr || get_hb_front_addr(i) == addr))
1288 return i;
1289 return -1;
1290 }
1291
1292 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1293 {
1294 for (int i=0; i<max_osd; i++)
1295 if (exists(i) && (get_addr(i).is_same_host(ip) || get_cluster_addr(i).is_same_host(ip)))
1296 return i;
1297 return -1;
1298 }
1299
1300
1301 uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1302 {
1303 uint64_t features = 0; // things we actually have
1304 uint64_t mask = 0; // things we could have
1305
1306 if (crush->has_nondefault_tunables())
1307 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1308 if (crush->has_nondefault_tunables2())
1309 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1310 if (crush->has_nondefault_tunables3())
1311 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1312 if (crush->has_v4_buckets())
1313 features |= CEPH_FEATURE_CRUSH_V4;
1314 if (crush->has_nondefault_tunables5())
1315 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1316 if (crush->has_incompat_choose_args()) {
1317 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1318 }
1319 mask |= CEPH_FEATURES_CRUSH;
1320
1321 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1322 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1323 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1324
1325 for (auto &pool: pools) {
1326 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1327 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1328 }
1329 if (pool.second.is_erasure() &&
1330 entity_type != CEPH_ENTITY_TYPE_CLIENT) { // not for clients
1331 features |= CEPH_FEATURE_OSD_ERASURE_CODES;
1332 }
1333 if (!pool.second.tiers.empty() ||
1334 pool.second.is_tier()) {
1335 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1336 }
1337 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
1338 pool.second.get_type(),
1339 pool.second.get_size());
1340 if (ruleid >= 0) {
1341 if (crush->is_v2_rule(ruleid))
1342 features |= CEPH_FEATURE_CRUSH_V2;
1343 if (crush->is_v3_rule(ruleid))
1344 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1345 if (crush->is_v5_rule(ruleid))
1346 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1347 }
1348 }
1349 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1350 for (auto &erasure_code_profile : erasure_code_profiles) {
1351 auto& profile = erasure_code_profile.second;
1352 const auto& plugin = profile.find("plugin");
1353 if (plugin != profile.end()) {
1354 if (plugin->second == "isa" || plugin->second == "lrc")
1355 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
1356 if (plugin->second == "shec")
1357 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
1358 }
1359 }
1360 }
1361 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1362 if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
1363 mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
1364
1365 if (osd_primary_affinity) {
1366 for (int i = 0; i < max_osd; ++i) {
1367 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1368 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1369 break;
1370 }
1371 }
1372 }
1373 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1374
1375 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1376 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
1377 if (require_osd_release >= CEPH_RELEASE_JEWEL) {
1378 features |= jewel_features;
1379 }
1380 mask |= jewel_features;
1381
1382 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1383 | CEPH_FEATURE_MSG_ADDR2;
1384 if (require_osd_release >= CEPH_RELEASE_KRAKEN) {
1385 features |= kraken_features;
1386 }
1387 mask |= kraken_features;
1388 }
1389
1390 if (pmask)
1391 *pmask = mask;
1392 return features;
1393 }
1394
1395 uint8_t OSDMap::get_min_compat_client() const
1396 {
1397 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1398
1399 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1400 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1401 return CEPH_RELEASE_LUMINOUS; // v12.2.0
1402 }
1403 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1404 return CEPH_RELEASE_JEWEL; // v10.2.0
1405 }
1406 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1407 return CEPH_RELEASE_HAMMER; // v0.94.0
1408 }
1409 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1410 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1411 HAVE_FEATURE(f, OSD_ERASURE_CODES) || // v0.73-498-gbfc86a8
1412 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1413 return CEPH_RELEASE_FIREFLY; // v0.80.0
1414 }
1415 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1416 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1417 return CEPH_RELEASE_DUMPLING; // v0.67.0
1418 }
1419 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1420 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
1421 }
1422 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
1423 }
1424
1425 void OSDMap::_calc_up_osd_features()
1426 {
1427 bool first = true;
1428 cached_up_osd_features = 0;
1429 for (int osd = 0; osd < max_osd; ++osd) {
1430 if (!is_up(osd))
1431 continue;
1432 const osd_xinfo_t &xi = get_xinfo(osd);
1433 if (first) {
1434 cached_up_osd_features = xi.features;
1435 first = false;
1436 } else {
1437 cached_up_osd_features &= xi.features;
1438 }
1439 }
1440 }
1441
1442 uint64_t OSDMap::get_up_osd_features() const
1443 {
1444 return cached_up_osd_features;
1445 }
1446
1447 void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1448 {
1449 if (o->epoch == n->epoch)
1450 return;
1451
1452 int diff = 0;
1453
1454 // do addrs match?
1455 if (o->max_osd != n->max_osd)
1456 diff++;
1457 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1458 if ( n->osd_addrs->client_addr[i] && o->osd_addrs->client_addr[i] &&
1459 *n->osd_addrs->client_addr[i] == *o->osd_addrs->client_addr[i])
1460 n->osd_addrs->client_addr[i] = o->osd_addrs->client_addr[i];
1461 else
1462 diff++;
1463 if ( n->osd_addrs->cluster_addr[i] && o->osd_addrs->cluster_addr[i] &&
1464 *n->osd_addrs->cluster_addr[i] == *o->osd_addrs->cluster_addr[i])
1465 n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
1466 else
1467 diff++;
1468 if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
1469 *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
1470 n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
1471 else
1472 diff++;
1473 if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
1474 *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
1475 n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
1476 else
1477 diff++;
1478 }
1479 if (diff == 0) {
1480 // zoinks, no differences at all!
1481 n->osd_addrs = o->osd_addrs;
1482 }
1483
1484 // does crush match?
1485 bufferlist oc, nc;
1486 ::encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1487 ::encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1488 if (oc.contents_equal(nc)) {
1489 n->crush = o->crush;
1490 }
1491
1492 // does pg_temp match?
1493 if (*o->pg_temp == *n->pg_temp)
1494 n->pg_temp = o->pg_temp;
1495
1496 // does primary_temp match?
1497 if (o->primary_temp->size() == n->primary_temp->size()) {
1498 if (*o->primary_temp == *n->primary_temp)
1499 n->primary_temp = o->primary_temp;
1500 }
1501
1502 // do uuids match?
1503 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1504 *o->osd_uuid == *n->osd_uuid)
1505 n->osd_uuid = o->osd_uuid;
1506 }
1507
1508 void OSDMap::clean_temps(CephContext *cct,
1509 const OSDMap& osdmap, Incremental *pending_inc)
1510 {
1511 ldout(cct, 10) << __func__ << dendl;
1512 OSDMap tmpmap;
1513 tmpmap.deepish_copy_from(osdmap);
1514 tmpmap.apply_incremental(*pending_inc);
1515
1516 for (auto pg : *tmpmap.pg_temp) {
1517 // if pool does not exist, remove any existing pg_temps associated with
1518 // it. we don't care about pg_temps on the pending_inc either; if there
1519 // are new_pg_temp entries on the pending, clear them out just as well.
1520 if (!osdmap.have_pg_pool(pg.first.pool())) {
1521 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1522 << " for nonexistent pool " << pg.first.pool() << dendl;
1523 pending_inc->new_pg_temp[pg.first].clear();
1524 continue;
1525 }
1526 // all osds down?
1527 unsigned num_up = 0;
1528 for (auto o : pg.second) {
1529 if (!tmpmap.is_down(o)) {
1530 ++num_up;
1531 break;
1532 }
1533 }
1534 if (num_up == 0) {
1535 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1536 << " with all down osds" << pg.second << dendl;
1537 pending_inc->new_pg_temp[pg.first].clear();
1538 continue;
1539 }
1540 // redundant pg_temp?
1541 vector<int> raw_up;
1542 int primary;
1543 tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
1544 if (vectors_equal(raw_up, pg.second)) {
1545 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1546 << pg.second << " that matches raw_up mapping" << dendl;
1547 if (osdmap.pg_temp->count(pg.first))
1548 pending_inc->new_pg_temp[pg.first].clear();
1549 else
1550 pending_inc->new_pg_temp.erase(pg.first);
1551 }
1552 }
1553
1554 for (auto &pg : *tmpmap.primary_temp) {
1555 // primary down?
1556 if (tmpmap.is_down(pg.second)) {
1557 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1558 << " to down " << pg.second << dendl;
1559 pending_inc->new_primary_temp[pg.first] = -1;
1560 continue;
1561 }
1562 // redundant primary_temp?
1563 vector<int> real_up, templess_up;
1564 int real_primary, templess_primary;
1565 pg_t pgid = pg.first;
1566 tmpmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1567 tmpmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
1568 if (real_primary == templess_primary){
1569 ldout(cct, 10) << __func__ << " removing primary_temp "
1570 << pgid << " -> " << real_primary
1571 << " (unnecessary/redundant)" << dendl;
1572 if (osdmap.primary_temp->count(pgid))
1573 pending_inc->new_primary_temp[pgid] = -1;
1574 else
1575 pending_inc->new_primary_temp.erase(pgid);
1576 }
1577 }
1578 }
1579
1580 int OSDMap::apply_incremental(const Incremental &inc)
1581 {
1582 new_blacklist_entries = false;
1583 if (inc.epoch == 1)
1584 fsid = inc.fsid;
1585 else if (inc.fsid != fsid)
1586 return -EINVAL;
1587
1588 assert(inc.epoch == epoch+1);
1589
1590 epoch++;
1591 modified = inc.modified;
1592
1593 // full map?
1594 if (inc.fullmap.length()) {
1595 bufferlist bl(inc.fullmap);
1596 decode(bl);
1597 return 0;
1598 }
1599
1600 // nope, incremental.
1601 if (inc.new_flags >= 0) {
1602 flags = inc.new_flags;
1603 // the below is just to cover a newly-upgraded luminous mon
1604 // cluster that has to set require_jewel_osds or
1605 // require_kraken_osds before the osds can be upgraded to
1606 // luminous.
1607 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
1608 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
1609 require_osd_release = CEPH_RELEASE_KRAKEN;
1610 }
1611 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
1612 if (require_osd_release < CEPH_RELEASE_JEWEL) {
1613 require_osd_release = CEPH_RELEASE_JEWEL;
1614 }
1615 }
1616 }
1617
1618 if (inc.new_max_osd >= 0)
1619 set_max_osd(inc.new_max_osd);
1620
1621 if (inc.new_pool_max != -1)
1622 pool_max = inc.new_pool_max;
1623
1624 for (const auto &pool : inc.new_pools) {
1625 pools[pool.first] = pool.second;
1626 pools[pool.first].last_change = epoch;
1627 }
1628
1629 for (const auto &pname : inc.new_pool_names) {
1630 auto pool_name_entry = pool_name.find(pname.first);
1631 if (pool_name_entry != pool_name.end()) {
1632 name_pool.erase(pool_name_entry->second);
1633 pool_name_entry->second = pname.second;
1634 } else {
1635 pool_name[pname.first] = pname.second;
1636 }
1637 name_pool[pname.second] = pname.first;
1638 }
1639
1640 for (const auto &pool : inc.old_pools) {
1641 pools.erase(pool);
1642 name_pool.erase(pool_name[pool]);
1643 pool_name.erase(pool);
1644 }
1645
1646 for (const auto &weight : inc.new_weight) {
1647 set_weight(weight.first, weight.second);
1648
1649 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1650 // xinfo old_weight.
1651 if (weight.second) {
1652 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
1653 osd_xinfo[weight.first].old_weight = 0;
1654 }
1655 }
1656
1657 for (const auto &primary_affinity : inc.new_primary_affinity) {
1658 set_primary_affinity(primary_affinity.first, primary_affinity.second);
1659 }
1660
1661 // erasure_code_profiles
1662 for (const auto &profile : inc.old_erasure_code_profiles)
1663 erasure_code_profiles.erase(profile);
1664
1665 for (const auto &profile : inc.new_erasure_code_profiles) {
1666 set_erasure_code_profile(profile.first, profile.second);
1667 }
1668
1669 // up/down
1670 for (const auto &state : inc.new_state) {
1671 const auto osd = state.first;
1672 int s = state.second ? state.second : CEPH_OSD_UP;
1673 if ((osd_state[osd] & CEPH_OSD_UP) &&
1674 (s & CEPH_OSD_UP)) {
1675 osd_info[osd].down_at = epoch;
1676 osd_xinfo[osd].down_stamp = modified;
1677 }
1678 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
1679 (s & CEPH_OSD_EXISTS)) {
1680 // osd is destroyed; clear out anything interesting.
1681 (*osd_uuid)[osd] = uuid_d();
1682 osd_info[osd] = osd_info_t();
1683 osd_xinfo[osd] = osd_xinfo_t();
1684 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1685 osd_addrs->client_addr[osd].reset(new entity_addr_t());
1686 osd_addrs->cluster_addr[osd].reset(new entity_addr_t());
1687 osd_addrs->hb_front_addr[osd].reset(new entity_addr_t());
1688 osd_addrs->hb_back_addr[osd].reset(new entity_addr_t());
1689 osd_state[osd] = 0;
1690 } else {
1691 osd_state[osd] ^= s;
1692 }
1693 }
1694
1695 for (const auto &client : inc.new_up_client) {
1696 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1697 osd_addrs->client_addr[client.first].reset(new entity_addr_t(client.second));
1698 if (inc.new_hb_back_up.empty())
1699 osd_addrs->hb_back_addr[client.first].reset(new entity_addr_t(client.second)); //this is a backward-compatibility hack
1700 else
1701 osd_addrs->hb_back_addr[client.first].reset(
1702 new entity_addr_t(inc.new_hb_back_up.find(client.first)->second));
1703 const auto j = inc.new_hb_front_up.find(client.first);
1704 if (j != inc.new_hb_front_up.end())
1705 osd_addrs->hb_front_addr[client.first].reset(new entity_addr_t(j->second));
1706 else
1707 osd_addrs->hb_front_addr[client.first].reset();
1708
1709 osd_info[client.first].up_from = epoch;
1710 }
1711
1712 for (const auto &cluster : inc.new_up_cluster)
1713 osd_addrs->cluster_addr[cluster.first].reset(new entity_addr_t(cluster.second));
1714
1715 // info
1716 for (const auto &thru : inc.new_up_thru)
1717 osd_info[thru.first].up_thru = thru.second;
1718
1719 for (const auto &interval : inc.new_last_clean_interval) {
1720 osd_info[interval.first].last_clean_begin = interval.second.first;
1721 osd_info[interval.first].last_clean_end = interval.second.second;
1722 }
1723
1724 for (const auto &lost : inc.new_lost)
1725 osd_info[lost.first].lost_at = lost.second;
1726
1727 // xinfo
1728 for (const auto &xinfo : inc.new_xinfo)
1729 osd_xinfo[xinfo.first] = xinfo.second;
1730
1731 // uuid
1732 for (const auto &uuid : inc.new_uuid)
1733 (*osd_uuid)[uuid.first] = uuid.second;
1734
1735 // pg rebuild
1736 for (const auto &pg : inc.new_pg_temp) {
1737 if (pg.second.empty())
1738 pg_temp->erase(pg.first);
1739 else
1740 pg_temp->set(pg.first, pg.second);
1741 }
1742 if (!inc.new_pg_temp.empty()) {
1743 // make sure pg_temp is efficiently stored
1744 pg_temp->rebuild();
1745 }
1746
1747 for (const auto &pg : inc.new_primary_temp) {
1748 if (pg.second == -1)
1749 primary_temp->erase(pg.first);
1750 else
1751 (*primary_temp)[pg.first] = pg.second;
1752 }
1753
1754 for (auto& p : inc.new_pg_upmap) {
1755 pg_upmap[p.first] = p.second;
1756 }
1757 for (auto& pg : inc.old_pg_upmap) {
1758 pg_upmap.erase(pg);
1759 }
1760 for (auto& p : inc.new_pg_upmap_items) {
1761 pg_upmap_items[p.first] = p.second;
1762 }
1763 for (auto& pg : inc.old_pg_upmap_items) {
1764 pg_upmap_items.erase(pg);
1765 }
1766
1767 // blacklist
1768 if (!inc.new_blacklist.empty()) {
1769 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
1770 new_blacklist_entries = true;
1771 }
1772 for (const auto &addr : inc.old_blacklist)
1773 blacklist.erase(addr);
1774
1775 // cluster snapshot?
1776 if (inc.cluster_snapshot.length()) {
1777 cluster_snapshot = inc.cluster_snapshot;
1778 cluster_snapshot_epoch = inc.epoch;
1779 } else {
1780 cluster_snapshot.clear();
1781 cluster_snapshot_epoch = 0;
1782 }
1783
1784 if (inc.new_nearfull_ratio >= 0) {
1785 nearfull_ratio = inc.new_nearfull_ratio;
1786 }
1787 if (inc.new_backfillfull_ratio >= 0) {
1788 backfillfull_ratio = inc.new_backfillfull_ratio;
1789 }
1790 if (inc.new_full_ratio >= 0) {
1791 full_ratio = inc.new_full_ratio;
1792 }
1793 if (inc.new_require_min_compat_client > 0) {
1794 require_min_compat_client = inc.new_require_min_compat_client;
1795 }
1796 if (inc.new_require_osd_release >= 0) {
1797 require_osd_release = inc.new_require_osd_release;
1798 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1799 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
1800 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
1801 }
1802 }
1803
1804 // do new crush map last (after up/down stuff)
1805 if (inc.crush.length()) {
1806 bufferlist bl(inc.crush);
1807 auto blp = bl.begin();
1808 crush.reset(new CrushWrapper);
1809 crush->decode(blp);
1810 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1811 // only increment if this is a luminous-encoded osdmap, lest
1812 // the mon's crush_version diverge from what the osds or others
1813 // are decoding and applying on their end. if we won't encode
1814 // it in the canonical version, don't change it.
1815 ++crush_version;
1816 }
1817 }
1818
1819 calc_num_osds();
1820 _calc_up_osd_features();
1821 return 0;
1822 }
1823
1824 // mapping
1825 int OSDMap::map_to_pg(
1826 int64_t poolid,
1827 const string& name,
1828 const string& key,
1829 const string& nspace,
1830 pg_t *pg) const
1831 {
1832 // calculate ps (placement seed)
1833 const pg_pool_t *pool = get_pg_pool(poolid);
1834 if (!pool)
1835 return -ENOENT;
1836 ps_t ps;
1837 if (!key.empty())
1838 ps = pool->hash_key(key, nspace);
1839 else
1840 ps = pool->hash_key(name, nspace);
1841 *pg = pg_t(ps, poolid);
1842 return 0;
1843 }
1844
1845 int OSDMap::object_locator_to_pg(
1846 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
1847 {
1848 if (loc.hash >= 0) {
1849 if (!get_pg_pool(loc.get_pool())) {
1850 return -ENOENT;
1851 }
1852 pg = pg_t(loc.hash, loc.get_pool());
1853 return 0;
1854 }
1855 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
1856 }
1857
1858 ceph_object_layout OSDMap::make_object_layout(
1859 object_t oid, int pg_pool, string nspace) const
1860 {
1861 object_locator_t loc(pg_pool, nspace);
1862
1863 ceph_object_layout ol;
1864 pg_t pgid = object_locator_to_pg(oid, loc);
1865 ol.ol_pgid = pgid.get_old_pg().v;
1866 ol.ol_stripe_unit = 0;
1867 return ol;
1868 }
1869
1870 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
1871 vector<int>& osds) const
1872 {
1873 if (pool.can_shift_osds()) {
1874 unsigned removed = 0;
1875 for (unsigned i = 0; i < osds.size(); i++) {
1876 if (!exists(osds[i])) {
1877 removed++;
1878 continue;
1879 }
1880 if (removed) {
1881 osds[i - removed] = osds[i];
1882 }
1883 }
1884 if (removed)
1885 osds.resize(osds.size() - removed);
1886 } else {
1887 for (auto& osd : osds) {
1888 if (!exists(osd))
1889 osd = CRUSH_ITEM_NONE;
1890 }
1891 }
1892 }
1893
1894 void OSDMap::_pg_to_raw_osds(
1895 const pg_pool_t& pool, pg_t pg,
1896 vector<int> *osds,
1897 ps_t *ppps) const
1898 {
1899 // map to osds[]
1900 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
1901 unsigned size = pool.get_size();
1902
1903 // what crush rule?
1904 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
1905 if (ruleno >= 0)
1906 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
1907
1908 _remove_nonexistent_osds(pool, *osds);
1909
1910 if (ppps)
1911 *ppps = pps;
1912 }
1913
1914 int OSDMap::_pick_primary(const vector<int>& osds) const
1915 {
1916 for (auto osd : osds) {
1917 if (osd != CRUSH_ITEM_NONE) {
1918 return osd;
1919 }
1920 }
1921 return -1;
1922 }
1923
1924 void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
1925 {
1926 pg_t pg = pi.raw_pg_to_pg(raw_pg);
1927 auto p = pg_upmap.find(pg);
1928 if (p != pg_upmap.end()) {
1929 // make sure targets aren't marked out
1930 for (auto osd : p->second) {
1931 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd_weight[osd] == 0) {
1932 // reject/ignore the explicit mapping
1933 return;
1934 }
1935 }
1936 *raw = vector<int>(p->second.begin(), p->second.end());
1937 // continue to check and apply pg_upmap_items if any
1938 }
1939
1940 auto q = pg_upmap_items.find(pg);
1941 if (q != pg_upmap_items.end()) {
1942 // NOTE: this approach does not allow a bidirectional swap,
1943 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
1944 for (auto& r : q->second) {
1945 // make sure the replacement value doesn't already appear
1946 bool exists = false;
1947 ssize_t pos = -1;
1948 for (unsigned i = 0; i < raw->size(); ++i) {
1949 int osd = (*raw)[i];
1950 if (osd == r.second) {
1951 exists = true;
1952 break;
1953 }
1954 // ignore mapping if target is marked out (or invalid osd id)
1955 if (osd == r.first &&
1956 pos < 0 &&
1957 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
1958 osd_weight[r.second] == 0)) {
1959 pos = i;
1960 }
1961 }
1962 if (!exists && pos >= 0) {
1963 (*raw)[pos] = r.second;
1964 }
1965 }
1966 }
1967 }
1968
1969 // pg -> (up osd list)
1970 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
1971 vector<int> *up) const
1972 {
1973 if (pool.can_shift_osds()) {
1974 // shift left
1975 up->clear();
1976 up->reserve(raw.size());
1977 for (unsigned i=0; i<raw.size(); i++) {
1978 if (!exists(raw[i]) || is_down(raw[i]))
1979 continue;
1980 up->push_back(raw[i]);
1981 }
1982 } else {
1983 // set down/dne devices to NONE
1984 up->resize(raw.size());
1985 for (int i = raw.size() - 1; i >= 0; --i) {
1986 if (!exists(raw[i]) || is_down(raw[i])) {
1987 (*up)[i] = CRUSH_ITEM_NONE;
1988 } else {
1989 (*up)[i] = raw[i];
1990 }
1991 }
1992 }
1993 }
1994
1995 void OSDMap::_apply_primary_affinity(ps_t seed,
1996 const pg_pool_t& pool,
1997 vector<int> *osds,
1998 int *primary) const
1999 {
2000 // do we have any non-default primary_affinity values for these osds?
2001 if (!osd_primary_affinity)
2002 return;
2003
2004 bool any = false;
2005 for (const auto osd : *osds) {
2006 if (osd != CRUSH_ITEM_NONE &&
2007 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2008 any = true;
2009 break;
2010 }
2011 }
2012 if (!any)
2013 return;
2014
2015 // pick the primary. feed both the seed (for the pg) and the osd
2016 // into the hash/rng so that a proportional fraction of an osd's pgs
2017 // get rejected as primary.
2018 int pos = -1;
2019 for (unsigned i = 0; i < osds->size(); ++i) {
2020 int o = (*osds)[i];
2021 if (o == CRUSH_ITEM_NONE)
2022 continue;
2023 unsigned a = (*osd_primary_affinity)[o];
2024 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2025 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2026 seed, o) >> 16) >= a) {
2027 // we chose not to use this primary. note it anyway as a
2028 // fallback in case we don't pick anyone else, but keep looking.
2029 if (pos < 0)
2030 pos = i;
2031 } else {
2032 pos = i;
2033 break;
2034 }
2035 }
2036 if (pos < 0)
2037 return;
2038
2039 *primary = (*osds)[pos];
2040
2041 if (pool.can_shift_osds() && pos > 0) {
2042 // move the new primary to the front.
2043 for (int i = pos; i > 0; --i) {
2044 (*osds)[i] = (*osds)[i-1];
2045 }
2046 (*osds)[0] = *primary;
2047 }
2048 }
2049
2050 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2051 vector<int> *temp_pg, int *temp_primary) const
2052 {
2053 pg = pool.raw_pg_to_pg(pg);
2054 const auto p = pg_temp->find(pg);
2055 temp_pg->clear();
2056 if (p != pg_temp->end()) {
2057 for (unsigned i=0; i<p->second.size(); i++) {
2058 if (!exists(p->second[i]) || is_down(p->second[i])) {
2059 if (pool.can_shift_osds()) {
2060 continue;
2061 } else {
2062 temp_pg->push_back(CRUSH_ITEM_NONE);
2063 }
2064 } else {
2065 temp_pg->push_back(p->second[i]);
2066 }
2067 }
2068 }
2069 const auto &pp = primary_temp->find(pg);
2070 *temp_primary = -1;
2071 if (pp != primary_temp->end()) {
2072 *temp_primary = pp->second;
2073 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2074 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2075 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2076 *temp_primary = (*temp_pg)[i];
2077 break;
2078 }
2079 }
2080 }
2081 }
2082
2083 void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
2084 {
2085 *primary = -1;
2086 raw->clear();
2087 const pg_pool_t *pool = get_pg_pool(pg.pool());
2088 if (!pool)
2089 return;
2090 _pg_to_raw_osds(*pool, pg, raw, NULL);
2091 if (primary)
2092 *primary = _pick_primary(*raw);
2093 }
2094
2095 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2096 {
2097 const pg_pool_t *pool = get_pg_pool(pg.pool());
2098 if (!pool) {
2099 if (primary)
2100 *primary = -1;
2101 if (up)
2102 up->clear();
2103 return;
2104 }
2105 vector<int> raw;
2106 ps_t pps;
2107 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2108 _apply_upmap(*pool, pg, &raw);
2109 _raw_to_up_osds(*pool, raw, up);
2110 *primary = _pick_primary(raw);
2111 _apply_primary_affinity(pps, *pool, up, primary);
2112 }
2113
2114 void OSDMap::_pg_to_up_acting_osds(
2115 const pg_t& pg, vector<int> *up, int *up_primary,
2116 vector<int> *acting, int *acting_primary,
2117 bool raw_pg_to_pg) const
2118 {
2119 const pg_pool_t *pool = get_pg_pool(pg.pool());
2120 if (!pool ||
2121 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2122 if (up)
2123 up->clear();
2124 if (up_primary)
2125 *up_primary = -1;
2126 if (acting)
2127 acting->clear();
2128 if (acting_primary)
2129 *acting_primary = -1;
2130 return;
2131 }
2132 vector<int> raw;
2133 vector<int> _up;
2134 vector<int> _acting;
2135 int _up_primary;
2136 int _acting_primary;
2137 ps_t pps;
2138 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2139 if (_acting.empty() || up || up_primary) {
2140 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2141 _apply_upmap(*pool, pg, &raw);
2142 _raw_to_up_osds(*pool, raw, &_up);
2143 _up_primary = _pick_primary(_up);
2144 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2145 if (_acting.empty()) {
2146 _acting = _up;
2147 if (_acting_primary == -1) {
2148 _acting_primary = _up_primary;
2149 }
2150 }
2151
2152 if (up)
2153 up->swap(_up);
2154 if (up_primary)
2155 *up_primary = _up_primary;
2156 }
2157
2158 if (acting)
2159 acting->swap(_acting);
2160 if (acting_primary)
2161 *acting_primary = _acting_primary;
2162 }
2163
2164 int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
2165 {
2166 if (!nrep)
2167 nrep = acting.size();
2168 for (int i=0; i<nrep; i++)
2169 if (acting[i] == osd)
2170 return i;
2171 return -1;
2172 }
2173
2174 int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2175 {
2176 return calc_pg_rank(osd, acting, nrep);
2177 }
2178
2179 bool OSDMap::primary_changed(
2180 int oldprimary,
2181 const vector<int> &oldacting,
2182 int newprimary,
2183 const vector<int> &newacting)
2184 {
2185 if (oldacting.empty() && newacting.empty())
2186 return false; // both still empty
2187 if (oldacting.empty() ^ newacting.empty())
2188 return true; // was empty, now not, or vice versa
2189 if (oldprimary != newprimary)
2190 return true; // primary changed
2191 if (calc_pg_rank(oldprimary, oldacting) !=
2192 calc_pg_rank(newprimary, newacting))
2193 return true;
2194 return false; // same primary (tho replicas may have changed)
2195 }
2196
2197
2198 // serialize, unserialize
2199 void OSDMap::encode_client_old(bufferlist& bl) const
2200 {
2201 __u16 v = 5;
2202 ::encode(v, bl);
2203
2204 // base
2205 ::encode(fsid, bl);
2206 ::encode(epoch, bl);
2207 ::encode(created, bl);
2208 ::encode(modified, bl);
2209
2210 // for ::encode(pools, bl);
2211 __u32 n = pools.size();
2212 ::encode(n, bl);
2213
2214 for (const auto &pool : pools) {
2215 n = pool.first;
2216 ::encode(n, bl);
2217 ::encode(pool.second, bl, 0);
2218 }
2219 // for ::encode(pool_name, bl);
2220 n = pool_name.size();
2221 ::encode(n, bl);
2222 for (const auto &pname : pool_name) {
2223 n = pname.first;
2224 ::encode(n, bl);
2225 ::encode(pname.second, bl);
2226 }
2227 // for ::encode(pool_max, bl);
2228 n = pool_max;
2229 ::encode(n, bl);
2230
2231 ::encode(flags, bl);
2232
2233 ::encode(max_osd, bl);
2234 {
2235 uint32_t n = osd_state.size();
2236 ::encode(n, bl);
2237 for (auto s : osd_state) {
2238 ::encode((uint8_t)s, bl);
2239 }
2240 }
2241 ::encode(osd_weight, bl);
2242 ::encode(osd_addrs->client_addr, bl, 0);
2243
2244 // for ::encode(pg_temp, bl);
2245 n = pg_temp->size();
2246 ::encode(n, bl);
2247 for (const auto pg : *pg_temp) {
2248 old_pg_t opg = pg.first.get_old_pg();
2249 ::encode(opg, bl);
2250 ::encode(pg.second, bl);
2251 }
2252
2253 // crush
2254 bufferlist cbl;
2255 crush->encode(cbl, 0 /* legacy (no) features */);
2256 ::encode(cbl, bl);
2257 }
2258
2259 void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2260 {
2261 if ((features & CEPH_FEATURE_PGID64) == 0) {
2262 encode_client_old(bl);
2263 return;
2264 }
2265
2266 __u16 v = 6;
2267 ::encode(v, bl);
2268
2269 // base
2270 ::encode(fsid, bl);
2271 ::encode(epoch, bl);
2272 ::encode(created, bl);
2273 ::encode(modified, bl);
2274
2275 ::encode(pools, bl, features);
2276 ::encode(pool_name, bl);
2277 ::encode(pool_max, bl);
2278
2279 ::encode(flags, bl);
2280
2281 ::encode(max_osd, bl);
2282 {
2283 uint32_t n = osd_state.size();
2284 ::encode(n, bl);
2285 for (auto s : osd_state) {
2286 ::encode((uint8_t)s, bl);
2287 }
2288 }
2289 ::encode(osd_weight, bl);
2290 ::encode(osd_addrs->client_addr, bl, features);
2291
2292 ::encode(*pg_temp, bl);
2293
2294 // crush
2295 bufferlist cbl;
2296 crush->encode(cbl, 0 /* legacy (no) features */);
2297 ::encode(cbl, bl);
2298
2299 // extended
2300 __u16 ev = 10;
2301 ::encode(ev, bl);
2302 ::encode(osd_addrs->hb_back_addr, bl, features);
2303 ::encode(osd_info, bl);
2304 ::encode(blacklist, bl, features);
2305 ::encode(osd_addrs->cluster_addr, bl, features);
2306 ::encode(cluster_snapshot_epoch, bl);
2307 ::encode(cluster_snapshot, bl);
2308 ::encode(*osd_uuid, bl);
2309 ::encode(osd_xinfo, bl);
2310 ::encode(osd_addrs->hb_front_addr, bl, features);
2311 }
2312
2313 void OSDMap::encode(bufferlist& bl, uint64_t features) const
2314 {
2315 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2316 encode_classic(bl, features);
2317 return;
2318 }
2319
2320 // only a select set of callers should *ever* be encoding new
2321 // OSDMaps. others should be passing around the canonical encoded
2322 // buffers from on high. select out those callers by passing in an
2323 // "impossible" feature bit.
2324 assert(features & CEPH_FEATURE_RESERVED);
2325 features &= ~CEPH_FEATURE_RESERVED;
2326
2327 size_t start_offset = bl.length();
2328 size_t tail_offset;
2329 buffer::list::iterator crc_it;
2330
2331 // meta-encoding: how we include client-used and osd-specific data
2332 ENCODE_START(8, 7, bl);
2333
2334 {
2335 uint8_t v = 6;
2336 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2337 v = 3;
2338 }
2339 ENCODE_START(v, 1, bl); // client-usable data
2340 // base
2341 ::encode(fsid, bl);
2342 ::encode(epoch, bl);
2343 ::encode(created, bl);
2344 ::encode(modified, bl);
2345
2346 ::encode(pools, bl, features);
2347 ::encode(pool_name, bl);
2348 ::encode(pool_max, bl);
2349
2350 if (v < 4) {
2351 decltype(flags) f = flags;
2352 if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
2353 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
2354 else if (require_osd_release == CEPH_RELEASE_KRAKEN)
2355 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
2356 else if (require_osd_release == CEPH_RELEASE_JEWEL)
2357 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
2358 ::encode(f, bl);
2359 } else {
2360 ::encode(flags, bl);
2361 }
2362
2363 ::encode(max_osd, bl);
2364 if (v >= 5) {
2365 ::encode(osd_state, bl);
2366 } else {
2367 uint32_t n = osd_state.size();
2368 ::encode(n, bl);
2369 for (auto s : osd_state) {
2370 ::encode((uint8_t)s, bl);
2371 }
2372 }
2373 ::encode(osd_weight, bl);
2374 ::encode(osd_addrs->client_addr, bl, features);
2375
2376 ::encode(*pg_temp, bl);
2377 ::encode(*primary_temp, bl);
2378 if (osd_primary_affinity) {
2379 ::encode(*osd_primary_affinity, bl);
2380 } else {
2381 vector<__u32> v;
2382 ::encode(v, bl);
2383 }
2384
2385 // crush
2386 bufferlist cbl;
2387 crush->encode(cbl, features);
2388 ::encode(cbl, bl);
2389 ::encode(erasure_code_profiles, bl);
2390
2391 if (v >= 4) {
2392 ::encode(pg_upmap, bl);
2393 ::encode(pg_upmap_items, bl);
2394 } else {
2395 assert(pg_upmap.empty());
2396 assert(pg_upmap_items.empty());
2397 }
2398 if (v >= 6) {
2399 ::encode(crush_version, bl);
2400 }
2401 ENCODE_FINISH(bl); // client-usable data
2402 }
2403
2404 {
2405 uint8_t target_v = 5;
2406 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2407 target_v = 1;
2408 }
2409 ENCODE_START(target_v, 1, bl); // extended, osd-only data
2410 ::encode(osd_addrs->hb_back_addr, bl, features);
2411 ::encode(osd_info, bl);
2412 {
2413 // put this in a sorted, ordered map<> so that we encode in a
2414 // deterministic order.
2415 map<entity_addr_t,utime_t> blacklist_map;
2416 for (const auto &addr : blacklist)
2417 blacklist_map.insert(make_pair(addr.first, addr.second));
2418 ::encode(blacklist_map, bl, features);
2419 }
2420 ::encode(osd_addrs->cluster_addr, bl, features);
2421 ::encode(cluster_snapshot_epoch, bl);
2422 ::encode(cluster_snapshot, bl);
2423 ::encode(*osd_uuid, bl);
2424 ::encode(osd_xinfo, bl);
2425 ::encode(osd_addrs->hb_front_addr, bl, features);
2426 if (target_v >= 2) {
2427 ::encode(nearfull_ratio, bl);
2428 ::encode(full_ratio, bl);
2429 ::encode(backfillfull_ratio, bl);
2430 }
2431 // 4 was string-based new_require_min_compat_client
2432 if (target_v >= 5) {
2433 ::encode(require_min_compat_client, bl);
2434 ::encode(require_osd_release, bl);
2435 }
2436 ENCODE_FINISH(bl); // osd-only data
2437 }
2438
2439 ::encode((uint32_t)0, bl); // dummy crc
2440 crc_it = bl.end();
2441 crc_it.advance(-4);
2442 tail_offset = bl.length();
2443
2444 ENCODE_FINISH(bl); // meta-encoding wrapper
2445
2446 // fill in crc
2447 bufferlist front;
2448 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
2449 crc = front.crc32c(-1);
2450 if (tail_offset < bl.length()) {
2451 bufferlist tail;
2452 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2453 crc = tail.crc32c(crc);
2454 }
2455 ceph_le32 crc_le;
2456 crc_le = crc;
2457 crc_it.copy_in(4, (char*)&crc_le);
2458 crc_defined = true;
2459 }
2460
2461 void OSDMap::decode(bufferlist& bl)
2462 {
2463 auto p = bl.begin();
2464 decode(p);
2465 }
2466
2467 void OSDMap::decode_classic(bufferlist::iterator& p)
2468 {
2469 __u32 n, t;
2470 __u16 v;
2471 ::decode(v, p);
2472
2473 // base
2474 ::decode(fsid, p);
2475 ::decode(epoch, p);
2476 ::decode(created, p);
2477 ::decode(modified, p);
2478
2479 if (v < 6) {
2480 if (v < 4) {
2481 int32_t max_pools = 0;
2482 ::decode(max_pools, p);
2483 pool_max = max_pools;
2484 }
2485 pools.clear();
2486 ::decode(n, p);
2487 while (n--) {
2488 ::decode(t, p);
2489 ::decode(pools[t], p);
2490 }
2491 if (v == 4) {
2492 ::decode(n, p);
2493 pool_max = n;
2494 } else if (v == 5) {
2495 pool_name.clear();
2496 ::decode(n, p);
2497 while (n--) {
2498 ::decode(t, p);
2499 ::decode(pool_name[t], p);
2500 }
2501 ::decode(n, p);
2502 pool_max = n;
2503 }
2504 } else {
2505 ::decode(pools, p);
2506 ::decode(pool_name, p);
2507 ::decode(pool_max, p);
2508 }
2509 // kludge around some old bug that zeroed out pool_max (#2307)
2510 if (pools.size() && pool_max < pools.rbegin()->first) {
2511 pool_max = pools.rbegin()->first;
2512 }
2513
2514 ::decode(flags, p);
2515
2516 ::decode(max_osd, p);
2517 {
2518 vector<uint8_t> os;
2519 ::decode(os, p);
2520 osd_state.resize(os.size());
2521 for (unsigned i = 0; i < os.size(); ++i) {
2522 osd_state[i] = os[i];
2523 }
2524 }
2525 ::decode(osd_weight, p);
2526 ::decode(osd_addrs->client_addr, p);
2527 if (v <= 5) {
2528 pg_temp->clear();
2529 ::decode(n, p);
2530 while (n--) {
2531 old_pg_t opg;
2532 ::decode_raw(opg, p);
2533 mempool::osdmap::vector<int32_t> v;
2534 ::decode(v, p);
2535 pg_temp->set(pg_t(opg), v);
2536 }
2537 } else {
2538 ::decode(*pg_temp, p);
2539 }
2540
2541 // crush
2542 bufferlist cbl;
2543 ::decode(cbl, p);
2544 auto cblp = cbl.begin();
2545 crush->decode(cblp);
2546
2547 // extended
2548 __u16 ev = 0;
2549 if (v >= 5)
2550 ::decode(ev, p);
2551 ::decode(osd_addrs->hb_back_addr, p);
2552 ::decode(osd_info, p);
2553 if (v < 5)
2554 ::decode(pool_name, p);
2555
2556 ::decode(blacklist, p);
2557 if (ev >= 6)
2558 ::decode(osd_addrs->cluster_addr, p);
2559 else
2560 osd_addrs->cluster_addr.resize(osd_addrs->client_addr.size());
2561
2562 if (ev >= 7) {
2563 ::decode(cluster_snapshot_epoch, p);
2564 ::decode(cluster_snapshot, p);
2565 }
2566
2567 if (ev >= 8) {
2568 ::decode(*osd_uuid, p);
2569 } else {
2570 osd_uuid->resize(max_osd);
2571 }
2572 if (ev >= 9)
2573 ::decode(osd_xinfo, p);
2574 else
2575 osd_xinfo.resize(max_osd);
2576
2577 if (ev >= 10)
2578 ::decode(osd_addrs->hb_front_addr, p);
2579 else
2580 osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
2581
2582 osd_primary_affinity.reset();
2583
2584 post_decode();
2585 }
2586
2587 void OSDMap::decode(bufferlist::iterator& bl)
2588 {
2589 /**
2590 * Older encodings of the OSDMap had a single struct_v which
2591 * covered the whole encoding, and was prior to our modern
2592 * stuff which includes a compatv and a size. So if we see
2593 * a struct_v < 7, we must rewind to the beginning and use our
2594 * classic decoder.
2595 */
2596 size_t start_offset = bl.get_off();
2597 size_t tail_offset = 0;
2598 bufferlist crc_front, crc_tail;
2599
2600 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
2601 if (struct_v < 7) {
2602 int struct_v_size = sizeof(struct_v);
2603 bl.advance(-struct_v_size);
2604 decode_classic(bl);
2605 return;
2606 }
2607 /**
2608 * Since we made it past that hurdle, we can use our normal paths.
2609 */
2610 {
2611 DECODE_START(6, bl); // client-usable data
2612 // base
2613 ::decode(fsid, bl);
2614 ::decode(epoch, bl);
2615 ::decode(created, bl);
2616 ::decode(modified, bl);
2617
2618 ::decode(pools, bl);
2619 ::decode(pool_name, bl);
2620 ::decode(pool_max, bl);
2621
2622 ::decode(flags, bl);
2623
2624 ::decode(max_osd, bl);
2625 if (struct_v >= 5) {
2626 ::decode(osd_state, bl);
2627 } else {
2628 vector<uint8_t> os;
2629 ::decode(os, bl);
2630 osd_state.resize(os.size());
2631 for (unsigned i = 0; i < os.size(); ++i) {
2632 osd_state[i] = os[i];
2633 }
2634 }
2635 ::decode(osd_weight, bl);
2636 ::decode(osd_addrs->client_addr, bl);
2637
2638 ::decode(*pg_temp, bl);
2639 ::decode(*primary_temp, bl);
2640 if (struct_v >= 2) {
2641 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
2642 ::decode(*osd_primary_affinity, bl);
2643 if (osd_primary_affinity->empty())
2644 osd_primary_affinity.reset();
2645 } else {
2646 osd_primary_affinity.reset();
2647 }
2648
2649 // crush
2650 bufferlist cbl;
2651 ::decode(cbl, bl);
2652 auto cblp = cbl.begin();
2653 crush->decode(cblp);
2654 if (struct_v >= 3) {
2655 ::decode(erasure_code_profiles, bl);
2656 } else {
2657 erasure_code_profiles.clear();
2658 }
2659 if (struct_v >= 4) {
2660 ::decode(pg_upmap, bl);
2661 ::decode(pg_upmap_items, bl);
2662 } else {
2663 pg_upmap.clear();
2664 pg_upmap_items.clear();
2665 }
2666 if (struct_v >= 6) {
2667 ::decode(crush_version, bl);
2668 }
2669 DECODE_FINISH(bl); // client-usable data
2670 }
2671
2672 {
2673 DECODE_START(5, bl); // extended, osd-only data
2674 ::decode(osd_addrs->hb_back_addr, bl);
2675 ::decode(osd_info, bl);
2676 ::decode(blacklist, bl);
2677 ::decode(osd_addrs->cluster_addr, bl);
2678 ::decode(cluster_snapshot_epoch, bl);
2679 ::decode(cluster_snapshot, bl);
2680 ::decode(*osd_uuid, bl);
2681 ::decode(osd_xinfo, bl);
2682 ::decode(osd_addrs->hb_front_addr, bl);
2683 if (struct_v >= 2) {
2684 ::decode(nearfull_ratio, bl);
2685 ::decode(full_ratio, bl);
2686 } else {
2687 nearfull_ratio = 0;
2688 full_ratio = 0;
2689 }
2690 if (struct_v >= 3) {
2691 ::decode(backfillfull_ratio, bl);
2692 } else {
2693 backfillfull_ratio = 0;
2694 }
2695 if (struct_v == 4) {
2696 string r;
2697 ::decode(r, bl);
2698 if (r.length())
2699 require_min_compat_client = ceph_release_from_name(r.c_str());
2700 }
2701 if (struct_v >= 5) {
2702 ::decode(require_min_compat_client, bl);
2703 ::decode(require_osd_release, bl);
2704 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2705 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2706 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
2707 }
2708 } else {
2709 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
2710 // only for compat with post-kraken pre-luminous test clusters
2711 require_osd_release = CEPH_RELEASE_LUMINOUS;
2712 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2713 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
2714 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2715 require_osd_release = CEPH_RELEASE_KRAKEN;
2716 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2717 require_osd_release = CEPH_RELEASE_JEWEL;
2718 } else {
2719 require_osd_release = 0;
2720 }
2721 }
2722 DECODE_FINISH(bl); // osd-only data
2723 }
2724
2725 if (struct_v >= 8) {
2726 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
2727 ::decode(crc, bl);
2728 tail_offset = bl.get_off();
2729 crc_defined = true;
2730 } else {
2731 crc_defined = false;
2732 crc = 0;
2733 }
2734
2735 DECODE_FINISH(bl); // wrapper
2736
2737 if (tail_offset) {
2738 // verify crc
2739 uint32_t actual = crc_front.crc32c(-1);
2740 if (tail_offset < bl.get_off()) {
2741 bufferlist tail;
2742 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
2743 actual = tail.crc32c(actual);
2744 }
2745 if (crc != actual) {
2746 ostringstream ss;
2747 ss << "bad crc, actual " << actual << " != expected " << crc;
2748 string s = ss.str();
2749 throw buffer::malformed_input(s.c_str());
2750 }
2751 }
2752
2753 post_decode();
2754 }
2755
2756 void OSDMap::post_decode()
2757 {
2758 // index pool names
2759 name_pool.clear();
2760 for (const auto &pname : pool_name) {
2761 name_pool[pname.second] = pname.first;
2762 }
2763
2764 calc_num_osds();
2765 _calc_up_osd_features();
2766 }
2767
2768 void OSDMap::dump_erasure_code_profiles(
2769 const mempool::osdmap::map<string,map<string,string>>& profiles,
2770 Formatter *f)
2771 {
2772 f->open_object_section("erasure_code_profiles");
2773 for (const auto &profile : profiles) {
2774 f->open_object_section(profile.first.c_str());
2775 for (const auto &profm : profile.second) {
2776 f->dump_string(profm.first.c_str(), profm.second.c_str());
2777 }
2778 f->close_section();
2779 }
2780 f->close_section();
2781 }
2782
2783 void OSDMap::dump(Formatter *f) const
2784 {
2785 f->dump_int("epoch", get_epoch());
2786 f->dump_stream("fsid") << get_fsid();
2787 f->dump_stream("created") << get_created();
2788 f->dump_stream("modified") << get_modified();
2789 f->dump_string("flags", get_flag_string());
2790 f->dump_unsigned("crush_version", get_crush_version());
2791 f->dump_float("full_ratio", full_ratio);
2792 f->dump_float("backfillfull_ratio", backfillfull_ratio);
2793 f->dump_float("nearfull_ratio", nearfull_ratio);
2794 f->dump_string("cluster_snapshot", get_cluster_snapshot());
2795 f->dump_int("pool_max", get_pool_max());
2796 f->dump_int("max_osd", get_max_osd());
2797 f->dump_string("require_min_compat_client",
2798 ceph_release_name(require_min_compat_client));
2799 f->dump_string("min_compat_client",
2800 ceph_release_name(get_min_compat_client()));
2801 f->dump_string("require_osd_release",
2802 ceph_release_name(require_osd_release));
2803
2804 f->open_array_section("pools");
2805 for (const auto &pool : pools) {
2806 std::string name("<unknown>");
2807 const auto &pni = pool_name.find(pool.first);
2808 if (pni != pool_name.end())
2809 name = pni->second;
2810 f->open_object_section("pool");
2811 f->dump_int("pool", pool.first);
2812 f->dump_string("pool_name", name);
2813 pool.second.dump(f);
2814 f->close_section();
2815 }
2816 f->close_section();
2817
2818 f->open_array_section("osds");
2819 for (int i=0; i<get_max_osd(); i++)
2820 if (exists(i)) {
2821 f->open_object_section("osd_info");
2822 f->dump_int("osd", i);
2823 f->dump_stream("uuid") << get_uuid(i);
2824 f->dump_int("up", is_up(i));
2825 f->dump_int("in", is_in(i));
2826 f->dump_float("weight", get_weightf(i));
2827 f->dump_float("primary_affinity", get_primary_affinityf(i));
2828 get_info(i).dump(f);
2829 f->dump_stream("public_addr") << get_addr(i);
2830 f->dump_stream("cluster_addr") << get_cluster_addr(i);
2831 f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
2832 f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
2833
2834 set<string> st;
2835 get_state(i, st);
2836 f->open_array_section("state");
2837 for (const auto &state : st)
2838 f->dump_string("state", state);
2839 f->close_section();
2840
2841 f->close_section();
2842 }
2843 f->close_section();
2844
2845 f->open_array_section("osd_xinfo");
2846 for (int i=0; i<get_max_osd(); i++) {
2847 if (exists(i)) {
2848 f->open_object_section("xinfo");
2849 f->dump_int("osd", i);
2850 osd_xinfo[i].dump(f);
2851 f->close_section();
2852 }
2853 }
2854 f->close_section();
2855
2856 f->open_array_section("pg_upmap");
2857 for (auto& p : pg_upmap) {
2858 f->open_object_section("mapping");
2859 f->dump_stream("pgid") << p.first;
2860 f->open_array_section("osds");
2861 for (auto q : p.second) {
2862 f->dump_int("osd", q);
2863 }
2864 f->close_section();
2865 f->close_section();
2866 }
2867 f->close_section();
2868 f->open_array_section("pg_upmap_items");
2869 for (auto& p : pg_upmap_items) {
2870 f->open_object_section("mapping");
2871 f->dump_stream("pgid") << p.first;
2872 f->open_array_section("mappings");
2873 for (auto& q : p.second) {
2874 f->open_object_section("mapping");
2875 f->dump_int("from", q.first);
2876 f->dump_int("to", q.second);
2877 f->close_section();
2878 }
2879 f->close_section();
2880 f->close_section();
2881 }
2882 f->close_section();
2883 f->open_array_section("pg_temp");
2884 pg_temp->dump(f);
2885 f->close_section();
2886
2887 f->open_array_section("primary_temp");
2888 for (const auto &pg : *primary_temp) {
2889 f->dump_stream("pgid") << pg.first;
2890 f->dump_int("osd", pg.second);
2891 }
2892 f->close_section(); // primary_temp
2893
2894 f->open_object_section("blacklist");
2895 for (const auto &addr : blacklist) {
2896 stringstream ss;
2897 ss << addr.first;
2898 f->dump_stream(ss.str().c_str()) << addr.second;
2899 }
2900 f->close_section();
2901
2902 dump_erasure_code_profiles(erasure_code_profiles, f);
2903 }
2904
2905 void OSDMap::generate_test_instances(list<OSDMap*>& o)
2906 {
2907 o.push_back(new OSDMap);
2908
2909 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
2910 o.push_back(new OSDMap);
2911 uuid_d fsid;
2912 o.back()->build_simple(cct, 1, fsid, 16);
2913 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
2914 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
2915 cct->put();
2916 }
2917
2918 string OSDMap::get_flag_string(unsigned f)
2919 {
2920 string s;
2921 if ( f& CEPH_OSDMAP_NEARFULL)
2922 s += ",nearfull";
2923 if (f & CEPH_OSDMAP_FULL)
2924 s += ",full";
2925 if (f & CEPH_OSDMAP_PAUSERD)
2926 s += ",pauserd";
2927 if (f & CEPH_OSDMAP_PAUSEWR)
2928 s += ",pausewr";
2929 if (f & CEPH_OSDMAP_PAUSEREC)
2930 s += ",pauserec";
2931 if (f & CEPH_OSDMAP_NOUP)
2932 s += ",noup";
2933 if (f & CEPH_OSDMAP_NODOWN)
2934 s += ",nodown";
2935 if (f & CEPH_OSDMAP_NOOUT)
2936 s += ",noout";
2937 if (f & CEPH_OSDMAP_NOIN)
2938 s += ",noin";
2939 if (f & CEPH_OSDMAP_NOBACKFILL)
2940 s += ",nobackfill";
2941 if (f & CEPH_OSDMAP_NOREBALANCE)
2942 s += ",norebalance";
2943 if (f & CEPH_OSDMAP_NORECOVER)
2944 s += ",norecover";
2945 if (f & CEPH_OSDMAP_NOSCRUB)
2946 s += ",noscrub";
2947 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
2948 s += ",nodeep-scrub";
2949 if (f & CEPH_OSDMAP_NOTIERAGENT)
2950 s += ",notieragent";
2951 if (f & CEPH_OSDMAP_SORTBITWISE)
2952 s += ",sortbitwise";
2953 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
2954 s += ",require_jewel_osds";
2955 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
2956 s += ",require_kraken_osds";
2957 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
2958 s += ",require_luminous_osds";
2959 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
2960 s += ",recovery_deletes";
2961 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
2962 s += ",purged_snapdirs";
2963 if (s.length())
2964 s.erase(0, 1);
2965 return s;
2966 }
2967
2968 string OSDMap::get_flag_string() const
2969 {
2970 return get_flag_string(flags);
2971 }
2972
2973 void OSDMap::print_pools(ostream& out) const
2974 {
2975 for (const auto &pool : pools) {
2976 std::string name("<unknown>");
2977 const auto &pni = pool_name.find(pool.first);
2978 if (pni != pool_name.end())
2979 name = pni->second;
2980 out << "pool " << pool.first
2981 << " '" << name
2982 << "' " << pool.second << "\n";
2983
2984 for (const auto &snap : pool.second.snaps)
2985 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
2986
2987 if (!pool.second.removed_snaps.empty())
2988 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
2989 }
2990 out << std::endl;
2991 }
2992
2993 void OSDMap::print(ostream& out) const
2994 {
2995 out << "epoch " << get_epoch() << "\n"
2996 << "fsid " << get_fsid() << "\n"
2997 << "created " << get_created() << "\n"
2998 << "modified " << get_modified() << "\n";
2999
3000 out << "flags " << get_flag_string() << "\n";
3001 out << "crush_version " << get_crush_version() << "\n";
3002 out << "full_ratio " << full_ratio << "\n";
3003 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3004 out << "nearfull_ratio " << nearfull_ratio << "\n";
3005 if (require_min_compat_client > 0) {
3006 out << "require_min_compat_client "
3007 << ceph_release_name(require_min_compat_client) << "\n";
3008 }
3009 out << "min_compat_client " << ceph_release_name(get_min_compat_client())
3010 << "\n";
3011 if (require_osd_release > 0) {
3012 out << "require_osd_release " << ceph_release_name(require_osd_release)
3013 << "\n";
3014 }
3015 if (get_cluster_snapshot().length())
3016 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3017 out << "\n";
3018
3019 print_pools(out);
3020
3021 out << "max_osd " << get_max_osd() << "\n";
3022 for (int i=0; i<get_max_osd(); i++) {
3023 if (exists(i)) {
3024 out << "osd." << i;
3025 out << (is_up(i) ? " up ":" down");
3026 out << (is_in(i) ? " in ":" out");
3027 out << " weight " << get_weightf(i);
3028 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
3029 out << " primary_affinity " << get_primary_affinityf(i);
3030 const osd_info_t& info(get_info(i));
3031 out << " " << info;
3032 out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
3033 << " " << get_hb_front_addr(i);
3034 set<string> st;
3035 get_state(i, st);
3036 out << " " << st;
3037 if (!get_uuid(i).is_zero())
3038 out << " " << get_uuid(i);
3039 out << "\n";
3040 }
3041 }
3042 out << std::endl;
3043
3044 for (auto& p : pg_upmap) {
3045 out << "pg_upmap " << p.first << " " << p.second << "\n";
3046 }
3047 for (auto& p : pg_upmap_items) {
3048 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3049 }
3050
3051 for (const auto pg : *pg_temp)
3052 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3053
3054 for (const auto pg : *primary_temp)
3055 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3056
3057 for (const auto &addr : blacklist)
3058 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
3059
3060 // ignore pg_swap_primary
3061 }
3062
3063 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3064 public:
3065 typedef CrushTreeDumper::Dumper<TextTable> Parent;
3066
3067 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3068 unsigned f)
3069 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
3070
3071 bool should_dump_leaf(int i) const override {
3072 if (!filter) {
3073 return true; // normal case
3074 }
3075 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3076 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3077 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3078 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3079 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3080 return true;
3081 }
3082 return false;
3083 }
3084
3085 bool should_dump_empty_bucket() const override {
3086 return !filter;
3087 }
3088
3089 void dump(TextTable *tbl) {
3090 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
3091 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
3092 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3093 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
3094 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
3095 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
3096 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
3097
3098 Parent::dump(tbl);
3099
3100 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3101 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3102 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3103 }
3104 }
3105 }
3106
3107 protected:
3108 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
3109 const char *c = crush->get_item_class(qi.id);
3110 if (!c)
3111 c = "";
3112 *tbl << qi.id
3113 << c
3114 << weightf_t(qi.weight);
3115
3116 ostringstream name;
3117 for (int k = 0; k < qi.depth; k++)
3118 name << " ";
3119 if (qi.is_bucket()) {
3120 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3121 << crush->get_item_name(qi.id);
3122 } else {
3123 name << "osd." << qi.id;
3124 }
3125 *tbl << name.str();
3126
3127 if (!qi.is_bucket()) {
3128 if (!osdmap->exists(qi.id)) {
3129 *tbl << "DNE"
3130 << 0;
3131 } else {
3132 string s;
3133 if (osdmap->is_up(qi.id)) {
3134 s = "up";
3135 } else if (osdmap->is_destroyed(qi.id)) {
3136 s = "destroyed";
3137 } else {
3138 s = "down";
3139 }
3140 *tbl << s
3141 << weightf_t(osdmap->get_weightf(qi.id))
3142 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3143 }
3144 }
3145 *tbl << TextTable::endrow;
3146 }
3147
3148 private:
3149 const OSDMap *osdmap;
3150 const unsigned filter;
3151 };
3152
3153 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3154 public:
3155 typedef CrushTreeDumper::FormattingDumper Parent;
3156
3157 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3158 unsigned f)
3159 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
3160
3161 bool should_dump_leaf(int i) const override {
3162 if (!filter) {
3163 return true; // normal case
3164 }
3165 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3166 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3167 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3168 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3169 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3170 return true;
3171 }
3172 return false;
3173 }
3174
3175 bool should_dump_empty_bucket() const override {
3176 return !filter;
3177 }
3178
3179 void dump(Formatter *f) {
3180 f->open_array_section("nodes");
3181 Parent::dump(f);
3182 f->close_section();
3183 f->open_array_section("stray");
3184 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3185 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
3186 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3187 }
3188 f->close_section();
3189 }
3190
3191 protected:
3192 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3193 Parent::dump_item_fields(qi, f);
3194 if (!qi.is_bucket())
3195 {
3196 string s;
3197 if (osdmap->is_up(qi.id)) {
3198 s = "up";
3199 } else if (osdmap->is_destroyed(qi.id)) {
3200 s = "destroyed";
3201 } else {
3202 s = "down";
3203 }
3204 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
3205 f->dump_string("status", s);
3206 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3207 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3208 }
3209 }
3210
3211 private:
3212 const OSDMap *osdmap;
3213 const unsigned filter;
3214 };
3215
3216 void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter) const
3217 {
3218 if (f) {
3219 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f);
3220 } else {
3221 assert(out);
3222 TextTable tbl;
3223 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl);
3224 *out << tbl;
3225 }
3226 }
3227
3228 void OSDMap::print_summary(Formatter *f, ostream& out,
3229 const string& prefix) const
3230 {
3231 if (f) {
3232 f->open_object_section("osdmap");
3233 f->dump_int("epoch", get_epoch());
3234 f->dump_int("num_osds", get_num_osds());
3235 f->dump_int("num_up_osds", get_num_up_osds());
3236 f->dump_int("num_in_osds", get_num_in_osds());
3237 f->dump_bool("full", test_flag(CEPH_OSDMAP_FULL) ? true : false);
3238 f->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ? true : false);
3239 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3240 f->close_section();
3241 } else {
3242 out << get_num_osds() << " osds: "
3243 << get_num_up_osds() << " up, "
3244 << get_num_in_osds() << " in";
3245 if (get_num_pg_temp())
3246 out << "; " << get_num_pg_temp() << " remapped pgs";
3247 out << "\n";
3248 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
3249 if (important_flags)
3250 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
3251 }
3252 }
3253
3254 void OSDMap::print_oneline_summary(ostream& out) const
3255 {
3256 out << "e" << get_epoch() << ": "
3257 << get_num_osds() << " total, "
3258 << get_num_up_osds() << " up, "
3259 << get_num_in_osds() << " in";
3260 if (test_flag(CEPH_OSDMAP_FULL))
3261 out << " full";
3262 else if (test_flag(CEPH_OSDMAP_NEARFULL))
3263 out << " nearfull";
3264 }
3265
3266 bool OSDMap::crush_ruleset_in_use(int ruleset) const
3267 {
3268 for (const auto &pool : pools) {
3269 if (pool.second.crush_rule == ruleset)
3270 return true;
3271 }
3272 return false;
3273 }
3274
3275 int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
3276 int nosd, int pg_bits, int pgp_bits,
3277 bool default_pool)
3278 {
3279 ldout(cct, 10) << "build_simple on " << nosd
3280 << " osds" << dendl;
3281 epoch = e;
3282 set_fsid(fsid);
3283 created = modified = ceph_clock_now();
3284
3285 if (nosd >= 0) {
3286 set_max_osd(nosd);
3287 } else {
3288 // count osds
3289 int maxosd = 0;
3290 const md_config_t *conf = cct->_conf;
3291 vector<string> sections;
3292 conf->get_all_sections(sections);
3293
3294 for (auto &section : sections) {
3295 if (section.find("osd.") != 0)
3296 continue;
3297
3298 const char *begin = section.c_str() + 4;
3299 char *end = (char*)begin;
3300 int o = strtol(begin, &end, 10);
3301 if (*end != '\0')
3302 continue;
3303
3304 if (o > cct->_conf->mon_max_osd) {
3305 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
3306 return -ERANGE;
3307 }
3308
3309 if (o > maxosd)
3310 maxosd = o;
3311 }
3312
3313 set_max_osd(maxosd + 1);
3314 }
3315
3316
3317 stringstream ss;
3318 int r;
3319 if (nosd >= 0)
3320 r = build_simple_crush_map(cct, *crush, nosd, &ss);
3321 else
3322 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
3323 assert(r == 0);
3324
3325 int poolbase = get_max_osd() ? get_max_osd() : 1;
3326
3327 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
3328 assert(default_replicated_rule >= 0);
3329
3330 if (default_pool) {
3331 // pgp_num <= pg_num
3332 if (pgp_bits > pg_bits)
3333 pgp_bits = pg_bits;
3334
3335 vector<string> pool_names;
3336 pool_names.push_back("rbd");
3337 for (auto &plname : pool_names) {
3338 int64_t pool = ++pool_max;
3339 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
3340 pools[pool].flags = cct->_conf->osd_pool_default_flags;
3341 if (cct->_conf->osd_pool_default_flag_hashpspool)
3342 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
3343 if (cct->_conf->osd_pool_default_flag_nodelete)
3344 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
3345 if (cct->_conf->osd_pool_default_flag_nopgchange)
3346 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
3347 if (cct->_conf->osd_pool_default_flag_nosizechange)
3348 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
3349 pools[pool].size = cct->_conf->osd_pool_default_size;
3350 pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
3351 pools[pool].crush_rule = default_replicated_rule;
3352 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
3353 pools[pool].set_pg_num(poolbase << pg_bits);
3354 pools[pool].set_pgp_num(poolbase << pgp_bits);
3355 pools[pool].last_change = epoch;
3356 pools[pool].application_metadata.insert(
3357 {pg_pool_t::APPLICATION_NAME_RBD, {}});
3358 pool_name[pool] = plname;
3359 name_pool[plname] = pool;
3360 }
3361 }
3362
3363 for (int i=0; i<get_max_osd(); i++) {
3364 set_state(i, 0);
3365 set_weight(i, CEPH_OSD_OUT);
3366 }
3367
3368 map<string,string> profile_map;
3369 r = get_erasure_code_profile_default(cct, profile_map, &ss);
3370 if (r < 0) {
3371 lderr(cct) << ss.str() << dendl;
3372 return r;
3373 }
3374 set_erasure_code_profile("default", profile_map);
3375 return 0;
3376 }
3377
3378 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
3379 map<string,string> &profile_map,
3380 ostream *ss)
3381 {
3382 int r = get_json_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
3383 *ss,
3384 &profile_map);
3385 return r;
3386 }
3387
3388 int OSDMap::_build_crush_types(CrushWrapper& crush)
3389 {
3390 crush.set_type_name(0, "osd");
3391 crush.set_type_name(1, "host");
3392 crush.set_type_name(2, "chassis");
3393 crush.set_type_name(3, "rack");
3394 crush.set_type_name(4, "row");
3395 crush.set_type_name(5, "pdu");
3396 crush.set_type_name(6, "pod");
3397 crush.set_type_name(7, "room");
3398 crush.set_type_name(8, "datacenter");
3399 crush.set_type_name(9, "region");
3400 crush.set_type_name(10, "root");
3401 return 10;
3402 }
3403
3404 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
3405 int nosd, ostream *ss)
3406 {
3407 crush.create();
3408
3409 // root
3410 int root_type = _build_crush_types(crush);
3411 int rootid;
3412 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
3413 root_type, 0, NULL, NULL, &rootid);
3414 assert(r == 0);
3415 crush.set_item_name(rootid, "default");
3416
3417 for (int o=0; o<nosd; o++) {
3418 map<string,string> loc;
3419 loc["host"] = "localhost";
3420 loc["rack"] = "localrack";
3421 loc["root"] = "default";
3422 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
3423 char name[32];
3424 snprintf(name, sizeof(name), "osd.%d", o);
3425 crush.insert_item(cct, o, 1.0, name, loc);
3426 }
3427
3428 build_simple_crush_rules(cct, crush, "default", ss);
3429
3430 crush.finalize();
3431
3432 return 0;
3433 }
3434
3435 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
3436 CrushWrapper& crush,
3437 ostream *ss)
3438 {
3439 const md_config_t *conf = cct->_conf;
3440
3441 crush.create();
3442
3443 // root
3444 int root_type = _build_crush_types(crush);
3445 int rootid;
3446 int r = crush.add_bucket(0, 0,
3447 CRUSH_HASH_DEFAULT,
3448 root_type, 0, NULL, NULL, &rootid);
3449 assert(r == 0);
3450 crush.set_item_name(rootid, "default");
3451
3452 // add osds
3453 vector<string> sections;
3454 conf->get_all_sections(sections);
3455
3456 for (auto &section : sections) {
3457 if (section.find("osd.") != 0)
3458 continue;
3459
3460 const char *begin = section.c_str() + 4;
3461 char *end = (char*)begin;
3462 int o = strtol(begin, &end, 10);
3463 if (*end != '\0')
3464 continue;
3465
3466 string host, rack, row, room, dc, pool;
3467 vector<string> sectiontmp;
3468 sectiontmp.push_back("osd");
3469 sectiontmp.push_back(section);
3470 conf->get_val_from_conf_file(sectiontmp, "host", host, false);
3471 conf->get_val_from_conf_file(sectiontmp, "rack", rack, false);
3472 conf->get_val_from_conf_file(sectiontmp, "row", row, false);
3473 conf->get_val_from_conf_file(sectiontmp, "room", room, false);
3474 conf->get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
3475 conf->get_val_from_conf_file(sectiontmp, "root", pool, false);
3476
3477 if (host.length() == 0)
3478 host = "unknownhost";
3479 if (rack.length() == 0)
3480 rack = "unknownrack";
3481
3482 map<string,string> loc;
3483 loc["host"] = host;
3484 loc["rack"] = rack;
3485 if (row.size())
3486 loc["row"] = row;
3487 if (room.size())
3488 loc["room"] = room;
3489 if (dc.size())
3490 loc["datacenter"] = dc;
3491 loc["root"] = "default";
3492
3493 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
3494 crush.insert_item(cct, o, 1.0, section, loc);
3495 }
3496
3497 build_simple_crush_rules(cct, crush, "default", ss);
3498
3499 crush.finalize();
3500
3501 return 0;
3502 }
3503
3504
3505 int OSDMap::build_simple_crush_rules(
3506 CephContext *cct,
3507 CrushWrapper& crush,
3508 const string& root,
3509 ostream *ss)
3510 {
3511 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
3512 string failure_domain =
3513 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
3514
3515 int r;
3516 r = crush.add_simple_rule_at(
3517 "replicated_rule", root, failure_domain, "",
3518 "firstn", pg_pool_t::TYPE_REPLICATED,
3519 crush_rule, ss);
3520 if (r < 0)
3521 return r;
3522 // do not add an erasure rule by default or else we will implicitly
3523 // require the crush_v2 feature of clients
3524 return 0;
3525 }
3526
3527 int OSDMap::summarize_mapping_stats(
3528 OSDMap *newmap,
3529 const set<int64_t> *pools,
3530 std::string *out,
3531 Formatter *f) const
3532 {
3533 set<int64_t> ls;
3534 if (pools) {
3535 ls = *pools;
3536 } else {
3537 for (auto &p : get_pools())
3538 ls.insert(p.first);
3539 }
3540
3541 unsigned total_pg = 0;
3542 unsigned moved_pg = 0;
3543 vector<unsigned> base_by_osd(get_max_osd(), 0);
3544 vector<unsigned> new_by_osd(get_max_osd(), 0);
3545 for (int64_t pool_id : ls) {
3546 const pg_pool_t *pi = get_pg_pool(pool_id);
3547 vector<int> up, up2;
3548 int up_primary;
3549 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
3550 pg_t pgid(ps, pool_id, -1);
3551 total_pg += pi->get_size();
3552 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
3553 for (int osd : up) {
3554 if (osd >= 0 && osd < get_max_osd())
3555 ++base_by_osd[osd];
3556 }
3557 if (newmap) {
3558 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
3559 for (int osd : up2) {
3560 if (osd >= 0 && osd < get_max_osd())
3561 ++new_by_osd[osd];
3562 }
3563 if (pi->type == pg_pool_t::TYPE_ERASURE) {
3564 for (unsigned i=0; i<up.size(); ++i) {
3565 if (up[i] != up2[i]) {
3566 ++moved_pg;
3567 }
3568 }
3569 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
3570 for (int osd : up) {
3571 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
3572 ++moved_pg;
3573 }
3574 }
3575 } else {
3576 assert(0 == "unhandled pool type");
3577 }
3578 }
3579 }
3580 }
3581
3582 unsigned num_up_in = 0;
3583 for (int osd = 0; osd < get_max_osd(); ++osd) {
3584 if (is_up(osd) && is_in(osd))
3585 ++num_up_in;
3586 }
3587 if (!num_up_in) {
3588 return -EINVAL;
3589 }
3590
3591 float avg_pg = (float)total_pg / (float)num_up_in;
3592 float base_stddev = 0, new_stddev = 0;
3593 int min = -1, max = -1;
3594 unsigned min_base_pg = 0, max_base_pg = 0;
3595 unsigned min_new_pg = 0, max_new_pg = 0;
3596 for (int osd = 0; osd < get_max_osd(); ++osd) {
3597 if (is_up(osd) && is_in(osd)) {
3598 float base_diff = (float)base_by_osd[osd] - avg_pg;
3599 base_stddev += base_diff * base_diff;
3600 float new_diff = (float)new_by_osd[osd] - avg_pg;
3601 new_stddev += new_diff * new_diff;
3602 if (min < 0 || base_by_osd[osd] < min_base_pg) {
3603 min = osd;
3604 min_base_pg = base_by_osd[osd];
3605 min_new_pg = new_by_osd[osd];
3606 }
3607 if (max < 0 || base_by_osd[osd] > max_base_pg) {
3608 max = osd;
3609 max_base_pg = base_by_osd[osd];
3610 max_new_pg = new_by_osd[osd];
3611 }
3612 }
3613 }
3614 base_stddev = sqrt(base_stddev / num_up_in);
3615 new_stddev = sqrt(new_stddev / num_up_in);
3616
3617 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
3618
3619 ostringstream ss;
3620 if (f)
3621 f->open_object_section("utilization");
3622 if (newmap) {
3623 if (f) {
3624 f->dump_unsigned("moved_pgs", moved_pg);
3625 f->dump_unsigned("total_pgs", total_pg);
3626 } else {
3627 float percent = 0;
3628 if (total_pg)
3629 percent = (float)moved_pg * 100.0 / (float)total_pg;
3630 ss << "moved " << moved_pg << " / " << total_pg
3631 << " (" << percent << "%)\n";
3632 }
3633 }
3634 if (f) {
3635 f->dump_float("avg_pgs", avg_pg);
3636 f->dump_float("std_dev", base_stddev);
3637 f->dump_float("expected_baseline_std_dev", edev);
3638 if (newmap)
3639 f->dump_float("new_std_dev", new_stddev);
3640 } else {
3641 ss << "avg " << avg_pg << "\n";
3642 ss << "stddev " << base_stddev;
3643 if (newmap)
3644 ss << " -> " << new_stddev;
3645 ss << " (expected baseline " << edev << ")\n";
3646 }
3647 if (min >= 0) {
3648 if (f) {
3649 f->dump_unsigned("min_osd", min);
3650 f->dump_unsigned("min_osd_pgs", min_base_pg);
3651 if (newmap)
3652 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
3653 } else {
3654 ss << "min osd." << min << " with " << min_base_pg;
3655 if (newmap)
3656 ss << " -> " << min_new_pg;
3657 ss << " pgs (" << (float)min_base_pg / avg_pg;
3658 if (newmap)
3659 ss << " -> " << (float)min_new_pg / avg_pg;
3660 ss << " * mean)\n";
3661 }
3662 }
3663 if (max >= 0) {
3664 if (f) {
3665 f->dump_unsigned("max_osd", max);
3666 f->dump_unsigned("max_osd_pgs", max_base_pg);
3667 if (newmap)
3668 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
3669 } else {
3670 ss << "max osd." << max << " with " << max_base_pg;
3671 if (newmap)
3672 ss << " -> " << max_new_pg;
3673 ss << " pgs (" << (float)max_base_pg / avg_pg;
3674 if (newmap)
3675 ss << " -> " << (float)max_new_pg / avg_pg;
3676 ss << " * mean)\n";
3677 }
3678 }
3679 if (f)
3680 f->close_section();
3681 if (out)
3682 *out = ss.str();
3683 return 0;
3684 }
3685
3686
3687 int OSDMap::clean_pg_upmaps(
3688 CephContext *cct,
3689 Incremental *pending_inc)
3690 {
3691 ldout(cct, 10) << __func__ << dendl;
3692 int changed = 0;
3693 for (auto& p : pg_upmap) {
3694 vector<int> raw;
3695 int primary;
3696 pg_to_raw_osds(p.first, &raw, &primary);
3697 if (vectors_equal(raw, p.second)) {
3698 ldout(cct, 10) << " removing redundant pg_upmap " << p.first << " "
3699 << p.second << dendl;
3700 pending_inc->old_pg_upmap.insert(p.first);
3701 ++changed;
3702 }
3703 }
3704 for (auto& p : pg_upmap_items) {
3705 vector<int> raw;
3706 int primary;
3707 pg_to_raw_osds(p.first, &raw, &primary);
3708 mempool::osdmap::vector<pair<int,int>> newmap;
3709 for (auto& q : p.second) {
3710 if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
3711 newmap.push_back(q);
3712 }
3713 }
3714 if (newmap.empty()) {
3715 ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
3716 << p.second << dendl;
3717 pending_inc->old_pg_upmap_items.insert(p.first);
3718 ++changed;
3719 } else if (newmap != p.second) {
3720 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
3721 << p.first << " " << p.second << " -> " << newmap << dendl;
3722 pending_inc->new_pg_upmap_items[p.first] = newmap;
3723 ++changed;
3724 }
3725 }
3726 return changed;
3727 }
3728
3729 bool OSDMap::try_pg_upmap(
3730 CephContext *cct,
3731 pg_t pg, ///< pg to potentially remap
3732 const set<int>& overfull, ///< osds we'd want to evacuate
3733 const vector<int>& underfull, ///< osds to move to, in order of preference
3734 vector<int> *orig,
3735 vector<int> *out) ///< resulting alternative mapping
3736 {
3737 const pg_pool_t *pool = get_pg_pool(pg.pool());
3738 if (!pool)
3739 return false;
3740 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
3741 pool->get_size());
3742 if (rule < 0)
3743 return false;
3744
3745 // get original mapping
3746 _pg_to_raw_osds(*pool, pg, orig, NULL);
3747
3748 // make sure there is something there to remap
3749 bool any = false;
3750 for (auto osd : *orig) {
3751 if (overfull.count(osd)) {
3752 any = true;
3753 break;
3754 }
3755 }
3756 if (!any) {
3757 return false;
3758 }
3759
3760 int r = crush->try_remap_rule(
3761 cct,
3762 rule,
3763 pool->get_size(),
3764 overfull, underfull,
3765 *orig,
3766 out);
3767 if (r < 0)
3768 return false;
3769 if (*out == *orig)
3770 return false;
3771 return true;
3772 }
3773
3774 int OSDMap::calc_pg_upmaps(
3775 CephContext *cct,
3776 float max_deviation_ratio,
3777 int max,
3778 const set<int64_t>& only_pools_orig,
3779 OSDMap::Incremental *pending_inc)
3780 {
3781 set<int64_t> only_pools;
3782 if (only_pools_orig.empty()) {
3783 for (auto& i : pools) {
3784 only_pools.insert(i.first);
3785 }
3786 } else {
3787 only_pools = only_pools_orig;
3788 }
3789 OSDMap tmp;
3790 tmp.deepish_copy_from(*this);
3791 float start_deviation = 0;
3792 float end_deviation = 0;
3793 int num_changed = 0;
3794 while (true) {
3795 map<int,set<pg_t>> pgs_by_osd;
3796 int total_pgs = 0;
3797 float osd_weight_total = 0;
3798 map<int,float> osd_weight;
3799 for (auto& i : pools) {
3800 if (!only_pools.empty() && !only_pools.count(i.first))
3801 continue;
3802 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
3803 pg_t pg(ps, i.first);
3804 vector<int> up;
3805 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
3806 for (auto osd : up) {
3807 if (osd != CRUSH_ITEM_NONE)
3808 pgs_by_osd[osd].insert(pg);
3809 }
3810 }
3811 total_pgs += i.second.get_size() * i.second.get_pg_num();
3812
3813 map<int,float> pmap;
3814 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
3815 i.second.get_type(),
3816 i.second.get_size());
3817 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
3818 ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
3819 for (auto p : pmap) {
3820 osd_weight[p.first] += p.second;
3821 osd_weight_total += p.second;
3822 }
3823 }
3824 for (auto& i : osd_weight) {
3825 int pgs = 0;
3826 auto p = pgs_by_osd.find(i.first);
3827 if (p != pgs_by_osd.end())
3828 pgs = p->second.size();
3829 else
3830 pgs_by_osd.emplace(i.first, set<pg_t>());
3831 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
3832 << " pgs " << pgs << dendl;
3833 }
3834
3835 if (osd_weight_total == 0) {
3836 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
3837 break;
3838 }
3839 float pgs_per_weight = total_pgs / osd_weight_total;
3840 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
3841 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
3842
3843 // osd deviation
3844 float total_deviation = 0;
3845 map<int,float> osd_deviation; // osd, deviation(pgs)
3846 multimap<float,int> deviation_osd; // deviation(pgs), osd
3847 set<int> overfull;
3848 for (auto& i : pgs_by_osd) {
3849 float target = osd_weight[i.first] * pgs_per_weight;
3850 float deviation = (float)i.second.size() - target;
3851 ldout(cct, 20) << " osd." << i.first
3852 << "\tpgs " << i.second.size()
3853 << "\ttarget " << target
3854 << "\tdeviation " << deviation
3855 << dendl;
3856 osd_deviation[i.first] = deviation;
3857 deviation_osd.insert(make_pair(deviation, i.first));
3858 if (deviation >= 1.0)
3859 overfull.insert(i.first);
3860 total_deviation += abs(deviation);
3861 }
3862 if (num_changed == 0) {
3863 start_deviation = total_deviation;
3864 }
3865 end_deviation = total_deviation;
3866
3867 // build underfull, sorted from least-full to most-average
3868 vector<int> underfull;
3869 for (auto i = deviation_osd.begin();
3870 i != deviation_osd.end();
3871 ++i) {
3872 if (i->first >= -.999)
3873 break;
3874 underfull.push_back(i->second);
3875 }
3876 ldout(cct, 10) << " total_deviation " << total_deviation
3877 << " overfull " << overfull
3878 << " underfull " << underfull << dendl;
3879 if (overfull.empty() || underfull.empty())
3880 break;
3881
3882 // pick fullest
3883 bool restart = false;
3884 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
3885 int osd = p->second;
3886 float deviation = p->first;
3887 float target = osd_weight[osd] * pgs_per_weight;
3888 assert(target > 0);
3889 if (deviation/target < max_deviation_ratio) {
3890 ldout(cct, 10) << " osd." << osd
3891 << " target " << target
3892 << " deviation " << deviation
3893 << " -> ratio " << deviation/target
3894 << " < max ratio " << max_deviation_ratio << dendl;
3895 break;
3896 }
3897 int num_to_move = deviation;
3898 ldout(cct, 10) << " osd." << osd << " move " << num_to_move << dendl;
3899 if (num_to_move < 1)
3900 break;
3901
3902 set<pg_t>& pgs = pgs_by_osd[osd];
3903
3904 // look for remaps we can un-remap
3905 for (auto pg : pgs) {
3906 auto p = tmp.pg_upmap_items.find(pg);
3907 if (p != tmp.pg_upmap_items.end()) {
3908 for (auto q : p->second) {
3909 if (q.second == osd) {
3910 ldout(cct, 10) << " dropping pg_upmap_items " << pg
3911 << " " << p->second << dendl;
3912 tmp.pg_upmap_items.erase(p);
3913 pending_inc->old_pg_upmap_items.insert(pg);
3914 ++num_changed;
3915 restart = true;
3916 }
3917 }
3918 }
3919 if (restart)
3920 break;
3921 } // pg loop
3922 if (restart)
3923 break;
3924
3925 for (auto pg : pgs) {
3926 if (tmp.pg_upmap.count(pg) ||
3927 tmp.pg_upmap_items.count(pg)) {
3928 ldout(cct, 20) << " already remapped " << pg << dendl;
3929 continue;
3930 }
3931 ldout(cct, 10) << " trying " << pg << dendl;
3932 vector<int> orig, out;
3933 if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
3934 continue;
3935 }
3936 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
3937 if (orig.size() != out.size()) {
3938 continue;
3939 }
3940 assert(orig != out);
3941 auto& rmi = tmp.pg_upmap_items[pg];
3942 for (unsigned i = 0; i < out.size(); ++i) {
3943 if (orig[i] != out[i]) {
3944 rmi.push_back(make_pair(orig[i], out[i]));
3945 }
3946 }
3947 pending_inc->new_pg_upmap_items[pg] = rmi;
3948 ldout(cct, 10) << " " << pg << " pg_upmap_items " << rmi << dendl;
3949 restart = true;
3950 ++num_changed;
3951 break;
3952 } // pg loop
3953 if (restart)
3954 break;
3955 } // osd loop
3956
3957 if (!restart) {
3958 ldout(cct, 10) << " failed to find any changes to make" << dendl;
3959 break;
3960 }
3961 if (--max == 0) {
3962 ldout(cct, 10) << " hit max iterations, stopping" << dendl;
3963 break;
3964 }
3965 }
3966 ldout(cct, 10) << " start deviation " << start_deviation << dendl;
3967 ldout(cct, 10) << " end deviation " << end_deviation << dendl;
3968 return num_changed;
3969 }
3970
3971 int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
3972 {
3973 return crush->get_leaves(name, osds);
3974 }
3975
3976 template <typename F>
3977 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
3978 public:
3979 typedef CrushTreeDumper::Dumper<F> Parent;
3980
3981 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3982 const PGStatService *pgs_, bool tree_) :
3983 Parent(crush, osdmap_->get_pool_names()),
3984 osdmap(osdmap_),
3985 pgs(pgs_),
3986 tree(tree_),
3987 average_util(average_utilization()),
3988 min_var(-1),
3989 max_var(-1),
3990 stddev(0),
3991 sum(0) {
3992 }
3993
3994 protected:
3995 void dump_stray(F *f) {
3996 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3997 if (osdmap->exists(i) && !this->is_touched(i))
3998 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3999 }
4000 }
4001
4002 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
4003 if (!tree && qi.is_bucket())
4004 return;
4005
4006 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
4007 int64_t kb = 0, kb_used = 0, kb_avail = 0;
4008 double util = 0;
4009 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_avail))
4010 if (kb_used && kb)
4011 util = 100.0 * (double)kb_used / (double)kb;
4012
4013 double var = 1.0;
4014 if (average_util)
4015 var = util / average_util;
4016
4017 size_t num_pgs = qi.is_bucket() ? 0 : pgs->get_num_pg_by_osd(qi.id);
4018
4019 dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, num_pgs, f);
4020
4021 if (!qi.is_bucket() && reweight > 0) {
4022 if (min_var < 0 || var < min_var)
4023 min_var = var;
4024 if (max_var < 0 || var > max_var)
4025 max_var = var;
4026
4027 double dev = util - average_util;
4028 dev *= dev;
4029 stddev += reweight * dev;
4030 sum += reweight;
4031 }
4032 }
4033
4034 virtual void dump_item(const CrushTreeDumper::Item &qi,
4035 float &reweight,
4036 int64_t kb,
4037 int64_t kb_used,
4038 int64_t kb_avail,
4039 double& util,
4040 double& var,
4041 const size_t num_pgs,
4042 F *f) = 0;
4043
4044 double dev() {
4045 return sum > 0 ? sqrt(stddev / sum) : 0;
4046 }
4047
4048 double average_utilization() {
4049 int64_t kb = 0, kb_used = 0;
4050 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4051 if (!osdmap->exists(i) || osdmap->get_weight(i) == 0)
4052 continue;
4053 int64_t kb_i, kb_used_i, kb_avail_i;
4054 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_avail_i)) {
4055 kb += kb_i;
4056 kb_used += kb_used_i;
4057 }
4058 }
4059 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
4060 }
4061
4062 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
4063 int64_t* kb_avail) const {
4064 const osd_stat_t *p = pgs->get_osd_stat(id);
4065 if (!p) return false;
4066 *kb = p->kb;
4067 *kb_used = p->kb_used;
4068 *kb_avail = p->kb_avail;
4069 return *kb > 0;
4070 }
4071
4072 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
4073 int64_t* kb_avail) const {
4074 if (id >= 0) {
4075 if (osdmap->is_out(id)) {
4076 *kb = 0;
4077 *kb_used = 0;
4078 *kb_avail = 0;
4079 return true;
4080 }
4081 return get_osd_utilization(id, kb, kb_used, kb_avail);
4082 }
4083
4084 *kb = 0;
4085 *kb_used = 0;
4086 *kb_avail = 0;
4087
4088 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
4089 int item = osdmap->crush->get_bucket_item(id, k);
4090 int64_t kb_i = 0, kb_used_i = 0, kb_avail_i = 0;
4091 if (!get_bucket_utilization(item, &kb_i, &kb_used_i, &kb_avail_i))
4092 return false;
4093 *kb += kb_i;
4094 *kb_used += kb_used_i;
4095 *kb_avail += kb_avail_i;
4096 }
4097 return *kb > 0;
4098 }
4099
4100 protected:
4101 const OSDMap *osdmap;
4102 const PGStatService *pgs;
4103 bool tree;
4104 double average_util;
4105 double min_var;
4106 double max_var;
4107 double stddev;
4108 double sum;
4109 };
4110
4111
4112 class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
4113 public:
4114 typedef OSDUtilizationDumper<TextTable> Parent;
4115
4116 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4117 const PGStatService *pgs, bool tree) :
4118 Parent(crush, osdmap, pgs, tree) {}
4119
4120 void dump(TextTable *tbl) {
4121 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
4122 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
4123 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4124 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
4125 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
4126 tbl->define_column("USE", TextTable::LEFT, TextTable::RIGHT);
4127 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
4128 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
4129 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
4130 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
4131 if (tree)
4132 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
4133
4134 Parent::dump(tbl);
4135
4136 dump_stray(tbl);
4137
4138 *tbl << ""
4139 << ""
4140 << "" << "TOTAL"
4141 << si_t(pgs->get_osd_sum().kb << 10)
4142 << si_t(pgs->get_osd_sum().kb_used << 10)
4143 << si_t(pgs->get_osd_sum().kb_avail << 10)
4144 << lowprecision_t(average_util)
4145 << ""
4146 << TextTable::endrow;
4147 }
4148
4149 protected:
4150 struct lowprecision_t {
4151 float v;
4152 explicit lowprecision_t(float _v) : v(_v) {}
4153 };
4154 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
4155
4156 using OSDUtilizationDumper<TextTable>::dump_item;
4157 void dump_item(const CrushTreeDumper::Item &qi,
4158 float &reweight,
4159 int64_t kb,
4160 int64_t kb_used,
4161 int64_t kb_avail,
4162 double& util,
4163 double& var,
4164 const size_t num_pgs,
4165 TextTable *tbl) override {
4166 const char *c = crush->get_item_class(qi.id);
4167 if (!c)
4168 c = "";
4169 *tbl << qi.id
4170 << c
4171 << weightf_t(qi.weight)
4172 << weightf_t(reweight)
4173 << si_t(kb << 10)
4174 << si_t(kb_used << 10)
4175 << si_t(kb_avail << 10)
4176 << lowprecision_t(util)
4177 << lowprecision_t(var);
4178
4179 if (qi.is_bucket()) {
4180 *tbl << "-";
4181 } else {
4182 *tbl << num_pgs;
4183 }
4184
4185 if (tree) {
4186 ostringstream name;
4187 for (int k = 0; k < qi.depth; k++)
4188 name << " ";
4189 if (qi.is_bucket()) {
4190 int type = crush->get_bucket_type(qi.id);
4191 name << crush->get_type_name(type) << " "
4192 << crush->get_item_name(qi.id);
4193 } else {
4194 name << "osd." << qi.id;
4195 }
4196 *tbl << name.str();
4197 }
4198
4199 *tbl << TextTable::endrow;
4200 }
4201
4202 public:
4203 string summary() {
4204 ostringstream out;
4205 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
4206 << "/" << lowprecision_t(max_var) << " "
4207 << "STDDEV: " << lowprecision_t(dev());
4208 return out.str();
4209 }
4210 };
4211
4212 ostream& operator<<(ostream& out,
4213 const OSDUtilizationPlainDumper::lowprecision_t& v)
4214 {
4215 if (v.v < -0.01) {
4216 return out << "-";
4217 } else if (v.v < 0.001) {
4218 return out << "0";
4219 } else {
4220 std::streamsize p = out.precision();
4221 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
4222 }
4223 }
4224
4225 class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
4226 public:
4227 typedef OSDUtilizationDumper<Formatter> Parent;
4228
4229 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4230 const PGStatService *pgs, bool tree) :
4231 Parent(crush, osdmap, pgs, tree) {}
4232
4233 void dump(Formatter *f) {
4234 f->open_array_section("nodes");
4235 Parent::dump(f);
4236 f->close_section();
4237
4238 f->open_array_section("stray");
4239 dump_stray(f);
4240 f->close_section();
4241 }
4242
4243 protected:
4244 using OSDUtilizationDumper<Formatter>::dump_item;
4245 void dump_item(const CrushTreeDumper::Item &qi,
4246 float &reweight,
4247 int64_t kb,
4248 int64_t kb_used,
4249 int64_t kb_avail,
4250 double& util,
4251 double& var,
4252 const size_t num_pgs,
4253 Formatter *f) override {
4254 f->open_object_section("item");
4255 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
4256 f->dump_float("reweight", reweight);
4257 f->dump_int("kb", kb);
4258 f->dump_int("kb_used", kb_used);
4259 f->dump_int("kb_avail", kb_avail);
4260 f->dump_float("utilization", util);
4261 f->dump_float("var", var);
4262 f->dump_unsigned("pgs", num_pgs);
4263 CrushTreeDumper::dump_bucket_children(crush, qi, f);
4264 f->close_section();
4265 }
4266
4267 public:
4268 void summary(Formatter *f) {
4269 f->open_object_section("summary");
4270 f->dump_int("total_kb", pgs->get_osd_sum().kb);
4271 f->dump_int("total_kb_used", pgs->get_osd_sum().kb_used);
4272 f->dump_int("total_kb_avail", pgs->get_osd_sum().kb_avail);
4273 f->dump_float("average_utilization", average_util);
4274 f->dump_float("min_var", min_var);
4275 f->dump_float("max_var", max_var);
4276 f->dump_float("dev", dev());
4277 f->close_section();
4278 }
4279 };
4280
4281 void print_osd_utilization(const OSDMap& osdmap,
4282 const PGStatService *pgstat,
4283 ostream& out,
4284 Formatter *f,
4285 bool tree)
4286 {
4287 const CrushWrapper *crush = osdmap.crush.get();
4288 if (f) {
4289 f->open_object_section("df");
4290 OSDUtilizationFormatDumper d(crush, &osdmap, pgstat, tree);
4291 d.dump(f);
4292 d.summary(f);
4293 f->close_section();
4294 f->flush(out);
4295 } else {
4296 OSDUtilizationPlainDumper d(crush, &osdmap, pgstat, tree);
4297 TextTable tbl;
4298 d.dump(&tbl);
4299 out << tbl << d.summary() << "\n";
4300 }
4301 }
4302
4303 void OSDMap::check_health(health_check_map_t *checks) const
4304 {
4305 int num_osds = get_num_osds();
4306
4307 // OSD_DOWN
4308 // OSD_$subtree_DOWN
4309 // OSD_ORPHAN
4310 if (num_osds >= 0) {
4311 int num_in_osds = 0;
4312 int num_down_in_osds = 0;
4313 set<int> osds;
4314 set<int> down_in_osds;
4315 set<int> up_in_osds;
4316 set<int> subtree_up;
4317 unordered_map<int, set<int> > subtree_type_down;
4318 unordered_map<int, int> num_osds_subtree;
4319 int max_type = crush->get_max_type_id();
4320
4321 for (int i = 0; i < get_max_osd(); i++) {
4322 if (!exists(i)) {
4323 if (crush->item_exists(i)) {
4324 osds.insert(i);
4325 }
4326 continue;
4327 }
4328 if (is_out(i))
4329 continue;
4330 ++num_in_osds;
4331 if (down_in_osds.count(i) || up_in_osds.count(i))
4332 continue;
4333 if (!is_up(i)) {
4334 down_in_osds.insert(i);
4335 int parent_id = 0;
4336 int current = i;
4337 for (int type = 0; type <= max_type; type++) {
4338 if (!crush->get_type_name(type))
4339 continue;
4340 int r = crush->get_immediate_parent_id(current, &parent_id);
4341 if (r == -ENOENT)
4342 break;
4343 // break early if this parent is already marked as up
4344 if (subtree_up.count(parent_id))
4345 break;
4346 type = crush->get_bucket_type(parent_id);
4347 if (!subtree_type_is_down(
4348 g_ceph_context, parent_id, type,
4349 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
4350 break;
4351 current = parent_id;
4352 }
4353 }
4354 }
4355
4356 // calculate the number of down osds in each down subtree and
4357 // store it in num_osds_subtree
4358 for (int type = 1; type <= max_type; type++) {
4359 if (!crush->get_type_name(type))
4360 continue;
4361 for (auto j = subtree_type_down[type].begin();
4362 j != subtree_type_down[type].end();
4363 ++j) {
4364 list<int> children;
4365 int num = 0;
4366 int num_children = crush->get_children(*j, &children);
4367 if (num_children == 0)
4368 continue;
4369 for (auto l = children.begin(); l != children.end(); ++l) {
4370 if (*l >= 0) {
4371 ++num;
4372 } else if (num_osds_subtree[*l] > 0) {
4373 num = num + num_osds_subtree[*l];
4374 }
4375 }
4376 num_osds_subtree[*j] = num;
4377 }
4378 }
4379 num_down_in_osds = down_in_osds.size();
4380 assert(num_down_in_osds <= num_in_osds);
4381 if (num_down_in_osds > 0) {
4382 // summary of down subtree types and osds
4383 for (int type = max_type; type > 0; type--) {
4384 if (!crush->get_type_name(type))
4385 continue;
4386 if (subtree_type_down[type].size() > 0) {
4387 ostringstream ss;
4388 ss << subtree_type_down[type].size() << " "
4389 << crush->get_type_name(type);
4390 if (subtree_type_down[type].size() > 1) {
4391 ss << "s";
4392 }
4393 int sum_down_osds = 0;
4394 for (auto j = subtree_type_down[type].begin();
4395 j != subtree_type_down[type].end();
4396 ++j) {
4397 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
4398 }
4399 ss << " (" << sum_down_osds << " osds) down";
4400 string err = string("OSD_") +
4401 string(crush->get_type_name(type)) + "_DOWN";
4402 boost::to_upper(err);
4403 auto& d = checks->add(err, HEALTH_WARN, ss.str());
4404 for (auto j = subtree_type_down[type].rbegin();
4405 j != subtree_type_down[type].rend();
4406 ++j) {
4407 ostringstream ss;
4408 ss << crush->get_type_name(type);
4409 ss << " ";
4410 ss << crush->get_item_name(*j);
4411 // at the top level, do not print location
4412 if (type != max_type) {
4413 ss << " (";
4414 ss << crush->get_full_location_ordered_string(*j);
4415 ss << ")";
4416 }
4417 int num = num_osds_subtree[*j];
4418 ss << " (" << num << " osds)";
4419 ss << " is down";
4420 d.detail.push_back(ss.str());
4421 }
4422 }
4423 }
4424 ostringstream ss;
4425 ss << down_in_osds.size() << " osds down";
4426 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
4427 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
4428 ostringstream ss;
4429 ss << "osd." << *it << " (";
4430 ss << crush->get_full_location_ordered_string(*it);
4431 ss << ") is down";
4432 d.detail.push_back(ss.str());
4433 }
4434 }
4435
4436 if (!osds.empty()) {
4437 ostringstream ss;
4438 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
4439 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
4440 for (auto osd : osds) {
4441 ostringstream ss;
4442 ss << "osd." << osd << " exists in crush map but not in osdmap";
4443 d.detail.push_back(ss.str());
4444 }
4445 }
4446 }
4447
4448 // OSD_OUT_OF_ORDER_FULL
4449 {
4450 // An osd could configure failsafe ratio, to something different
4451 // but for now assume it is the same here.
4452 float fsr = g_conf->osd_failsafe_full_ratio;
4453 if (fsr > 1.0) fsr /= 100;
4454 float fr = get_full_ratio();
4455 float br = get_backfillfull_ratio();
4456 float nr = get_nearfull_ratio();
4457
4458 list<string> detail;
4459 // These checks correspond to how OSDService::check_full_status() in an OSD
4460 // handles the improper setting of these values.
4461 if (br < nr) {
4462 ostringstream ss;
4463 ss << "backfillfull_ratio (" << br
4464 << ") < nearfull_ratio (" << nr << "), increased";
4465 detail.push_back(ss.str());
4466 br = nr;
4467 }
4468 if (fr < br) {
4469 ostringstream ss;
4470 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
4471 << "), increased";
4472 detail.push_back(ss.str());
4473 fr = br;
4474 }
4475 if (fsr < fr) {
4476 ostringstream ss;
4477 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
4478 << "), increased";
4479 detail.push_back(ss.str());
4480 }
4481 if (!detail.empty()) {
4482 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
4483 "full ratio(s) out of order");
4484 d.detail.swap(detail);
4485 }
4486 }
4487
4488 // OSD_FULL
4489 // OSD_NEARFULL
4490 // OSD_BACKFILLFULL
4491 // OSD_FAILSAFE_FULL
4492 {
4493 set<int> full, backfillfull, nearfull;
4494 get_full_osd_counts(&full, &backfillfull, &nearfull);
4495 if (full.size()) {
4496 ostringstream ss;
4497 ss << full.size() << " full osd(s)";
4498 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
4499 for (auto& i: full) {
4500 ostringstream ss;
4501 ss << "osd." << i << " is full";
4502 d.detail.push_back(ss.str());
4503 }
4504 }
4505 if (backfillfull.size()) {
4506 ostringstream ss;
4507 ss << backfillfull.size() << " backfillfull osd(s)";
4508 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
4509 for (auto& i: backfillfull) {
4510 ostringstream ss;
4511 ss << "osd." << i << " is backfill full";
4512 d.detail.push_back(ss.str());
4513 }
4514 }
4515 if (nearfull.size()) {
4516 ostringstream ss;
4517 ss << nearfull.size() << " nearfull osd(s)";
4518 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
4519 for (auto& i: nearfull) {
4520 ostringstream ss;
4521 ss << "osd." << i << " is near full";
4522 d.detail.push_back(ss.str());
4523 }
4524 }
4525 }
4526
4527 // OSDMAP_FLAGS
4528 {
4529 // warn about flags
4530 uint64_t warn_flags =
4531 CEPH_OSDMAP_FULL |
4532 CEPH_OSDMAP_PAUSERD |
4533 CEPH_OSDMAP_PAUSEWR |
4534 CEPH_OSDMAP_PAUSEREC |
4535 CEPH_OSDMAP_NOUP |
4536 CEPH_OSDMAP_NODOWN |
4537 CEPH_OSDMAP_NOIN |
4538 CEPH_OSDMAP_NOOUT |
4539 CEPH_OSDMAP_NOBACKFILL |
4540 CEPH_OSDMAP_NORECOVER |
4541 CEPH_OSDMAP_NOSCRUB |
4542 CEPH_OSDMAP_NODEEP_SCRUB |
4543 CEPH_OSDMAP_NOTIERAGENT |
4544 CEPH_OSDMAP_NOREBALANCE;
4545 if (test_flag(warn_flags)) {
4546 ostringstream ss;
4547 ss << get_flag_string(get_flags() & warn_flags)
4548 << " flag(s) set";
4549 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
4550 }
4551 }
4552
4553 // OSD_FLAGS
4554 {
4555 list<string> detail;
4556 const unsigned flags =
4557 CEPH_OSD_NOUP |
4558 CEPH_OSD_NOIN |
4559 CEPH_OSD_NODOWN |
4560 CEPH_OSD_NOOUT;
4561 for (int i = 0; i < max_osd; ++i) {
4562 if (osd_state[i] & flags) {
4563 ostringstream ss;
4564 set<string> states;
4565 OSDMap::calc_state_set(osd_state[i] & flags, states);
4566 ss << "osd." << i << " has flags " << states;
4567 detail.push_back(ss.str());
4568 }
4569 }
4570 if (!detail.empty()) {
4571 ostringstream ss;
4572 ss << detail.size() << " osd(s) have {NOUP,NODOWN,NOIN,NOOUT} flags set";
4573 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
4574 d.detail.swap(detail);
4575 }
4576 }
4577
4578 // OLD_CRUSH_TUNABLES
4579 if (g_conf->mon_warn_on_legacy_crush_tunables) {
4580 string min = crush->get_min_required_version();
4581 if (min < g_conf->mon_crush_min_required_version) {
4582 ostringstream ss;
4583 ss << "crush map has legacy tunables (require " << min
4584 << ", min is " << g_conf->mon_crush_min_required_version << ")";
4585 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
4586 d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
4587 }
4588 }
4589
4590 // OLD_CRUSH_STRAW_CALC_VERSION
4591 if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
4592 if (crush->get_straw_calc_version() == 0) {
4593 ostringstream ss;
4594 ss << "crush map has straw_calc_version=0";
4595 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
4596 d.detail.push_back(
4597 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
4598 }
4599 }
4600
4601 // CACHE_POOL_NO_HIT_SET
4602 if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
4603 list<string> detail;
4604 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
4605 p != pools.end();
4606 ++p) {
4607 const pg_pool_t& info = p->second;
4608 if (info.cache_mode_requires_hit_set() &&
4609 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
4610 ostringstream ss;
4611 ss << "pool '" << get_pool_name(p->first)
4612 << "' with cache_mode " << info.get_cache_mode_name()
4613 << " needs hit_set_type to be set but it is not";
4614 detail.push_back(ss.str());
4615 }
4616 }
4617 if (!detail.empty()) {
4618 ostringstream ss;
4619 ss << detail.size() << " cache pools are missing hit_sets";
4620 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
4621 d.detail.swap(detail);
4622 }
4623 }
4624
4625 // OSD_NO_SORTBITWISE
4626 if (!test_flag(CEPH_OSDMAP_SORTBITWISE) &&
4627 (get_up_osd_features() &
4628 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
4629 ostringstream ss;
4630 ss << "no legacy OSD present but 'sortbitwise' flag is not set";
4631 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
4632 }
4633
4634 // OSD_UPGRADE_FINISHED
4635 // none of these (yet) since we don't run until luminous upgrade is done.
4636
4637 // POOL_FULL
4638 {
4639 list<string> detail;
4640 for (auto it : get_pools()) {
4641 const pg_pool_t &pool = it.second;
4642 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
4643 const string& pool_name = get_pool_name(it.first);
4644 stringstream ss;
4645 ss << "pool '" << pool_name << "' is full";
4646 detail.push_back(ss.str());
4647 }
4648 }
4649 if (!detail.empty()) {
4650 ostringstream ss;
4651 ss << detail.size() << " pool(s) full";
4652 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
4653 d.detail.swap(detail);
4654 }
4655 }
4656 }
4657
4658 int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
4659 ostream *ss) const
4660 {
4661 out->clear();
4662 for (auto i = ls.begin(); i != ls.end(); ++i) {
4663 if (i == ls.begin() &&
4664 (*i == "any" || *i == "all" || *i == "*")) {
4665 get_all_osds(*out);
4666 break;
4667 }
4668 long osd = parse_osd_id(i->c_str(), ss);
4669 if (osd < 0) {
4670 *ss << "invalid osd id '" << *i << "'";
4671 return -EINVAL;
4672 }
4673 out->insert(osd);
4674 }
4675 return 0;
4676 }