]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSDMap.cc
import ceph 16.2.7
[ceph.git] / ceph / src / osd / OSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <algorithm>
19 #include <optional>
20 #include <random>
21
22 #include <boost/algorithm/string.hpp>
23
24 #include "OSDMap.h"
25 #include "common/config.h"
26 #include "common/errno.h"
27 #include "common/Formatter.h"
28 #include "common/TextTable.h"
29 #include "include/ceph_features.h"
30 #include "include/common_fwd.h"
31 #include "include/str_map.h"
32
33 #include "common/code_environment.h"
34 #include "mon/health_check.h"
35
36 #include "crush/CrushTreeDumper.h"
37 #include "common/Clock.h"
38 #include "mon/PGMap.h"
39
40 using std::list;
41 using std::make_pair;
42 using std::map;
43 using std::multimap;
44 using std::ostream;
45 using std::ostringstream;
46 using std::pair;
47 using std::set;
48 using std::string;
49 using std::stringstream;
50 using std::unordered_map;
51 using std::vector;
52
53 using ceph::decode;
54 using ceph::encode;
55 using ceph::Formatter;
56
57 #define dout_subsys ceph_subsys_osd
58
59 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
60 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
61
62
63 // ----------------------------------
64 // osd_info_t
65
66 void osd_info_t::dump(Formatter *f) const
67 {
68 f->dump_int("last_clean_begin", last_clean_begin);
69 f->dump_int("last_clean_end", last_clean_end);
70 f->dump_int("up_from", up_from);
71 f->dump_int("up_thru", up_thru);
72 f->dump_int("down_at", down_at);
73 f->dump_int("lost_at", lost_at);
74 }
75
76 void osd_info_t::encode(ceph::buffer::list& bl) const
77 {
78 using ceph::encode;
79 __u8 struct_v = 1;
80 encode(struct_v, bl);
81 encode(last_clean_begin, bl);
82 encode(last_clean_end, bl);
83 encode(up_from, bl);
84 encode(up_thru, bl);
85 encode(down_at, bl);
86 encode(lost_at, bl);
87 }
88
89 void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
90 {
91 using ceph::decode;
92 __u8 struct_v;
93 decode(struct_v, bl);
94 decode(last_clean_begin, bl);
95 decode(last_clean_end, bl);
96 decode(up_from, bl);
97 decode(up_thru, bl);
98 decode(down_at, bl);
99 decode(lost_at, bl);
100 }
101
102 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
103 {
104 o.push_back(new osd_info_t);
105 o.push_back(new osd_info_t);
106 o.back()->last_clean_begin = 1;
107 o.back()->last_clean_end = 2;
108 o.back()->up_from = 30;
109 o.back()->up_thru = 40;
110 o.back()->down_at = 5;
111 o.back()->lost_at = 6;
112 }
113
114 ostream& operator<<(ostream& out, const osd_info_t& info)
115 {
116 out << "up_from " << info.up_from
117 << " up_thru " << info.up_thru
118 << " down_at " << info.down_at
119 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
120 if (info.lost_at)
121 out << " lost_at " << info.lost_at;
122 return out;
123 }
124
125 // ----------------------------------
126 // osd_xinfo_t
127
128 void osd_xinfo_t::dump(Formatter *f) const
129 {
130 f->dump_stream("down_stamp") << down_stamp;
131 f->dump_float("laggy_probability", laggy_probability);
132 f->dump_int("laggy_interval", laggy_interval);
133 f->dump_int("features", features);
134 f->dump_unsigned("old_weight", old_weight);
135 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
136 f->dump_int("dead_epoch", dead_epoch);
137 }
138
139 void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
140 {
141 uint8_t v = 4;
142 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
143 v = 3;
144 }
145 ENCODE_START(v, 1, bl);
146 encode(down_stamp, bl);
147 __u32 lp = laggy_probability * float(0xfffffffful);
148 encode(lp, bl);
149 encode(laggy_interval, bl);
150 encode(features, bl);
151 encode(old_weight, bl);
152 if (v >= 4) {
153 encode(last_purged_snaps_scrub, bl);
154 encode(dead_epoch, bl);
155 }
156 ENCODE_FINISH(bl);
157 }
158
159 void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
160 {
161 DECODE_START(4, bl);
162 decode(down_stamp, bl);
163 __u32 lp;
164 decode(lp, bl);
165 laggy_probability = (float)lp / (float)0xffffffff;
166 decode(laggy_interval, bl);
167 if (struct_v >= 2)
168 decode(features, bl);
169 else
170 features = 0;
171 if (struct_v >= 3)
172 decode(old_weight, bl);
173 else
174 old_weight = 0;
175 if (struct_v >= 4) {
176 decode(last_purged_snaps_scrub, bl);
177 decode(dead_epoch, bl);
178 } else {
179 dead_epoch = 0;
180 }
181 DECODE_FINISH(bl);
182 }
183
184 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
185 {
186 o.push_back(new osd_xinfo_t);
187 o.push_back(new osd_xinfo_t);
188 o.back()->down_stamp = utime_t(2, 3);
189 o.back()->laggy_probability = .123;
190 o.back()->laggy_interval = 123456;
191 o.back()->old_weight = 0x7fff;
192 }
193
194 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
195 {
196 return out << "down_stamp " << xi.down_stamp
197 << " laggy_probability " << xi.laggy_probability
198 << " laggy_interval " << xi.laggy_interval
199 << " old_weight " << xi.old_weight
200 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
201 << " dead_epoch " << xi.dead_epoch;
202 }
203
204 // ----------------------------------
205 // OSDMap::Incremental
206
207 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
208 {
209 int n = 0;
210 for (auto &weight : new_weight) {
211 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
212 n++; // marked out
213 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
214 n--; // marked in
215 }
216 return n;
217 }
218
219 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
220 {
221 int n = 0;
222 for (auto &state : new_state) { //
223 if (state.second & CEPH_OSD_UP) {
224 if (previous->is_up(state.first))
225 n++; // marked down
226 else
227 n--; // marked up
228 }
229 }
230 return n;
231 }
232
233 int OSDMap::Incremental::identify_osd(uuid_d u) const
234 {
235 for (auto &uuid : new_uuid)
236 if (uuid.second == u)
237 return uuid.first;
238 return -1;
239 }
240
241 int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
242 const OSDMap& osdmap)
243 {
244 ceph_assert(epoch == osdmap.get_epoch() + 1);
245
246 for (auto &new_pool : new_pools) {
247 if (!new_pool.second.tiers.empty()) {
248 pg_pool_t& base = new_pool.second;
249
250 auto new_rem_it = new_removed_snaps.find(new_pool.first);
251
252 for (const auto &tier_pool : base.tiers) {
253 const auto &r = new_pools.find(tier_pool);
254 pg_pool_t *tier = 0;
255 if (r == new_pools.end()) {
256 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
257 if (!orig) {
258 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
259 return -EIO;
260 }
261 tier = get_new_pool(tier_pool, orig);
262 } else {
263 tier = &r->second;
264 }
265 if (tier->tier_of != new_pool.first) {
266 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
267 return -EIO;
268 }
269
270 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
271 << tier_pool << dendl;
272 tier->snap_seq = base.snap_seq;
273 tier->snap_epoch = base.snap_epoch;
274 tier->snaps = base.snaps;
275 tier->removed_snaps = base.removed_snaps;
276 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
277 pg_pool_t::FLAG_POOL_SNAPS);
278
279 if (new_rem_it != new_removed_snaps.end()) {
280 new_removed_snaps[tier_pool] = new_rem_it->second;
281 }
282
283 tier->application_metadata = base.application_metadata;
284 }
285 }
286 }
287 return 0;
288 }
289
290 // ----------------------------------
291 // OSDMap
292
293 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
294 {
295 if (id >= 0)
296 return is_down(id);
297
298 if (down_cache &&
299 down_cache->count(id)) {
300 return true;
301 }
302
303 list<int> children;
304 crush->get_children(id, &children);
305 for (const auto &child : children) {
306 if (!subtree_is_down(child, down_cache)) {
307 return false;
308 }
309 }
310 if (down_cache) {
311 down_cache->insert(id);
312 }
313 return true;
314 }
315
316 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
317 {
318 // use a stack-local down_cache if we didn't get one from the
319 // caller. then at least this particular call will avoid duplicated
320 // work.
321 set<int> local_down_cache;
322 if (!down_cache) {
323 down_cache = &local_down_cache;
324 }
325
326 int current = id;
327 while (true) {
328 int type;
329 if (current >= 0) {
330 type = 0;
331 } else {
332 type = crush->get_bucket_type(current);
333 }
334 ceph_assert(type >= 0);
335
336 if (!subtree_is_down(current, down_cache)) {
337 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
338 return false;
339 }
340
341 // is this a big enough subtree to be marked as down?
342 if (type >= subtree_type) {
343 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
344 return true;
345 }
346
347 int r = crush->get_immediate_parent_id(current, &current);
348 if (r < 0) {
349 return false;
350 }
351 }
352 }
353
354 bool OSDMap::subtree_type_is_down(
355 CephContext *cct,
356 int id,
357 int subtree_type,
358 set<int> *down_in_osds,
359 set<int> *up_in_osds,
360 set<int> *subtree_up,
361 unordered_map<int, set<int> > *subtree_type_down) const
362 {
363 if (id >= 0) {
364 bool is_down_ret = is_down(id);
365 if (!is_out(id)) {
366 if (is_down_ret) {
367 down_in_osds->insert(id);
368 } else {
369 up_in_osds->insert(id);
370 }
371 }
372 return is_down_ret;
373 }
374
375 if (subtree_type_down &&
376 (*subtree_type_down)[subtree_type].count(id)) {
377 return true;
378 }
379
380 list<int> children;
381 crush->get_children(id, &children);
382 for (const auto &child : children) {
383 if (!subtree_type_is_down(
384 cct, child, crush->get_bucket_type(child),
385 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
386 subtree_up->insert(id);
387 return false;
388 }
389 }
390 if (subtree_type_down) {
391 (*subtree_type_down)[subtree_type].insert(id);
392 }
393 return true;
394 }
395
396 void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
397 {
398 using ceph::encode;
399 __u16 v = 5;
400 encode(v, bl);
401 encode(fsid, bl);
402 encode(epoch, bl);
403 encode(modified, bl);
404 int32_t new_t = new_pool_max;
405 encode(new_t, bl);
406 encode(new_flags, bl);
407 encode(fullmap, bl);
408 encode(crush, bl);
409
410 encode(new_max_osd, bl);
411 // for encode(new_pools, bl);
412 __u32 n = new_pools.size();
413 encode(n, bl);
414 for (const auto &new_pool : new_pools) {
415 n = new_pool.first;
416 encode(n, bl);
417 encode(new_pool.second, bl, 0);
418 }
419 // for encode(new_pool_names, bl);
420 n = new_pool_names.size();
421 encode(n, bl);
422
423 for (const auto &new_pool_name : new_pool_names) {
424 n = new_pool_name.first;
425 encode(n, bl);
426 encode(new_pool_name.second, bl);
427 }
428 // for encode(old_pools, bl);
429 n = old_pools.size();
430 encode(n, bl);
431 for (auto &old_pool : old_pools) {
432 n = old_pool;
433 encode(n, bl);
434 }
435 encode(new_up_client, bl, 0);
436 {
437 // legacy is map<int32_t,uint8_t>
438 map<int32_t, uint8_t> os;
439 for (auto p : new_state) {
440 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
441 // that an old client could not understand.
442 // skip those!
443 uint8_t s = p.second;
444 if (p.second != 0 && s == 0)
445 continue;
446 os[p.first] = s;
447 }
448 uint32_t n = os.size();
449 encode(n, bl);
450 for (auto p : os) {
451 encode(p.first, bl);
452 encode(p.second, bl);
453 }
454 }
455 encode(new_weight, bl);
456 // for encode(new_pg_temp, bl);
457 n = new_pg_temp.size();
458 encode(n, bl);
459
460 for (const auto &pg_temp : new_pg_temp) {
461 old_pg_t opg = pg_temp.first.get_old_pg();
462 encode(opg, bl);
463 encode(pg_temp.second, bl);
464 }
465 }
466
467 void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
468 {
469 using ceph::encode;
470 if ((features & CEPH_FEATURE_PGID64) == 0) {
471 encode_client_old(bl);
472 return;
473 }
474
475 // base
476 __u16 v = 6;
477 encode(v, bl);
478 encode(fsid, bl);
479 encode(epoch, bl);
480 encode(modified, bl);
481 encode(new_pool_max, bl);
482 encode(new_flags, bl);
483 encode(fullmap, bl);
484 encode(crush, bl);
485
486 encode(new_max_osd, bl);
487 encode(new_pools, bl, features);
488 encode(new_pool_names, bl);
489 encode(old_pools, bl);
490 encode(new_up_client, bl, features);
491 {
492 map<int32_t, uint8_t> os;
493 for (auto p : new_state) {
494 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
495 // that an old client could not understand.
496 // skip those!
497 uint8_t s = p.second;
498 if (p.second != 0 && s == 0)
499 continue;
500 os[p.first] = s;
501 }
502 uint32_t n = os.size();
503 encode(n, bl);
504 for (auto p : os) {
505 encode(p.first, bl);
506 encode(p.second, bl);
507 }
508 }
509 encode(new_weight, bl);
510 encode(new_pg_temp, bl);
511
512 // extended
513 __u16 ev = 10;
514 encode(ev, bl);
515 encode(new_hb_back_up, bl, features);
516 encode(new_up_thru, bl);
517 encode(new_last_clean_interval, bl);
518 encode(new_lost, bl);
519 encode(new_blocklist, bl, features);
520 encode(old_blocklist, bl, features);
521 encode(new_up_cluster, bl, features);
522 encode(cluster_snapshot, bl);
523 encode(new_uuid, bl);
524 encode(new_xinfo, bl, features);
525 encode(new_hb_front_up, bl, features);
526 }
527
528 template<class T>
529 static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
530 {
531 uint32_t n = m.size();
532 encode(n, bl);
533 for (auto& i : m) {
534 encode(i.first, bl);
535 encode(i.second.legacy_addr(), bl, f);
536 }
537 }
538
539 template<class T>
540 static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
541 {
542 uint32_t n = m.size();
543 encode(n, bl);
544 for (auto& i : m) {
545 if (i) {
546 encode(i->legacy_addr(), bl, f);
547 } else {
548 encode(entity_addr_t(), bl, f);
549 }
550 }
551 }
552
553 /* for a description of osdmap incremental versions, and when they were
554 * introduced, please refer to
555 * doc/dev/osd_internals/osdmap_versions.txt
556 */
557 void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
558 {
559 using ceph::encode;
560 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
561 encode_classic(bl, features);
562 return;
563 }
564
565 // only a select set of callers should *ever* be encoding new
566 // OSDMaps. others should be passing around the canonical encoded
567 // buffers from on high. select out those callers by passing in an
568 // "impossible" feature bit.
569 ceph_assert(features & CEPH_FEATURE_RESERVED);
570 features &= ~CEPH_FEATURE_RESERVED;
571
572 size_t start_offset = bl.length();
573 size_t tail_offset;
574 size_t crc_offset;
575 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
576
577 // meta-encoding: how we include client-used and osd-specific data
578 ENCODE_START(8, 7, bl);
579
580 {
581 uint8_t v = 8;
582 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
583 v = 3;
584 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
585 v = 5;
586 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
587 v = 6;
588 }
589 ENCODE_START(v, 1, bl); // client-usable data
590 encode(fsid, bl);
591 encode(epoch, bl);
592 encode(modified, bl);
593 encode(new_pool_max, bl);
594 encode(new_flags, bl);
595 encode(fullmap, bl);
596 encode(crush, bl);
597
598 encode(new_max_osd, bl);
599 encode(new_pools, bl, features);
600 encode(new_pool_names, bl);
601 encode(old_pools, bl);
602 if (v >= 7) {
603 encode(new_up_client, bl, features);
604 } else {
605 encode_addrvec_map_as_addr(new_up_client, bl, features);
606 }
607 if (v >= 5) {
608 encode(new_state, bl);
609 } else {
610 map<int32_t, uint8_t> os;
611 for (auto p : new_state) {
612 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
613 // that an old client could not understand.
614 // skip those!
615 uint8_t s = p.second;
616 if (p.second != 0 && s == 0)
617 continue;
618 os[p.first] = s;
619 }
620 uint32_t n = os.size();
621 encode(n, bl);
622 for (auto p : os) {
623 encode(p.first, bl);
624 encode(p.second, bl);
625 }
626 }
627 encode(new_weight, bl);
628 encode(new_pg_temp, bl);
629 encode(new_primary_temp, bl);
630 encode(new_primary_affinity, bl);
631 encode(new_erasure_code_profiles, bl);
632 encode(old_erasure_code_profiles, bl);
633 if (v >= 4) {
634 encode(new_pg_upmap, bl);
635 encode(old_pg_upmap, bl);
636 encode(new_pg_upmap_items, bl);
637 encode(old_pg_upmap_items, bl);
638 }
639 if (v >= 6) {
640 encode(new_removed_snaps, bl);
641 encode(new_purged_snaps, bl);
642 }
643 if (v >= 8) {
644 encode(new_last_up_change, bl);
645 encode(new_last_in_change, bl);
646 }
647 ENCODE_FINISH(bl); // client-usable data
648 }
649
650 {
651 uint8_t target_v = 9; // if bumping this, be aware of stretch_mode target_v 10!
652 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
653 target_v = 2;
654 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
655 target_v = 6;
656 }
657 if (change_stretch_mode) {
658 target_v = std::max((uint8_t)10, target_v);
659 }
660 ENCODE_START(target_v, 1, bl); // extended, osd-only data
661 if (target_v < 7) {
662 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
663 } else {
664 encode(new_hb_back_up, bl, features);
665 }
666 encode(new_up_thru, bl);
667 encode(new_last_clean_interval, bl);
668 encode(new_lost, bl);
669 encode(new_blocklist, bl, features);
670 encode(old_blocklist, bl, features);
671 if (target_v < 7) {
672 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
673 } else {
674 encode(new_up_cluster, bl, features);
675 }
676 encode(cluster_snapshot, bl);
677 encode(new_uuid, bl);
678 encode(new_xinfo, bl, features);
679 if (target_v < 7) {
680 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
681 } else {
682 encode(new_hb_front_up, bl, features);
683 }
684 encode(features, bl); // NOTE: features arg, not the member
685 if (target_v >= 3) {
686 encode(new_nearfull_ratio, bl);
687 encode(new_full_ratio, bl);
688 encode(new_backfillfull_ratio, bl);
689 }
690 // 5 was string-based new_require_min_compat_client
691 if (target_v >= 6) {
692 encode(new_require_min_compat_client, bl);
693 encode(new_require_osd_release, bl);
694 }
695 if (target_v >= 8) {
696 encode(new_crush_node_flags, bl);
697 }
698 if (target_v >= 9) {
699 encode(new_device_class_flags, bl);
700 }
701 if (target_v >= 10) {
702 encode(change_stretch_mode, bl);
703 encode(new_stretch_bucket_count, bl);
704 encode(new_degraded_stretch_mode, bl);
705 encode(new_recovering_stretch_mode, bl);
706 encode(new_stretch_mode_bucket, bl);
707 encode(stretch_mode_enabled, bl);
708 }
709 ENCODE_FINISH(bl); // osd-only data
710 }
711
712 crc_offset = bl.length();
713 crc_filler = bl.append_hole(sizeof(uint32_t));
714 tail_offset = bl.length();
715
716 encode(full_crc, bl);
717
718 ENCODE_FINISH(bl); // meta-encoding wrapper
719
720 // fill in crc
721 ceph::buffer::list front;
722 front.substr_of(bl, start_offset, crc_offset - start_offset);
723 inc_crc = front.crc32c(-1);
724 ceph::buffer::list tail;
725 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
726 inc_crc = tail.crc32c(inc_crc);
727 ceph_le32 crc_le;
728 crc_le = inc_crc;
729 crc_filler->copy_in(4u, (char*)&crc_le);
730 have_crc = true;
731 }
732
733 void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
734 {
735 using ceph::decode;
736 __u32 n, t;
737 // base
738 __u16 v;
739 decode(v, p);
740 decode(fsid, p);
741 decode(epoch, p);
742 decode(modified, p);
743 if (v == 4 || v == 5) {
744 decode(n, p);
745 new_pool_max = n;
746 } else if (v >= 6)
747 decode(new_pool_max, p);
748 decode(new_flags, p);
749 decode(fullmap, p);
750 decode(crush, p);
751
752 decode(new_max_osd, p);
753 if (v < 6) {
754 new_pools.clear();
755 decode(n, p);
756 while (n--) {
757 decode(t, p);
758 decode(new_pools[t], p);
759 }
760 } else {
761 decode(new_pools, p);
762 }
763 if (v == 5) {
764 new_pool_names.clear();
765 decode(n, p);
766 while (n--) {
767 decode(t, p);
768 decode(new_pool_names[t], p);
769 }
770 } else if (v >= 6) {
771 decode(new_pool_names, p);
772 }
773 if (v < 6) {
774 old_pools.clear();
775 decode(n, p);
776 while (n--) {
777 decode(t, p);
778 old_pools.insert(t);
779 }
780 } else {
781 decode(old_pools, p);
782 }
783 decode(new_up_client, p);
784 {
785 map<int32_t,uint8_t> ns;
786 decode(ns, p);
787 for (auto q : ns) {
788 new_state[q.first] = q.second;
789 }
790 }
791 decode(new_weight, p);
792
793 if (v < 6) {
794 new_pg_temp.clear();
795 decode(n, p);
796 while (n--) {
797 old_pg_t opg;
798 ceph::decode_raw(opg, p);
799 decode(new_pg_temp[pg_t(opg)], p);
800 }
801 } else {
802 decode(new_pg_temp, p);
803 }
804
805 // decode short map, too.
806 if (v == 5 && p.end())
807 return;
808
809 // extended
810 __u16 ev = 0;
811 if (v >= 5)
812 decode(ev, p);
813 decode(new_hb_back_up, p);
814 if (v < 5)
815 decode(new_pool_names, p);
816 decode(new_up_thru, p);
817 decode(new_last_clean_interval, p);
818 decode(new_lost, p);
819 decode(new_blocklist, p);
820 decode(old_blocklist, p);
821 if (ev >= 6)
822 decode(new_up_cluster, p);
823 if (ev >= 7)
824 decode(cluster_snapshot, p);
825 if (ev >= 8)
826 decode(new_uuid, p);
827 if (ev >= 9)
828 decode(new_xinfo, p);
829 if (ev >= 10)
830 decode(new_hb_front_up, p);
831 }
832
833 /* for a description of osdmap incremental versions, and when they were
834 * introduced, please refer to
835 * doc/dev/osd_internals/osdmap_versions.txt
836 */
837 void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
838 {
839 using ceph::decode;
840 /**
841 * Older encodings of the Incremental had a single struct_v which
842 * covered the whole encoding, and was prior to our modern
843 * stuff which includes a compatv and a size. So if we see
844 * a struct_v < 7, we must rewind to the beginning and use our
845 * classic decoder.
846 */
847 size_t start_offset = bl.get_off();
848 size_t tail_offset = 0;
849 ceph::buffer::list crc_front, crc_tail;
850
851 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
852 if (struct_v < 7) {
853 bl.seek(start_offset);
854 decode_classic(bl);
855 encode_features = 0;
856 if (struct_v >= 6)
857 encode_features = CEPH_FEATURE_PGID64;
858 else
859 encode_features = 0;
860 return;
861 }
862 {
863 DECODE_START(8, bl); // client-usable data
864 decode(fsid, bl);
865 decode(epoch, bl);
866 decode(modified, bl);
867 decode(new_pool_max, bl);
868 decode(new_flags, bl);
869 decode(fullmap, bl);
870 decode(crush, bl);
871
872 decode(new_max_osd, bl);
873 decode(new_pools, bl);
874 decode(new_pool_names, bl);
875 decode(old_pools, bl);
876 decode(new_up_client, bl);
877 if (struct_v >= 5) {
878 decode(new_state, bl);
879 } else {
880 map<int32_t,uint8_t> ns;
881 decode(ns, bl);
882 for (auto q : ns) {
883 new_state[q.first] = q.second;
884 }
885 }
886 decode(new_weight, bl);
887 decode(new_pg_temp, bl);
888 decode(new_primary_temp, bl);
889 if (struct_v >= 2)
890 decode(new_primary_affinity, bl);
891 else
892 new_primary_affinity.clear();
893 if (struct_v >= 3) {
894 decode(new_erasure_code_profiles, bl);
895 decode(old_erasure_code_profiles, bl);
896 } else {
897 new_erasure_code_profiles.clear();
898 old_erasure_code_profiles.clear();
899 }
900 if (struct_v >= 4) {
901 decode(new_pg_upmap, bl);
902 decode(old_pg_upmap, bl);
903 decode(new_pg_upmap_items, bl);
904 decode(old_pg_upmap_items, bl);
905 }
906 if (struct_v >= 6) {
907 decode(new_removed_snaps, bl);
908 decode(new_purged_snaps, bl);
909 }
910 if (struct_v >= 8) {
911 decode(new_last_up_change, bl);
912 decode(new_last_in_change, bl);
913 }
914 DECODE_FINISH(bl); // client-usable data
915 }
916
917 {
918 DECODE_START(10, bl); // extended, osd-only data
919 decode(new_hb_back_up, bl);
920 decode(new_up_thru, bl);
921 decode(new_last_clean_interval, bl);
922 decode(new_lost, bl);
923 decode(new_blocklist, bl);
924 decode(old_blocklist, bl);
925 decode(new_up_cluster, bl);
926 decode(cluster_snapshot, bl);
927 decode(new_uuid, bl);
928 decode(new_xinfo, bl);
929 decode(new_hb_front_up, bl);
930 if (struct_v >= 2)
931 decode(encode_features, bl);
932 else
933 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
934 if (struct_v >= 3) {
935 decode(new_nearfull_ratio, bl);
936 decode(new_full_ratio, bl);
937 } else {
938 new_nearfull_ratio = -1;
939 new_full_ratio = -1;
940 }
941 if (struct_v >= 4) {
942 decode(new_backfillfull_ratio, bl);
943 } else {
944 new_backfillfull_ratio = -1;
945 }
946 if (struct_v == 5) {
947 string r;
948 decode(r, bl);
949 if (r.length()) {
950 new_require_min_compat_client = ceph_release_from_name(r);
951 }
952 }
953 if (struct_v >= 6) {
954 decode(new_require_min_compat_client, bl);
955 decode(new_require_osd_release, bl);
956 } else {
957 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
958 // only for compat with post-kraken pre-luminous test clusters
959 new_require_osd_release = ceph_release_t::luminous;
960 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
961 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
962 new_require_osd_release = ceph_release_t::kraken;
963 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
964 new_require_osd_release = ceph_release_t::jewel;
965 } else {
966 new_require_osd_release = ceph_release_t::unknown;
967 }
968 }
969 if (struct_v >= 8) {
970 decode(new_crush_node_flags, bl);
971 }
972 if (struct_v >= 9) {
973 decode(new_device_class_flags, bl);
974 }
975 if (struct_v >= 10) {
976 decode(change_stretch_mode, bl);
977 decode(new_stretch_bucket_count, bl);
978 decode(new_degraded_stretch_mode, bl);
979 decode(new_recovering_stretch_mode, bl);
980 decode(new_stretch_mode_bucket, bl);
981 decode(stretch_mode_enabled, bl);
982 }
983
984 DECODE_FINISH(bl); // osd-only data
985 }
986
987 if (struct_v >= 8) {
988 have_crc = true;
989 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
990 decode(inc_crc, bl);
991 tail_offset = bl.get_off();
992 decode(full_crc, bl);
993 } else {
994 have_crc = false;
995 full_crc = 0;
996 inc_crc = 0;
997 }
998
999 DECODE_FINISH(bl); // wrapper
1000
1001 if (have_crc) {
1002 // verify crc
1003 uint32_t actual = crc_front.crc32c(-1);
1004 if (tail_offset < bl.get_off()) {
1005 ceph::buffer::list tail;
1006 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
1007 actual = tail.crc32c(actual);
1008 }
1009 if (inc_crc != actual) {
1010 ostringstream ss;
1011 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
1012 string s = ss.str();
1013 throw ceph::buffer::malformed_input(s.c_str());
1014 }
1015 }
1016 }
1017
1018 void OSDMap::Incremental::dump(Formatter *f) const
1019 {
1020 f->dump_int("epoch", epoch);
1021 f->dump_stream("fsid") << fsid;
1022 f->dump_stream("modified") << modified;
1023 f->dump_stream("new_last_up_change") << new_last_up_change;
1024 f->dump_stream("new_last_in_change") << new_last_in_change;
1025 f->dump_int("new_pool_max", new_pool_max);
1026 f->dump_int("new_flags", new_flags);
1027 f->dump_float("new_full_ratio", new_full_ratio);
1028 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1029 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
1030 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1031 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
1032
1033 if (fullmap.length()) {
1034 f->open_object_section("full_map");
1035 OSDMap full;
1036 ceph::buffer::list fbl = fullmap; // kludge around constness.
1037 auto p = fbl.cbegin();
1038 full.decode(p);
1039 full.dump(f);
1040 f->close_section();
1041 }
1042 if (crush.length()) {
1043 f->open_object_section("crush");
1044 CrushWrapper c;
1045 ceph::buffer::list tbl = crush; // kludge around constness.
1046 auto p = tbl.cbegin();
1047 c.decode(p);
1048 c.dump(f);
1049 f->close_section();
1050 }
1051
1052 f->dump_int("new_max_osd", new_max_osd);
1053
1054 f->open_array_section("new_pools");
1055
1056 for (const auto &new_pool : new_pools) {
1057 f->open_object_section("pool");
1058 f->dump_int("pool", new_pool.first);
1059 new_pool.second.dump(f);
1060 f->close_section();
1061 }
1062 f->close_section();
1063 f->open_array_section("new_pool_names");
1064
1065 for (const auto &new_pool_name : new_pool_names) {
1066 f->open_object_section("pool_name");
1067 f->dump_int("pool", new_pool_name.first);
1068 f->dump_string("name", new_pool_name.second);
1069 f->close_section();
1070 }
1071 f->close_section();
1072 f->open_array_section("old_pools");
1073
1074 for (const auto &old_pool : old_pools)
1075 f->dump_int("pool", old_pool);
1076 f->close_section();
1077
1078 f->open_array_section("new_up_osds");
1079
1080 for (const auto &upclient : new_up_client) {
1081 f->open_object_section("osd");
1082 f->dump_int("osd", upclient.first);
1083 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1084 f->dump_object("public_addrs", upclient.second);
1085 if (auto p = new_up_cluster.find(upclient.first);
1086 p != new_up_cluster.end()) {
1087 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1088 f->dump_object("cluster_addrs", p->second);
1089 }
1090 if (auto p = new_hb_back_up.find(upclient.first);
1091 p != new_hb_back_up.end()) {
1092 f->dump_object("heartbeat_back_addrs", p->second);
1093 }
1094 if (auto p = new_hb_front_up.find(upclient.first);
1095 p != new_hb_front_up.end()) {
1096 f->dump_object("heartbeat_front_addrs", p->second);
1097 }
1098 f->close_section();
1099 }
1100 f->close_section();
1101
1102 f->open_array_section("new_weight");
1103
1104 for (const auto &weight : new_weight) {
1105 f->open_object_section("osd");
1106 f->dump_int("osd", weight.first);
1107 f->dump_int("weight", weight.second);
1108 f->close_section();
1109 }
1110 f->close_section();
1111
1112 f->open_array_section("osd_state_xor");
1113 for (const auto &ns : new_state) {
1114 f->open_object_section("osd");
1115 f->dump_int("osd", ns.first);
1116 set<string> st;
1117 calc_state_set(new_state.find(ns.first)->second, st);
1118 f->open_array_section("state_xor");
1119 for (auto &state : st)
1120 f->dump_string("state", state);
1121 f->close_section();
1122 f->close_section();
1123 }
1124 f->close_section();
1125
1126 f->open_array_section("new_pg_temp");
1127
1128 for (const auto &pg_temp : new_pg_temp) {
1129 f->open_object_section("pg");
1130 f->dump_stream("pgid") << pg_temp.first;
1131 f->open_array_section("osds");
1132
1133 for (const auto &osd : pg_temp.second)
1134 f->dump_int("osd", osd);
1135 f->close_section();
1136 f->close_section();
1137 }
1138 f->close_section();
1139
1140 f->open_array_section("primary_temp");
1141
1142 for (const auto &primary_temp : new_primary_temp) {
1143 f->dump_stream("pgid") << primary_temp.first;
1144 f->dump_int("osd", primary_temp.second);
1145 }
1146 f->close_section(); // primary_temp
1147
1148 f->open_array_section("new_pg_upmap");
1149 for (auto& i : new_pg_upmap) {
1150 f->open_object_section("mapping");
1151 f->dump_stream("pgid") << i.first;
1152 f->open_array_section("osds");
1153 for (auto osd : i.second) {
1154 f->dump_int("osd", osd);
1155 }
1156 f->close_section();
1157 f->close_section();
1158 }
1159 f->close_section();
1160 f->open_array_section("old_pg_upmap");
1161 for (auto& i : old_pg_upmap) {
1162 f->dump_stream("pgid") << i;
1163 }
1164 f->close_section();
1165
1166 f->open_array_section("new_pg_upmap_items");
1167 for (auto& i : new_pg_upmap_items) {
1168 f->open_object_section("mapping");
1169 f->dump_stream("pgid") << i.first;
1170 f->open_array_section("mappings");
1171 for (auto& p : i.second) {
1172 f->open_object_section("mapping");
1173 f->dump_int("from", p.first);
1174 f->dump_int("to", p.second);
1175 f->close_section();
1176 }
1177 f->close_section();
1178 f->close_section();
1179 }
1180 f->close_section();
1181 f->open_array_section("old_pg_upmap_items");
1182 for (auto& i : old_pg_upmap_items) {
1183 f->dump_stream("pgid") << i;
1184 }
1185 f->close_section();
1186
1187 f->open_array_section("new_up_thru");
1188
1189 for (const auto &up_thru : new_up_thru) {
1190 f->open_object_section("osd");
1191 f->dump_int("osd", up_thru.first);
1192 f->dump_int("up_thru", up_thru.second);
1193 f->close_section();
1194 }
1195 f->close_section();
1196
1197 f->open_array_section("new_lost");
1198
1199 for (const auto &lost : new_lost) {
1200 f->open_object_section("osd");
1201 f->dump_int("osd", lost.first);
1202 f->dump_int("epoch_lost", lost.second);
1203 f->close_section();
1204 }
1205 f->close_section();
1206
1207 f->open_array_section("new_last_clean_interval");
1208
1209 for (const auto &last_clean_interval : new_last_clean_interval) {
1210 f->open_object_section("osd");
1211 f->dump_int("osd", last_clean_interval.first);
1212 f->dump_int("first", last_clean_interval.second.first);
1213 f->dump_int("last", last_clean_interval.second.second);
1214 f->close_section();
1215 }
1216 f->close_section();
1217
1218 f->open_array_section("new_blocklist");
1219 for (const auto &blist : new_blocklist) {
1220 stringstream ss;
1221 ss << blist.first;
1222 f->dump_stream(ss.str().c_str()) << blist.second;
1223 }
1224 f->close_section();
1225 f->open_array_section("old_blocklist");
1226 for (const auto &blist : old_blocklist)
1227 f->dump_stream("addr") << blist;
1228 f->close_section();
1229
1230 f->open_array_section("new_xinfo");
1231 for (const auto &xinfo : new_xinfo) {
1232 f->open_object_section("xinfo");
1233 f->dump_int("osd", xinfo.first);
1234 xinfo.second.dump(f);
1235 f->close_section();
1236 }
1237 f->close_section();
1238
1239 if (cluster_snapshot.size())
1240 f->dump_string("cluster_snapshot", cluster_snapshot);
1241
1242 f->open_array_section("new_uuid");
1243 for (const auto &uuid : new_uuid) {
1244 f->open_object_section("osd");
1245 f->dump_int("osd", uuid.first);
1246 f->dump_stream("uuid") << uuid.second;
1247 f->close_section();
1248 }
1249 f->close_section();
1250
1251 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1252 f->open_array_section("old_erasure_code_profiles");
1253 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1254 f->dump_string("old", erasure_code_profile);
1255 }
1256 f->close_section();
1257
1258 f->open_array_section("new_removed_snaps");
1259 for (auto& p : new_removed_snaps) {
1260 f->open_object_section("pool");
1261 f->dump_int("pool", p.first);
1262 f->open_array_section("snaps");
1263 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1264 f->open_object_section("interval");
1265 f->dump_unsigned("begin", q.get_start());
1266 f->dump_unsigned("length", q.get_len());
1267 f->close_section();
1268 }
1269 f->close_section();
1270 f->close_section();
1271 }
1272 f->close_section();
1273 f->open_array_section("new_purged_snaps");
1274 for (auto& p : new_purged_snaps) {
1275 f->open_object_section("pool");
1276 f->dump_int("pool", p.first);
1277 f->open_array_section("snaps");
1278 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1279 f->open_object_section("interval");
1280 f->dump_unsigned("begin", q.get_start());
1281 f->dump_unsigned("length", q.get_len());
1282 f->close_section();
1283 }
1284 f->close_section();
1285 f->close_section();
1286 }
1287 f->open_array_section("new_crush_node_flags");
1288 for (auto& i : new_crush_node_flags) {
1289 f->open_object_section("node");
1290 f->dump_int("id", i.first);
1291 set<string> st;
1292 calc_state_set(i.second, st);
1293 for (auto& j : st) {
1294 f->dump_string("flag", j);
1295 }
1296 f->close_section();
1297 }
1298 f->close_section();
1299 f->open_array_section("new_device_class_flags");
1300 for (auto& i : new_device_class_flags) {
1301 f->open_object_section("device_class");
1302 f->dump_int("id", i.first);
1303 set<string> st;
1304 calc_state_set(i.second, st);
1305 for (auto& j : st) {
1306 f->dump_string("flag", j);
1307 }
1308 f->close_section();
1309 }
1310 f->close_section();
1311 f->open_object_section("stretch_mode");
1312 {
1313 f->dump_bool("change_stretch_mode", change_stretch_mode);
1314 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1315 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1316 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1317 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1318 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1319 }
1320 f->close_section();
1321 f->close_section();
1322 }
1323
1324 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1325 {
1326 o.push_back(new Incremental);
1327 }
1328
1329 // ----------------------------------
1330 // OSDMap
1331
1332 void OSDMap::set_epoch(epoch_t e)
1333 {
1334 epoch = e;
1335 for (auto &pool : pools)
1336 pool.second.last_change = e;
1337 }
1338
1339 bool OSDMap::is_blocklisted(const entity_addr_t& orig) const
1340 {
1341 if (blocklist.empty()) {
1342 return false;
1343 }
1344
1345 // all blocklist entries are type ANY for nautilus+
1346 // FIXME: avoid this copy!
1347 entity_addr_t a = orig;
1348 if (require_osd_release < ceph_release_t::nautilus) {
1349 a.set_type(entity_addr_t::TYPE_LEGACY);
1350 } else {
1351 a.set_type(entity_addr_t::TYPE_ANY);
1352 }
1353
1354 // this specific instance?
1355 if (blocklist.count(a)) {
1356 return true;
1357 }
1358
1359 // is entire ip blocklisted?
1360 if (a.is_ip()) {
1361 a.set_port(0);
1362 a.set_nonce(0);
1363 if (blocklist.count(a)) {
1364 return true;
1365 }
1366 }
1367
1368 return false;
1369 }
1370
1371 bool OSDMap::is_blocklisted(const entity_addrvec_t& av) const
1372 {
1373 if (blocklist.empty())
1374 return false;
1375
1376 for (auto& a : av.v) {
1377 if (is_blocklisted(a)) {
1378 return true;
1379 }
1380 }
1381
1382 return false;
1383 }
1384
1385 void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl) const
1386 {
1387 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
1388 }
1389
1390 void OSDMap::get_blocklist(std::set<entity_addr_t> *bl) const
1391 {
1392 for (const auto &i : blocklist) {
1393 bl->insert(i.first);
1394 }
1395 }
1396
1397 void OSDMap::set_max_osd(int m)
1398 {
1399 max_osd = m;
1400 osd_state.resize(max_osd, 0);
1401 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1402 osd_info.resize(max_osd);
1403 osd_xinfo.resize(max_osd);
1404 osd_addrs->client_addrs.resize(max_osd);
1405 osd_addrs->cluster_addrs.resize(max_osd);
1406 osd_addrs->hb_back_addrs.resize(max_osd);
1407 osd_addrs->hb_front_addrs.resize(max_osd);
1408 osd_uuid->resize(max_osd);
1409 if (osd_primary_affinity)
1410 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1411
1412 calc_num_osds();
1413 }
1414
1415 int OSDMap::calc_num_osds()
1416 {
1417 num_osd = 0;
1418 num_up_osd = 0;
1419 num_in_osd = 0;
1420 for (int i=0; i<max_osd; i++) {
1421 if (osd_state[i] & CEPH_OSD_EXISTS) {
1422 ++num_osd;
1423 if (osd_state[i] & CEPH_OSD_UP) {
1424 ++num_up_osd;
1425 }
1426 if (get_weight(i) != CEPH_OSD_OUT) {
1427 ++num_in_osd;
1428 }
1429 }
1430 }
1431 return num_osd;
1432 }
1433
1434 void OSDMap::get_full_pools(CephContext *cct,
1435 set<int64_t> *full,
1436 set<int64_t> *backfillfull,
1437 set<int64_t> *nearfull) const
1438 {
1439 ceph_assert(full);
1440 ceph_assert(backfillfull);
1441 ceph_assert(nearfull);
1442 full->clear();
1443 backfillfull->clear();
1444 nearfull->clear();
1445
1446 vector<int> full_osds;
1447 vector<int> backfillfull_osds;
1448 vector<int> nearfull_osds;
1449 for (int i = 0; i < max_osd; ++i) {
1450 if (exists(i) && is_up(i) && is_in(i)) {
1451 if (osd_state[i] & CEPH_OSD_FULL)
1452 full_osds.push_back(i);
1453 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1454 backfillfull_osds.push_back(i);
1455 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1456 nearfull_osds.push_back(i);
1457 }
1458 }
1459
1460 for (auto i: full_osds) {
1461 get_pool_ids_by_osd(cct, i, full);
1462 }
1463 for (auto i: backfillfull_osds) {
1464 get_pool_ids_by_osd(cct, i, backfillfull);
1465 }
1466 for (auto i: nearfull_osds) {
1467 get_pool_ids_by_osd(cct, i, nearfull);
1468 }
1469 }
1470
1471 void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1472 set<int> *nearfull) const
1473 {
1474 full->clear();
1475 backfill->clear();
1476 nearfull->clear();
1477 for (int i = 0; i < max_osd; ++i) {
1478 if (exists(i) && is_up(i) && is_in(i)) {
1479 if (osd_state[i] & CEPH_OSD_FULL)
1480 full->emplace(i);
1481 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1482 backfill->emplace(i);
1483 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1484 nearfull->emplace(i);
1485 }
1486 }
1487 }
1488
1489 void OSDMap::get_all_osds(set<int32_t>& ls) const
1490 {
1491 for (int i=0; i<max_osd; i++)
1492 if (exists(i))
1493 ls.insert(i);
1494 }
1495
1496 void OSDMap::get_up_osds(set<int32_t>& ls) const
1497 {
1498 for (int i = 0; i < max_osd; i++) {
1499 if (is_up(i))
1500 ls.insert(i);
1501 }
1502 }
1503
1504 void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
1505 {
1506 for (int i = 0; i < max_osd; i++) {
1507 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
1508 ls.insert(i);
1509 }
1510 }
1511
1512 void OSDMap::get_flag_set(set<string> *flagset) const
1513 {
1514 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1515 if (flags & (1<<i)) {
1516 flagset->insert(get_flag_string(flags & (1<<i)));
1517 }
1518 }
1519 }
1520
1521 void OSDMap::calc_state_set(int state, set<string>& st)
1522 {
1523 unsigned t = state;
1524 for (unsigned s = 1; t; s <<= 1) {
1525 if (t & s) {
1526 t &= ~s;
1527 st.insert(ceph_osd_state_name(s));
1528 }
1529 }
1530 }
1531
1532 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1533 {
1534 float max = 0;
1535 for (const auto &weight : weights) {
1536 if (weight.second > max)
1537 max = weight.second;
1538 }
1539
1540 for (const auto &weight : weights) {
1541 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1542 }
1543 }
1544
1545 int OSDMap::identify_osd(const entity_addr_t& addr) const
1546 {
1547 for (int i=0; i<max_osd; i++)
1548 if (exists(i) && (get_addrs(i).contains(addr) ||
1549 get_cluster_addrs(i).contains(addr)))
1550 return i;
1551 return -1;
1552 }
1553
1554 int OSDMap::identify_osd(const uuid_d& u) const
1555 {
1556 for (int i=0; i<max_osd; i++)
1557 if (exists(i) && get_uuid(i) == u)
1558 return i;
1559 return -1;
1560 }
1561
1562 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1563 {
1564 for (int i=0; i<max_osd; i++)
1565 if (exists(i) && (get_addrs(i).contains(addr) ||
1566 get_cluster_addrs(i).contains(addr) ||
1567 get_hb_back_addrs(i).contains(addr) ||
1568 get_hb_front_addrs(i).contains(addr)))
1569 return i;
1570 return -1;
1571 }
1572
1573 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1574 {
1575 for (int i=0; i<max_osd; i++)
1576 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1577 get_cluster_addrs(i).is_same_host(ip)))
1578 return i;
1579 return -1;
1580 }
1581
1582
/**
 * Compute the feature bits implied by the current contents of this map.
 *
 * @param entity_type  CEPH_ENTITY_TYPE_* of the peer being considered; OSD
 *                     peers additionally get the release and stretch-mode
 *                     bits handled below
 * @param pmask        if non-null, receives the mask of feature bits this
 *                     function examined ("things we could have")
 * @return the feature bits that are actually in use ("things we actually
 *         have")
 */
uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
{
  uint64_t features = 0;  // things we actually have
  uint64_t mask = 0;      // things we could have

  // crush map: tunables, bucket types and choose_args
  if (crush->has_nondefault_tunables())
    features |= CEPH_FEATURE_CRUSH_TUNABLES;
  if (crush->has_nondefault_tunables2())
    features |= CEPH_FEATURE_CRUSH_TUNABLES2;
  if (crush->has_nondefault_tunables3())
    features |= CEPH_FEATURE_CRUSH_TUNABLES3;
  if (crush->has_v4_buckets())
    features |= CEPH_FEATURE_CRUSH_V4;
  if (crush->has_nondefault_tunables5())
    features |= CEPH_FEATURE_CRUSH_TUNABLES5;
  if (crush->has_incompat_choose_args()) {
    features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
  }
  mask |= CEPH_FEATURES_CRUSH;

  // any pg_upmap or pg_upmap_items entry requires upmap support
  if (!pg_upmap.empty() || !pg_upmap_items.empty())
    features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
  mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;

  // per-pool requirements: hashpspool flag, cache tiering, and the crush
  // rule the pool maps to
  for (auto &pool: pools) {
    if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
      features |= CEPH_FEATURE_OSDHASHPSPOOL;
    }
    if (!pool.second.tiers.empty() ||
        pool.second.is_tier()) {
      features |= CEPH_FEATURE_OSD_CACHEPOOL;
    }
    int ruleid = crush->find_rule(pool.second.get_crush_rule(),
                                  pool.second.get_type(),
                                  pool.second.get_size());
    // a negative ruleid means no matching rule was found; skip the rule checks
    if (ruleid >= 0) {
      if (crush->is_v2_rule(ruleid))
        features |= CEPH_FEATURE_CRUSH_V2;
      if (crush->is_v3_rule(ruleid))
        features |= CEPH_FEATURE_CRUSH_TUNABLES3;
      if (crush->is_v5_rule(ruleid))
        features |= CEPH_FEATURE_CRUSH_TUNABLES5;
    }
  }
  mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;

  // primary affinity is only required once some osd has a non-default value
  if (osd_primary_affinity) {
    for (int i = 0; i < max_osd; ++i) {
      if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
        features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
        break;
      }
    }
  }
  mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;

  // bits that only apply when the peer is an OSD
  if (entity_type == CEPH_ENTITY_TYPE_OSD) {
    const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
    if (require_osd_release >= ceph_release_t::jewel) {
      features |= jewel_features;
    }
    mask |= jewel_features;

    const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
      | CEPH_FEATURE_MSG_ADDR2;
    if (require_osd_release >= ceph_release_t::kraken) {
      features |= kraken_features;
    }
    mask |= kraken_features;

    // note: unlike the other bits, the stretch-mode bit is added to the
    // mask only when stretch mode is enabled
    if (stretch_mode_enabled) {
      features |= CEPH_FEATUREMASK_STRETCH_MODE;
      mask |= CEPH_FEATUREMASK_STRETCH_MODE;
    }
  }

  if (require_min_compat_client >= ceph_release_t::nautilus) {
    // if min_compat_client is >= nautilus, require v2 cephx signatures
    // from everyone
    features |= CEPH_FEATUREMASK_CEPHX_V2;
  } else if (require_osd_release >= ceph_release_t::nautilus &&
             entity_type == CEPH_ENTITY_TYPE_OSD) {
    // if osds are >= nautilus, at least require the signatures from them
    features |= CEPH_FEATUREMASK_CEPHX_V2;
  }
  mask |= CEPH_FEATUREMASK_CEPHX_V2;

  if (pmask)
    *pmask = mask;
  return features;
}
1674
1675 ceph_release_t OSDMap::get_min_compat_client() const
1676 {
1677 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1678
1679 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1680 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1681 return ceph_release_t::luminous; // v12.2.0
1682 }
1683 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1684 return ceph_release_t::jewel; // v10.2.0
1685 }
1686 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1687 return ceph_release_t::hammer; // v0.94.0
1688 }
1689 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1690 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1691 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1692 return ceph_release_t::firefly; // v0.80.0
1693 }
1694 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1695 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1696 return ceph_release_t::dumpling; // v0.67.0
1697 }
1698 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1699 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
1700 }
1701 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
1702 }
1703
1704 ceph_release_t OSDMap::get_require_min_compat_client() const
1705 {
1706 return require_min_compat_client;
1707 }
1708
1709 void OSDMap::_calc_up_osd_features()
1710 {
1711 bool first = true;
1712 cached_up_osd_features = 0;
1713 for (int osd = 0; osd < max_osd; ++osd) {
1714 if (!is_up(osd))
1715 continue;
1716 const osd_xinfo_t &xi = get_xinfo(osd);
1717 if (xi.features == 0)
1718 continue; // bogus xinfo, maybe #20751 or similar, skipping
1719 if (first) {
1720 cached_up_osd_features = xi.features;
1721 first = false;
1722 } else {
1723 cached_up_osd_features &= xi.features;
1724 }
1725 }
1726 }
1727
1728 uint64_t OSDMap::get_up_osd_features() const
1729 {
1730 return cached_up_osd_features;
1731 }
1732
/**
 * Make map @p n share sub-structures with map @p o wherever they are equal.
 *
 * For each component compared below (per-osd address vectors, crush map,
 * pg_temp, primary_temp, osd uuids), if n's copy compares equal to o's,
 * n's handle is re-pointed at o's object.  Nothing is done when both maps
 * have the same epoch.
 *
 * @param o  the existing map to share from (not modified)
 * @param n  the map whose handles may be replaced
 */
void OSDMap::dedup(const OSDMap *o, OSDMap *n)
{
  using ceph::encode;
  if (o->epoch == n->epoch)
    return;

  // counts every address slot that could not be shared (plus a max_osd
  // mismatch); only if it stays 0 can the whole osd_addrs be shared
  int diff = 0;

  // do addrs match?
  if (o->max_osd != n->max_osd)
    diff++;
  // only the slots present in both maps are compared
  for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
    if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
         *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
      n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
    else
      diff++;
    if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
         *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
      n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
    else
      diff++;
    if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
         *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
      n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
    else
      diff++;
    if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
         *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
      n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
    else
      diff++;
  }
  if (diff == 0) {
    // zoinks, no differences at all!
    n->osd_addrs = o->osd_addrs;
  }

  // does crush match?
  // the crush maps are compared by their encoded bytes
  ceph::buffer::list oc, nc;
  encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
  encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
  if (oc.contents_equal(nc)) {
    n->crush = o->crush;
  }

  // does pg_temp match?
  if (*o->pg_temp == *n->pg_temp)
    n->pg_temp = o->pg_temp;

  // does primary_temp match?
  // size is checked first, before the full comparison
  if (o->primary_temp->size() == n->primary_temp->size()) {
    if (*o->primary_temp == *n->primary_temp)
      n->primary_temp = o->primary_temp;
  }

  // do uuids match?
  if (o->osd_uuid->size() == n->osd_uuid->size() &&
      *o->osd_uuid == *n->osd_uuid)
    n->osd_uuid = o->osd_uuid;
}
1794
/**
 * Queue the removal of pg_temp and primary_temp entries that are no longer
 * useful.
 *
 * Entries are examined in @p nextmap; removals are recorded in
 * @p pending_inc.  A pg_temp removal is recorded as an empty vector in
 * new_pg_temp; a primary_temp removal is recorded as -1 in new_primary_temp
 * (presumably interpreted as "remove" when the incremental is applied --
 * confirm against the apply code).
 *
 * @param cct          context used for logging
 * @param oldmap       map used to tell whether an entry already exists or
 *                     is only being added by @p pending_inc
 * @param nextmap      map whose temp entries are inspected
 * @param pending_inc  incremental that receives the removals
 */
void OSDMap::clean_temps(CephContext *cct,
                         const OSDMap& oldmap,
                         const OSDMap& nextmap,
                         Incremental *pending_inc)
{
  ldout(cct, 10) << __func__ << dendl;

  for (auto pg : *nextmap.pg_temp) {
    // if pool does not exist, remove any existing pg_temps associated with
    // it.  we don't care about pg_temps on the pending_inc either; if there
    // are new_pg_temp entries on the pending, clear them out just as well.
    if (!nextmap.have_pg_pool(pg.first.pool())) {
      ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
                     << " for nonexistent pool " << pg.first.pool() << dendl;
      pending_inc->new_pg_temp[pg.first].clear();
      continue;
    }
    // all osds down?
    // (num_up only needs to distinguish zero from non-zero, hence the break)
    unsigned num_up = 0;
    for (auto o : pg.second) {
      if (!nextmap.is_down(o)) {
        ++num_up;
        break;
      }
    }
    if (num_up == 0) {
      ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
                     << " with all down osds" << pg.second << dendl;
      pending_inc->new_pg_temp[pg.first].clear();
      continue;
    }
    // redundant pg_temp?
    vector<int> raw_up;
    int primary;
    nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
    bool remove = false;
    if (raw_up == pg.second) {
      ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
                     << pg.second << " that matches raw_up mapping" << dendl;
      remove = true;
    }
    // oversized pg_temp?
    if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
      ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
                     << pg.second << " exceeds pool size" << dendl;
      remove = true;
    }
    if (remove) {
      // if oldmap has the entry, record an explicit removal; otherwise it
      // only exists as a pending addition, so just drop that addition
      if (oldmap.pg_temp->count(pg.first))
        pending_inc->new_pg_temp[pg.first].clear();
      else
        pending_inc->new_pg_temp.erase(pg.first);
    }
  }

  for (auto &pg : *nextmap.primary_temp) {
    // primary down?
    if (nextmap.is_down(pg.second)) {
      ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
                     << " to down " << pg.second << dendl;
      pending_inc->new_primary_temp[pg.first] = -1;
      continue;
    }
    // redundant primary_temp?
    // compare the acting primary with the primary from the raw up mapping
    vector<int> real_up, templess_up;
    int real_primary, templess_primary;
    pg_t pgid = pg.first;
    nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
    nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
    if (real_primary == templess_primary){
      ldout(cct, 10) << __func__ << " removing primary_temp "
                     << pgid << " -> " << real_primary
                     << " (unnecessary/redundant)" << dendl;
      // same old-vs-pending distinction as for pg_temp above
      if (oldmap.primary_temp->count(pgid))
        pending_inc->new_primary_temp[pgid] = -1;
      else
        pending_inc->new_primary_temp.erase(pgid);
    }
  }
}
1875
1876 void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
1877 {
1878 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1879 for (auto& p : pg_upmap)
1880 upmap_pgs->push_back(p.first);
1881 for (auto& p : pg_upmap_items)
1882 upmap_pgs->push_back(p.first);
1883 }
1884
/**
 * Validate the upmap entries of the given pgs.
 *
 * @param cct        context used for logging
 * @param to_check   pgs whose pg_upmap / pg_upmap_items entries to examine
 * @param to_cancel  receives (appended) the pgs whose upmap entries should
 *                   be dropped
 * @param to_remap   receives simplified pg_upmap_items for pgs where only
 *                   some of the item pairs are still useful
 * @return true if an entry was added to @p to_remap or if @p to_cancel is
 *         non-empty on return (including entries it held before the call)
 */
bool OSDMap::check_pg_upmaps(
  CephContext *cct,
  const vector<pg_t>& to_check,
  vector<pg_t> *to_cancel,
  map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
{
  bool any_change = false;
  // cache of crush_rule -> (osd -> weight) so each rule is computed once
  map<int, map<int, float>> rule_weight_map;
  for (auto& pg : to_check) {
    const pg_pool_t *pi = get_pg_pool(pg.pool());
    if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
      ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
                    << dendl;
      to_cancel->push_back(pg);
      continue;
    }
    if (pi->is_pending_merge(pg, nullptr)) {
      ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
                    << dendl;
      to_cancel->push_back(pg);
      continue;
    }
    vector<int> raw, up;
    pg_to_raw_upmap(pg, &raw, &up);
    auto crush_rule = get_pg_pool_crush_rule(pg);
    auto r = crush->verify_upmap(cct,
                                 crush_rule,
                                 get_pg_pool_size(pg),
                                 up);
    if (r < 0) {
      ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
                    << " returning " << r
                    << dendl;
      to_cancel->push_back(pg);
      continue;
    }
    // below we check against crush-topology changing..
    map<int, float> weight_map;
    auto it = rule_weight_map.find(crush_rule);
    if (it == rule_weight_map.end()) {
      auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
      if (r < 0) {
        lderr(cct) << __func__ << " unable to get crush weight_map for "
                   << "crush_rule " << crush_rule
                   << dendl;
        // the pg is skipped without being cancelled
        continue;
      }
      rule_weight_map[crush_rule] = weight_map;
    } else {
      weight_map = it->second;
    }
    ldout(cct, 10) << __func__ << " pg " << pg
                   << " weight_map " << weight_map
                   << dendl;
    for (auto osd : up) {
      auto it = weight_map.find(osd);
      if (it == weight_map.end()) {
        ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
                       << "been moved out of the specific crush-tree"
                       << dendl;
        to_cancel->push_back(pg);
        break;
      }
      // product of the osd's map weight and its weight under the rule
      auto adjusted_weight = get_weightf(it->first) * it->second;
      if (adjusted_weight == 0) {
        ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
                       << " is out/crush-out"
                       << dendl;
        to_cancel->push_back(pg);
        break;
      }
    }
    // the loop above signals a cancel by pushing pg as the last element
    if (!to_cancel->empty() && to_cancel->back() == pg)
      continue;
    // okay, upmap is valid
    // continue to check if it is still necessary
    auto i = pg_upmap.find(pg);
    if (i != pg_upmap.end()) {
      if (i->second == raw) {
        ldout(cct, 10) << "removing redundant pg_upmap " << i->first << " "
                       << i->second << dendl;
        to_cancel->push_back(pg);
        continue;
      }
      if ((int)i->second.size() != get_pg_pool_size(pg)) {
        ldout(cct, 10) << "removing pg_upmap " << i->first << " "
                       << i->second << " != pool size " << get_pg_pool_size(pg)
                       << dendl;
        to_cancel->push_back(pg);
        continue;
      }
    }
    auto j = pg_upmap_items.find(pg);
    if (j != pg_upmap_items.end()) {
      // rebuild the item list keeping only the pairs that still apply
      mempool::osdmap::vector<pair<int,int>> newmap;
      for (auto& p : j->second) {
        if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
          // cancel mapping if source osd does not exist anymore
          continue;
        }
        if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
            p.second >= 0 && osd_weight[p.second] == 0) {
          // cancel mapping if target osd is out
          continue;
        }
        newmap.push_back(p);
      }
      if (newmap.empty()) {
        ldout(cct, 10) << " removing no-op pg_upmap_items "
                       << j->first << " " << j->second
                       << dendl;
        to_cancel->push_back(pg);
      } else if (newmap != j->second) {
        ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
                       << j->first << " " << j->second
                       << " -> " << newmap
                       << dendl;
        to_remap->insert({pg, newmap});
        any_change = true;
      }
    }
  }
  any_change = any_change || !to_cancel->empty();
  return any_change;
}
2010
2011 void OSDMap::clean_pg_upmaps(
2012 CephContext *cct,
2013 Incremental *pending_inc,
2014 const vector<pg_t>& to_cancel,
2015 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2016 {
2017 for (auto &pg: to_cancel) {
2018 auto i = pending_inc->new_pg_upmap.find(pg);
2019 if (i != pending_inc->new_pg_upmap.end()) {
2020 ldout(cct, 10) << __func__ << " cancel invalid pending "
2021 << "pg_upmap entry "
2022 << i->first << "->" << i->second
2023 << dendl;
2024 pending_inc->new_pg_upmap.erase(i);
2025 }
2026 auto j = pg_upmap.find(pg);
2027 if (j != pg_upmap.end()) {
2028 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2029 << j->first << "->" << j->second
2030 << dendl;
2031 pending_inc->old_pg_upmap.insert(pg);
2032 }
2033 auto p = pending_inc->new_pg_upmap_items.find(pg);
2034 if (p != pending_inc->new_pg_upmap_items.end()) {
2035 ldout(cct, 10) << __func__ << " cancel invalid pending "
2036 << "pg_upmap_items entry "
2037 << p->first << "->" << p->second
2038 << dendl;
2039 pending_inc->new_pg_upmap_items.erase(p);
2040 }
2041 auto q = pg_upmap_items.find(pg);
2042 if (q != pg_upmap_items.end()) {
2043 ldout(cct, 10) << __func__ << " cancel invalid "
2044 << "pg_upmap_items entry "
2045 << q->first << "->" << q->second
2046 << dendl;
2047 pending_inc->old_pg_upmap_items.insert(pg);
2048 }
2049 }
2050 for (auto& i : to_remap)
2051 pending_inc->new_pg_upmap_items[i.first] = i.second;
2052 }
2053
2054 bool OSDMap::clean_pg_upmaps(
2055 CephContext *cct,
2056 Incremental *pending_inc) const
2057 {
2058 ldout(cct, 10) << __func__ << dendl;
2059 vector<pg_t> to_check;
2060 vector<pg_t> to_cancel;
2061 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2062
2063 get_upmap_pgs(&to_check);
2064 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2065 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2066 return any_change;
2067 }
2068
2069 int OSDMap::apply_incremental(const Incremental &inc)
2070 {
2071 new_blocklist_entries = false;
2072 if (inc.epoch == 1)
2073 fsid = inc.fsid;
2074 else if (inc.fsid != fsid)
2075 return -EINVAL;
2076
2077 ceph_assert(inc.epoch == epoch+1);
2078
2079 epoch++;
2080 modified = inc.modified;
2081
2082 // full map?
2083 if (inc.fullmap.length()) {
2084 ceph::buffer::list bl(inc.fullmap);
2085 decode(bl);
2086 return 0;
2087 }
2088
2089 // nope, incremental.
2090 if (inc.new_flags >= 0) {
2091 flags = inc.new_flags;
2092 // the below is just to cover a newly-upgraded luminous mon
2093 // cluster that has to set require_jewel_osds or
2094 // require_kraken_osds before the osds can be upgraded to
2095 // luminous.
2096 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2097 if (require_osd_release < ceph_release_t::kraken) {
2098 require_osd_release = ceph_release_t::kraken;
2099 }
2100 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2101 if (require_osd_release < ceph_release_t::jewel) {
2102 require_osd_release = ceph_release_t::jewel;
2103 }
2104 }
2105 }
2106
2107 if (inc.new_max_osd >= 0)
2108 set_max_osd(inc.new_max_osd);
2109
2110 if (inc.new_pool_max != -1)
2111 pool_max = inc.new_pool_max;
2112
2113 for (const auto &pool : inc.new_pools) {
2114 pools[pool.first] = pool.second;
2115 pools[pool.first].last_change = epoch;
2116 }
2117
2118 new_removed_snaps = inc.new_removed_snaps;
2119 new_purged_snaps = inc.new_purged_snaps;
2120 for (auto p = new_removed_snaps.begin();
2121 p != new_removed_snaps.end();
2122 ++p) {
2123 removed_snaps_queue[p->first].union_of(p->second);
2124 }
2125 for (auto p = new_purged_snaps.begin();
2126 p != new_purged_snaps.end();
2127 ++p) {
2128 auto q = removed_snaps_queue.find(p->first);
2129 ceph_assert(q != removed_snaps_queue.end());
2130 q->second.subtract(p->second);
2131 if (q->second.empty()) {
2132 removed_snaps_queue.erase(q);
2133 }
2134 }
2135
2136 if (inc.new_last_up_change != utime_t()) {
2137 last_up_change = inc.new_last_up_change;
2138 }
2139 if (inc.new_last_in_change != utime_t()) {
2140 last_in_change = inc.new_last_in_change;
2141 }
2142
2143 for (const auto &pname : inc.new_pool_names) {
2144 auto pool_name_entry = pool_name.find(pname.first);
2145 if (pool_name_entry != pool_name.end()) {
2146 name_pool.erase(pool_name_entry->second);
2147 pool_name_entry->second = pname.second;
2148 } else {
2149 pool_name[pname.first] = pname.second;
2150 }
2151 name_pool[pname.second] = pname.first;
2152 }
2153
2154 for (const auto &pool : inc.old_pools) {
2155 pools.erase(pool);
2156 name_pool.erase(pool_name[pool]);
2157 pool_name.erase(pool);
2158 }
2159
2160 for (const auto &weight : inc.new_weight) {
2161 set_weight(weight.first, weight.second);
2162
2163 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2164 // xinfo old_weight.
2165 if (weight.second) {
2166 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2167 osd_xinfo[weight.first].old_weight = 0;
2168 }
2169 }
2170
2171 for (const auto &primary_affinity : inc.new_primary_affinity) {
2172 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2173 }
2174
2175 // erasure_code_profiles
2176 for (const auto &profile : inc.old_erasure_code_profiles)
2177 erasure_code_profiles.erase(profile);
2178
2179 for (const auto &profile : inc.new_erasure_code_profiles) {
2180 set_erasure_code_profile(profile.first, profile.second);
2181 }
2182
2183 // up/down
2184 for (const auto &state : inc.new_state) {
2185 const auto osd = state.first;
2186 int s = state.second ? state.second : CEPH_OSD_UP;
2187 if ((osd_state[osd] & CEPH_OSD_UP) &&
2188 (s & CEPH_OSD_UP)) {
2189 osd_info[osd].down_at = epoch;
2190 osd_xinfo[osd].down_stamp = modified;
2191 }
2192 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2193 (s & CEPH_OSD_EXISTS)) {
2194 // osd is destroyed; clear out anything interesting.
2195 (*osd_uuid)[osd] = uuid_d();
2196 osd_info[osd] = osd_info_t();
2197 osd_xinfo[osd] = osd_xinfo_t();
2198 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
2199 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2200 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2201 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2202 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
2203 osd_state[osd] = 0;
2204 } else {
2205 osd_state[osd] ^= s;
2206 }
2207 }
2208
2209 for (const auto &client : inc.new_up_client) {
2210 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
2211 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
2212 osd_addrs->client_addrs[client.first].reset(
2213 new entity_addrvec_t(client.second));
2214 osd_addrs->hb_back_addrs[client.first].reset(
2215 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2216 osd_addrs->hb_front_addrs[client.first].reset(
2217 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
2218
2219 osd_info[client.first].up_from = epoch;
2220 }
2221
2222 for (const auto &cluster : inc.new_up_cluster)
2223 osd_addrs->cluster_addrs[cluster.first].reset(
2224 new entity_addrvec_t(cluster.second));
2225
2226 // info
2227 for (const auto &thru : inc.new_up_thru)
2228 osd_info[thru.first].up_thru = thru.second;
2229
2230 for (const auto &interval : inc.new_last_clean_interval) {
2231 osd_info[interval.first].last_clean_begin = interval.second.first;
2232 osd_info[interval.first].last_clean_end = interval.second.second;
2233 }
2234
2235 for (const auto &lost : inc.new_lost)
2236 osd_info[lost.first].lost_at = lost.second;
2237
2238 // xinfo
2239 for (const auto &xinfo : inc.new_xinfo)
2240 osd_xinfo[xinfo.first] = xinfo.second;
2241
2242 // uuid
2243 for (const auto &uuid : inc.new_uuid)
2244 (*osd_uuid)[uuid.first] = uuid.second;
2245
2246 // pg rebuild
2247 for (const auto &pg : inc.new_pg_temp) {
2248 if (pg.second.empty())
2249 pg_temp->erase(pg.first);
2250 else
2251 pg_temp->set(pg.first, pg.second);
2252 }
2253 if (!inc.new_pg_temp.empty()) {
2254 // make sure pg_temp is efficiently stored
2255 pg_temp->rebuild();
2256 }
2257
2258 for (const auto &pg : inc.new_primary_temp) {
2259 if (pg.second == -1)
2260 primary_temp->erase(pg.first);
2261 else
2262 (*primary_temp)[pg.first] = pg.second;
2263 }
2264
2265 for (auto& p : inc.new_pg_upmap) {
2266 pg_upmap[p.first] = p.second;
2267 }
2268 for (auto& pg : inc.old_pg_upmap) {
2269 pg_upmap.erase(pg);
2270 }
2271 for (auto& p : inc.new_pg_upmap_items) {
2272 pg_upmap_items[p.first] = p.second;
2273 }
2274 for (auto& pg : inc.old_pg_upmap_items) {
2275 pg_upmap_items.erase(pg);
2276 }
2277
2278 // blocklist
2279 if (!inc.new_blocklist.empty()) {
2280 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2281 new_blocklist_entries = true;
2282 }
2283 for (const auto &addr : inc.old_blocklist)
2284 blocklist.erase(addr);
2285
2286 for (auto& i : inc.new_crush_node_flags) {
2287 if (i.second) {
2288 crush_node_flags[i.first] = i.second;
2289 } else {
2290 crush_node_flags.erase(i.first);
2291 }
2292 }
2293
2294 for (auto& i : inc.new_device_class_flags) {
2295 if (i.second) {
2296 device_class_flags[i.first] = i.second;
2297 } else {
2298 device_class_flags.erase(i.first);
2299 }
2300 }
2301
2302 // cluster snapshot?
2303 if (inc.cluster_snapshot.length()) {
2304 cluster_snapshot = inc.cluster_snapshot;
2305 cluster_snapshot_epoch = inc.epoch;
2306 } else {
2307 cluster_snapshot.clear();
2308 cluster_snapshot_epoch = 0;
2309 }
2310
2311 if (inc.new_nearfull_ratio >= 0) {
2312 nearfull_ratio = inc.new_nearfull_ratio;
2313 }
2314 if (inc.new_backfillfull_ratio >= 0) {
2315 backfillfull_ratio = inc.new_backfillfull_ratio;
2316 }
2317 if (inc.new_full_ratio >= 0) {
2318 full_ratio = inc.new_full_ratio;
2319 }
2320 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
2321 require_min_compat_client = inc.new_require_min_compat_client;
2322 }
2323 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
2324 require_osd_release = inc.new_require_osd_release;
2325 if (require_osd_release >= ceph_release_t::luminous) {
2326 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2327 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
2328 }
2329 }
2330
2331 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
2332 require_osd_release = inc.new_require_osd_release;
2333 if (require_osd_release >= ceph_release_t::nautilus) {
2334 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2335 }
2336 }
2337 // do new crush map last (after up/down stuff)
2338 if (inc.crush.length()) {
2339 ceph::buffer::list bl(inc.crush);
2340 auto blp = bl.cbegin();
2341 crush.reset(new CrushWrapper);
2342 crush->decode(blp);
2343 if (require_osd_release >= ceph_release_t::luminous) {
2344 // only increment if this is a luminous-encoded osdmap, lest
2345 // the mon's crush_version diverge from what the osds or others
2346 // are decoding and applying on their end. if we won't encode
2347 // it in the canonical version, don't change it.
2348 ++crush_version;
2349 }
2350 for (auto it = device_class_flags.begin();
2351 it != device_class_flags.end();) {
2352 const char* class_name = crush->get_class_name(it->first);
2353 if (!class_name) // device class is gone
2354 it = device_class_flags.erase(it);
2355 else
2356 it++;
2357 }
2358 }
2359
2360 if (inc.change_stretch_mode) {
2361 stretch_mode_enabled = inc.stretch_mode_enabled;
2362 stretch_bucket_count = inc.new_stretch_bucket_count;
2363 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2364 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2365 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2366 }
2367
2368 calc_num_osds();
2369 _calc_up_osd_features();
2370 return 0;
2371 }
2372
2373 // mapping
2374 int OSDMap::map_to_pg(
2375 int64_t poolid,
2376 const string& name,
2377 const string& key,
2378 const string& nspace,
2379 pg_t *pg) const
2380 {
2381 // calculate ps (placement seed)
2382 const pg_pool_t *pool = get_pg_pool(poolid);
2383 if (!pool)
2384 return -ENOENT;
2385 ps_t ps;
2386 if (!key.empty())
2387 ps = pool->hash_key(key, nspace);
2388 else
2389 ps = pool->hash_key(name, nspace);
2390 *pg = pg_t(ps, poolid);
2391 return 0;
2392 }
2393
2394 int OSDMap::object_locator_to_pg(
2395 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2396 {
2397 if (loc.hash >= 0) {
2398 if (!get_pg_pool(loc.get_pool())) {
2399 return -ENOENT;
2400 }
2401 pg = pg_t(loc.hash, loc.get_pool());
2402 return 0;
2403 }
2404 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2405 }
2406
2407 ceph_object_layout OSDMap::make_object_layout(
2408 object_t oid, int pg_pool, string nspace) const
2409 {
2410 object_locator_t loc(pg_pool, nspace);
2411
2412 ceph_object_layout ol;
2413 pg_t pgid = object_locator_to_pg(oid, loc);
2414 ol.ol_pgid = pgid.get_old_pg().v;
2415 ol.ol_stripe_unit = 0;
2416 return ol;
2417 }
2418
2419 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2420 vector<int>& osds) const
2421 {
2422 if (pool.can_shift_osds()) {
2423 unsigned removed = 0;
2424 for (unsigned i = 0; i < osds.size(); i++) {
2425 if (!exists(osds[i])) {
2426 removed++;
2427 continue;
2428 }
2429 if (removed) {
2430 osds[i - removed] = osds[i];
2431 }
2432 }
2433 if (removed)
2434 osds.resize(osds.size() - removed);
2435 } else {
2436 for (auto& osd : osds) {
2437 if (!exists(osd))
2438 osd = CRUSH_ITEM_NONE;
2439 }
2440 }
2441 }
2442
2443 void OSDMap::_pg_to_raw_osds(
2444 const pg_pool_t& pool, pg_t pg,
2445 vector<int> *osds,
2446 ps_t *ppps) const
2447 {
2448 // map to osds[]
2449 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2450 unsigned size = pool.get_size();
2451
2452 // what crush rule?
2453 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
2454 if (ruleno >= 0)
2455 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2456
2457 _remove_nonexistent_osds(pool, *osds);
2458
2459 if (ppps)
2460 *ppps = pps;
2461 }
2462
2463 int OSDMap::_pick_primary(const vector<int>& osds) const
2464 {
2465 for (auto osd : osds) {
2466 if (osd != CRUSH_ITEM_NONE) {
2467 return osd;
2468 }
2469 }
2470 return -1;
2471 }
2472
2473 void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
2474 {
2475 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2476 auto p = pg_upmap.find(pg);
2477 if (p != pg_upmap.end()) {
2478 // make sure targets aren't marked out
2479 for (auto osd : p->second) {
2480 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2481 osd_weight[osd] == 0) {
2482 // reject/ignore the explicit mapping
2483 return;
2484 }
2485 }
2486 *raw = vector<int>(p->second.begin(), p->second.end());
2487 // continue to check and apply pg_upmap_items if any
2488 }
2489
2490 auto q = pg_upmap_items.find(pg);
2491 if (q != pg_upmap_items.end()) {
2492 // NOTE: this approach does not allow a bidirectional swap,
2493 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2494 for (auto& r : q->second) {
2495 // make sure the replacement value doesn't already appear
2496 bool exists = false;
2497 ssize_t pos = -1;
2498 for (unsigned i = 0; i < raw->size(); ++i) {
2499 int osd = (*raw)[i];
2500 if (osd == r.second) {
2501 exists = true;
2502 break;
2503 }
2504 // ignore mapping if target is marked out (or invalid osd id)
2505 if (osd == r.first &&
2506 pos < 0 &&
2507 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
2508 r.second >= 0 && osd_weight[r.second] == 0)) {
2509 pos = i;
2510 }
2511 }
2512 if (!exists && pos >= 0) {
2513 (*raw)[pos] = r.second;
2514 }
2515 }
2516 }
2517 }
2518
2519 // pg -> (up osd list)
2520 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2521 vector<int> *up) const
2522 {
2523 if (pool.can_shift_osds()) {
2524 // shift left
2525 up->clear();
2526 up->reserve(raw.size());
2527 for (unsigned i=0; i<raw.size(); i++) {
2528 if (!exists(raw[i]) || is_down(raw[i]))
2529 continue;
2530 up->push_back(raw[i]);
2531 }
2532 } else {
2533 // set down/dne devices to NONE
2534 up->resize(raw.size());
2535 for (int i = raw.size() - 1; i >= 0; --i) {
2536 if (!exists(raw[i]) || is_down(raw[i])) {
2537 (*up)[i] = CRUSH_ITEM_NONE;
2538 } else {
2539 (*up)[i] = raw[i];
2540 }
2541 }
2542 }
2543 }
2544
2545 void OSDMap::_apply_primary_affinity(ps_t seed,
2546 const pg_pool_t& pool,
2547 vector<int> *osds,
2548 int *primary) const
2549 {
2550 // do we have any non-default primary_affinity values for these osds?
2551 if (!osd_primary_affinity)
2552 return;
2553
2554 bool any = false;
2555 for (const auto osd : *osds) {
2556 if (osd != CRUSH_ITEM_NONE &&
2557 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2558 any = true;
2559 break;
2560 }
2561 }
2562 if (!any)
2563 return;
2564
2565 // pick the primary. feed both the seed (for the pg) and the osd
2566 // into the hash/rng so that a proportional fraction of an osd's pgs
2567 // get rejected as primary.
2568 int pos = -1;
2569 for (unsigned i = 0; i < osds->size(); ++i) {
2570 int o = (*osds)[i];
2571 if (o == CRUSH_ITEM_NONE)
2572 continue;
2573 unsigned a = (*osd_primary_affinity)[o];
2574 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2575 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2576 seed, o) >> 16) >= a) {
2577 // we chose not to use this primary. note it anyway as a
2578 // fallback in case we don't pick anyone else, but keep looking.
2579 if (pos < 0)
2580 pos = i;
2581 } else {
2582 pos = i;
2583 break;
2584 }
2585 }
2586 if (pos < 0)
2587 return;
2588
2589 *primary = (*osds)[pos];
2590
2591 if (pool.can_shift_osds() && pos > 0) {
2592 // move the new primary to the front.
2593 for (int i = pos; i > 0; --i) {
2594 (*osds)[i] = (*osds)[i-1];
2595 }
2596 (*osds)[0] = *primary;
2597 }
2598 }
2599
2600 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2601 vector<int> *temp_pg, int *temp_primary) const
2602 {
2603 pg = pool.raw_pg_to_pg(pg);
2604 const auto p = pg_temp->find(pg);
2605 temp_pg->clear();
2606 if (p != pg_temp->end()) {
2607 for (unsigned i=0; i<p->second.size(); i++) {
2608 if (!exists(p->second[i]) || is_down(p->second[i])) {
2609 if (pool.can_shift_osds()) {
2610 continue;
2611 } else {
2612 temp_pg->push_back(CRUSH_ITEM_NONE);
2613 }
2614 } else {
2615 temp_pg->push_back(p->second[i]);
2616 }
2617 }
2618 }
2619 const auto &pp = primary_temp->find(pg);
2620 *temp_primary = -1;
2621 if (pp != primary_temp->end()) {
2622 *temp_primary = pp->second;
2623 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2624 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2625 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2626 *temp_primary = (*temp_pg)[i];
2627 break;
2628 }
2629 }
2630 }
2631 }
2632
2633 void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
2634 {
2635 const pg_pool_t *pool = get_pg_pool(pg.pool());
2636 if (!pool) {
2637 *primary = -1;
2638 raw->clear();
2639 return;
2640 }
2641 _pg_to_raw_osds(*pool, pg, raw, NULL);
2642 *primary = _pick_primary(*raw);
2643 }
2644
2645 void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2646 vector<int> *raw_upmap) const
2647 {
2648 auto pool = get_pg_pool(pg.pool());
2649 if (!pool) {
2650 raw_upmap->clear();
2651 return;
2652 }
2653 _pg_to_raw_osds(*pool, pg, raw, NULL);
2654 *raw_upmap = *raw;
2655 _apply_upmap(*pool, pg, raw_upmap);
2656 }
2657
2658 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2659 {
2660 const pg_pool_t *pool = get_pg_pool(pg.pool());
2661 if (!pool) {
2662 *primary = -1;
2663 up->clear();
2664 return;
2665 }
2666 vector<int> raw;
2667 ps_t pps;
2668 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2669 _apply_upmap(*pool, pg, &raw);
2670 _raw_to_up_osds(*pool, raw, up);
2671 *primary = _pick_primary(raw);
2672 _apply_primary_affinity(pps, *pool, up, primary);
2673 }
2674
2675 void OSDMap::_pg_to_up_acting_osds(
2676 const pg_t& pg, vector<int> *up, int *up_primary,
2677 vector<int> *acting, int *acting_primary,
2678 bool raw_pg_to_pg) const
2679 {
2680 const pg_pool_t *pool = get_pg_pool(pg.pool());
2681 if (!pool ||
2682 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2683 if (up)
2684 up->clear();
2685 if (up_primary)
2686 *up_primary = -1;
2687 if (acting)
2688 acting->clear();
2689 if (acting_primary)
2690 *acting_primary = -1;
2691 return;
2692 }
2693 vector<int> raw;
2694 vector<int> _up;
2695 vector<int> _acting;
2696 int _up_primary;
2697 int _acting_primary;
2698 ps_t pps;
2699 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2700 if (_acting.empty() || up || up_primary) {
2701 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2702 _apply_upmap(*pool, pg, &raw);
2703 _raw_to_up_osds(*pool, raw, &_up);
2704 _up_primary = _pick_primary(_up);
2705 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2706 if (_acting.empty()) {
2707 _acting = _up;
2708 if (_acting_primary == -1) {
2709 _acting_primary = _up_primary;
2710 }
2711 }
2712
2713 if (up)
2714 up->swap(_up);
2715 if (up_primary)
2716 *up_primary = _up_primary;
2717 }
2718
2719 if (acting)
2720 acting->swap(_acting);
2721 if (acting_primary)
2722 *acting_primary = _acting_primary;
2723 }
2724
2725 int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
2726 {
2727 // This implementation is broken for EC PGs since the osd may appear
2728 // multiple times in the acting set. See
2729 // https://tracker.ceph.com/issues/43213
2730 if (!nrep)
2731 nrep = acting.size();
2732 for (int i=0; i<nrep; i++)
2733 if (acting[i] == osd)
2734 return i;
2735 return -1;
2736 }
2737
2738 int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
2739 {
2740 int nrep = acting.size();
2741 if (who.shard == shard_id_t::NO_SHARD) {
2742 for (int i=0; i<nrep; i++) {
2743 if (acting[i] == who.osd) {
2744 return i;
2745 }
2746 }
2747 } else {
2748 if (who.shard < nrep && acting[who.shard] == who.osd) {
2749 return who.shard;
2750 }
2751 }
2752 return -1;
2753 }
2754
2755 bool OSDMap::primary_changed_broken(
2756 int oldprimary,
2757 const vector<int> &oldacting,
2758 int newprimary,
2759 const vector<int> &newacting)
2760 {
2761 if (oldacting.empty() && newacting.empty())
2762 return false; // both still empty
2763 if (oldacting.empty() ^ newacting.empty())
2764 return true; // was empty, now not, or vice versa
2765 if (oldprimary != newprimary)
2766 return true; // primary changed
2767 if (calc_pg_role_broken(oldprimary, oldacting) !=
2768 calc_pg_role_broken(newprimary, newacting))
2769 return true;
2770 return false; // same primary (tho replicas may have changed)
2771 }
2772
2773 uint64_t OSDMap::get_encoding_features() const
2774 {
2775 uint64_t f = SIGNIFICANT_FEATURES;
2776 if (require_osd_release < ceph_release_t::octopus) {
2777 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2778 }
2779 if (require_osd_release < ceph_release_t::nautilus) {
2780 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2781 }
2782 if (require_osd_release < ceph_release_t::mimic) {
2783 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2784 }
2785 if (require_osd_release < ceph_release_t::luminous) {
2786 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2787 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2788 }
2789 if (require_osd_release < ceph_release_t::kraken) {
2790 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
2791 CEPH_FEATURE_MSG_ADDR2);
2792 }
2793 if (require_osd_release < ceph_release_t::jewel) {
2794 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
2795 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2796 CEPH_FEATURE_CRUSH_TUNABLES5);
2797 }
2798 return f;
2799 }
2800
2801 // serialize, unserialize
/**
 * Encode the map in the legacy version-5 layout.
 *
 * Called from encode_classic() when the peer lacks CEPH_FEATURE_PGID64.
 * In this layout pool ids, pool_max and container counts are written as
 * 32-bit values, pg ids are written as old_pg_t, and pools, client
 * addresses and the crush map are encoded with a feature mask of 0.
 * No extended (osd-only) section is written.
 *
 * @param bl  buffer the encoding is appended to
 */
void OSDMap::encode_client_old(ceph::buffer::list& bl) const
{
  using ceph::encode;
  // struct version of this legacy layout
  __u16 v = 5;
  encode(v, bl);

  // base
  encode(fsid, bl);
  encode(epoch, bl);
  encode(created, bl);
  encode(modified, bl);

  // for encode(pools, bl);
  // hand-rolled so that each pool id is narrowed to 32 bits
  __u32 n = pools.size();
  encode(n, bl);

  for (const auto &pool : pools) {
    n = pool.first;
    encode(n, bl);
    encode(pool.second, bl, 0);
  }
  // for encode(pool_name, bl);
  // same 32-bit pool id narrowing as above
  n = pool_name.size();
  encode(n, bl);
  for (const auto &pname : pool_name) {
    n = pname.first;
    encode(n, bl);
    encode(pname.second, bl);
  }
  // for encode(pool_max, bl);
  n = pool_max;
  encode(n, bl);

  encode(flags, bl);

  encode(max_osd, bl);
  {
    // osd states are written as a count followed by one byte per osd
    uint32_t n = osd_state.size();
    encode(n, bl);
    for (auto s : osd_state) {
      encode((uint8_t)s, bl);
    }
  }
  encode(osd_weight, bl);
  encode(osd_addrs->client_addrs, bl, 0);

  // for encode(pg_temp, bl);
  // hand-rolled so that each pg id is written in its old_pg_t form
  n = pg_temp->size();
  encode(n, bl);
  for (const auto& pg : *pg_temp) {
    old_pg_t opg = pg.first.get_old_pg();
    encode(opg, bl);
    encode(pg.second, bl);
  }

  // crush
  // the crush map is encoded into its own buffer, then embedded
  ceph::buffer::list cbl;
  crush->encode(cbl, 0 /* legacy (no) features */);
  encode(cbl, bl);
}
2862
/**
 * Encode the map in the classic (pre-OSDMAP_ENC) layout.
 *
 * If @p features lacks CEPH_FEATURE_PGID64 this delegates to
 * encode_client_old(). Otherwise a version-6 base section is written,
 * followed by an extended section tagged with version 10.
 *
 * @param bl        buffer the encoding is appended to
 * @param features  peer feature bits; forwarded to the feature-dependent
 *                  encoders (pools, addresses, blocklist, xinfo)
 */
void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGID64) == 0) {
    encode_client_old(bl);
    return;
  }

  // struct version of the base section
  __u16 v = 6;
  encode(v, bl);

  // base
  encode(fsid, bl);
  encode(epoch, bl);
  encode(created, bl);
  encode(modified, bl);

  encode(pools, bl, features);
  encode(pool_name, bl);
  encode(pool_max, bl);

  encode(flags, bl);

  encode(max_osd, bl);
  {
    // osd states are written as a count followed by one byte per osd
    uint32_t n = osd_state.size();
    encode(n, bl);
    for (auto s : osd_state) {
      encode((uint8_t)s, bl);
    }
  }
  encode(osd_weight, bl);
  encode(osd_addrs->client_addrs, bl, features);

  encode(*pg_temp, bl);

  // crush
  // the crush map is encoded into its own buffer, then embedded
  ceph::buffer::list cbl;
  crush->encode(cbl, 0 /* legacy (no) features */);
  encode(cbl, bl);

  // extended
  // version of the extended section; field order below is part of the
  // wire format
  __u16 ev = 10;
  encode(ev, bl);
  encode(osd_addrs->hb_back_addrs, bl, features);
  encode(osd_info, bl);
  encode(blocklist, bl, features);
  encode(osd_addrs->cluster_addrs, bl, features);
  encode(cluster_snapshot_epoch, bl);
  encode(cluster_snapshot, bl);
  encode(*osd_uuid, bl);
  encode(osd_xinfo, bl, features);
  encode(osd_addrs->hb_front_addrs, bl, features);
}
2917
/* for a description of osdmap versions, and when they were introduced, please
 * refer to
 *	doc/dev/osd_internals/osdmap_versions.txt
 */
/**
 * Encode this map into @p bl.
 *
 * If @p features lacks CEPH_FEATURE_OSDMAP_ENC the classic layout is
 * produced via encode_classic(). Otherwise the caller must also pass
 * CEPH_FEATURE_RESERVED (asserted, then stripped), and the output is a
 * wrapper containing:
 *   1. a client-usable section,
 *   2. an extended, osd-only section,
 *   3. a 32-bit crc32c of the encoded bytes (excluding the crc field itself).
 * The section versions are lowered according to which SERVER_* features
 * are present in @p features so that older peers can decode the result.
 *
 * On return from the modern path, the crc and crc_defined members have been
 * updated to describe the bytes just written.
 *
 * @param bl        buffer the encoding is appended to
 * @param features  feature bits selecting the encoding
 */
void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
    encode_classic(bl, features);
    return;
  }

  // only a select set of callers should *ever* be encoding new
  // OSDMaps.  others should be passing around the canonical encoded
  // buffers from on high.  select out those callers by passing in an
  // "impossible" feature bit.
  ceph_assert(features & CEPH_FEATURE_RESERVED);
  features &= ~CEPH_FEATURE_RESERVED;

  // offsets into bl used later to compute the crc over everything except
  // the crc field itself
  size_t start_offset = bl.length();
  size_t tail_offset;
  size_t crc_offset;
  std::optional<ceph::buffer::list::contiguous_filler> crc_filler;

  // meta-encoding: how we include client-used and osd-specific data
  ENCODE_START(8, 7, bl);

  {
    // NOTE: any new encoding dependencies must be reflected by
    // SIGNIFICANT_FEATURES
    // pick the newest client-section version the peer features allow
    uint8_t v = 9;
    if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
      v = 3;
    } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
      v = 6;
    } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
      v = 7;
    }
    ENCODE_START(v, 1, bl); // client-usable data
    // base
    encode(fsid, bl);
    encode(epoch, bl);
    encode(created, bl);
    encode(modified, bl);

    encode(pools, bl, features);
    encode(pool_name, bl);
    encode(pool_max, bl);

    if (v < 4) {
      // for versions before 4, require_osd_release is conveyed through
      // legacy REQUIRE_* flag bits added to a copy of the flags
      decltype(flags) f = flags;
      if (require_osd_release >= ceph_release_t::luminous)
	f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
      else if (require_osd_release == ceph_release_t::kraken)
	f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
      else if (require_osd_release == ceph_release_t::jewel)
	f |= CEPH_OSDMAP_REQUIRE_JEWEL;
      encode(f, bl);
    } else {
      encode(flags, bl);
    }

    encode(max_osd, bl);
    if (v >= 5) {
      encode(osd_state, bl);
    } else {
      // before version 5, osd states are written as one byte per osd
      uint32_t n = osd_state.size();
      encode(n, bl);
      for (auto s : osd_state) {
	encode((uint8_t)s, bl);
      }
    }
    encode(osd_weight, bl);
    if (v >= 8) {
      encode(osd_addrs->client_addrs, bl, features);
    } else {
      encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
    }

    encode(*pg_temp, bl);
    encode(*primary_temp, bl);
    if (osd_primary_affinity) {
      encode(*osd_primary_affinity, bl);
    } else {
      // no affinity table: write an empty vector as placeholder
      // (this local `v` shadows the version only inside this branch)
      vector<__u32> v;
      encode(v, bl);
    }

    // crush
    ceph::buffer::list cbl;
    crush->encode(cbl, features);
    encode(cbl, bl);
    encode(erasure_code_profiles, bl);

    if (v >= 4) {
      encode(pg_upmap, bl);
      encode(pg_upmap_items, bl);
    } else {
      // upmaps cannot be represented before version 4
      ceph_assert(pg_upmap.empty());
      ceph_assert(pg_upmap_items.empty());
    }
    if (v >= 6) {
      encode(crush_version, bl);
    }
    if (v >= 7) {
      encode(new_removed_snaps, bl);
      encode(new_purged_snaps, bl);
    }
    if (v >= 9) {
      encode(last_up_change, bl);
      encode(last_in_change, bl);
    }
    ENCODE_FINISH(bl); // client-usable data
  }

  {
    // NOTE: any new encoding dependencies must be reflected by
    // SIGNIFICANT_FEATURES
    uint8_t target_v = 9; // when bumping this, be aware of stretch_mode target_v 10!
    if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
      target_v = 1;
    } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
      target_v = 5;
    } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
      target_v = 6;
    }
    // stretch mode fields only exist from version 10 on, so enabling
    // stretch mode forces at least that version regardless of features
    if (stretch_mode_enabled) {
      target_v = std::max((uint8_t)10, target_v);
    }
    ENCODE_START(target_v, 1, bl); // extended, osd-only data
    if (target_v < 7) {
      encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
    } else {
      encode(osd_addrs->hb_back_addrs, bl, features);
    }
    encode(osd_info, bl);
    {
      // put this in a sorted, ordered map<> so that we encode in a
      // deterministic order.
      map<entity_addr_t,utime_t> blocklist_map;
      for (const auto &addr : blocklist)
	blocklist_map.insert(make_pair(addr.first, addr.second));
      encode(blocklist_map, bl, features);
    }
    if (target_v < 7) {
      encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
    } else {
      encode(osd_addrs->cluster_addrs, bl, features);
    }
    encode(cluster_snapshot_epoch, bl);
    encode(cluster_snapshot, bl);
    encode(*osd_uuid, bl);
    encode(osd_xinfo, bl, features);
    if (target_v < 7) {
      encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
    } else {
      encode(osd_addrs->hb_front_addrs, bl, features);
    }
    if (target_v >= 2) {
      encode(nearfull_ratio, bl);
      encode(full_ratio, bl);
      encode(backfillfull_ratio, bl);
    }
    // 4 was string-based new_require_min_compat_client
    if (target_v >= 5) {
      encode(require_min_compat_client, bl);
      encode(require_osd_release, bl);
    }
    if (target_v >= 6) {
      encode(removed_snaps_queue, bl);
    }
    if (target_v >= 8) {
      encode(crush_node_flags, bl);
    }
    if (target_v >= 9) {
      encode(device_class_flags, bl);
    }
    if (target_v >= 10) {
      encode(stretch_mode_enabled, bl);
      encode(stretch_bucket_count, bl);
      encode(degraded_stretch_mode, bl);
      encode(recovering_stretch_mode, bl);
      encode(stretch_mode_bucket, bl);
    }
    ENCODE_FINISH(bl); // osd-only data
  }

  // reserve 4 bytes for the crc; it is filled in once the wrapper is closed
  crc_offset = bl.length();
  crc_filler = bl.append_hole(sizeof(uint32_t));
  tail_offset = bl.length();

  ENCODE_FINISH(bl); // meta-encoding wrapper

  // fill in crc
  // checksum covers [start_offset, crc_offset) and, if anything follows the
  // hole, [tail_offset, end) -- i.e. everything but the crc field
  ceph::buffer::list front;
  front.substr_of(bl, start_offset, crc_offset - start_offset);
  crc = front.crc32c(-1);
  if (tail_offset < bl.length()) {
    ceph::buffer::list tail;
    tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
    crc = tail.crc32c(crc);
  }
  // store the crc via ceph_le32 and copy it into the reserved hole
  ceph_le32 crc_le;
  crc_le = crc;
  crc_filler->copy_in(4, (char*)&crc_le);
  crc_defined = true;
}
3125
3126 /* for a description of osdmap versions, and when they were introduced, please
3127 * refer to
3128 * doc/dev/osd_internals/osdmap_versions.txt
3129 */
3130 void OSDMap::decode(ceph::buffer::list& bl)
3131 {
3132 auto p = bl.cbegin();
3133 decode(p);
3134 }
3135
/**
 * Decode a map written in the classic layout, where a single leading
 * __u16 version covers the base section and an optional second __u16
 * (`ev`) versions the extended section.
 *
 * Fields that the encoded version does not carry are sized or reset to
 * defaults instead (see the individual branches). osd_primary_affinity is
 * always reset, and post_decode() is called at the end.
 *
 * @param p  iterator positioned at the version field; advanced past the map
 */
void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
{
  using ceph::decode;
  __u32 n, t;
  __u16 v;
  decode(v, p);

  // base
  decode(fsid, p);
  decode(epoch, p);
  decode(created, p);
  decode(modified, p);

  if (v < 6) {
    // before version 6, pool ids, counts and pool_max are 32-bit values
    if (v < 4) {
      // before version 4, pool_max precedes the pools
      int32_t max_pools = 0;
      decode(max_pools, p);
      pool_max = max_pools;
    }
    pools.clear();
    decode(n, p);
    while (n--) {
      decode(t, p);
      decode(pools[t], p);
    }
    if (v == 4) {
      decode(n, p);
      pool_max = n;
    } else if (v == 5) {
      // version 5 carries pool names here, followed by pool_max
      pool_name.clear();
      decode(n, p);
      while (n--) {
	decode(t, p);
	decode(pool_name[t], p);
      }
      decode(n, p);
      pool_max = n;
    }
  } else {
    decode(pools, p);
    decode(pool_name, p);
    decode(pool_max, p);
  }
  // kludge around some old bug that zeroed out pool_max (#2307)
  if (pools.size() && pool_max < pools.rbegin()->first) {
    pool_max = pools.rbegin()->first;
  }

  decode(flags, p);

  decode(max_osd, p);
  {
    // osd states are stored as one byte per osd; widen into osd_state
    vector<uint8_t> os;
    decode(os, p);
    osd_state.resize(os.size());
    for (unsigned i = 0; i < os.size(); ++i) {
      osd_state[i] = os[i];
    }
  }
  decode(osd_weight, p);
  decode(osd_addrs->client_addrs, p);
  if (v <= 5) {
    // up to version 5, pg_temp keys are old_pg_t and must be converted
    pg_temp->clear();
    decode(n, p);
    while (n--) {
      old_pg_t opg;
      ceph::decode_raw(opg, p);
      mempool::osdmap::vector<int32_t> v;
      decode(v, p);
      pg_temp->set(pg_t(opg), v);
    }
  } else {
    decode(*pg_temp, p);
  }

  // crush
  // the crush map is embedded as a nested buffer
  ceph::buffer::list cbl;
  decode(cbl, p);
  auto cblp = cbl.cbegin();
  crush->decode(cblp);

  // extended
  // ev stays 0 when the encoding predates the extended version field
  __u16 ev = 0;
  if (v >= 5)
    decode(ev, p);
  decode(osd_addrs->hb_back_addrs, p);
  decode(osd_info, p);
  if (v < 5)
    // before version 5, pool names live in the extended section
    decode(pool_name, p);

  decode(blocklist, p);
  if (ev >= 6)
    decode(osd_addrs->cluster_addrs, p);
  else
    osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());

  if (ev >= 7) {
    decode(cluster_snapshot_epoch, p);
    decode(cluster_snapshot, p);
  }

  if (ev >= 8) {
    decode(*osd_uuid, p);
  } else {
    osd_uuid->resize(max_osd);
  }
  if (ev >= 9)
    decode(osd_xinfo, p);
  else
    osd_xinfo.resize(max_osd);

  if (ev >= 10)
    decode(osd_addrs->hb_front_addrs, p);
  else
    osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());

  // the classic layout never carries primary affinity
  osd_primary_affinity.reset();

  post_decode();
}
3256
3257 void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
3258 {
3259 using ceph::decode;
3260 /**
3261 * Older encodings of the OSDMap had a single struct_v which
3262 * covered the whole encoding, and was prior to our modern
3263 * stuff which includes a compatv and a size. So if we see
3264 * a struct_v < 7, we must rewind to the beginning and use our
3265 * classic decoder.
3266 */
3267 size_t start_offset = bl.get_off();
3268 size_t tail_offset = 0;
3269 ceph::buffer::list crc_front, crc_tail;
3270
3271 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3272 if (struct_v < 7) {
3273 bl.seek(start_offset);
3274 decode_classic(bl);
3275 return;
3276 }
3277 /**
3278 * Since we made it past that hurdle, we can use our normal paths.
3279 */
3280 {
3281 DECODE_START(9, bl); // client-usable data
3282 // base
3283 decode(fsid, bl);
3284 decode(epoch, bl);
3285 decode(created, bl);
3286 decode(modified, bl);
3287
3288 decode(pools, bl);
3289 decode(pool_name, bl);
3290 decode(pool_max, bl);
3291
3292 decode(flags, bl);
3293
3294 decode(max_osd, bl);
3295 if (struct_v >= 5) {
3296 decode(osd_state, bl);
3297 } else {
3298 vector<uint8_t> os;
3299 decode(os, bl);
3300 osd_state.resize(os.size());
3301 for (unsigned i = 0; i < os.size(); ++i) {
3302 osd_state[i] = os[i];
3303 }
3304 }
3305 decode(osd_weight, bl);
3306 decode(osd_addrs->client_addrs, bl);
3307
3308 decode(*pg_temp, bl);
3309 decode(*primary_temp, bl);
3310 // dates back to firefly. version increased from 2 to 3 still in firefly.
3311 // do we really still need to keep this around? even for old clients?
3312 if (struct_v >= 2) {
3313 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
3314 decode(*osd_primary_affinity, bl);
3315 if (osd_primary_affinity->empty())
3316 osd_primary_affinity.reset();
3317 } else {
3318 osd_primary_affinity.reset();
3319 }
3320
3321 // crush
3322 ceph::buffer::list cbl;
3323 decode(cbl, bl);
3324 auto cblp = cbl.cbegin();
3325 crush->decode(cblp);
3326 // added in firefly; version increased in luminous, so it affects
3327 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3328 // alone until we require clients to be all luminous?
3329 if (struct_v >= 3) {
3330 decode(erasure_code_profiles, bl);
3331 } else {
3332 erasure_code_profiles.clear();
3333 }
3334 // version increased from 3 to 4 still in luminous, so same as above
3335 // applies.
3336 if (struct_v >= 4) {
3337 decode(pg_upmap, bl);
3338 decode(pg_upmap_items, bl);
3339 } else {
3340 pg_upmap.clear();
3341 pg_upmap_items.clear();
3342 }
3343 // again, version increased from 5 to 6 still in luminous, so above
3344 // applies.
3345 if (struct_v >= 6) {
3346 decode(crush_version, bl);
3347 }
3348 // version increase from 6 to 7 in mimic
3349 if (struct_v >= 7) {
3350 decode(new_removed_snaps, bl);
3351 decode(new_purged_snaps, bl);
3352 }
3353 // version increase from 7 to 8, 8 to 9, in nautilus.
3354 if (struct_v >= 9) {
3355 decode(last_up_change, bl);
3356 decode(last_in_change, bl);
3357 }
3358 DECODE_FINISH(bl); // client-usable data
3359 }
3360
3361 {
3362 DECODE_START(10, bl); // extended, osd-only data
3363 decode(osd_addrs->hb_back_addrs, bl);
3364 decode(osd_info, bl);
3365 decode(blocklist, bl);
3366 decode(osd_addrs->cluster_addrs, bl);
3367 decode(cluster_snapshot_epoch, bl);
3368 decode(cluster_snapshot, bl);
3369 decode(*osd_uuid, bl);
3370 decode(osd_xinfo, bl);
3371 decode(osd_addrs->hb_front_addrs, bl);
3372 //
3373 if (struct_v >= 2) {
3374 decode(nearfull_ratio, bl);
3375 decode(full_ratio, bl);
3376 } else {
3377 nearfull_ratio = 0;
3378 full_ratio = 0;
3379 }
3380 if (struct_v >= 3) {
3381 decode(backfillfull_ratio, bl);
3382 } else {
3383 backfillfull_ratio = 0;
3384 }
3385 if (struct_v == 4) {
3386 string r;
3387 decode(r, bl);
3388 if (r.length())
3389 require_min_compat_client = ceph_release_from_name(r.c_str());
3390 }
3391 if (struct_v >= 5) {
3392 decode(require_min_compat_client, bl);
3393 decode(require_osd_release, bl);
3394 if (require_osd_release >= ceph_release_t::nautilus) {
3395 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3396 }
3397 if (require_osd_release >= ceph_release_t::luminous) {
3398 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
3399 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
3400 }
3401 } else {
3402 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3403 // only for compat with post-kraken pre-luminous test clusters
3404 require_osd_release = ceph_release_t::luminous;
3405 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
3406 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
3407 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
3408 require_osd_release = ceph_release_t::kraken;
3409 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
3410 require_osd_release = ceph_release_t::jewel;
3411 } else {
3412 require_osd_release = ceph_release_t::unknown;
3413 }
3414 }
3415 if (struct_v >= 6) {
3416 decode(removed_snaps_queue, bl);
3417 }
3418 if (struct_v >= 8) {
3419 decode(crush_node_flags, bl);
3420 } else {
3421 crush_node_flags.clear();
3422 }
3423 if (struct_v >= 9) {
3424 decode(device_class_flags, bl);
3425 } else {
3426 device_class_flags.clear();
3427 }
3428 if (struct_v >= 10) {
3429 decode(stretch_mode_enabled, bl);
3430 decode(stretch_bucket_count, bl);
3431 decode(degraded_stretch_mode, bl);
3432 decode(recovering_stretch_mode, bl);
3433 decode(stretch_mode_bucket, bl);
3434 } else {
3435 stretch_mode_enabled = false;
3436 stretch_bucket_count = 0;
3437 degraded_stretch_mode = 0;
3438 recovering_stretch_mode = 0;
3439 stretch_mode_bucket = 0;
3440 }
3441 DECODE_FINISH(bl); // osd-only data
3442 }
3443
3444 if (struct_v >= 8) {
3445 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
3446 decode(crc, bl);
3447 tail_offset = bl.get_off();
3448 crc_defined = true;
3449 } else {
3450 crc_defined = false;
3451 crc = 0;
3452 }
3453
3454 DECODE_FINISH(bl); // wrapper
3455
3456 if (tail_offset) {
3457 // verify crc
3458 uint32_t actual = crc_front.crc32c(-1);
3459 if (tail_offset < bl.get_off()) {
3460 ceph::buffer::list tail;
3461 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3462 actual = tail.crc32c(actual);
3463 }
3464 if (crc != actual) {
3465 ostringstream ss;
3466 ss << "bad crc, actual " << actual << " != expected " << crc;
3467 string s = ss.str();
3468 throw ceph::buffer::malformed_input(s.c_str());
3469 }
3470 }
3471
3472 post_decode();
3473 }
3474
3475 void OSDMap::post_decode()
3476 {
3477 // index pool names
3478 name_pool.clear();
3479 for (const auto &pname : pool_name) {
3480 name_pool[pname.second] = pname.first;
3481 }
3482
3483 calc_num_osds();
3484 _calc_up_osd_features();
3485 }
3486
3487 void OSDMap::dump_erasure_code_profiles(
3488 const mempool::osdmap::map<string,map<string,string>>& profiles,
3489 Formatter *f)
3490 {
3491 f->open_object_section("erasure_code_profiles");
3492 for (const auto &profile : profiles) {
3493 f->open_object_section(profile.first.c_str());
3494 for (const auto &profm : profile.second) {
3495 f->dump_string(profm.first.c_str(), profm.second);
3496 }
3497 f->close_section();
3498 }
3499 f->close_section();
3500 }
3501
3502 void OSDMap::dump_osds(Formatter *f) const
3503 {
3504 f->open_array_section("osds");
3505 for (int i=0; i<get_max_osd(); i++) {
3506 if (exists(i)) {
3507 dump_osd(i, f);
3508 }
3509 }
3510 f->close_section();
3511 }
3512
3513 void OSDMap::dump_osd(int id, Formatter *f) const
3514 {
3515 ceph_assert(f != nullptr);
3516 if (!exists(id)) {
3517 return;
3518 }
3519
3520 f->open_object_section("osd_info");
3521 f->dump_int("osd", id);
3522 f->dump_stream("uuid") << get_uuid(id);
3523 f->dump_int("up", is_up(id));
3524 f->dump_int("in", is_in(id));
3525 f->dump_float("weight", get_weightf(id));
3526 f->dump_float("primary_affinity", get_primary_affinityf(id));
3527 get_info(id).dump(f);
3528 f->dump_object("public_addrs", get_addrs(id));
3529 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3530 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3531 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3532 // compat
3533 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3534 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3535 f->dump_stream("heartbeat_back_addr")
3536 << get_hb_back_addrs(id).get_legacy_str();
3537 f->dump_stream("heartbeat_front_addr")
3538 << get_hb_front_addrs(id).get_legacy_str();
3539
3540 set<string> st;
3541 get_state(id, st);
3542 f->open_array_section("state");
3543 for (const auto &state : st)
3544 f->dump_string("state", state);
3545 f->close_section();
3546
3547 f->close_section();
3548 }
3549
3550 void OSDMap::dump(Formatter *f) const
3551 {
3552 f->dump_int("epoch", get_epoch());
3553 f->dump_stream("fsid") << get_fsid();
3554 f->dump_stream("created") << get_created();
3555 f->dump_stream("modified") << get_modified();
3556 f->dump_stream("last_up_change") << last_up_change;
3557 f->dump_stream("last_in_change") << last_in_change;
3558 f->dump_string("flags", get_flag_string());
3559 f->dump_unsigned("flags_num", flags);
3560 f->open_array_section("flags_set");
3561 set<string> flagset;
3562 get_flag_set(&flagset);
3563 for (auto p : flagset) {
3564 f->dump_string("flag", p);
3565 }
3566 f->close_section();
3567 f->dump_unsigned("crush_version", get_crush_version());
3568 f->dump_float("full_ratio", full_ratio);
3569 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3570 f->dump_float("nearfull_ratio", nearfull_ratio);
3571 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3572 f->dump_int("pool_max", get_pool_max());
3573 f->dump_int("max_osd", get_max_osd());
3574 f->dump_string("require_min_compat_client",
3575 to_string(require_min_compat_client));
3576 f->dump_string("min_compat_client",
3577 to_string(get_min_compat_client()));
3578 f->dump_string("require_osd_release",
3579 to_string(require_osd_release));
3580
3581 f->open_array_section("pools");
3582 for (const auto &pool : pools) {
3583 std::string name("<unknown>");
3584 const auto &pni = pool_name.find(pool.first);
3585 if (pni != pool_name.end())
3586 name = pni->second;
3587 f->open_object_section("pool");
3588 f->dump_int("pool", pool.first);
3589 f->dump_string("pool_name", name);
3590 pool.second.dump(f);
3591 f->close_section();
3592 }
3593 f->close_section();
3594
3595 dump_osds(f);
3596
3597 f->open_array_section("osd_xinfo");
3598 for (int i=0; i<get_max_osd(); i++) {
3599 if (exists(i)) {
3600 f->open_object_section("xinfo");
3601 f->dump_int("osd", i);
3602 osd_xinfo[i].dump(f);
3603 f->close_section();
3604 }
3605 }
3606 f->close_section();
3607
3608 f->open_array_section("pg_upmap");
3609 for (auto& p : pg_upmap) {
3610 f->open_object_section("mapping");
3611 f->dump_stream("pgid") << p.first;
3612 f->open_array_section("osds");
3613 for (auto q : p.second) {
3614 f->dump_int("osd", q);
3615 }
3616 f->close_section();
3617 f->close_section();
3618 }
3619 f->close_section();
3620 f->open_array_section("pg_upmap_items");
3621 for (auto& p : pg_upmap_items) {
3622 f->open_object_section("mapping");
3623 f->dump_stream("pgid") << p.first;
3624 f->open_array_section("mappings");
3625 for (auto& q : p.second) {
3626 f->open_object_section("mapping");
3627 f->dump_int("from", q.first);
3628 f->dump_int("to", q.second);
3629 f->close_section();
3630 }
3631 f->close_section();
3632 f->close_section();
3633 }
3634 f->close_section();
3635 f->open_array_section("pg_temp");
3636 pg_temp->dump(f);
3637 f->close_section();
3638
3639 f->open_array_section("primary_temp");
3640 for (const auto &pg : *primary_temp) {
3641 f->dump_stream("pgid") << pg.first;
3642 f->dump_int("osd", pg.second);
3643 }
3644 f->close_section(); // primary_temp
3645
3646 f->open_object_section("blocklist");
3647 for (const auto &addr : blocklist) {
3648 stringstream ss;
3649 ss << addr.first;
3650 f->dump_stream(ss.str().c_str()) << addr.second;
3651 }
3652 f->close_section();
3653
3654 dump_erasure_code_profiles(erasure_code_profiles, f);
3655
3656 f->open_array_section("removed_snaps_queue");
3657 for (auto& p : removed_snaps_queue) {
3658 f->open_object_section("pool");
3659 f->dump_int("pool", p.first);
3660 f->open_array_section("snaps");
3661 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3662 f->open_object_section("interval");
3663 f->dump_unsigned("begin", q.get_start());
3664 f->dump_unsigned("length", q.get_len());
3665 f->close_section();
3666 }
3667 f->close_section();
3668 f->close_section();
3669 }
3670 f->close_section();
3671 f->open_array_section("new_removed_snaps");
3672 for (auto& p : new_removed_snaps) {
3673 f->open_object_section("pool");
3674 f->dump_int("pool", p.first);
3675 f->open_array_section("snaps");
3676 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3677 f->open_object_section("interval");
3678 f->dump_unsigned("begin", q.get_start());
3679 f->dump_unsigned("length", q.get_len());
3680 f->close_section();
3681 }
3682 f->close_section();
3683 f->close_section();
3684 }
3685 f->close_section();
3686 f->open_array_section("new_purged_snaps");
3687 for (auto& p : new_purged_snaps) {
3688 f->open_object_section("pool");
3689 f->dump_int("pool", p.first);
3690 f->open_array_section("snaps");
3691 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3692 f->open_object_section("interval");
3693 f->dump_unsigned("begin", q.get_start());
3694 f->dump_unsigned("length", q.get_len());
3695 f->close_section();
3696 }
3697 f->close_section();
3698 f->close_section();
3699 }
3700 f->close_section();
3701 f->open_object_section("crush_node_flags");
3702 for (auto& i : crush_node_flags) {
3703 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3704 : stringify(i.first);
3705 f->open_array_section(s.c_str());
3706 set<string> st;
3707 calc_state_set(i.second, st);
3708 for (auto& j : st) {
3709 f->dump_string("flag", j);
3710 }
3711 f->close_section();
3712 }
3713 f->close_section();
3714 f->open_object_section("device_class_flags");
3715 for (auto& i : device_class_flags) {
3716 const char* class_name = crush->get_class_name(i.first);
3717 string s = class_name ? class_name : stringify(i.first);
3718 f->open_array_section(s.c_str());
3719 set<string> st;
3720 calc_state_set(i.second, st);
3721 for (auto& j : st) {
3722 f->dump_string("flag", j);
3723 }
3724 f->close_section();
3725 }
3726 f->close_section();
3727 f->open_object_section("stretch_mode");
3728 {
3729 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
3730 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
3731 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
3732 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
3733 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
3734 }
3735 f->close_section();
3736 }
3737
3738 void OSDMap::generate_test_instances(list<OSDMap*>& o)
3739 {
3740 o.push_back(new OSDMap);
3741
3742 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3743 o.push_back(new OSDMap);
3744 uuid_d fsid;
3745 o.back()->build_simple(cct, 1, fsid, 16);
3746 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
3747 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
3748 cct->put();
3749 }
3750
3751 string OSDMap::get_flag_string(unsigned f)
3752 {
3753 string s;
3754 if (f & CEPH_OSDMAP_PAUSERD)
3755 s += ",pauserd";
3756 if (f & CEPH_OSDMAP_PAUSEWR)
3757 s += ",pausewr";
3758 if (f & CEPH_OSDMAP_PAUSEREC)
3759 s += ",pauserec";
3760 if (f & CEPH_OSDMAP_NOUP)
3761 s += ",noup";
3762 if (f & CEPH_OSDMAP_NODOWN)
3763 s += ",nodown";
3764 if (f & CEPH_OSDMAP_NOOUT)
3765 s += ",noout";
3766 if (f & CEPH_OSDMAP_NOIN)
3767 s += ",noin";
3768 if (f & CEPH_OSDMAP_NOBACKFILL)
3769 s += ",nobackfill";
3770 if (f & CEPH_OSDMAP_NOREBALANCE)
3771 s += ",norebalance";
3772 if (f & CEPH_OSDMAP_NORECOVER)
3773 s += ",norecover";
3774 if (f & CEPH_OSDMAP_NOSCRUB)
3775 s += ",noscrub";
3776 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3777 s += ",nodeep-scrub";
3778 if (f & CEPH_OSDMAP_NOTIERAGENT)
3779 s += ",notieragent";
3780 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3781 s += ",nosnaptrim";
3782 if (f & CEPH_OSDMAP_SORTBITWISE)
3783 s += ",sortbitwise";
3784 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3785 s += ",require_jewel_osds";
3786 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3787 s += ",require_kraken_osds";
3788 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3789 s += ",require_luminous_osds";
3790 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3791 s += ",recovery_deletes";
3792 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3793 s += ",purged_snapdirs";
3794 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3795 s += ",pglog_hardlimit";
3796 if (s.length())
3797 s.erase(0, 1);
3798 return s;
3799 }
3800
3801 string OSDMap::get_flag_string() const
3802 {
3803 return get_flag_string(flags);
3804 }
3805
3806 void OSDMap::print_pools(ostream& out) const
3807 {
3808 for (const auto &pool : pools) {
3809 std::string name("<unknown>");
3810 const auto &pni = pool_name.find(pool.first);
3811 if (pni != pool_name.end())
3812 name = pni->second;
3813 out << "pool " << pool.first
3814 << " '" << name
3815 << "' " << pool.second << "\n";
3816
3817 for (const auto &snap : pool.second.snaps)
3818 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3819
3820 if (!pool.second.removed_snaps.empty())
3821 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
3822 auto p = removed_snaps_queue.find(pool.first);
3823 if (p != removed_snaps_queue.end()) {
3824 out << "\tremoved_snaps_queue " << p->second << "\n";
3825 }
3826 }
3827 out << std::endl;
3828 }
3829
3830 void OSDMap::print_osds(ostream& out) const
3831 {
3832 for (int i=0; i<get_max_osd(); i++) {
3833 if (exists(i)) {
3834 print_osd(i, out);
3835 }
3836 }
3837 }
3838 void OSDMap::print_osd(int id, ostream& out) const
3839 {
3840 if (!exists(id)) {
3841 return;
3842 }
3843
3844 out << "osd." << id;
3845 out << (is_up(id) ? " up ":" down");
3846 out << (is_in(id) ? " in ":" out");
3847 out << " weight " << get_weightf(id);
3848 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
3849 out << " primary_affinity " << get_primary_affinityf(id);
3850 }
3851 const osd_info_t& info(get_info(id));
3852 out << " " << info;
3853 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
3854 set<string> st;
3855 get_state(id, st);
3856 out << " " << st;
3857 if (!get_uuid(id).is_zero()) {
3858 out << " " << get_uuid(id);
3859 }
3860 out << "\n";
3861 }
3862
3863 void OSDMap::print(ostream& out) const
3864 {
3865 out << "epoch " << get_epoch() << "\n"
3866 << "fsid " << get_fsid() << "\n"
3867 << "created " << get_created() << "\n"
3868 << "modified " << get_modified() << "\n";
3869
3870 out << "flags " << get_flag_string() << "\n";
3871 out << "crush_version " << get_crush_version() << "\n";
3872 out << "full_ratio " << full_ratio << "\n";
3873 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3874 out << "nearfull_ratio " << nearfull_ratio << "\n";
3875 if (require_min_compat_client != ceph_release_t::unknown) {
3876 out << "require_min_compat_client "
3877 << require_min_compat_client << "\n";
3878 }
3879 out << "min_compat_client " << get_min_compat_client()
3880 << "\n";
3881 if (require_osd_release > ceph_release_t::unknown) {
3882 out << "require_osd_release " << require_osd_release
3883 << "\n";
3884 }
3885 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
3886 if (stretch_mode_enabled) {
3887 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
3888 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
3889 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
3890 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
3891 }
3892 if (get_cluster_snapshot().length())
3893 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3894 out << "\n";
3895
3896 print_pools(out);
3897
3898 out << "max_osd " << get_max_osd() << "\n";
3899 print_osds(out);
3900 out << std::endl;
3901
3902 for (auto& p : pg_upmap) {
3903 out << "pg_upmap " << p.first << " " << p.second << "\n";
3904 }
3905 for (auto& p : pg_upmap_items) {
3906 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3907 }
3908
3909 for (const auto& pg : *pg_temp)
3910 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3911
3912 for (const auto& pg : *primary_temp)
3913 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3914
3915 for (const auto &addr : blocklist)
3916 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
3917 }
3918
3919 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3920 public:
3921 typedef CrushTreeDumper::Dumper<TextTable> Parent;
3922
3923 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3924 unsigned f)
3925 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
3926
3927 bool should_dump_leaf(int i) const override {
3928 if (!filter) {
3929 return true; // normal case
3930 }
3931 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3932 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3933 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3934 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3935 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3936 return true;
3937 }
3938 return false;
3939 }
3940
3941 bool should_dump_empty_bucket() const override {
3942 return !filter;
3943 }
3944
3945 void init_table(TextTable *tbl) {
3946 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
3947 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
3948 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3949 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
3950 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
3951 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
3952 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
3953 }
3954 void dump(TextTable *tbl, string& bucket) {
3955 init_table(tbl);
3956
3957 if (!bucket.empty()) {
3958 set_root(bucket);
3959 Parent::dump(tbl);
3960 } else {
3961 Parent::dump(tbl);
3962 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3963 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3964 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3965 }
3966 }
3967 }
3968 }
3969
3970 protected:
3971 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
3972 const char *c = crush->get_item_class(qi.id);
3973 if (!c)
3974 c = "";
3975 *tbl << qi.id
3976 << c
3977 << weightf_t(qi.weight);
3978
3979 ostringstream name;
3980 for (int k = 0; k < qi.depth; k++)
3981 name << " ";
3982 if (qi.is_bucket()) {
3983 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3984 << crush->get_item_name(qi.id);
3985 } else {
3986 name << "osd." << qi.id;
3987 }
3988 *tbl << name.str();
3989
3990 if (!qi.is_bucket()) {
3991 if (!osdmap->exists(qi.id)) {
3992 *tbl << "DNE"
3993 << 0;
3994 } else {
3995 string s;
3996 if (osdmap->is_up(qi.id)) {
3997 s = "up";
3998 } else if (osdmap->is_destroyed(qi.id)) {
3999 s = "destroyed";
4000 } else {
4001 s = "down";
4002 }
4003 *tbl << s
4004 << weightf_t(osdmap->get_weightf(qi.id))
4005 << weightf_t(osdmap->get_primary_affinityf(qi.id));
4006 }
4007 }
4008 *tbl << TextTable::endrow;
4009 }
4010
4011 private:
4012 const OSDMap *osdmap;
4013 const unsigned filter;
4014 };
4015
4016 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4017 public:
4018 typedef CrushTreeDumper::FormattingDumper Parent;
4019
4020 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4021 unsigned f)
4022 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
4023
4024 bool should_dump_leaf(int i) const override {
4025 if (!filter) {
4026 return true; // normal case
4027 }
4028 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4029 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4030 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4031 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4032 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4033 return true;
4034 }
4035 return false;
4036 }
4037
4038 bool should_dump_empty_bucket() const override {
4039 return !filter;
4040 }
4041
4042 void dump(Formatter *f, string& bucket) {
4043 if (!bucket.empty()) {
4044 set_root(bucket);
4045 f->open_array_section("nodes");
4046 Parent::dump(f);
4047 f->close_section();
4048 } else {
4049 f->open_array_section("nodes");
4050 Parent::dump(f);
4051 f->close_section();
4052 f->open_array_section("stray");
4053 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4054 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4055 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4056 }
4057 f->close_section();
4058 }
4059 }
4060
4061 protected:
4062 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4063 Parent::dump_item_fields(qi, f);
4064 if (!qi.is_bucket())
4065 {
4066 string s;
4067 if (osdmap->is_up(qi.id)) {
4068 s = "up";
4069 } else if (osdmap->is_destroyed(qi.id)) {
4070 s = "destroyed";
4071 } else {
4072 s = "down";
4073 }
4074 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
4075 f->dump_string("status", s);
4076 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4077 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4078 }
4079 }
4080
4081 private:
4082 const OSDMap *osdmap;
4083 const unsigned filter;
4084 };
4085
4086 void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
4087 {
4088 if (f) {
4089 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
4090 } else {
4091 ceph_assert(out);
4092 TextTable tbl;
4093 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
4094 *out << tbl;
4095 }
4096 }
4097
4098 void OSDMap::print_summary(Formatter *f, ostream& out,
4099 const string& prefix, bool extra) const
4100 {
4101 if (f) {
4102 f->dump_int("epoch", get_epoch());
4103 f->dump_int("num_osds", get_num_osds());
4104 f->dump_int("num_up_osds", get_num_up_osds());
4105 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
4106 f->dump_int("num_in_osds", get_num_in_osds());
4107 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
4108 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
4109 } else {
4110 utime_t now = ceph_clock_now();
4111 out << get_num_osds() << " osds: "
4112 << get_num_up_osds() << " up";
4113 if (last_up_change != utime_t()) {
4114 out << " (since " << utimespan_str(now - last_up_change) << ")";
4115 }
4116 out << ", " << get_num_in_osds() << " in";
4117 if (last_in_change != utime_t()) {
4118 out << " (since " << utimespan_str(now - last_in_change) << ")";
4119 }
4120 if (extra)
4121 out << "; epoch: e" << get_epoch();
4122 if (get_num_pg_temp())
4123 out << "; " << get_num_pg_temp() << " remapped pgs";
4124 out << "\n";
4125 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4126 if (important_flags)
4127 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
4128 }
4129 }
4130
4131 void OSDMap::print_oneline_summary(ostream& out) const
4132 {
4133 out << "e" << get_epoch() << ": "
4134 << get_num_osds() << " total, "
4135 << get_num_up_osds() << " up, "
4136 << get_num_in_osds() << " in";
4137 }
4138
4139 bool OSDMap::crush_rule_in_use(int rule_id) const
4140 {
4141 for (const auto &pool : pools) {
4142 if (pool.second.crush_rule == rule_id)
4143 return true;
4144 }
4145 return false;
4146 }
4147
4148 int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4149 ostream *ss) const
4150 {
4151 for (auto& i : pools) {
4152 auto& pool = i.second;
4153 int ruleno = pool.get_crush_rule();
4154 if (!newcrush->rule_exists(ruleno)) {
4155 *ss << "pool " << i.first << " references crush_rule " << ruleno
4156 << " but it is not present";
4157 return -EINVAL;
4158 }
4159 if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
4160 *ss << "rule " << ruleno << " mask ruleset does not match rule id";
4161 return -EINVAL;
4162 }
4163 if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
4164 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4165 return -EINVAL;
4166 }
4167 int poolsize = pool.get_size();
4168 if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
4169 poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
4170 *ss << "pool " << i.first << " size " << poolsize << " does not"
4171 << " fall within rule " << ruleno
4172 << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
4173 << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
4174 return -EINVAL;
4175 }
4176 }
4177 return 0;
4178 }
4179
4180 int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4181 int nosd, int pg_bits, int pgp_bits,
4182 bool default_pool)
4183 {
4184 ldout(cct, 10) << "build_simple on " << nosd
4185 << " osds" << dendl;
4186 epoch = e;
4187 set_fsid(fsid);
4188 created = modified = ceph_clock_now();
4189
4190 if (nosd >= 0) {
4191 set_max_osd(nosd);
4192 } else {
4193 // count osds
4194 int maxosd = 0;
4195 const auto& conf = cct->_conf;
4196 vector<string> sections;
4197 conf.get_all_sections(sections);
4198
4199 for (auto &section : sections) {
4200 if (section.find("osd.") != 0)
4201 continue;
4202
4203 const char *begin = section.c_str() + 4;
4204 char *end = (char*)begin;
4205 int o = strtol(begin, &end, 10);
4206 if (*end != '\0')
4207 continue;
4208
4209 if (o > cct->_conf->mon_max_osd) {
4210 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4211 return -ERANGE;
4212 }
4213
4214 if (o > maxosd)
4215 maxosd = o;
4216 }
4217
4218 set_max_osd(maxosd + 1);
4219 }
4220
4221
4222 stringstream ss;
4223 int r;
4224 if (nosd >= 0)
4225 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4226 else
4227 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
4228 ceph_assert(r == 0);
4229
4230 int poolbase = get_max_osd() ? get_max_osd() : 1;
4231
4232 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
4233 ceph_assert(default_replicated_rule >= 0);
4234
4235 if (default_pool) {
4236 // pgp_num <= pg_num
4237 if (pgp_bits > pg_bits)
4238 pgp_bits = pg_bits;
4239
4240 vector<string> pool_names;
4241 pool_names.push_back("rbd");
4242 for (auto &plname : pool_names) {
4243 int64_t pool = ++pool_max;
4244 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4245 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4246 if (cct->_conf->osd_pool_default_flag_hashpspool)
4247 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4248 if (cct->_conf->osd_pool_default_flag_nodelete)
4249 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4250 if (cct->_conf->osd_pool_default_flag_nopgchange)
4251 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4252 if (cct->_conf->osd_pool_default_flag_nosizechange)
4253 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
4254 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4255 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4256 pools[pool].size);
4257 pools[pool].crush_rule = default_replicated_rule;
4258 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4259 pools[pool].set_pg_num(poolbase << pg_bits);
4260 pools[pool].set_pgp_num(poolbase << pgp_bits);
4261 pools[pool].set_pg_num_target(poolbase << pg_bits);
4262 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
4263 pools[pool].last_change = epoch;
4264 pools[pool].application_metadata.insert(
4265 {pg_pool_t::APPLICATION_NAME_RBD, {}});
4266 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4267 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4268 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4269 pools[pool].pg_autoscale_mode = m;
4270 } else {
4271 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4272 }
4273 pool_name[pool] = plname;
4274 name_pool[plname] = pool;
4275 }
4276 }
4277
4278 map<string,string> profile_map;
4279 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4280 if (r < 0) {
4281 lderr(cct) << ss.str() << dendl;
4282 return r;
4283 }
4284 set_erasure_code_profile("default", profile_map);
4285 return 0;
4286 }
4287
4288 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4289 map<string,string> &profile_map,
4290 ostream *ss)
4291 {
4292 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
4293 *ss,
4294 &profile_map);
4295 return r;
4296 }
4297
4298 int OSDMap::_build_crush_types(CrushWrapper& crush)
4299 {
4300 crush.set_type_name(0, "osd");
4301 crush.set_type_name(1, "host");
4302 crush.set_type_name(2, "chassis");
4303 crush.set_type_name(3, "rack");
4304 crush.set_type_name(4, "row");
4305 crush.set_type_name(5, "pdu");
4306 crush.set_type_name(6, "pod");
4307 crush.set_type_name(7, "room");
4308 crush.set_type_name(8, "datacenter");
4309 crush.set_type_name(9, "zone");
4310 crush.set_type_name(10, "region");
4311 crush.set_type_name(11, "root");
4312 return 11;
4313 }
4314
4315 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4316 int nosd, ostream *ss)
4317 {
4318 crush.create();
4319
4320 // root
4321 int root_type = _build_crush_types(crush);
4322 int rootid;
4323 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4324 root_type, 0, NULL, NULL, &rootid);
4325 ceph_assert(r == 0);
4326 crush.set_item_name(rootid, "default");
4327
4328 map<string,string> loc{
4329 {"host", "localhost"},
4330 {"rack", "localrack"},
4331 {"root", "default"}
4332 };
4333 for (int o=0; o<nosd; o++) {
4334 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4335 char name[32];
4336 snprintf(name, sizeof(name), "osd.%d", o);
4337 crush.insert_item(cct, o, 1.0, name, loc);
4338 }
4339
4340 build_simple_crush_rules(cct, crush, "default", ss);
4341
4342 crush.finalize();
4343
4344 return 0;
4345 }
4346
4347 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4348 CrushWrapper& crush,
4349 ostream *ss)
4350 {
4351 const auto& conf = cct->_conf;
4352
4353 crush.create();
4354
4355 // root
4356 int root_type = _build_crush_types(crush);
4357 int rootid;
4358 int r = crush.add_bucket(0, 0,
4359 CRUSH_HASH_DEFAULT,
4360 root_type, 0, NULL, NULL, &rootid);
4361 ceph_assert(r == 0);
4362 crush.set_item_name(rootid, "default");
4363
4364 // add osds
4365 vector<string> sections;
4366 conf.get_all_sections(sections);
4367
4368 for (auto &section : sections) {
4369 if (section.find("osd.") != 0)
4370 continue;
4371
4372 const char *begin = section.c_str() + 4;
4373 char *end = (char*)begin;
4374 int o = strtol(begin, &end, 10);
4375 if (*end != '\0')
4376 continue;
4377
4378 string host, rack, row, room, dc, pool;
4379 vector<string> sectiontmp;
4380 sectiontmp.push_back("osd");
4381 sectiontmp.push_back(section);
4382 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4383 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4384 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4385 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4386 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4387 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
4388
4389 if (host.length() == 0)
4390 host = "unknownhost";
4391 if (rack.length() == 0)
4392 rack = "unknownrack";
4393
4394 map<string,string> loc;
4395 loc["host"] = host;
4396 loc["rack"] = rack;
4397 if (row.size())
4398 loc["row"] = row;
4399 if (room.size())
4400 loc["room"] = room;
4401 if (dc.size())
4402 loc["datacenter"] = dc;
4403 loc["root"] = "default";
4404
4405 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4406 crush.insert_item(cct, o, 1.0, section, loc);
4407 }
4408
4409 build_simple_crush_rules(cct, crush, "default", ss);
4410
4411 crush.finalize();
4412
4413 return 0;
4414 }
4415
4416
4417 int OSDMap::build_simple_crush_rules(
4418 CephContext *cct,
4419 CrushWrapper& crush,
4420 const string& root,
4421 ostream *ss)
4422 {
4423 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
4424 string failure_domain =
4425 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4426
4427 int r;
4428 r = crush.add_simple_rule_at(
4429 "replicated_rule", root, failure_domain, "",
4430 "firstn", pg_pool_t::TYPE_REPLICATED,
4431 crush_rule, ss);
4432 if (r < 0)
4433 return r;
4434 // do not add an erasure rule by default or else we will implicitly
4435 // require the crush_v2 feature of clients
4436 return 0;
4437 }
4438
4439 int OSDMap::summarize_mapping_stats(
4440 OSDMap *newmap,
4441 const set<int64_t> *pools,
4442 std::string *out,
4443 Formatter *f) const
4444 {
4445 set<int64_t> ls;
4446 if (pools) {
4447 ls = *pools;
4448 } else {
4449 for (auto &p : get_pools())
4450 ls.insert(p.first);
4451 }
4452
4453 unsigned total_pg = 0;
4454 unsigned moved_pg = 0;
4455 vector<unsigned> base_by_osd(get_max_osd(), 0);
4456 vector<unsigned> new_by_osd(get_max_osd(), 0);
4457 for (int64_t pool_id : ls) {
4458 const pg_pool_t *pi = get_pg_pool(pool_id);
4459 vector<int> up, up2;
4460 int up_primary;
4461 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
4462 pg_t pgid(ps, pool_id);
4463 total_pg += pi->get_size();
4464 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
4465 for (int osd : up) {
4466 if (osd >= 0 && osd < get_max_osd())
4467 ++base_by_osd[osd];
4468 }
4469 if (newmap) {
4470 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
4471 for (int osd : up2) {
4472 if (osd >= 0 && osd < get_max_osd())
4473 ++new_by_osd[osd];
4474 }
4475 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4476 for (unsigned i=0; i<up.size(); ++i) {
4477 if (up[i] != up2[i]) {
4478 ++moved_pg;
4479 }
4480 }
4481 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4482 for (int osd : up) {
4483 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4484 ++moved_pg;
4485 }
4486 }
4487 } else {
4488 ceph_abort_msg("unhandled pool type");
4489 }
4490 }
4491 }
4492 }
4493
4494 unsigned num_up_in = 0;
4495 for (int osd = 0; osd < get_max_osd(); ++osd) {
4496 if (is_up(osd) && is_in(osd))
4497 ++num_up_in;
4498 }
4499 if (!num_up_in) {
4500 return -EINVAL;
4501 }
4502
4503 float avg_pg = (float)total_pg / (float)num_up_in;
4504 float base_stddev = 0, new_stddev = 0;
4505 int min = -1, max = -1;
4506 unsigned min_base_pg = 0, max_base_pg = 0;
4507 unsigned min_new_pg = 0, max_new_pg = 0;
4508 for (int osd = 0; osd < get_max_osd(); ++osd) {
4509 if (is_up(osd) && is_in(osd)) {
4510 float base_diff = (float)base_by_osd[osd] - avg_pg;
4511 base_stddev += base_diff * base_diff;
4512 float new_diff = (float)new_by_osd[osd] - avg_pg;
4513 new_stddev += new_diff * new_diff;
4514 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4515 min = osd;
4516 min_base_pg = base_by_osd[osd];
4517 min_new_pg = new_by_osd[osd];
4518 }
4519 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4520 max = osd;
4521 max_base_pg = base_by_osd[osd];
4522 max_new_pg = new_by_osd[osd];
4523 }
4524 }
4525 }
4526 base_stddev = sqrt(base_stddev / num_up_in);
4527 new_stddev = sqrt(new_stddev / num_up_in);
4528
4529 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4530
4531 ostringstream ss;
4532 if (f)
4533 f->open_object_section("utilization");
4534 if (newmap) {
4535 if (f) {
4536 f->dump_unsigned("moved_pgs", moved_pg);
4537 f->dump_unsigned("total_pgs", total_pg);
4538 } else {
4539 float percent = 0;
4540 if (total_pg)
4541 percent = (float)moved_pg * 100.0 / (float)total_pg;
4542 ss << "moved " << moved_pg << " / " << total_pg
4543 << " (" << percent << "%)\n";
4544 }
4545 }
4546 if (f) {
4547 f->dump_float("avg_pgs", avg_pg);
4548 f->dump_float("std_dev", base_stddev);
4549 f->dump_float("expected_baseline_std_dev", edev);
4550 if (newmap)
4551 f->dump_float("new_std_dev", new_stddev);
4552 } else {
4553 ss << "avg " << avg_pg << "\n";
4554 ss << "stddev " << base_stddev;
4555 if (newmap)
4556 ss << " -> " << new_stddev;
4557 ss << " (expected baseline " << edev << ")\n";
4558 }
4559 if (min >= 0) {
4560 if (f) {
4561 f->dump_unsigned("min_osd", min);
4562 f->dump_unsigned("min_osd_pgs", min_base_pg);
4563 if (newmap)
4564 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4565 } else {
4566 ss << "min osd." << min << " with " << min_base_pg;
4567 if (newmap)
4568 ss << " -> " << min_new_pg;
4569 ss << " pgs (" << (float)min_base_pg / avg_pg;
4570 if (newmap)
4571 ss << " -> " << (float)min_new_pg / avg_pg;
4572 ss << " * mean)\n";
4573 }
4574 }
4575 if (max >= 0) {
4576 if (f) {
4577 f->dump_unsigned("max_osd", max);
4578 f->dump_unsigned("max_osd_pgs", max_base_pg);
4579 if (newmap)
4580 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4581 } else {
4582 ss << "max osd." << max << " with " << max_base_pg;
4583 if (newmap)
4584 ss << " -> " << max_new_pg;
4585 ss << " pgs (" << (float)max_base_pg / avg_pg;
4586 if (newmap)
4587 ss << " -> " << (float)max_new_pg / avg_pg;
4588 ss << " * mean)\n";
4589 }
4590 }
4591 if (f)
4592 f->close_section();
4593 if (out)
4594 *out = ss.str();
4595 return 0;
4596 }
4597
4598 bool OSDMap::try_pg_upmap(
4599 CephContext *cct,
4600 pg_t pg, ///< pg to potentially remap
4601 const set<int>& overfull, ///< osds we'd want to evacuate
4602 const vector<int>& underfull, ///< osds to move to, in order of preference
4603 const vector<int>& more_underfull, ///< more osds only slightly underfull
4604 vector<int> *orig,
4605 vector<int> *out) ///< resulting alternative mapping
4606 {
4607 const pg_pool_t *pool = get_pg_pool(pg.pool());
4608 if (!pool)
4609 return false;
4610 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
4611 pool->get_size());
4612 if (rule < 0)
4613 return false;
4614
4615 // make sure there is something there to remap
4616 bool any = false;
4617 for (auto osd : *orig) {
4618 if (overfull.count(osd)) {
4619 any = true;
4620 break;
4621 }
4622 }
4623 if (!any) {
4624 return false;
4625 }
4626
4627 int r = crush->try_remap_rule(
4628 cct,
4629 rule,
4630 pool->get_size(),
4631 overfull, underfull,
4632 more_underfull,
4633 *orig,
4634 out);
4635 if (r < 0)
4636 return false;
4637 if (*out == *orig)
4638 return false;
4639 return true;
4640 }
4641
/**
 * Compute pg_upmap_items changes that even out the number of PGs per OSD.
 *
 * The search runs on a private copy of this map (tmp).  Each pass of the
 * outer loop picks one candidate change -- dropping an existing
 * pg_upmap_items entry (or a single pair of it), or appending one new
 * remapping pair obtained from try_pg_upmap() -- recomputes the per-OSD
 * deviations, and keeps the change only if the sum of squared deviations
 * decreased.  Accepted changes are applied to tmp and recorded in
 * @p pending_inc.
 *
 * @param cct            context for logging and for the options
 *                       osd_calc_pg_upmaps_aggressively and
 *                       osd_calc_pg_upmaps_local_fallback_retries
 * @param max_deviation  tolerated |pg count - target| per OSD; values
 *                       below 1 are raised to 1
 * @param max            bound on the number of outer-loop iterations;
 *                       nothing is attempted when it is <= 0
 * @param only_pools     pools to consider; empty means all pools
 * @param pending_inc    receives old_pg_upmap_items / new_pg_upmap_items
 * @return number of pg_upmap_items entries removed or written
 */
int OSDMap::calc_pg_upmaps(
  CephContext *cct,
  uint32_t max_deviation,
  int max,
  const set<int64_t>& only_pools,
  OSDMap::Incremental *pending_inc)
{
  ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
  OSDMap tmp;
  // Can't be less than 1 pg
  if (max_deviation < 1)
    max_deviation = 1;
  tmp.deepish_copy_from(*this);
  int num_changed = 0;
  map<int,set<pg_t>> pgs_by_osd;
  int total_pgs = 0;
  float osd_weight_total = 0;
  map<int,float> osd_weight;
  // Pass 1: record which PGs each OSD currently holds (up set) and
  // accumulate each OSD's weight across the selected pools.
  for (auto& i : pools) {
    if (!only_pools.empty() && !only_pools.count(i.first))
      continue;
    for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
      pg_t pg(ps, i.first);
      vector<int> up;
      tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
      ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
      for (auto osd : up) {
        if (osd != CRUSH_ITEM_NONE)
          pgs_by_osd[osd].insert(pg);
      }
    }
    // counts replica slots: pool size times pg_num
    total_pgs += i.second.get_size() * i.second.get_pg_num();

    map<int,float> pmap;
    int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
                                      i.second.get_type(),
                                      i.second.get_size());
    tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
    ldout(cct,20) << __func__ << " pool " << i.first
                  << " ruleno " << ruleno
                  << " weight-map " << pmap
                  << dendl;
    for (auto p : pmap) {
      // scale the rule weight by the osd's reweight; zero-weight osds
      // are left out of osd_weight entirely
      auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
      if (adjusted_weight == 0) {
        continue;
      }
      osd_weight[p.first] += adjusted_weight;
      osd_weight_total += adjusted_weight;
    }
  }
  // make sure every weighted osd has an entry in pgs_by_osd, even if it
  // currently holds no pg
  for (auto& i : osd_weight) {
    int pgs = 0;
    auto p = pgs_by_osd.find(i.first);
    if (p != pgs_by_osd.end())
      pgs = p->second.size();
    else
      pgs_by_osd.emplace(i.first, set<pg_t>());
    ldout(cct, 20) << " osd." << i.first << " weight " << i.second
                   << " pgs " << pgs << dendl;
  }
  if (osd_weight_total == 0) {
    lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
    return 0;
  }
  float pgs_per_weight = total_pgs / osd_weight_total;
  ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
  ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;

  if (max <= 0) {
    lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
    return 0;
  }
  // NOTE: despite its name, stddev holds the plain sum of squared
  // deviations; it is never divided or square-rooted in this function.
  float stddev = 0;
  map<int,float> osd_deviation;       // osd, deviation(pgs)
  multimap<float,int> deviation_osd;  // deviation(pgs), osd
  float cur_max_deviation = 0;
  for (auto& i : pgs_by_osd) {
    // make sure osd is still there (belongs to this crush-tree)
    ceph_assert(osd_weight.count(i.first));
    float target = osd_weight[i.first] * pgs_per_weight;
    float deviation = (float)i.second.size() - target;
    ldout(cct, 20) << " osd." << i.first
                   << "\tpgs " << i.second.size()
                   << "\ttarget " << target
                   << "\tdeviation " << deviation
                   << dendl;
    osd_deviation[i.first] = deviation;
    deviation_osd.insert(make_pair(deviation, i.first));
    stddev += deviation * deviation;
    if (fabsf(deviation) > cur_max_deviation)
      cur_max_deviation = fabsf(deviation);
  }
  ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
  if (cur_max_deviation <= max_deviation) {
    ldout(cct, 10) << __func__ << " distribution is almost perfect"
                   << dendl;
    return 0;
  }
  // skip_overfull persists across outer iterations; it is toggled below
  // once local_fallback_retries is hit
  bool skip_overfull = false;
  auto aggressive =
    cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
  auto local_fallback_retries =
    cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
  while (max--) {
    ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
    // build overfull and underfull
    set<int> overfull;
    set<int> more_overfull;
    bool using_more_overfull = false;
    vector<int> underfull;
    vector<int> more_underfull;
    // walk from the largest deviation downwards; stop at the first
    // non-positive one
    for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
      ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
      if (i->first <= 0)
        break;
      if (i->first > max_deviation) {
        ldout(cct, 30) << " add overfull osd." << i->second << dendl;
        overfull.insert(i->second);
      } else {
        more_overfull.insert(i->second);
      }
    }

    // walk from the smallest deviation upwards; stop at the first
    // non-negative one
    for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
      ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
      if (i->first >= 0)
        break;
      if (i->first < -(int)max_deviation) {
        ldout(cct, 30) << " add underfull osd." << i->second << dendl;
        underfull.push_back(i->second);
      } else {
        more_underfull.push_back(i->second);
      }
    }
    if (underfull.empty() && overfull.empty()) {
      ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
      break;
    }
    if (overfull.empty() && !underfull.empty()) {
      ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
      overfull = more_overfull;
      using_more_overfull = true;
    }

    ldout(cct, 10) << " overfull " << overfull
                   << " underfull " << underfull
                   << dendl;
    // pgs whose candidate change was already rejected in this outer
    // iteration; grown before each "goto retry"
    set<pg_t> to_skip;
    uint64_t local_fallback_retried = 0;

  retry:

    // a retry starts from a clean candidate set and a fresh copy of the
    // accepted per-osd pg sets
    set<pg_t> to_unmap;
    map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
    auto temp_pgs_by_osd = pgs_by_osd;
    // always start with fullest, break if we find any changes to make
    for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
      if (skip_overfull && !underfull.empty()) {
        ldout(cct, 10) << " skipping overfull " << dendl;
        break; // fall through to check underfull
      }
      int osd = p->second;
      float deviation = p->first;
      if (deviation < 0) {
        ldout(cct, 10) << " hitting underfull osds now"
                       << " when trying to remap overfull osds"
                       << dendl;
        break;
      }
      float target = osd_weight[osd] * pgs_per_weight;
      ldout(cct, 10) << " Overfull search osd." << osd
                     << " target " << target
                     << " deviation " << deviation
                     << dendl;
      ceph_assert(target > 0);
      if (!using_more_overfull && deviation <= max_deviation) {
        ldout(cct, 10) << " osd." << osd
                       << " target " << target
                       << " deviation " << deviation
                       << " < max deviation " << max_deviation
                       << dendl;
        break;
      }

      vector<pg_t> pgs;
      pgs.reserve(pgs_by_osd[osd].size());
      for (auto& pg : pgs_by_osd[osd]) {
        if (to_skip.count(pg))
          continue;
        pgs.push_back(pg);
      }
      if (aggressive) {
        // shuffle PG list so they all get equal (in)attention
        std::random_device rd;
        std::default_random_engine rng{rd()};
        std::shuffle(pgs.begin(), pgs.end(), rng);
      }
      // look for remaps we can un-remap
      for (auto pg : pgs) {
        auto p = tmp.pg_upmap_items.find(pg);
        if (p == tmp.pg_upmap_items.end())
          continue;
        mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
        for (auto q : p->second) {
          if (q.second == osd) {
            ldout(cct, 10) << " will try dropping existing"
                           << " remapping pair "
                           << q.first << " -> " << q.second
                           << " which remapped " << pg
                           << " into overfull osd." << osd
                           << dendl;
            temp_pgs_by_osd[q.second].erase(pg);
            temp_pgs_by_osd[q.first].insert(pg);
          } else {
            new_upmap_items.push_back(q);
          }
        }
        if (new_upmap_items.empty()) {
          // drop whole item
          ldout(cct, 10) << " existing pg_upmap_items " << p->second
                         << " remapped " << pg << " into overfull osd." << osd
                         << ", will try cancelling it entirely"
                         << dendl;
          to_unmap.insert(pg);
          goto test_change;
        } else if (new_upmap_items.size() != p->second.size()) {
          // drop single remapping pair, updating
          ceph_assert(new_upmap_items.size() < p->second.size());
          ldout(cct, 10) << " existing pg_upmap_items " << p->second
                         << " remapped " << pg << " into overfull osd." << osd
                         << ", new_pg_upmap_items now " << new_upmap_items
                         << dendl;
          to_upmap[pg] = new_upmap_items;
          goto test_change;
        }
      }

      // try upmap
      for (auto pg : pgs) {
        auto temp_it = tmp.pg_upmap.find(pg);
        if (temp_it != tmp.pg_upmap.end()) {
          // leave pg_upmap alone
          // it must be specified by admin since balancer does not
          // support pg_upmap yet
          ldout(cct, 10) << " " << pg << " already has pg_upmap "
                         << temp_it->second << ", skipping"
                         << dendl;
          continue;
        }
        auto pg_pool_size = tmp.get_pg_pool_size(pg);
        mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
        set<int> existing;
        auto it = tmp.pg_upmap_items.find(pg);
        if (it != tmp.pg_upmap_items.end() &&
            it->second.size() >= (size_t)pg_pool_size) {
          ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
                         << it->second << ", skipping"
                         << dendl;
          continue;
        } else if (it != tmp.pg_upmap_items.end()) {
          ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
                         << it->second
                         << dendl;
          new_upmap_items = it->second;
          // build existing too (for dedup)
          for (auto i : it->second) {
            existing.insert(i.first);
            existing.insert(i.second);
          }
          // fall through
          // to see if we can append more remapping pairs
        }
        ldout(cct, 10) << " trying " << pg << dendl;
        vector<int> raw, orig, out;
        tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
        if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
          continue;
        }
        ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
        if (orig.size() != out.size()) {
          continue;
        }
        ceph_assert(orig != out);
        // among the positions that changed, pick the one whose source osd
        // has the largest positive deviation
        int pos = -1;
        float max_dev = 0;
        for (unsigned i = 0; i < out.size(); ++i) {
          if (orig[i] == out[i])
            continue; // skip invalid remappings
          if (existing.count(orig[i]) || existing.count(out[i]))
            continue; // we want new remappings only!
          if (osd_deviation[orig[i]] > max_dev) {
            max_dev = osd_deviation[orig[i]];
            pos = i;
            ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
          }
        }
        if (pos != -1) {
          int i = pos;
          ldout(cct, 10) << " will try adding new remapping pair "
                         << orig[i] << " -> " << out[i] << " for " << pg
                         << (orig[i] != osd ? " NOT selected osd" : "")
                         << dendl;
          existing.insert(orig[i]);
          existing.insert(out[i]);
          temp_pgs_by_osd[orig[i]].erase(pg);
          temp_pgs_by_osd[out[i]].insert(pg);
          ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
          new_upmap_items.push_back(make_pair(orig[i], out[i]));
          // append new remapping pairs slowly
          // This way we can make sure that each tiny change will
          // definitely make distribution of PGs converging to
          // the perfect status.
          to_upmap[pg] = new_upmap_items;
          goto test_change;
        }
      }
    }

    ceph_assert(!(to_unmap.size() || to_upmap.size()));
    ldout(cct, 10) << " failed to find any changes for overfull osds"
                   << dendl;
    // deviation_osd is ordered by ascending deviation, so this visits the
    // most underfull osds first and stops at the first osd that is not in
    // the underfull list
    for (auto& p : deviation_osd) {
      if (std::find(underfull.begin(), underfull.end(), p.second) ==
                    underfull.end())
        break;
      int osd = p.second;
      float deviation = p.first;
      float target = osd_weight[osd] * pgs_per_weight;
      ceph_assert(target > 0);
      if (fabsf(deviation) < max_deviation) {
        // respect max_deviation too
        ldout(cct, 10) << " osd." << osd
                       << " target " << target
                       << " deviation " << deviation
                       << " -> absolute " << fabsf(deviation)
                       << " < max " << max_deviation
                       << dendl;
        break;
      }
      // look for remaps we can un-remap
      vector<pair<pg_t,
        mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
      candidates.reserve(tmp.pg_upmap_items.size());
      for (auto& i : tmp.pg_upmap_items) {
        if (to_skip.count(i.first))
          continue;
        if (!only_pools.empty() && !only_pools.count(i.first.pool()))
          continue;
        candidates.push_back(make_pair(i.first, i.second));
      }
      if (aggressive) {
        // shuffle candidates so they all get equal (in)attention
        std::random_device rd;
        std::default_random_engine rng{rd()};
        std::shuffle(candidates.begin(), candidates.end(), rng);
      }
      for (auto& i : candidates) {
        auto pg = i.first;
        mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
        for (auto& j : i.second) {
          if (j.first == osd) {
            ldout(cct, 10) << " will try dropping existing"
                           << " remapping pair "
                           << j.first << " -> " << j.second
                           << " which remapped " << pg
                           << " out from underfull osd." << osd
                           << dendl;
            temp_pgs_by_osd[j.second].erase(pg);
            temp_pgs_by_osd[j.first].insert(pg);
          } else {
            new_upmap_items.push_back(j);
          }
        }
        if (new_upmap_items.empty()) {
          // drop whole item
          ldout(cct, 10) << " existing pg_upmap_items " << i.second
                         << " remapped " << pg
                         << " out from underfull osd." << osd
                         << ", will try cancelling it entirely"
                         << dendl;
          to_unmap.insert(pg);
          goto test_change;
        } else if (new_upmap_items.size() != i.second.size()) {
          // drop single remapping pair, updating
          ceph_assert(new_upmap_items.size() < i.second.size());
          ldout(cct, 10) << " existing pg_upmap_items " << i.second
                         << " remapped " << pg
                         << " out from underfull osd." << osd
                         << ", new_pg_upmap_items now " << new_upmap_items
                         << dendl;
          to_upmap[pg] = new_upmap_items;
          goto test_change;
        }
      }
    }

    ceph_assert(!(to_unmap.size() || to_upmap.size()));
    ldout(cct, 10) << " failed to find any changes for underfull osds"
                   << dendl;
    if (!aggressive) {
      ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
      break;
    } else if (!skip_overfull) {
      // safe to quit because below here we know
      // we've done checking both overfull and underfull osds..
      ldout(cct, 10) << " break due to not being able to find any"
                     << " further optimizations"
                     << dendl;
      break;
    }
    // restart with fullest and do exhaustive searching
    skip_overfull = false;
    continue;

  test_change:

    // test change, apply if change is good
    ceph_assert(to_unmap.size() || to_upmap.size());
    float new_stddev = 0;
    map<int,float> temp_osd_deviation;
    multimap<float,int> temp_deviation_osd;
    // shadows the function-level cur_max_deviation for this evaluation
    float cur_max_deviation = 0;
    for (auto& i : temp_pgs_by_osd) {
      // make sure osd is still there (belongs to this crush-tree)
      ceph_assert(osd_weight.count(i.first));
      float target = osd_weight[i.first] * pgs_per_weight;
      float deviation = (float)i.second.size() - target;
      ldout(cct, 20) << " osd." << i.first
                     << "\tpgs " << i.second.size()
                     << "\ttarget " << target
                     << "\tdeviation " << deviation
                     << dendl;
      temp_osd_deviation[i.first] = deviation;
      temp_deviation_osd.insert(make_pair(deviation, i.first));
      new_stddev += deviation * deviation;
      if (fabsf(deviation) > cur_max_deviation)
        cur_max_deviation = fabsf(deviation);
    }
    ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
    if (new_stddev >= stddev) {
      // the candidate did not improve things: give up (non-aggressive),
      // flip the search side once the retry budget is used, or blacklist
      // the pg and retry
      if (!aggressive) {
        ldout(cct, 10) << " break because stddev is not decreasing"
                       << " and aggressive mode is not enabled"
                       << dendl;
        break;
      }
      local_fallback_retried++;
      if (local_fallback_retried >= local_fallback_retries) {
        // does not make progress
        // flip *skip_overfull* so both overfull and underfull
        // get equal (in)attention
        skip_overfull = !skip_overfull;
        ldout(cct, 10) << " hit local_fallback_retries "
                       << local_fallback_retries
                       << dendl;
        continue;
      }
      for (auto& i : to_unmap)
        to_skip.insert(i);
      for (auto& i : to_upmap)
        to_skip.insert(i.first);
      ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
                     << " to_skip " << to_skip
                     << dendl;
      goto retry;
    }

    // ready to go
    ceph_assert(new_stddev < stddev);
    stddev = new_stddev;
    pgs_by_osd = temp_pgs_by_osd;
    osd_deviation = temp_osd_deviation;
    deviation_osd = temp_deviation_osd;
    for (auto& i : to_unmap) {
      ldout(cct, 10) << " unmap pg " << i << dendl;
      ceph_assert(tmp.pg_upmap_items.count(i));
      tmp.pg_upmap_items.erase(i);
      pending_inc->old_pg_upmap_items.insert(i);
      ++num_changed;
    }
    for (auto& i : to_upmap) {
      ldout(cct, 10) << " upmap pg " << i.first
                     << " new pg_upmap_items " << i.second
                     << dendl;
      tmp.pg_upmap_items[i.first] = i.second;
      pending_inc->new_pg_upmap_items[i.first] = i.second;
      ++num_changed;
    }
    ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
    if (cur_max_deviation <= max_deviation) {
      ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
                     << dendl;
      break;
    }
  }
  ldout(cct, 10) << " num_changed = " << num_changed << dendl;
  return num_changed;
}
5141
5142 int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
5143 {
5144 return crush->get_leaves(name, osds);
5145 }
5146
5147 // get pools whose crush rules might reference the given osd
5148 void OSDMap::get_pool_ids_by_osd(CephContext *cct,
5149 int osd,
5150 set<int64_t> *pool_ids) const
5151 {
5152 ceph_assert(pool_ids);
5153 set<int> raw_rules;
5154 int r = crush->get_rules_by_osd(osd, &raw_rules);
5155 if (r < 0) {
5156 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
5157 << dendl;
5158 ceph_assert(r >= 0);
5159 }
5160 set<int> rules;
5161 for (auto &i: raw_rules) {
5162 // exclude any dead rule
5163 if (crush_rule_in_use(i)) {
5164 rules.insert(i);
5165 }
5166 }
5167 for (auto &r: rules) {
5168 get_pool_ids_by_rule(r, pool_ids);
5169 }
5170 }
5171
5172 template <typename F>
5173 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
5174 public:
5175 typedef CrushTreeDumper::Dumper<F> Parent;
5176
5177 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
5178 const PGMap& pgmap_, bool tree_,
5179 const string& filter) :
5180 Parent(crush, osdmap_->get_pool_names()),
5181 osdmap(osdmap_),
5182 pgmap(pgmap_),
5183 tree(tree_),
5184 min_var(-1),
5185 max_var(-1),
5186 stddev(0),
5187 sum(0) {
5188 if (osdmap->crush->name_exists(filter)) {
5189 // filter by crush node
5190 auto item_id = osdmap->crush->get_item_id(filter);
5191 allowed.insert(item_id);
5192 osdmap->crush->get_all_children(item_id, &allowed);
5193 } else if (osdmap->crush->class_exists(filter)) {
5194 // filter by device class
5195 class_id = osdmap->crush->get_class_id(filter);
5196 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
5197 pool_id >= 0) {
5198 // filter by pool
5199 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
5200 set<int> roots;
5201 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
5202 allowed = roots;
5203 for (auto r : roots)
5204 osdmap->crush->get_all_children(r, &allowed);
5205 }
5206 average_util = average_utilization();
5207 }
5208
5209 protected:
5210
5211 bool should_dump(int id) const {
5212 if (!allowed.empty() && !allowed.count(id)) // filter by name
5213 return false;
5214 if (id >= 0 && class_id >= 0) {
5215 auto item_class_id = osdmap->crush->get_item_class_id(id);
5216 if (item_class_id < 0 || // not bound to a class yet
5217 item_class_id != class_id) // or already bound to a different class
5218 return false;
5219 }
5220 return true;
5221 }
5222
5223 set<int> get_dumped_osds() {
5224 if (allowed.empty() && class_id < 0) {
5225 // old way, all
5226 return {};
5227 }
5228 return dumped_osds;
5229 }
5230
5231 void dump_stray(F *f) {
5232 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5233 if (osdmap->exists(i) && !this->is_touched(i))
5234 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
5235 }
5236 }
5237
5238 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
5239 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
5240 return;
5241 if (!should_dump(qi.id))
5242 return;
5243
5244 if (!qi.is_bucket())
5245 dumped_osds.insert(qi.id);
5246 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
5247 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5248 kb_used_meta = 0, kb_avail = 0;
5249 double util = 0;
5250 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5251 &kb_used_omap, &kb_used_meta, &kb_avail))
5252 if (kb_used && kb)
5253 util = 100.0 * (double)kb_used / (double)kb;
5254
5255 double var = 1.0;
5256 if (average_util)
5257 var = util / average_util;
5258
5259 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
5260
5261 dump_item(qi, reweight, kb, kb_used,
5262 kb_used_data, kb_used_omap, kb_used_meta,
5263 kb_avail, util, var, num_pgs, f);
5264
5265 if (!qi.is_bucket() && reweight > 0) {
5266 if (min_var < 0 || var < min_var)
5267 min_var = var;
5268 if (max_var < 0 || var > max_var)
5269 max_var = var;
5270
5271 double dev = util - average_util;
5272 dev *= dev;
5273 stddev += reweight * dev;
5274 sum += reweight;
5275 }
5276 }
5277
5278 virtual void dump_item(const CrushTreeDumper::Item &qi,
5279 float &reweight,
5280 int64_t kb,
5281 int64_t kb_used,
5282 int64_t kb_used_data,
5283 int64_t kb_used_omap,
5284 int64_t kb_used_meta,
5285 int64_t kb_avail,
5286 double& util,
5287 double& var,
5288 const size_t num_pgs,
5289 F *f) = 0;
5290
5291 double dev() {
5292 return sum > 0 ? sqrt(stddev / sum) : 0;
5293 }
5294
5295 double average_utilization() {
5296 int64_t kb = 0, kb_used = 0;
5297 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5298 if (!osdmap->exists(i) ||
5299 osdmap->get_weight(i) == 0 ||
5300 !should_dump(i))
5301 continue;
5302 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5303 kb_avail_i;
5304 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5305 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
5306 kb += kb_i;
5307 kb_used += kb_used_i;
5308 }
5309 }
5310 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5311 }
5312
5313 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
5314 int64_t* kb_used_data,
5315 int64_t* kb_used_omap,
5316 int64_t* kb_used_meta,
5317 int64_t* kb_avail) const {
5318 const osd_stat_t *p = pgmap.get_osd_stat(id);
5319 if (!p) return false;
5320 *kb = p->statfs.kb();
5321 *kb_used = p->statfs.kb_used_raw();
5322 *kb_used_data = p->statfs.kb_used_data();
5323 *kb_used_omap = p->statfs.kb_used_omap();
5324 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5325 *kb_avail = p->statfs.kb_avail();
5326
5327 return true;
5328 }
5329
5330 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
5331 int64_t* kb_used_data,
5332 int64_t* kb_used_omap,
5333 int64_t* kb_used_meta,
5334 int64_t* kb_avail) const {
5335 if (id >= 0) {
5336 if (osdmap->is_out(id) || !should_dump(id)) {
5337 *kb = 0;
5338 *kb_used = 0;
5339 *kb_used_data = 0;
5340 *kb_used_omap = 0;
5341 *kb_used_meta = 0;
5342 *kb_avail = 0;
5343 return true;
5344 }
5345 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5346 kb_used_omap, kb_used_meta, kb_avail);
5347 }
5348
5349 *kb = 0;
5350 *kb_used = 0;
5351 *kb_used_data = 0;
5352 *kb_used_omap = 0;
5353 *kb_used_meta = 0;
5354 *kb_avail = 0;
5355
5356 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5357 int item = osdmap->crush->get_bucket_item(id, k);
5358 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5359 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5360 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5361 &kb_used_data_i, &kb_used_omap_i,
5362 &kb_used_meta_i, &kb_avail_i))
5363 return false;
5364 *kb += kb_i;
5365 *kb_used += kb_used_i;
5366 *kb_used_data += kb_used_data_i;
5367 *kb_used_omap += kb_used_omap_i;
5368 *kb_used_meta += kb_used_meta_i;
5369 *kb_avail += kb_avail_i;
5370 }
5371 return true;
5372 }
5373
5374 protected:
5375 const OSDMap *osdmap;
5376 const PGMap& pgmap;
5377 bool tree;
5378 double average_util;
5379 double min_var;
5380 double max_var;
5381 double stddev;
5382 double sum;
5383 int class_id = -1;
5384 set<int> allowed;
5385 set<int> dumped_osds;
5386 };
5387
5388
5389 class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5390 public:
5391 typedef OSDUtilizationDumper<TextTable> Parent;
5392
5393 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
5394 const PGMap& pgmap, bool tree,
5395 const string& filter) :
5396 Parent(crush, osdmap, pgmap, tree, filter) {}
5397
5398 void dump(TextTable *tbl) {
5399 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
5400 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
5401 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5402 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5403 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
5404 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5405 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5406 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5407 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
5408 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5409 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5410 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5411 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
5412 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
5413 if (tree)
5414 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5415
5416 Parent::dump(tbl);
5417
5418 dump_stray(tbl);
5419
5420 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5421 *tbl << ""
5422 << ""
5423 << "" << "TOTAL"
5424 << byte_u_t(sum.statfs.total)
5425 << byte_u_t(sum.statfs.get_used_raw())
5426 << byte_u_t(sum.statfs.allocated)
5427 << byte_u_t(sum.statfs.omap_allocated)
5428 << byte_u_t(sum.statfs.internal_metadata)
5429 << byte_u_t(sum.statfs.available)
5430 << lowprecision_t(average_util)
5431 << ""
5432 << TextTable::endrow;
5433 }
5434
5435 protected:
5436 struct lowprecision_t {
5437 float v;
5438 explicit lowprecision_t(float _v) : v(_v) {}
5439 };
5440 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5441
5442 using OSDUtilizationDumper<TextTable>::dump_item;
5443 void dump_item(const CrushTreeDumper::Item &qi,
5444 float &reweight,
5445 int64_t kb,
5446 int64_t kb_used,
5447 int64_t kb_used_data,
5448 int64_t kb_used_omap,
5449 int64_t kb_used_meta,
5450 int64_t kb_avail,
5451 double& util,
5452 double& var,
5453 const size_t num_pgs,
5454 TextTable *tbl) override {
5455 const char *c = crush->get_item_class(qi.id);
5456 if (!c)
5457 c = "";
5458 *tbl << qi.id
5459 << c
5460 << weightf_t(qi.weight)
5461 << weightf_t(reweight)
5462 << byte_u_t(kb << 10)
5463 << byte_u_t(kb_used << 10)
5464 << byte_u_t(kb_used_data << 10)
5465 << byte_u_t(kb_used_omap << 10)
5466 << byte_u_t(kb_used_meta << 10)
5467 << byte_u_t(kb_avail << 10)
5468 << lowprecision_t(util)
5469 << lowprecision_t(var);
5470
5471 if (qi.is_bucket()) {
5472 *tbl << "-";
5473 *tbl << "";
5474 } else {
5475 *tbl << num_pgs;
5476 if (osdmap->is_up(qi.id)) {
5477 *tbl << "up";
5478 } else if (osdmap->is_destroyed(qi.id)) {
5479 *tbl << "destroyed";
5480 } else {
5481 *tbl << "down";
5482 }
5483 }
5484
5485 if (tree) {
5486 ostringstream name;
5487 for (int k = 0; k < qi.depth; k++)
5488 name << " ";
5489 if (qi.is_bucket()) {
5490 int type = crush->get_bucket_type(qi.id);
5491 name << crush->get_type_name(type) << " "
5492 << crush->get_item_name(qi.id);
5493 } else {
5494 name << "osd." << qi.id;
5495 }
5496 *tbl << name.str();
5497 }
5498
5499 *tbl << TextTable::endrow;
5500 }
5501
5502 public:
5503 string summary() {
5504 ostringstream out;
5505 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5506 << "/" << lowprecision_t(max_var) << " "
5507 << "STDDEV: " << lowprecision_t(dev());
5508 return out.str();
5509 }
5510 };
5511
5512 ostream& operator<<(ostream& out,
5513 const OSDUtilizationPlainDumper::lowprecision_t& v)
5514 {
5515 if (v.v < -0.01) {
5516 return out << "-";
5517 } else if (v.v < 0.001) {
5518 return out << "0";
5519 } else {
5520 std::streamsize p = out.precision();
5521 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5522 }
5523 }
5524
5525 class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5526 public:
5527 typedef OSDUtilizationDumper<Formatter> Parent;
5528
5529 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
5530 const PGMap& pgmap, bool tree,
5531 const string& filter) :
5532 Parent(crush, osdmap, pgmap, tree, filter) {}
5533
5534 void dump(Formatter *f) {
5535 f->open_array_section("nodes");
5536 Parent::dump(f);
5537 f->close_section();
5538
5539 f->open_array_section("stray");
5540 dump_stray(f);
5541 f->close_section();
5542 }
5543
5544 protected:
5545 using OSDUtilizationDumper<Formatter>::dump_item;
5546 void dump_item(const CrushTreeDumper::Item &qi,
5547 float &reweight,
5548 int64_t kb,
5549 int64_t kb_used,
5550 int64_t kb_used_data,
5551 int64_t kb_used_omap,
5552 int64_t kb_used_meta,
5553 int64_t kb_avail,
5554 double& util,
5555 double& var,
5556 const size_t num_pgs,
5557 Formatter *f) override {
5558 f->open_object_section("item");
5559 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
5560 f->dump_float("reweight", reweight);
5561 f->dump_int("kb", kb);
5562 f->dump_int("kb_used", kb_used);
5563 f->dump_int("kb_used_data", kb_used_data);
5564 f->dump_int("kb_used_omap", kb_used_omap);
5565 f->dump_int("kb_used_meta", kb_used_meta);
5566 f->dump_int("kb_avail", kb_avail);
5567 f->dump_float("utilization", util);
5568 f->dump_float("var", var);
5569 f->dump_unsigned("pgs", num_pgs);
5570 if (!qi.is_bucket()) {
5571 if (osdmap->is_up(qi.id)) {
5572 f->dump_string("status", "up");
5573 } else if (osdmap->is_destroyed(qi.id)) {
5574 f->dump_string("status", "destroyed");
5575 } else {
5576 f->dump_string("status", "down");
5577 }
5578 }
5579 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5580 f->close_section();
5581 }
5582
5583 public:
5584 void summary(Formatter *f) {
5585 f->open_object_section("summary");
5586 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5587 auto& s = sum.statfs;
5588
5589 f->dump_int("total_kb", s.kb());
5590 f->dump_int("total_kb_used", s.kb_used_raw());
5591 f->dump_int("total_kb_used_data", s.kb_used_data());
5592 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5593 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5594 f->dump_int("total_kb_avail", s.kb_avail());
5595 f->dump_float("average_utilization", average_util);
5596 f->dump_float("min_var", min_var);
5597 f->dump_float("max_var", max_var);
5598 f->dump_float("dev", dev());
5599 f->close_section();
5600 }
5601 };
5602
5603 void print_osd_utilization(const OSDMap& osdmap,
5604 const PGMap& pgmap,
5605 ostream& out,
5606 Formatter *f,
5607 bool tree,
5608 const string& filter)
5609 {
5610 const CrushWrapper *crush = osdmap.crush.get();
5611 if (f) {
5612 f->open_object_section("df");
5613 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
5614 d.dump(f);
5615 d.summary(f);
5616 f->close_section();
5617 f->flush(out);
5618 } else {
5619 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
5620 TextTable tbl;
5621 d.dump(&tbl);
5622 out << tbl << d.summary() << "\n";
5623 }
5624 }
5625
5626 void OSDMap::check_health(CephContext *cct,
5627 health_check_map_t *checks) const
5628 {
5629 int num_osds = get_num_osds();
5630
5631 // OSD_DOWN
5632 // OSD_$subtree_DOWN
5633 // OSD_ORPHAN
5634 if (num_osds >= 0) {
5635 int num_in_osds = 0;
5636 int num_down_in_osds = 0;
5637 set<int> osds;
5638 set<int> down_in_osds;
5639 set<int> up_in_osds;
5640 set<int> subtree_up;
5641 unordered_map<int, set<int> > subtree_type_down;
5642 unordered_map<int, int> num_osds_subtree;
5643 int max_type = crush->get_max_type_id();
5644
5645 for (int i = 0; i < get_max_osd(); i++) {
5646 if (!exists(i)) {
5647 if (crush->item_exists(i)) {
5648 osds.insert(i);
5649 }
5650 continue;
5651 }
5652 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
5653 continue;
5654 ++num_in_osds;
5655 if (down_in_osds.count(i) || up_in_osds.count(i))
5656 continue;
5657 if (!is_up(i)) {
5658 down_in_osds.insert(i);
5659 int parent_id = 0;
5660 int current = i;
5661 for (int type = 0; type <= max_type; type++) {
5662 if (!crush->get_type_name(type))
5663 continue;
5664 int r = crush->get_immediate_parent_id(current, &parent_id);
5665 if (r == -ENOENT)
5666 break;
5667 // break early if this parent is already marked as up
5668 if (subtree_up.count(parent_id))
5669 break;
5670 type = crush->get_bucket_type(parent_id);
5671 if (!subtree_type_is_down(
5672 cct, parent_id, type,
5673 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
5674 break;
5675 current = parent_id;
5676 }
5677 }
5678 }
5679
5680 // calculate the number of down osds in each down subtree and
5681 // store it in num_osds_subtree
5682 for (int type = 1; type <= max_type; type++) {
5683 if (!crush->get_type_name(type))
5684 continue;
5685 for (auto j = subtree_type_down[type].begin();
5686 j != subtree_type_down[type].end();
5687 ++j) {
5688 list<int> children;
5689 int num = 0;
5690 int num_children = crush->get_children(*j, &children);
5691 if (num_children == 0)
5692 continue;
5693 for (auto l = children.begin(); l != children.end(); ++l) {
5694 if (*l >= 0) {
5695 ++num;
5696 } else if (num_osds_subtree[*l] > 0) {
5697 num = num + num_osds_subtree[*l];
5698 }
5699 }
5700 num_osds_subtree[*j] = num;
5701 }
5702 }
5703 num_down_in_osds = down_in_osds.size();
5704 ceph_assert(num_down_in_osds <= num_in_osds);
5705 if (num_down_in_osds > 0) {
5706 // summary of down subtree types and osds
5707 for (int type = max_type; type > 0; type--) {
5708 if (!crush->get_type_name(type))
5709 continue;
5710 if (subtree_type_down[type].size() > 0) {
5711 ostringstream ss;
5712 ss << subtree_type_down[type].size() << " "
5713 << crush->get_type_name(type);
5714 if (subtree_type_down[type].size() > 1) {
5715 ss << "s";
5716 }
5717 int sum_down_osds = 0;
5718 for (auto j = subtree_type_down[type].begin();
5719 j != subtree_type_down[type].end();
5720 ++j) {
5721 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
5722 }
5723 ss << " (" << sum_down_osds << " osds) down";
5724 string err = string("OSD_") +
5725 string(crush->get_type_name(type)) + "_DOWN";
5726 boost::to_upper(err);
5727 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
5728 subtree_type_down[type].size());
5729 for (auto j = subtree_type_down[type].rbegin();
5730 j != subtree_type_down[type].rend();
5731 ++j) {
5732 ostringstream ss;
5733 ss << crush->get_type_name(type);
5734 ss << " ";
5735 ss << crush->get_item_name(*j);
5736 // at the top level, do not print location
5737 if (type != max_type) {
5738 ss << " (";
5739 ss << crush->get_full_location_ordered_string(*j);
5740 ss << ")";
5741 }
5742 int num = num_osds_subtree[*j];
5743 ss << " (" << num << " osds)";
5744 ss << " is down";
5745 d.detail.push_back(ss.str());
5746 }
5747 }
5748 }
5749 ostringstream ss;
5750 ss << down_in_osds.size() << " osds down";
5751 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
5752 down_in_osds.size());
5753 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
5754 ostringstream ss;
5755 ss << "osd." << *it << " (";
5756 ss << crush->get_full_location_ordered_string(*it);
5757 ss << ") is down";
5758 d.detail.push_back(ss.str());
5759 }
5760 }
5761
5762 if (!osds.empty()) {
5763 ostringstream ss;
5764 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
5765 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
5766 osds.size());
5767 for (auto osd : osds) {
5768 ostringstream ss;
5769 ss << "osd." << osd << " exists in crush map but not in osdmap";
5770 d.detail.push_back(ss.str());
5771 }
5772 }
5773 }
5774
5775 std::list<std::string> scrub_messages;
5776 bool noscrub = false, nodeepscrub = false;
5777 for (const auto &p : pools) {
5778 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
5779 ostringstream ss;
5780 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
5781 scrub_messages.push_back(ss.str());
5782 noscrub = true;
5783 }
5784 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
5785 ostringstream ss;
5786 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
5787 scrub_messages.push_back(ss.str());
5788 nodeepscrub = true;
5789 }
5790 }
5791 if (noscrub || nodeepscrub) {
5792 string out = "";
5793 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
5794 out += nodeepscrub ? "nodeep-scrub" : "";
5795 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
5796 "Some pool(s) have the " + out + " flag(s) set", 0);
5797 d.detail.splice(d.detail.end(), scrub_messages);
5798 }
5799
5800 // OSD_OUT_OF_ORDER_FULL
5801 {
5802 // An osd could configure failsafe ratio, to something different
5803 // but for now assume it is the same here.
5804 float fsr = cct->_conf->osd_failsafe_full_ratio;
5805 if (fsr > 1.0) fsr /= 100;
5806 float fr = get_full_ratio();
5807 float br = get_backfillfull_ratio();
5808 float nr = get_nearfull_ratio();
5809
5810 list<string> detail;
5811 // These checks correspond to how OSDService::check_full_status() in an OSD
5812 // handles the improper setting of these values.
5813 if (br < nr) {
5814 ostringstream ss;
5815 ss << "backfillfull_ratio (" << br
5816 << ") < nearfull_ratio (" << nr << "), increased";
5817 detail.push_back(ss.str());
5818 br = nr;
5819 }
5820 if (fr < br) {
5821 ostringstream ss;
5822 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
5823 << "), increased";
5824 detail.push_back(ss.str());
5825 fr = br;
5826 }
5827 if (fsr < fr) {
5828 ostringstream ss;
5829 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
5830 << "), increased";
5831 detail.push_back(ss.str());
5832 }
5833 if (!detail.empty()) {
5834 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
5835 "full ratio(s) out of order", 0);
5836 d.detail.swap(detail);
5837 }
5838 }
5839
5840 // OSD_FULL
5841 // OSD_NEARFULL
5842 // OSD_BACKFILLFULL
5843 // OSD_FAILSAFE_FULL
5844 {
5845 set<int> full, backfillfull, nearfull;
5846 get_full_osd_counts(&full, &backfillfull, &nearfull);
5847 if (full.size()) {
5848 ostringstream ss;
5849 ss << full.size() << " full osd(s)";
5850 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
5851 for (auto& i: full) {
5852 ostringstream ss;
5853 ss << "osd." << i << " is full";
5854 d.detail.push_back(ss.str());
5855 }
5856 }
5857 if (backfillfull.size()) {
5858 ostringstream ss;
5859 ss << backfillfull.size() << " backfillfull osd(s)";
5860 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
5861 backfillfull.size());
5862 for (auto& i: backfillfull) {
5863 ostringstream ss;
5864 ss << "osd." << i << " is backfill full";
5865 d.detail.push_back(ss.str());
5866 }
5867 }
5868 if (nearfull.size()) {
5869 ostringstream ss;
5870 ss << nearfull.size() << " nearfull osd(s)";
5871 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
5872 for (auto& i: nearfull) {
5873 ostringstream ss;
5874 ss << "osd." << i << " is near full";
5875 d.detail.push_back(ss.str());
5876 }
5877 }
5878 }
5879
5880 // OSDMAP_FLAGS
5881 {
5882 // warn about flags
5883 uint64_t warn_flags =
5884 CEPH_OSDMAP_PAUSERD |
5885 CEPH_OSDMAP_PAUSEWR |
5886 CEPH_OSDMAP_PAUSEREC |
5887 CEPH_OSDMAP_NOUP |
5888 CEPH_OSDMAP_NODOWN |
5889 CEPH_OSDMAP_NOIN |
5890 CEPH_OSDMAP_NOOUT |
5891 CEPH_OSDMAP_NOBACKFILL |
5892 CEPH_OSDMAP_NORECOVER |
5893 CEPH_OSDMAP_NOSCRUB |
5894 CEPH_OSDMAP_NODEEP_SCRUB |
5895 CEPH_OSDMAP_NOTIERAGENT |
5896 CEPH_OSDMAP_NOSNAPTRIM |
5897 CEPH_OSDMAP_NOREBALANCE;
5898 if (test_flag(warn_flags)) {
5899 ostringstream ss;
5900 string s = get_flag_string(get_flags() & warn_flags);
5901 ss << s << " flag(s) set";
5902 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
5903 s.size() /* kludgey but sufficient */);
5904 }
5905 }
5906
5907 // OSD_FLAGS
5908 {
5909 list<string> detail;
5910 const unsigned flags =
5911 CEPH_OSD_NOUP |
5912 CEPH_OSD_NOIN |
5913 CEPH_OSD_NODOWN |
5914 CEPH_OSD_NOOUT;
5915 for (int i = 0; i < max_osd; ++i) {
5916 if (osd_state[i] & flags) {
5917 ostringstream ss;
5918 set<string> states;
5919 OSDMap::calc_state_set(osd_state[i] & flags, states);
5920 ss << "osd." << i << " has flags " << states;
5921 detail.push_back(ss.str());
5922 }
5923 }
5924 for (auto& i : crush_node_flags) {
5925 if (i.second && crush->item_exists(i.first)) {
5926 ostringstream ss;
5927 set<string> states;
5928 OSDMap::calc_state_set(i.second, states);
5929 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
5930 const char *tn = crush->get_type_name(t);
5931 ss << (tn ? tn : "node") << " "
5932 << crush->get_item_name(i.first) << " has flags " << states;
5933 detail.push_back(ss.str());
5934 }
5935 }
5936 for (auto& i : device_class_flags) {
5937 const char* class_name = crush->get_class_name(i.first);
5938 if (i.second && class_name) {
5939 ostringstream ss;
5940 set<string> states;
5941 OSDMap::calc_state_set(i.second, states);
5942 ss << "device class '" << class_name << "' has flags " << states;
5943 detail.push_back(ss.str());
5944 }
5945 }
5946 if (!detail.empty()) {
5947 ostringstream ss;
5948 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
5949 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
5950 d.detail.swap(detail);
5951 }
5952 }
5953
5954 // OLD_CRUSH_TUNABLES
5955 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
5956 string min = crush->get_min_required_version();
5957 if (min < cct->_conf->mon_crush_min_required_version) {
5958 ostringstream ss;
5959 ss << "crush map has legacy tunables (require " << min
5960 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
5961 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
5962 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
5963 }
5964 }
5965
5966 // OLD_CRUSH_STRAW_CALC_VERSION
5967 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
5968 if (crush->get_straw_calc_version() == 0) {
5969 ostringstream ss;
5970 ss << "crush map has straw_calc_version=0";
5971 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
5972 d.detail.push_back(
5973 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
5974 }
5975 }
5976
5977 // CACHE_POOL_NO_HIT_SET
5978 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
5979 list<string> detail;
5980 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
5981 const pg_pool_t& info = p->second;
5982 if (info.cache_mode_requires_hit_set() &&
5983 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
5984 ostringstream ss;
5985 ss << "pool '" << get_pool_name(p->first)
5986 << "' with cache_mode " << info.get_cache_mode_name()
5987 << " needs hit_set_type to be set but it is not";
5988 detail.push_back(ss.str());
5989 }
5990 }
5991 if (!detail.empty()) {
5992 ostringstream ss;
5993 ss << detail.size() << " cache pools are missing hit_sets";
5994 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
5995 detail.size());
5996 d.detail.swap(detail);
5997 }
5998 }
5999
6000 // OSD_NO_SORTBITWISE
6001 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6002 ostringstream ss;
6003 ss << "'sortbitwise' flag is not set";
6004 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
6005 }
6006
6007 // OSD_UPGRADE_FINISHED
6008 // none of these (yet) since we don't run until luminous upgrade is done.
6009
6010 // POOL_NEARFULL/BACKFILLFULL/FULL
6011 {
6012 list<string> full_detail, backfillfull_detail, nearfull_detail;
6013 for (auto it : get_pools()) {
6014 const pg_pool_t &pool = it.second;
6015 const string& pool_name = get_pool_name(it.first);
6016 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
6017 stringstream ss;
6018 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
6019 // may run out of space too,
6020 // but we want EQUOTA taking precedence
6021 ss << "pool '" << pool_name << "' is full (running out of quota)";
6022 } else {
6023 ss << "pool '" << pool_name << "' is full (no space)";
6024 }
6025 full_detail.push_back(ss.str());
6026 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
6027 stringstream ss;
6028 ss << "pool '" << pool_name << "' is backfillfull";
6029 backfillfull_detail.push_back(ss.str());
6030 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
6031 stringstream ss;
6032 ss << "pool '" << pool_name << "' is nearfull";
6033 nearfull_detail.push_back(ss.str());
6034 }
6035 }
6036 if (!full_detail.empty()) {
6037 ostringstream ss;
6038 ss << full_detail.size() << " pool(s) full";
6039 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
6040 d.detail.swap(full_detail);
6041 }
6042 if (!backfillfull_detail.empty()) {
6043 ostringstream ss;
6044 ss << backfillfull_detail.size() << " pool(s) backfillfull";
6045 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
6046 backfillfull_detail.size());
6047 d.detail.swap(backfillfull_detail);
6048 }
6049 if (!nearfull_detail.empty()) {
6050 ostringstream ss;
6051 ss << nearfull_detail.size() << " pool(s) nearfull";
6052 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
6053 nearfull_detail.size());
6054 d.detail.swap(nearfull_detail);
6055 }
6056 }
6057
6058 // POOL_PG_NUM_NOT_POWER_OF_TWO
6059 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
6060 list<string> detail;
6061 for (auto it : get_pools()) {
6062 if (!isp2(it.second.get_pg_num_target())) {
6063 ostringstream ss;
6064 ss << "pool '" << get_pool_name(it.first)
6065 << "' pg_num " << it.second.get_pg_num_target()
6066 << " is not a power of two";
6067 detail.push_back(ss.str());
6068 }
6069 }
6070 if (!detail.empty()) {
6071 ostringstream ss;
6072 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
6073 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
6074 ss.str(), detail.size());
6075 d.detail.swap(detail);
6076 }
6077 }
6078
6079 // POOL_NO_REDUNDANCY
6080 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
6081 {
6082 list<string> detail;
6083 for (auto it : get_pools()) {
6084 if (it.second.get_size() == 1) {
6085 ostringstream ss;
6086 ss << "pool '" << get_pool_name(it.first)
6087 << "' has no replicas configured";
6088 detail.push_back(ss.str());
6089 }
6090 }
6091 if (!detail.empty()) {
6092 ostringstream ss;
6093 ss << detail.size() << " pool(s) have no replicas configured";
6094 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
6095 ss.str(), detail.size());
6096 d.detail.swap(detail);
6097 }
6098 }
6099
6100 // DEGRADED STRETCH MODE
6101 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
6102 if (recovering_stretch_mode) {
6103 stringstream ss;
6104 ss << "We are recovering stretch mode buckets, only requiring "
6105 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6106 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
6107 ss.str(), 0);
6108 } else if (degraded_stretch_mode) {
6109 stringstream ss;
6110 ss << "We are missing stretch mode buckets, only requiring "
6111 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6112 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
6113 ss.str(), 0);
6114 }
6115 }
6116 }
6117
6118 int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
6119 ostream *ss) const
6120 {
6121 out->clear();
6122 for (auto i = ls.begin(); i != ls.end(); ++i) {
6123 if (i == ls.begin() &&
6124 (*i == "any" || *i == "all" || *i == "*")) {
6125 get_all_osds(*out);
6126 break;
6127 }
6128 long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
6129 if (osd < 0) {
6130 *ss << "invalid osd id '" << *i << "'";
6131 return -EINVAL;
6132 }
6133 out->insert(osd);
6134 }
6135 return 0;
6136 }
6137
6138 void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
6139 string &subtree,
6140 int limit, // how many
6141 set<int> skip,
6142 set<int> *want) const {
6143 if (limit <= 0)
6144 return;
6145 int subtree_type = crush->get_type_id(subtree);
6146 if (subtree_type < 1)
6147 return;
6148 vector<int> subtrees;
6149 crush->get_subtree_of_type(subtree_type, &subtrees);
6150 std::random_device rd;
6151 std::default_random_engine rng{rd()};
6152 std::shuffle(subtrees.begin(), subtrees.end(), rng);
6153 for (auto s : subtrees) {
6154 if (limit <= 0)
6155 break;
6156 if (crush->subtree_contains(s, n))
6157 continue;
6158 vector<int> osds;
6159 crush->get_children_of_type(s, 0, &osds);
6160 if (osds.empty())
6161 continue;
6162 vector<int> up_osds;
6163 for (auto o : osds) {
6164 if (is_up(o) && !skip.count(o))
6165 up_osds.push_back(o);
6166 }
6167 if (up_osds.empty())
6168 continue;
6169 auto it = up_osds.begin();
6170 std::advance(it, (n % up_osds.size()));
6171 want->insert(*it);
6172 --limit;
6173 }
6174 }
6175
6176 float OSDMap::pool_raw_used_rate(int64_t poolid) const
6177 {
6178 const pg_pool_t *pool = get_pg_pool(poolid);
6179 assert(pool != nullptr);
6180
6181 switch (pool->get_type()) {
6182 case pg_pool_t::TYPE_REPLICATED:
6183 return pool->get_size();
6184 case pg_pool_t::TYPE_ERASURE:
6185 {
6186 auto& ecp =
6187 get_erasure_code_profile(pool->erasure_code_profile);
6188 auto pm = ecp.find("m");
6189 auto pk = ecp.find("k");
6190 if (pm != ecp.end() && pk != ecp.end()) {
6191 int k = atoi(pk->second.c_str());
6192 int m = atoi(pm->second.c_str());
6193 int mk = m + k;
6194 ceph_assert(mk != 0);
6195 ceph_assert(k != 0);
6196 return (float)mk / k;
6197 } else {
6198 return 0.0;
6199 }
6200 }
6201 break;
6202 default:
6203 ceph_abort_msg("unrecognized pool type");
6204 }
6205 }
6206
6207 unsigned OSDMap::get_osd_crush_node_flags(int osd) const
6208 {
6209 unsigned flags = 0;
6210 if (!crush_node_flags.empty()) {
6211 // the map will contain type -> name
6212 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
6213 for (auto& i : ploc) {
6214 int id = crush->get_item_id(i.second);
6215 auto p = crush_node_flags.find(id);
6216 if (p != crush_node_flags.end()) {
6217 flags |= p->second;
6218 }
6219 }
6220 }
6221 return flags;
6222 }
6223
6224 unsigned OSDMap::get_crush_node_flags(int id) const
6225 {
6226 unsigned flags = 0;
6227 auto it = crush_node_flags.find(id);
6228 if (it != crush_node_flags.end())
6229 flags = it->second;
6230 return flags;
6231 }
6232
6233 unsigned OSDMap::get_device_class_flags(int id) const
6234 {
6235 unsigned flags = 0;
6236 auto it = device_class_flags.find(id);
6237 if (it != device_class_flags.end())
6238 flags = it->second;
6239 return flags;
6240 }