1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <algorithm>
19 #include <bit>
20 #include <optional>
21 #include <random>
22 #include <fmt/format.h>
23
24 #include <boost/algorithm/string.hpp>
25
26 #include "OSDMap.h"
27 #include "common/config.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/TextTable.h"
31 #include "include/ceph_features.h"
32 #include "include/common_fwd.h"
33 #include "include/str_map.h"
34
35 #include "common/code_environment.h"
36 #include "mon/health_check.h"
37
38 #include "crush/CrushTreeDumper.h"
39 #include "common/Clock.h"
40 #include "mon/PGMap.h"
41
42 using std::list;
43 using std::make_pair;
44 using std::map;
45 using std::multimap;
46 using std::ostream;
47 using std::ostringstream;
48 using std::pair;
49 using std::set;
50 using std::string;
51 using std::stringstream;
52 using std::unordered_map;
53 using std::vector;
54
55 using ceph::decode;
56 using ceph::encode;
57 using ceph::Formatter;
58
59 #define dout_subsys ceph_subsys_osd
60
61 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
62 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
63
64
65 // ----------------------------------
66 // osd_info_t
67
68 void osd_info_t::dump(Formatter *f) const
69 {
70 f->dump_int("last_clean_begin", last_clean_begin);
71 f->dump_int("last_clean_end", last_clean_end);
72 f->dump_int("up_from", up_from);
73 f->dump_int("up_thru", up_thru);
74 f->dump_int("down_at", down_at);
75 f->dump_int("lost_at", lost_at);
76 }
77
78 void osd_info_t::encode(ceph::buffer::list& bl) const
79 {
80 using ceph::encode;
81 __u8 struct_v = 1;
82 encode(struct_v, bl);
83 encode(last_clean_begin, bl);
84 encode(last_clean_end, bl);
85 encode(up_from, bl);
86 encode(up_thru, bl);
87 encode(down_at, bl);
88 encode(lost_at, bl);
89 }
90
91 void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
92 {
93 using ceph::decode;
94 __u8 struct_v;
95 decode(struct_v, bl);
96 decode(last_clean_begin, bl);
97 decode(last_clean_end, bl);
98 decode(up_from, bl);
99 decode(up_thru, bl);
100 decode(down_at, bl);
101 decode(lost_at, bl);
102 }
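// Illustrative sketch (not compiled): round-tripping an osd_info_t through a
// bufferlist with the member encode()/decode() defined above. The values
// mirror generate_test_instances() below; everything else is ordinary Ceph
// buffer handling.
#if 0
  osd_info_t in;
  in.up_from = 30;
  in.up_thru = 40;
  ceph::buffer::list bl;
  in.encode(bl);                 // writes struct_v=1 followed by the six fields
  osd_info_t out;
  auto p = bl.cbegin();
  out.decode(p);                 // reads them back in the same order
  ceph_assert(out.up_from == 30 && out.up_thru == 40);
#endif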
103
104 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
105 {
106 o.push_back(new osd_info_t);
107 o.push_back(new osd_info_t);
108 o.back()->last_clean_begin = 1;
109 o.back()->last_clean_end = 2;
110 o.back()->up_from = 30;
111 o.back()->up_thru = 40;
112 o.back()->down_at = 5;
113 o.back()->lost_at = 6;
114 }
115
116 ostream& operator<<(ostream& out, const osd_info_t& info)
117 {
118 out << "up_from " << info.up_from
119 << " up_thru " << info.up_thru
120 << " down_at " << info.down_at
121 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
122 if (info.lost_at)
123 out << " lost_at " << info.lost_at;
124 return out;
125 }
126
127 // ----------------------------------
128 // osd_xinfo_t
129
130 void osd_xinfo_t::dump(Formatter *f) const
131 {
132 f->dump_stream("down_stamp") << down_stamp;
133 f->dump_float("laggy_probability", laggy_probability);
134 f->dump_int("laggy_interval", laggy_interval);
135 f->dump_int("features", features);
136 f->dump_unsigned("old_weight", old_weight);
137 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
138 f->dump_int("dead_epoch", dead_epoch);
139 }
140
141 void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
142 {
143 uint8_t v = 4;
144 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
145 v = 3;
146 }
147 ENCODE_START(v, 1, bl);
148 encode(down_stamp, bl);
149 __u32 lp = laggy_probability * float(0xfffffffful);
150 encode(lp, bl);
151 encode(laggy_interval, bl);
152 encode(features, bl);
153 encode(old_weight, bl);
154 if (v >= 4) {
155 encode(last_purged_snaps_scrub, bl);
156 encode(dead_epoch, bl);
157 }
158 ENCODE_FINISH(bl);
159 }
160
161 void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
162 {
163 DECODE_START(4, bl);
164 decode(down_stamp, bl);
165 __u32 lp;
166 decode(lp, bl);
167 laggy_probability = (float)lp / (float)0xffffffff;
168 decode(laggy_interval, bl);
169 if (struct_v >= 2)
170 decode(features, bl);
171 else
172 features = 0;
173 if (struct_v >= 3)
174 decode(old_weight, bl);
175 else
176 old_weight = 0;
177 if (struct_v >= 4) {
178 decode(last_purged_snaps_scrub, bl);
179 decode(dead_epoch, bl);
180 } else {
181 dead_epoch = 0;
182 }
183 DECODE_FINISH(bl);
184 }
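// Illustrative sketch (not compiled): the encode()/decode() pair above is
// feature-gated, so the same struct serializes as v4 for octopus+ peers and
// as v3 for older ones, and decode() backfills the missing fields.
// CEPH_FEATURES_ALL comes from include/ceph_features.h.
#if 0
  osd_xinfo_t xi;
  xi.dead_epoch = 42;
  ceph::buffer::list newer, older;
  xi.encode(newer, CEPH_FEATURES_ALL);  // v=4: includes last_purged_snaps_scrub + dead_epoch
  xi.encode(older, 0);                  // v=3: pre-octopus encoding omits both fields
  osd_xinfo_t back;
  auto p = older.cbegin();
  back.decode(p);                       // struct_v==3 path: dead_epoch defaults to 0
  ceph_assert(back.dead_epoch == 0);
#endif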
185
186 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
187 {
188 o.push_back(new osd_xinfo_t);
189 o.push_back(new osd_xinfo_t);
190 o.back()->down_stamp = utime_t(2, 3);
191 o.back()->laggy_probability = .123;
192 o.back()->laggy_interval = 123456;
193 o.back()->old_weight = 0x7fff;
194 }
195
196 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
197 {
198 return out << "down_stamp " << xi.down_stamp
199 << " laggy_probability " << xi.laggy_probability
200 << " laggy_interval " << xi.laggy_interval
201 << " old_weight " << xi.old_weight
202 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
203 << " dead_epoch " << xi.dead_epoch;
204 }
205
206 // ----------------------------------
207 // OSDMap::Incremental
208
209 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
210 {
211 int n = 0;
212 for (auto &weight : new_weight) {
213 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
214 n++; // marked out
215 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
216 n--; // marked in
217 }
218 return n;
219 }
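// Illustrative sketch (not compiled): the "net" count is marks-out minus
// marks-in relative to the previous map, so opposite changes cancel.
// 'previous_map' is a hypothetical OSDMap in which osd.1 is in and osd.2 is out.
#if 0
  OSDMap::Incremental inc;
  inc.new_weight[1] = CEPH_OSD_OUT;   // osd.1 newly marked out: +1
  inc.new_weight[2] = CEPH_OSD_IN;    // osd.2 marked back in:   -1
  int net = inc.get_net_marked_out(&previous_map);   // net == 0
#endif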
220
221 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
222 {
223 int n = 0;
224     for (auto &state : new_state) {
225 if (state.second & CEPH_OSD_UP) {
226 if (previous->is_up(state.first))
227 n++; // marked down
228 else
229 n--; // marked up
230 }
231 }
232 return n;
233 }
234
235 int OSDMap::Incremental::identify_osd(uuid_d u) const
236 {
237 for (auto &uuid : new_uuid)
238 if (uuid.second == u)
239 return uuid.first;
240 return -1;
241 }
242
243 int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
244 const OSDMap& osdmap)
245 {
246 ceph_assert(epoch == osdmap.get_epoch() + 1);
247
248 for (auto &new_pool : new_pools) {
249 if (!new_pool.second.tiers.empty()) {
250 pg_pool_t& base = new_pool.second;
251
252 auto new_rem_it = new_removed_snaps.find(new_pool.first);
253
254 for (const auto &tier_pool : base.tiers) {
255 const auto &r = new_pools.find(tier_pool);
256 pg_pool_t *tier = 0;
257 if (r == new_pools.end()) {
258 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
259 if (!orig) {
260 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
261 return -EIO;
262 }
263 tier = get_new_pool(tier_pool, orig);
264 } else {
265 tier = &r->second;
266 }
267 if (tier->tier_of != new_pool.first) {
268 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
269 return -EIO;
270 }
271
272 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
273 << tier_pool << dendl;
274 tier->snap_seq = base.snap_seq;
275 tier->snap_epoch = base.snap_epoch;
276 tier->snaps = base.snaps;
277 tier->removed_snaps = base.removed_snaps;
278 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
279 pg_pool_t::FLAG_POOL_SNAPS);
280
281 if (new_rem_it != new_removed_snaps.end()) {
282 new_removed_snaps[tier_pool] = new_rem_it->second;
283 }
284
285 tier->application_metadata = base.application_metadata;
286 }
287 }
288 }
289 return 0;
290 }
291
292 // ----------------------------------
293 // OSDMap
294
295 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
296 {
297 if (id >= 0)
298 return is_down(id);
299
300 if (down_cache &&
301 down_cache->count(id)) {
302 return true;
303 }
304
305 list<int> children;
306 crush->get_children(id, &children);
307 for (const auto &child : children) {
308 if (!subtree_is_down(child, down_cache)) {
309 return false;
310 }
311 }
312 if (down_cache) {
313 down_cache->insert(id);
314 }
315 return true;
316 }
317
318 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
319 {
320 // use a stack-local down_cache if we didn't get one from the
321 // caller. then at least this particular call will avoid duplicated
322 // work.
323 set<int> local_down_cache;
324 if (!down_cache) {
325 down_cache = &local_down_cache;
326 }
327
328 int current = id;
329 while (true) {
330 int type;
331 if (current >= 0) {
332 type = 0;
333 } else {
334 type = crush->get_bucket_type(current);
335 }
336 ceph_assert(type >= 0);
337
338 if (!subtree_is_down(current, down_cache)) {
339 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
340 return false;
341 }
342
343 // is this a big enough subtree to be marked as down?
344 if (type >= subtree_type) {
345 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
346 return true;
347 }
348
349 int r = crush->get_immediate_parent_id(current, &current);
350 if (r < 0) {
351 return false;
352 }
353 }
354 }
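// Illustrative sketch (not compiled): asking whether the host-level subtree
// containing osd.3 is entirely down, sharing one down_cache across repeated
// queries so the CRUSH tree is not re-walked each time. The "host" type
// lookup via CrushWrapper::get_type_id() is an assumption about the caller.
#if 0
  std::set<int> down_cache;
  int host_type = osdmap.crush->get_type_id("host");
  bool host_down =
    osdmap.containing_subtree_is_down(cct, 3, host_type, &down_cache);
#endif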
355
356 bool OSDMap::subtree_type_is_down(
357 CephContext *cct,
358 int id,
359 int subtree_type,
360 set<int> *down_in_osds,
361 set<int> *up_in_osds,
362 set<int> *subtree_up,
363 unordered_map<int, set<int> > *subtree_type_down) const
364 {
365 if (id >= 0) {
366 bool is_down_ret = is_down(id);
367 if (!is_out(id)) {
368 if (is_down_ret) {
369 down_in_osds->insert(id);
370 } else {
371 up_in_osds->insert(id);
372 }
373 }
374 return is_down_ret;
375 }
376
377 if (subtree_type_down &&
378 (*subtree_type_down)[subtree_type].count(id)) {
379 return true;
380 }
381
382 list<int> children;
383 crush->get_children(id, &children);
384 for (const auto &child : children) {
385 if (!subtree_type_is_down(
386 cct, child, crush->get_bucket_type(child),
387 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
388 subtree_up->insert(id);
389 return false;
390 }
391 }
392 if (subtree_type_down) {
393 (*subtree_type_down)[subtree_type].insert(id);
394 }
395 return true;
396 }
397
398 void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
399 {
400 using ceph::encode;
401 __u16 v = 5;
402 encode(v, bl);
403 encode(fsid, bl);
404 encode(epoch, bl);
405 encode(modified, bl);
406 int32_t new_t = new_pool_max;
407 encode(new_t, bl);
408 encode(new_flags, bl);
409 encode(fullmap, bl);
410 encode(crush, bl);
411
412 encode(new_max_osd, bl);
413 // for encode(new_pools, bl);
414 __u32 n = new_pools.size();
415 encode(n, bl);
416 for (const auto &new_pool : new_pools) {
417 n = new_pool.first;
418 encode(n, bl);
419 encode(new_pool.second, bl, 0);
420 }
421 // for encode(new_pool_names, bl);
422 n = new_pool_names.size();
423 encode(n, bl);
424
425 for (const auto &new_pool_name : new_pool_names) {
426 n = new_pool_name.first;
427 encode(n, bl);
428 encode(new_pool_name.second, bl);
429 }
430 // for encode(old_pools, bl);
431 n = old_pools.size();
432 encode(n, bl);
433 for (auto &old_pool : old_pools) {
434 n = old_pool;
435 encode(n, bl);
436 }
437 encode(new_up_client, bl, 0);
438 {
439 // legacy is map<int32_t,uint8_t>
440 map<int32_t, uint8_t> os;
441 for (auto p : new_state) {
442       // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
443 // that an old client could not understand.
444 // skip those!
445 uint8_t s = p.second;
446 if (p.second != 0 && s == 0)
447 continue;
448 os[p.first] = s;
449 }
450 uint32_t n = os.size();
451 encode(n, bl);
452 for (auto p : os) {
453 encode(p.first, bl);
454 encode(p.second, bl);
455 }
456 }
457 encode(new_weight, bl);
458 // for encode(new_pg_temp, bl);
459 n = new_pg_temp.size();
460 encode(n, bl);
461
462 for (const auto &pg_temp : new_pg_temp) {
463 old_pg_t opg = pg_temp.first.get_old_pg();
464 encode(opg, bl);
465 encode(pg_temp.second, bl);
466 }
467 }
468
469 void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
470 {
471 using ceph::encode;
472 if ((features & CEPH_FEATURE_PGID64) == 0) {
473 encode_client_old(bl);
474 return;
475 }
476
477 // base
478 __u16 v = 6;
479 encode(v, bl);
480 encode(fsid, bl);
481 encode(epoch, bl);
482 encode(modified, bl);
483 encode(new_pool_max, bl);
484 encode(new_flags, bl);
485 encode(fullmap, bl);
486 encode(crush, bl);
487
488 encode(new_max_osd, bl);
489 encode(new_pools, bl, features);
490 encode(new_pool_names, bl);
491 encode(old_pools, bl);
492 encode(new_up_client, bl, features);
493 {
494 map<int32_t, uint8_t> os;
495 for (auto p : new_state) {
496       // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
497 // that an old client could not understand.
498 // skip those!
499 uint8_t s = p.second;
500 if (p.second != 0 && s == 0)
501 continue;
502 os[p.first] = s;
503 }
504 uint32_t n = os.size();
505 encode(n, bl);
506 for (auto p : os) {
507 encode(p.first, bl);
508 encode(p.second, bl);
509 }
510 }
511 encode(new_weight, bl);
512 encode(new_pg_temp, bl);
513
514 // extended
515 __u16 ev = 10;
516 encode(ev, bl);
517 encode(new_hb_back_up, bl, features);
518 encode(new_up_thru, bl);
519 encode(new_last_clean_interval, bl);
520 encode(new_lost, bl);
521 encode(new_blocklist, bl, features);
522 encode(old_blocklist, bl, features);
523 encode(new_up_cluster, bl, features);
524 encode(cluster_snapshot, bl);
525 encode(new_uuid, bl);
526 encode(new_xinfo, bl, features);
527 encode(new_hb_front_up, bl, features);
528 }
529
530 template<class T>
531 static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
532 {
533 uint32_t n = m.size();
534 encode(n, bl);
535 for (auto& i : m) {
536 encode(i.first, bl);
537 encode(i.second.legacy_addr(), bl, f);
538 }
539 }
540
541 template<class T>
542 static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
543 {
544 uint32_t n = m.size();
545 encode(n, bl);
546 for (auto& i : m) {
547 if (i) {
548 encode(i->legacy_addr(), bl, f);
549 } else {
550 encode(entity_addr_t(), bl, f);
551 }
552 }
553 }
554
555 /* for a description of osdmap incremental versions, and when they were
556 * introduced, please refer to
557 * doc/dev/osd_internals/osdmap_versions.txt
558 */
559 void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
560 {
561 using ceph::encode;
562 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
563 encode_classic(bl, features);
564 return;
565 }
566
567 // only a select set of callers should *ever* be encoding new
568 // OSDMaps. others should be passing around the canonical encoded
569 // buffers from on high. select out those callers by passing in an
570 // "impossible" feature bit.
571 ceph_assert(features & CEPH_FEATURE_RESERVED);
572 features &= ~CEPH_FEATURE_RESERVED;
573
574 size_t start_offset = bl.length();
575 size_t tail_offset;
576 size_t crc_offset;
577 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
578
579 // meta-encoding: how we include client-used and osd-specific data
580 ENCODE_START(8, 7, bl);
581
582 {
583 uint8_t v = 9;
584 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
585 v = 3;
586 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
587 v = 5;
588 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
589 v = 6;
590 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
591 v = 8;
592 } */
593 ENCODE_START(v, 1, bl); // client-usable data
594 encode(fsid, bl);
595 encode(epoch, bl);
596 encode(modified, bl);
597 encode(new_pool_max, bl);
598 encode(new_flags, bl);
599 encode(fullmap, bl);
600 encode(crush, bl);
601
602 encode(new_max_osd, bl);
603 encode(new_pools, bl, features);
604 encode(new_pool_names, bl);
605 encode(old_pools, bl);
606 if (v >= 7) {
607 encode(new_up_client, bl, features);
608 } else {
609 encode_addrvec_map_as_addr(new_up_client, bl, features);
610 }
611 if (v >= 5) {
612 encode(new_state, bl);
613 } else {
614 map<int32_t, uint8_t> os;
615 for (auto p : new_state) {
616 	// new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
617 // that an old client could not understand.
618 // skip those!
619 uint8_t s = p.second;
620 if (p.second != 0 && s == 0)
621 continue;
622 os[p.first] = s;
623 }
624 uint32_t n = os.size();
625 encode(n, bl);
626 for (auto p : os) {
627 encode(p.first, bl);
628 encode(p.second, bl);
629 }
630 }
631 encode(new_weight, bl);
632 encode(new_pg_temp, bl);
633 encode(new_primary_temp, bl);
634 encode(new_primary_affinity, bl);
635 encode(new_erasure_code_profiles, bl);
636 encode(old_erasure_code_profiles, bl);
637 if (v >= 4) {
638 encode(new_pg_upmap, bl);
639 encode(old_pg_upmap, bl);
640 encode(new_pg_upmap_items, bl);
641 encode(old_pg_upmap_items, bl);
642 }
643 if (v >= 6) {
644 encode(new_removed_snaps, bl);
645 encode(new_purged_snaps, bl);
646 }
647 if (v >= 8) {
648 encode(new_last_up_change, bl);
649 encode(new_last_in_change, bl);
650 }
651 if (v >= 9) {
652 encode(new_pg_upmap_primary, bl);
653 encode(old_pg_upmap_primary, bl);
654 }
655 ENCODE_FINISH(bl); // client-usable data
656 }
657
658 {
659 uint8_t target_v = 9; // if bumping this, be aware of allow_crimson 12
660 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
661 target_v = 2;
662 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
663 target_v = 6;
664 }
665 if (change_stretch_mode) {
666 target_v = std::max((uint8_t)10, target_v);
667 }
668 if (!new_range_blocklist.empty() ||
669 !old_range_blocklist.empty()) {
670 target_v = std::max((uint8_t)11, target_v);
671 }
672 if (mutate_allow_crimson != mutate_allow_crimson_t::NONE) {
673 target_v = std::max((uint8_t)12, target_v);
674 }
675 ENCODE_START(target_v, 1, bl); // extended, osd-only data
676 if (target_v < 7) {
677 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
678 } else {
679 encode(new_hb_back_up, bl, features);
680 }
681 encode(new_up_thru, bl);
682 encode(new_last_clean_interval, bl);
683 encode(new_lost, bl);
684 encode(new_blocklist, bl, features);
685 encode(old_blocklist, bl, features);
686 if (target_v < 7) {
687 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
688 } else {
689 encode(new_up_cluster, bl, features);
690 }
691 encode(cluster_snapshot, bl);
692 encode(new_uuid, bl);
693 encode(new_xinfo, bl, features);
694 if (target_v < 7) {
695 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
696 } else {
697 encode(new_hb_front_up, bl, features);
698 }
699 encode(features, bl); // NOTE: features arg, not the member
700 if (target_v >= 3) {
701 encode(new_nearfull_ratio, bl);
702 encode(new_full_ratio, bl);
703 encode(new_backfillfull_ratio, bl);
704 }
705 // 5 was string-based new_require_min_compat_client
706 if (target_v >= 6) {
707 encode(new_require_min_compat_client, bl);
708 encode(new_require_osd_release, bl);
709 }
710 if (target_v >= 8) {
711 encode(new_crush_node_flags, bl);
712 }
713 if (target_v >= 9) {
714 encode(new_device_class_flags, bl);
715 }
716 if (target_v >= 10) {
717 encode(change_stretch_mode, bl);
718 encode(new_stretch_bucket_count, bl);
719 encode(new_degraded_stretch_mode, bl);
720 encode(new_recovering_stretch_mode, bl);
721 encode(new_stretch_mode_bucket, bl);
722 encode(stretch_mode_enabled, bl);
723 }
724 if (target_v >= 11) {
725 encode(new_range_blocklist, bl, features);
726 encode(old_range_blocklist, bl, features);
727 }
728 if (target_v >= 12) {
729 encode(mutate_allow_crimson, bl);
730 }
731 ENCODE_FINISH(bl); // osd-only data
732 }
733
734 crc_offset = bl.length();
735 crc_filler = bl.append_hole(sizeof(uint32_t));
736 tail_offset = bl.length();
737
738 encode(full_crc, bl);
739
740 ENCODE_FINISH(bl); // meta-encoding wrapper
741
742 // fill in crc
743 ceph::buffer::list front;
744 front.substr_of(bl, start_offset, crc_offset - start_offset);
745 inc_crc = front.crc32c(-1);
746 ceph::buffer::list tail;
747 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
748 inc_crc = tail.crc32c(inc_crc);
749 ceph_le32 crc_le;
750 crc_le = inc_crc;
751 crc_filler->copy_in(4u, (char*)&crc_le);
752 have_crc = true;
753 }
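// Illustrative sketch (not compiled): how a receiver recomputes inc_crc. The
// crc covers everything before the 4-byte hole plus everything after it (the
// encoded full_crc), exactly as decode() verifies further below; the names
// reuse the locals of the function above.
#if 0
  ceph::buffer::list front, tail;
  front.substr_of(bl, start_offset, crc_offset - start_offset);
  uint32_t actual = front.crc32c(-1);
  tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
  actual = tail.crc32c(actual);
  ceph_assert(actual == inc_crc);
#endif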
754
755 void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
756 {
757 using ceph::decode;
758 __u32 n, t;
759 // base
760 __u16 v;
761 decode(v, p);
762 decode(fsid, p);
763 decode(epoch, p);
764 decode(modified, p);
765 if (v == 4 || v == 5) {
766 decode(n, p);
767 new_pool_max = n;
768 } else if (v >= 6)
769 decode(new_pool_max, p);
770 decode(new_flags, p);
771 decode(fullmap, p);
772 decode(crush, p);
773
774 decode(new_max_osd, p);
775 if (v < 6) {
776 new_pools.clear();
777 decode(n, p);
778 while (n--) {
779 decode(t, p);
780 decode(new_pools[t], p);
781 }
782 } else {
783 decode(new_pools, p);
784 }
785 if (v == 5) {
786 new_pool_names.clear();
787 decode(n, p);
788 while (n--) {
789 decode(t, p);
790 decode(new_pool_names[t], p);
791 }
792 } else if (v >= 6) {
793 decode(new_pool_names, p);
794 }
795 if (v < 6) {
796 old_pools.clear();
797 decode(n, p);
798 while (n--) {
799 decode(t, p);
800 old_pools.insert(t);
801 }
802 } else {
803 decode(old_pools, p);
804 }
805 decode(new_up_client, p);
806 {
807 map<int32_t,uint8_t> ns;
808 decode(ns, p);
809 for (auto q : ns) {
810 new_state[q.first] = q.second;
811 }
812 }
813 decode(new_weight, p);
814
815 if (v < 6) {
816 new_pg_temp.clear();
817 decode(n, p);
818 while (n--) {
819 old_pg_t opg;
820 ceph::decode_raw(opg, p);
821 decode(new_pg_temp[pg_t(opg)], p);
822 }
823 } else {
824 decode(new_pg_temp, p);
825 }
826
827 // decode short map, too.
828 if (v == 5 && p.end())
829 return;
830
831 // extended
832 __u16 ev = 0;
833 if (v >= 5)
834 decode(ev, p);
835 decode(new_hb_back_up, p);
836 if (v < 5)
837 decode(new_pool_names, p);
838 decode(new_up_thru, p);
839 decode(new_last_clean_interval, p);
840 decode(new_lost, p);
841 decode(new_blocklist, p);
842 decode(old_blocklist, p);
843 if (ev >= 6)
844 decode(new_up_cluster, p);
845 if (ev >= 7)
846 decode(cluster_snapshot, p);
847 if (ev >= 8)
848 decode(new_uuid, p);
849 if (ev >= 9)
850 decode(new_xinfo, p);
851 if (ev >= 10)
852 decode(new_hb_front_up, p);
853 }
854
855 /* for a description of osdmap incremental versions, and when they were
856 * introduced, please refer to
857 * doc/dev/osd_internals/osdmap_versions.txt
858 */
859 void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
860 {
861 using ceph::decode;
862   /**
863    * Older encodings of the Incremental had a single struct_v which
864    * covered the whole encoding and predated the modern scheme,
865    * which includes a compat version and a length. So if we see a
866    * struct_v < 7, we must rewind to the beginning and use the
867    * classic decoder.
868    */
869 size_t start_offset = bl.get_off();
870 size_t tail_offset = 0;
871 ceph::buffer::list crc_front, crc_tail;
872
873 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
874 if (struct_v < 7) {
875 bl.seek(start_offset);
876 decode_classic(bl);
877 encode_features = 0;
878 if (struct_v >= 6)
879 encode_features = CEPH_FEATURE_PGID64;
880 else
881 encode_features = 0;
882 return;
883 }
884 {
885 DECODE_START(8, bl); // client-usable data
886 decode(fsid, bl);
887 decode(epoch, bl);
888 decode(modified, bl);
889 decode(new_pool_max, bl);
890 decode(new_flags, bl);
891 decode(fullmap, bl);
892 decode(crush, bl);
893
894 decode(new_max_osd, bl);
895 decode(new_pools, bl);
896 decode(new_pool_names, bl);
897 decode(old_pools, bl);
898 decode(new_up_client, bl);
899 if (struct_v >= 5) {
900 decode(new_state, bl);
901 } else {
902 map<int32_t,uint8_t> ns;
903 decode(ns, bl);
904 for (auto q : ns) {
905 new_state[q.first] = q.second;
906 }
907 }
908 decode(new_weight, bl);
909 decode(new_pg_temp, bl);
910 decode(new_primary_temp, bl);
911 if (struct_v >= 2)
912 decode(new_primary_affinity, bl);
913 else
914 new_primary_affinity.clear();
915 if (struct_v >= 3) {
916 decode(new_erasure_code_profiles, bl);
917 decode(old_erasure_code_profiles, bl);
918 } else {
919 new_erasure_code_profiles.clear();
920 old_erasure_code_profiles.clear();
921 }
922 if (struct_v >= 4) {
923 decode(new_pg_upmap, bl);
924 decode(old_pg_upmap, bl);
925 decode(new_pg_upmap_items, bl);
926 decode(old_pg_upmap_items, bl);
927 }
928 if (struct_v >= 6) {
929 decode(new_removed_snaps, bl);
930 decode(new_purged_snaps, bl);
931 }
932 if (struct_v >= 8) {
933 decode(new_last_up_change, bl);
934 decode(new_last_in_change, bl);
935 }
936 DECODE_FINISH(bl); // client-usable data
937 }
938
939 {
940 DECODE_START(10, bl); // extended, osd-only data
941 decode(new_hb_back_up, bl);
942 decode(new_up_thru, bl);
943 decode(new_last_clean_interval, bl);
944 decode(new_lost, bl);
945 decode(new_blocklist, bl);
946 decode(old_blocklist, bl);
947 decode(new_up_cluster, bl);
948 decode(cluster_snapshot, bl);
949 decode(new_uuid, bl);
950 decode(new_xinfo, bl);
951 decode(new_hb_front_up, bl);
952 if (struct_v >= 2)
953 decode(encode_features, bl);
954 else
955 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
956 if (struct_v >= 3) {
957 decode(new_nearfull_ratio, bl);
958 decode(new_full_ratio, bl);
959 } else {
960 new_nearfull_ratio = -1;
961 new_full_ratio = -1;
962 }
963 if (struct_v >= 4) {
964 decode(new_backfillfull_ratio, bl);
965 } else {
966 new_backfillfull_ratio = -1;
967 }
968 if (struct_v == 5) {
969 string r;
970 decode(r, bl);
971 if (r.length()) {
972 new_require_min_compat_client = ceph_release_from_name(r);
973 }
974 }
975 if (struct_v >= 6) {
976 decode(new_require_min_compat_client, bl);
977 decode(new_require_osd_release, bl);
978 } else {
979 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
980 // only for compat with post-kraken pre-luminous test clusters
981 new_require_osd_release = ceph_release_t::luminous;
982 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
983 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
984 new_require_osd_release = ceph_release_t::kraken;
985 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
986 new_require_osd_release = ceph_release_t::jewel;
987 } else {
988 new_require_osd_release = ceph_release_t::unknown;
989 }
990 }
991 if (struct_v >= 8) {
992 decode(new_crush_node_flags, bl);
993 }
994 if (struct_v >= 9) {
995 decode(new_device_class_flags, bl);
996 }
997 if (struct_v >= 10) {
998 decode(change_stretch_mode, bl);
999 decode(new_stretch_bucket_count, bl);
1000 decode(new_degraded_stretch_mode, bl);
1001 decode(new_recovering_stretch_mode, bl);
1002 decode(new_stretch_mode_bucket, bl);
1003 decode(stretch_mode_enabled, bl);
1004 }
1005 if (struct_v >= 11) {
1006 decode(new_range_blocklist, bl);
1007 decode(old_range_blocklist, bl);
1008 }
1009 if (struct_v >= 12) {
1010 decode(mutate_allow_crimson, bl);
1011 }
1012 DECODE_FINISH(bl); // osd-only data
1013 }
1014
1015 if (struct_v >= 8) {
1016 have_crc = true;
1017 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
1018 decode(inc_crc, bl);
1019 tail_offset = bl.get_off();
1020 decode(full_crc, bl);
1021 } else {
1022 have_crc = false;
1023 full_crc = 0;
1024 inc_crc = 0;
1025 }
1026
1027 DECODE_FINISH(bl); // wrapper
1028
1029 if (have_crc) {
1030 // verify crc
1031 uint32_t actual = crc_front.crc32c(-1);
1032 if (tail_offset < bl.get_off()) {
1033 ceph::buffer::list tail;
1034 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
1035 actual = tail.crc32c(actual);
1036 }
1037 if (inc_crc != actual) {
1038 ostringstream ss;
1039 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
1040 string s = ss.str();
1041 throw ceph::buffer::malformed_input(s.c_str());
1042 }
1043 }
1044 }
1045
1046 void OSDMap::Incremental::dump(Formatter *f) const
1047 {
1048 f->dump_int("epoch", epoch);
1049 f->dump_stream("fsid") << fsid;
1050 f->dump_stream("modified") << modified;
1051 f->dump_stream("new_last_up_change") << new_last_up_change;
1052 f->dump_stream("new_last_in_change") << new_last_in_change;
1053 f->dump_int("new_pool_max", new_pool_max);
1054 f->dump_int("new_flags", new_flags);
1055 f->dump_float("new_full_ratio", new_full_ratio);
1056 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1057 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
1058 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1059 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
1060 f->dump_unsigned("mutate_allow_crimson", static_cast<unsigned>(mutate_allow_crimson));
1061
1062 if (fullmap.length()) {
1063 f->open_object_section("full_map");
1064 OSDMap full;
1065 ceph::buffer::list fbl = fullmap; // kludge around constness.
1066 auto p = fbl.cbegin();
1067 full.decode(p);
1068 full.dump(f);
1069 f->close_section();
1070 }
1071 if (crush.length()) {
1072 f->open_object_section("crush");
1073 CrushWrapper c;
1074 ceph::buffer::list tbl = crush; // kludge around constness.
1075 auto p = tbl.cbegin();
1076 c.decode(p);
1077 c.dump(f);
1078 f->close_section();
1079 }
1080
1081 f->dump_int("new_max_osd", new_max_osd);
1082
1083 f->open_array_section("new_pools");
1084
1085 for (const auto &new_pool : new_pools) {
1086 f->open_object_section("pool");
1087 f->dump_int("pool", new_pool.first);
1088 new_pool.second.dump(f);
1089 f->close_section();
1090 }
1091 f->close_section();
1092 f->open_array_section("new_pool_names");
1093
1094 for (const auto &new_pool_name : new_pool_names) {
1095 f->open_object_section("pool_name");
1096 f->dump_int("pool", new_pool_name.first);
1097 f->dump_string("name", new_pool_name.second);
1098 f->close_section();
1099 }
1100 f->close_section();
1101 f->open_array_section("old_pools");
1102
1103 for (const auto &old_pool : old_pools)
1104 f->dump_int("pool", old_pool);
1105 f->close_section();
1106
1107 f->open_array_section("new_up_osds");
1108
1109 for (const auto &upclient : new_up_client) {
1110 f->open_object_section("osd");
1111 f->dump_int("osd", upclient.first);
1112 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1113 f->dump_object("public_addrs", upclient.second);
1114 if (auto p = new_up_cluster.find(upclient.first);
1115 p != new_up_cluster.end()) {
1116 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1117 f->dump_object("cluster_addrs", p->second);
1118 }
1119 if (auto p = new_hb_back_up.find(upclient.first);
1120 p != new_hb_back_up.end()) {
1121 f->dump_object("heartbeat_back_addrs", p->second);
1122 }
1123 if (auto p = new_hb_front_up.find(upclient.first);
1124 p != new_hb_front_up.end()) {
1125 f->dump_object("heartbeat_front_addrs", p->second);
1126 }
1127 f->close_section();
1128 }
1129 f->close_section();
1130
1131 f->open_array_section("new_weight");
1132
1133 for (const auto &weight : new_weight) {
1134 f->open_object_section("osd");
1135 f->dump_int("osd", weight.first);
1136 f->dump_int("weight", weight.second);
1137 f->close_section();
1138 }
1139 f->close_section();
1140
1141 f->open_array_section("osd_state_xor");
1142 for (const auto &ns : new_state) {
1143 f->open_object_section("osd");
1144 f->dump_int("osd", ns.first);
1145 set<string> st;
1146 calc_state_set(new_state.find(ns.first)->second, st);
1147 f->open_array_section("state_xor");
1148 for (auto &state : st)
1149 f->dump_string("state", state);
1150 f->close_section();
1151 f->close_section();
1152 }
1153 f->close_section();
1154
1155 f->open_array_section("new_pg_temp");
1156
1157 for (const auto &pg_temp : new_pg_temp) {
1158 f->open_object_section("pg");
1159 f->dump_stream("pgid") << pg_temp.first;
1160 f->open_array_section("osds");
1161
1162 for (const auto &osd : pg_temp.second)
1163 f->dump_int("osd", osd);
1164 f->close_section();
1165 f->close_section();
1166 }
1167 f->close_section();
1168
1169 f->open_array_section("primary_temp");
1170
1171 for (const auto &primary_temp : new_primary_temp) {
1172 f->dump_stream("pgid") << primary_temp.first;
1173 f->dump_int("osd", primary_temp.second);
1174 }
1175 f->close_section(); // primary_temp
1176
1177 f->open_array_section("new_pg_upmap");
1178 for (auto& i : new_pg_upmap) {
1179 f->open_object_section("mapping");
1180 f->dump_stream("pgid") << i.first;
1181 f->open_array_section("osds");
1182 for (auto osd : i.second) {
1183 f->dump_int("osd", osd);
1184 }
1185 f->close_section();
1186 f->close_section();
1187 }
1188 f->close_section();
1189 f->open_array_section("old_pg_upmap");
1190 for (auto& i : old_pg_upmap) {
1191 f->dump_stream("pgid") << i;
1192 }
1193 f->close_section();
1194
1195 f->open_array_section("new_pg_upmap_items");
1196 for (auto& i : new_pg_upmap_items) {
1197 f->open_object_section("mapping");
1198 f->dump_stream("pgid") << i.first;
1199 f->open_array_section("mappings");
1200 for (auto& p : i.second) {
1201 f->open_object_section("mapping");
1202 f->dump_int("from", p.first);
1203 f->dump_int("to", p.second);
1204 f->close_section();
1205 }
1206 f->close_section();
1207 f->close_section();
1208 }
1209 f->close_section();
1210 f->open_array_section("old_pg_upmap_items");
1211 for (auto& i : old_pg_upmap_items) {
1212 f->dump_stream("pgid") << i;
1213 }
1214 f->close_section();
1215
1216 // dump upmap_primaries
1217 f->open_array_section("new_pg_upmap_primaries");
1218 for (auto& [pg, osd] : new_pg_upmap_primary) {
1219 f->open_object_section("primary_mapping");
1220 f->dump_stream("pgid") << pg;
1221 f->dump_int("primary_osd", osd);
1222 f->close_section();
1223 }
1224 f->close_section(); // new_pg_upmap_primaries
1225
1226 // dump old_pg_upmap_primaries (removed primary mappings)
1227 f->open_array_section("old_pg_upmap_primaries");
1228 for (auto& pg : old_pg_upmap_primary) {
1229 f->dump_stream("pgid") << pg;
1230 }
1231 f->close_section(); // old_pg_upmap_primaries
1232
1233 f->open_array_section("new_up_thru");
1234
1235 for (const auto &up_thru : new_up_thru) {
1236 f->open_object_section("osd");
1237 f->dump_int("osd", up_thru.first);
1238 f->dump_int("up_thru", up_thru.second);
1239 f->close_section();
1240 }
1241 f->close_section();
1242
1243 f->open_array_section("new_lost");
1244
1245 for (const auto &lost : new_lost) {
1246 f->open_object_section("osd");
1247 f->dump_int("osd", lost.first);
1248 f->dump_int("epoch_lost", lost.second);
1249 f->close_section();
1250 }
1251 f->close_section();
1252
1253 f->open_array_section("new_last_clean_interval");
1254
1255 for (const auto &last_clean_interval : new_last_clean_interval) {
1256 f->open_object_section("osd");
1257 f->dump_int("osd", last_clean_interval.first);
1258 f->dump_int("first", last_clean_interval.second.first);
1259 f->dump_int("last", last_clean_interval.second.second);
1260 f->close_section();
1261 }
1262 f->close_section();
1263
1264 f->open_array_section("new_blocklist");
1265 for (const auto &blist : new_blocklist) {
1266 stringstream ss;
1267 ss << blist.first;
1268 f->dump_stream(ss.str().c_str()) << blist.second;
1269 }
1270 f->close_section();
1271 f->open_array_section("old_blocklist");
1272 for (const auto &blist : old_blocklist)
1273 f->dump_stream("addr") << blist;
1274 f->close_section();
1275 f->open_array_section("new_range_blocklist");
1276 for (const auto &blist : new_range_blocklist) {
1277 stringstream ss;
1278 ss << blist.first;
1279 f->dump_stream(ss.str().c_str()) << blist.second;
1280 }
1281 f->close_section();
1282 f->open_array_section("old_range_blocklist");
1283 for (const auto &blist : old_range_blocklist)
1284 f->dump_stream("addr") << blist;
1285 f->close_section();
1286
1287 f->open_array_section("new_xinfo");
1288 for (const auto &xinfo : new_xinfo) {
1289 f->open_object_section("xinfo");
1290 f->dump_int("osd", xinfo.first);
1291 xinfo.second.dump(f);
1292 f->close_section();
1293 }
1294 f->close_section();
1295
1296 if (cluster_snapshot.size())
1297 f->dump_string("cluster_snapshot", cluster_snapshot);
1298
1299 f->open_array_section("new_uuid");
1300 for (const auto &uuid : new_uuid) {
1301 f->open_object_section("osd");
1302 f->dump_int("osd", uuid.first);
1303 f->dump_stream("uuid") << uuid.second;
1304 f->close_section();
1305 }
1306 f->close_section();
1307
1308 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1309 f->open_array_section("old_erasure_code_profiles");
1310 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1311 f->dump_string("old", erasure_code_profile);
1312 }
1313 f->close_section();
1314
1315 f->open_array_section("new_removed_snaps");
1316 for (auto& p : new_removed_snaps) {
1317 f->open_object_section("pool");
1318 f->dump_int("pool", p.first);
1319 f->open_array_section("snaps");
1320 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1321 f->open_object_section("interval");
1322 f->dump_unsigned("begin", q.get_start());
1323 f->dump_unsigned("length", q.get_len());
1324 f->close_section();
1325 }
1326 f->close_section();
1327 f->close_section();
1328 }
1329 f->close_section();
1330 f->open_array_section("new_purged_snaps");
1331 for (auto& p : new_purged_snaps) {
1332 f->open_object_section("pool");
1333 f->dump_int("pool", p.first);
1334 f->open_array_section("snaps");
1335 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1336 f->open_object_section("interval");
1337 f->dump_unsigned("begin", q.get_start());
1338 f->dump_unsigned("length", q.get_len());
1339 f->close_section();
1340 }
1341 f->close_section();
1342 f->close_section();
1343 }
1344 f->open_array_section("new_crush_node_flags");
1345 for (auto& i : new_crush_node_flags) {
1346 f->open_object_section("node");
1347 f->dump_int("id", i.first);
1348 set<string> st;
1349 calc_state_set(i.second, st);
1350 for (auto& j : st) {
1351 f->dump_string("flag", j);
1352 }
1353 f->close_section();
1354 }
1355 f->close_section();
1356 f->open_array_section("new_device_class_flags");
1357 for (auto& i : new_device_class_flags) {
1358 f->open_object_section("device_class");
1359 f->dump_int("id", i.first);
1360 set<string> st;
1361 calc_state_set(i.second, st);
1362 for (auto& j : st) {
1363 f->dump_string("flag", j);
1364 }
1365 f->close_section();
1366 }
1367 f->close_section();
1368 f->open_object_section("stretch_mode");
1369 {
1370 f->dump_bool("change_stretch_mode", change_stretch_mode);
1371 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1372 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1373 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1374 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1375 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1376 }
1377 f->close_section();
1378 f->close_section();
1379 }
1380
1381 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1382 {
1383 o.push_back(new Incremental);
1384 }
1385
1386 // ----------------------------------
1387 // OSDMap
1388
1389 void OSDMap::set_epoch(epoch_t e)
1390 {
1391 epoch = e;
1392 for (auto &pool : pools)
1393 pool.second.last_change = e;
1394 }
1395
1396 OSDMap::range_bits::range_bits() : ipv6(false) {
1397 memset(&bits, 0, sizeof(bits));
1398 }
1399
1400 OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) {
1401 memset(&bits, 0, sizeof(bits));
1402 parse(addr);
1403 }
1404
1405 void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr,
1406 uint64_t *upper, uint64_t *lower)
1407 {
1408 *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 |
1409 ((uint64_t)(ntohl(*(uint32_t*)(&addr[4]))));
1410 *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 |
1411 ((uint64_t)(ntohl(*(uint32_t*)(&addr[12]))));
1412 }
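// Worked example (not compiled): 2001:db8::1 packs into two host-order
// 64-bit halves, upper = 0x20010db800000000 and lower = 0x0000000000000001.
// Whether range_bits is reachable from the caller's scope is an assumption.
#if 0
  unsigned char addr[16] = {0x20,0x01,0x0d,0xb8, 0,0,0,0, 0,0,0,0, 0,0,0,0x01};
  uint64_t upper, lower;
  OSDMap::range_bits::get_ipv6_bytes(addr, &upper, &lower);
  ceph_assert(upper == 0x20010db800000000ull);
  ceph_assert(lower == 0x0000000000000001ull);
#endif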
1413
1414 void OSDMap::range_bits::parse(const entity_addr_t& addr) {
1415 // parse it into meaningful data
1416 if (addr.is_ipv6()) {
1417 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr,
1418 &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits);
1419 int32_t lower_shift = std::min(128-
1420 static_cast<int32_t>(addr.get_nonce()), 64);
1421     int32_t upper_shift = std::max(64- // i.e. (128 - addr.get_nonce()) - 64
1422 static_cast<int32_t>(addr.get_nonce()), 0);
1423
1424 auto get_mask = [](int32_t shift) -> uint64_t {
1425 if (shift >= 0 && shift < 64) {
1426 return UINT64_MAX << shift;
1427 }
1428 return 0;
1429 };
1430
1431 bits.ipv6.lower_mask = get_mask(lower_shift);
1432 bits.ipv6.upper_mask = get_mask(upper_shift);
1433 ipv6 = true;
1434 } else if (addr.is_ipv4()) {
1435 bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr);
1436 if (addr.get_nonce() > 0) {
1437 bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce());
1438 } else {
1439 bits.ipv4.mask = 0;
1440 }
1441 } else {
1442     // not an IPv4 or IPv6 address; leave the zero-initialized bits as-is
1443 }
1444 }
1445
1446 bool OSDMap::range_bits::matches(const entity_addr_t& addr) const {
1447 if (addr.is_ipv4() && !ipv6) {
1448 return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) ==
1449 (bits.ipv4.ip_32_bits & bits.ipv4.mask));
1450 } else if (addr.is_ipv6() && ipv6) {
1451 uint64_t upper_64, lower_64;
1452 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64);
1453 return (((upper_64 & bits.ipv6.upper_mask) ==
1454 (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) &&
1455 ((lower_64 & bits.ipv6.lower_mask) ==
1456 (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask)));
1457 }
1458 return false;
1459 }
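// Illustrative sketch (not compiled): range blocklist entries reuse the
// entity_addr_t nonce as a CIDR prefix length, so a /24 range matches any
// address sharing its top 24 bits. The parse() strings are hypothetical
// input; accessibility of range_bits outside OSDMap is assumed.
#if 0
  entity_addr_t range, client;
  range.parse("192.168.0.0");
  range.set_nonce(24);                   // i.e. 192.168.0.0/24
  client.parse("192.168.0.77");
  OSDMap::range_bits bits(range);
  ceph_assert(bits.matches(client));     // same /24, so it matches
#endif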
1460
1461 bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const
1462 {
1463 if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl;
1464 if (blocklist.empty() && range_blocklist.empty()) {
1465 if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl;
1466 return false;
1467 }
1468
1469 // all blocklist entries are type ANY for nautilus+
1470 // FIXME: avoid this copy!
1471 entity_addr_t a = orig;
1472 if (require_osd_release < ceph_release_t::nautilus) {
1473 a.set_type(entity_addr_t::TYPE_LEGACY);
1474 } else {
1475 a.set_type(entity_addr_t::TYPE_ANY);
1476 }
1477
1478 // this specific instance?
1479 if (blocklist.count(a)) {
1480 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
1481 return true;
1482 }
1483
1484 // is entire ip blocklisted?
1485 if (a.is_ip()) {
1486 a.set_port(0);
1487 a.set_nonce(0);
1488 if (blocklist.count(a)) {
1489 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
1490 return true;
1491 }
1492 }
1493
1494 // is it in a blocklisted range?
1495 for (const auto& i : calculated_ranges) {
1496 bool blocked = i.second.matches(a);
1497 if (blocked) {
1498 if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl;
1499 return true;
1500 }
1501 }
1502
1503 if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl;
1504 return false;
1505 }
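// Illustrative sketch (not compiled): a typical caller only needs this
// addr-level check; the exact instance, the whole IP (port and nonce zeroed)
// and any blocklisted range are all consulted as above. The address string
// and the surrounding locals are hypothetical.
#if 0
  entity_addr_t a;
  a.parse("10.0.0.5:6800/12345");
  if (osdmap.is_blocklisted(a, cct)) {
    // refuse the client / connection
  }
#endif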
1506
1507 bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const
1508 {
1509 if (blocklist.empty() && range_blocklist.empty())
1510 return false;
1511
1512 for (auto& a : av.v) {
1513 if (is_blocklisted(a, cct)) {
1514 return true;
1515 }
1516 }
1517
1518 return false;
1519 }
1520
1521 void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl,
1522 std::list<std::pair<entity_addr_t,utime_t> > *rl) const
1523 {
1524 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
1525 std::copy(range_blocklist.begin(), range_blocklist.end(),
1526 std::back_inserter(*rl));
1527 }
1528
1529 void OSDMap::get_blocklist(std::set<entity_addr_t> *bl,
1530 std::set<entity_addr_t> *rl) const
1531 {
1532 for (const auto &i : blocklist) {
1533 bl->insert(i.first);
1534 }
1535 for (const auto &i : range_blocklist) {
1536 rl->insert(i.first);
1537 }
1538 }
1539
1540 void OSDMap::set_max_osd(int m)
1541 {
1542 max_osd = m;
1543 osd_state.resize(max_osd, 0);
1544 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1545 osd_info.resize(max_osd);
1546 osd_xinfo.resize(max_osd);
1547 osd_addrs->client_addrs.resize(max_osd);
1548 osd_addrs->cluster_addrs.resize(max_osd);
1549 osd_addrs->hb_back_addrs.resize(max_osd);
1550 osd_addrs->hb_front_addrs.resize(max_osd);
1551 osd_uuid->resize(max_osd);
1552 if (osd_primary_affinity)
1553 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1554
1555 calc_num_osds();
1556 }
1557
1558 int OSDMap::calc_num_osds()
1559 {
1560 num_osd = 0;
1561 num_up_osd = 0;
1562 num_in_osd = 0;
1563 for (int i=0; i<max_osd; i++) {
1564 if (osd_state[i] & CEPH_OSD_EXISTS) {
1565 ++num_osd;
1566 if (osd_state[i] & CEPH_OSD_UP) {
1567 ++num_up_osd;
1568 }
1569 if (get_weight(i) != CEPH_OSD_OUT) {
1570 ++num_in_osd;
1571 }
1572 }
1573 }
1574 return num_osd;
1575 }
1576
1577 void OSDMap::get_full_pools(CephContext *cct,
1578 set<int64_t> *full,
1579 set<int64_t> *backfillfull,
1580 set<int64_t> *nearfull) const
1581 {
1582 ceph_assert(full);
1583 ceph_assert(backfillfull);
1584 ceph_assert(nearfull);
1585 full->clear();
1586 backfillfull->clear();
1587 nearfull->clear();
1588
1589 vector<int> full_osds;
1590 vector<int> backfillfull_osds;
1591 vector<int> nearfull_osds;
1592 for (int i = 0; i < max_osd; ++i) {
1593 if (exists(i) && is_up(i) && is_in(i)) {
1594 if (osd_state[i] & CEPH_OSD_FULL)
1595 full_osds.push_back(i);
1596 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1597 backfillfull_osds.push_back(i);
1598 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1599 nearfull_osds.push_back(i);
1600 }
1601 }
1602
1603 for (auto i: full_osds) {
1604 get_pool_ids_by_osd(cct, i, full);
1605 }
1606 for (auto i: backfillfull_osds) {
1607 get_pool_ids_by_osd(cct, i, backfillfull);
1608 }
1609 for (auto i: nearfull_osds) {
1610 get_pool_ids_by_osd(cct, i, nearfull);
1611 }
1612 }
1613
1614 void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1615 set<int> *nearfull) const
1616 {
1617 full->clear();
1618 backfill->clear();
1619 nearfull->clear();
1620 for (int i = 0; i < max_osd; ++i) {
1621 if (exists(i) && is_up(i) && is_in(i)) {
1622 if (osd_state[i] & CEPH_OSD_FULL)
1623 full->emplace(i);
1624 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1625 backfill->emplace(i);
1626 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1627 nearfull->emplace(i);
1628 }
1629 }
1630 }
1631
1632 void OSDMap::get_all_osds(set<int32_t>& ls) const
1633 {
1634 for (int i=0; i<max_osd; i++)
1635 if (exists(i))
1636 ls.insert(i);
1637 }
1638
1639 void OSDMap::get_up_osds(set<int32_t>& ls) const
1640 {
1641 for (int i = 0; i < max_osd; i++) {
1642 if (is_up(i))
1643 ls.insert(i);
1644 }
1645 }
1646
1647 void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
1648 {
1649 for (int i = 0; i < max_osd; i++) {
1650 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
1651 ls.insert(i);
1652 }
1653 }
1654
1655 void OSDMap::get_flag_set(set<string> *flagset) const
1656 {
1657 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1658 if (flags & (1<<i)) {
1659 flagset->insert(get_flag_string(flags & (1<<i)));
1660 }
1661 }
1662 }
1663
1664 void OSDMap::calc_state_set(int state, set<string>& st)
1665 {
1666 unsigned t = state;
1667 for (unsigned s = 1; t; s <<= 1) {
1668 if (t & s) {
1669 t &= ~s;
1670 st.insert(ceph_osd_state_name(s));
1671 }
1672 }
1673 }
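// Illustrative sketch (not compiled): calc_state_set() splits a state bitmask
// into one name per set bit via ceph_osd_state_name(); e.g. an existing, up
// OSD yields the two corresponding names ("exists" and "up").
#if 0
  std::set<std::string> st;
  OSDMap::calc_state_set(CEPH_OSD_EXISTS | CEPH_OSD_UP, st);
  // st now holds {"exists", "up"}
#endif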
1674
1675 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1676 {
1677 float max = 0;
1678 for (const auto &weight : weights) {
1679 if (weight.second > max)
1680 max = weight.second;
1681 }
1682
1683 for (const auto &weight : weights) {
1684 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1685 }
1686 }
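// Worked example (not compiled): weights are normalised against the largest
// input, so the heaviest OSD lands exactly on CEPH_OSD_IN (0x10000) and the
// rest scale proportionally; 'osdmap' and 'inc' are hypothetical locals.
#if 0
  std::map<int,double> w = {{0, 2.0}, {1, 1.0}};
  OSDMap::Incremental inc;
  osdmap.adjust_osd_weights(w, inc);
  // inc.new_weight[0] == 0x10000 (fully in), inc.new_weight[1] == 0x8000
#endif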
1687
1688 int OSDMap::identify_osd(const entity_addr_t& addr) const
1689 {
1690 for (int i=0; i<max_osd; i++)
1691 if (exists(i) && (get_addrs(i).contains(addr) ||
1692 get_cluster_addrs(i).contains(addr)))
1693 return i;
1694 return -1;
1695 }
1696
1697 int OSDMap::identify_osd(const uuid_d& u) const
1698 {
1699 for (int i=0; i<max_osd; i++)
1700 if (exists(i) && get_uuid(i) == u)
1701 return i;
1702 return -1;
1703 }
1704
1705 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1706 {
1707 for (int i=0; i<max_osd; i++)
1708 if (exists(i) && (get_addrs(i).contains(addr) ||
1709 get_cluster_addrs(i).contains(addr) ||
1710 get_hb_back_addrs(i).contains(addr) ||
1711 get_hb_front_addrs(i).contains(addr)))
1712 return i;
1713 return -1;
1714 }
1715
1716 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1717 {
1718 for (int i=0; i<max_osd; i++)
1719 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1720 get_cluster_addrs(i).is_same_host(ip)))
1721 return i;
1722 return -1;
1723 }
1724
1725
1726 uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1727 {
1728 uint64_t features = 0; // things we actually have
1729 uint64_t mask = 0; // things we could have
1730
1731 if (crush->has_nondefault_tunables())
1732 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1733 if (crush->has_nondefault_tunables2())
1734 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1735 if (crush->has_nondefault_tunables3())
1736 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1737 if (crush->has_v4_buckets())
1738 features |= CEPH_FEATURE_CRUSH_V4;
1739 if (crush->has_nondefault_tunables5())
1740 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1741 if (crush->has_incompat_choose_args()) {
1742 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1743 }
1744 mask |= CEPH_FEATURES_CRUSH;
1745
1746 if (!pg_upmap.empty() || !pg_upmap_items.empty() || !pg_upmap_primaries.empty())
1747 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1748 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1749
1750 for (auto &pool: pools) {
1751 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1752 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1753 }
1754 if (!pool.second.tiers.empty() ||
1755 pool.second.is_tier()) {
1756 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1757 }
1758 int ruleid = pool.second.get_crush_rule();
1759 if (ruleid >= 0) {
1760 if (crush->is_v2_rule(ruleid))
1761 features |= CEPH_FEATURE_CRUSH_V2;
1762 if (crush->is_v3_rule(ruleid))
1763 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1764 if (crush->is_v5_rule(ruleid))
1765 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1766 }
1767 }
1768 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1769
1770 if (osd_primary_affinity) {
1771 for (int i = 0; i < max_osd; ++i) {
1772 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1773 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1774 break;
1775 }
1776 }
1777 }
1778 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1779
1780 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1781 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
1782 if (require_osd_release >= ceph_release_t::jewel) {
1783 features |= jewel_features;
1784 }
1785 mask |= jewel_features;
1786
1787 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1788 | CEPH_FEATURE_MSG_ADDR2;
1789 if (require_osd_release >= ceph_release_t::kraken) {
1790 features |= kraken_features;
1791 }
1792 mask |= kraken_features;
1793
1794 if (stretch_mode_enabled) {
1795 features |= CEPH_FEATUREMASK_STRETCH_MODE;
1796 mask |= CEPH_FEATUREMASK_STRETCH_MODE;
1797 }
1798 }
1799
1800 if (require_min_compat_client >= ceph_release_t::nautilus) {
1801 // if min_compat_client is >= nautilus, require v2 cephx signatures
1802 // from everyone
1803 features |= CEPH_FEATUREMASK_CEPHX_V2;
1804 } else if (require_osd_release >= ceph_release_t::nautilus &&
1805 entity_type == CEPH_ENTITY_TYPE_OSD) {
1806 // if osds are >= nautilus, at least require the signatures from them
1807 features |= CEPH_FEATUREMASK_CEPHX_V2;
1808 }
1809 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1810
1811 if (pmask)
1812 *pmask = mask;
1813 return features;
1814 }
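// Illustrative sketch (not compiled): callers usually want both the feature
// bits this map implies and the mask of bits the function reports on, so the
// result can be compared against a peer's advertised features.
// 'peer_features' is a hypothetical value from the connection.
#if 0
  uint64_t mask = 0;
  uint64_t need = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, &mask);
  uint64_t missing = need & ~peer_features;   // feature bits the peer lacks
#endif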
1815
1816 ceph_release_t OSDMap::get_min_compat_client() const
1817 {
1818 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1819
1820 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1821 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1822 return ceph_release_t::luminous; // v12.2.0
1823 }
1824 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1825 return ceph_release_t::jewel; // v10.2.0
1826 }
1827 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1828 return ceph_release_t::hammer; // v0.94.0
1829 }
1830 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1831 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1832 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1833 return ceph_release_t::firefly; // v0.80.0
1834 }
1835 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1836 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1837 return ceph_release_t::dumpling; // v0.67.0
1838 }
1839 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1840 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
1841 }
1842 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
1843 }
1844
1845 ceph_release_t OSDMap::get_require_min_compat_client() const
1846 {
1847 return require_min_compat_client;
1848 }
1849
1850 void OSDMap::_calc_up_osd_features()
1851 {
1852 bool first = true;
1853 cached_up_osd_features = 0;
1854 for (int osd = 0; osd < max_osd; ++osd) {
1855 if (!is_up(osd))
1856 continue;
1857 const osd_xinfo_t &xi = get_xinfo(osd);
1858 if (xi.features == 0)
1859 continue; // bogus xinfo, maybe #20751 or similar, skipping
1860 if (first) {
1861 cached_up_osd_features = xi.features;
1862 first = false;
1863 } else {
1864 cached_up_osd_features &= xi.features;
1865 }
1866 }
1867 }
1868
1869 uint64_t OSDMap::get_up_osd_features() const
1870 {
1871 return cached_up_osd_features;
1872 }
1873
1874 void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1875 {
1876 using ceph::encode;
1877 if (o->epoch == n->epoch)
1878 return;
1879
1880 int diff = 0;
1881
1882 // do addrs match?
1883 if (o->max_osd != n->max_osd)
1884 diff++;
1885 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1886 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1887 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1888 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
1889 else
1890 diff++;
1891 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1892 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1893 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
1894 else
1895 diff++;
1896 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1897 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1898 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
1899 else
1900 diff++;
1901 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1902 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1903 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
1904 else
1905 diff++;
1906 }
1907 if (diff == 0) {
1908 // zoinks, no differences at all!
1909 n->osd_addrs = o->osd_addrs;
1910 }
1911
1912 // does crush match?
1913 ceph::buffer::list oc, nc;
1914 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1915 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1916 if (oc.contents_equal(nc)) {
1917 n->crush = o->crush;
1918 }
1919
1920 // does pg_temp match?
1921 if (*o->pg_temp == *n->pg_temp)
1922 n->pg_temp = o->pg_temp;
1923
1924 // does primary_temp match?
1925 if (o->primary_temp->size() == n->primary_temp->size()) {
1926 if (*o->primary_temp == *n->primary_temp)
1927 n->primary_temp = o->primary_temp;
1928 }
1929
1930 // do uuids match?
1931 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1932 *o->osd_uuid == *n->osd_uuid)
1933 n->osd_uuid = o->osd_uuid;
1934 }
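// Illustrative sketch (not compiled): dedup() is used after decoding a new
// full map so that sub-structures unchanged since the previous epoch are
// shared by pointer instead of duplicated; 'prev' and 'next' are hypothetical.
#if 0
  OSDMap::dedup(&prev, &next);
  // next now shares prev's identical address entries, crush map, pg_temp,
  // primary_temp and uuid tables wherever the contents were equal.
#endif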
1935
1936 void OSDMap::clean_temps(CephContext *cct,
1937 const OSDMap& oldmap,
1938 const OSDMap& nextmap,
1939 Incremental *pending_inc)
1940 {
1941 ldout(cct, 10) << __func__ << dendl;
1942
1943 for (auto pg : *nextmap.pg_temp) {
1944 // if pool does not exist, remove any existing pg_temps associated with
1945 // it. we don't care about pg_temps on the pending_inc either; if there
1946 // are new_pg_temp entries on the pending, clear them out just as well.
1947 if (!nextmap.have_pg_pool(pg.first.pool())) {
1948 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1949 << " for nonexistent pool " << pg.first.pool() << dendl;
1950 pending_inc->new_pg_temp[pg.first].clear();
1951 continue;
1952 }
1953 if (!nextmap.pg_exists(pg.first)) {
1954 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1955 << " for nonexistent pg " << dendl;
1956 pending_inc->new_pg_temp[pg.first].clear();
1957 continue;
1958 }
1959 // all osds down?
1960 unsigned num_up = 0;
1961 for (auto o : pg.second) {
1962 if (!nextmap.is_down(o)) {
1963 ++num_up;
1964 break;
1965 }
1966 }
1967 if (num_up == 0) {
1968 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1969                      << " with all down osds " << pg.second << dendl;
1970 pending_inc->new_pg_temp[pg.first].clear();
1971 continue;
1972 }
1973 // redundant pg_temp?
1974 vector<int> raw_up;
1975 int primary;
1976 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
1977 bool remove = false;
1978 if (raw_up == pg.second) {
1979 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1980 << pg.second << " that matches raw_up mapping" << dendl;
1981 remove = true;
1982 }
1983 // oversized pg_temp?
1984 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
1985 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1986 << pg.second << " exceeds pool size" << dendl;
1987 remove = true;
1988 }
1989 if (remove) {
1990 if (oldmap.pg_temp->count(pg.first))
1991 pending_inc->new_pg_temp[pg.first].clear();
1992 else
1993 pending_inc->new_pg_temp.erase(pg.first);
1994 }
1995 }
1996
1997 for (auto &pg : *nextmap.primary_temp) {
1998 // primary down?
1999 if (nextmap.is_down(pg.second)) {
2000 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
2001 << " to down " << pg.second << dendl;
2002 pending_inc->new_primary_temp[pg.first] = -1;
2003 continue;
2004 }
2005 // redundant primary_temp?
2006 vector<int> real_up, templess_up;
2007 int real_primary, templess_primary;
2008 pg_t pgid = pg.first;
2009 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
2010 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
2011     if (real_primary == templess_primary) {
2012 ldout(cct, 10) << __func__ << " removing primary_temp "
2013 << pgid << " -> " << real_primary
2014 << " (unnecessary/redundant)" << dendl;
2015 if (oldmap.primary_temp->count(pgid))
2016 pending_inc->new_primary_temp[pgid] = -1;
2017 else
2018 pending_inc->new_primary_temp.erase(pgid);
2019 }
2020 }
2021 }
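// Usage sketch (illustrative): a monitor-side caller pruning stale temp
// mappings while preparing the next epoch. 'osdmap' (committed map), 'nextmap'
// (next map with pending changes applied) and 'pending_inc' are assumed to
// exist in the caller.
#if 0
  OSDMap::clean_temps(cct, osdmap, nextmap, &pending_inc);
  // pg_temp entries for dead pools/pgs, all-down sets, raw_up-identical or
  // oversized sets are cleared; primary_temp entries pointing at a down or
  // already-primary osd are reset to -1.
#endif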
2022
2023 void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
2024 {
2025 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
2026 for (auto& p : pg_upmap)
2027 upmap_pgs->push_back(p.first);
2028 for (auto& p : pg_upmap_items)
2029 upmap_pgs->push_back(p.first);
2030 }
2031
2032 bool OSDMap::check_pg_upmaps(
2033 CephContext *cct,
2034 const vector<pg_t>& to_check,
2035 vector<pg_t> *to_cancel,
2036 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
2037 {
2038 bool any_change = false;
2039 map<int, map<int, float>> rule_weight_map;
2040 for (auto& pg : to_check) {
2041 const pg_pool_t *pi = get_pg_pool(pg.pool());
2042 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
2043 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
2044 << dendl;
2045 to_cancel->push_back(pg);
2046 continue;
2047 }
2048 if (pi->is_pending_merge(pg, nullptr)) {
2049 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
2050 << dendl;
2051 to_cancel->push_back(pg);
2052 continue;
2053 }
2054 vector<int> raw, up;
2055 pg_to_raw_upmap(pg, &raw, &up);
2056 auto crush_rule = get_pg_pool_crush_rule(pg);
2057 auto r = crush->verify_upmap(cct,
2058 crush_rule,
2059 get_pg_pool_size(pg),
2060 up);
2061 if (r < 0) {
2062 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
2063 << " returning " << r
2064 << dendl;
2065 to_cancel->push_back(pg);
2066 continue;
2067 }
2068 // below we check against crush-topology changing..
2069 map<int, float> weight_map;
2070 auto it = rule_weight_map.find(crush_rule);
2071 if (it == rule_weight_map.end()) {
2072 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
2073 if (r < 0) {
2074 lderr(cct) << __func__ << " unable to get crush weight_map for "
2075 << "crush_rule " << crush_rule
2076 << dendl;
2077 continue;
2078 }
2079 rule_weight_map[crush_rule] = weight_map;
2080 } else {
2081 weight_map = it->second;
2082 }
2083 ldout(cct, 10) << __func__ << " pg " << pg
2084 << " weight_map " << weight_map
2085 << dendl;
2086 for (auto osd : up) {
2087 auto it = weight_map.find(osd);
2088 if (it == weight_map.end()) {
2089 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
2090 << "been moved out of the specific crush-tree"
2091 << dendl;
2092 to_cancel->push_back(pg);
2093 break;
2094 }
2095 auto adjusted_weight = get_weightf(it->first) * it->second;
2096 if (adjusted_weight == 0) {
2097 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
2098 << " is out/crush-out"
2099 << dendl;
2100 to_cancel->push_back(pg);
2101 break;
2102 }
2103 }
2104 if (!to_cancel->empty() && to_cancel->back() == pg)
2105 continue;
2106 // okay, upmap is valid
2107 // continue to check if it is still necessary
2108 auto i = pg_upmap.find(pg);
2109 if (i != pg_upmap.end()) {
2110 if (i->second == raw) {
2111         ldout(cct, 10) << __func__ << " removing redundant pg_upmap " << i->first << " "
2112 << i->second << dendl;
2113 to_cancel->push_back(pg);
2114 continue;
2115 }
2116 if ((int)i->second.size() != get_pg_pool_size(pg)) {
2117         ldout(cct, 10) << __func__ << " removing pg_upmap " << i->first << " "
2118 << i->second << " != pool size " << get_pg_pool_size(pg)
2119 << dendl;
2120 to_cancel->push_back(pg);
2121 continue;
2122 }
2123 }
2124 auto j = pg_upmap_items.find(pg);
2125 if (j != pg_upmap_items.end()) {
2126 mempool::osdmap::vector<pair<int,int>> newmap;
2127 for (auto& p : j->second) {
2128 auto osd_from = p.first;
2129 auto osd_to = p.second;
2130 if (std::find(raw.begin(), raw.end(), osd_from) == raw.end()) {
2131 // cancel mapping if source osd does not exist anymore
2132 ldout(cct, 20) << __func__ << " pg_upmap_items (source osd does not exist) " << pg_upmap_items << dendl;
2133 continue;
2134 }
2135 if (osd_to != CRUSH_ITEM_NONE && osd_to < max_osd &&
2136 osd_to >= 0 && osd_weight[osd_to] == 0) {
2137 // cancel mapping if target osd is out
2138 ldout(cct, 20) << __func__ << " pg_upmap_items (target osd is out) " << pg_upmap_items << dendl;
2139 continue;
2140 }
2141 newmap.push_back(p);
2142 }
2143 if (newmap.empty()) {
2144 ldout(cct, 10) << __func__ << " removing no-op pg_upmap_items "
2145 << j->first << " " << j->second
2146 << dendl;
2147 to_cancel->push_back(pg);
2148       } else if (newmap.size() != j->second.size()) {
2149         // partial no-op: some pairs were dropped; simplify instead of cancelling
2150 ldout(cct, 10) << __func__ << " simplifying partially no-op pg_upmap_items "
2151 << j->first << " " << j->second
2152 << " -> " << newmap
2153 << dendl;
2154 to_remap->insert({pg, newmap});
2155 any_change = true;
2156 }
2157 }
2158 }
2159 any_change = any_change || !to_cancel->empty();
2160 return any_change;
2161 }
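// Worked example (hypothetical values) tracing the pg_upmap_items branch above
// for a single pg:
#if 0
  vector<int> raw = {3, 7, 8};                                  // crush mapping
  mempool::osdmap::vector<pair<int,int>> items = {{3,5},{4,9}}; // current upmap items
  // osd 4 is not in raw, so the pair (4,9) is dropped; osd 3 is in raw and
  // osd 5 is not out, so (3,5) is kept. Result: to_remap[pg] = {{3,5}}.
  // Had both pairs been dropped, the pg would have gone onto to_cancel instead.
#endif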
2162
2163 void OSDMap::clean_pg_upmaps(
2164 CephContext *cct,
2165 Incremental *pending_inc,
2166 const vector<pg_t>& to_cancel,
2167 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2168 {
2169 for (auto &pg: to_cancel) {
2170 auto i = pending_inc->new_pg_upmap.find(pg);
2171 if (i != pending_inc->new_pg_upmap.end()) {
2172 ldout(cct, 10) << __func__ << " cancel invalid pending "
2173 << "pg_upmap entry "
2174 << i->first << "->" << i->second
2175 << dendl;
2176 pending_inc->new_pg_upmap.erase(i);
2177 }
2178 auto j = pg_upmap.find(pg);
2179 if (j != pg_upmap.end()) {
2180 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2181 << j->first << "->" << j->second
2182 << dendl;
2183 pending_inc->old_pg_upmap.insert(pg);
2184 }
2185 auto p = pending_inc->new_pg_upmap_items.find(pg);
2186 if (p != pending_inc->new_pg_upmap_items.end()) {
2187 ldout(cct, 10) << __func__ << " cancel invalid pending "
2188 << "pg_upmap_items entry "
2189 << p->first << "->" << p->second
2190 << dendl;
2191 pending_inc->new_pg_upmap_items.erase(p);
2192 }
2193 auto q = pg_upmap_items.find(pg);
2194 if (q != pg_upmap_items.end()) {
2195 ldout(cct, 10) << __func__ << " cancel invalid "
2196 << "pg_upmap_items entry "
2197 << q->first << "->" << q->second
2198 << dendl;
2199 pending_inc->old_pg_upmap_items.insert(pg);
2200 }
2201 }
2202 for (auto& i : to_remap)
2203 pending_inc->new_pg_upmap_items[i.first] = i.second;
2204 }
2205
2206 bool OSDMap::clean_pg_upmaps(
2207 CephContext *cct,
2208 Incremental *pending_inc) const
2209 {
2210 ldout(cct, 10) << __func__ << dendl;
2211 vector<pg_t> to_check;
2212 vector<pg_t> to_cancel;
2213 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2214
2215 get_upmap_pgs(&to_check);
2216 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2217 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2218   // TODO: create the same three helpers for pg_upmap_primaries so that they can be
2219   // checked and cleaned in the same way as pg_upmap. This is not critical, since
2220   // invalid pg_upmap_primaries are never applied (the final check is in _apply_upmap).
2221 return any_change;
2222 }
2223
2224 int OSDMap::apply_incremental(const Incremental &inc)
2225 {
2226 new_blocklist_entries = false;
2227 if (inc.epoch == 1)
2228 fsid = inc.fsid;
2229 else if (inc.fsid != fsid)
2230 return -EINVAL;
2231
2232 ceph_assert(inc.epoch == epoch+1);
2233
2234 epoch++;
2235 modified = inc.modified;
2236
2237 // full map?
2238 if (inc.fullmap.length()) {
2239 ceph::buffer::list bl(inc.fullmap);
2240 decode(bl);
2241 return 0;
2242 }
2243
2244 // nope, incremental.
2245 if (inc.new_flags >= 0) {
2246 flags = inc.new_flags;
2247 // the below is just to cover a newly-upgraded luminous mon
2248 // cluster that has to set require_jewel_osds or
2249 // require_kraken_osds before the osds can be upgraded to
2250 // luminous.
2251 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2252 if (require_osd_release < ceph_release_t::kraken) {
2253 require_osd_release = ceph_release_t::kraken;
2254 }
2255 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2256 if (require_osd_release < ceph_release_t::jewel) {
2257 require_osd_release = ceph_release_t::jewel;
2258 }
2259 }
2260 }
2261
2262 if (inc.new_max_osd >= 0)
2263 set_max_osd(inc.new_max_osd);
2264
2265 if (inc.new_pool_max != -1)
2266 pool_max = inc.new_pool_max;
2267
2268 for (const auto &pool : inc.new_pools) {
2269 pools[pool.first] = pool.second;
2270 pools[pool.first].last_change = epoch;
2271 }
2272
2273 new_removed_snaps = inc.new_removed_snaps;
2274 new_purged_snaps = inc.new_purged_snaps;
2275 for (auto p = new_removed_snaps.begin();
2276 p != new_removed_snaps.end();
2277 ++p) {
2278 removed_snaps_queue[p->first].union_of(p->second);
2279 }
2280 for (auto p = new_purged_snaps.begin();
2281 p != new_purged_snaps.end();
2282 ++p) {
2283 auto q = removed_snaps_queue.find(p->first);
2284 ceph_assert(q != removed_snaps_queue.end());
2285 q->second.subtract(p->second);
2286 if (q->second.empty()) {
2287 removed_snaps_queue.erase(q);
2288 }
2289 }
2290
2291 if (inc.new_last_up_change != utime_t()) {
2292 last_up_change = inc.new_last_up_change;
2293 }
2294 if (inc.new_last_in_change != utime_t()) {
2295 last_in_change = inc.new_last_in_change;
2296 }
2297
2298 for (const auto &pname : inc.new_pool_names) {
2299 auto pool_name_entry = pool_name.find(pname.first);
2300 if (pool_name_entry != pool_name.end()) {
2301 name_pool.erase(pool_name_entry->second);
2302 pool_name_entry->second = pname.second;
2303 } else {
2304 pool_name[pname.first] = pname.second;
2305 }
2306 name_pool[pname.second] = pname.first;
2307 }
2308
2309 for (const auto &pool : inc.old_pools) {
2310 pools.erase(pool);
2311 name_pool.erase(pool_name[pool]);
2312 pool_name.erase(pool);
2313 }
2314
2315 for (const auto &weight : inc.new_weight) {
2316 set_weight(weight.first, weight.second);
2317
2318 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2319 // xinfo old_weight.
2320 if (weight.second) {
2321 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2322 osd_xinfo[weight.first].old_weight = 0;
2323 }
2324 }
2325
2326 for (const auto &primary_affinity : inc.new_primary_affinity) {
2327 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2328 }
2329
2330 // erasure_code_profiles
2331 for (const auto &profile : inc.old_erasure_code_profiles)
2332 erasure_code_profiles.erase(profile);
2333
2334 for (const auto &profile : inc.new_erasure_code_profiles) {
2335 set_erasure_code_profile(profile.first, profile.second);
2336 }
2337
2338 // up/down
2339 for (const auto &state : inc.new_state) {
2340 const auto osd = state.first;
2341 int s = state.second ? state.second : CEPH_OSD_UP;
2342 if ((osd_state[osd] & CEPH_OSD_UP) &&
2343 (s & CEPH_OSD_UP)) {
2344 osd_info[osd].down_at = epoch;
2345 osd_xinfo[osd].down_stamp = modified;
2346 }
2347 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2348 (s & CEPH_OSD_EXISTS)) {
2349 // osd is destroyed; clear out anything interesting.
2350 (*osd_uuid)[osd] = uuid_d();
2351 osd_info[osd] = osd_info_t();
2352 osd_xinfo[osd] = osd_xinfo_t();
2353 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
2354 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2355 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2356 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2357 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
2358 osd_state[osd] = 0;
2359 } else {
2360 osd_state[osd] ^= s;
2361 }
2362 }
2363
2364 for (const auto &client : inc.new_up_client) {
2365 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
2366 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
2367 osd_addrs->client_addrs[client.first].reset(
2368 new entity_addrvec_t(client.second));
2369 osd_addrs->hb_back_addrs[client.first].reset(
2370 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2371 osd_addrs->hb_front_addrs[client.first].reset(
2372 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
2373
2374 osd_info[client.first].up_from = epoch;
2375 }
2376
2377 for (const auto &cluster : inc.new_up_cluster)
2378 osd_addrs->cluster_addrs[cluster.first].reset(
2379 new entity_addrvec_t(cluster.second));
2380
2381 // info
2382 for (const auto &thru : inc.new_up_thru)
2383 osd_info[thru.first].up_thru = thru.second;
2384
2385 for (const auto &interval : inc.new_last_clean_interval) {
2386 osd_info[interval.first].last_clean_begin = interval.second.first;
2387 osd_info[interval.first].last_clean_end = interval.second.second;
2388 }
2389
2390 for (const auto &lost : inc.new_lost)
2391 osd_info[lost.first].lost_at = lost.second;
2392
2393 // xinfo
2394 for (const auto &xinfo : inc.new_xinfo)
2395 osd_xinfo[xinfo.first] = xinfo.second;
2396
2397 // uuid
2398 for (const auto &uuid : inc.new_uuid)
2399 (*osd_uuid)[uuid.first] = uuid.second;
2400
2401 // pg rebuild
2402 for (const auto &pg : inc.new_pg_temp) {
2403 if (pg.second.empty())
2404 pg_temp->erase(pg.first);
2405 else
2406 pg_temp->set(pg.first, pg.second);
2407 }
2408 if (!inc.new_pg_temp.empty()) {
2409 // make sure pg_temp is efficiently stored
2410 pg_temp->rebuild();
2411 }
2412
2413 for (const auto &pg : inc.new_primary_temp) {
2414 if (pg.second == -1)
2415 primary_temp->erase(pg.first);
2416 else
2417 (*primary_temp)[pg.first] = pg.second;
2418 }
2419
2420 for (auto& p : inc.new_pg_upmap) {
2421 pg_upmap[p.first] = p.second;
2422 }
2423 for (auto& pg : inc.old_pg_upmap) {
2424 pg_upmap.erase(pg);
2425 }
2426 for (auto& p : inc.new_pg_upmap_items) {
2427 pg_upmap_items[p.first] = p.second;
2428 }
2429 for (auto& pg : inc.old_pg_upmap_items) {
2430 pg_upmap_items.erase(pg);
2431 }
2432
2433 for (auto& [pg, prim] : inc.new_pg_upmap_primary) {
2434 pg_upmap_primaries[pg] = prim;
2435 }
2436 for (auto& pg : inc.old_pg_upmap_primary) {
2437 pg_upmap_primaries.erase(pg);
2438 }
2439
2440 // blocklist
2441 if (!inc.new_blocklist.empty()) {
2442 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2443 new_blocklist_entries = true;
2444 }
2445 for (const auto &addr : inc.old_blocklist)
2446 blocklist.erase(addr);
2447
2448 for (const auto& addr_p : inc.new_range_blocklist) {
2449 range_blocklist.insert(addr_p);
2450 calculated_ranges.emplace(addr_p.first, addr_p.first);
2451 new_blocklist_entries = true;
2452 }
2453 for (const auto &addr : inc.old_range_blocklist) {
2454 calculated_ranges.erase(addr);
2455 range_blocklist.erase(addr);
2456 }
2457
2458 for (auto& i : inc.new_crush_node_flags) {
2459 if (i.second) {
2460 crush_node_flags[i.first] = i.second;
2461 } else {
2462 crush_node_flags.erase(i.first);
2463 }
2464 }
2465
2466 for (auto& i : inc.new_device_class_flags) {
2467 if (i.second) {
2468 device_class_flags[i.first] = i.second;
2469 } else {
2470 device_class_flags.erase(i.first);
2471 }
2472 }
2473
2474 // cluster snapshot?
2475 if (inc.cluster_snapshot.length()) {
2476 cluster_snapshot = inc.cluster_snapshot;
2477 cluster_snapshot_epoch = inc.epoch;
2478 } else {
2479 cluster_snapshot.clear();
2480 cluster_snapshot_epoch = 0;
2481 }
2482
2483 if (inc.new_nearfull_ratio >= 0) {
2484 nearfull_ratio = inc.new_nearfull_ratio;
2485 }
2486 if (inc.new_backfillfull_ratio >= 0) {
2487 backfillfull_ratio = inc.new_backfillfull_ratio;
2488 }
2489 if (inc.new_full_ratio >= 0) {
2490 full_ratio = inc.new_full_ratio;
2491 }
2492 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
2493 require_min_compat_client = inc.new_require_min_compat_client;
2494 }
2495 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
2496 require_osd_release = inc.new_require_osd_release;
2497 if (require_osd_release >= ceph_release_t::luminous) {
2498 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2499 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
2500 }
2501 }
2502
2503 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
2504 require_osd_release = inc.new_require_osd_release;
2505 if (require_osd_release >= ceph_release_t::nautilus) {
2506 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2507 }
2508 }
2509 // do new crush map last (after up/down stuff)
2510 if (inc.crush.length()) {
2511 ceph::buffer::list bl(inc.crush);
2512 auto blp = bl.cbegin();
2513 crush.reset(new CrushWrapper);
2514 crush->decode(blp);
2515 if (require_osd_release >= ceph_release_t::luminous) {
2516 // only increment if this is a luminous-encoded osdmap, lest
2517 // the mon's crush_version diverge from what the osds or others
2518 // are decoding and applying on their end. if we won't encode
2519 // it in the canonical version, don't change it.
2520 ++crush_version;
2521 }
2522 for (auto it = device_class_flags.begin();
2523 it != device_class_flags.end();) {
2524 const char* class_name = crush->get_class_name(it->first);
2525 if (!class_name) // device class is gone
2526 it = device_class_flags.erase(it);
2527 else
2528 it++;
2529 }
2530 }
2531
2532 if (inc.change_stretch_mode) {
2533 stretch_mode_enabled = inc.stretch_mode_enabled;
2534 stretch_bucket_count = inc.new_stretch_bucket_count;
2535 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2536 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2537 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2538 }
2539
2540 switch (inc.mutate_allow_crimson) {
2541 case Incremental::mutate_allow_crimson_t::NONE:
2542 break;
2543 case Incremental::mutate_allow_crimson_t::SET:
2544 allow_crimson = true;
2545 break;
2546 case Incremental::mutate_allow_crimson_t::CLEAR:
2547 allow_crimson = false;
2548 break;
2549 }
2550
2551 calc_num_osds();
2552 _calc_up_osd_features();
2553 return 0;
2554 }
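// Usage sketch (illustrative, not part of the build): advancing an assumed
// cached map 'osdmap' by one epoch. 'inc_bl' is assumed to hold the encoded
// Incremental for epoch osdmap.get_epoch() + 1 (apply_incremental() asserts
// epoch continuity and returns -EINVAL on an fsid mismatch).
#if 0
  OSDMap::Incremental inc;
  auto p = inc_bl.cbegin();
  inc.decode(p);
  int r = osdmap.apply_incremental(inc);
  ceph_assert(r == 0);
#endif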
2555
2556 // mapping
2557 int OSDMap::map_to_pg(
2558 int64_t poolid,
2559 const string& name,
2560 const string& key,
2561 const string& nspace,
2562 pg_t *pg) const
2563 {
2564 // calculate ps (placement seed)
2565 const pg_pool_t *pool = get_pg_pool(poolid);
2566 if (!pool)
2567 return -ENOENT;
2568 ps_t ps;
2569 if (!key.empty())
2570 ps = pool->hash_key(key, nspace);
2571 else
2572 ps = pool->hash_key(name, nspace);
2573 *pg = pg_t(ps, poolid);
2574 return 0;
2575 }
2576
2577 int OSDMap::object_locator_to_pg(
2578 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2579 {
2580 if (loc.hash >= 0) {
2581 if (!get_pg_pool(loc.get_pool())) {
2582 return -ENOENT;
2583 }
2584 pg = pg_t(loc.hash, loc.get_pool());
2585 return 0;
2586 }
2587 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2588 }
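// Usage sketch (illustrative): hashing an object name into a raw pg on an
// assumed map 'osdmap'. The locator key, when present, replaces the object
// name as the hash input, and a pre-set loc.hash bypasses hashing entirely.
// The pool id and object name below are made up.
#if 0
  pg_t pg;
  int r = osdmap.map_to_pg(1, "rbd_data.1234", "" /* key */, "" /* nspace */, &pg);
  // r == -ENOENT if pool 1 does not exist; otherwise pg is a raw pg that
  // callers normally feed to pg_to_up_acting_osds().
#endif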
2589
2590 ceph_object_layout OSDMap::make_object_layout(
2591 object_t oid, int pg_pool, string nspace) const
2592 {
2593 object_locator_t loc(pg_pool, nspace);
2594
2595 ceph_object_layout ol;
2596 pg_t pgid = object_locator_to_pg(oid, loc);
2597 ol.ol_pgid = pgid.get_old_pg().v;
2598 ol.ol_stripe_unit = 0;
2599 return ol;
2600 }
2601
2602 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2603 vector<int>& osds) const
2604 {
2605 if (pool.can_shift_osds()) {
2606 unsigned removed = 0;
2607 for (unsigned i = 0; i < osds.size(); i++) {
2608 if (!exists(osds[i])) {
2609 removed++;
2610 continue;
2611 }
2612 if (removed) {
2613 osds[i - removed] = osds[i];
2614 }
2615 }
2616 if (removed)
2617 osds.resize(osds.size() - removed);
2618 } else {
2619 for (auto& osd : osds) {
2620 if (!exists(osd))
2621 osd = CRUSH_ITEM_NONE;
2622 }
2623 }
2624 }
2625
2626 void OSDMap::_pg_to_raw_osds(
2627 const pg_pool_t& pool, pg_t pg,
2628 vector<int> *osds,
2629 ps_t *ppps) const
2630 {
2631 // map to osds[]
2632 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2633 unsigned size = pool.get_size();
2634
2635 // what crush rule?
2636 int ruleno = pool.get_crush_rule();
2637 if (ruleno >= 0)
2638 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2639
2640 _remove_nonexistent_osds(pool, *osds);
2641
2642 if (ppps)
2643 *ppps = pps;
2644 }
2645
2646 int OSDMap::_pick_primary(const vector<int>& osds) const
2647 {
2648 for (auto osd : osds) {
2649 if (osd != CRUSH_ITEM_NONE) {
2650 return osd;
2651 }
2652 }
2653 return -1;
2654 }
2655
2656 void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
2657 {
2658 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2659 auto p = pg_upmap.find(pg);
2660 if (p != pg_upmap.end()) {
2661 // make sure targets aren't marked out
2662 for (auto osd : p->second) {
2663 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2664 osd_weight[osd] == 0) {
2665 // reject/ignore the explicit mapping
2666 return;
2667 }
2668 }
2669 *raw = vector<int>(p->second.begin(), p->second.end());
2670 // continue to check and apply pg_upmap_items if any
2671 }
2672
2673 auto q = pg_upmap_items.find(pg);
2674 if (q != pg_upmap_items.end()) {
2675 // NOTE: this approach does not allow a bidirectional swap,
2676     // e.g., applying [[1,2],[2,1]] to [0,1,2] cannot yield [0,2,1].
2677 for (auto& [osd_from, osd_to] : q->second) {
2678       // A capacity-change upmap (replace an osd in the pg with an osd not in the pg);
2679 // make sure the replacement value doesn't already appear
2680 bool exists = false;
2681 ssize_t pos = -1;
2682 for (unsigned i = 0; i < raw->size(); ++i) {
2683 int osd = (*raw)[i];
2684 if (osd == osd_to) {
2685 exists = true;
2686 break;
2687 }
2688 // ignore mapping if target is marked out (or invalid osd id)
2689 if (osd == osd_from &&
2690 pos < 0 &&
2691 !(osd_to != CRUSH_ITEM_NONE && osd_to < max_osd &&
2692 osd_to >= 0 && osd_weight[osd_to] == 0)) {
2693 pos = i;
2694 }
2695 }
2696 if (!exists && pos >= 0) {
2697 (*raw)[pos] = osd_to;
2698 }
2699 }
2700 }
2701 auto r = pg_upmap_primaries.find(pg);
2702 if (r != pg_upmap_primaries.end()) {
2703 auto new_prim = r->second;
2704     // Apply the mapping only if the new primary is a valid osd id and is not marked out
2705 if (new_prim != CRUSH_ITEM_NONE && new_prim < max_osd && new_prim >= 0 &&
2706 osd_weight[new_prim] != 0) {
2707 int new_prim_idx = 0;
2708       for (int i = 1; i < (int)raw->size(); i++) { // start from 1 on purpose
2709 if ((*raw)[i] == new_prim) {
2710 new_prim_idx = i;
2711 break;
2712 }
2713 }
2714 if (new_prim_idx > 0) {
2715 // swap primary
2716 (*raw)[new_prim_idx] = (*raw)[0];
2717 (*raw)[0] = new_prim;
2718 }
2719 }
2720 }
2721 }
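// Worked example (hypothetical values) for the item-replacement loop above:
//   raw mapping from crush:      [0, 1, 2]
//   pg_upmap_items for this pg:  [(1,2), (2,1)]
//   pair (1,2): osd 2 already appears in raw, so 'exists' is set and nothing
//   changes; pair (2,1): osd 1 already appears, so nothing changes either.
//   The result stays [0, 1, 2]; the bidirectional swap to [0, 2, 1] is not
//   expressible. By contrast a single item (1,5), with osd 5 absent from raw
//   and not marked out, rewrites the mapping to [0, 5, 2].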
2722
2723 // pg -> (up osd list)
2724 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2725 vector<int> *up) const
2726 {
2727 if (pool.can_shift_osds()) {
2728 // shift left
2729 up->clear();
2730 up->reserve(raw.size());
2731 for (unsigned i=0; i<raw.size(); i++) {
2732 if (!exists(raw[i]) || is_down(raw[i]))
2733 continue;
2734 up->push_back(raw[i]);
2735 }
2736 } else {
2737 // set down/dne devices to NONE
2738 up->resize(raw.size());
2739 for (int i = raw.size() - 1; i >= 0; --i) {
2740 if (!exists(raw[i]) || is_down(raw[i])) {
2741 (*up)[i] = CRUSH_ITEM_NONE;
2742 } else {
2743 (*up)[i] = raw[i];
2744 }
2745 }
2746 }
2747 }
2748
2749 void OSDMap::_apply_primary_affinity(ps_t seed,
2750 const pg_pool_t& pool,
2751 vector<int> *osds,
2752 int *primary) const
2753 {
2754 // do we have any non-default primary_affinity values for these osds?
2755 if (!osd_primary_affinity)
2756 return;
2757
2758 bool any = false;
2759 for (const auto osd : *osds) {
2760 if (osd != CRUSH_ITEM_NONE &&
2761 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2762 any = true;
2763 break;
2764 }
2765 }
2766 if (!any)
2767 return;
2768
2769 // pick the primary. feed both the seed (for the pg) and the osd
2770 // into the hash/rng so that a proportional fraction of an osd's pgs
2771 // get rejected as primary.
2772 int pos = -1;
2773 for (unsigned i = 0; i < osds->size(); ++i) {
2774 int o = (*osds)[i];
2775 if (o == CRUSH_ITEM_NONE)
2776 continue;
2777 unsigned a = (*osd_primary_affinity)[o];
2778 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2779 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2780 seed, o) >> 16) >= a) {
2781 // we chose not to use this primary. note it anyway as a
2782 // fallback in case we don't pick anyone else, but keep looking.
2783 if (pos < 0)
2784 pos = i;
2785 } else {
2786 pos = i;
2787 break;
2788 }
2789 }
2790 if (pos < 0)
2791 return;
2792
2793 *primary = (*osds)[pos];
2794
2795 if (pool.can_shift_osds() && pos > 0) {
2796 // move the new primary to the front.
2797 for (int i = pos; i > 0; --i) {
2798 (*osds)[i] = (*osds)[i-1];
2799 }
2800 (*osds)[0] = *primary;
2801 }
2802 }
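// Worked example (hypothetical values): an osd whose primary affinity is a
// quarter of CEPH_OSD_MAX_PRIMARY_AFFINITY keeps roughly 25% of the pgs that
// would otherwise pick it as primary, because the per-(pg,osd) hash below is
// close to uniform over [0, 0xffff]. 'seed' and 'o' stand for the pg seed and
// candidate osd from the loop above.
#if 0
  unsigned a = CEPH_OSD_MAX_PRIMARY_AFFINITY / 4;
  unsigned h = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, o) >> 16;
  bool rejected_as_primary = (a < CEPH_OSD_MAX_PRIMARY_AFFINITY && h >= a);
#endif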
2803
2804 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2805 vector<int> *temp_pg, int *temp_primary) const
2806 {
2807 pg = pool.raw_pg_to_pg(pg);
2808 const auto p = pg_temp->find(pg);
2809 temp_pg->clear();
2810 if (p != pg_temp->end()) {
2811 for (unsigned i=0; i<p->second.size(); i++) {
2812 if (!exists(p->second[i]) || is_down(p->second[i])) {
2813 if (pool.can_shift_osds()) {
2814 continue;
2815 } else {
2816 temp_pg->push_back(CRUSH_ITEM_NONE);
2817 }
2818 } else {
2819 temp_pg->push_back(p->second[i]);
2820 }
2821 }
2822 }
2823 const auto &pp = primary_temp->find(pg);
2824 *temp_primary = -1;
2825 if (pp != primary_temp->end()) {
2826 *temp_primary = pp->second;
2827 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2828 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2829 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2830 *temp_primary = (*temp_pg)[i];
2831 break;
2832 }
2833 }
2834 }
2835 }
2836
2837 void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
2838 {
2839 const pg_pool_t *pool = get_pg_pool(pg.pool());
2840 if (!pool) {
2841 *primary = -1;
2842 raw->clear();
2843 return;
2844 }
2845 _pg_to_raw_osds(*pool, pg, raw, NULL);
2846 *primary = _pick_primary(*raw);
2847 }
2848
2849 void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2850 vector<int> *raw_upmap) const
2851 {
2852 auto pool = get_pg_pool(pg.pool());
2853 if (!pool) {
2854 raw_upmap->clear();
2855 return;
2856 }
2857 _pg_to_raw_osds(*pool, pg, raw, NULL);
2858 *raw_upmap = *raw;
2859 _apply_upmap(*pool, pg, raw_upmap);
2860 }
2861
2862 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2863 {
2864 const pg_pool_t *pool = get_pg_pool(pg.pool());
2865 if (!pool) {
2866 *primary = -1;
2867 up->clear();
2868 return;
2869 }
2870 vector<int> raw;
2871 ps_t pps;
2872 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2873 _apply_upmap(*pool, pg, &raw);
2874 _raw_to_up_osds(*pool, raw, up);
2875 *primary = _pick_primary(raw);
2876 _apply_primary_affinity(pps, *pool, up, primary);
2877 }
2878
2879 void OSDMap::_pg_to_up_acting_osds(
2880 const pg_t& pg, vector<int> *up, int *up_primary,
2881 vector<int> *acting, int *acting_primary,
2882 bool raw_pg_to_pg) const
2883 {
2884 const pg_pool_t *pool = get_pg_pool(pg.pool());
2885 if (!pool ||
2886 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2887 if (up)
2888 up->clear();
2889 if (up_primary)
2890 *up_primary = -1;
2891 if (acting)
2892 acting->clear();
2893 if (acting_primary)
2894 *acting_primary = -1;
2895 return;
2896 }
2897 vector<int> raw;
2898 vector<int> _up;
2899 vector<int> _acting;
2900 int _up_primary;
2901 int _acting_primary;
2902 ps_t pps;
2903 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2904 if (_acting.empty() || up || up_primary) {
2905 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2906 _apply_upmap(*pool, pg, &raw);
2907 _raw_to_up_osds(*pool, raw, &_up);
2908 _up_primary = _pick_primary(_up);
2909 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2910 if (_acting.empty()) {
2911 _acting = _up;
2912 if (_acting_primary == -1) {
2913 _acting_primary = _up_primary;
2914 }
2915 }
2916
2917 if (up)
2918 up->swap(_up);
2919 if (up_primary)
2920 *up_primary = _up_primary;
2921 }
2922
2923 if (acting)
2924 acting->swap(_acting);
2925 if (acting_primary)
2926 *acting_primary = _acting_primary;
2927 }
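// Sketch (hypothetical values): how pg_temp makes 'acting' diverge from 'up'.
// 'osdmap' and 'pgid' are an assumed map and pg_t in the caller.
#if 0
  // crush/upmap result for the pg:  up     = [4, 1, 2], up_primary     = 4
  // pg_temp entry for the pg:       acting = [1, 2, 3], acting_primary = 1
  // I/O and peering follow 'acting' until backfill completes and clean_temps()
  // drops the now-redundant pg_temp entry, at which point acting reverts to up.
  vector<int> up, acting;
  int up_primary, acting_primary;
  osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &acting_primary);
#endif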
2928
2929 int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
2930 {
2931 // This implementation is broken for EC PGs since the osd may appear
2932 // multiple times in the acting set. See
2933 // https://tracker.ceph.com/issues/43213
2934 if (!nrep)
2935 nrep = acting.size();
2936 for (int i=0; i<nrep; i++)
2937 if (acting[i] == osd)
2938 return i;
2939 return -1;
2940 }
2941
2942 int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
2943 {
2944 int nrep = acting.size();
2945 if (who.shard == shard_id_t::NO_SHARD) {
2946 for (int i=0; i<nrep; i++) {
2947 if (acting[i] == who.osd) {
2948 return i;
2949 }
2950 }
2951 } else {
2952 if (who.shard < nrep && acting[who.shard] == who.osd) {
2953 return who.shard;
2954 }
2955 }
2956 return -1;
2957 }
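// Worked example (hypothetical EC acting set) for the two role helpers above:
// with acting = [3, 1, 3] (osd 3 serves shards 0 and 2),
// calc_pg_role_broken(3, acting) always returns 0, while the shard-aware
// calc_pg_role(pg_shard_t(3, shard_id_t(2)), acting) returns 2.
#if 0
  vector<int> acting = {3, 1, 3};
  int broken = OSDMap::calc_pg_role_broken(3, acting);                     // 0
  int role   = OSDMap::calc_pg_role(pg_shard_t(3, shard_id_t(2)), acting); // 2
#endif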
2958
2959 bool OSDMap::primary_changed_broken(
2960 int oldprimary,
2961 const vector<int> &oldacting,
2962 int newprimary,
2963 const vector<int> &newacting)
2964 {
2965 if (oldacting.empty() && newacting.empty())
2966 return false; // both still empty
2967 if (oldacting.empty() ^ newacting.empty())
2968 return true; // was empty, now not, or vice versa
2969 if (oldprimary != newprimary)
2970 return true; // primary changed
2971 if (calc_pg_role_broken(oldprimary, oldacting) !=
2972 calc_pg_role_broken(newprimary, newacting))
2973 return true;
2974 return false; // same primary (tho replicas may have changed)
2975 }
2976
2977 uint64_t OSDMap::get_encoding_features() const
2978 {
2979 uint64_t f = SIGNIFICANT_FEATURES;
2980 if (require_osd_release < ceph_release_t::octopus) {
2981 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2982 }
2983 if (require_osd_release < ceph_release_t::nautilus) {
2984 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2985 }
2986 if (require_osd_release < ceph_release_t::mimic) {
2987 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2988 }
2989 if (require_osd_release < ceph_release_t::luminous) {
2990 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2991 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2992 }
2993 if (require_osd_release < ceph_release_t::kraken) {
2994 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
2995 CEPH_FEATURE_MSG_ADDR2);
2996 }
2997 if (require_osd_release < ceph_release_t::jewel) {
2998 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
2999 CEPH_FEATURE_NEW_OSDOP_ENCODING |
3000 CEPH_FEATURE_CRUSH_TUNABLES5);
3001 }
3002 return f;
3003 }
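// Usage sketch (illustrative): encoding with the feature mask computed above,
// so the canonical buffer stays decodable by the oldest required release.
// 'osdmap' is an assumed, fully populated map.
#if 0
  ceph::buffer::list bl;
  uint64_t f = osdmap.get_encoding_features();
  // e.g. with require_osd_release == mimic, f keeps CEPH_FEATURE_SERVER_MIMIC
  // but has the nautilus and octopus server bits cleared.
  osdmap.encode(bl, f | CEPH_FEATURE_RESERVED);  // RESERVED marks an authorized encoder
#endif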
3004
3005 // serialize, unserialize
3006 void OSDMap::encode_client_old(ceph::buffer::list& bl) const
3007 {
3008 using ceph::encode;
3009 __u16 v = 5;
3010 encode(v, bl);
3011
3012 // base
3013 encode(fsid, bl);
3014 encode(epoch, bl);
3015 encode(created, bl);
3016 encode(modified, bl);
3017
3018 // for encode(pools, bl);
3019 __u32 n = pools.size();
3020 encode(n, bl);
3021
3022 for (const auto &pool : pools) {
3023 n = pool.first;
3024 encode(n, bl);
3025 encode(pool.second, bl, 0);
3026 }
3027 // for encode(pool_name, bl);
3028 n = pool_name.size();
3029 encode(n, bl);
3030 for (const auto &pname : pool_name) {
3031 n = pname.first;
3032 encode(n, bl);
3033 encode(pname.second, bl);
3034 }
3035 // for encode(pool_max, bl);
3036 n = pool_max;
3037 encode(n, bl);
3038
3039 encode(flags, bl);
3040
3041 encode(max_osd, bl);
3042 {
3043 uint32_t n = osd_state.size();
3044 encode(n, bl);
3045 for (auto s : osd_state) {
3046 encode((uint8_t)s, bl);
3047 }
3048 }
3049 encode(osd_weight, bl);
3050 encode(osd_addrs->client_addrs, bl, 0);
3051
3052 // for encode(pg_temp, bl);
3053 n = pg_temp->size();
3054 encode(n, bl);
3055 for (const auto& pg : *pg_temp) {
3056 old_pg_t opg = pg.first.get_old_pg();
3057 encode(opg, bl);
3058 encode(pg.second, bl);
3059 }
3060
3061 // crush
3062 ceph::buffer::list cbl;
3063 crush->encode(cbl, 0 /* legacy (no) features */);
3064 encode(cbl, bl);
3065 }
3066
3067 void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
3068 {
3069 using ceph::encode;
3070 if ((features & CEPH_FEATURE_PGID64) == 0) {
3071 encode_client_old(bl);
3072 return;
3073 }
3074
3075 __u16 v = 6;
3076 encode(v, bl);
3077
3078 // base
3079 encode(fsid, bl);
3080 encode(epoch, bl);
3081 encode(created, bl);
3082 encode(modified, bl);
3083
3084 encode(pools, bl, features);
3085 encode(pool_name, bl);
3086 encode(pool_max, bl);
3087
3088 encode(flags, bl);
3089
3090 encode(max_osd, bl);
3091 {
3092 uint32_t n = osd_state.size();
3093 encode(n, bl);
3094 for (auto s : osd_state) {
3095 encode((uint8_t)s, bl);
3096 }
3097 }
3098 encode(osd_weight, bl);
3099 encode(osd_addrs->client_addrs, bl, features);
3100
3101 encode(*pg_temp, bl);
3102
3103 // crush
3104 ceph::buffer::list cbl;
3105 crush->encode(cbl, 0 /* legacy (no) features */);
3106 encode(cbl, bl);
3107
3108 // extended
3109 __u16 ev = 10;
3110 encode(ev, bl);
3111 encode(osd_addrs->hb_back_addrs, bl, features);
3112 encode(osd_info, bl);
3113 encode(blocklist, bl, features);
3114 encode(osd_addrs->cluster_addrs, bl, features);
3115 encode(cluster_snapshot_epoch, bl);
3116 encode(cluster_snapshot, bl);
3117 encode(*osd_uuid, bl);
3118 encode(osd_xinfo, bl, features);
3119 encode(osd_addrs->hb_front_addrs, bl, features);
3120 }
3121
3122 /* for a description of osdmap versions, and when they were introduced, please
3123 * refer to
3124 * doc/dev/osd_internals/osdmap_versions.txt
3125 */
3126 void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
3127 {
3128 using ceph::encode;
3129 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
3130 encode_classic(bl, features);
3131 return;
3132 }
3133
3134 // only a select set of callers should *ever* be encoding new
3135 // OSDMaps. others should be passing around the canonical encoded
3136 // buffers from on high. select out those callers by passing in an
3137 // "impossible" feature bit.
3138 ceph_assert(features & CEPH_FEATURE_RESERVED);
3139 features &= ~CEPH_FEATURE_RESERVED;
3140
3141 size_t start_offset = bl.length();
3142 size_t tail_offset;
3143 size_t crc_offset;
3144 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
3145
3146 // meta-encoding: how we include client-used and osd-specific data
3147 ENCODE_START(8, 7, bl);
3148
3149 {
3150 // NOTE: any new encoding dependencies must be reflected by
3151 // SIGNIFICANT_FEATURES
3152 uint8_t v = 10;
3153 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3154 v = 3;
3155 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3156 v = 6;
3157 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3158 v = 7;
3159 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
3160 v = 9;
3161 } */
3162 ENCODE_START(v, 1, bl); // client-usable data
3163 // base
3164 encode(fsid, bl);
3165 encode(epoch, bl);
3166 encode(created, bl);
3167 encode(modified, bl);
3168
3169 encode(pools, bl, features);
3170 encode(pool_name, bl);
3171 encode(pool_max, bl);
3172
3173 if (v < 4) {
3174 decltype(flags) f = flags;
3175 if (require_osd_release >= ceph_release_t::luminous)
3176 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
3177 else if (require_osd_release == ceph_release_t::kraken)
3178 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
3179 else if (require_osd_release == ceph_release_t::jewel)
3180 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
3181 encode(f, bl);
3182 } else {
3183 encode(flags, bl);
3184 }
3185
3186 encode(max_osd, bl);
3187 if (v >= 5) {
3188 encode(osd_state, bl);
3189 } else {
3190 uint32_t n = osd_state.size();
3191 encode(n, bl);
3192 for (auto s : osd_state) {
3193 encode((uint8_t)s, bl);
3194 }
3195 }
3196 encode(osd_weight, bl);
3197 if (v >= 8) {
3198 encode(osd_addrs->client_addrs, bl, features);
3199 } else {
3200 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
3201 }
3202
3203 encode(*pg_temp, bl);
3204 encode(*primary_temp, bl);
3205 if (osd_primary_affinity) {
3206 encode(*osd_primary_affinity, bl);
3207 } else {
3208 vector<__u32> v;
3209 encode(v, bl);
3210 }
3211
3212 // crush
3213 ceph::buffer::list cbl;
3214 crush->encode(cbl, features);
3215 encode(cbl, bl);
3216 encode(erasure_code_profiles, bl);
3217
3218 if (v >= 4) {
3219 encode(pg_upmap, bl);
3220 encode(pg_upmap_items, bl);
3221 } else {
3222 ceph_assert(pg_upmap.empty());
3223 ceph_assert(pg_upmap_items.empty());
3224 }
3225 if (v >= 6) {
3226 encode(crush_version, bl);
3227 }
3228 if (v >= 7) {
3229 encode(new_removed_snaps, bl);
3230 encode(new_purged_snaps, bl);
3231 }
3232 if (v >= 9) {
3233 encode(last_up_change, bl);
3234 encode(last_in_change, bl);
3235 }
3236 if (v >= 10) {
3237 encode(pg_upmap_primaries, bl);
3238 } else {
3239 ceph_assert(pg_upmap_primaries.empty());
3240 }
3241 ENCODE_FINISH(bl); // client-usable data
3242 }
3243
3244 {
3245 // NOTE: any new encoding dependencies must be reflected by
3246 // SIGNIFICANT_FEATURES
3247 uint8_t target_v = 9; // when bumping this, be aware of allow_crimson
3248 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3249 target_v = 1;
3250 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3251 target_v = 5;
3252 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3253 target_v = 6;
3254 }
3255 if (stretch_mode_enabled) {
3256 target_v = std::max((uint8_t)10, target_v);
3257 }
3258 if (!range_blocklist.empty()) {
3259 target_v = std::max((uint8_t)11, target_v);
3260 }
3261 if (allow_crimson) {
3262 target_v = std::max((uint8_t)12, target_v);
3263 }
3264 ENCODE_START(target_v, 1, bl); // extended, osd-only data
3265 if (target_v < 7) {
3266 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
3267 } else {
3268 encode(osd_addrs->hb_back_addrs, bl, features);
3269 }
3270 encode(osd_info, bl);
3271 {
3272 // put this in a sorted, ordered map<> so that we encode in a
3273 // deterministic order.
3274 map<entity_addr_t,utime_t> blocklist_map;
3275 for (const auto &addr : blocklist)
3276 blocklist_map.insert(make_pair(addr.first, addr.second));
3277 encode(blocklist_map, bl, features);
3278 }
3279 if (target_v < 7) {
3280 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3281 } else {
3282 encode(osd_addrs->cluster_addrs, bl, features);
3283 }
3284 encode(cluster_snapshot_epoch, bl);
3285 encode(cluster_snapshot, bl);
3286 encode(*osd_uuid, bl);
3287 encode(osd_xinfo, bl, features);
3288 if (target_v < 7) {
3289 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3290 } else {
3291 encode(osd_addrs->hb_front_addrs, bl, features);
3292 }
3293 if (target_v >= 2) {
3294 encode(nearfull_ratio, bl);
3295 encode(full_ratio, bl);
3296 encode(backfillfull_ratio, bl);
3297 }
3298 // 4 was string-based new_require_min_compat_client
3299 if (target_v >= 5) {
3300 encode(require_min_compat_client, bl);
3301 encode(require_osd_release, bl);
3302 }
3303 if (target_v >= 6) {
3304 encode(removed_snaps_queue, bl);
3305 }
3306 if (target_v >= 8) {
3307 encode(crush_node_flags, bl);
3308 }
3309 if (target_v >= 9) {
3310 encode(device_class_flags, bl);
3311 }
3312 if (target_v >= 10) {
3313 encode(stretch_mode_enabled, bl);
3314 encode(stretch_bucket_count, bl);
3315 encode(degraded_stretch_mode, bl);
3316 encode(recovering_stretch_mode, bl);
3317 encode(stretch_mode_bucket, bl);
3318 }
3319 if (target_v >= 11) {
3320 ::encode(range_blocklist, bl, features);
3321 }
3322 if (target_v >= 12) {
3323 ::encode(allow_crimson, bl);
3324 }
3325 ENCODE_FINISH(bl); // osd-only data
3326 }
3327
3328 crc_offset = bl.length();
3329 crc_filler = bl.append_hole(sizeof(uint32_t));
3330 tail_offset = bl.length();
3331
3332 ENCODE_FINISH(bl); // meta-encoding wrapper
3333
3334 // fill in crc
3335 ceph::buffer::list front;
3336 front.substr_of(bl, start_offset, crc_offset - start_offset);
3337 crc = front.crc32c(-1);
3338 if (tail_offset < bl.length()) {
3339 ceph::buffer::list tail;
3340 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3341 crc = tail.crc32c(crc);
3342 }
3343 ceph_le32 crc_le;
3344 crc_le = crc;
3345 crc_filler->copy_in(4, (char*)&crc_le);
3346 crc_defined = true;
3347 }
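// Sketch (illustrative) of the crc scheme used above: a 4-byte hole is reserved
// where the crc will live, the crc is computed over everything before the hole
// plus everything after it, and the result is copied into the hole. decode()
// recomputes the same front+tail crc and compares it with the stored value.
#if 0
  // front = [start_offset, crc_offset), tail = [tail_offset, end)
  uint32_t actual = front.crc32c(-1);
  actual = tail.crc32c(actual);
  // encode(): crc_filler->copy_in(4, (char*)&crc_le) stores 'actual'
  // decode(): throws ceph::buffer::malformed_input if the stored crc differs
#endif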
3348
3349 /* for a description of osdmap versions, and when they were introduced, please
3350 * refer to
3351 * doc/dev/osd_internals/osdmap_versions.txt
3352 */
3353 void OSDMap::decode(ceph::buffer::list& bl)
3354 {
3355 auto p = bl.cbegin();
3356 decode(p);
3357 }
3358
3359 void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
3360 {
3361 using ceph::decode;
3362 __u32 n, t;
3363 __u16 v;
3364 decode(v, p);
3365
3366 // base
3367 decode(fsid, p);
3368 decode(epoch, p);
3369 decode(created, p);
3370 decode(modified, p);
3371
3372 if (v < 6) {
3373 if (v < 4) {
3374 int32_t max_pools = 0;
3375 decode(max_pools, p);
3376 pool_max = max_pools;
3377 }
3378 pools.clear();
3379 decode(n, p);
3380 while (n--) {
3381 decode(t, p);
3382 decode(pools[t], p);
3383 }
3384 if (v == 4) {
3385 decode(n, p);
3386 pool_max = n;
3387 } else if (v == 5) {
3388 pool_name.clear();
3389 decode(n, p);
3390 while (n--) {
3391 decode(t, p);
3392 decode(pool_name[t], p);
3393 }
3394 decode(n, p);
3395 pool_max = n;
3396 }
3397 } else {
3398 decode(pools, p);
3399 decode(pool_name, p);
3400 decode(pool_max, p);
3401 }
3402 // kludge around some old bug that zeroed out pool_max (#2307)
3403 if (pools.size() && pool_max < pools.rbegin()->first) {
3404 pool_max = pools.rbegin()->first;
3405 }
3406
3407 decode(flags, p);
3408
3409 decode(max_osd, p);
3410 {
3411 vector<uint8_t> os;
3412 decode(os, p);
3413 osd_state.resize(os.size());
3414 for (unsigned i = 0; i < os.size(); ++i) {
3415 osd_state[i] = os[i];
3416 }
3417 }
3418 decode(osd_weight, p);
3419 decode(osd_addrs->client_addrs, p);
3420 if (v <= 5) {
3421 pg_temp->clear();
3422 decode(n, p);
3423 while (n--) {
3424 old_pg_t opg;
3425 ceph::decode_raw(opg, p);
3426 mempool::osdmap::vector<int32_t> v;
3427 decode(v, p);
3428 pg_temp->set(pg_t(opg), v);
3429 }
3430 } else {
3431 decode(*pg_temp, p);
3432 }
3433
3434 // crush
3435 ceph::buffer::list cbl;
3436 decode(cbl, p);
3437 auto cblp = cbl.cbegin();
3438 crush->decode(cblp);
3439
3440 // extended
3441 __u16 ev = 0;
3442 if (v >= 5)
3443 decode(ev, p);
3444 decode(osd_addrs->hb_back_addrs, p);
3445 decode(osd_info, p);
3446 if (v < 5)
3447 decode(pool_name, p);
3448
3449 decode(blocklist, p);
3450 if (ev >= 6)
3451 decode(osd_addrs->cluster_addrs, p);
3452 else
3453 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
3454
3455 if (ev >= 7) {
3456 decode(cluster_snapshot_epoch, p);
3457 decode(cluster_snapshot, p);
3458 }
3459
3460 if (ev >= 8) {
3461 decode(*osd_uuid, p);
3462 } else {
3463 osd_uuid->resize(max_osd);
3464 }
3465 if (ev >= 9)
3466 decode(osd_xinfo, p);
3467 else
3468 osd_xinfo.resize(max_osd);
3469
3470 if (ev >= 10)
3471 decode(osd_addrs->hb_front_addrs, p);
3472 else
3473 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
3474
3475 osd_primary_affinity.reset();
3476
3477 post_decode();
3478 }
3479
3480 void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
3481 {
3482 using ceph::decode;
3483 /**
3484 * Older encodings of the OSDMap had a single struct_v which
3485 * covered the whole encoding, and was prior to our modern
3486 * stuff which includes a compatv and a size. So if we see
3487 * a struct_v < 7, we must rewind to the beginning and use our
3488 * classic decoder.
3489 */
3490 size_t start_offset = bl.get_off();
3491 size_t tail_offset = 0;
3492 ceph::buffer::list crc_front, crc_tail;
3493
3494 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3495 if (struct_v < 7) {
3496 bl.seek(start_offset);
3497 decode_classic(bl);
3498 return;
3499 }
3500 /**
3501 * Since we made it past that hurdle, we can use our normal paths.
3502 */
3503 {
3504 DECODE_START(9, bl); // client-usable data
3505 // base
3506 decode(fsid, bl);
3507 decode(epoch, bl);
3508 decode(created, bl);
3509 decode(modified, bl);
3510
3511 decode(pools, bl);
3512 decode(pool_name, bl);
3513 decode(pool_max, bl);
3514
3515 decode(flags, bl);
3516
3517 decode(max_osd, bl);
3518 if (struct_v >= 5) {
3519 decode(osd_state, bl);
3520 } else {
3521 vector<uint8_t> os;
3522 decode(os, bl);
3523 osd_state.resize(os.size());
3524 for (unsigned i = 0; i < os.size(); ++i) {
3525 osd_state[i] = os[i];
3526 }
3527 }
3528 decode(osd_weight, bl);
3529 decode(osd_addrs->client_addrs, bl);
3530
3531 decode(*pg_temp, bl);
3532 decode(*primary_temp, bl);
3533 // dates back to firefly. version increased from 2 to 3 still in firefly.
3534 // do we really still need to keep this around? even for old clients?
3535 if (struct_v >= 2) {
3536 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
3537 decode(*osd_primary_affinity, bl);
3538 if (osd_primary_affinity->empty())
3539 osd_primary_affinity.reset();
3540 } else {
3541 osd_primary_affinity.reset();
3542 }
3543
3544 // crush
3545 ceph::buffer::list cbl;
3546 decode(cbl, bl);
3547 auto cblp = cbl.cbegin();
3548 crush->decode(cblp);
3549 // added in firefly; version increased in luminous, so it affects
3550     // giant, hammer, infernalis, jewel, and kraken. probably should be left
3551 // alone until we require clients to be all luminous?
3552 if (struct_v >= 3) {
3553 decode(erasure_code_profiles, bl);
3554 } else {
3555 erasure_code_profiles.clear();
3556 }
3557 // version increased from 3 to 4 still in luminous, so same as above
3558 // applies.
3559 if (struct_v >= 4) {
3560 decode(pg_upmap, bl);
3561 decode(pg_upmap_items, bl);
3562 } else {
3563 pg_upmap.clear();
3564 pg_upmap_items.clear();
3565 }
3566 // again, version increased from 5 to 6 still in luminous, so above
3567 // applies.
3568 if (struct_v >= 6) {
3569 decode(crush_version, bl);
3570 }
3571 // version increase from 6 to 7 in mimic
3572 if (struct_v >= 7) {
3573 decode(new_removed_snaps, bl);
3574 decode(new_purged_snaps, bl);
3575 }
3576 // version increase from 7 to 8, 8 to 9, in nautilus.
3577 if (struct_v >= 9) {
3578 decode(last_up_change, bl);
3579 decode(last_in_change, bl);
3580 }
3581 if (struct_v >= 10) {
3582 decode(pg_upmap_primaries, bl);
3583 } else {
3584 pg_upmap_primaries.clear();
3585 }
3586 DECODE_FINISH(bl); // client-usable data
3587 }
3588
3589 {
3590 DECODE_START(10, bl); // extended, osd-only data
3591 decode(osd_addrs->hb_back_addrs, bl);
3592 decode(osd_info, bl);
3593 decode(blocklist, bl);
3594 decode(osd_addrs->cluster_addrs, bl);
3595 decode(cluster_snapshot_epoch, bl);
3596 decode(cluster_snapshot, bl);
3597 decode(*osd_uuid, bl);
3598 decode(osd_xinfo, bl);
3599 decode(osd_addrs->hb_front_addrs, bl);
3600 //
3601 if (struct_v >= 2) {
3602 decode(nearfull_ratio, bl);
3603 decode(full_ratio, bl);
3604 } else {
3605 nearfull_ratio = 0;
3606 full_ratio = 0;
3607 }
3608 if (struct_v >= 3) {
3609 decode(backfillfull_ratio, bl);
3610 } else {
3611 backfillfull_ratio = 0;
3612 }
3613 if (struct_v == 4) {
3614 string r;
3615 decode(r, bl);
3616 if (r.length())
3617 require_min_compat_client = ceph_release_from_name(r.c_str());
3618 }
3619 if (struct_v >= 5) {
3620 decode(require_min_compat_client, bl);
3621 decode(require_osd_release, bl);
3622 if (require_osd_release >= ceph_release_t::nautilus) {
3623 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3624 }
3625 if (require_osd_release >= ceph_release_t::luminous) {
3626 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
3627 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
3628 }
3629 } else {
3630 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3631 // only for compat with post-kraken pre-luminous test clusters
3632 require_osd_release = ceph_release_t::luminous;
3633 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
3634 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
3635 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
3636 require_osd_release = ceph_release_t::kraken;
3637 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
3638 require_osd_release = ceph_release_t::jewel;
3639 } else {
3640 require_osd_release = ceph_release_t::unknown;
3641 }
3642 }
3643 if (struct_v >= 6) {
3644 decode(removed_snaps_queue, bl);
3645 }
3646 if (struct_v >= 8) {
3647 decode(crush_node_flags, bl);
3648 } else {
3649 crush_node_flags.clear();
3650 }
3651 if (struct_v >= 9) {
3652 decode(device_class_flags, bl);
3653 } else {
3654 device_class_flags.clear();
3655 }
3656 if (struct_v >= 10) {
3657 decode(stretch_mode_enabled, bl);
3658 decode(stretch_bucket_count, bl);
3659 decode(degraded_stretch_mode, bl);
3660 decode(recovering_stretch_mode, bl);
3661 decode(stretch_mode_bucket, bl);
3662 } else {
3663 stretch_mode_enabled = false;
3664 stretch_bucket_count = 0;
3665 degraded_stretch_mode = 0;
3666 recovering_stretch_mode = 0;
3667 stretch_mode_bucket = 0;
3668 }
3669 if (struct_v >= 11) {
3670 decode(range_blocklist, bl);
3671 calculated_ranges.clear();
3672 for (const auto& i : range_blocklist) {
3673 calculated_ranges.emplace(i.first, i.first);
3674 }
3675 }
3676 if (struct_v >= 12) {
3677 decode(allow_crimson, bl);
3678 }
3679 DECODE_FINISH(bl); // osd-only data
3680 }
3681
3682 if (struct_v >= 8) {
3683 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
3684 decode(crc, bl);
3685 tail_offset = bl.get_off();
3686 crc_defined = true;
3687 } else {
3688 crc_defined = false;
3689 crc = 0;
3690 }
3691
3692 DECODE_FINISH(bl); // wrapper
3693
3694 if (tail_offset) {
3695 // verify crc
3696 uint32_t actual = crc_front.crc32c(-1);
3697 if (tail_offset < bl.get_off()) {
3698 ceph::buffer::list tail;
3699 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3700 actual = tail.crc32c(actual);
3701 }
3702 if (crc != actual) {
3703 ostringstream ss;
3704 ss << "bad crc, actual " << actual << " != expected " << crc;
3705 string s = ss.str();
3706 throw ceph::buffer::malformed_input(s.c_str());
3707 }
3708 }
3709
3710 post_decode();
3711 }
3712
3713 void OSDMap::post_decode()
3714 {
3715 // index pool names
3716 name_pool.clear();
3717 for (const auto &pname : pool_name) {
3718 name_pool[pname.second] = pname.first;
3719 }
3720
3721 calc_num_osds();
3722 _calc_up_osd_features();
3723 }
3724
3725 void OSDMap::dump_erasure_code_profiles(
3726 const mempool::osdmap::map<string,map<string,string>>& profiles,
3727 Formatter *f)
3728 {
3729 f->open_object_section("erasure_code_profiles");
3730 for (const auto &profile : profiles) {
3731 f->open_object_section(profile.first.c_str());
3732 for (const auto &profm : profile.second) {
3733 f->dump_string(profm.first.c_str(), profm.second);
3734 }
3735 f->close_section();
3736 }
3737 f->close_section();
3738 }
3739
3740 void OSDMap::dump_osds(Formatter *f) const
3741 {
3742 f->open_array_section("osds");
3743 for (int i=0; i<get_max_osd(); i++) {
3744 if (exists(i)) {
3745 dump_osd(i, f);
3746 }
3747 }
3748 f->close_section();
3749 }
3750
3751 void OSDMap::dump_osd(int id, Formatter *f) const
3752 {
3753 ceph_assert(f != nullptr);
3754 if (!exists(id)) {
3755 return;
3756 }
3757
3758 f->open_object_section("osd_info");
3759 f->dump_int("osd", id);
3760 f->dump_stream("uuid") << get_uuid(id);
3761 f->dump_int("up", is_up(id));
3762 f->dump_int("in", is_in(id));
3763 f->dump_float("weight", get_weightf(id));
3764 f->dump_float("primary_affinity", get_primary_affinityf(id));
3765 get_info(id).dump(f);
3766 f->dump_object("public_addrs", get_addrs(id));
3767 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3768 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3769 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3770 // compat
3771 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3772 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3773 f->dump_stream("heartbeat_back_addr")
3774 << get_hb_back_addrs(id).get_legacy_str();
3775 f->dump_stream("heartbeat_front_addr")
3776 << get_hb_front_addrs(id).get_legacy_str();
3777
3778 set<string> st;
3779 get_state(id, st);
3780 f->open_array_section("state");
3781 for (const auto &state : st)
3782 f->dump_string("state", state);
3783 f->close_section();
3784
3785 f->close_section();
3786 }
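// Usage sketch (illustrative): rendering one osd's info with a Formatter.
// 'osdmap' is an assumed map; osd.0 is an arbitrary id (dump_osd() is a no-op
// for nonexistent osds).
#if 0
  JSONFormatter jf(true);
  osdmap.dump_osd(0, &jf);
  ostringstream oss;
  jf.flush(oss);
#endif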
3787
3788 void OSDMap::dump_pool(CephContext *cct,
3789 int64_t pid,
3790 const pg_pool_t &pdata,
3791 ceph::Formatter *f) const
3792 {
3793 std::string name("<unknown>");
3794 const auto &pni = pool_name.find(pid);
3795 if (pni != pool_name.end())
3796 name = pni->second;
3797 f->open_object_section("pool");
3798 f->dump_int("pool", pid);
3799 f->dump_string("pool_name", name);
3800 pdata.dump(f);
3801 dump_read_balance_score(cct, pid, pdata, f);
3802 f->close_section(); // pool
3803 }
3804
3805 void OSDMap::dump_read_balance_score(CephContext *cct,
3806 int64_t pid,
3807 const pg_pool_t &pdata,
3808 ceph::Formatter *f) const
3809 {
3810 if (pdata.is_replicated()) {
3811 // Add rb section with values for score, optimal score, raw score
3812     // and primary_affinity average
3813 OSDMap::read_balance_info_t rb_info;
3814 auto rc = calc_read_balance_score(cct, pid, &rb_info);
3815 if (rc >= 0) {
3816 f->open_object_section("read_balance");
3817 f->dump_float("score_acting", rb_info.acting_adj_score);
3818 f->dump_float("score_stable", rb_info.adjusted_score);
3819 f->dump_float("optimal_score", rb_info.optimal_score);
3820 f->dump_float("raw_score_acting", rb_info.acting_raw_score);
3821 f->dump_float("raw_score_stable", rb_info.raw_score);
3822 f->dump_float("primary_affinity_weighted", rb_info.pa_weighted);
3823 f->dump_float("average_primary_affinity", rb_info.pa_avg);
3824 f->dump_float("average_primary_affinity_weighted", rb_info.pa_weighted_avg);
3825 if (rb_info.err_msg.length() > 0) {
3826 f->dump_string("error_message", rb_info.err_msg);
3827 }
3828 f->close_section(); // read_balance
3829 }
3830 else {
3831 if (rb_info.err_msg.length() > 0) {
3832 f->open_object_section("read_balance");
3833 f->dump_string("error_message", rb_info.err_msg);
3834 f->dump_float("score_acting", rb_info.acting_adj_score);
3835 f->dump_float("score_stable", rb_info.adjusted_score);
3836 f->close_section(); // read_balance
3837 }
3838 }
3839 }
3840 }
3841
3842 void OSDMap::dump(Formatter *f, CephContext *cct) const
3843 {
3844 f->dump_int("epoch", get_epoch());
3845 f->dump_stream("fsid") << get_fsid();
3846 f->dump_stream("created") << get_created();
3847 f->dump_stream("modified") << get_modified();
3848 f->dump_stream("last_up_change") << last_up_change;
3849 f->dump_stream("last_in_change") << last_in_change;
3850 f->dump_string("flags", get_flag_string());
3851 f->dump_unsigned("flags_num", flags);
3852 f->open_array_section("flags_set");
3853 set<string> flagset;
3854 get_flag_set(&flagset);
3855 for (auto p : flagset) {
3856 f->dump_string("flag", p);
3857 }
3858 f->close_section();
3859 f->dump_unsigned("crush_version", get_crush_version());
3860 f->dump_float("full_ratio", full_ratio);
3861 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3862 f->dump_float("nearfull_ratio", nearfull_ratio);
3863 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3864 f->dump_int("pool_max", get_pool_max());
3865 f->dump_int("max_osd", get_max_osd());
3866 f->dump_string("require_min_compat_client",
3867 to_string(require_min_compat_client));
3868 f->dump_string("min_compat_client",
3869 to_string(get_min_compat_client()));
3870 f->dump_string("require_osd_release",
3871 to_string(require_osd_release));
3872
3873 f->dump_bool("allow_crimson", allow_crimson);
3874 f->open_array_section("pools");
3875 for (const auto &[pid, pdata] : pools) {
3876 dump_pool(cct, pid, pdata, f);
3877 }
3878 f->close_section();
3879
3880 dump_osds(f);
3881
3882 f->open_array_section("osd_xinfo");
3883 for (int i=0; i<get_max_osd(); i++) {
3884 if (exists(i)) {
3885 f->open_object_section("xinfo");
3886 f->dump_int("osd", i);
3887 osd_xinfo[i].dump(f);
3888 f->close_section();
3889 }
3890 }
3891 f->close_section();
3892
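// explicit pg_upmap overrides: each entry maps a pg to a full explicit set of
// osds (as opposed to pg_upmap_items, which remaps individual osds)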
3893 f->open_array_section("pg_upmap");
3894 for (auto& p : pg_upmap) {
3895 f->open_object_section("mapping");
3896 f->dump_stream("pgid") << p.first;
3897 f->open_array_section("osds");
3898 for (auto q : p.second) {
3899 f->dump_int("osd", q);
3900 }
3901 f->close_section();
3902 f->close_section();
3903 }
3904 f->close_section();
3905
3906 f->open_array_section("pg_upmap_items");
3907 for (auto& [pgid, mappings] : pg_upmap_items) {
3908 f->open_object_section("mapping");
3909 f->dump_stream("pgid") << pgid;
3910 f->open_array_section("mappings");
3911 for (auto& [from, to] : mappings) {
3912 f->open_object_section("mapping");
3913 f->dump_int("from", from);
3914 f->dump_int("to", to);
3915 f->close_section();
3916 }
3917 f->close_section();
3918 f->close_section();
3919 }
3920 f->close_section();
3921
3922 f->open_array_section("pg_upmap_primaries");
3923 for (const auto& [pg, osd] : pg_upmap_primaries) {
3924 f->open_object_section("primary_mapping");
3925 f->dump_stream("pgid") << pg;
3926 f->dump_int("primary_osd", osd);
3927 f->close_section();
3928 }
3929 f->close_section(); // pg_upmap_primaries
3930
3931 f->open_array_section("pg_temp");
3932 pg_temp->dump(f);
3933 f->close_section();
3934
3935 f->open_array_section("primary_temp");
3936 for (const auto &pg : *primary_temp) {
3937 f->dump_stream("pgid") << pg.first;
3938 f->dump_int("osd", pg.second);
3939 }
3940 f->close_section(); // primary_temp
3941
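// blocklisted client addresses (and address ranges below), each with its
// expiration time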
3942 f->open_object_section("blocklist");
3943 for (const auto &addr : blocklist) {
3944 stringstream ss;
3945 ss << addr.first;
3946 f->dump_stream(ss.str().c_str()) << addr.second;
3947 }
3948 f->close_section();
3949 f->open_object_section("range_blocklist");
3950 for (const auto &addr : range_blocklist) {
3951 stringstream ss;
3952 ss << addr.first;
3953 f->dump_stream(ss.str().c_str()) << addr.second;
3954 }
3955 f->close_section();
3956
3957 dump_erasure_code_profiles(erasure_code_profiles, f);
3958
3959 f->open_array_section("removed_snaps_queue");
3960 for (auto& p : removed_snaps_queue) {
3961 f->open_object_section("pool");
3962 f->dump_int("pool", p.first);
3963 f->open_array_section("snaps");
3964 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3965 f->open_object_section("interval");
3966 f->dump_unsigned("begin", q.get_start());
3967 f->dump_unsigned("length", q.get_len());
3968 f->close_section();
3969 }
3970 f->close_section();
3971 f->close_section();
3972 }
3973 f->close_section();
3974 f->open_array_section("new_removed_snaps");
3975 for (auto& p : new_removed_snaps) {
3976 f->open_object_section("pool");
3977 f->dump_int("pool", p.first);
3978 f->open_array_section("snaps");
3979 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3980 f->open_object_section("interval");
3981 f->dump_unsigned("begin", q.get_start());
3982 f->dump_unsigned("length", q.get_len());
3983 f->close_section();
3984 }
3985 f->close_section();
3986 f->close_section();
3987 }
3988 f->close_section();
3989 f->open_array_section("new_purged_snaps");
3990 for (auto& p : new_purged_snaps) {
3991 f->open_object_section("pool");
3992 f->dump_int("pool", p.first);
3993 f->open_array_section("snaps");
3994 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3995 f->open_object_section("interval");
3996 f->dump_unsigned("begin", q.get_start());
3997 f->dump_unsigned("length", q.get_len());
3998 f->close_section();
3999 }
4000 f->close_section();
4001 f->close_section();
4002 }
4003 f->close_section();
4004 f->open_object_section("crush_node_flags");
4005 for (auto& i : crush_node_flags) {
4006 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
4007 : stringify(i.first);
4008 f->open_array_section(s.c_str());
4009 set<string> st;
4010 calc_state_set(i.second, st);
4011 for (auto& j : st) {
4012 f->dump_string("flag", j);
4013 }
4014 f->close_section();
4015 }
4016 f->close_section();
4017 f->open_object_section("device_class_flags");
4018 for (auto& i : device_class_flags) {
4019 const char* class_name = crush->get_class_name(i.first);
4020 string s = class_name ? class_name : stringify(i.first);
4021 f->open_array_section(s.c_str());
4022 set<string> st;
4023 calc_state_set(i.second, st);
4024 for (auto& j : st) {
4025 f->dump_string("flag", j);
4026 }
4027 f->close_section();
4028 }
4029 f->close_section();
4030 f->open_object_section("stretch_mode");
4031 {
4032 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
4033 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
4034 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
4035 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
4036 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
4037 }
4038 f->close_section();
4039 }
4040
4041 void OSDMap::generate_test_instances(list<OSDMap*>& o)
4042 {
4043 o.push_back(new OSDMap);
4044
4045 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
4046 o.push_back(new OSDMap);
4047 uuid_d fsid;
4048 o.back()->build_simple(cct, 1, fsid, 16);
4049 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
4050 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
4051 cct->put();
4052 }
4053
4054 string OSDMap::get_flag_string(unsigned f)
4055 {
4056 string s;
4057 if (f & CEPH_OSDMAP_PAUSERD)
4058 s += ",pauserd";
4059 if (f & CEPH_OSDMAP_PAUSEWR)
4060 s += ",pausewr";
4061 if (f & CEPH_OSDMAP_PAUSEREC)
4062 s += ",pauserec";
4063 if (f & CEPH_OSDMAP_NOUP)
4064 s += ",noup";
4065 if (f & CEPH_OSDMAP_NODOWN)
4066 s += ",nodown";
4067 if (f & CEPH_OSDMAP_NOOUT)
4068 s += ",noout";
4069 if (f & CEPH_OSDMAP_NOIN)
4070 s += ",noin";
4071 if (f & CEPH_OSDMAP_NOBACKFILL)
4072 s += ",nobackfill";
4073 if (f & CEPH_OSDMAP_NOREBALANCE)
4074 s += ",norebalance";
4075 if (f & CEPH_OSDMAP_NORECOVER)
4076 s += ",norecover";
4077 if (f & CEPH_OSDMAP_NOSCRUB)
4078 s += ",noscrub";
4079 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
4080 s += ",nodeep-scrub";
4081 if (f & CEPH_OSDMAP_NOTIERAGENT)
4082 s += ",notieragent";
4083 if (f & CEPH_OSDMAP_NOSNAPTRIM)
4084 s += ",nosnaptrim";
4085 if (f & CEPH_OSDMAP_SORTBITWISE)
4086 s += ",sortbitwise";
4087 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
4088 s += ",require_jewel_osds";
4089 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
4090 s += ",require_kraken_osds";
4091 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
4092 s += ",require_luminous_osds";
4093 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
4094 s += ",recovery_deletes";
4095 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
4096 s += ",purged_snapdirs";
4097 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
4098 s += ",pglog_hardlimit";
4099 if (s.length())
4100 s.erase(0, 1);
4101 return s;
4102 }
4103
4104 string OSDMap::get_flag_string() const
4105 {
4106 return get_flag_string(flags);
4107 }
4108
4109 void OSDMap::print_pools(CephContext *cct, ostream& out) const
4110 {
4111 for (const auto &[pid, pdata] : pools) {
4112 std::string name("<unknown>");
4113 const auto &pni = pool_name.find(pid);
4114 if (pni != pool_name.end())
4115 name = pni->second;
4116 char rb_score_str[32] = "";
4117 int rc = 0;
4118 read_balance_info_t rb_info;
4119 if (pdata.is_replicated()) {
4120 rc = calc_read_balance_score(cct, pid, &rb_info);
4121 if (rc >= 0)
4122 snprintf(rb_score_str, sizeof(rb_score_str),
4123 " read_balance_score %.2f", rb_info.acting_adj_score);
4124 }
4125
4126 out << "pool " << pid
4127 << " '" << name
4128 << "' " << pdata
4129 << rb_score_str << "\n";
4130 if (rb_info.err_msg.length() > 0) {
4131 out << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << "\n";
4132 }
4133
4134 //TODO - print error messages here.
4135
4136 for (const auto &snap : pdata.snaps)
4137 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
4138
4139 if (!pdata.removed_snaps.empty())
4140 out << "\tremoved_snaps " << pdata.removed_snaps << "\n";
4141 auto p = removed_snaps_queue.find(pid);
4142 if (p != removed_snaps_queue.end()) {
4143 out << "\tremoved_snaps_queue " << p->second << "\n";
4144 }
4145 }
4146 out << std::endl;
4147 }
4148
4149 void OSDMap::print_osds(ostream& out) const
4150 {
4151 for (int i=0; i<get_max_osd(); i++) {
4152 if (exists(i)) {
4153 print_osd(i, out);
4154 }
4155 }
4156 }
4157 void OSDMap::print_osd(int id, ostream& out) const
4158 {
4159 if (!exists(id)) {
4160 return;
4161 }
4162
4163 out << "osd." << id;
4164 out << (is_up(id) ? " up ":" down");
4165 out << (is_in(id) ? " in ":" out");
4166 out << " weight " << get_weightf(id);
4167 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
4168 out << " primary_affinity " << get_primary_affinityf(id);
4169 }
4170 const osd_info_t& info(get_info(id));
4171 out << " " << info;
4172 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
4173 set<string> st;
4174 get_state(id, st);
4175 out << " " << st;
4176 if (!get_uuid(id).is_zero()) {
4177 out << " " << get_uuid(id);
4178 }
4179 out << "\n";
4180 }
4181
4182 void OSDMap::print(CephContext *cct, ostream& out) const
4183 {
4184 out << "epoch " << get_epoch() << "\n"
4185 << "fsid " << get_fsid() << "\n"
4186 << "created " << get_created() << "\n"
4187 << "modified " << get_modified() << "\n";
4188
4189 out << "flags " << get_flag_string() << "\n";
4190 out << "crush_version " << get_crush_version() << "\n";
4191 out << "full_ratio " << full_ratio << "\n";
4192 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
4193 out << "nearfull_ratio " << nearfull_ratio << "\n";
4194 if (require_min_compat_client != ceph_release_t::unknown) {
4195 out << "require_min_compat_client "
4196 << require_min_compat_client << "\n";
4197 }
4198 out << "min_compat_client " << get_min_compat_client()
4199 << "\n";
4200 if (require_osd_release > ceph_release_t::unknown) {
4201 out << "require_osd_release " << require_osd_release
4202 << "\n";
4203 }
4204 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
4205 if (stretch_mode_enabled) {
4206 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
4207 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
4208 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
4209 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
4210 }
4211 if (get_cluster_snapshot().length())
4212 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
4213 if (allow_crimson) {
4214 out << "allow_crimson=true\n";
4215 }
4216 out << "\n";
4217
4218 print_pools(cct, out);
4219
4220 out << "max_osd " << get_max_osd() << "\n";
4221 print_osds(out);
4222 out << std::endl;
4223
4224 for (auto& p : pg_upmap) {
4225 out << "pg_upmap " << p.first << " " << p.second << "\n";
4226 }
4227 for (auto& p : pg_upmap_items) {
4228 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
4229 }
4230
4231 for (auto& [pg, osd] : pg_upmap_primaries) {
4232 out << "pg_upmap_primary " << pg << " " << osd << "\n";
4233 }
4234
4235 for (const auto& pg : *pg_temp)
4236 out << "pg_temp " << pg.first << " " << pg.second << "\n";
4237
4238 for (const auto& pg : *primary_temp)
4239 out << "primary_temp " << pg.first << " " << pg.second << "\n";
4240
4241 for (const auto &addr : blocklist)
4242 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
4243 for (const auto &addr : range_blocklist)
4244 out << "range blocklist " << addr.first << " expires " << addr.second << "\n";
4245 }
4246
4247 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
4248 public:
4249 typedef CrushTreeDumper::Dumper<TextTable> Parent;
4250
4251 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4252 unsigned f)
4253 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
4254
4255 bool should_dump_leaf(int i) const override {
4256 if (!filter) {
4257 return true; // normal case
4258 }
4259 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4260 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4261 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4262 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4263 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4264 return true;
4265 }
4266 return false;
4267 }
4268
4269 bool should_dump_empty_bucket() const override {
4270 return !filter;
4271 }
4272
4273 void init_table(TextTable *tbl) {
4274 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
4275 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
4276 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4277 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
4278 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
4279 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
4280 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
4281 }
4282 void dump(TextTable *tbl, string& bucket) {
4283 init_table(tbl);
4284
4285 if (!bucket.empty()) {
4286 set_root(bucket);
4287 Parent::dump(tbl);
4288 } else {
4289 Parent::dump(tbl);
4290 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4291 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
4292 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
4293 }
4294 }
4295 }
4296 }
4297
4298 protected:
4299 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
4300 const char *c = crush->get_item_class(qi.id);
4301 if (!c)
4302 c = "";
4303 *tbl << qi.id
4304 << c
4305 << weightf_t(qi.weight);
4306
4307 ostringstream name;
4308 for (int k = 0; k < qi.depth; k++)
4309 name << " ";
4310 if (qi.is_bucket()) {
4311 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
4312 << crush->get_item_name(qi.id);
4313 } else {
4314 name << "osd." << qi.id;
4315 }
4316 *tbl << name.str();
4317
4318 if (!qi.is_bucket()) {
4319 if (!osdmap->exists(qi.id)) {
4320 *tbl << "DNE"
4321 << 0;
4322 } else {
4323 string s;
4324 if (osdmap->is_up(qi.id)) {
4325 s = "up";
4326 } else if (osdmap->is_destroyed(qi.id)) {
4327 s = "destroyed";
4328 } else {
4329 s = "down";
4330 }
4331 *tbl << s
4332 << weightf_t(osdmap->get_weightf(qi.id))
4333 << weightf_t(osdmap->get_primary_affinityf(qi.id));
4334 }
4335 }
4336 *tbl << TextTable::endrow;
4337 }
4338
4339 private:
4340 const OSDMap *osdmap;
4341 const unsigned filter;
4342 };
4343
4344 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4345 public:
4346 typedef CrushTreeDumper::FormattingDumper Parent;
4347
4348 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4349 unsigned f)
4350 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
4351
4352 bool should_dump_leaf(int i) const override {
4353 if (!filter) {
4354 return true; // normal case
4355 }
4356 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4357 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4358 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4359 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4360 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4361 return true;
4362 }
4363 return false;
4364 }
4365
4366 bool should_dump_empty_bucket() const override {
4367 return !filter;
4368 }
4369
4370 void dump(Formatter *f, string& bucket) {
4371 if (!bucket.empty()) {
4372 set_root(bucket);
4373 f->open_array_section("nodes");
4374 Parent::dump(f);
4375 f->close_section();
4376 } else {
4377 f->open_array_section("nodes");
4378 Parent::dump(f);
4379 f->close_section();
4380 f->open_array_section("stray");
4381 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4382 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4383 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4384 }
4385 f->close_section();
4386 }
4387 }
4388
4389 protected:
4390 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4391 Parent::dump_item_fields(qi, f);
4392 if (!qi.is_bucket())
4393 {
4394 string s;
4395 if (osdmap->is_up(qi.id)) {
4396 s = "up";
4397 } else if (osdmap->is_destroyed(qi.id)) {
4398 s = "destroyed";
4399 } else {
4400 s = "down";
4401 }
4402 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
4403 f->dump_string("status", s);
4404 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4405 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4406 }
4407 }
4408
4409 private:
4410 const OSDMap *osdmap;
4411 const unsigned filter;
4412 };
4413
4414 void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
4415 {
4416 if (f) {
4417 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
4418 } else {
4419 ceph_assert(out);
4420 TextTable tbl;
4421 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
4422 *out << tbl;
4423 }
4424 }
4425
4426 void OSDMap::print_summary(Formatter *f, ostream& out,
4427 const string& prefix, bool extra) const
4428 {
4429 if (f) {
4430 f->dump_int("epoch", get_epoch());
4431 f->dump_int("num_osds", get_num_osds());
4432 f->dump_int("num_up_osds", get_num_up_osds());
4433 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
4434 f->dump_int("num_in_osds", get_num_in_osds());
4435 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
4436 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
4437 } else {
4438 utime_t now = ceph_clock_now();
4439 out << get_num_osds() << " osds: "
4440 << get_num_up_osds() << " up";
4441 if (last_up_change != utime_t()) {
4442 out << " (since " << utimespan_str(now - last_up_change) << ")";
4443 }
4444 out << ", " << get_num_in_osds() << " in";
4445 if (last_in_change != utime_t()) {
4446 out << " (since " << utimespan_str(now - last_in_change) << ")";
4447 }
4448 if (extra)
4449 out << "; epoch: e" << get_epoch();
4450 if (get_num_pg_temp())
4451 out << "; " << get_num_pg_temp() << " remapped pgs";
4452 out << "\n";
4453 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4454 if (important_flags)
4455 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
4456 }
4457 }
4458
4459 void OSDMap::print_oneline_summary(ostream& out) const
4460 {
4461 out << "e" << get_epoch() << ": "
4462 << get_num_osds() << " total, "
4463 << get_num_up_osds() << " up, "
4464 << get_num_in_osds() << " in";
4465 }
4466
4467 bool OSDMap::crush_rule_in_use(int rule_id) const
4468 {
4469 for (const auto &pool : pools) {
4470 if (pool.second.crush_rule == rule_id)
4471 return true;
4472 }
4473 return false;
4474 }
4475
4476 int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4477 ostream *ss) const
4478 {
4479 for (auto& i : pools) {
4480 auto& pool = i.second;
4481 int ruleno = pool.get_crush_rule();
4482 if (!newcrush->rule_exists(ruleno)) {
4483 *ss << "pool " << i.first << " references crush_rule " << ruleno
4484 << " but it is not present";
4485 return -EINVAL;
4486 }
4487 if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) {
4488 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4489 return -EINVAL;
4490 }
4491 }
4492 return 0;
4493 }
4494
4495 int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4496 int nosd, int pg_bits, int pgp_bits,
4497 bool default_pool)
4498 {
4499 ldout(cct, 10) << "build_simple on " << nosd
4500 << " osds" << dendl;
4501 epoch = e;
4502 set_fsid(fsid);
4503 created = modified = ceph_clock_now();
4504
4505 if (nosd >= 0) {
4506 set_max_osd(nosd);
4507 } else {
4508 // count osds
4509 int maxosd = 0;
4510 const auto& conf = cct->_conf;
4511 vector<string> sections;
4512 conf.get_all_sections(sections);
4513
4514 for (auto &section : sections) {
4515 if (section.find("osd.") != 0)
4516 continue;
4517
4518 const char *begin = section.c_str() + 4;
4519 char *end = (char*)begin;
4520 int o = strtol(begin, &end, 10);
4521 if (*end != '\0')
4522 continue;
4523
4524 if (o > cct->_conf->mon_max_osd) {
4525 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4526 return -ERANGE;
4527 }
4528
4529 if (o > maxosd)
4530 maxosd = o;
4531 }
4532
4533 set_max_osd(maxosd + 1);
4534 }
4535
4536
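// build the crush map: either a simple map with nosd osds, or one derived
// from the [osd.N] config sections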
4537 stringstream ss;
4538 int r;
4539 if (nosd >= 0)
4540 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4541 else
4542 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
4543 ceph_assert(r == 0);
4544
4545 int poolbase = get_max_osd() ? get_max_osd() : 1;
4546
4547 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_rule(cct);
4548 ceph_assert(default_replicated_rule >= 0);
4549
4550 if (default_pool) {
4551 // pgp_num <= pg_num
4552 if (pgp_bits > pg_bits)
4553 pgp_bits = pg_bits;
4554
4555 vector<string> pool_names;
4556 pool_names.push_back("rbd");
4557 for (auto &plname : pool_names) {
4558 int64_t pool = ++pool_max;
4559 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4560 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4561 if (cct->_conf->osd_pool_default_flag_hashpspool)
4562 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4563 if (cct->_conf->osd_pool_default_flag_nodelete)
4564 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4565 if (cct->_conf->osd_pool_default_flag_nopgchange)
4566 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4567 if (cct->_conf->osd_pool_default_flag_nosizechange)
4568 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
4569 if (cct->_conf->osd_pool_default_flag_bulk)
4570 pools[pool].set_flag(pg_pool_t::FLAG_BULK);
4571 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4572 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4573 pools[pool].size);
4574 pools[pool].crush_rule = default_replicated_rule;
4575 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4576 pools[pool].set_pg_num(poolbase << pg_bits);
4577 pools[pool].set_pgp_num(poolbase << pgp_bits);
4578 pools[pool].set_pg_num_target(poolbase << pg_bits);
4579 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
4580 pools[pool].last_change = epoch;
4581 pools[pool].application_metadata.insert(
4582 {pg_pool_t::APPLICATION_NAME_RBD, {}});
4583 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4584 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4585 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4586 pools[pool].pg_autoscale_mode = m;
4587 } else {
4588 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4589 }
4590 pool_name[pool] = plname;
4591 name_pool[plname] = pool;
4592 }
4593 }
4594
4595 map<string,string> profile_map;
4596 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4597 if (r < 0) {
4598 lderr(cct) << ss.str() << dendl;
4599 return r;
4600 }
4601 set_erasure_code_profile("default", profile_map);
4602 return 0;
4603 }
4604
4605 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4606 map<string,string> &profile_map,
4607 ostream *ss)
4608 {
4609 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
4610 *ss,
4611 &profile_map);
4612 return r;
4613 }
4614
4615 int OSDMap::_build_crush_types(CrushWrapper& crush)
4616 {
4617 crush.set_type_name(0, "osd");
4618 crush.set_type_name(1, "host");
4619 crush.set_type_name(2, "chassis");
4620 crush.set_type_name(3, "rack");
4621 crush.set_type_name(4, "row");
4622 crush.set_type_name(5, "pdu");
4623 crush.set_type_name(6, "pod");
4624 crush.set_type_name(7, "room");
4625 crush.set_type_name(8, "datacenter");
4626 crush.set_type_name(9, "zone");
4627 crush.set_type_name(10, "region");
4628 crush.set_type_name(11, "root");
4629 return 11;
4630 }
4631
4632 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4633 int nosd, ostream *ss)
4634 {
4635 crush.create();
4636
4637 // root
4638 int root_type = _build_crush_types(crush);
4639 int rootid;
4640 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4641 root_type, 0, NULL, NULL, &rootid);
4642 ceph_assert(r == 0);
4643 crush.set_item_name(rootid, "default");
4644
4645 map<string,string> loc{
4646 {"host", "localhost"},
4647 {"rack", "localrack"},
4648 {"root", "default"}
4649 };
4650 for (int o=0; o<nosd; o++) {
4651 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4652 char name[32];
4653 snprintf(name, sizeof(name), "osd.%d", o);
4654 crush.insert_item(cct, o, 1.0, name, loc);
4655 }
4656
4657 build_simple_crush_rules(cct, crush, "default", ss);
4658
4659 crush.finalize();
4660
4661 return 0;
4662 }
4663
4664 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4665 CrushWrapper& crush,
4666 ostream *ss)
4667 {
4668 const auto& conf = cct->_conf;
4669
4670 crush.create();
4671
4672 // root
4673 int root_type = _build_crush_types(crush);
4674 int rootid;
4675 int r = crush.add_bucket(0, 0,
4676 CRUSH_HASH_DEFAULT,
4677 root_type, 0, NULL, NULL, &rootid);
4678 ceph_assert(r == 0);
4679 crush.set_item_name(rootid, "default");
4680
4681 // add osds
4682 vector<string> sections;
4683 conf.get_all_sections(sections);
4684
4685 for (auto &section : sections) {
4686 if (section.find("osd.") != 0)
4687 continue;
4688
4689 const char *begin = section.c_str() + 4;
4690 char *end = (char*)begin;
4691 int o = strtol(begin, &end, 10);
4692 if (*end != '\0')
4693 continue;
4694
4695 string host, rack, row, room, dc, pool;
4696 vector<string> sectiontmp;
4697 sectiontmp.push_back("osd");
4698 sectiontmp.push_back(section);
4699 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4700 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4701 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4702 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4703 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4704 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
4705
4706 if (host.length() == 0)
4707 host = "unknownhost";
4708 if (rack.length() == 0)
4709 rack = "unknownrack";
4710
4711 map<string,string> loc;
4712 loc["host"] = host;
4713 loc["rack"] = rack;
4714 if (row.size())
4715 loc["row"] = row;
4716 if (room.size())
4717 loc["room"] = room;
4718 if (dc.size())
4719 loc["datacenter"] = dc;
4720 loc["root"] = "default";
4721
4722 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4723 crush.insert_item(cct, o, 1.0, section, loc);
4724 }
4725
4726 build_simple_crush_rules(cct, crush, "default", ss);
4727
4728 crush.finalize();
4729
4730 return 0;
4731 }
4732
4733
4734 int OSDMap::build_simple_crush_rules(
4735 CephContext *cct,
4736 CrushWrapper& crush,
4737 const string& root,
4738 ostream *ss)
4739 {
4740 int crush_rule = crush.get_osd_pool_default_crush_replicated_rule(cct);
4741 string failure_domain =
4742 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4743
4744 int r;
4745 r = crush.add_simple_rule_at(
4746 "replicated_rule", root, failure_domain, "",
4747 "firstn", pg_pool_t::TYPE_REPLICATED,
4748 crush_rule, ss);
4749 if (r < 0)
4750 return r;
4751 // do not add an erasure rule by default or else we will implicitly
4752 // require the crush_v2 feature of clients
4753 return 0;
4754 }
4755
4756 int OSDMap::summarize_mapping_stats(
4757 OSDMap *newmap,
4758 const set<int64_t> *pools,
4759 std::string *out,
4760 Formatter *f) const
4761 {
4762 set<int64_t> ls;
4763 if (pools) {
4764 ls = *pools;
4765 } else {
4766 for (auto &p : get_pools())
4767 ls.insert(p.first);
4768 }
4769
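// tally, per osd, how many pg instances map to it in this map (base_by_osd)
// and, if newmap is given, in the new map (new_by_osd); also count moved pgs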
4770 unsigned total_pg = 0;
4771 unsigned moved_pg = 0;
4772 vector<unsigned> base_by_osd(get_max_osd(), 0);
4773 vector<unsigned> new_by_osd(get_max_osd(), 0);
4774 for (int64_t pool_id : ls) {
4775 const pg_pool_t *pi = get_pg_pool(pool_id);
4776 vector<int> up, up2;
4777 int up_primary;
4778 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
4779 pg_t pgid(ps, pool_id);
4780 total_pg += pi->get_size();
4781 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
4782 for (int osd : up) {
4783 if (osd >= 0 && osd < get_max_osd())
4784 ++base_by_osd[osd];
4785 }
4786 if (newmap) {
4787 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
4788 for (int osd : up2) {
4789 if (osd >= 0 && osd < get_max_osd())
4790 ++new_by_osd[osd];
4791 }
4792 if (pi->is_erasure()) {
4793 for (unsigned i=0; i<up.size(); ++i) {
4794 if (up[i] != up2[i]) {
4795 ++moved_pg;
4796 }
4797 }
4798 } else if (pi->is_replicated()) {
4799 for (int osd : up) {
4800 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4801 ++moved_pg;
4802 }
4803 }
4804 } else {
4805 ceph_abort_msg("unhandled pool type");
4806 }
4807 }
4808 }
4809 }
4810
4811 unsigned num_up_in = 0;
4812 for (int osd = 0; osd < get_max_osd(); ++osd) {
4813 if (is_up(osd) && is_in(osd))
4814 ++num_up_in;
4815 }
4816 if (!num_up_in) {
4817 return -EINVAL;
4818 }
4819
4820 float avg_pg = (float)total_pg / (float)num_up_in;
4821 float base_stddev = 0, new_stddev = 0;
4822 int min = -1, max = -1;
4823 unsigned min_base_pg = 0, max_base_pg = 0;
4824 unsigned min_new_pg = 0, max_new_pg = 0;
4825 for (int osd = 0; osd < get_max_osd(); ++osd) {
4826 if (is_up(osd) && is_in(osd)) {
4827 float base_diff = (float)base_by_osd[osd] - avg_pg;
4828 base_stddev += base_diff * base_diff;
4829 float new_diff = (float)new_by_osd[osd] - avg_pg;
4830 new_stddev += new_diff * new_diff;
4831 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4832 min = osd;
4833 min_base_pg = base_by_osd[osd];
4834 min_new_pg = new_by_osd[osd];
4835 }
4836 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4837 max = osd;
4838 max_base_pg = base_by_osd[osd];
4839 max_new_pg = new_by_osd[osd];
4840 }
4841 }
4842 }
4843 base_stddev = sqrt(base_stddev / num_up_in);
4844 new_stddev = sqrt(new_stddev / num_up_in);
4845
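// expected baseline stddev if pg instances were spread uniformly at random
// across the up+in osds (binomial variance: avg_pg * (1 - 1/n))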
4846 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4847
4848 ostringstream ss;
4849 if (f)
4850 f->open_object_section("utilization");
4851 if (newmap) {
4852 if (f) {
4853 f->dump_unsigned("moved_pgs", moved_pg);
4854 f->dump_unsigned("total_pgs", total_pg);
4855 } else {
4856 float percent = 0;
4857 if (total_pg)
4858 percent = (float)moved_pg * 100.0 / (float)total_pg;
4859 ss << "moved " << moved_pg << " / " << total_pg
4860 << " (" << percent << "%)\n";
4861 }
4862 }
4863 if (f) {
4864 f->dump_float("avg_pgs", avg_pg);
4865 f->dump_float("std_dev", base_stddev);
4866 f->dump_float("expected_baseline_std_dev", edev);
4867 if (newmap)
4868 f->dump_float("new_std_dev", new_stddev);
4869 } else {
4870 ss << "avg " << avg_pg << "\n";
4871 ss << "stddev " << base_stddev;
4872 if (newmap)
4873 ss << " -> " << new_stddev;
4874 ss << " (expected baseline " << edev << ")\n";
4875 }
4876 if (min >= 0) {
4877 if (f) {
4878 f->dump_unsigned("min_osd", min);
4879 f->dump_unsigned("min_osd_pgs", min_base_pg);
4880 if (newmap)
4881 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4882 } else {
4883 ss << "min osd." << min << " with " << min_base_pg;
4884 if (newmap)
4885 ss << " -> " << min_new_pg;
4886 ss << " pgs (" << (float)min_base_pg / avg_pg;
4887 if (newmap)
4888 ss << " -> " << (float)min_new_pg / avg_pg;
4889 ss << " * mean)\n";
4890 }
4891 }
4892 if (max >= 0) {
4893 if (f) {
4894 f->dump_unsigned("max_osd", max);
4895 f->dump_unsigned("max_osd_pgs", max_base_pg);
4896 if (newmap)
4897 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4898 } else {
4899 ss << "max osd." << max << " with " << max_base_pg;
4900 if (newmap)
4901 ss << " -> " << max_new_pg;
4902 ss << " pgs (" << (float)max_base_pg / avg_pg;
4903 if (newmap)
4904 ss << " -> " << (float)max_new_pg / avg_pg;
4905 ss << " * mean)\n";
4906 }
4907 }
4908 if (f)
4909 f->close_section();
4910 if (out)
4911 *out = ss.str();
4912 return 0;
4913 }
4914
4915 bool OSDMap::try_pg_upmap(
4916 CephContext *cct,
4917 pg_t pg, ///< pg to potentially remap
4918 const set<int>& overfull, ///< osds we'd want to evacuate
4919 const vector<int>& underfull, ///< osds to move to, in order of preference
4920 const vector<int>& more_underfull, ///< more osds only slightly underfull
4921 vector<int> *orig,
4922 vector<int> *out) ///< resulting alternative mapping
4923 {
4924 const pg_pool_t *pool = get_pg_pool(pg.pool());
4925 if (!pool)
4926 return false;
4927 int rule = pool->get_crush_rule();
4928 if (rule < 0)
4929 return false;
4930
4931 // make sure there is something there to remap
4932 bool any = false;
4933 for (auto osd : *orig) {
4934 if (overfull.count(osd)) {
4935 any = true;
4936 break;
4937 }
4938 }
4939 if (!any) {
4940 return false;
4941 }
4942
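// ask CRUSH to compute an alternative mapping for this pg that moves it off
// the overfull osds, preferring the underfull (or slightly underfull) ones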
4943 int r = crush->try_remap_rule(
4944 cct,
4945 rule,
4946 pool->get_size(),
4947 overfull, underfull,
4948 more_underfull,
4949 *orig,
4950 out);
4951 if (r < 0)
4952 return false;
4953 if (*out == *orig)
4954 return false;
4955 return true;
4956 }
4957
4958
4959 int OSDMap::balance_primaries(
4960 CephContext *cct,
4961 int64_t pid,
4962 OSDMap::Incremental *pending_inc,
4963 OSDMap& tmp_osd_map) const
4964 {
4965 // This function only handles replicated pools.
4966 const pg_pool_t* pool = get_pg_pool(pid);
4967 if (!pool->is_replicated()) {
4968 ldout(cct, 10) << __func__ << " skipping erasure pool "
4969 << get_pool_name(pid) << dendl;
4970 return -EINVAL;
4971 }
4972
4973 // Info to be used in verify_upmap
4974 int pool_size = pool->get_size();
4975 int crush_rule = pool->get_crush_rule();
4976
4977 // Get pgs by osd (map of osd -> pgs)
4978 // Get primaries by osd (map of osd -> primary)
4979 map<uint64_t,set<pg_t>> pgs_by_osd;
4980 map<uint64_t,set<pg_t>> prim_pgs_by_osd;
4981 map<uint64_t,set<pg_t>> acting_prims_by_osd;
4982 pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pid, &prim_pgs_by_osd, &acting_prims_by_osd);
4983
4984 // Transfer pgs into a map, `pgs_to_check`. This will tell us the total num_changes after all
4985 // calculations have been finalized.
4986 // Transfer osds into a vector, `osds_to_check`.
4987 // This is to avoid poor runtime when we loop through the pgs and to set up
4988 // our call to calc_desired_primary_distribution.
4989 map<pg_t,bool> prim_pgs_to_check;
4990 vector<uint64_t> osds_to_check;
4991 for (const auto & [osd, pgs] : prim_pgs_by_osd) {
4992 osds_to_check.push_back(osd);
4993 for (const auto & pg : pgs) {
4994 prim_pgs_to_check.insert({pg, false});
4995 }
4996 }
4997
4998 // calculate desired primary distribution for each osd
4999 map<uint64_t,float> desired_prim_dist;
5000 int rc = 0;
5001 rc = calc_desired_primary_distribution(cct, pid, osds_to_check, desired_prim_dist);
5002 if (rc < 0) {
5003 ldout(cct, 10) << __func__ << " Error in calculating desired primary distribution" << dendl;
5004 return -EINVAL;
5005 }
5006 map<uint64_t,float> prim_dist_scores;
5007 float actual;
5008 float desired;
5009 for (auto osd : osds_to_check) {
5010 actual = prim_pgs_by_osd[osd].size();
5011 desired = desired_prim_dist[osd];
5012 prim_dist_scores[osd] = actual - desired;
5013 ldout(cct, 10) << __func__ << " desired distribution for osd." << osd << " " << desired << dendl;
5014 }
5015
5016 // get read balance score before balancing
5017 float read_balance_score_before = 0.0;
5018 read_balance_info_t rb_info;
5019 rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info);
5020 if (rc >= 0) {
5021 read_balance_score_before = rb_info.adjusted_score;
5022 }
5023 if (rb_info.err_msg.length() > 0) {
5024 ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl;
5025 return -EINVAL;
5026 }
5027
5028 // get ready to swap pgs
5029 while (true) {
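// Each pass scans every candidate pg once; keep iterating until a full pass
// makes no further changes.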
5030 int curr_num_changes = 0;
5031 vector<int> up_osds;
5032 vector<int> acting_osds;
5033 int up_primary, acting_primary;
5034 for (const auto & [pg, mapped] : prim_pgs_to_check) {
5035 // fill in the up, up primary, acting, and acting primary for the current PG
5036 tmp_osd_map.pg_to_up_acting_osds(pg, &up_osds, &up_primary,
5037 &acting_osds, &acting_primary);
5038
5039 // find the OSD that would make the best swap based on its score.
5040 // We start with the OSD that is currently primary for the PG we are checking.
5041 uint64_t curr_best_osd = up_primary;
5042 float prim_score = prim_dist_scores[up_primary];
5043 for (auto potential_osd : up_osds) {
5044 float potential_score = prim_dist_scores[potential_osd];
5045 if ((prim_score > 0) && // taking 1 pg from the prim would not make its score worse
5046 (potential_score < 0) && // adding 1 pg to the potential would not make its score worse
5047 ((prim_score - potential_score) > 1) && // swapping a pg would not just keep the scores the same
5048 (desired_prim_dist[potential_osd] > 0)) // the potential is not off limits (the primary affinity is above 0)
5049 {
5050 curr_best_osd = potential_osd;
5051 }
5052 }
5053
5054 // Make the swap only if:
5055 // 1. The swap is legal
5056 // 2. The balancer has chosen a new primary
5057 auto legal_swap = crush->verify_upmap(cct,
5058 crush_rule,
5059 pool_size,
5060 {(int)curr_best_osd});
5061 if (legal_swap >= 0 &&
5062 ((int)curr_best_osd != up_primary)) {
5063 // Update prim_dist_scores
5064 prim_dist_scores[curr_best_osd] += 1;
5065 prim_dist_scores[up_primary] -= 1;
5066
5067 // Update the mappings
5068 pending_inc->new_pg_upmap_primary[pg] = curr_best_osd;
5069 tmp_osd_map.pg_upmap_primaries[pg] = curr_best_osd;
5070 prim_pgs_to_check[pg] = true; // mark that this pg changed mappings
5071
5072 curr_num_changes++;
5073 }
5074 ldout(cct, 20) << __func__ << " curr_num_changes: " << curr_num_changes << dendl;
5075 }
5076 // If there are no changes after one pass through the pgs, then no further optimizations can be made.
5077 if (curr_num_changes == 0) {
5078 ldout(cct, 20) << __func__ << " curr_num_changes is 0; no further optimizations can be made." << dendl;
5079 break;
5080 }
5081 }
5082
5083 // get read balance score after balancing
5084 float read_balance_score_after = 0.0;
5085 rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info);
5086 if (rc >= 0) {
5087 read_balance_score_after = rb_info.adjusted_score;
5088 }
5089 if (rb_info.err_msg.length() > 0) {
5090 ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl;
5091 return -EINVAL;
5092 }
5093
5094 // Tally total number of changes
5095 int num_changes = 0;
5096 if (read_balance_score_after < read_balance_score_before) {
5097 for (auto [pg, mapped] : prim_pgs_to_check) {
5098 if (mapped) {
5099 num_changes++;
5100 }
5101 }
5102 }
5103
5104 ldout(cct, 10) << __func__ << " num_changes " << num_changes << dendl;
5105 return num_changes;
5106 }
5107
5108 int OSDMap::calc_desired_primary_distribution(
5109 CephContext *cct,
5110 int64_t pid,
5111 const vector<uint64_t> &osds,
5112 std::map<uint64_t, float>& desired_primary_distribution) const
5113 {
5114 // fills desired_primary_distribution with a perfect (fractional) distribution
5115 // of primaries per OSD, without taking the floor of each value
5116 //
5117 // This function only handles replicated pools.
5118 const pg_pool_t* pool = get_pg_pool(pid);
5119 if (pool->is_replicated()) {
5120 ldout(cct, 20) << __func__ << " calculating distribution for replicated pool "
5121 << get_pool_name(pid) << dendl;
5122 uint64_t replica_count = pool->get_size();
5123
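// count, per osd, the pgs whose up set includes that osd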
5124 map<uint64_t,set<pg_t>> pgs_by_osd;
5125 pgs_by_osd = get_pgs_by_osd(cct, pid);
5126
5127 // First calculate the distribution using primary affinity and tally up the sum
5128 auto distribution_sum = 0.0;
5129 for (const auto & osd : osds) {
5130 float osd_primary_count = ((float)pgs_by_osd[osd].size() / (float)replica_count) * get_primary_affinityf(osd);
5131 desired_primary_distribution.insert({osd, osd_primary_count});
5132 distribution_sum += osd_primary_count;
5133 }
5134 if (distribution_sum <= 0) {
5135 ldout(cct, 10) << __func__ << " Unable to calculate primary distribution, likely because primary affinity is"
5136 << " set to 0 on all OSDs." << dendl;
5137 return -EINVAL;
5138 }
5139
5140 // Then, stretch the values (necessary when primary affinity is smaller than 1)
5141 float factor = (float)pool->get_pg_num() / (float)distribution_sum;
5142 float distribution_sum_desired = 0.0;
5143
5144 ceph_assert(factor >= 1.0);
5145 for (const auto & [osd, osd_primary_count] : desired_primary_distribution) {
5146 desired_primary_distribution[osd] *= factor;
5147 distribution_sum_desired += desired_primary_distribution[osd];
5148 }
5149 ceph_assert(fabs(distribution_sum_desired - pool->get_pg_num()) < 0.01);
5150 } else {
5151 ldout(cct, 10) << __func__ << " skipping erasure pool "
5152 << get_pool_name(pid) << dendl;
5153 return -EINVAL;
5154 }
5155
5156 return 0;
5157 }
5158
5159 int OSDMap::calc_pg_upmaps(
5160 CephContext *cct,
5161 uint32_t max_deviation,
5162 int max,
5163 const set<int64_t>& only_pools,
5164 OSDMap::Incremental *pending_inc,
5165 std::random_device::result_type *p_seed)
5166 {
5167 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
5168 OSDMap tmp_osd_map;
5169 // Can't be less than 1 pg
5170 if (max_deviation < 1)
5171 max_deviation = 1;
5172 tmp_osd_map.deepish_copy_from(*this);
5173 int num_changed = 0;
5174 map<int,set<pg_t>> pgs_by_osd;
5175 int total_pgs = 0;
5176 float osd_weight_total = 0;
5177 map<int,float> osd_weight;
5178
5179 if (max <= 0) {
5180 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
5181 return 0;
5182 }
5183
5184 osd_weight_total = build_pool_pgs_info(cct, only_pools, tmp_osd_map,
5185 total_pgs, pgs_by_osd, osd_weight);
5186 if (osd_weight_total == 0) {
5187 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
5188 return 0;
5189 }
5190
5191 float pgs_per_weight = total_pgs / osd_weight_total;
5192 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
5193 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
5194
5195 float stddev = 0;
5196 map<int,float> osd_deviation; // osd, deviation(pgs)
5197 multimap<float,int> deviation_osd; // deviation(pgs), osd
5198 float cur_max_deviation = calc_deviations(cct, pgs_by_osd, osd_weight, pgs_per_weight,
5199 osd_deviation, deviation_osd, stddev);
5200
5201 ldout(cct, 20) << " stddev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5202 if (cur_max_deviation <= max_deviation) {
5203 ldout(cct, 10) << __func__ << " distribution is almost perfect"
5204 << dendl;
5205 return 0;
5206 }
5207
5208 bool skip_overfull = false;
5209 auto aggressive =
5210 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
5211 auto fast_aggressive = aggressive &&
5212 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively_fast");
5213 auto local_fallback_retries =
5214 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
5215
5216 while (max--) {
5217 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
5218 // build overfull and underfull
5219 set<int> overfull;
5220 set<int> more_overfull;
5221 bool using_more_overfull = false;
5222 vector<int> underfull;
5223 vector<int> more_underfull;
5224 fill_overfull_underfull(cct, deviation_osd, max_deviation,
5225 overfull, more_overfull,
5226 underfull, more_underfull);
5227
5228 if (underfull.empty() && overfull.empty()) {
5229 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
5230 break;
5231 }
5232 if (overfull.empty() && !underfull.empty()) {
5233 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
5234 overfull = more_overfull;
5235 using_more_overfull = true;
5236 }
5237
5238 ldout(cct, 10) << " overfull " << overfull
5239 << " underfull " << underfull
5240 << dendl;
5241 set<pg_t> to_skip;
5242 uint64_t local_fallback_retried = 0;
5243
5244 // Used to avoid some unsuccessful loop iterations (saves runtime):
5245 // if we can't find a change for an OSD, we skip further iterations for that OSD
5246 uint n_changes = 0, prev_n_changes = 0;
5247 set<int> osd_to_skip;
5248
5249 retry:
5250
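// build a fresh tentative change set (to_unmap/to_upmap); pgs in to_skip are
// left alone on this attempt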
5251 set<pg_t> to_unmap;
5252 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
5253 auto temp_pgs_by_osd = pgs_by_osd;
5254 // always start with fullest, break if we find any changes to make
5255 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
5256 if (skip_overfull && !underfull.empty()) {
5257 ldout(cct, 10) << " skipping overfull " << dendl;
5258 break; // fall through to check underfull
5259 }
5260 int osd = p->second;
5261 float deviation = p->first;
5262 if (fast_aggressive && osd_to_skip.count(osd)) {
5263 ldout(cct, 20) << " Fast aggressive mode: skipping osd " << osd
5264 << " osd_to_skip size = " << osd_to_skip.size() << dendl;
5265 continue;
5266 }
5267
5268 if (deviation < 0) {
5269 ldout(cct, 10) << " hitting underfull osds now"
5270 << " when trying to remap overfull osds"
5271 << dendl;
5272 break;
5273 }
5274 float target = osd_weight[osd] * pgs_per_weight;
5275 ldout(cct, 10) << " Overfull search osd." << osd
5276 << " target " << target
5277 << " deviation " << deviation
5278 << dendl;
5279 ceph_assert(target > 0);
5280 if (!using_more_overfull && deviation <= max_deviation) {
5281 ldout(cct, 10) << " osd." << osd
5282 << " target " << target
5283 << " deviation " << deviation
5284 << " < max deviation " << max_deviation
5285 << dendl;
5286 break;
5287 }
5288
5289 vector<pg_t> pgs;
5290 pgs.reserve(pgs_by_osd[osd].size());
5291 for (auto& pg : pgs_by_osd[osd]) {
5292 if (to_skip.count(pg))
5293 continue;
5294 pgs.push_back(pg);
5295 }
5296 if (aggressive) {
5297 // shuffle PG list so they all get equal (in)attention
5298 std::shuffle(pgs.begin(), pgs.end(), get_random_engine(cct, p_seed));
5299 }
5300 // look for remaps we can un-remap
5301 if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd,
5302 temp_pgs_by_osd, to_unmap, to_upmap))
5303 goto test_change;
5304
5305 // try upmap
5306 for (auto pg : pgs) {
5307 auto temp_it = tmp_osd_map.pg_upmap.find(pg);
5308 if (temp_it != tmp_osd_map.pg_upmap.end()) {
5309 // leave pg_upmap alone
5310 // it must be specified by admin since balancer does not
5311 // support pg_upmap yet
5312 ldout(cct, 10) << " " << pg << " already has pg_upmap "
5313 << temp_it->second << ", skipping"
5314 << dendl;
5315 continue;
5316 }
5317 auto pg_pool_size = tmp_osd_map.get_pg_pool_size(pg);
5318 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5319 set<int> existing;
5320 auto it = tmp_osd_map.pg_upmap_items.find(pg);
5321 if (it != tmp_osd_map.pg_upmap_items.end()) {
5322 auto& um_items = it->second;
5323 if (um_items.size() >= (size_t)pg_pool_size) {
5324 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
5325 << um_items << ", skipping"
5326 << dendl;
5327 continue;
5328 } else {
5329 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
5330 << um_items
5331 << dendl;
5332 new_upmap_items = um_items;
5333 // build existing too (for dedup)
5334 for (auto [um_from, um_to] : um_items) {
5335 existing.insert(um_from);
5336 existing.insert(um_to);
5337 }
5338 }
5339 // fall through
5340 // to see if we can append more remapping pairs
5341 }
5342 ldout(cct, 10) << " trying " << pg << dendl;
5343 vector<int> raw, orig, out;
5344 tmp_osd_map.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
5345 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
5346 continue;
5347 }
5348 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
5349 if (orig.size() != out.size()) {
5350 continue;
5351 }
5352 ceph_assert(orig != out);
5353 int pos = find_best_remap(cct, orig, out, existing, osd_deviation);
5354 if (pos != -1) {
5355 // append new remapping pairs slowly
5356 // This way we can make sure that each tiny change will
5357 // definitely make the distribution of PGs converge toward
5358 // the perfect state.
5359 add_remap_pair(cct, orig[pos], out[pos], pg, (size_t)pg_pool_size,
5360 osd, existing, temp_pgs_by_osd,
5361 new_upmap_items, to_upmap);
5362 goto test_change;
5363 }
5364 }
5365 if (fast_aggressive) {
5366 if (prev_n_changes == n_changes) { // no changes for prev OSD
5367 osd_to_skip.insert(osd);
5368 }
5369 else {
5370 prev_n_changes = n_changes;
5371 }
5372 }
5373
5374 }
5375
5376 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5377 ldout(cct, 10) << " failed to find any changes for overfull osds"
5378 << dendl;
5379 for (auto& [deviation, osd] : deviation_osd) {
5380 if (std::find(underfull.begin(), underfull.end(), osd) ==
5381 underfull.end())
5382 break;
5383 float target = osd_weight[osd] * pgs_per_weight;
5384 ceph_assert(target > 0);
5385 if (fabsf(deviation) < max_deviation) {
5386 // respect max_deviation too
5387 ldout(cct, 10) << " osd." << osd
5388 << " target " << target
5389 << " deviation " << deviation
5390 << " -> absolute " << fabsf(deviation)
5391 << " < max " << max_deviation
5392 << dendl;
5393 break;
5394 }
5395 // look for remaps we can un-remap
5396 candidates_t candidates = build_candidates(cct, tmp_osd_map, to_skip,
5397 only_pools, aggressive, p_seed);
5398 if (try_drop_remap_underfull(cct, candidates, osd, temp_pgs_by_osd,
5399 to_unmap, to_upmap)) {
5400 goto test_change;
5401 }
5402 }
5403
5404 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5405 ldout(cct, 10) << " failed to find any changes for underfull osds"
5406 << dendl;
5407 if (!aggressive) {
5408 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
5409 break;
5410 } else if (!skip_overfull) {
5411 // safe to quit because at this point we know
5412 // we've finished checking both overfull and underfull osds
5413 ldout(cct, 10) << " break due to not being able to find any"
5414 << " further optimizations"
5415 << dendl;
5416 break;
5417 }
5418 // restart with fullest and do exhaustive searching
5419 skip_overfull = false;
5420 continue;
5421
5422 test_change:
5423
5424 // test change, apply if change is good
5425 ceph_assert(to_unmap.size() || to_upmap.size());
5426 float new_stddev = 0;
5427 map<int,float> temp_osd_deviation;
5428 multimap<float,int> temp_deviation_osd;
5429 float cur_max_deviation = calc_deviations(cct, temp_pgs_by_osd, osd_weight,
5430 pgs_per_weight, temp_osd_deviation,
5431 temp_deviation_osd, new_stddev);
5432 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
5433 if (new_stddev >= stddev) {
5434 if (!aggressive) {
5435 ldout(cct, 10) << " break because stddev is not decreasing"
5436 << " and aggressive mode is not enabled"
5437 << dendl;
5438 break;
5439 }
5440 local_fallback_retried++;
5441 if (local_fallback_retried >= local_fallback_retries) {
5442 // does not make progress
5443 // flip *skip_overfull* so both overfull and underfull
5444 // get equal (in)attention
5445 skip_overfull = !skip_overfull;
5446 ldout(cct, 10) << " hit local_fallback_retries "
5447 << local_fallback_retries
5448 << dendl;
5449 continue;
5450 }
5451 for (auto& i : to_unmap)
5452 to_skip.insert(i);
5453 for (auto& i : to_upmap)
5454 to_skip.insert(i.first);
5455 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
5456 << " to_skip " << to_skip
5457 << dendl;
5458 goto retry;
5459 }
5460
5461 // ready to go
5462 ceph_assert(new_stddev < stddev);
5463 stddev = new_stddev;
5464 pgs_by_osd = temp_pgs_by_osd;
5465 osd_deviation = temp_osd_deviation;
5466 deviation_osd = temp_deviation_osd;
5467 n_changes++;
5468
5469
5470 num_changed += pack_upmap_results(cct, to_unmap, to_upmap, tmp_osd_map, pending_inc);
5471
5472 ldout(cct, 20) << " stddev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5473 if (cur_max_deviation <= max_deviation) {
5474 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
5475 << dendl;
5476 break;
5477 }
5478 }
5479 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
5480 return num_changed;
5481 }
5482
5483 map<uint64_t,set<pg_t>> OSDMap::get_pgs_by_osd(
5484 CephContext *cct,
5485 int64_t pid,
5486 map<uint64_t, set<pg_t>> *p_primaries_by_osd,
5487 map<uint64_t, set<pg_t>> *p_acting_primaries_by_osd) const
5488 {
5489 // Set up the OSDMap
5490 OSDMap tmp_osd_map;
5491 tmp_osd_map.deepish_copy_from(*this);
5492
5493 // Get the pool from the provided pool id
5494 const pg_pool_t* pool = get_pg_pool(pid);
5495
5496 // build array of pgs from the pool
5497 map<uint64_t,set<pg_t>> pgs_by_osd;
5498 for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
5499 pg_t pg(ps, pid);
5500 vector<int> up;
5501 int primary;
5502 int acting_prim;
5503 tmp_osd_map.pg_to_up_acting_osds(pg, &up, &primary, nullptr, &acting_prim);
5504 if (cct != nullptr)
5505 ldout(cct, 20) << __func__ << " " << pg
5506 << " up " << up
5507 << " primary " << primary
5508 << " acting_primary " << acting_prim
5509 << dendl;
5510
5511 if (!up.empty()) { // up can be empty in test-generated files;
5512 // in that case, we return an empty result
5513 for (auto osd : up) {
5514 if (osd != CRUSH_ITEM_NONE)
5515 pgs_by_osd[osd].insert(pg);
5516 }
5517 if (p_primaries_by_osd != nullptr) {
5518 if (primary != CRUSH_ITEM_NONE)
5519 (*p_primaries_by_osd)[primary].insert(pg);
5520 }
5521 if (p_acting_primaries_by_osd != nullptr) {
5522 if (acting_prim != CRUSH_ITEM_NONE)
5523 (*p_acting_primaries_by_osd)[acting_prim].insert(pg);
5524 }
5525 }
5526 }
5527 return pgs_by_osd;
5528 }
5529
5530 float OSDMap::get_osds_weight(
5531 CephContext *cct,
5532 const OSDMap& tmp_osd_map,
5533 int64_t pid,
5534 map<int,float>& osds_weight) const
5535 {
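// weight of each osd under the pool's crush rule, scaled by the osd's current
// reweight; returns the total across all osds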
5536 map<int,float> pmap;
5537 ceph_assert(pools.count(pid));
5538 int ruleno = pools.at(pid).get_crush_rule();
5539 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap);
5540 ldout(cct,20) << __func__ << " pool " << pid
5541 << " ruleno " << ruleno
5542 << " weight-map " << pmap
5543 << dendl;
5544 float osds_weight_total = 0;
5545 for (auto [oid, oweight] : pmap) {
5546 auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight;
5547 if (adjusted_weight != 0) {
5548 osds_weight[oid] += adjusted_weight;
5549 osds_weight_total += adjusted_weight;
5550 }
5551 }
5552 return osds_weight_total;
5553 }
5554
5555 float OSDMap::build_pool_pgs_info (
5556 CephContext *cct,
5557 const std::set<int64_t>& only_pools, ///< [optional] restrict to pool
5558 const OSDMap& tmp_osd_map,
5559 int& total_pgs,
5560 map<int,set<pg_t>>& pgs_by_osd,
5561 map<int,float>& osds_weight)
5562 {
5563 //
5564 // This function builds some data structures that are used by calc_pg_upmaps.
5565 // Specifically it builds the pgs_by_osd and osds_weight maps, updates total_pgs,
5566 // and returns the total weight of all OSDs.
5567 //
5568 float osds_weight_total = 0.0;
5569 for (auto& [pid, pdata] : pools) {
5570 if (!only_pools.empty() && !only_pools.count(pid))
5571 continue;
5572 for (unsigned ps = 0; ps < pdata.get_pg_num(); ++ps) {
5573 pg_t pg(ps, pid);
5574 vector<int> up;
5575 tmp_osd_map.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
5576 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
5577 for (auto osd : up) {
5578 if (osd != CRUSH_ITEM_NONE)
5579 pgs_by_osd[osd].insert(pg);
5580 }
5581 }
5582 total_pgs += pdata.get_size() * pdata.get_pg_num();
5583
5584 osds_weight_total = get_osds_weight(cct, tmp_osd_map, pid, osds_weight);
5585 }
5586 for (auto& [oid, oweight] : osds_weight) {
5587 int pgs = 0;
5588 auto p = pgs_by_osd.find(oid);
5589 if (p != pgs_by_osd.end())
5590 pgs = p->second.size();
5591 else
5592 pgs_by_osd.emplace(oid, set<pg_t>());
5593 ldout(cct, 20) << " osd." << oid << " weight " << oweight
5594 << " pgs " << pgs << dendl;
5595 }
5596 return osds_weight_total; // total weight of all OSDs
5597
5598 }
5599
5600 float OSDMap::calc_deviations (
5601 CephContext *cct,
5602 const map<int,set<pg_t>>& pgs_by_osd,
5603 const map<int,float>& osd_weight,
5604 float pgs_per_weight,
5605 map<int,float>& osd_deviation,
5606 multimap<float,int>& deviation_osd,
5607 float& stddev) // return current max deviation
5608 {
5609 //
5610 // This function calculates the 2 maps osd_deviation and deviation_osd which
5611 // hold the deviation between the current number of PGs which map to an OSD
5612 // and the optimal number. It also calculates the stddev of the deviations and
5613 // returns the current max deviation.
5614 // NOTE - the value computed here is not exactly the stddev; it is actually the
5615 // sum of squared deviations, but as long as it is monotonic with the stddev
5616 // (and it is), it is sufficient for the balancer code.
5617 //
5618 float cur_max_deviation = 0.0;
5619 stddev = 0.0;
5620 for (auto& [oid, opgs] : pgs_by_osd) {
5621 // make sure osd is still there (belongs to this crush-tree)
5622 ceph_assert(osd_weight.count(oid));
5623 float target = osd_weight.at(oid) * pgs_per_weight;
5624 float deviation = (float)opgs.size() - target;
5625 ldout(cct, 20) << " osd." << oid
5626 << "\tpgs " << opgs.size()
5627 << "\ttarget " << target
5628 << "\tdeviation " << deviation
5629 << dendl;
5630 osd_deviation[oid] = deviation;
5631 deviation_osd.insert(make_pair(deviation, oid));
5632 stddev += deviation * deviation;
5633 if (fabsf(deviation) > cur_max_deviation)
5634 cur_max_deviation = fabsf(deviation);
5635 }
5636 return cur_max_deviation;
5637 }
5638
5639 void OSDMap::fill_overfull_underfull (
5640 CephContext *cct,
5641 const std::multimap<float,int>& deviation_osd,
5642 int max_deviation,
5643 std::set<int>& overfull,
5644 std::set<int>& more_overfull,
5645 std::vector<int>& underfull,
5646 std::vector<int>& more_underfull)
5647 {
5648 //
5649 // This function just fills the overfull and underfull data structures for
5650 // use by calc_pg_upmaps.
5651 //
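  // Illustrative example (not part of the original source): with max_deviation = 5,
  //   deviation +7 -> overfull,   +3 -> more_overfull
  //   deviation -7 -> underfull,  -2 -> more_underfull
  // presumably so the balancer can prefer the strongly deviating OSDs over the
  // "more_*" fallback sets.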
5652 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
5653 auto& odev = i->first;
5654 auto& oid = i->second;
5655 ldout(cct, 30) << " check " << odev << " <= " << max_deviation << dendl;
5656 if (odev <= 0)
5657 break;
5658 if (odev > max_deviation) {
5659 ldout(cct, 30) << " add overfull osd." << oid << dendl;
5660 overfull.insert(oid);
5661 } else {
5662 more_overfull.insert(oid);
5663 }
5664 }
5665
5666 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
5667 auto& odev = i->first;
5668 auto& oid = i->second;
5669 ldout(cct, 30) << " check " << odev << " >= " << -(int)max_deviation << dendl;
5670 if (odev >= 0)
5671 break;
5672 if (odev < -(int)max_deviation) {
5673 ldout(cct, 30) << " add underfull osd." << oid << dendl;
5674 underfull.push_back(oid);
5675 } else {
5676 more_underfull.push_back(oid);
5677 }
5678 }
5679 }
5680
5681 int OSDMap::pack_upmap_results(
5682 CephContext *cct,
5683 const std::set<pg_t>& to_unmap,
5684 const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap,
5685 OSDMap& tmp_osd_map,
5686 OSDMap::Incremental *pending_inc)
5687 {
5688 //
5689 // This function takes the caller's to_unmap and to_upmap collections
5690 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5691 // (so that the results are visible outside calc_pg_upmaps)
5692 //
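  // Sketch of the effect (descriptive comment, not part of the original source):
  // an entry in to_unmap removes an existing pg_upmap_items entry from both
  // tmp_osd_map and (via old_pg_upmap_items) the pending incremental, while an
  // entry in to_upmap installs or overwrites one in both places; the return
  // value counts how many such changes were applied.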
5693 int num_changed = 0;
5694 for (auto& i : to_unmap) {
5695 ldout(cct, 10) << " unmap pg " << i << dendl;
5696 ceph_assert(tmp_osd_map.pg_upmap_items.count(i));
5697 tmp_osd_map.pg_upmap_items.erase(i);
5698 pending_inc->old_pg_upmap_items.insert(i);
5699 ++num_changed;
5700 }
5701 for (auto& [pg, um_items] : to_upmap) {
5702 ldout(cct, 10) << " upmap pg " << pg
5703 << " new pg_upmap_items " << um_items
5704 << dendl;
5705 tmp_osd_map.pg_upmap_items[pg] = um_items;
5706 pending_inc->new_pg_upmap_items[pg] = um_items;
5707 ++num_changed;
5708 }
5709
5710 return num_changed;
5711 }
5712
5713 std::default_random_engine OSDMap::get_random_engine(
5714 CephContext *cct,
5715 std::random_device::result_type *p_seed)
5716 {
5717 //
5718 // This function creates a random engine to be used for shuffling.
5719 // When p_seed == nullptr it generates a random engine seeded from std::random_device;
5720 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5721 // increments seed_set. This is used to create deterministic regression tests
5722 // that are not affected by randomness in the results.
5723 //
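  // Usage sketch (illustrative, not part of the original source):
  //   auto rng = get_random_engine(cct, nullptr);    // non-deterministic shuffling
  //   std::random_device::result_type seed = 42;     // hypothetical fixed seed
  //   auto test_rng = get_random_engine(cct, &seed); // reproducible, for tests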
5724 static std::random_device::result_type seed_set = 0;
5725 std::random_device::result_type seed;
5726 if (p_seed == nullptr) {
5727 std::random_device rd;
5728 seed = rd();
5729 }
5730 else {
5731 seed = *p_seed + seed_set;
5732 ldout(cct, 30) << " Starting random engine with seed "
5733 << seed << dendl;
5734 seed_set++;
5735 }
5736 return std::default_random_engine{seed};
5737 }
5738
5739 bool OSDMap::try_drop_remap_overfull(
5740 CephContext *cct,
5741 const std::vector<pg_t>& pgs,
5742 const OSDMap& tmp_osd_map,
5743 int osd,
5744 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5745 set<pg_t>& to_unmap,
5746 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5747 {
5748 //
5749 // This function tries to drop existing upmap items which map data to overfull
5750 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap, and returns true
5751 // if it found an item that can be dropped, false if not.
5752 //
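  // Illustrative example (not from the original source): if a PG currently has
  // pg_upmap_items [(3,7),(4,9)] and osd.7 is overfull, the pair (3,7) is dropped
  // and (4,9) kept, so to_upmap for that PG becomes [(4,9)]; if (3,7) were the
  // only pair, the whole item would be cancelled via to_unmap instead.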
5753 for (auto pg : pgs) {
5754 auto p = tmp_osd_map.pg_upmap_items.find(pg);
5755 if (p == tmp_osd_map.pg_upmap_items.end())
5756 continue;
5757 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5758 auto& pg_upmap_items = p->second;
5759 for (auto um_pair : pg_upmap_items) {
5760 auto& um_from = um_pair.first;
5761 auto& um_to = um_pair.second;
5762 if (um_to == osd) {
5763 ldout(cct, 10) << " will try dropping existing"
5764 << " remapping pair "
5765 << um_from << " -> " << um_to
5766 << " which remapped " << pg
5767 << " into overfull osd." << osd
5768 << dendl;
5769 temp_pgs_by_osd[um_to].erase(pg);
5770 temp_pgs_by_osd[um_from].insert(pg);
5771 } else {
5772 new_upmap_items.push_back(um_pair);
5773 }
5774 }
5775 if (new_upmap_items.empty()) {
5776 // drop whole item
5777 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5778 << " remapped " << pg << " into overfull osd." << osd
5779 << ", will try cancelling it entirely"
5780 << dendl;
5781 to_unmap.insert(pg);
5782 return true;
5783 } else if (new_upmap_items.size() != pg_upmap_items.size()) {
5784 // drop single remapping pair, updating
5785 ceph_assert(new_upmap_items.size() < pg_upmap_items.size());
5786 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5787 << " remapped " << pg << " into overfull osd." << osd
5788 << ", new_pg_upmap_items now " << new_upmap_items
5789 << dendl;
5790 to_upmap[pg] = new_upmap_items;
5791 return true;
5792 }
5793 }
5794 return false;
5795 }
5796
5797 bool OSDMap::try_drop_remap_underfull(
5798 CephContext *cct,
5799 const candidates_t& candidates,
5800 int osd,
5801 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5802 set<pg_t>& to_unmap,
5803 map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap)
5804 {
5805 //
5806 // This function tries to drop existing upmap items which map data away from
5807 // underfull OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap, and returns
5808 // true if it found an item that can be dropped, false if not.
5809 //
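  // Illustrative example (not from the original source): if an upmap pair (2,5)
  // moved a PG away from the underfull osd.2, dropping that pair hands the PG
  // back to osd.2; as above, the whole item is cancelled when no pairs remain.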
5810 for (auto& [pg, um_pairs] : candidates) {
5811 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5812 for (auto& ump : um_pairs) {
5813 auto& um_from = ump.first;
5814 auto& um_to = ump.second;
5815 if (um_from == osd) {
5816 ldout(cct, 10) << " will try dropping existing"
5817 << " remapping pair "
5818 << um_from << " -> " << um_to
5819 << " which remapped " << pg
5820 << " out from underfull osd." << osd
5821 << dendl;
5822 temp_pgs_by_osd[um_to].erase(pg);
5823 temp_pgs_by_osd[um_from].insert(pg);
5824 } else {
5825 new_upmap_items.push_back(ump);
5826 }
5827 }
5828 if (new_upmap_items.empty()) {
5829 // drop whole item
5830 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5831 << " remapped " << pg
5832 << " out from underfull osd." << osd
5833 << ", will try cancelling it entirely"
5834 << dendl;
5835 to_unmap.insert(pg);
5836 return true;
5837 } else if (new_upmap_items.size() != um_pairs.size()) {
5838 // drop single remapping pair, updating
5839 ceph_assert(new_upmap_items.size() < um_pairs.size());
5840 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5841 << " remapped " << pg
5842 << " out from underfull osd." << osd
5843 << ", new_pg_upmap_items now " << new_upmap_items
5844 << dendl;
5845 to_upmap[pg] = new_upmap_items;
5846 return true;
5847 }
5848 }
5849 return false;
5850 }
5851
5852 void OSDMap::add_remap_pair(
5853 CephContext *cct,
5854 int orig,
5855 int out,
5856 pg_t pg,
5857 size_t pg_pool_size,
5858 int osd,
5859 set<int>& existing,
5860 map<int,set<pg_t>>& temp_pgs_by_osd,
5861 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items,
5862 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5863 {
5864 //
5865 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5866 // the relevant data structures
5867 //
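  // Illustrative sketch (not part of the original source): for a PG with
  // orig = 4 and out = 8, this records the pair 4 -> 8 in to_upmap, moves the PG
  // from temp_pgs_by_osd[4] to temp_pgs_by_osd[8], and marks both OSDs in
  // 'existing' so later pairs for this PG avoid reusing them.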
5868 ldout(cct, 10) << " will try adding new remapping pair "
5869 << orig << " -> " << out << " for " << pg
5870 << (orig != osd ? " NOT selected osd" : "")
5871 << dendl;
5872 existing.insert(orig);
5873 existing.insert(out);
5874 temp_pgs_by_osd[orig].erase(pg);
5875 temp_pgs_by_osd[out].insert(pg);
5876 ceph_assert(new_upmap_items.size() < pg_pool_size);
5877 new_upmap_items.push_back(make_pair(orig, out));
5878 // Append new remapping pairs slowly.
5879 // This way we can make sure that each tiny change will
5880 // definitely keep the distribution of PGs converging toward
5881 // the optimal state.
5882 to_upmap[pg] = new_upmap_items;
5883
5884 }
5885
5886 int OSDMap::find_best_remap (
5887 CephContext *cct,
5888 const vector<int>& orig,
5889 const vector<int>& out,
5890 const set<int>& existing,
5891 const map<int,float> osd_deviation)
5892 {
5893 //
5894 // Find the best remap from the suggestions in orig and out - the best remap
5895 // is the one which maps from the OSD with the largest deviation (among the
5896 // OSDs which are part of orig)
5897 //
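  // Worked example (illustrative only): with orig = [3,5], out = [8,9] and
  // osd_deviation = {3: +4, 5: +7}, position 1 wins (move away from osd.5, the
  // most overfull source), so the caller would remap 5 -> 9 for this PG.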
5898 int best_pos = -1;
5899 float max_dev = 0;
5900 for (unsigned i = 0; i < out.size(); ++i) {
5901 if (orig[i] == out[i])
5902 continue; // skip invalid remappings
5903 if (existing.count(orig[i]) || existing.count(out[i]))
5904 continue; // we want new remappings only!
5905 if (osd_deviation.at(orig[i]) > max_dev) {
5906 max_dev = osd_deviation.at(orig[i]);
5907 best_pos = i;
5908 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation.at(orig[i]) << dendl;
5909 }
5910 }
5911 return best_pos;
5912 }
5913
5914 OSDMap::candidates_t OSDMap::build_candidates(
5915 CephContext *cct,
5916 const OSDMap& tmp_osd_map,
5917 const set<pg_t> to_skip,
5918 const set<int64_t>& only_pools,
5919 bool aggressive,
5920 std::random_device::result_type *p_seed)
5921 {
5922 //
5923 // build the candidates data structure
5924 //
5925 candidates_t candidates;
5926 candidates.reserve(tmp_osd_map.pg_upmap_items.size());
5927 for (auto& [pg, um_pair] : tmp_osd_map.pg_upmap_items) {
5928 if (to_skip.count(pg))
5929 continue;
5930 if (!only_pools.empty() && !only_pools.count(pg.pool()))
5931 continue;
5932 candidates.push_back(make_pair(pg, um_pair));
5933 }
5934 if (aggressive) {
5935 // shuffle candidates so they all get equal (in)attention
5936 std::shuffle(candidates.begin(), candidates.end(), get_random_engine(cct, p_seed));
5937 }
5938 return candidates;
5939 }
5940
5941 // return -1 if all PGs are OK, else the first PG whose acting set contains only OSDs with primary affinity (PA) 0
5942 int64_t OSDMap::has_zero_pa_pgs(CephContext *cct, int64_t pool_id) const
5943 {
5944 const pg_pool_t* pool = get_pg_pool(pool_id);
5945 for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
5946 pg_t pg(ps, pool_id);
5947 vector<int> acting;
5948 pg_to_up_acting_osds(pg, nullptr, nullptr, &acting, nullptr);
5949 if (cct != nullptr) {
5950 ldout(cct, 30) << __func__ << " " << pg << " acting " << acting << dendl;
5951 }
5952 bool pg_zero_pa = true;
5953 for (auto osd : acting) {
5954 if (get_primary_affinityf(osd) != 0) {
5955 pg_zero_pa = false;
5956 break;
5957 }
5958 }
5959 if (pg_zero_pa) {
5960 if (cct != nullptr) {
5961 ldout(cct, 20) << __func__ << " " << pg << " - maps only to OSDs with primary affinity 0" << dendl;
5962 }
5963 return (int64_t)ps;
5964 }
5965 }
5966 return -1;
5967 }
5968
5969 void OSDMap::zero_rbi(read_balance_info_t &rbi) const {
5970 rbi.pa_avg = 0.;
5971 rbi.pa_weighted = 0.;
5972 rbi.pa_weighted_avg = 0.;
5973 rbi.raw_score = 0.;
5974 rbi.optimal_score = 0.;
5975 rbi.adjusted_score = 0.;
5976 rbi.acting_raw_score = 0.;
5977 rbi.acting_adj_score = 0.;
5978 rbi.err_msg = "";
5979 }
5980
5981 int OSDMap::set_rbi(
5982 CephContext *cct,
5983 read_balance_info_t &rbi,
5984 int64_t pool_id,
5985 float total_w_pa,
5986 float pa_sum,
5987 int num_osds,
5988 int osd_pa_count,
5989 float total_osd_weight,
5990 uint max_prims_per_osd,
5991 uint max_acting_prims_per_osd,
5992 float avg_prims_per_osd,
5993 bool prim_on_zero_pa,
5994 bool acting_on_zero_pa,
5995 float max_osd_score) const
5996 {
5997 // put all the ugly code here, so rest of code is nicer.
5998 const pg_pool_t* pool = get_pg_pool(pool_id);
5999 zero_rbi(rbi);
6000
6001 if (total_w_pa / total_osd_weight < 1. / float(pool->get_size())) {
6002 ldout(cct, 20) << __func__ << " pool " << pool_id << " average primary affinity is lower than "
6003 << 1. / float(pool->get_size()) << dendl;
6004 rbi.err_msg = fmt::format(
6005 "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable",
6006 pool_id, 1. / float(pool->get_size()));
6007 return -EINVAL;
6008 }
6009 rbi.pa_weighted = total_w_pa;
6010
6011 // weighted_prim_affinity_avg
6012 rbi.pa_weighted_avg = rbi_round(rbi.pa_weighted / total_osd_weight); // in [0..1]
6013 // p_rbi->pa_weighted / osd_pa_count; // in [0..1]
6014
6015 rbi.raw_score = rbi_round((float)max_prims_per_osd / avg_prims_per_osd); // >=1
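  // Worked example (illustrative, not from the original source): a pool with
  // 32 PGs spread over 4 OSDs has avg_prims_per_osd = 8; if one OSD holds 12
  // primaries, raw_score = 12 / 8 = 1.5. With all primary affinities at 1,
  // optimal_score is 1.0 and the adjusted_score computed below is also 1.5.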
6016 if (acting_on_zero_pa) {
6017 rbi.acting_raw_score = rbi_round(max_osd_score);
6018 rbi.err_msg = fmt::format(
6019 "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate",
6020 pool_id);
6021 } else {
6022 rbi.acting_raw_score = rbi_round((float)max_acting_prims_per_osd / avg_prims_per_osd);
6023 }
6024
6025 if (osd_pa_count != 0) {
6026 // this implies that pa_sum > 0
6027 rbi.pa_avg = rbi_round(pa_sum / osd_pa_count); // in [0..1]
6028 } else {
6029 rbi.pa_avg = 0.;
6030 }
6031
6032 if (rbi.pa_avg != 0.) {
6033 int64_t zpg;
6034 if ((zpg = has_zero_pa_pgs(cct, pool_id)) >= 0) {
6035 pg_t pg(zpg, pool_id);
6036 std::stringstream ss;
6037 ss << pg;
6038 ldout(cct, 10) << __func__ << " pool " << pool_id << " has some PGs where all OSDs are with primary_affinity 0 (" << pg << ",...)" << dendl;
6039 rbi.err_msg = fmt::format(
6040 "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable",
6041 pool_id, ss.str());
6042 return -EINVAL;
6043 }
6044 rbi.optimal_score = rbi_round(float(num_osds) / float(osd_pa_count)); // >= 1
6045 // adjust the score to the primary affinity setting (if prim affinity is set
6046 // the raw score can't be 1 and the optimal (perfect) score is higher than 1)
6047 // When total system primary affinity is too low (average < 1 / pool replica count)
6048 // the score is negative in order to grab the user's attention.
6049 rbi.adjusted_score = rbi_round(rbi.raw_score / rbi.optimal_score); // >= 1 if PA is not low
6050 rbi.acting_adj_score = rbi_round(rbi.acting_raw_score / rbi.optimal_score); // >= 1 if PA is not low
6051
6052 } else {
6053 // We should never get here - this condition is checked before calling this function - this is just sanity check code.
6054 rbi.err_msg = fmt::format(
6055 "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score",
6056 pool_id);
6057 return -EINVAL;
6058 }
6059
6060 return 0;
6061 }
6062
6063 int OSDMap::calc_read_balance_score(CephContext *cct, int64_t pool_id,
6064 read_balance_info_t *p_rbi) const
6065 {
6066 //BUG: wrong score with one PG replica 3 and 4 OSDs
6067 if (cct != nullptr)
6068 ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) << dendl;
6069
6070 OSDMap tmp_osd_map;
6071 tmp_osd_map.deepish_copy_from(*this);
6072 if (p_rbi == nullptr) {
6073 // The only case where error message is not set - this is not tested in the unit test.
6074 if (cct != nullptr)
6075 ldout(cct,30) << __func__ << " p_rbi is nullptr." << dendl;
6076 return -EINVAL;
6077 }
6078
6079 if (tmp_osd_map.pools.count(pool_id) == 0) {
6080 if (cct != nullptr)
6081 ldout(cct,30) << __func__ << " pool " << pool_id << " not found." << dendl;
6082 zero_rbi(*p_rbi);
6083 p_rbi->err_msg = fmt::format("pool {} not found", pool_id);
6084 return -ENOENT;
6085 }
6086 int rc = 0;
6087 const pg_pool_t* pool = tmp_osd_map.get_pg_pool(pool_id);
6088 auto num_pgs = pool->get_pg_num();
6089
6090 map<uint64_t,set<pg_t>> pgs_by_osd;
6091 map<uint64_t,set<pg_t>> prim_pgs_by_osd;
6092 map<uint64_t,set<pg_t>> acting_prims_by_osd;
6093
6094 pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pool_id, &prim_pgs_by_osd, &acting_prims_by_osd);
6095
6096 if (cct != nullptr)
6097 ldout(cct,30) << __func__ << " Primaries for pool: "
6098 << prim_pgs_by_osd << dendl;
6099
6100 if (pgs_by_osd.empty()) {
6101 //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id);
6102 return -EINVAL;
6103 }
6104 if (cct != nullptr) {
6105 for (auto& [osd,pgs] : prim_pgs_by_osd) {
6106 ldout(cct,20) << __func__ << " Pool " << pool_id << " OSD." << osd
6107 << " has " << pgs.size() << " primary PGs, "
6108 << acting_prims_by_osd[osd].size() << " acting primaries."
6109 << dendl;
6110 }
6111 }
6112
6113 auto num_osds = pgs_by_osd.size();
6114
6115 float avg_prims_per_osd = (float)num_pgs / (float)num_osds;
6116 uint64_t max_prims_per_osd = 0;
6117 uint64_t max_acting_prims_per_osd = 0;
6118 float max_osd_score = 0.;
6119 bool prim_on_zero_pa = false;
6120 bool acting_on_zero_pa = false;
6121
6122 float prim_affinity_sum = 0.;
6123 float total_osd_weight = 0.;
6124 float total_weighted_pa = 0.;
6125
6126 map<int,float> osds_crush_weight;
6127 // Look up the pool's CRUSH rule and the OSD weight map it implies
6128 int ruleno = tmp_osd_map.pools.at(pool_id).get_crush_rule();
6129 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &osds_crush_weight);
6130
6131 if (cct != nullptr) {
6132 ldout(cct,20) << __func__ << " pool " << pool_id
6133 << " ruleno " << ruleno
6134 << " weight-map " << osds_crush_weight
6135 << dendl;
6136 }
6137 uint osd_pa_count = 0;
6138
6139 for (auto [osd, oweight] : osds_crush_weight) { // loop over all OSDs
6140 total_osd_weight += oweight;
6141 float osd_pa = tmp_osd_map.get_primary_affinityf(osd);
6142 total_weighted_pa += oweight * osd_pa;
6143 if (osd_pa != 0.) {
6144 osd_pa_count++;
6145 }
6146 if (prim_pgs_by_osd.count(osd)) {
6147 auto n_prims = prim_pgs_by_osd.at(osd).size();
6148 max_prims_per_osd = std::max(max_prims_per_osd, n_prims);
6149 if (osd_pa == 0.) {
6150 prim_on_zero_pa = true;
6151 }
6152 }
6153 if (acting_prims_by_osd.count(osd)) {
6154 auto n_aprims = acting_prims_by_osd.at(osd).size();
6155 max_acting_prims_per_osd = std::max(max_acting_prims_per_osd, n_aprims);
6156 if (osd_pa != 0.) {
6157 max_osd_score = std::max(max_osd_score, float(n_aprims) / osd_pa);
6158 }
6159 else {
6160 acting_on_zero_pa = true;
6161 }
6162 }
6163
6164 prim_affinity_sum += osd_pa;
6165 if (cct != nullptr) {
6166 auto np = prim_pgs_by_osd.count(osd) ? prim_pgs_by_osd.at(osd).size() : 0;
6167 auto nap = acting_prims_by_osd.count(osd) ? acting_prims_by_osd.at(osd).size() : 0;
6168 auto wt = osds_crush_weight.count(osd) ? osds_crush_weight.at(osd) : 0.;
6169 ldout(cct,30) << __func__ << " OSD." << osd << " info: "
6170 << " num_primaries " << np
6171 << " num_acting_prims " << nap
6172 << " prim_affinity " << tmp_osd_map.get_primary_affinityf(osd)
6173 << " weight " << wt
6174 << dendl;
6175 }
6176 }
6177 if (cct != nullptr) {
6178 ldout(cct,30) << __func__ << " pool " << pool_id
6179 << " total_osd_weight " << total_osd_weight
6180 << " total_weighted_pa " << total_weighted_pa
6181 << dendl;
6182 }
6183
6184 if (prim_affinity_sum == 0.0) {
6185 if (cct != nullptr) {
6186 ldout(cct, 10) << __func__ << " pool " << pool_id
6187 << " has primary_affinity set to zero on all OSDs" << dendl;
6188 }
6189 zero_rbi(*p_rbi);
6190 p_rbi->err_msg = fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id);
6191
6192 return -ERANGE; // score has a different meaning now.
6193 }
6194 else {
6195 max_osd_score *= prim_affinity_sum / num_osds;
6196 }
6197
6198 rc = tmp_osd_map.set_rbi(cct, *p_rbi, pool_id, total_weighted_pa,
6199 prim_affinity_sum, num_osds, osd_pa_count,
6200 total_osd_weight, max_prims_per_osd,
6201 max_acting_prims_per_osd, avg_prims_per_osd,
6202 prim_on_zero_pa, acting_on_zero_pa, max_osd_score);
6203
6204 if (cct != nullptr) {
6205 ldout(cct,30) << __func__ << " pool " << get_pool_name(pool_id)
6206 << " pa_avg " << p_rbi->pa_avg
6207 << " pa_weighted " << p_rbi->pa_weighted
6208 << " pa_weighted_avg " << p_rbi->pa_weighted_avg
6209 << " optimal_score " << p_rbi->optimal_score
6210 << " adjusted_score " << p_rbi->adjusted_score
6211 << " acting_adj_score " << p_rbi->acting_adj_score
6212 << dendl;
6213 ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id)
6214 << " raw_score: " << p_rbi->raw_score
6215 << " acting_raw_score: " << p_rbi->acting_raw_score
6216 << dendl;
6217 ldout(cct,10) << __func__ << " pool " << get_pool_name(pool_id)
6218 << " wl_score: " << p_rbi->acting_adj_score << dendl;
6219 }
6220
6221 return rc;
6222 }
6223
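// Usage sketch (illustrative; "rack1" is a hypothetical bucket name):
//   std::set<int> osds;
//   int r = osdmap.get_osds_by_bucket_name("rack1", &osds);
//   // r < 0 on error (e.g. an unknown bucket name, per crush->get_leaves)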
6224 int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
6225 {
6226 return crush->get_leaves(name, osds);
6227 }
6228
6229 // get pools whose crush rules might reference the given osd
6230 void OSDMap::get_pool_ids_by_osd(CephContext *cct,
6231 int osd,
6232 set<int64_t> *pool_ids) const
6233 {
6234 ceph_assert(pool_ids);
6235 set<int> raw_rules;
6236 int r = crush->get_rules_by_osd(osd, &raw_rules);
6237 if (r < 0) {
6238 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
6239 << dendl;
6240 ceph_assert(r >= 0);
6241 }
6242 set<int> rules;
6243 for (auto &i: raw_rules) {
6244 // exclude any dead rule
6245 if (crush_rule_in_use(i)) {
6246 rules.insert(i);
6247 }
6248 }
6249 for (auto &r: rules) {
6250 get_pool_ids_by_rule(r, pool_ids);
6251 }
6252 }
6253
6254 template <typename F>
6255 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
6256 public:
6257 typedef CrushTreeDumper::Dumper<F> Parent;
6258
6259 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
6260 const PGMap& pgmap_, bool tree_,
6261 const string& filter) :
6262 Parent(crush, osdmap_->get_pool_names()),
6263 osdmap(osdmap_),
6264 pgmap(pgmap_),
6265 tree(tree_),
6266 min_var(-1),
6267 max_var(-1),
6268 stddev(0),
6269 sum(0) {
6270 if (osdmap->crush->name_exists(filter)) {
6271 // filter by crush node
6272 auto item_id = osdmap->crush->get_item_id(filter);
6273 allowed.insert(item_id);
6274 osdmap->crush->get_all_children(item_id, &allowed);
6275 } else if (osdmap->crush->class_exists(filter)) {
6276 // filter by device class
6277 class_id = osdmap->crush->get_class_id(filter);
6278 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
6279 pool_id >= 0) {
6280 // filter by pool
6281 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
6282 set<int> roots;
6283 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
6284 allowed = roots;
6285 for (auto r : roots)
6286 osdmap->crush->get_all_children(r, &allowed);
6287 }
6288 average_util = average_utilization();
6289 }
6290
6291 protected:
6292
6293 bool should_dump(int id) const {
6294 if (!allowed.empty() && !allowed.count(id)) // filter by name
6295 return false;
6296 if (id >= 0 && class_id >= 0) {
6297 auto item_class_id = osdmap->crush->get_item_class_id(id);
6298 if (item_class_id < 0 || // not bound to a class yet
6299 item_class_id != class_id) // or already bound to a different class
6300 return false;
6301 }
6302 return true;
6303 }
6304
6305 set<int> get_dumped_osds() {
6306 if (allowed.empty() && class_id < 0) {
6307 // old way, all
6308 return {};
6309 }
6310 return dumped_osds;
6311 }
6312
6313 void dump_stray(F *f) {
6314 for (int i = 0; i < osdmap->get_max_osd(); i++) {
6315 if (osdmap->exists(i) && !this->is_touched(i))
6316 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
6317 }
6318 }
6319
6320 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
6321 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
6322 return;
6323 if (!should_dump(qi.id))
6324 return;
6325
6326 if (!qi.is_bucket())
6327 dumped_osds.insert(qi.id);
6328 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
6329 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
6330 kb_used_meta = 0, kb_avail = 0;
6331 double util = 0;
6332 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
6333 &kb_used_omap, &kb_used_meta, &kb_avail))
6334 if (kb_used && kb)
6335 util = 100.0 * (double)kb_used / (double)kb;
6336
6337 double var = 1.0;
6338 if (average_util)
6339 var = util / average_util;
6340
6341 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
6342
6343 dump_item(qi, reweight, kb, kb_used,
6344 kb_used_data, kb_used_omap, kb_used_meta,
6345 kb_avail, util, var, num_pgs, f);
6346
6347 if (!qi.is_bucket() && reweight > 0) {
6348 if (min_var < 0 || var < min_var)
6349 min_var = var;
6350 if (max_var < 0 || var > max_var)
6351 max_var = var;
6352
6353 double dev = util - average_util;
6354 dev *= dev;
6355 stddev += reweight * dev;
6356 sum += reweight;
6357 }
6358 }
6359
6360 virtual void dump_item(const CrushTreeDumper::Item &qi,
6361 float &reweight,
6362 int64_t kb,
6363 int64_t kb_used,
6364 int64_t kb_used_data,
6365 int64_t kb_used_omap,
6366 int64_t kb_used_meta,
6367 int64_t kb_avail,
6368 double& util,
6369 double& var,
6370 const size_t num_pgs,
6371 F *f) = 0;
6372
6373 double dev() {
6374 return sum > 0 ? sqrt(stddev / sum) : 0;
6375 }
6376
6377 double average_utilization() {
6378 int64_t kb = 0, kb_used = 0;
6379 for (int i = 0; i < osdmap->get_max_osd(); i++) {
6380 if (!osdmap->exists(i) ||
6381 osdmap->get_weight(i) == 0 ||
6382 !should_dump(i))
6383 continue;
6384 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
6385 kb_avail_i;
6386 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
6387 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
6388 kb += kb_i;
6389 kb_used += kb_used_i;
6390 }
6391 }
6392 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
6393 }
6394
6395 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
6396 int64_t* kb_used_data,
6397 int64_t* kb_used_omap,
6398 int64_t* kb_used_meta,
6399 int64_t* kb_avail) const {
6400 const osd_stat_t *p = pgmap.get_osd_stat(id);
6401 if (!p) return false;
6402 *kb = p->statfs.kb();
6403 *kb_used = p->statfs.kb_used_raw();
6404 *kb_used_data = p->statfs.kb_used_data();
6405 *kb_used_omap = p->statfs.kb_used_omap();
6406 *kb_used_meta = p->statfs.kb_used_internal_metadata();
6407 *kb_avail = p->statfs.kb_avail();
6408
6409 return true;
6410 }
6411
6412 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
6413 int64_t* kb_used_data,
6414 int64_t* kb_used_omap,
6415 int64_t* kb_used_meta,
6416 int64_t* kb_avail) const {
6417 if (id >= 0) {
6418 if (osdmap->is_out(id) || !should_dump(id)) {
6419 *kb = 0;
6420 *kb_used = 0;
6421 *kb_used_data = 0;
6422 *kb_used_omap = 0;
6423 *kb_used_meta = 0;
6424 *kb_avail = 0;
6425 return true;
6426 }
6427 return get_osd_utilization(id, kb, kb_used, kb_used_data,
6428 kb_used_omap, kb_used_meta, kb_avail);
6429 }
6430
6431 *kb = 0;
6432 *kb_used = 0;
6433 *kb_used_data = 0;
6434 *kb_used_omap = 0;
6435 *kb_used_meta = 0;
6436 *kb_avail = 0;
6437
6438 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
6439 int item = osdmap->crush->get_bucket_item(id, k);
6440 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
6441 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
6442 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
6443 &kb_used_data_i, &kb_used_omap_i,
6444 &kb_used_meta_i, &kb_avail_i))
6445 return false;
6446 *kb += kb_i;
6447 *kb_used += kb_used_i;
6448 *kb_used_data += kb_used_data_i;
6449 *kb_used_omap += kb_used_omap_i;
6450 *kb_used_meta += kb_used_meta_i;
6451 *kb_avail += kb_avail_i;
6452 }
6453 return true;
6454 }
6455
6456 protected:
6457 const OSDMap *osdmap;
6458 const PGMap& pgmap;
6459 bool tree;
6460 double average_util;
6461 double min_var;
6462 double max_var;
6463 double stddev;
6464 double sum;
6465 int class_id = -1;
6466 set<int> allowed;
6467 set<int> dumped_osds;
6468 };
6469
6470
6471 class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
6472 public:
6473 typedef OSDUtilizationDumper<TextTable> Parent;
6474
6475 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
6476 const PGMap& pgmap, bool tree,
6477 const string& filter) :
6478 Parent(crush, osdmap, pgmap, tree, filter) {}
6479
6480 void dump(TextTable *tbl) {
6481 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
6482 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
6483 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
6484 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
6485 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
6486 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
6487 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
6488 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
6489 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
6490 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
6491 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
6492 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
6493 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
6494 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
6495 if (tree)
6496 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
6497
6498 Parent::dump(tbl);
6499
6500 dump_stray(tbl);
6501
6502 auto sum = pgmap.get_osd_sum(get_dumped_osds());
6503 *tbl << ""
6504 << ""
6505 << "" << "TOTAL"
6506 << byte_u_t(sum.statfs.total)
6507 << byte_u_t(sum.statfs.get_used_raw())
6508 << byte_u_t(sum.statfs.allocated)
6509 << byte_u_t(sum.statfs.omap_allocated)
6510 << byte_u_t(sum.statfs.internal_metadata)
6511 << byte_u_t(sum.statfs.available)
6512 << lowprecision_t(average_util)
6513 << ""
6514 << TextTable::endrow;
6515 }
6516
6517 protected:
6518 struct lowprecision_t {
6519 float v;
6520 explicit lowprecision_t(float _v) : v(_v) {}
6521 };
6522 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
6523
6524 using OSDUtilizationDumper<TextTable>::dump_item;
6525 void dump_item(const CrushTreeDumper::Item &qi,
6526 float &reweight,
6527 int64_t kb,
6528 int64_t kb_used,
6529 int64_t kb_used_data,
6530 int64_t kb_used_omap,
6531 int64_t kb_used_meta,
6532 int64_t kb_avail,
6533 double& util,
6534 double& var,
6535 const size_t num_pgs,
6536 TextTable *tbl) override {
6537 const char *c = crush->get_item_class(qi.id);
6538 if (!c)
6539 c = "";
6540 *tbl << qi.id
6541 << c
6542 << weightf_t(qi.weight)
6543 << weightf_t(reweight)
6544 << byte_u_t(kb << 10)
6545 << byte_u_t(kb_used << 10)
6546 << byte_u_t(kb_used_data << 10)
6547 << byte_u_t(kb_used_omap << 10)
6548 << byte_u_t(kb_used_meta << 10)
6549 << byte_u_t(kb_avail << 10)
6550 << lowprecision_t(util)
6551 << lowprecision_t(var);
6552
6553 if (qi.is_bucket()) {
6554 *tbl << "-";
6555 *tbl << "";
6556 } else {
6557 *tbl << num_pgs;
6558 if (osdmap->is_up(qi.id)) {
6559 *tbl << "up";
6560 } else if (osdmap->is_destroyed(qi.id)) {
6561 *tbl << "destroyed";
6562 } else {
6563 *tbl << "down";
6564 }
6565 }
6566
6567 if (tree) {
6568 ostringstream name;
6569 for (int k = 0; k < qi.depth; k++)
6570 name << " ";
6571 if (qi.is_bucket()) {
6572 int type = crush->get_bucket_type(qi.id);
6573 name << crush->get_type_name(type) << " "
6574 << crush->get_item_name(qi.id);
6575 } else {
6576 name << "osd." << qi.id;
6577 }
6578 *tbl << name.str();
6579 }
6580
6581 *tbl << TextTable::endrow;
6582 }
6583
6584 public:
6585 string summary() {
6586 ostringstream out;
6587 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
6588 << "/" << lowprecision_t(max_var) << " "
6589 << "STDDEV: " << lowprecision_t(dev());
6590 return out.str();
6591 }
6592 };
6593
6594 ostream& operator<<(ostream& out,
6595 const OSDUtilizationPlainDumper::lowprecision_t& v)
6596 {
6597 if (v.v < -0.01) {
6598 return out << "-";
6599 } else if (v.v < 0.001) {
6600 return out << "0";
6601 } else {
6602 std::streamsize p = out.precision();
6603 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
6604 }
6605 }
6606
6607 class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
6608 public:
6609 typedef OSDUtilizationDumper<Formatter> Parent;
6610
6611 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
6612 const PGMap& pgmap, bool tree,
6613 const string& filter) :
6614 Parent(crush, osdmap, pgmap, tree, filter) {}
6615
6616 void dump(Formatter *f) {
6617 f->open_array_section("nodes");
6618 Parent::dump(f);
6619 f->close_section();
6620
6621 f->open_array_section("stray");
6622 dump_stray(f);
6623 f->close_section();
6624 }
6625
6626 protected:
6627 using OSDUtilizationDumper<Formatter>::dump_item;
6628 void dump_item(const CrushTreeDumper::Item &qi,
6629 float &reweight,
6630 int64_t kb,
6631 int64_t kb_used,
6632 int64_t kb_used_data,
6633 int64_t kb_used_omap,
6634 int64_t kb_used_meta,
6635 int64_t kb_avail,
6636 double& util,
6637 double& var,
6638 const size_t num_pgs,
6639 Formatter *f) override {
6640 f->open_object_section("item");
6641 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
6642 f->dump_float("reweight", reweight);
6643 f->dump_int("kb", kb);
6644 f->dump_int("kb_used", kb_used);
6645 f->dump_int("kb_used_data", kb_used_data);
6646 f->dump_int("kb_used_omap", kb_used_omap);
6647 f->dump_int("kb_used_meta", kb_used_meta);
6648 f->dump_int("kb_avail", kb_avail);
6649 f->dump_float("utilization", util);
6650 f->dump_float("var", var);
6651 f->dump_unsigned("pgs", num_pgs);
6652 if (!qi.is_bucket()) {
6653 if (osdmap->is_up(qi.id)) {
6654 f->dump_string("status", "up");
6655 } else if (osdmap->is_destroyed(qi.id)) {
6656 f->dump_string("status", "destroyed");
6657 } else {
6658 f->dump_string("status", "down");
6659 }
6660 }
6661 CrushTreeDumper::dump_bucket_children(crush, qi, f);
6662 f->close_section();
6663 }
6664
6665 public:
6666 void summary(Formatter *f) {
6667 f->open_object_section("summary");
6668 auto sum = pgmap.get_osd_sum(get_dumped_osds());
6669 auto& s = sum.statfs;
6670
6671 f->dump_int("total_kb", s.kb());
6672 f->dump_int("total_kb_used", s.kb_used_raw());
6673 f->dump_int("total_kb_used_data", s.kb_used_data());
6674 f->dump_int("total_kb_used_omap", s.kb_used_omap());
6675 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
6676 f->dump_int("total_kb_avail", s.kb_avail());
6677 f->dump_float("average_utilization", average_util);
6678 f->dump_float("min_var", min_var);
6679 f->dump_float("max_var", max_var);
6680 f->dump_float("dev", dev());
6681 f->close_section();
6682 }
6683 };
6684
6685 void print_osd_utilization(const OSDMap& osdmap,
6686 const PGMap& pgmap,
6687 ostream& out,
6688 Formatter *f,
6689 bool tree,
6690 const string& filter)
6691 {
6692 const CrushWrapper *crush = osdmap.crush.get();
6693 if (f) {
6694 f->open_object_section("df");
6695 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
6696 d.dump(f);
6697 d.summary(f);
6698 f->close_section();
6699 f->flush(out);
6700 } else {
6701 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
6702 TextTable tbl;
6703 d.dump(&tbl);
6704 out << tbl << d.summary() << "\n";
6705 }
6706 }
6707
6708 void OSDMap::check_health(CephContext *cct,
6709 health_check_map_t *checks) const
6710 {
6711 int num_osds = get_num_osds();
6712
6713 // OSD_DOWN
6714 // OSD_$subtree_DOWN
6715 // OSD_ORPHAN
6716 if (num_osds >= 0) {
6717 int num_in_osds = 0;
6718 int num_down_in_osds = 0;
6719 set<int> osds;
6720 set<int> down_in_osds;
6721 set<int> up_in_osds;
6722 set<int> subtree_up;
6723 unordered_map<int, set<int> > subtree_type_down;
6724 unordered_map<int, int> num_osds_subtree;
6725 int max_type = crush->get_max_type_id();
6726
6727 for (int i = 0; i < get_max_osd(); i++) {
6728 if (!exists(i)) {
6729 if (crush->item_exists(i)) {
6730 osds.insert(i);
6731 }
6732 continue;
6733 }
6734 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
6735 continue;
6736 ++num_in_osds;
6737 if (down_in_osds.count(i) || up_in_osds.count(i))
6738 continue;
6739 if (!is_up(i)) {
6740 down_in_osds.insert(i);
6741 int parent_id = 0;
6742 int current = i;
6743 for (int type = 0; type <= max_type; type++) {
6744 if (!crush->get_type_name(type))
6745 continue;
6746 int r = crush->get_immediate_parent_id(current, &parent_id);
6747 if (r == -ENOENT)
6748 break;
6749 // break early if this parent is already marked as up
6750 if (subtree_up.count(parent_id))
6751 break;
6752 type = crush->get_bucket_type(parent_id);
6753 if (!subtree_type_is_down(
6754 cct, parent_id, type,
6755 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
6756 break;
6757 current = parent_id;
6758 }
6759 }
6760 }
6761
6762 // calculate the number of down osds in each down subtree and
6763 // store it in num_osds_subtree
6764 for (int type = 1; type <= max_type; type++) {
6765 if (!crush->get_type_name(type))
6766 continue;
6767 for (auto j = subtree_type_down[type].begin();
6768 j != subtree_type_down[type].end();
6769 ++j) {
6770 list<int> children;
6771 int num = 0;
6772 int num_children = crush->get_children(*j, &children);
6773 if (num_children == 0)
6774 continue;
6775 for (auto l = children.begin(); l != children.end(); ++l) {
6776 if (*l >= 0) {
6777 ++num;
6778 } else if (num_osds_subtree[*l] > 0) {
6779 num = num + num_osds_subtree[*l];
6780 }
6781 }
6782 num_osds_subtree[*j] = num;
6783 }
6784 }
6785 num_down_in_osds = down_in_osds.size();
6786 ceph_assert(num_down_in_osds <= num_in_osds);
6787 if (num_down_in_osds > 0) {
6788 // summary of down subtree types and osds
6789 for (int type = max_type; type > 0; type--) {
6790 if (!crush->get_type_name(type))
6791 continue;
6792 if (subtree_type_down[type].size() > 0) {
6793 ostringstream ss;
6794 ss << subtree_type_down[type].size() << " "
6795 << crush->get_type_name(type);
6796 if (subtree_type_down[type].size() > 1) {
6797 ss << "s";
6798 }
6799 int sum_down_osds = 0;
6800 for (auto j = subtree_type_down[type].begin();
6801 j != subtree_type_down[type].end();
6802 ++j) {
6803 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
6804 }
6805 ss << " (" << sum_down_osds << " osds) down";
6806 string err = string("OSD_") +
6807 string(crush->get_type_name(type)) + "_DOWN";
6808 boost::to_upper(err);
6809 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
6810 subtree_type_down[type].size());
6811 for (auto j = subtree_type_down[type].rbegin();
6812 j != subtree_type_down[type].rend();
6813 ++j) {
6814 ostringstream ss;
6815 ss << crush->get_type_name(type);
6816 ss << " ";
6817 ss << crush->get_item_name(*j);
6818 // at the top level, do not print location
6819 if (type != max_type) {
6820 ss << " (";
6821 ss << crush->get_full_location_ordered_string(*j);
6822 ss << ")";
6823 }
6824 int num = num_osds_subtree[*j];
6825 ss << " (" << num << " osds)";
6826 ss << " is down";
6827 d.detail.push_back(ss.str());
6828 }
6829 }
6830 }
6831 ostringstream ss;
6832 ss << down_in_osds.size() << " osds down";
6833 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
6834 down_in_osds.size());
6835 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
6836 ostringstream ss;
6837 ss << "osd." << *it << " (";
6838 ss << crush->get_full_location_ordered_string(*it);
6839 ss << ") is down";
6840 d.detail.push_back(ss.str());
6841 }
6842 }
6843
6844 if (!osds.empty()) {
6845 ostringstream ss;
6846 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
6847 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
6848 osds.size());
6849 for (auto osd : osds) {
6850 ostringstream ss;
6851 ss << "osd." << osd << " exists in crush map but not in osdmap";
6852 d.detail.push_back(ss.str());
6853 }
6854 }
6855 }
6856
6857 std::list<std::string> scrub_messages;
6858 bool noscrub = false, nodeepscrub = false;
6859 for (const auto &p : pools) {
6860 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
6861 ostringstream ss;
6862 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
6863 scrub_messages.push_back(ss.str());
6864 noscrub = true;
6865 }
6866 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
6867 ostringstream ss;
6868 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
6869 scrub_messages.push_back(ss.str());
6870 nodeepscrub = true;
6871 }
6872 }
6873 if (noscrub || nodeepscrub) {
6874 string out = "";
6875 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
6876 out += nodeepscrub ? "nodeep-scrub" : "";
6877 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
6878 "Some pool(s) have the " + out + " flag(s) set", 0);
6879 d.detail.splice(d.detail.end(), scrub_messages);
6880 }
6881
6882 // OSD_OUT_OF_ORDER_FULL
6883 {
6884 // An OSD could configure its failsafe ratio to something different,
6885 // but for now assume it is the same here.
6886 float fsr = cct->_conf->osd_failsafe_full_ratio;
6887 if (fsr > 1.0) fsr /= 100;
6888 float fr = get_full_ratio();
6889 float br = get_backfillfull_ratio();
6890 float nr = get_nearfull_ratio();
6891
6892 list<string> detail;
6893 // These checks correspond to how OSDService::check_full_status() in an OSD
6894 // handles the improper setting of these values.
6895 if (br < nr) {
6896 ostringstream ss;
6897 ss << "backfillfull_ratio (" << br
6898 << ") < nearfull_ratio (" << nr << "), increased";
6899 detail.push_back(ss.str());
6900 br = nr;
6901 }
6902 if (fr < br) {
6903 ostringstream ss;
6904 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
6905 << "), increased";
6906 detail.push_back(ss.str());
6907 fr = br;
6908 }
6909 if (fsr < fr) {
6910 ostringstream ss;
6911 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
6912 << "), increased";
6913 detail.push_back(ss.str());
6914 }
6915 if (!detail.empty()) {
6916 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
6917 "full ratio(s) out of order", 0);
6918 d.detail.swap(detail);
6919 }
6920 }
6921
6922 // OSD_FULL
6923 // OSD_NEARFULL
6924 // OSD_BACKFILLFULL
6925 // OSD_FAILSAFE_FULL
6926 {
6927 set<int> full, backfillfull, nearfull;
6928 get_full_osd_counts(&full, &backfillfull, &nearfull);
6929 if (full.size()) {
6930 ostringstream ss;
6931 ss << full.size() << " full osd(s)";
6932 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
6933 for (auto& i: full) {
6934 ostringstream ss;
6935 ss << "osd." << i << " is full";
6936 d.detail.push_back(ss.str());
6937 }
6938 }
6939 if (backfillfull.size()) {
6940 ostringstream ss;
6941 ss << backfillfull.size() << " backfillfull osd(s)";
6942 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
6943 backfillfull.size());
6944 for (auto& i: backfillfull) {
6945 ostringstream ss;
6946 ss << "osd." << i << " is backfill full";
6947 d.detail.push_back(ss.str());
6948 }
6949 }
6950 if (nearfull.size()) {
6951 ostringstream ss;
6952 ss << nearfull.size() << " nearfull osd(s)";
6953 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
6954 for (auto& i: nearfull) {
6955 ostringstream ss;
6956 ss << "osd." << i << " is near full";
6957 d.detail.push_back(ss.str());
6958 }
6959 }
6960 }
6961
6962 // OSDMAP_FLAGS
6963 {
6964 // warn about flags
6965 uint64_t warn_flags =
6966 CEPH_OSDMAP_PAUSERD |
6967 CEPH_OSDMAP_PAUSEWR |
6968 CEPH_OSDMAP_PAUSEREC |
6969 CEPH_OSDMAP_NOUP |
6970 CEPH_OSDMAP_NODOWN |
6971 CEPH_OSDMAP_NOIN |
6972 CEPH_OSDMAP_NOOUT |
6973 CEPH_OSDMAP_NOBACKFILL |
6974 CEPH_OSDMAP_NORECOVER |
6975 CEPH_OSDMAP_NOSCRUB |
6976 CEPH_OSDMAP_NODEEP_SCRUB |
6977 CEPH_OSDMAP_NOTIERAGENT |
6978 CEPH_OSDMAP_NOSNAPTRIM |
6979 CEPH_OSDMAP_NOREBALANCE;
6980 if (test_flag(warn_flags)) {
6981 ostringstream ss;
6982 string s = get_flag_string(get_flags() & warn_flags);
6983 ss << s << " flag(s) set";
6984 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
6985 s.size() /* kludgey but sufficient */);
6986 }
6987 }
6988
6989 // OSD_FLAGS
6990 {
6991 list<string> detail;
6992 const unsigned flags =
6993 CEPH_OSD_NOUP |
6994 CEPH_OSD_NOIN |
6995 CEPH_OSD_NODOWN |
6996 CEPH_OSD_NOOUT;
6997 for (int i = 0; i < max_osd; ++i) {
6998 if (osd_state[i] & flags) {
6999 ostringstream ss;
7000 set<string> states;
7001 OSDMap::calc_state_set(osd_state[i] & flags, states);
7002 ss << "osd." << i << " has flags " << states;
7003 detail.push_back(ss.str());
7004 }
7005 }
7006 for (auto& i : crush_node_flags) {
7007 if (i.second && crush->item_exists(i.first)) {
7008 ostringstream ss;
7009 set<string> states;
7010 OSDMap::calc_state_set(i.second, states);
7011 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
7012 const char *tn = crush->get_type_name(t);
7013 ss << (tn ? tn : "node") << " "
7014 << crush->get_item_name(i.first) << " has flags " << states;
7015 detail.push_back(ss.str());
7016 }
7017 }
7018 for (auto& i : device_class_flags) {
7019 const char* class_name = crush->get_class_name(i.first);
7020 if (i.second && class_name) {
7021 ostringstream ss;
7022 set<string> states;
7023 OSDMap::calc_state_set(i.second, states);
7024 ss << "device class '" << class_name << "' has flags " << states;
7025 detail.push_back(ss.str());
7026 }
7027 }
7028 if (!detail.empty()) {
7029 ostringstream ss;
7030 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
7031 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
7032 d.detail.swap(detail);
7033 }
7034 }
7035
7036 // OLD_CRUSH_TUNABLES
7037 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
7038 string min = crush->get_min_required_version();
7039 if (min < cct->_conf->mon_crush_min_required_version) {
7040 ostringstream ss;
7041 ss << "crush map has legacy tunables (require " << min
7042 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
7043 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
7044 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7045 }
7046 }
7047
7048 // OLD_CRUSH_STRAW_CALC_VERSION
7049 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
7050 if (crush->get_straw_calc_version() == 0) {
7051 ostringstream ss;
7052 ss << "crush map has straw_calc_version=0";
7053 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
7054 d.detail.push_back(
7055 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7056 }
7057 }
7058
7059 // CACHE_POOL_NO_HIT_SET
7060 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
7061 list<string> detail;
7062 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
7063 const pg_pool_t& info = p->second;
7064 if (info.cache_mode_requires_hit_set() &&
7065 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
7066 ostringstream ss;
7067 ss << "pool '" << get_pool_name(p->first)
7068 << "' with cache_mode " << info.get_cache_mode_name()
7069 << " needs hit_set_type to be set but it is not";
7070 detail.push_back(ss.str());
7071 }
7072 }
7073 if (!detail.empty()) {
7074 ostringstream ss;
7075 ss << detail.size() << " cache pools are missing hit_sets";
7076 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
7077 detail.size());
7078 d.detail.swap(detail);
7079 }
7080 }
7081
7082 // OSD_NO_SORTBITWISE
7083 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
7084 ostringstream ss;
7085 ss << "'sortbitwise' flag is not set";
7086 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
7087 }
7088
7089 // OSD_UPGRADE_FINISHED
7090 if (auto require_release = pending_require_osd_release()) {
7091 ostringstream ss;
7092 ss << "all OSDs are running " << *require_release << " or later but"
7093 << " require_osd_release < " << *require_release;
7094 auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
7095 d.detail.push_back(ss.str());
7096 }
7097
7098 // POOL_NEARFULL/BACKFILLFULL/FULL
7099 {
7100 list<string> full_detail, backfillfull_detail, nearfull_detail;
7101 for (auto it : get_pools()) {
7102 const pg_pool_t &pool = it.second;
7103 const string& pool_name = get_pool_name(it.first);
7104 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
7105 stringstream ss;
7106 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7107 // may run out of space too,
7108 // but we want EQUOTA taking precedence
7109 ss << "pool '" << pool_name << "' is full (running out of quota)";
7110 } else {
7111 ss << "pool '" << pool_name << "' is full (no space)";
7112 }
7113 full_detail.push_back(ss.str());
7114 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
7115 stringstream ss;
7116 ss << "pool '" << pool_name << "' is backfillfull";
7117 backfillfull_detail.push_back(ss.str());
7118 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
7119 stringstream ss;
7120 ss << "pool '" << pool_name << "' is nearfull";
7121 nearfull_detail.push_back(ss.str());
7122 }
7123 }
7124 if (!full_detail.empty()) {
7125 ostringstream ss;
7126 ss << full_detail.size() << " pool(s) full";
7127 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
7128 d.detail.swap(full_detail);
7129 }
7130 if (!backfillfull_detail.empty()) {
7131 ostringstream ss;
7132 ss << backfillfull_detail.size() << " pool(s) backfillfull";
7133 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
7134 backfillfull_detail.size());
7135 d.detail.swap(backfillfull_detail);
7136 }
7137 if (!nearfull_detail.empty()) {
7138 ostringstream ss;
7139 ss << nearfull_detail.size() << " pool(s) nearfull";
7140 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
7141 nearfull_detail.size());
7142 d.detail.swap(nearfull_detail);
7143 }
7144 }
7145
7146 // POOL_PG_NUM_NOT_POWER_OF_TWO
7147 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
7148 list<string> detail;
7149 for (auto it : get_pools()) {
7150 if (!std::has_single_bit(it.second.get_pg_num_target())) {
7151 ostringstream ss;
7152 ss << "pool '" << get_pool_name(it.first)
7153 << "' pg_num " << it.second.get_pg_num_target()
7154 << " is not a power of two";
7155 detail.push_back(ss.str());
7156 }
7157 }
7158 if (!detail.empty()) {
7159 ostringstream ss;
7160 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
7161 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
7162 ss.str(), detail.size());
7163 d.detail.swap(detail);
7164 }
7165 }
7166
7167 // POOL_NO_REDUNDANCY
7168 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
7169 {
7170 list<string> detail;
7171 for (auto it : get_pools()) {
7172 if (it.second.get_size() == 1) {
7173 ostringstream ss;
7174 ss << "pool '" << get_pool_name(it.first)
7175 << "' has no replicas configured";
7176 detail.push_back(ss.str());
7177 }
7178 }
7179 if (!detail.empty()) {
7180 ostringstream ss;
7181 ss << detail.size() << " pool(s) have no replicas configured";
7182 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
7183 ss.str(), detail.size());
7184 d.detail.swap(detail);
7185 }
7186 }
7187
7188 // DEGRADED STRETCH MODE
7189 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
7190 if (recovering_stretch_mode) {
7191 stringstream ss;
7192 ss << "We are recovering stretch mode buckets, only requiring "
7193 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
7194 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
7195 ss.str(), 0);
7196 } else if (degraded_stretch_mode) {
7197 stringstream ss;
7198 ss << "We are missing stretch mode buckets, only requiring "
7199 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
7200 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
7201 ss.str(), 0);
7202 }
7203 }
7204 }
7205
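// Illustrative examples (not part of the original source):
//   {"1","2","5"}                               -> out = {1,2,5}
//   {"all"} / {"any"} / {"*"} as first element  -> every OSD id in the map
//   any entry that does not parse as an OSD id  -> -EINVAL, message in *ss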
7206 int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
7207 ostream *ss) const
7208 {
7209 out->clear();
7210 for (auto i = ls.begin(); i != ls.end(); ++i) {
7211 if (i == ls.begin() &&
7212 (*i == "any" || *i == "all" || *i == "*")) {
7213 get_all_osds(*out);
7214 break;
7215 }
7216 long osd = ceph::common::parse_osd_id(i->c_str(), ss);
7217 if (osd < 0) {
7218 *ss << "invalid osd id '" << *i << "'";
7219 return -EINVAL;
7220 }
7221 out->insert(osd);
7222 }
7223 return 0;
7224 }
7225
7226 void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
7227 string &subtree,
7228 int limit, // how many
7229 set<int> skip,
7230 set<int> *want) const {
7231 if (limit <= 0)
7232 return;
7233 int subtree_type = crush->get_type_id(subtree);
7234 if (subtree_type < 1)
7235 return;
7236 vector<int> subtrees;
7237 crush->get_subtree_of_type(subtree_type, &subtrees);
7238 std::random_device rd;
7239 std::default_random_engine rng{rd()};
7240 std::shuffle(subtrees.begin(), subtrees.end(), rng);
7241 for (auto s : subtrees) {
7242 if (limit <= 0)
7243 break;
7244 if (crush->subtree_contains(s, n))
7245 continue;
7246 vector<int> osds;
7247 crush->get_children_of_type(s, 0, &osds);
7248 if (osds.empty())
7249 continue;
7250 vector<int> up_osds;
7251 for (auto o : osds) {
7252 if (is_up(o) && !skip.count(o))
7253 up_osds.push_back(o);
7254 }
7255 if (up_osds.empty())
7256 continue;
7257 auto it = up_osds.begin();
7258 std::advance(it, (n % up_osds.size()));
7259 want->insert(*it);
7260 --limit;
7261 }
7262 }
7263
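// Raw-space amplification for a pool (illustrative examples, not from the
// original source): a replicated pool with size 3 returns 3.0; an erasure-coded
// pool with k=4, m=2 returns (4+2)/4 = 1.5.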
7264 float OSDMap::pool_raw_used_rate(int64_t poolid) const
7265 {
7266 const pg_pool_t *pool = get_pg_pool(poolid);
7267 assert(pool != nullptr);
7268
7269 switch (pool->get_type()) {
7270 case pg_pool_t::TYPE_REPLICATED:
7271 return pool->get_size();
7272 case pg_pool_t::TYPE_ERASURE:
7273 {
7274 auto& ecp =
7275 get_erasure_code_profile(pool->erasure_code_profile);
7276 auto pm = ecp.find("m");
7277 auto pk = ecp.find("k");
7278 if (pm != ecp.end() && pk != ecp.end()) {
7279 int k = atoi(pk->second.c_str());
7280 int m = atoi(pm->second.c_str());
7281 int mk = m + k;
7282 ceph_assert(mk != 0);
7283 ceph_assert(k != 0);
7284 return (float)mk / k;
7285 } else {
7286 return 0.0;
7287 }
7288 }
7289 break;
7290 default:
7291 ceph_abort_msg("unrecognized pool type");
7292 }
7293 }
7294
7295 unsigned OSDMap::get_osd_crush_node_flags(int osd) const
7296 {
7297 unsigned flags = 0;
7298 if (!crush_node_flags.empty()) {
7299 // the map will contain type -> name
7300 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
7301 for (auto& i : ploc) {
7302 int id = crush->get_item_id(i.second);
7303 auto p = crush_node_flags.find(id);
7304 if (p != crush_node_flags.end()) {
7305 flags |= p->second;
7306 }
7307 }
7308 }
7309 return flags;
7310 }
7311
7312 unsigned OSDMap::get_crush_node_flags(int id) const
7313 {
7314 unsigned flags = 0;
7315 auto it = crush_node_flags.find(id);
7316 if (it != crush_node_flags.end())
7317 flags = it->second;
7318 return flags;
7319 }
7320
7321 unsigned OSDMap::get_device_class_flags(int id) const
7322 {
7323 unsigned flags = 0;
7324 auto it = device_class_flags.find(id);
7325 if (it != device_class_flags.end())
7326 flags = it->second;
7327 return flags;
7328 }
7329
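// Sketch of intent (descriptive comment, not part of the original source):
// if every up OSD already advertises support for a release newer than
// require_osd_release, return that release name so the caller (e.g. the
// OSD_UPGRADE_FINISHED check above) can warn that the upgrade has completed
// but require_osd_release has not been bumped yet.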
7330 std::optional<std::string> OSDMap::pending_require_osd_release() const
7331 {
7332 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY) &&
7333 require_osd_release < ceph_release_t::quincy) {
7334 return "quincy";
7335 }
7336 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
7337 require_osd_release < ceph_release_t::pacific) {
7338 return "pacific";
7339 }
7340 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
7341 require_osd_release < ceph_release_t::octopus) {
7342 return "octopus";
7343 }
7344 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
7345 require_osd_release < ceph_release_t::nautilus) {
7346 return "nautilus";
7347 }
7348
7349 return std::nullopt;
7350 }