ceph/src/osd/OSDMap.cc (ceph.git, reef 18.2.1)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <algorithm>
19 #include <bit>
20 #include <optional>
21 #include <random>
22 #include <fmt/format.h>
23
24 #include <boost/algorithm/string.hpp>
25
26 #include "OSDMap.h"
27 #include "common/config.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/TextTable.h"
31 #include "include/ceph_features.h"
32 #include "include/common_fwd.h"
33 #include "include/str_map.h"
34
35 #include "common/code_environment.h"
36 #include "mon/health_check.h"
37
38 #include "crush/CrushTreeDumper.h"
39 #include "common/Clock.h"
40 #include "mon/PGMap.h"
41
42 using std::list;
43 using std::make_pair;
44 using std::map;
45 using std::multimap;
46 using std::ostream;
47 using std::ostringstream;
48 using std::pair;
49 using std::set;
50 using std::string;
51 using std::stringstream;
52 using std::unordered_map;
53 using std::vector;
54
55 using ceph::decode;
56 using ceph::encode;
57 using ceph::Formatter;
58
59 #define dout_subsys ceph_subsys_osd
60
61 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
62 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
63
64
65 // ----------------------------------
66 // osd_info_t
67
68 void osd_info_t::dump(Formatter *f) const
69 {
70 f->dump_int("last_clean_begin", last_clean_begin);
71 f->dump_int("last_clean_end", last_clean_end);
72 f->dump_int("up_from", up_from);
73 f->dump_int("up_thru", up_thru);
74 f->dump_int("down_at", down_at);
75 f->dump_int("lost_at", lost_at);
76 }
77
78 void osd_info_t::encode(ceph::buffer::list& bl) const
79 {
80 using ceph::encode;
81 __u8 struct_v = 1;
82 encode(struct_v, bl);
83 encode(last_clean_begin, bl);
84 encode(last_clean_end, bl);
85 encode(up_from, bl);
86 encode(up_thru, bl);
87 encode(down_at, bl);
88 encode(lost_at, bl);
89 }
90
91 void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
92 {
93 using ceph::decode;
94 __u8 struct_v;
95 decode(struct_v, bl);
96 decode(last_clean_begin, bl);
97 decode(last_clean_end, bl);
98 decode(up_from, bl);
99 decode(up_thru, bl);
100 decode(down_at, bl);
101 decode(lost_at, bl);
102 }
103
104 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
105 {
106 o.push_back(new osd_info_t);
107 o.push_back(new osd_info_t);
108 o.back()->last_clean_begin = 1;
109 o.back()->last_clean_end = 2;
110 o.back()->up_from = 30;
111 o.back()->up_thru = 40;
112 o.back()->down_at = 5;
113 o.back()->lost_at = 6;
114 }
115
116 ostream& operator<<(ostream& out, const osd_info_t& info)
117 {
118 out << "up_from " << info.up_from
119 << " up_thru " << info.up_thru
120 << " down_at " << info.down_at
121 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
122 if (info.lost_at)
123 out << " lost_at " << info.lost_at;
124 return out;
125 }
126
127 // ----------------------------------
128 // osd_xinfo_t
129
130 void osd_xinfo_t::dump(Formatter *f) const
131 {
132 f->dump_stream("down_stamp") << down_stamp;
133 f->dump_float("laggy_probability", laggy_probability);
134 f->dump_int("laggy_interval", laggy_interval);
135 f->dump_int("features", features);
136 f->dump_unsigned("old_weight", old_weight);
137 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
138 f->dump_int("dead_epoch", dead_epoch);
139 }
140
141 void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
142 {
143 uint8_t v = 4;
144 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
145 v = 3;
146 }
147 ENCODE_START(v, 1, bl);
148 encode(down_stamp, bl);
149 __u32 lp = laggy_probability * float(0xfffffffful);
150 encode(lp, bl);
151 encode(laggy_interval, bl);
152 encode(features, bl);
153 encode(old_weight, bl);
154 if (v >= 4) {
155 encode(last_purged_snaps_scrub, bl);
156 encode(dead_epoch, bl);
157 }
158 ENCODE_FINISH(bl);
159 }
160
161 void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
162 {
163 DECODE_START(4, bl);
164 decode(down_stamp, bl);
165 __u32 lp;
166 decode(lp, bl);
167 laggy_probability = (float)lp / (float)0xffffffff;
168 decode(laggy_interval, bl);
169 if (struct_v >= 2)
170 decode(features, bl);
171 else
172 features = 0;
173 if (struct_v >= 3)
174 decode(old_weight, bl);
175 else
176 old_weight = 0;
177 if (struct_v >= 4) {
178 decode(last_purged_snaps_scrub, bl);
179 decode(dead_epoch, bl);
180 } else {
181 dead_epoch = 0;
182 }
183 DECODE_FINISH(bl);
184 }
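// Illustration (not in the original file): laggy_probability travels on the
// wire as a 32-bit fixed-point fraction, as the encode/decode pair above
// shows. A minimal standalone sketch of that round trip (plain stdint types,
// no ceph helpers; assumes p is already clamped to [0, 1]):
static inline uint32_t prob_to_wire_sketch(float p) {
  return static_cast<uint32_t>(p * static_cast<float>(0xffffffffull));
}
static inline float wire_to_prob_sketch(uint32_t lp) {
  return static_cast<float>(lp) / static_cast<float>(0xffffffff);
}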
185
186 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
187 {
188 o.push_back(new osd_xinfo_t);
189 o.push_back(new osd_xinfo_t);
190 o.back()->down_stamp = utime_t(2, 3);
191 o.back()->laggy_probability = .123;
192 o.back()->laggy_interval = 123456;
193 o.back()->old_weight = 0x7fff;
194 }
195
196 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
197 {
198 return out << "down_stamp " << xi.down_stamp
199 << " laggy_probability " << xi.laggy_probability
200 << " laggy_interval " << xi.laggy_interval
201 << " old_weight " << xi.old_weight
202 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
203 << " dead_epoch " << xi.dead_epoch;
204 }
205
206 // ----------------------------------
207 // OSDMap::Incremental
208
209 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
210 {
211 int n = 0;
212 for (auto &weight : new_weight) {
213 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
214 n++; // marked out
215 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
216 n--; // marked in
217 }
218 return n;
219 }
220
221 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
222 {
223 int n = 0;
224 for (auto &state : new_state) { //
225 if (state.second & CEPH_OSD_UP) {
226 if (previous->is_up(state.first))
227 n++; // marked down
228 else
229 n--; // marked up
230 }
231 }
232 return n;
233 }
234
235 int OSDMap::Incremental::identify_osd(uuid_d u) const
236 {
237 for (auto &uuid : new_uuid)
238 if (uuid.second == u)
239 return uuid.first;
240 return -1;
241 }
242
243 int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
244 const OSDMap& osdmap)
245 {
246 ceph_assert(epoch == osdmap.get_epoch() + 1);
247
248 for (auto &new_pool : new_pools) {
249 if (!new_pool.second.tiers.empty()) {
250 pg_pool_t& base = new_pool.second;
251
252 auto new_rem_it = new_removed_snaps.find(new_pool.first);
253
254 for (const auto &tier_pool : base.tiers) {
255 const auto &r = new_pools.find(tier_pool);
256 pg_pool_t *tier = 0;
257 if (r == new_pools.end()) {
258 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
259 if (!orig) {
260 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
261 return -EIO;
262 }
263 tier = get_new_pool(tier_pool, orig);
264 } else {
265 tier = &r->second;
266 }
267 if (tier->tier_of != new_pool.first) {
268 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
269 return -EIO;
270 }
271
272 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
273 << tier_pool << dendl;
274 tier->snap_seq = base.snap_seq;
275 tier->snap_epoch = base.snap_epoch;
276 tier->snaps = base.snaps;
277 tier->removed_snaps = base.removed_snaps;
278 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
279 pg_pool_t::FLAG_POOL_SNAPS);
280
281 if (new_rem_it != new_removed_snaps.end()) {
282 new_removed_snaps[tier_pool] = new_rem_it->second;
283 }
284
285 tier->application_metadata = base.application_metadata;
286 }
287 }
288 }
289 return 0;
290 }
291
292 // ----------------------------------
293 // OSDMap
294
295 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
296 {
297 if (id >= 0)
298 return is_down(id);
299
300 if (down_cache &&
301 down_cache->count(id)) {
302 return true;
303 }
304
305 list<int> children;
306 crush->get_children(id, &children);
307 for (const auto &child : children) {
308 if (!subtree_is_down(child, down_cache)) {
309 return false;
310 }
311 }
312 if (down_cache) {
313 down_cache->insert(id);
314 }
315 return true;
316 }
317
318 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
319 {
320 // use a stack-local down_cache if we didn't get one from the
321 // caller. then at least this particular call will avoid duplicated
322 // work.
323 set<int> local_down_cache;
324 if (!down_cache) {
325 down_cache = &local_down_cache;
326 }
327
328 int current = id;
329 while (true) {
330 int type;
331 if (current >= 0) {
332 type = 0;
333 } else {
334 type = crush->get_bucket_type(current);
335 }
336 ceph_assert(type >= 0);
337
338 if (!subtree_is_down(current, down_cache)) {
339 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
340 return false;
341 }
342
343 // is this a big enough subtree to be marked as down?
344 if (type >= subtree_type) {
345 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
346 return true;
347 }
348
349 int r = crush->get_immediate_parent_id(current, &current);
350 if (r < 0) {
351 return false;
352 }
353 }
354 }
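// Illustration (not in the original file): both helpers above take an
// optional down_cache so repeated queries do not re-walk the same CRUSH
// subtrees. A hedged usage sketch -- the osdmap object and the bucket ids
// are hypothetical:
//
//   std::set<int> down_cache;                  // shared across calls
//   for (int bucket : {-2, -3, -4}) {          // example CRUSH bucket ids
//     if (osdmap.subtree_is_down(bucket, &down_cache)) {
//       // every OSD under this bucket is down
//     }
//   }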
355
356 bool OSDMap::subtree_type_is_down(
357 CephContext *cct,
358 int id,
359 int subtree_type,
360 set<int> *down_in_osds,
361 set<int> *up_in_osds,
362 set<int> *subtree_up,
363 unordered_map<int, set<int> > *subtree_type_down) const
364 {
365 if (id >= 0) {
366 bool is_down_ret = is_down(id);
367 if (!is_out(id)) {
368 if (is_down_ret) {
369 down_in_osds->insert(id);
370 } else {
371 up_in_osds->insert(id);
372 }
373 }
374 return is_down_ret;
375 }
376
377 if (subtree_type_down &&
378 (*subtree_type_down)[subtree_type].count(id)) {
379 return true;
380 }
381
382 list<int> children;
383 crush->get_children(id, &children);
384 for (const auto &child : children) {
385 if (!subtree_type_is_down(
386 cct, child, crush->get_bucket_type(child),
387 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
388 subtree_up->insert(id);
389 return false;
390 }
391 }
392 if (subtree_type_down) {
393 (*subtree_type_down)[subtree_type].insert(id);
394 }
395 return true;
396 }
397
398 void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
399 {
400 using ceph::encode;
401 __u16 v = 5;
402 encode(v, bl);
403 encode(fsid, bl);
404 encode(epoch, bl);
405 encode(modified, bl);
406 int32_t new_t = new_pool_max;
407 encode(new_t, bl);
408 encode(new_flags, bl);
409 encode(fullmap, bl);
410 encode(crush, bl);
411
412 encode(new_max_osd, bl);
413 // for encode(new_pools, bl);
414 __u32 n = new_pools.size();
415 encode(n, bl);
416 for (const auto &new_pool : new_pools) {
417 n = new_pool.first;
418 encode(n, bl);
419 encode(new_pool.second, bl, 0);
420 }
421 // for encode(new_pool_names, bl);
422 n = new_pool_names.size();
423 encode(n, bl);
424
425 for (const auto &new_pool_name : new_pool_names) {
426 n = new_pool_name.first;
427 encode(n, bl);
428 encode(new_pool_name.second, bl);
429 }
430 // for encode(old_pools, bl);
431 n = old_pools.size();
432 encode(n, bl);
433 for (auto &old_pool : old_pools) {
434 n = old_pool;
435 encode(n, bl);
436 }
437 encode(new_up_client, bl, 0);
438 {
439 // legacy is map<int32_t,uint8_t>
440 map<int32_t, uint8_t> os;
441 for (auto p : new_state) {
442 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
443 // that an old client could not understand.
444 // skip those!
445 uint8_t s = p.second;
446 if (p.second != 0 && s == 0)
447 continue;
448 os[p.first] = s;
449 }
450 uint32_t n = os.size();
451 encode(n, bl);
452 for (auto p : os) {
453 encode(p.first, bl);
454 encode(p.second, bl);
455 }
456 }
457 encode(new_weight, bl);
458 // for encode(new_pg_temp, bl);
459 n = new_pg_temp.size();
460 encode(n, bl);
461
462 for (const auto &pg_temp : new_pg_temp) {
463 old_pg_t opg = pg_temp.first.get_old_pg();
464 encode(opg, bl);
465 encode(pg_temp.second, bl);
466 }
467 }
468
469 void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
470 {
471 using ceph::encode;
472 if ((features & CEPH_FEATURE_PGID64) == 0) {
473 encode_client_old(bl);
474 return;
475 }
476
477 // base
478 __u16 v = 6;
479 encode(v, bl);
480 encode(fsid, bl);
481 encode(epoch, bl);
482 encode(modified, bl);
483 encode(new_pool_max, bl);
484 encode(new_flags, bl);
485 encode(fullmap, bl);
486 encode(crush, bl);
487
488 encode(new_max_osd, bl);
489 encode(new_pools, bl, features);
490 encode(new_pool_names, bl);
491 encode(old_pools, bl);
492 encode(new_up_client, bl, features);
493 {
494 map<int32_t, uint8_t> os;
495 for (auto p : new_state) {
496 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
497 // that an old client could not understand.
498 // skip those!
499 uint8_t s = p.second;
500 if (p.second != 0 && s == 0)
501 continue;
502 os[p.first] = s;
503 }
504 uint32_t n = os.size();
505 encode(n, bl);
506 for (auto p : os) {
507 encode(p.first, bl);
508 encode(p.second, bl);
509 }
510 }
511 encode(new_weight, bl);
512 encode(new_pg_temp, bl);
513
514 // extended
515 __u16 ev = 10;
516 encode(ev, bl);
517 encode(new_hb_back_up, bl, features);
518 encode(new_up_thru, bl);
519 encode(new_last_clean_interval, bl);
520 encode(new_lost, bl);
521 encode(new_blocklist, bl, features);
522 encode(old_blocklist, bl, features);
523 encode(new_up_cluster, bl, features);
524 encode(cluster_snapshot, bl);
525 encode(new_uuid, bl);
526 encode(new_xinfo, bl, features);
527 encode(new_hb_front_up, bl, features);
528 }
529
530 template<class T>
531 static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
532 {
533 uint32_t n = m.size();
534 encode(n, bl);
535 for (auto& i : m) {
536 encode(i.first, bl);
537 encode(i.second.legacy_addr(), bl, f);
538 }
539 }
540
541 template<class T>
542 static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
543 {
544 uint32_t n = m.size();
545 encode(n, bl);
546 for (auto& i : m) {
547 if (i) {
548 encode(i->legacy_addr(), bl, f);
549 } else {
550 encode(entity_addr_t(), bl, f);
551 }
552 }
553 }
554
555 /* for a description of osdmap incremental versions, and when they were
556 * introduced, please refer to
557 * doc/dev/osd_internals/osdmap_versions.txt
558 */
559 void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
560 {
561 using ceph::encode;
562 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
563 encode_classic(bl, features);
564 return;
565 }
566
567 // only a select set of callers should *ever* be encoding new
568 // OSDMaps. others should be passing around the canonical encoded
569 // buffers from on high. select out those callers by passing in an
570 // "impossible" feature bit.
571 ceph_assert(features & CEPH_FEATURE_RESERVED);
572 features &= ~CEPH_FEATURE_RESERVED;
573
574 size_t start_offset = bl.length();
575 size_t tail_offset;
576 size_t crc_offset;
577 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
578
579 // meta-encoding: how we include client-used and osd-specific data
580 ENCODE_START(8, 7, bl);
581
582 {
583 uint8_t v = 9;
584 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
585 v = 3;
586 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
587 v = 5;
588 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
589 v = 6;
590 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
591 v = 8;
592 } */
593 ENCODE_START(v, 1, bl); // client-usable data
594 encode(fsid, bl);
595 encode(epoch, bl);
596 encode(modified, bl);
597 encode(new_pool_max, bl);
598 encode(new_flags, bl);
599 encode(fullmap, bl);
600 encode(crush, bl);
601
602 encode(new_max_osd, bl);
603 encode(new_pools, bl, features);
604 encode(new_pool_names, bl);
605 encode(old_pools, bl);
606 if (v >= 7) {
607 encode(new_up_client, bl, features);
608 } else {
609 encode_addrvec_map_as_addr(new_up_client, bl, features);
610 }
611 if (v >= 5) {
612 encode(new_state, bl);
613 } else {
614 map<int32_t, uint8_t> os;
615 for (auto p : new_state) {
616 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
617 // that an old client could not understand.
618 // skip those!
619 uint8_t s = p.second;
620 if (p.second != 0 && s == 0)
621 continue;
622 os[p.first] = s;
623 }
624 uint32_t n = os.size();
625 encode(n, bl);
626 for (auto p : os) {
627 encode(p.first, bl);
628 encode(p.second, bl);
629 }
630 }
631 encode(new_weight, bl);
632 encode(new_pg_temp, bl);
633 encode(new_primary_temp, bl);
634 encode(new_primary_affinity, bl);
635 encode(new_erasure_code_profiles, bl);
636 encode(old_erasure_code_profiles, bl);
637 if (v >= 4) {
638 encode(new_pg_upmap, bl);
639 encode(old_pg_upmap, bl);
640 encode(new_pg_upmap_items, bl);
641 encode(old_pg_upmap_items, bl);
642 }
643 if (v >= 6) {
644 encode(new_removed_snaps, bl);
645 encode(new_purged_snaps, bl);
646 }
647 if (v >= 8) {
648 encode(new_last_up_change, bl);
649 encode(new_last_in_change, bl);
650 }
651 if (v >= 9) {
652 encode(new_pg_upmap_primary, bl);
653 encode(old_pg_upmap_primary, bl);
654 }
655 ENCODE_FINISH(bl); // client-usable data
656 }
657
658 {
659 uint8_t target_v = 9; // if bumping this, be aware of allow_crimson 12
660 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
661 target_v = 2;
662 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
663 target_v = 6;
664 }
665 if (change_stretch_mode) {
666 target_v = std::max((uint8_t)10, target_v);
667 }
668 if (!new_range_blocklist.empty() ||
669 !old_range_blocklist.empty()) {
670 target_v = std::max((uint8_t)11, target_v);
671 }
672 if (mutate_allow_crimson != mutate_allow_crimson_t::NONE) {
673 target_v = std::max((uint8_t)12, target_v);
674 }
675 ENCODE_START(target_v, 1, bl); // extended, osd-only data
676 if (target_v < 7) {
677 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
678 } else {
679 encode(new_hb_back_up, bl, features);
680 }
681 encode(new_up_thru, bl);
682 encode(new_last_clean_interval, bl);
683 encode(new_lost, bl);
684 encode(new_blocklist, bl, features);
685 encode(old_blocklist, bl, features);
686 if (target_v < 7) {
687 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
688 } else {
689 encode(new_up_cluster, bl, features);
690 }
691 encode(cluster_snapshot, bl);
692 encode(new_uuid, bl);
693 encode(new_xinfo, bl, features);
694 if (target_v < 7) {
695 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
696 } else {
697 encode(new_hb_front_up, bl, features);
698 }
699 encode(features, bl); // NOTE: features arg, not the member
700 if (target_v >= 3) {
701 encode(new_nearfull_ratio, bl);
702 encode(new_full_ratio, bl);
703 encode(new_backfillfull_ratio, bl);
704 }
705 // 5 was string-based new_require_min_compat_client
706 if (target_v >= 6) {
707 encode(new_require_min_compat_client, bl);
708 encode(new_require_osd_release, bl);
709 }
710 if (target_v >= 8) {
711 encode(new_crush_node_flags, bl);
712 }
713 if (target_v >= 9) {
714 encode(new_device_class_flags, bl);
715 }
716 if (target_v >= 10) {
717 encode(change_stretch_mode, bl);
718 encode(new_stretch_bucket_count, bl);
719 encode(new_degraded_stretch_mode, bl);
720 encode(new_recovering_stretch_mode, bl);
721 encode(new_stretch_mode_bucket, bl);
722 encode(stretch_mode_enabled, bl);
723 }
724 if (target_v >= 11) {
725 encode(new_range_blocklist, bl, features);
726 encode(old_range_blocklist, bl, features);
727 }
728 if (target_v >= 12) {
729 encode(mutate_allow_crimson, bl);
730 }
731 ENCODE_FINISH(bl); // osd-only data
732 }
733
734 crc_offset = bl.length();
735 crc_filler = bl.append_hole(sizeof(uint32_t));
736 tail_offset = bl.length();
737
738 encode(full_crc, bl);
739
740 ENCODE_FINISH(bl); // meta-encoding wrapper
741
742 // fill in crc
743 ceph::buffer::list front;
744 front.substr_of(bl, start_offset, crc_offset - start_offset);
745 inc_crc = front.crc32c(-1);
746 ceph::buffer::list tail;
747 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
748 inc_crc = tail.crc32c(inc_crc);
749 ceph_le32 crc_le;
750 crc_le = inc_crc;
751 crc_filler->copy_in(4u, (char*)&crc_le);
752 have_crc = true;
753 }
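// Illustration (not in the original file): the crc back-fill above reserves a
// 4-byte hole, computes crc32c over the encoded bytes on either side of it,
// and then writes the value into the hole. Conceptually, with a stand-in
// crc32c(seed, data, len) helper rather than the ceph::buffer API:
//
//   uint32_t crc = crc32c(-1, front_bytes, front_len);  // bytes before the hole
//   crc = crc32c(crc, tail_bytes, tail_len);            // bytes after the hole
//   ceph_le32 crc_le = crc;                             // little-endian on the wire
//   memcpy(hole_ptr, &crc_le, sizeof(crc_le));          // back-fill the hole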
754
755 void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
756 {
757 using ceph::decode;
758 __u32 n, t;
759 // base
760 __u16 v;
761 decode(v, p);
762 decode(fsid, p);
763 decode(epoch, p);
764 decode(modified, p);
765 if (v == 4 || v == 5) {
766 decode(n, p);
767 new_pool_max = n;
768 } else if (v >= 6)
769 decode(new_pool_max, p);
770 decode(new_flags, p);
771 decode(fullmap, p);
772 decode(crush, p);
773
774 decode(new_max_osd, p);
775 if (v < 6) {
776 new_pools.clear();
777 decode(n, p);
778 while (n--) {
779 decode(t, p);
780 decode(new_pools[t], p);
781 }
782 } else {
783 decode(new_pools, p);
784 }
785 if (v == 5) {
786 new_pool_names.clear();
787 decode(n, p);
788 while (n--) {
789 decode(t, p);
790 decode(new_pool_names[t], p);
791 }
792 } else if (v >= 6) {
793 decode(new_pool_names, p);
794 }
795 if (v < 6) {
796 old_pools.clear();
797 decode(n, p);
798 while (n--) {
799 decode(t, p);
800 old_pools.insert(t);
801 }
802 } else {
803 decode(old_pools, p);
804 }
805 decode(new_up_client, p);
806 {
807 map<int32_t,uint8_t> ns;
808 decode(ns, p);
809 for (auto q : ns) {
810 new_state[q.first] = q.second;
811 }
812 }
813 decode(new_weight, p);
814
815 if (v < 6) {
816 new_pg_temp.clear();
817 decode(n, p);
818 while (n--) {
819 old_pg_t opg;
820 ceph::decode_raw(opg, p);
821 decode(new_pg_temp[pg_t(opg)], p);
822 }
823 } else {
824 decode(new_pg_temp, p);
825 }
826
827 // decode short map, too.
828 if (v == 5 && p.end())
829 return;
830
831 // extended
832 __u16 ev = 0;
833 if (v >= 5)
834 decode(ev, p);
835 decode(new_hb_back_up, p);
836 if (v < 5)
837 decode(new_pool_names, p);
838 decode(new_up_thru, p);
839 decode(new_last_clean_interval, p);
840 decode(new_lost, p);
841 decode(new_blocklist, p);
842 decode(old_blocklist, p);
843 if (ev >= 6)
844 decode(new_up_cluster, p);
845 if (ev >= 7)
846 decode(cluster_snapshot, p);
847 if (ev >= 8)
848 decode(new_uuid, p);
849 if (ev >= 9)
850 decode(new_xinfo, p);
851 if (ev >= 10)
852 decode(new_hb_front_up, p);
853 }
854
855 /* for a description of osdmap incremental versions, and when they were
856 * introduced, please refer to
857 * doc/dev/osd_internals/osdmap_versions.txt
858 */
859 void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
860 {
861 using ceph::decode;
862 /**
863 * Older encodings of the Incremental had a single struct_v which
864 * covered the whole encoding, and was prior to our modern
865 * stuff which includes a compatv and a size. So if we see
866 * a struct_v < 7, we must rewind to the beginning and use our
867 * classic decoder.
868 */
869 size_t start_offset = bl.get_off();
870 size_t tail_offset = 0;
871 ceph::buffer::list crc_front, crc_tail;
872
873 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
874 if (struct_v < 7) {
875 bl.seek(start_offset);
876 decode_classic(bl);
877 encode_features = 0;
878 if (struct_v >= 6)
879 encode_features = CEPH_FEATURE_PGID64;
880 else
881 encode_features = 0;
882 return;
883 }
884 {
885 DECODE_START(8, bl); // client-usable data
886 decode(fsid, bl);
887 decode(epoch, bl);
888 decode(modified, bl);
889 decode(new_pool_max, bl);
890 decode(new_flags, bl);
891 decode(fullmap, bl);
892 decode(crush, bl);
893
894 decode(new_max_osd, bl);
895 decode(new_pools, bl);
896 decode(new_pool_names, bl);
897 decode(old_pools, bl);
898 decode(new_up_client, bl);
899 if (struct_v >= 5) {
900 decode(new_state, bl);
901 } else {
902 map<int32_t,uint8_t> ns;
903 decode(ns, bl);
904 for (auto q : ns) {
905 new_state[q.first] = q.second;
906 }
907 }
908 decode(new_weight, bl);
909 decode(new_pg_temp, bl);
910 decode(new_primary_temp, bl);
911 if (struct_v >= 2)
912 decode(new_primary_affinity, bl);
913 else
914 new_primary_affinity.clear();
915 if (struct_v >= 3) {
916 decode(new_erasure_code_profiles, bl);
917 decode(old_erasure_code_profiles, bl);
918 } else {
919 new_erasure_code_profiles.clear();
920 old_erasure_code_profiles.clear();
921 }
922 if (struct_v >= 4) {
923 decode(new_pg_upmap, bl);
924 decode(old_pg_upmap, bl);
925 decode(new_pg_upmap_items, bl);
926 decode(old_pg_upmap_items, bl);
927 }
928 if (struct_v >= 6) {
929 decode(new_removed_snaps, bl);
930 decode(new_purged_snaps, bl);
931 }
932 if (struct_v >= 8) {
933 decode(new_last_up_change, bl);
934 decode(new_last_in_change, bl);
935 }
936 DECODE_FINISH(bl); // client-usable data
937 }
938
939 {
940 DECODE_START(10, bl); // extended, osd-only data
941 decode(new_hb_back_up, bl);
942 decode(new_up_thru, bl);
943 decode(new_last_clean_interval, bl);
944 decode(new_lost, bl);
945 decode(new_blocklist, bl);
946 decode(old_blocklist, bl);
947 decode(new_up_cluster, bl);
948 decode(cluster_snapshot, bl);
949 decode(new_uuid, bl);
950 decode(new_xinfo, bl);
951 decode(new_hb_front_up, bl);
952 if (struct_v >= 2)
953 decode(encode_features, bl);
954 else
955 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
956 if (struct_v >= 3) {
957 decode(new_nearfull_ratio, bl);
958 decode(new_full_ratio, bl);
959 } else {
960 new_nearfull_ratio = -1;
961 new_full_ratio = -1;
962 }
963 if (struct_v >= 4) {
964 decode(new_backfillfull_ratio, bl);
965 } else {
966 new_backfillfull_ratio = -1;
967 }
968 if (struct_v == 5) {
969 string r;
970 decode(r, bl);
971 if (r.length()) {
972 new_require_min_compat_client = ceph_release_from_name(r);
973 }
974 }
975 if (struct_v >= 6) {
976 decode(new_require_min_compat_client, bl);
977 decode(new_require_osd_release, bl);
978 } else {
979 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
980 // only for compat with post-kraken pre-luminous test clusters
981 new_require_osd_release = ceph_release_t::luminous;
982 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
983 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
984 new_require_osd_release = ceph_release_t::kraken;
985 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
986 new_require_osd_release = ceph_release_t::jewel;
987 } else {
988 new_require_osd_release = ceph_release_t::unknown;
989 }
990 }
991 if (struct_v >= 8) {
992 decode(new_crush_node_flags, bl);
993 }
994 if (struct_v >= 9) {
995 decode(new_device_class_flags, bl);
996 }
997 if (struct_v >= 10) {
998 decode(change_stretch_mode, bl);
999 decode(new_stretch_bucket_count, bl);
1000 decode(new_degraded_stretch_mode, bl);
1001 decode(new_recovering_stretch_mode, bl);
1002 decode(new_stretch_mode_bucket, bl);
1003 decode(stretch_mode_enabled, bl);
1004 }
1005 if (struct_v >= 11) {
1006 decode(new_range_blocklist, bl);
1007 decode(old_range_blocklist, bl);
1008 }
1009 if (struct_v >= 12) {
1010 decode(mutate_allow_crimson, bl);
1011 }
1012 DECODE_FINISH(bl); // osd-only data
1013 }
1014
1015 if (struct_v >= 8) {
1016 have_crc = true;
1017 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
1018 decode(inc_crc, bl);
1019 tail_offset = bl.get_off();
1020 decode(full_crc, bl);
1021 } else {
1022 have_crc = false;
1023 full_crc = 0;
1024 inc_crc = 0;
1025 }
1026
1027 DECODE_FINISH(bl); // wrapper
1028
1029 if (have_crc) {
1030 // verify crc
1031 uint32_t actual = crc_front.crc32c(-1);
1032 if (tail_offset < bl.get_off()) {
1033 ceph::buffer::list tail;
1034 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
1035 actual = tail.crc32c(actual);
1036 }
1037 if (inc_crc != actual) {
1038 ostringstream ss;
1039 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
1040 string s = ss.str();
1041 throw ceph::buffer::malformed_input(s.c_str());
1042 }
1043 }
1044 }
1045
1046 void OSDMap::Incremental::dump(Formatter *f) const
1047 {
1048 f->dump_int("epoch", epoch);
1049 f->dump_stream("fsid") << fsid;
1050 f->dump_stream("modified") << modified;
1051 f->dump_stream("new_last_up_change") << new_last_up_change;
1052 f->dump_stream("new_last_in_change") << new_last_in_change;
1053 f->dump_int("new_pool_max", new_pool_max);
1054 f->dump_int("new_flags", new_flags);
1055 f->dump_float("new_full_ratio", new_full_ratio);
1056 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1057 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
1058 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1059 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
1060 f->dump_unsigned("mutate_allow_crimson", static_cast<unsigned>(mutate_allow_crimson));
1061
1062 if (fullmap.length()) {
1063 f->open_object_section("full_map");
1064 OSDMap full;
1065 ceph::buffer::list fbl = fullmap; // kludge around constness.
1066 auto p = fbl.cbegin();
1067 full.decode(p);
1068 full.dump(f);
1069 f->close_section();
1070 }
1071 if (crush.length()) {
1072 f->open_object_section("crush");
1073 CrushWrapper c;
1074 ceph::buffer::list tbl = crush; // kludge around constness.
1075 auto p = tbl.cbegin();
1076 c.decode(p);
1077 c.dump(f);
1078 f->close_section();
1079 }
1080
1081 f->dump_int("new_max_osd", new_max_osd);
1082
1083 f->open_array_section("new_pools");
1084
1085 for (const auto &new_pool : new_pools) {
1086 f->open_object_section("pool");
1087 f->dump_int("pool", new_pool.first);
1088 new_pool.second.dump(f);
1089 f->close_section();
1090 }
1091 f->close_section();
1092 f->open_array_section("new_pool_names");
1093
1094 for (const auto &new_pool_name : new_pool_names) {
1095 f->open_object_section("pool_name");
1096 f->dump_int("pool", new_pool_name.first);
1097 f->dump_string("name", new_pool_name.second);
1098 f->close_section();
1099 }
1100 f->close_section();
1101 f->open_array_section("old_pools");
1102
1103 for (const auto &old_pool : old_pools)
1104 f->dump_int("pool", old_pool);
1105 f->close_section();
1106
1107 f->open_array_section("new_up_osds");
1108
1109 for (const auto &upclient : new_up_client) {
1110 f->open_object_section("osd");
1111 f->dump_int("osd", upclient.first);
1112 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1113 f->dump_object("public_addrs", upclient.second);
1114 if (auto p = new_up_cluster.find(upclient.first);
1115 p != new_up_cluster.end()) {
1116 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1117 f->dump_object("cluster_addrs", p->second);
1118 }
1119 if (auto p = new_hb_back_up.find(upclient.first);
1120 p != new_hb_back_up.end()) {
1121 f->dump_object("heartbeat_back_addrs", p->second);
1122 }
1123 if (auto p = new_hb_front_up.find(upclient.first);
1124 p != new_hb_front_up.end()) {
1125 f->dump_object("heartbeat_front_addrs", p->second);
1126 }
1127 f->close_section();
1128 }
1129 f->close_section();
1130
1131 f->open_array_section("new_weight");
1132
1133 for (const auto &weight : new_weight) {
1134 f->open_object_section("osd");
1135 f->dump_int("osd", weight.first);
1136 f->dump_int("weight", weight.second);
1137 f->close_section();
1138 }
1139 f->close_section();
1140
1141 f->open_array_section("osd_state_xor");
1142 for (const auto &ns : new_state) {
1143 f->open_object_section("osd");
1144 f->dump_int("osd", ns.first);
1145 set<string> st;
1146 calc_state_set(new_state.find(ns.first)->second, st);
1147 f->open_array_section("state_xor");
1148 for (auto &state : st)
1149 f->dump_string("state", state);
1150 f->close_section();
1151 f->close_section();
1152 }
1153 f->close_section();
1154
1155 f->open_array_section("new_pg_temp");
1156
1157 for (const auto &pg_temp : new_pg_temp) {
1158 f->open_object_section("pg");
1159 f->dump_stream("pgid") << pg_temp.first;
1160 f->open_array_section("osds");
1161
1162 for (const auto &osd : pg_temp.second)
1163 f->dump_int("osd", osd);
1164 f->close_section();
1165 f->close_section();
1166 }
1167 f->close_section();
1168
1169 f->open_array_section("primary_temp");
1170
1171 for (const auto &primary_temp : new_primary_temp) {
1172 f->dump_stream("pgid") << primary_temp.first;
1173 f->dump_int("osd", primary_temp.second);
1174 }
1175 f->close_section(); // primary_temp
1176
1177 f->open_array_section("new_pg_upmap");
1178 for (auto& i : new_pg_upmap) {
1179 f->open_object_section("mapping");
1180 f->dump_stream("pgid") << i.first;
1181 f->open_array_section("osds");
1182 for (auto osd : i.second) {
1183 f->dump_int("osd", osd);
1184 }
1185 f->close_section();
1186 f->close_section();
1187 }
1188 f->close_section();
1189 f->open_array_section("old_pg_upmap");
1190 for (auto& i : old_pg_upmap) {
1191 f->dump_stream("pgid") << i;
1192 }
1193 f->close_section();
1194
1195 f->open_array_section("new_pg_upmap_items");
1196 for (auto& i : new_pg_upmap_items) {
1197 f->open_object_section("mapping");
1198 f->dump_stream("pgid") << i.first;
1199 f->open_array_section("mappings");
1200 for (auto& p : i.second) {
1201 f->open_object_section("mapping");
1202 f->dump_int("from", p.first);
1203 f->dump_int("to", p.second);
1204 f->close_section();
1205 }
1206 f->close_section();
1207 f->close_section();
1208 }
1209 f->close_section();
1210 f->open_array_section("old_pg_upmap_items");
1211 for (auto& i : old_pg_upmap_items) {
1212 f->dump_stream("pgid") << i;
1213 }
1214 f->close_section();
1215
1216 // dump upmap_primaries
1217 f->open_array_section("new_pg_upmap_primaries");
1218 for (auto& [pg, osd] : new_pg_upmap_primary) {
1219 f->open_object_section("primary_mapping");
1220 f->dump_stream("pgid") << pg;
1221 f->dump_int("primary_osd", osd);
1222 f->close_section();
1223 }
1224 f->close_section(); // new_pg_upmap_primaries
1225
1226 // dump old_pg_upmap_primaries (removed primary mappings)
1227 f->open_array_section("old_pg_upmap_primaries");
1228 for (auto& pg : old_pg_upmap_primary) {
1229 f->dump_stream("pgid") << pg;
1230 }
1231 f->close_section(); // old_pg_upmap_primaries
1232
1233 f->open_array_section("new_up_thru");
1234
1235 for (const auto &up_thru : new_up_thru) {
1236 f->open_object_section("osd");
1237 f->dump_int("osd", up_thru.first);
1238 f->dump_int("up_thru", up_thru.second);
1239 f->close_section();
1240 }
1241 f->close_section();
1242
1243 f->open_array_section("new_lost");
1244
1245 for (const auto &lost : new_lost) {
1246 f->open_object_section("osd");
1247 f->dump_int("osd", lost.first);
1248 f->dump_int("epoch_lost", lost.second);
1249 f->close_section();
1250 }
1251 f->close_section();
1252
1253 f->open_array_section("new_last_clean_interval");
1254
1255 for (const auto &last_clean_interval : new_last_clean_interval) {
1256 f->open_object_section("osd");
1257 f->dump_int("osd", last_clean_interval.first);
1258 f->dump_int("first", last_clean_interval.second.first);
1259 f->dump_int("last", last_clean_interval.second.second);
1260 f->close_section();
1261 }
1262 f->close_section();
1263
1264 f->open_array_section("new_blocklist");
1265 for (const auto &blist : new_blocklist) {
1266 stringstream ss;
1267 ss << blist.first;
1268 f->dump_stream(ss.str().c_str()) << blist.second;
1269 }
1270 f->close_section();
1271 f->open_array_section("old_blocklist");
1272 for (const auto &blist : old_blocklist)
1273 f->dump_stream("addr") << blist;
1274 f->close_section();
1275 f->open_array_section("new_range_blocklist");
1276 for (const auto &blist : new_range_blocklist) {
1277 stringstream ss;
1278 ss << blist.first;
1279 f->dump_stream(ss.str().c_str()) << blist.second;
1280 }
1281 f->close_section();
1282 f->open_array_section("old_range_blocklist");
1283 for (const auto &blist : old_range_blocklist)
1284 f->dump_stream("addr") << blist;
1285 f->close_section();
1286
1287 f->open_array_section("new_xinfo");
1288 for (const auto &xinfo : new_xinfo) {
1289 f->open_object_section("xinfo");
1290 f->dump_int("osd", xinfo.first);
1291 xinfo.second.dump(f);
1292 f->close_section();
1293 }
1294 f->close_section();
1295
1296 if (cluster_snapshot.size())
1297 f->dump_string("cluster_snapshot", cluster_snapshot);
1298
1299 f->open_array_section("new_uuid");
1300 for (const auto &uuid : new_uuid) {
1301 f->open_object_section("osd");
1302 f->dump_int("osd", uuid.first);
1303 f->dump_stream("uuid") << uuid.second;
1304 f->close_section();
1305 }
1306 f->close_section();
1307
1308 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1309 f->open_array_section("old_erasure_code_profiles");
1310 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1311 f->dump_string("old", erasure_code_profile);
1312 }
1313 f->close_section();
1314
1315 f->open_array_section("new_removed_snaps");
1316 for (auto& p : new_removed_snaps) {
1317 f->open_object_section("pool");
1318 f->dump_int("pool", p.first);
1319 f->open_array_section("snaps");
1320 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1321 f->open_object_section("interval");
1322 f->dump_unsigned("begin", q.get_start());
1323 f->dump_unsigned("length", q.get_len());
1324 f->close_section();
1325 }
1326 f->close_section();
1327 f->close_section();
1328 }
1329 f->close_section();
1330 f->open_array_section("new_purged_snaps");
1331 for (auto& p : new_purged_snaps) {
1332 f->open_object_section("pool");
1333 f->dump_int("pool", p.first);
1334 f->open_array_section("snaps");
1335 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1336 f->open_object_section("interval");
1337 f->dump_unsigned("begin", q.get_start());
1338 f->dump_unsigned("length", q.get_len());
1339 f->close_section();
1340 }
1341 f->close_section();
1342 f->close_section();
1343 }
1344 f->open_array_section("new_crush_node_flags");
1345 for (auto& i : new_crush_node_flags) {
1346 f->open_object_section("node");
1347 f->dump_int("id", i.first);
1348 set<string> st;
1349 calc_state_set(i.second, st);
1350 for (auto& j : st) {
1351 f->dump_string("flag", j);
1352 }
1353 f->close_section();
1354 }
1355 f->close_section();
1356 f->open_array_section("new_device_class_flags");
1357 for (auto& i : new_device_class_flags) {
1358 f->open_object_section("device_class");
1359 f->dump_int("id", i.first);
1360 set<string> st;
1361 calc_state_set(i.second, st);
1362 for (auto& j : st) {
1363 f->dump_string("flag", j);
1364 }
1365 f->close_section();
1366 }
1367 f->close_section();
1368 f->open_object_section("stretch_mode");
1369 {
1370 f->dump_bool("change_stretch_mode", change_stretch_mode);
1371 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1372 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1373 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1374 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1375 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1376 }
1377 f->close_section();
1378 f->close_section();
1379 }
1380
1381 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1382 {
1383 o.push_back(new Incremental);
1384 }
1385
1386 // ----------------------------------
1387 // OSDMap
1388
1389 void OSDMap::set_epoch(epoch_t e)
1390 {
1391 epoch = e;
1392 for (auto &pool : pools)
1393 pool.second.last_change = e;
1394 }
1395
1396 OSDMap::range_bits::range_bits() : ipv6(false) {
1397 memset(&bits, 0, sizeof(bits));
1398 }
1399
1400 OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) {
1401 memset(&bits, 0, sizeof(bits));
1402 parse(addr);
1403 }
1404
1405 void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr,
1406 uint64_t *upper, uint64_t *lower)
1407 {
1408 *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 |
1409 ((uint64_t)(ntohl(*(uint32_t*)(&addr[4]))));
1410 *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 |
1411 ((uint64_t)(ntohl(*(uint32_t*)(&addr[12]))));
1412 }
1413
1414 void OSDMap::range_bits::parse(const entity_addr_t& addr) {
1415 // parse it into meaningful data
1416 if (addr.is_ipv6()) {
1417 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr,
1418 &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits);
1419 int32_t lower_shift = std::min(128-
1420 static_cast<int32_t>(addr.get_nonce()), 64);
1421 int32_t upper_shift = std::max(64- //(128-b.first.get_nonce())-64
1422 static_cast<int32_t>(addr.get_nonce()), 0);
1423
1424 auto get_mask = [](int32_t shift) -> uint64_t {
1425 if (shift >= 0 && shift < 64) {
1426 return UINT64_MAX << shift;
1427 }
1428 return 0;
1429 };
1430
1431 bits.ipv6.lower_mask = get_mask(lower_shift);
1432 bits.ipv6.upper_mask = get_mask(upper_shift);
1433 ipv6 = true;
1434 } else if (addr.is_ipv4()) {
1435 bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr);
1436 if (addr.get_nonce() > 0) {
1437 bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce());
1438 } else {
1439 bits.ipv4.mask = 0;
1440 }
1441 } else {
1442 // uh...
1443 }
1444 }
1445
1446 bool OSDMap::range_bits::matches(const entity_addr_t& addr) const {
1447 if (addr.is_ipv4() && !ipv6) {
1448 return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) ==
1449 (bits.ipv4.ip_32_bits & bits.ipv4.mask));
1450 } else if (addr.is_ipv6() && ipv6) {
1451 uint64_t upper_64, lower_64;
1452 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64);
1453 return (((upper_64 & bits.ipv6.upper_mask) ==
1454 (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) &&
1455 ((lower_64 & bits.ipv6.lower_mask) ==
1456 (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask)));
1457 }
1458 return false;
1459 }
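// Illustration (not in the original file): the mask logic above treats the
// address nonce as a prefix length. A self-contained sketch of the IPv4 case
// (the names and the prefix_len parameter are illustrative, not OSDMap API):
static inline bool ipv4_prefix_match_sketch(uint32_t addr_host_order,
                                            uint32_t range_host_order,
                                            unsigned prefix_len /* 0..32 */) {
  uint32_t mask = prefix_len ? (UINT32_MAX << (32 - prefix_len)) : 0;
  return (addr_host_order & mask) == (range_host_order & mask);
}
// e.g. a /24 range gives mask 0xffffff00, so 192.168.1.77 matches a
// 192.168.1.0/24 entry.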
1460
1461 bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const
1462 {
1463 if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl;
1464 if (blocklist.empty() && range_blocklist.empty()) {
1465 if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl;
1466 return false;
1467 }
1468
1469 // all blocklist entries are type ANY for nautilus+
1470 // FIXME: avoid this copy!
1471 entity_addr_t a = orig;
1472 if (require_osd_release < ceph_release_t::nautilus) {
1473 a.set_type(entity_addr_t::TYPE_LEGACY);
1474 } else {
1475 a.set_type(entity_addr_t::TYPE_ANY);
1476 }
1477
1478 // this specific instance?
1479 if (blocklist.count(a)) {
1480 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
1481 return true;
1482 }
1483
1484 // is entire ip blocklisted?
1485 if (a.is_ip()) {
1486 a.set_port(0);
1487 a.set_nonce(0);
1488 if (blocklist.count(a)) {
1489 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
1490 return true;
1491 }
1492 }
1493
1494 // is it in a blocklisted range?
1495 for (const auto& i : calculated_ranges) {
1496 bool blocked = i.second.matches(a);
1497 if (blocked) {
1498 if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl;
1499 return true;
1500 }
1501 }
1502
1503 if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl;
1504 return false;
1505 }
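// Illustration (not in the original file): is_blocklisted() checks the exact
// instance first, then the whole IP (port and nonce zeroed), then any
// blocklisted range. A hedged usage sketch -- the address string and the use
// of g_ceph_context are illustrative:
//
//   entity_addr_t addr;
//   if (addr.parse("192.168.0.1:6789/0")) {
//     bool blocked = osdmap.is_blocklisted(addr, g_ceph_context);
//     // react to "blocked" here
//   }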
1506
1507 bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const
1508 {
1509 if (blocklist.empty() && range_blocklist.empty())
1510 return false;
1511
1512 for (auto& a : av.v) {
1513 if (is_blocklisted(a, cct)) {
1514 return true;
1515 }
1516 }
1517
1518 return false;
1519 }
1520
1521 void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl,
1522 std::list<std::pair<entity_addr_t,utime_t> > *rl) const
1523 {
1524 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
1525 std::copy(range_blocklist.begin(), range_blocklist.end(),
1526 std::back_inserter(*rl));
1527 }
1528
1529 void OSDMap::get_blocklist(std::set<entity_addr_t> *bl,
1530 std::set<entity_addr_t> *rl) const
1531 {
1532 for (const auto &i : blocklist) {
1533 bl->insert(i.first);
1534 }
1535 for (const auto &i : range_blocklist) {
1536 rl->insert(i.first);
1537 }
1538 }
1539
1540 void OSDMap::set_max_osd(int m)
1541 {
1542 max_osd = m;
1543 osd_state.resize(max_osd, 0);
1544 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1545 osd_info.resize(max_osd);
1546 osd_xinfo.resize(max_osd);
1547 osd_addrs->client_addrs.resize(max_osd);
1548 osd_addrs->cluster_addrs.resize(max_osd);
1549 osd_addrs->hb_back_addrs.resize(max_osd);
1550 osd_addrs->hb_front_addrs.resize(max_osd);
1551 osd_uuid->resize(max_osd);
1552 if (osd_primary_affinity)
1553 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1554
1555 calc_num_osds();
1556 }
1557
1558 int OSDMap::calc_num_osds()
1559 {
1560 num_osd = 0;
1561 num_up_osd = 0;
1562 num_in_osd = 0;
1563 for (int i=0; i<max_osd; i++) {
1564 if (osd_state[i] & CEPH_OSD_EXISTS) {
1565 ++num_osd;
1566 if (osd_state[i] & CEPH_OSD_UP) {
1567 ++num_up_osd;
1568 }
1569 if (get_weight(i) != CEPH_OSD_OUT) {
1570 ++num_in_osd;
1571 }
1572 }
1573 }
1574 return num_osd;
1575 }
1576
1577 void OSDMap::get_full_pools(CephContext *cct,
1578 set<int64_t> *full,
1579 set<int64_t> *backfillfull,
1580 set<int64_t> *nearfull) const
1581 {
1582 ceph_assert(full);
1583 ceph_assert(backfillfull);
1584 ceph_assert(nearfull);
1585 full->clear();
1586 backfillfull->clear();
1587 nearfull->clear();
1588
1589 vector<int> full_osds;
1590 vector<int> backfillfull_osds;
1591 vector<int> nearfull_osds;
1592 for (int i = 0; i < max_osd; ++i) {
1593 if (exists(i) && is_up(i) && is_in(i)) {
1594 if (osd_state[i] & CEPH_OSD_FULL)
1595 full_osds.push_back(i);
1596 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1597 backfillfull_osds.push_back(i);
1598 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1599 nearfull_osds.push_back(i);
1600 }
1601 }
1602
1603 for (auto i: full_osds) {
1604 get_pool_ids_by_osd(cct, i, full);
1605 }
1606 for (auto i: backfillfull_osds) {
1607 get_pool_ids_by_osd(cct, i, backfillfull);
1608 }
1609 for (auto i: nearfull_osds) {
1610 get_pool_ids_by_osd(cct, i, nearfull);
1611 }
1612 }
1613
1614 void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1615 set<int> *nearfull) const
1616 {
1617 full->clear();
1618 backfill->clear();
1619 nearfull->clear();
1620 for (int i = 0; i < max_osd; ++i) {
1621 if (exists(i) && is_up(i) && is_in(i)) {
1622 if (osd_state[i] & CEPH_OSD_FULL)
1623 full->emplace(i);
1624 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1625 backfill->emplace(i);
1626 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1627 nearfull->emplace(i);
1628 }
1629 }
1630 }
1631
1632 void OSDMap::get_all_osds(set<int32_t>& ls) const
1633 {
1634 for (int i=0; i<max_osd; i++)
1635 if (exists(i))
1636 ls.insert(i);
1637 }
1638
1639 void OSDMap::get_up_osds(set<int32_t>& ls) const
1640 {
1641 for (int i = 0; i < max_osd; i++) {
1642 if (is_up(i))
1643 ls.insert(i);
1644 }
1645 }
1646
1647 void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
1648 {
1649 for (int i = 0; i < max_osd; i++) {
1650 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
1651 ls.insert(i);
1652 }
1653 }
1654
1655 void OSDMap::get_flag_set(set<string> *flagset) const
1656 {
1657 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1658 if (flags & (1<<i)) {
1659 flagset->insert(get_flag_string(flags & (1<<i)));
1660 }
1661 }
1662 }
1663
1664 void OSDMap::calc_state_set(int state, set<string>& st)
1665 {
1666 unsigned t = state;
1667 for (unsigned s = 1; t; s <<= 1) {
1668 if (t & s) {
1669 t &= ~s;
1670 st.insert(ceph_osd_state_name(s));
1671 }
1672 }
1673 }
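// Illustration (not in the original file): calc_state_set() peels a state
// bitmask apart into individual flag names. A small usage sketch (the exact
// name strings come from ceph_osd_state_name()):
//
//   std::set<std::string> st;
//   OSDMap::calc_state_set(CEPH_OSD_EXISTS | CEPH_OSD_UP, st);
//   // st now holds one name per flag, e.g. "exists" and "up"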
1674
1675 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1676 {
1677 float max = 0;
1678 for (const auto &weight : weights) {
1679 if (weight.second > max)
1680 max = weight.second;
1681 }
1682
1683 for (const auto &weight : weights) {
1684 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1685 }
1686 }
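// Illustration (not in the original file): adjust_osd_weights() rescales the
// supplied weights so the largest maps to CEPH_OSD_IN (0x10000). A hedged
// usage sketch -- the osdmap object and the incremental setup are assumed:
//
//   std::map<int,double> w = { {0, 2.0}, {1, 1.0} };
//   OSDMap::Incremental inc;              // epoch/fsid setup omitted
//   osdmap.adjust_osd_weights(w, inc);
//   // inc.new_weight is now { {0, 0x10000}, {1, 0x8000} }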
1687
1688 int OSDMap::identify_osd(const entity_addr_t& addr) const
1689 {
1690 for (int i=0; i<max_osd; i++)
1691 if (exists(i) && (get_addrs(i).contains(addr) ||
1692 get_cluster_addrs(i).contains(addr)))
1693 return i;
1694 return -1;
1695 }
1696
1697 int OSDMap::identify_osd(const uuid_d& u) const
1698 {
1699 for (int i=0; i<max_osd; i++)
1700 if (exists(i) && get_uuid(i) == u)
1701 return i;
1702 return -1;
1703 }
1704
1705 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1706 {
1707 for (int i=0; i<max_osd; i++)
1708 if (exists(i) && (get_addrs(i).contains(addr) ||
1709 get_cluster_addrs(i).contains(addr) ||
1710 get_hb_back_addrs(i).contains(addr) ||
1711 get_hb_front_addrs(i).contains(addr)))
1712 return i;
1713 return -1;
1714 }
1715
1716 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1717 {
1718 for (int i=0; i<max_osd; i++)
1719 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1720 get_cluster_addrs(i).is_same_host(ip)))
1721 return i;
1722 return -1;
1723 }
1724
1725
1726 uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1727 {
1728 uint64_t features = 0; // things we actually have
1729 uint64_t mask = 0; // things we could have
1730
1731 if (crush->has_nondefault_tunables())
1732 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1733 if (crush->has_nondefault_tunables2())
1734 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1735 if (crush->has_nondefault_tunables3())
1736 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1737 if (crush->has_v4_buckets())
1738 features |= CEPH_FEATURE_CRUSH_V4;
1739 if (crush->has_nondefault_tunables5())
1740 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1741 if (crush->has_incompat_choose_args()) {
1742 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1743 }
1744 mask |= CEPH_FEATURES_CRUSH;
1745
1746 if (!pg_upmap.empty() || !pg_upmap_items.empty() || !pg_upmap_primaries.empty())
1747 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1748 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1749
1750 for (auto &pool: pools) {
1751 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1752 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1753 }
1754 if (!pool.second.tiers.empty() ||
1755 pool.second.is_tier()) {
1756 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1757 }
1758 int ruleid = pool.second.get_crush_rule();
1759 if (ruleid >= 0) {
1760 if (crush->is_v2_rule(ruleid))
1761 features |= CEPH_FEATURE_CRUSH_V2;
1762 if (crush->is_v3_rule(ruleid))
1763 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1764 if (crush->is_v5_rule(ruleid))
1765 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1766 }
1767 }
1768 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1769
1770 if (osd_primary_affinity) {
1771 for (int i = 0; i < max_osd; ++i) {
1772 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1773 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1774 break;
1775 }
1776 }
1777 }
1778 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1779
1780 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1781 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
1782 if (require_osd_release >= ceph_release_t::jewel) {
1783 features |= jewel_features;
1784 }
1785 mask |= jewel_features;
1786
1787 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1788 | CEPH_FEATURE_MSG_ADDR2;
1789 if (require_osd_release >= ceph_release_t::kraken) {
1790 features |= kraken_features;
1791 }
1792 mask |= kraken_features;
1793
1794 if (stretch_mode_enabled) {
1795 features |= CEPH_FEATUREMASK_STRETCH_MODE;
1796 mask |= CEPH_FEATUREMASK_STRETCH_MODE;
1797 }
1798 }
1799
1800 if (require_min_compat_client >= ceph_release_t::nautilus) {
1801 // if min_compat_client is >= nautilus, require v2 cephx signatures
1802 // from everyone
1803 features |= CEPH_FEATUREMASK_CEPHX_V2;
1804 } else if (require_osd_release >= ceph_release_t::nautilus &&
1805 entity_type == CEPH_ENTITY_TYPE_OSD) {
1806 // if osds are >= nautilus, at least require the signatures from them
1807 features |= CEPH_FEATUREMASK_CEPHX_V2;
1808 }
1809 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1810
1811 if (pmask)
1812 *pmask = mask;
1813 return features;
1814 }
1815
1816 ceph_release_t OSDMap::get_min_compat_client() const
1817 {
1818 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1819
1820 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1821 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1822 return ceph_release_t::luminous; // v12.2.0
1823 }
1824 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1825 return ceph_release_t::jewel; // v10.2.0
1826 }
1827 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1828 return ceph_release_t::hammer; // v0.94.0
1829 }
1830 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1831 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1832 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1833 return ceph_release_t::firefly; // v0.80.0
1834 }
1835 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1836 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1837 return ceph_release_t::dumpling; // v0.67.0
1838 }
1839 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1840 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
1841 }
1842 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
1843 }
1844
1845 ceph_release_t OSDMap::get_require_min_compat_client() const
1846 {
1847 return require_min_compat_client;
1848 }
1849
1850 void OSDMap::_calc_up_osd_features()
1851 {
1852 bool first = true;
1853 cached_up_osd_features = 0;
1854 for (int osd = 0; osd < max_osd; ++osd) {
1855 if (!is_up(osd))
1856 continue;
1857 const osd_xinfo_t &xi = get_xinfo(osd);
1858 if (xi.features == 0)
1859 continue; // bogus xinfo, maybe #20751 or similar, skipping
1860 if (first) {
1861 cached_up_osd_features = xi.features;
1862 first = false;
1863 } else {
1864 cached_up_osd_features &= xi.features;
1865 }
1866 }
1867 }
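// Illustration (not in the original file): the cached value above is the
// bitwise intersection of the feature words of every up OSD (zero-feature
// entries are skipped as bogus). With made-up feature bits:
//
//   uint64_t f0 = 0b1111, f1 = 0b1011, f2 = 0b0011;   // three up OSDs
//   uint64_t common = f0 & f1 & f2;                   // == 0b0011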
1868
1869 uint64_t OSDMap::get_up_osd_features() const
1870 {
1871 return cached_up_osd_features;
1872 }
1873
1874 void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1875 {
1876 using ceph::encode;
1877 if (o->epoch == n->epoch)
1878 return;
1879
1880 int diff = 0;
1881
1882 // do addrs match?
1883 if (o->max_osd != n->max_osd)
1884 diff++;
1885 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1886 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1887 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1888 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
1889 else
1890 diff++;
1891 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1892 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1893 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
1894 else
1895 diff++;
1896 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1897 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1898 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
1899 else
1900 diff++;
1901 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1902 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1903 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
1904 else
1905 diff++;
1906 }
1907 if (diff == 0) {
1908 // zoinks, no differences at all!
1909 n->osd_addrs = o->osd_addrs;
1910 }
1911
1912 // does crush match?
1913 ceph::buffer::list oc, nc;
1914 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1915 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1916 if (oc.contents_equal(nc)) {
1917 n->crush = o->crush;
1918 }
1919
1920 // does pg_temp match?
1921 if (*o->pg_temp == *n->pg_temp)
1922 n->pg_temp = o->pg_temp;
1923
1924 // does primary_temp match?
1925 if (o->primary_temp->size() == n->primary_temp->size()) {
1926 if (*o->primary_temp == *n->primary_temp)
1927 n->primary_temp = o->primary_temp;
1928 }
1929
1930 // do uuids match?
1931 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1932 *o->osd_uuid == *n->osd_uuid)
1933 n->osd_uuid = o->osd_uuid;
1934 }
1935
1936 void OSDMap::clean_temps(CephContext *cct,
1937 const OSDMap& oldmap,
1938 const OSDMap& nextmap,
1939 Incremental *pending_inc)
1940 {
1941 ldout(cct, 10) << __func__ << dendl;
1942
1943 for (auto pg : *nextmap.pg_temp) {
1944 // if pool does not exist, remove any existing pg_temps associated with
1945 // it. we don't care about pg_temps on the pending_inc either; if there
1946 // are new_pg_temp entries on the pending, clear them out just as well.
1947 if (!nextmap.have_pg_pool(pg.first.pool())) {
1948 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1949 << " for nonexistent pool " << pg.first.pool() << dendl;
1950 pending_inc->new_pg_temp[pg.first].clear();
1951 continue;
1952 }
1953 if (!nextmap.pg_exists(pg.first)) {
1954 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1955 << " for nonexistent pg " << dendl;
1956 pending_inc->new_pg_temp[pg.first].clear();
1957 continue;
1958 }
1959 // all osds down?
1960 unsigned num_up = 0;
1961 for (auto o : pg.second) {
1962 if (!nextmap.is_down(o)) {
1963 ++num_up;
1964 break;
1965 }
1966 }
1967 if (num_up == 0) {
1968 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1969 << " with all down osds " << pg.second << dendl;
1970 pending_inc->new_pg_temp[pg.first].clear();
1971 continue;
1972 }
1973 // redundant pg_temp?
1974 vector<int> raw_up;
1975 int primary;
1976 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
1977 bool remove = false;
1978 if (raw_up == pg.second) {
1979 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1980 << pg.second << " that matches raw_up mapping" << dendl;
1981 remove = true;
1982 }
1983 // oversized pg_temp?
1984 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
1985 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1986 << pg.second << " exceeds pool size" << dendl;
1987 remove = true;
1988 }
1989 if (remove) {
1990 if (oldmap.pg_temp->count(pg.first))
1991 pending_inc->new_pg_temp[pg.first].clear();
1992 else
1993 pending_inc->new_pg_temp.erase(pg.first);
1994 }
1995 }
1996
1997 for (auto &pg : *nextmap.primary_temp) {
1998 // primary down?
1999 if (nextmap.is_down(pg.second)) {
2000 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
2001 << " to down " << pg.second << dendl;
2002 pending_inc->new_primary_temp[pg.first] = -1;
2003 continue;
2004 }
2005 // redundant primary_temp?
2006 vector<int> real_up, templess_up;
2007 int real_primary, templess_primary;
2008 pg_t pgid = pg.first;
2009 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
2010 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
2011 if (real_primary == templess_primary){
2012 ldout(cct, 10) << __func__ << " removing primary_temp "
2013 << pgid << " -> " << real_primary
2014 << " (unnecessary/redundant)" << dendl;
2015 if (oldmap.primary_temp->count(pgid))
2016 pending_inc->new_primary_temp[pgid] = -1;
2017 else
2018 pending_inc->new_primary_temp.erase(pgid);
2019 }
2020 }
2021 }
2022
2023 void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
2024 {
2025 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
2026 for (auto& p : pg_upmap)
2027 upmap_pgs->push_back(p.first);
2028 for (auto& p : pg_upmap_items)
2029 upmap_pgs->push_back(p.first);
2030 }
2031
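// Validate the given upmapped pgs against the pools, crush rules and osd
// weights of this map. Invalid or redundant entries are reported through
// to_cancel; partially no-op pg_upmap_items are simplified through to_remap.
// Returns true if anything needs to change.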
2032 bool OSDMap::check_pg_upmaps(
2033 CephContext *cct,
2034 const vector<pg_t>& to_check,
2035 vector<pg_t> *to_cancel,
2036 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
2037 {
2038 bool any_change = false;
2039 map<int, map<int, float>> rule_weight_map;
2040 for (auto& pg : to_check) {
2041 const pg_pool_t *pi = get_pg_pool(pg.pool());
2042 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
2043 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
2044 << dendl;
2045 to_cancel->push_back(pg);
2046 continue;
2047 }
2048 if (pi->is_pending_merge(pg, nullptr)) {
2049 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
2050 << dendl;
2051 to_cancel->push_back(pg);
2052 continue;
2053 }
2054 vector<int> raw, up;
2055 pg_to_raw_upmap(pg, &raw, &up);
2056 auto crush_rule = get_pg_pool_crush_rule(pg);
2057 auto r = crush->verify_upmap(cct,
2058 crush_rule,
2059 get_pg_pool_size(pg),
2060 up);
2061 if (r < 0) {
2062 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
2063 << " returning " << r
2064 << dendl;
2065 to_cancel->push_back(pg);
2066 continue;
2067 }
2068 // below we check against crush-topology changes
2069 map<int, float> weight_map;
2070 auto it = rule_weight_map.find(crush_rule);
2071 if (it == rule_weight_map.end()) {
2072 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
2073 if (r < 0) {
2074 lderr(cct) << __func__ << " unable to get crush weight_map for "
2075 << "crush_rule " << crush_rule
2076 << dendl;
2077 continue;
2078 }
2079 rule_weight_map[crush_rule] = weight_map;
2080 } else {
2081 weight_map = it->second;
2082 }
2083 ldout(cct, 10) << __func__ << " pg " << pg
2084 << " weight_map " << weight_map
2085 << dendl;
2086 for (auto osd : up) {
2087 auto it = weight_map.find(osd);
2088 if (it == weight_map.end()) {
2089 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
2090 << "been moved out of the specific crush-tree"
2091 << dendl;
2092 to_cancel->push_back(pg);
2093 break;
2094 }
2095 auto adjusted_weight = get_weightf(it->first) * it->second;
2096 if (adjusted_weight == 0) {
2097 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
2098 << " is out/crush-out"
2099 << dendl;
2100 to_cancel->push_back(pg);
2101 break;
2102 }
2103 }
2104 if (!to_cancel->empty() && to_cancel->back() == pg)
2105 continue;
2106 // okay, upmap is valid
2107 // continue to check if it is still necessary
2108 auto i = pg_upmap.find(pg);
2109 if (i != pg_upmap.end()) {
2110 if (i->second == raw) {
2111 ldout(cct, 10) << __func__ << " removing redundant pg_upmap " << i->first << " "
2112 << i->second << dendl;
2113 to_cancel->push_back(pg);
2114 continue;
2115 }
2116 if ((int)i->second.size() != get_pg_pool_size(pg)) {
2117 ldout(cct, 10) << __func__ << " removing pg_upmap " << i->first << " "
2118 << i->second << " != pool size " << get_pg_pool_size(pg)
2119 << dendl;
2120 to_cancel->push_back(pg);
2121 continue;
2122 }
2123 }
2124 auto j = pg_upmap_items.find(pg);
2125 if (j != pg_upmap_items.end()) {
2126 mempool::osdmap::vector<pair<int,int>> newmap;
2127 for (auto& p : j->second) {
2128 auto osd_from = p.first;
2129 auto osd_to = p.second;
2130 if (std::find(raw.begin(), raw.end(), osd_from) == raw.end()) {
2131 // cancel mapping if source osd does not exist anymore
2132 ldout(cct, 20) << __func__ << " pg_upmap_items (source osd does not exist) " << pg_upmap_items << dendl;
2133 continue;
2134 }
2135 if (osd_to != CRUSH_ITEM_NONE && osd_to < max_osd &&
2136 osd_to >= 0 && osd_weight[osd_to] == 0) {
2137 // cancel mapping if target osd is out
2138 ldout(cct, 20) << __func__ << " pg_upmap_items (target osd is out) " << pg_upmap_items << dendl;
2139 continue;
2140 }
2141 newmap.push_back(p);
2142 }
2143 if (newmap.empty()) {
2144 ldout(cct, 10) << __func__ << " removing no-op pg_upmap_items "
2145 << j->first << " " << j->second
2146 << dendl;
2147 to_cancel->push_back(pg);
2148 } else if (newmap != j->second) {
2149 // check partial no-op here.
2150 ldout(cct, 10) << __func__ << " simplifying partially no-op pg_upmap_items "
2151 << j->first << " " << j->second
2152 << " -> " << newmap
2153 << dendl;
2154 to_remap->insert({pg, newmap});
2155 any_change = true;
2156 }
2157 }
2158 }
2159 any_change = any_change || !to_cancel->empty();
2160 return any_change;
2161 }
2162
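// Stage the results of check_pg_upmaps() into pending_inc: cancel invalid
// entries both from the pending increment and from the committed map, and
// install the simplified pg_upmap_items.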
2163 void OSDMap::clean_pg_upmaps(
2164 CephContext *cct,
2165 Incremental *pending_inc,
2166 const vector<pg_t>& to_cancel,
2167 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2168 {
2169 for (auto &pg: to_cancel) {
2170 auto i = pending_inc->new_pg_upmap.find(pg);
2171 if (i != pending_inc->new_pg_upmap.end()) {
2172 ldout(cct, 10) << __func__ << " cancel invalid pending "
2173 << "pg_upmap entry "
2174 << i->first << "->" << i->second
2175 << dendl;
2176 pending_inc->new_pg_upmap.erase(i);
2177 }
2178 auto j = pg_upmap.find(pg);
2179 if (j != pg_upmap.end()) {
2180 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2181 << j->first << "->" << j->second
2182 << dendl;
2183 pending_inc->old_pg_upmap.insert(pg);
2184 }
2185 auto p = pending_inc->new_pg_upmap_items.find(pg);
2186 if (p != pending_inc->new_pg_upmap_items.end()) {
2187 ldout(cct, 10) << __func__ << " cancel invalid pending "
2188 << "pg_upmap_items entry "
2189 << p->first << "->" << p->second
2190 << dendl;
2191 pending_inc->new_pg_upmap_items.erase(p);
2192 }
2193 auto q = pg_upmap_items.find(pg);
2194 if (q != pg_upmap_items.end()) {
2195 ldout(cct, 10) << __func__ << " cancel invalid "
2196 << "pg_upmap_items entry "
2197 << q->first << "->" << q->second
2198 << dendl;
2199 pending_inc->old_pg_upmap_items.insert(pg);
2200 }
2201 }
2202 for (auto& i : to_remap)
2203 pending_inc->new_pg_upmap_items[i.first] = i.second;
2204 }
2205
2206 bool OSDMap::clean_pg_upmaps(
2207 CephContext *cct,
2208 Incremental *pending_inc) const
2209 {
2210 ldout(cct, 10) << __func__ << dendl;
2211 vector<pg_t> to_check;
2212 vector<pg_t> to_cancel;
2213 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2214
2215 get_upmap_pgs(&to_check);
2216 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2217 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2218 //TODO: Create these 3 functions for pg_upmap_primaries so they can be checked
2219 // and cleaned in the same way as pg_upmap. This is not critical since invalid
2220 // pg_upmap_primaries are never applied (the final check is in _apply_upmap).
2221 return any_change;
2222 }
2223
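// Apply an incremental on top of this map, advancing the epoch by one.
// If the incremental carries a full map, it replaces this map entirely.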
2224 int OSDMap::apply_incremental(const Incremental &inc)
2225 {
2226 new_blocklist_entries = false;
2227 if (inc.epoch == 1)
2228 fsid = inc.fsid;
2229 else if (inc.fsid != fsid)
2230 return -EINVAL;
2231
2232 ceph_assert(inc.epoch == epoch+1);
2233
2234 epoch++;
2235 modified = inc.modified;
2236
2237 // full map?
2238 if (inc.fullmap.length()) {
2239 ceph::buffer::list bl(inc.fullmap);
2240 decode(bl);
2241 return 0;
2242 }
2243
2244 // nope, incremental.
2245 if (inc.new_flags >= 0) {
2246 flags = inc.new_flags;
2247 // the below is just to cover a newly-upgraded luminous mon
2248 // cluster that has to set require_jewel_osds or
2249 // require_kraken_osds before the osds can be upgraded to
2250 // luminous.
2251 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2252 if (require_osd_release < ceph_release_t::kraken) {
2253 require_osd_release = ceph_release_t::kraken;
2254 }
2255 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2256 if (require_osd_release < ceph_release_t::jewel) {
2257 require_osd_release = ceph_release_t::jewel;
2258 }
2259 }
2260 }
2261
2262 if (inc.new_max_osd >= 0)
2263 set_max_osd(inc.new_max_osd);
2264
2265 if (inc.new_pool_max != -1)
2266 pool_max = inc.new_pool_max;
2267
2268 for (const auto &pool : inc.new_pools) {
2269 pools[pool.first] = pool.second;
2270 pools[pool.first].last_change = epoch;
2271 }
2272
2273 new_removed_snaps = inc.new_removed_snaps;
2274 new_purged_snaps = inc.new_purged_snaps;
2275 for (auto p = new_removed_snaps.begin();
2276 p != new_removed_snaps.end();
2277 ++p) {
2278 removed_snaps_queue[p->first].union_of(p->second);
2279 }
2280 for (auto p = new_purged_snaps.begin();
2281 p != new_purged_snaps.end();
2282 ++p) {
2283 auto q = removed_snaps_queue.find(p->first);
2284 ceph_assert(q != removed_snaps_queue.end());
2285 q->second.subtract(p->second);
2286 if (q->second.empty()) {
2287 removed_snaps_queue.erase(q);
2288 }
2289 }
2290
2291 if (inc.new_last_up_change != utime_t()) {
2292 last_up_change = inc.new_last_up_change;
2293 }
2294 if (inc.new_last_in_change != utime_t()) {
2295 last_in_change = inc.new_last_in_change;
2296 }
2297
2298 for (const auto &pname : inc.new_pool_names) {
2299 auto pool_name_entry = pool_name.find(pname.first);
2300 if (pool_name_entry != pool_name.end()) {
2301 name_pool.erase(pool_name_entry->second);
2302 pool_name_entry->second = pname.second;
2303 } else {
2304 pool_name[pname.first] = pname.second;
2305 }
2306 name_pool[pname.second] = pname.first;
2307 }
2308
2309 for (const auto &pool : inc.old_pools) {
2310 pools.erase(pool);
2311 name_pool.erase(pool_name[pool]);
2312 pool_name.erase(pool);
2313 }
2314
2315 for (const auto &weight : inc.new_weight) {
2316 set_weight(weight.first, weight.second);
2317
2318 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2319 // xinfo old_weight.
2320 if (weight.second) {
2321 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2322 osd_xinfo[weight.first].old_weight = 0;
2323 }
2324 }
2325
2326 for (const auto &primary_affinity : inc.new_primary_affinity) {
2327 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2328 }
2329
2330 // erasure_code_profiles
2331 for (const auto &profile : inc.old_erasure_code_profiles)
2332 erasure_code_profiles.erase(profile);
2333
2334 for (const auto &profile : inc.new_erasure_code_profiles) {
2335 set_erasure_code_profile(profile.first, profile.second);
2336 }
2337
2338 // up/down
2339 for (const auto &state : inc.new_state) {
2340 const auto osd = state.first;
2341 int s = state.second ? state.second : CEPH_OSD_UP;
2342 if ((osd_state[osd] & CEPH_OSD_UP) &&
2343 (s & CEPH_OSD_UP)) {
2344 osd_info[osd].down_at = epoch;
2345 osd_xinfo[osd].down_stamp = modified;
2346 }
2347 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2348 (s & CEPH_OSD_EXISTS)) {
2349 // osd is destroyed; clear out anything interesting.
2350 (*osd_uuid)[osd] = uuid_d();
2351 osd_info[osd] = osd_info_t();
2352 osd_xinfo[osd] = osd_xinfo_t();
2353 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
2354 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2355 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2356 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2357 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
2358 osd_state[osd] = 0;
2359 } else {
2360 osd_state[osd] ^= s;
2361 }
2362 }
2363
2364 for (const auto &client : inc.new_up_client) {
2365 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
2366 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
2367 osd_addrs->client_addrs[client.first].reset(
2368 new entity_addrvec_t(client.second));
2369 osd_addrs->hb_back_addrs[client.first].reset(
2370 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2371 osd_addrs->hb_front_addrs[client.first].reset(
2372 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
2373
2374 osd_info[client.first].up_from = epoch;
2375 }
2376
2377 for (const auto &cluster : inc.new_up_cluster)
2378 osd_addrs->cluster_addrs[cluster.first].reset(
2379 new entity_addrvec_t(cluster.second));
2380
2381 // info
2382 for (const auto &thru : inc.new_up_thru)
2383 osd_info[thru.first].up_thru = thru.second;
2384
2385 for (const auto &interval : inc.new_last_clean_interval) {
2386 osd_info[interval.first].last_clean_begin = interval.second.first;
2387 osd_info[interval.first].last_clean_end = interval.second.second;
2388 }
2389
2390 for (const auto &lost : inc.new_lost)
2391 osd_info[lost.first].lost_at = lost.second;
2392
2393 // xinfo
2394 for (const auto &xinfo : inc.new_xinfo)
2395 osd_xinfo[xinfo.first] = xinfo.second;
2396
2397 // uuid
2398 for (const auto &uuid : inc.new_uuid)
2399 (*osd_uuid)[uuid.first] = uuid.second;
2400
2401 // pg rebuild
2402 for (const auto &pg : inc.new_pg_temp) {
2403 if (pg.second.empty())
2404 pg_temp->erase(pg.first);
2405 else
2406 pg_temp->set(pg.first, pg.second);
2407 }
2408 if (!inc.new_pg_temp.empty()) {
2409 // make sure pg_temp is efficiently stored
2410 pg_temp->rebuild();
2411 }
2412
2413 for (const auto &pg : inc.new_primary_temp) {
2414 if (pg.second == -1)
2415 primary_temp->erase(pg.first);
2416 else
2417 (*primary_temp)[pg.first] = pg.second;
2418 }
2419
2420 for (auto& p : inc.new_pg_upmap) {
2421 pg_upmap[p.first] = p.second;
2422 }
2423 for (auto& pg : inc.old_pg_upmap) {
2424 pg_upmap.erase(pg);
2425 }
2426 for (auto& p : inc.new_pg_upmap_items) {
2427 pg_upmap_items[p.first] = p.second;
2428 }
2429 for (auto& pg : inc.old_pg_upmap_items) {
2430 pg_upmap_items.erase(pg);
2431 }
2432
2433 for (auto& [pg, prim] : inc.new_pg_upmap_primary) {
2434 pg_upmap_primaries[pg] = prim;
2435 }
2436 for (auto& pg : inc.old_pg_upmap_primary) {
2437 pg_upmap_primaries.erase(pg);
2438 }
2439
2440 // blocklist
2441 if (!inc.new_blocklist.empty()) {
2442 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2443 new_blocklist_entries = true;
2444 }
2445 for (const auto &addr : inc.old_blocklist)
2446 blocklist.erase(addr);
2447
2448 for (const auto& addr_p : inc.new_range_blocklist) {
2449 range_blocklist.insert(addr_p);
2450 calculated_ranges.emplace(addr_p.first, addr_p.first);
2451 new_blocklist_entries = true;
2452 }
2453 for (const auto &addr : inc.old_range_blocklist) {
2454 calculated_ranges.erase(addr);
2455 range_blocklist.erase(addr);
2456 }
2457
2458 for (auto& i : inc.new_crush_node_flags) {
2459 if (i.second) {
2460 crush_node_flags[i.first] = i.second;
2461 } else {
2462 crush_node_flags.erase(i.first);
2463 }
2464 }
2465
2466 for (auto& i : inc.new_device_class_flags) {
2467 if (i.second) {
2468 device_class_flags[i.first] = i.second;
2469 } else {
2470 device_class_flags.erase(i.first);
2471 }
2472 }
2473
2474 // cluster snapshot?
2475 if (inc.cluster_snapshot.length()) {
2476 cluster_snapshot = inc.cluster_snapshot;
2477 cluster_snapshot_epoch = inc.epoch;
2478 } else {
2479 cluster_snapshot.clear();
2480 cluster_snapshot_epoch = 0;
2481 }
2482
2483 if (inc.new_nearfull_ratio >= 0) {
2484 nearfull_ratio = inc.new_nearfull_ratio;
2485 }
2486 if (inc.new_backfillfull_ratio >= 0) {
2487 backfillfull_ratio = inc.new_backfillfull_ratio;
2488 }
2489 if (inc.new_full_ratio >= 0) {
2490 full_ratio = inc.new_full_ratio;
2491 }
2492 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
2493 require_min_compat_client = inc.new_require_min_compat_client;
2494 }
2495 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
2496 require_osd_release = inc.new_require_osd_release;
2497 if (require_osd_release >= ceph_release_t::luminous) {
2498 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2499 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
2500 }
2501 }
2502
2503 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
2504 require_osd_release = inc.new_require_osd_release;
2505 if (require_osd_release >= ceph_release_t::nautilus) {
2506 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2507 }
2508 }
2509 // do new crush map last (after up/down stuff)
2510 if (inc.crush.length()) {
2511 ceph::buffer::list bl(inc.crush);
2512 auto blp = bl.cbegin();
2513 crush.reset(new CrushWrapper);
2514 crush->decode(blp);
2515 if (require_osd_release >= ceph_release_t::luminous) {
2516 // only increment if this is a luminous-encoded osdmap, lest
2517 // the mon's crush_version diverge from what the osds or others
2518 // are decoding and applying on their end. if we won't encode
2519 // it in the canonical version, don't change it.
2520 ++crush_version;
2521 }
2522 for (auto it = device_class_flags.begin();
2523 it != device_class_flags.end();) {
2524 const char* class_name = crush->get_class_name(it->first);
2525 if (!class_name) // device class is gone
2526 it = device_class_flags.erase(it);
2527 else
2528 it++;
2529 }
2530 }
2531
2532 if (inc.change_stretch_mode) {
2533 stretch_mode_enabled = inc.stretch_mode_enabled;
2534 stretch_bucket_count = inc.new_stretch_bucket_count;
2535 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2536 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2537 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2538 }
2539
2540 switch (inc.mutate_allow_crimson) {
2541 case Incremental::mutate_allow_crimson_t::NONE:
2542 break;
2543 case Incremental::mutate_allow_crimson_t::SET:
2544 allow_crimson = true;
2545 break;
2546 case Incremental::mutate_allow_crimson_t::CLEAR:
2547 allow_crimson = false;
2548 break;
2549 }
2550
2551 calc_num_osds();
2552 _calc_up_osd_features();
2553 return 0;
2554 }
2555
2556 // mapping
2557 int OSDMap::map_to_pg(
2558 int64_t poolid,
2559 const string& name,
2560 const string& key,
2561 const string& nspace,
2562 pg_t *pg) const
2563 {
2564 // calculate ps (placement seed)
2565 const pg_pool_t *pool = get_pg_pool(poolid);
2566 if (!pool)
2567 return -ENOENT;
2568 ps_t ps;
2569 if (!key.empty())
2570 ps = pool->hash_key(key, nspace);
2571 else
2572 ps = pool->hash_key(name, nspace);
2573 *pg = pg_t(ps, poolid);
2574 return 0;
2575 }
2576
2577 int OSDMap::object_locator_to_pg(
2578 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2579 {
2580 if (loc.hash >= 0) {
2581 if (!get_pg_pool(loc.get_pool())) {
2582 return -ENOENT;
2583 }
2584 pg = pg_t(loc.hash, loc.get_pool());
2585 return 0;
2586 }
2587 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2588 }
2589
2590 ceph_object_layout OSDMap::make_object_layout(
2591 object_t oid, int pg_pool, string nspace) const
2592 {
2593 object_locator_t loc(pg_pool, nspace);
2594
2595 ceph_object_layout ol;
2596 pg_t pgid = object_locator_to_pg(oid, loc);
2597 ol.ol_pgid = pgid.get_old_pg().v;
2598 ol.ol_stripe_unit = 0;
2599 return ol;
2600 }
2601
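// Pools that can shift osds have nonexistent osds compacted out of the
// vector; other pools keep positional semantics and substitute
// CRUSH_ITEM_NONE instead.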
2602 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2603 vector<int>& osds) const
2604 {
2605 if (pool.can_shift_osds()) {
2606 unsigned removed = 0;
2607 for (unsigned i = 0; i < osds.size(); i++) {
2608 if (!exists(osds[i])) {
2609 removed++;
2610 continue;
2611 }
2612 if (removed) {
2613 osds[i - removed] = osds[i];
2614 }
2615 }
2616 if (removed)
2617 osds.resize(osds.size() - removed);
2618 } else {
2619 for (auto& osd : osds) {
2620 if (!exists(osd))
2621 osd = CRUSH_ITEM_NONE;
2622 }
2623 }
2624 }
2625
2626 void OSDMap::_pg_to_raw_osds(
2627 const pg_pool_t& pool, pg_t pg,
2628 vector<int> *osds,
2629 ps_t *ppps) const
2630 {
2631 // map to osds[]
2632 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2633 unsigned size = pool.get_size();
2634
2635 // what crush rule?
2636 int ruleno = pool.get_crush_rule();
2637 if (ruleno >= 0)
2638 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2639
2640 _remove_nonexistent_osds(pool, *osds);
2641
2642 if (ppps)
2643 *ppps = pps;
2644 }
2645
2646 int OSDMap::_pick_primary(const vector<int>& osds) const
2647 {
2648 for (auto osd : osds) {
2649 if (osd != CRUSH_ITEM_NONE) {
2650 return osd;
2651 }
2652 }
2653 return -1;
2654 }
2655
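// Apply any pg_upmap, pg_upmap_items and pg_upmap_primaries entries for this
// pg on top of the raw crush mapping. Mappings whose target osd is marked
// out (weight 0) or is not a valid osd id are ignored.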
2656 void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
2657 {
2658 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2659 auto p = pg_upmap.find(pg);
2660 if (p != pg_upmap.end()) {
2661 // make sure targets aren't marked out
2662 for (auto osd : p->second) {
2663 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2664 osd_weight[osd] == 0) {
2665 // reject/ignore the explicit mapping
2666 return;
2667 }
2668 }
2669 *raw = vector<int>(p->second.begin(), p->second.end());
2670 // continue to check and apply pg_upmap_items if any
2671 }
2672
2673 auto q = pg_upmap_items.find(pg);
2674 if (q != pg_upmap_items.end()) {
2675 // NOTE: this approach does not allow a bidirectional swap,
2676 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2677 for (auto& [osd_from, osd_to] : q->second) {
2678 // A capacity change upmap (replace an osd in the pg with an osd not in the pg):
2679 // make sure the replacement value doesn't already appear
2680 bool exists = false;
2681 ssize_t pos = -1;
2682 for (unsigned i = 0; i < raw->size(); ++i) {
2683 int osd = (*raw)[i];
2684 if (osd == osd_to) {
2685 exists = true;
2686 break;
2687 }
2688 // ignore mapping if target is marked out (or invalid osd id)
2689 if (osd == osd_from &&
2690 pos < 0 &&
2691 !(osd_to != CRUSH_ITEM_NONE && osd_to < max_osd &&
2692 osd_to >= 0 && osd_weight[osd_to] == 0)) {
2693 pos = i;
2694 }
2695 }
2696 if (!exists && pos >= 0) {
2697 (*raw)[pos] = osd_to;
2698 }
2699 }
2700 }
2701 auto r = pg_upmap_primaries.find(pg);
2702 if (r != pg_upmap_primaries.end()) {
2703 auto new_prim = r->second;
2704 // Apply the mapping only if the new primary is not marked out and is a valid osd id
2705 if (new_prim != CRUSH_ITEM_NONE && new_prim < max_osd && new_prim >= 0 &&
2706 osd_weight[new_prim] != 0) {
2707 int new_prim_idx = 0;
2708 for (int i = 1 ; i < (int)raw->size(); i++) { // start from 1 on purpose
2709 if ((*raw)[i] == new_prim) {
2710 new_prim_idx = i;
2711 break;
2712 }
2713 }
2714 if (new_prim_idx > 0) {
2715 // swap primary
2716 (*raw)[new_prim_idx] = (*raw)[0];
2717 (*raw)[0] = new_prim;
2718 }
2719 }
2720 }
2721 }
2722
2723 // pg -> (up osd list)
2724 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2725 vector<int> *up) const
2726 {
2727 if (pool.can_shift_osds()) {
2728 // shift left
2729 up->clear();
2730 up->reserve(raw.size());
2731 for (unsigned i=0; i<raw.size(); i++) {
2732 if (!exists(raw[i]) || is_down(raw[i]))
2733 continue;
2734 up->push_back(raw[i]);
2735 }
2736 } else {
2737 // set down/dne devices to NONE
2738 up->resize(raw.size());
2739 for (int i = raw.size() - 1; i >= 0; --i) {
2740 if (!exists(raw[i]) || is_down(raw[i])) {
2741 (*up)[i] = CRUSH_ITEM_NONE;
2742 } else {
2743 (*up)[i] = raw[i];
2744 }
2745 }
2746 }
2747 }
2748
2749 void OSDMap::_apply_primary_affinity(ps_t seed,
2750 const pg_pool_t& pool,
2751 vector<int> *osds,
2752 int *primary) const
2753 {
2754 // do we have any non-default primary_affinity values for these osds?
2755 if (!osd_primary_affinity)
2756 return;
2757
2758 bool any = false;
2759 for (const auto osd : *osds) {
2760 if (osd != CRUSH_ITEM_NONE &&
2761 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2762 any = true;
2763 break;
2764 }
2765 }
2766 if (!any)
2767 return;
2768
2769 // pick the primary. feed both the seed (for the pg) and the osd
2770 // into the hash/rng so that a proportional fraction of an osd's pgs
2771 // get rejected as primary.
2772 int pos = -1;
2773 for (unsigned i = 0; i < osds->size(); ++i) {
2774 int o = (*osds)[i];
2775 if (o == CRUSH_ITEM_NONE)
2776 continue;
2777 unsigned a = (*osd_primary_affinity)[o];
2778 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2779 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2780 seed, o) >> 16) >= a) {
2781 // we chose not to use this primary. note it anyway as a
2782 // fallback in case we don't pick anyone else, but keep looking.
2783 if (pos < 0)
2784 pos = i;
2785 } else {
2786 pos = i;
2787 break;
2788 }
2789 }
2790 if (pos < 0)
2791 return;
2792
2793 *primary = (*osds)[pos];
2794
2795 if (pool.can_shift_osds() && pos > 0) {
2796 // move the new primary to the front.
2797 for (int i = pos; i > 0; --i) {
2798 (*osds)[i] = (*osds)[i-1];
2799 }
2800 (*osds)[0] = *primary;
2801 }
2802 }
2803
2804 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2805 vector<int> *temp_pg, int *temp_primary) const
2806 {
2807 pg = pool.raw_pg_to_pg(pg);
2808 const auto p = pg_temp->find(pg);
2809 temp_pg->clear();
2810 if (p != pg_temp->end()) {
2811 for (unsigned i=0; i<p->second.size(); i++) {
2812 if (!exists(p->second[i]) || is_down(p->second[i])) {
2813 if (pool.can_shift_osds()) {
2814 continue;
2815 } else {
2816 temp_pg->push_back(CRUSH_ITEM_NONE);
2817 }
2818 } else {
2819 temp_pg->push_back(p->second[i]);
2820 }
2821 }
2822 }
2823 const auto &pp = primary_temp->find(pg);
2824 *temp_primary = -1;
2825 if (pp != primary_temp->end()) {
2826 *temp_primary = pp->second;
2827 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2828 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2829 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2830 *temp_primary = (*temp_pg)[i];
2831 break;
2832 }
2833 }
2834 }
2835 }
2836
2837 void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
2838 {
2839 const pg_pool_t *pool = get_pg_pool(pg.pool());
2840 if (!pool) {
2841 *primary = -1;
2842 raw->clear();
2843 return;
2844 }
2845 _pg_to_raw_osds(*pool, pg, raw, NULL);
2846 *primary = _pick_primary(*raw);
2847 }
2848
2849 void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2850 vector<int> *raw_upmap) const
2851 {
2852 auto pool = get_pg_pool(pg.pool());
2853 if (!pool) {
2854 raw_upmap->clear();
2855 return;
2856 }
2857 _pg_to_raw_osds(*pool, pg, raw, NULL);
2858 *raw_upmap = *raw;
2859 _apply_upmap(*pool, pg, raw_upmap);
2860 }
2861
2862 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2863 {
2864 const pg_pool_t *pool = get_pg_pool(pg.pool());
2865 if (!pool) {
2866 *primary = -1;
2867 up->clear();
2868 return;
2869 }
2870 vector<int> raw;
2871 ps_t pps;
2872 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2873 _apply_upmap(*pool, pg, &raw);
2874 _raw_to_up_osds(*pool, raw, up);
2875 *primary = _pick_primary(raw);
2876 _apply_primary_affinity(pps, *pool, up, primary);
2877 }
2878
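// Compute the up set (crush + upmap + primary affinity) and the acting set
// (up set overridden by pg_temp/primary_temp) for a pg, along with their
// primaries. Output pointers may be null when the caller does not need them.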
2879 void OSDMap::_pg_to_up_acting_osds(
2880 const pg_t& pg, vector<int> *up, int *up_primary,
2881 vector<int> *acting, int *acting_primary,
2882 bool raw_pg_to_pg) const
2883 {
2884 const pg_pool_t *pool = get_pg_pool(pg.pool());
2885 if (!pool ||
2886 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2887 if (up)
2888 up->clear();
2889 if (up_primary)
2890 *up_primary = -1;
2891 if (acting)
2892 acting->clear();
2893 if (acting_primary)
2894 *acting_primary = -1;
2895 return;
2896 }
2897 vector<int> raw;
2898 vector<int> _up;
2899 vector<int> _acting;
2900 int _up_primary;
2901 int _acting_primary;
2902 ps_t pps;
2903 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2904 if (_acting.empty() || up || up_primary) {
2905 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2906 _apply_upmap(*pool, pg, &raw);
2907 _raw_to_up_osds(*pool, raw, &_up);
2908 _up_primary = _pick_primary(_up);
2909 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2910 if (_acting.empty()) {
2911 _acting = _up;
2912 if (_acting_primary == -1) {
2913 _acting_primary = _up_primary;
2914 }
2915 }
2916
2917 if (up)
2918 up->swap(_up);
2919 if (up_primary)
2920 *up_primary = _up_primary;
2921 }
2922
2923 if (acting)
2924 acting->swap(_acting);
2925 if (acting_primary)
2926 *acting_primary = _acting_primary;
2927 }
2928
2929 int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
2930 {
2931 // This implementation is broken for EC PGs since the osd may appear
2932 // multiple times in the acting set. See
2933 // https://tracker.ceph.com/issues/43213
2934 if (!nrep)
2935 nrep = acting.size();
2936 for (int i=0; i<nrep; i++)
2937 if (acting[i] == osd)
2938 return i;
2939 return -1;
2940 }
2941
2942 int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
2943 {
2944 int nrep = acting.size();
2945 if (who.shard == shard_id_t::NO_SHARD) {
2946 for (int i=0; i<nrep; i++) {
2947 if (acting[i] == who.osd) {
2948 return i;
2949 }
2950 }
2951 } else {
2952 if (who.shard < nrep && acting[who.shard] == who.osd) {
2953 return who.shard;
2954 }
2955 }
2956 return -1;
2957 }
2958
2959 bool OSDMap::primary_changed_broken(
2960 int oldprimary,
2961 const vector<int> &oldacting,
2962 int newprimary,
2963 const vector<int> &newacting)
2964 {
2965 if (oldacting.empty() && newacting.empty())
2966 return false; // both still empty
2967 if (oldacting.empty() ^ newacting.empty())
2968 return true; // was empty, now not, or vice versa
2969 if (oldprimary != newprimary)
2970 return true; // primary changed
2971 if (calc_pg_role_broken(oldprimary, oldacting) !=
2972 calc_pg_role_broken(newprimary, newacting))
2973 return true;
2974 return false; // same primary (tho replicas may have changed)
2975 }
2976
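// Mask out of SIGNIFICANT_FEATURES any server feature bits that belong to
// releases newer than require_osd_release, so the canonical encoding stays
// decodable by the oldest required daemons.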
2977 uint64_t OSDMap::get_encoding_features() const
2978 {
2979 uint64_t f = SIGNIFICANT_FEATURES;
2980 if (require_osd_release < ceph_release_t::octopus) {
2981 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2982 }
2983 if (require_osd_release < ceph_release_t::nautilus) {
2984 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2985 }
2986 if (require_osd_release < ceph_release_t::mimic) {
2987 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2988 }
2989 if (require_osd_release < ceph_release_t::luminous) {
2990 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2991 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2992 }
2993 if (require_osd_release < ceph_release_t::kraken) {
2994 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
2995 CEPH_FEATURE_MSG_ADDR2);
2996 }
2997 if (require_osd_release < ceph_release_t::jewel) {
2998 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
2999 CEPH_FEATURE_NEW_OSDOP_ENCODING |
3000 CEPH_FEATURE_CRUSH_TUNABLES5);
3001 }
3002 return f;
3003 }
3004
3005 // serialize, unserialize
3006 void OSDMap::encode_client_old(ceph::buffer::list& bl) const
3007 {
3008 using ceph::encode;
3009 __u16 v = 5;
3010 encode(v, bl);
3011
3012 // base
3013 encode(fsid, bl);
3014 encode(epoch, bl);
3015 encode(created, bl);
3016 encode(modified, bl);
3017
3018 // for encode(pools, bl);
3019 __u32 n = pools.size();
3020 encode(n, bl);
3021
3022 for (const auto &pool : pools) {
3023 n = pool.first;
3024 encode(n, bl);
3025 encode(pool.second, bl, 0);
3026 }
3027 // for encode(pool_name, bl);
3028 n = pool_name.size();
3029 encode(n, bl);
3030 for (const auto &pname : pool_name) {
3031 n = pname.first;
3032 encode(n, bl);
3033 encode(pname.second, bl);
3034 }
3035 // for encode(pool_max, bl);
3036 n = pool_max;
3037 encode(n, bl);
3038
3039 encode(flags, bl);
3040
3041 encode(max_osd, bl);
3042 {
3043 uint32_t n = osd_state.size();
3044 encode(n, bl);
3045 for (auto s : osd_state) {
3046 encode((uint8_t)s, bl);
3047 }
3048 }
3049 encode(osd_weight, bl);
3050 encode(osd_addrs->client_addrs, bl, 0);
3051
3052 // for encode(pg_temp, bl);
3053 n = pg_temp->size();
3054 encode(n, bl);
3055 for (const auto& pg : *pg_temp) {
3056 old_pg_t opg = pg.first.get_old_pg();
3057 encode(opg, bl);
3058 encode(pg.second, bl);
3059 }
3060
3061 // crush
3062 ceph::buffer::list cbl;
3063 crush->encode(cbl, 0 /* legacy (no) features */);
3064 encode(cbl, bl);
3065 }
3066
3067 void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
3068 {
3069 using ceph::encode;
3070 if ((features & CEPH_FEATURE_PGID64) == 0) {
3071 encode_client_old(bl);
3072 return;
3073 }
3074
3075 __u16 v = 6;
3076 encode(v, bl);
3077
3078 // base
3079 encode(fsid, bl);
3080 encode(epoch, bl);
3081 encode(created, bl);
3082 encode(modified, bl);
3083
3084 encode(pools, bl, features);
3085 encode(pool_name, bl);
3086 encode(pool_max, bl);
3087
3088 encode(flags, bl);
3089
3090 encode(max_osd, bl);
3091 {
3092 uint32_t n = osd_state.size();
3093 encode(n, bl);
3094 for (auto s : osd_state) {
3095 encode((uint8_t)s, bl);
3096 }
3097 }
3098 encode(osd_weight, bl);
3099 encode(osd_addrs->client_addrs, bl, features);
3100
3101 encode(*pg_temp, bl);
3102
3103 // crush
3104 ceph::buffer::list cbl;
3105 crush->encode(cbl, 0 /* legacy (no) features */);
3106 encode(cbl, bl);
3107
3108 // extended
3109 __u16 ev = 10;
3110 encode(ev, bl);
3111 encode(osd_addrs->hb_back_addrs, bl, features);
3112 encode(osd_info, bl);
3113 encode(blocklist, bl, features);
3114 encode(osd_addrs->cluster_addrs, bl, features);
3115 encode(cluster_snapshot_epoch, bl);
3116 encode(cluster_snapshot, bl);
3117 encode(*osd_uuid, bl);
3118 encode(osd_xinfo, bl, features);
3119 encode(osd_addrs->hb_front_addrs, bl, features);
3120 }
3121
3122 /* for a description of osdmap versions, and when they were introduced, please
3123 * refer to
3124 * doc/dev/osd_internals/osdmap_versions.txt
3125 */
3126 void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
3127 {
3128 using ceph::encode;
3129 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
3130 encode_classic(bl, features);
3131 return;
3132 }
3133
3134 // only a select set of callers should *ever* be encoding new
3135 // OSDMaps. others should be passing around the canonical encoded
3136 // buffers from on high. select out those callers by passing in an
3137 // "impossible" feature bit.
3138 ceph_assert(features & CEPH_FEATURE_RESERVED);
3139 features &= ~CEPH_FEATURE_RESERVED;
3140
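// The crc covers everything before and after a 4-byte hole reserved below;
// the hole is patched with the final crc once encoding is complete.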
3141 size_t start_offset = bl.length();
3142 size_t tail_offset;
3143 size_t crc_offset;
3144 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
3145
3146 // meta-encoding: how we include client-used and osd-specific data
3147 ENCODE_START(8, 7, bl);
3148
3149 {
3150 // NOTE: any new encoding dependencies must be reflected by
3151 // SIGNIFICANT_FEATURES
3152 uint8_t v = 10;
3153 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3154 v = 3;
3155 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3156 v = 6;
3157 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3158 v = 7;
3159 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
3160 v = 9;
3161 } */
3162 ENCODE_START(v, 1, bl); // client-usable data
3163 // base
3164 encode(fsid, bl);
3165 encode(epoch, bl);
3166 encode(created, bl);
3167 encode(modified, bl);
3168
3169 encode(pools, bl, features);
3170 encode(pool_name, bl);
3171 encode(pool_max, bl);
3172
3173 if (v < 4) {
3174 decltype(flags) f = flags;
3175 if (require_osd_release >= ceph_release_t::luminous)
3176 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
3177 else if (require_osd_release == ceph_release_t::kraken)
3178 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
3179 else if (require_osd_release == ceph_release_t::jewel)
3180 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
3181 encode(f, bl);
3182 } else {
3183 encode(flags, bl);
3184 }
3185
3186 encode(max_osd, bl);
3187 if (v >= 5) {
3188 encode(osd_state, bl);
3189 } else {
3190 uint32_t n = osd_state.size();
3191 encode(n, bl);
3192 for (auto s : osd_state) {
3193 encode((uint8_t)s, bl);
3194 }
3195 }
3196 encode(osd_weight, bl);
3197 if (v >= 8) {
3198 encode(osd_addrs->client_addrs, bl, features);
3199 } else {
3200 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
3201 }
3202
3203 encode(*pg_temp, bl);
3204 encode(*primary_temp, bl);
3205 if (osd_primary_affinity) {
3206 encode(*osd_primary_affinity, bl);
3207 } else {
3208 vector<__u32> v;
3209 encode(v, bl);
3210 }
3211
3212 // crush
3213 ceph::buffer::list cbl;
3214 crush->encode(cbl, features);
3215 encode(cbl, bl);
3216 encode(erasure_code_profiles, bl);
3217
3218 if (v >= 4) {
3219 encode(pg_upmap, bl);
3220 encode(pg_upmap_items, bl);
3221 } else {
3222 ceph_assert(pg_upmap.empty());
3223 ceph_assert(pg_upmap_items.empty());
3224 }
3225 if (v >= 6) {
3226 encode(crush_version, bl);
3227 }
3228 if (v >= 7) {
3229 encode(new_removed_snaps, bl);
3230 encode(new_purged_snaps, bl);
3231 }
3232 if (v >= 9) {
3233 encode(last_up_change, bl);
3234 encode(last_in_change, bl);
3235 }
3236 if (v >= 10) {
3237 encode(pg_upmap_primaries, bl);
3238 } else {
3239 ceph_assert(pg_upmap_primaries.empty());
3240 }
3241 ENCODE_FINISH(bl); // client-usable data
3242 }
3243
3244 {
3245 // NOTE: any new encoding dependencies must be reflected by
3246 // SIGNIFICANT_FEATURES
3247 uint8_t target_v = 9; // when bumping this, be aware of allow_crimson
3248 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3249 target_v = 1;
3250 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3251 target_v = 5;
3252 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3253 target_v = 6;
3254 }
3255 if (stretch_mode_enabled) {
3256 target_v = std::max((uint8_t)10, target_v);
3257 }
3258 if (!range_blocklist.empty()) {
3259 target_v = std::max((uint8_t)11, target_v);
3260 }
3261 if (allow_crimson) {
3262 target_v = std::max((uint8_t)12, target_v);
3263 }
3264 ENCODE_START(target_v, 1, bl); // extended, osd-only data
3265 if (target_v < 7) {
3266 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
3267 } else {
3268 encode(osd_addrs->hb_back_addrs, bl, features);
3269 }
3270 encode(osd_info, bl);
3271 {
3272 // put this in a sorted, ordered map<> so that we encode in a
3273 // deterministic order.
3274 map<entity_addr_t,utime_t> blocklist_map;
3275 for (const auto &addr : blocklist)
3276 blocklist_map.insert(make_pair(addr.first, addr.second));
3277 encode(blocklist_map, bl, features);
3278 }
3279 if (target_v < 7) {
3280 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3281 } else {
3282 encode(osd_addrs->cluster_addrs, bl, features);
3283 }
3284 encode(cluster_snapshot_epoch, bl);
3285 encode(cluster_snapshot, bl);
3286 encode(*osd_uuid, bl);
3287 encode(osd_xinfo, bl, features);
3288 if (target_v < 7) {
3289 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3290 } else {
3291 encode(osd_addrs->hb_front_addrs, bl, features);
3292 }
3293 if (target_v >= 2) {
3294 encode(nearfull_ratio, bl);
3295 encode(full_ratio, bl);
3296 encode(backfillfull_ratio, bl);
3297 }
3298 // 4 was string-based new_require_min_compat_client
3299 if (target_v >= 5) {
3300 encode(require_min_compat_client, bl);
3301 encode(require_osd_release, bl);
3302 }
3303 if (target_v >= 6) {
3304 encode(removed_snaps_queue, bl);
3305 }
3306 if (target_v >= 8) {
3307 encode(crush_node_flags, bl);
3308 }
3309 if (target_v >= 9) {
3310 encode(device_class_flags, bl);
3311 }
3312 if (target_v >= 10) {
3313 encode(stretch_mode_enabled, bl);
3314 encode(stretch_bucket_count, bl);
3315 encode(degraded_stretch_mode, bl);
3316 encode(recovering_stretch_mode, bl);
3317 encode(stretch_mode_bucket, bl);
3318 }
3319 if (target_v >= 11) {
3320 ::encode(range_blocklist, bl, features);
3321 }
3322 if (target_v >= 12) {
3323 ::encode(allow_crimson, bl);
3324 }
3325 ENCODE_FINISH(bl); // osd-only data
3326 }
3327
3328 crc_offset = bl.length();
3329 crc_filler = bl.append_hole(sizeof(uint32_t));
3330 tail_offset = bl.length();
3331
3332 ENCODE_FINISH(bl); // meta-encoding wrapper
3333
3334 // fill in crc
3335 ceph::buffer::list front;
3336 front.substr_of(bl, start_offset, crc_offset - start_offset);
3337 crc = front.crc32c(-1);
3338 if (tail_offset < bl.length()) {
3339 ceph::buffer::list tail;
3340 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3341 crc = tail.crc32c(crc);
3342 }
3343 ceph_le32 crc_le;
3344 crc_le = crc;
3345 crc_filler->copy_in(4, (char*)&crc_le);
3346 crc_defined = true;
3347 }
3348
3349 /* for a description of osdmap versions, and when they were introduced, please
3350 * refer to
3351 * doc/dev/osd_internals/osdmap_versions.txt
3352 */
3353 void OSDMap::decode(ceph::buffer::list& bl)
3354 {
3355 auto p = bl.cbegin();
3356 decode(p);
3357 }
3358
3359 void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
3360 {
3361 using ceph::decode;
3362 __u32 n, t;
3363 __u16 v;
3364 decode(v, p);
3365
3366 // base
3367 decode(fsid, p);
3368 decode(epoch, p);
3369 decode(created, p);
3370 decode(modified, p);
3371
3372 if (v < 6) {
3373 if (v < 4) {
3374 int32_t max_pools = 0;
3375 decode(max_pools, p);
3376 pool_max = max_pools;
3377 }
3378 pools.clear();
3379 decode(n, p);
3380 while (n--) {
3381 decode(t, p);
3382 decode(pools[t], p);
3383 }
3384 if (v == 4) {
3385 decode(n, p);
3386 pool_max = n;
3387 } else if (v == 5) {
3388 pool_name.clear();
3389 decode(n, p);
3390 while (n--) {
3391 decode(t, p);
3392 decode(pool_name[t], p);
3393 }
3394 decode(n, p);
3395 pool_max = n;
3396 }
3397 } else {
3398 decode(pools, p);
3399 decode(pool_name, p);
3400 decode(pool_max, p);
3401 }
3402 // kludge around some old bug that zeroed out pool_max (#2307)
3403 if (pools.size() && pool_max < pools.rbegin()->first) {
3404 pool_max = pools.rbegin()->first;
3405 }
3406
3407 decode(flags, p);
3408
3409 decode(max_osd, p);
3410 {
3411 vector<uint8_t> os;
3412 decode(os, p);
3413 osd_state.resize(os.size());
3414 for (unsigned i = 0; i < os.size(); ++i) {
3415 osd_state[i] = os[i];
3416 }
3417 }
3418 decode(osd_weight, p);
3419 decode(osd_addrs->client_addrs, p);
3420 if (v <= 5) {
3421 pg_temp->clear();
3422 decode(n, p);
3423 while (n--) {
3424 old_pg_t opg;
3425 ceph::decode_raw(opg, p);
3426 mempool::osdmap::vector<int32_t> v;
3427 decode(v, p);
3428 pg_temp->set(pg_t(opg), v);
3429 }
3430 } else {
3431 decode(*pg_temp, p);
3432 }
3433
3434 // crush
3435 ceph::buffer::list cbl;
3436 decode(cbl, p);
3437 auto cblp = cbl.cbegin();
3438 crush->decode(cblp);
3439
3440 // extended
3441 __u16 ev = 0;
3442 if (v >= 5)
3443 decode(ev, p);
3444 decode(osd_addrs->hb_back_addrs, p);
3445 decode(osd_info, p);
3446 if (v < 5)
3447 decode(pool_name, p);
3448
3449 decode(blocklist, p);
3450 if (ev >= 6)
3451 decode(osd_addrs->cluster_addrs, p);
3452 else
3453 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
3454
3455 if (ev >= 7) {
3456 decode(cluster_snapshot_epoch, p);
3457 decode(cluster_snapshot, p);
3458 }
3459
3460 if (ev >= 8) {
3461 decode(*osd_uuid, p);
3462 } else {
3463 osd_uuid->resize(max_osd);
3464 }
3465 if (ev >= 9)
3466 decode(osd_xinfo, p);
3467 else
3468 osd_xinfo.resize(max_osd);
3469
3470 if (ev >= 10)
3471 decode(osd_addrs->hb_front_addrs, p);
3472 else
3473 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
3474
3475 osd_primary_affinity.reset();
3476
3477 post_decode();
3478 }
3479
3480 void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
3481 {
3482 using ceph::decode;
3483 /**
3484 * Older encodings of the OSDMap had a single struct_v which
3485 * covered the whole encoding, and was prior to our modern
3486 * stuff which includes a compatv and a size. So if we see
3487 * a struct_v < 7, we must rewind to the beginning and use our
3488 * classic decoder.
3489 */
3490 size_t start_offset = bl.get_off();
3491 size_t tail_offset = 0;
3492 ceph::buffer::list crc_front, crc_tail;
3493
3494 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3495 if (struct_v < 7) {
3496 bl.seek(start_offset);
3497 decode_classic(bl);
3498 return;
3499 }
3500 /**
3501 * Since we made it past that hurdle, we can use our normal paths.
3502 */
3503 {
3504 DECODE_START(9, bl); // client-usable data
3505 // base
3506 decode(fsid, bl);
3507 decode(epoch, bl);
3508 decode(created, bl);
3509 decode(modified, bl);
3510
3511 decode(pools, bl);
3512 decode(pool_name, bl);
3513 decode(pool_max, bl);
3514
3515 decode(flags, bl);
3516
3517 decode(max_osd, bl);
3518 if (struct_v >= 5) {
3519 decode(osd_state, bl);
3520 } else {
3521 vector<uint8_t> os;
3522 decode(os, bl);
3523 osd_state.resize(os.size());
3524 for (unsigned i = 0; i < os.size(); ++i) {
3525 osd_state[i] = os[i];
3526 }
3527 }
3528 decode(osd_weight, bl);
3529 decode(osd_addrs->client_addrs, bl);
3530
3531 decode(*pg_temp, bl);
3532 decode(*primary_temp, bl);
3533 // dates back to firefly. version increased from 2 to 3 still in firefly.
3534 // do we really still need to keep this around? even for old clients?
3535 if (struct_v >= 2) {
3536 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
3537 decode(*osd_primary_affinity, bl);
3538 if (osd_primary_affinity->empty())
3539 osd_primary_affinity.reset();
3540 } else {
3541 osd_primary_affinity.reset();
3542 }
3543
3544 // crush
3545 ceph::buffer::list cbl;
3546 decode(cbl, bl);
3547 auto cblp = cbl.cbegin();
3548 crush->decode(cblp);
3549 // added in firefly; version increased in luminous, so it affects
3550 // giant, hammer, infernalis, jewel, and kraken. probably should be left
3551 // alone until we require clients to be all luminous?
3552 if (struct_v >= 3) {
3553 decode(erasure_code_profiles, bl);
3554 } else {
3555 erasure_code_profiles.clear();
3556 }
3557 // version increased from 3 to 4 still in luminous, so same as above
3558 // applies.
3559 if (struct_v >= 4) {
3560 decode(pg_upmap, bl);
3561 decode(pg_upmap_items, bl);
3562 } else {
3563 pg_upmap.clear();
3564 pg_upmap_items.clear();
3565 }
3566 // again, version increased from 5 to 6 still in luminous, so above
3567 // applies.
3568 if (struct_v >= 6) {
3569 decode(crush_version, bl);
3570 }
3571 // version increase from 6 to 7 in mimic
3572 if (struct_v >= 7) {
3573 decode(new_removed_snaps, bl);
3574 decode(new_purged_snaps, bl);
3575 }
3576 // version increase from 7 to 8, 8 to 9, in nautilus.
3577 if (struct_v >= 9) {
3578 decode(last_up_change, bl);
3579 decode(last_in_change, bl);
3580 }
3581 if (struct_v >= 10) {
3582 decode(pg_upmap_primaries, bl);
3583 } else {
3584 pg_upmap_primaries.clear();
3585 }
3586 DECODE_FINISH(bl); // client-usable data
3587 }
3588
3589 {
3590 DECODE_START(10, bl); // extended, osd-only data
3591 decode(osd_addrs->hb_back_addrs, bl);
3592 decode(osd_info, bl);
3593 decode(blocklist, bl);
3594 decode(osd_addrs->cluster_addrs, bl);
3595 decode(cluster_snapshot_epoch, bl);
3596 decode(cluster_snapshot, bl);
3597 decode(*osd_uuid, bl);
3598 decode(osd_xinfo, bl);
3599 decode(osd_addrs->hb_front_addrs, bl);
3600 //
3601 if (struct_v >= 2) {
3602 decode(nearfull_ratio, bl);
3603 decode(full_ratio, bl);
3604 } else {
3605 nearfull_ratio = 0;
3606 full_ratio = 0;
3607 }
3608 if (struct_v >= 3) {
3609 decode(backfillfull_ratio, bl);
3610 } else {
3611 backfillfull_ratio = 0;
3612 }
3613 if (struct_v == 4) {
3614 string r;
3615 decode(r, bl);
3616 if (r.length())
3617 require_min_compat_client = ceph_release_from_name(r.c_str());
3618 }
3619 if (struct_v >= 5) {
3620 decode(require_min_compat_client, bl);
3621 decode(require_osd_release, bl);
3622 if (require_osd_release >= ceph_release_t::nautilus) {
3623 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3624 }
3625 if (require_osd_release >= ceph_release_t::luminous) {
3626 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
3627 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
3628 }
3629 } else {
3630 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3631 // only for compat with post-kraken pre-luminous test clusters
3632 require_osd_release = ceph_release_t::luminous;
3633 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
3634 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
3635 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
3636 require_osd_release = ceph_release_t::kraken;
3637 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
3638 require_osd_release = ceph_release_t::jewel;
3639 } else {
3640 require_osd_release = ceph_release_t::unknown;
3641 }
3642 }
3643 if (struct_v >= 6) {
3644 decode(removed_snaps_queue, bl);
3645 }
3646 if (struct_v >= 8) {
3647 decode(crush_node_flags, bl);
3648 } else {
3649 crush_node_flags.clear();
3650 }
3651 if (struct_v >= 9) {
3652 decode(device_class_flags, bl);
3653 } else {
3654 device_class_flags.clear();
3655 }
3656 if (struct_v >= 10) {
3657 decode(stretch_mode_enabled, bl);
3658 decode(stretch_bucket_count, bl);
3659 decode(degraded_stretch_mode, bl);
3660 decode(recovering_stretch_mode, bl);
3661 decode(stretch_mode_bucket, bl);
3662 } else {
3663 stretch_mode_enabled = false;
3664 stretch_bucket_count = 0;
3665 degraded_stretch_mode = 0;
3666 recovering_stretch_mode = 0;
3667 stretch_mode_bucket = 0;
3668 }
3669 if (struct_v >= 11) {
3670 decode(range_blocklist, bl);
3671 calculated_ranges.clear();
3672 for (const auto& i : range_blocklist) {
3673 calculated_ranges.emplace(i.first, i.first);
3674 }
3675 }
3676 if (struct_v >= 12) {
3677 decode(allow_crimson, bl);
3678 }
3679 DECODE_FINISH(bl); // osd-only data
3680 }
3681
3682 if (struct_v >= 8) {
3683 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
3684 decode(crc, bl);
3685 tail_offset = bl.get_off();
3686 crc_defined = true;
3687 } else {
3688 crc_defined = false;
3689 crc = 0;
3690 }
3691
3692 DECODE_FINISH(bl); // wrapper
3693
3694 if (tail_offset) {
3695 // verify crc
3696 uint32_t actual = crc_front.crc32c(-1);
3697 if (tail_offset < bl.get_off()) {
3698 ceph::buffer::list tail;
3699 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3700 actual = tail.crc32c(actual);
3701 }
3702 if (crc != actual) {
3703 ostringstream ss;
3704 ss << "bad crc, actual " << actual << " != expected " << crc;
3705 string s = ss.str();
3706 throw ceph::buffer::malformed_input(s.c_str());
3707 }
3708 }
3709
3710 post_decode();
3711 }
3712
3713 void OSDMap::post_decode()
3714 {
3715 // index pool names
3716 name_pool.clear();
3717 for (const auto &pname : pool_name) {
3718 name_pool[pname.second] = pname.first;
3719 }
3720
3721 calc_num_osds();
3722 _calc_up_osd_features();
3723 }
3724
3725 void OSDMap::dump_erasure_code_profiles(
3726 const mempool::osdmap::map<string,map<string,string>>& profiles,
3727 Formatter *f)
3728 {
3729 f->open_object_section("erasure_code_profiles");
3730 for (const auto &profile : profiles) {
3731 f->open_object_section(profile.first.c_str());
3732 for (const auto &profm : profile.second) {
3733 f->dump_string(profm.first.c_str(), profm.second);
3734 }
3735 f->close_section();
3736 }
3737 f->close_section();
3738 }
3739
3740 void OSDMap::dump_osds(Formatter *f) const
3741 {
3742 f->open_array_section("osds");
3743 for (int i=0; i<get_max_osd(); i++) {
3744 if (exists(i)) {
3745 dump_osd(i, f);
3746 }
3747 }
3748 f->close_section();
3749 }
3750
3751 void OSDMap::dump_osd(int id, Formatter *f) const
3752 {
3753 ceph_assert(f != nullptr);
3754 if (!exists(id)) {
3755 return;
3756 }
3757
3758 f->open_object_section("osd_info");
3759 f->dump_int("osd", id);
3760 f->dump_stream("uuid") << get_uuid(id);
3761 f->dump_int("up", is_up(id));
3762 f->dump_int("in", is_in(id));
3763 f->dump_float("weight", get_weightf(id));
3764 f->dump_float("primary_affinity", get_primary_affinityf(id));
3765 get_info(id).dump(f);
3766 f->dump_object("public_addrs", get_addrs(id));
3767 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3768 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3769 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3770 // compat
3771 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3772 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3773 f->dump_stream("heartbeat_back_addr")
3774 << get_hb_back_addrs(id).get_legacy_str();
3775 f->dump_stream("heartbeat_front_addr")
3776 << get_hb_front_addrs(id).get_legacy_str();
3777
3778 set<string> st;
3779 get_state(id, st);
3780 f->open_array_section("state");
3781 for (const auto &state : st)
3782 f->dump_string("state", state);
3783 f->close_section();
3784
3785 f->close_section();
3786 }
3787
3788 void OSDMap::dump_pool(CephContext *cct,
3789 int64_t pid,
3790 const pg_pool_t &pdata,
3791 ceph::Formatter *f) const
3792 {
3793 std::string name("<unknown>");
3794 const auto &pni = pool_name.find(pid);
3795 if (pni != pool_name.end())
3796 name = pni->second;
3797 f->open_object_section("pool");
3798 f->dump_int("pool", pid);
3799 f->dump_string("pool_name", name);
3800 pdata.dump(f);
3801 dump_read_balance_score(cct, pid, pdata, f);
3802 f->close_section(); // pool
3803 }
3804
3805 void OSDMap::dump_read_balance_score(CephContext *cct,
3806 int64_t pid,
3807 const pg_pool_t &pdata,
3808 ceph::Formatter *f) const
3809 {
3810 if (pdata.is_replicated()) {
3811 // Add rb section with values for score, optimal score, raw score
3812 // and primary_affinity average
3813 OSDMap::read_balance_info_t rb_info;
3814 auto rc = calc_read_balance_score(cct, pid, &rb_info);
3815 if (rc >= 0) {
3816 f->open_object_section("read_balance");
3817 f->dump_float("score_acting", rb_info.acting_adj_score);
3818 f->dump_float("score_stable", rb_info.adjusted_score);
3819 f->dump_float("optimal_score", rb_info.optimal_score);
3820 f->dump_float("raw_score_acting", rb_info.acting_raw_score);
3821 f->dump_float("raw_score_stable", rb_info.raw_score);
3822 f->dump_float("primary_affinity_weighted", rb_info.pa_weighted);
3823 f->dump_float("average_primary_affinity", rb_info.pa_avg);
3824 f->dump_float("average_primary_affinity_weighted", rb_info.pa_weighted_avg);
3825 if (rb_info.err_msg.length() > 0) {
3826 f->dump_string("error_message", rb_info.err_msg);
3827 }
3828 f->close_section(); // read_balance
3829 }
3830 else {
3831 if (rb_info.err_msg.length() > 0) {
3832 f->open_object_section("read_balance");
3833 f->dump_string("error_message", rb_info.err_msg);
3834 f->dump_float("score_acting", rb_info.acting_adj_score);
3835 f->dump_float("score_stable", rb_info.adjusted_score);
3836 f->close_section(); // read_balance
3837 }
3838 }
3839 }
3840 }
3841
3842 void OSDMap::dump(Formatter *f, CephContext *cct) const
3843 {
3844 f->dump_int("epoch", get_epoch());
3845 f->dump_stream("fsid") << get_fsid();
3846 f->dump_stream("created") << get_created();
3847 f->dump_stream("modified") << get_modified();
3848 f->dump_stream("last_up_change") << last_up_change;
3849 f->dump_stream("last_in_change") << last_in_change;
3850 f->dump_string("flags", get_flag_string());
3851 f->dump_unsigned("flags_num", flags);
3852 f->open_array_section("flags_set");
3853 set<string> flagset;
3854 get_flag_set(&flagset);
3855 for (auto p : flagset) {
3856 f->dump_string("flag", p);
3857 }
3858 f->close_section();
3859 f->dump_unsigned("crush_version", get_crush_version());
3860 f->dump_float("full_ratio", full_ratio);
3861 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3862 f->dump_float("nearfull_ratio", nearfull_ratio);
3863 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3864 f->dump_int("pool_max", get_pool_max());
3865 f->dump_int("max_osd", get_max_osd());
3866 f->dump_string("require_min_compat_client",
3867 to_string(require_min_compat_client));
3868 f->dump_string("min_compat_client",
3869 to_string(get_min_compat_client()));
3870 f->dump_string("require_osd_release",
3871 to_string(require_osd_release));
3872
3873 f->dump_bool("allow_crimson", allow_crimson);
3874 f->open_array_section("pools");
3875 for (const auto &[pid, pdata] : pools) {
3876 dump_pool(cct, pid, pdata, f);
3877 }
3878 f->close_section();
3879
3880 dump_osds(f);
3881
3882 f->open_array_section("osd_xinfo");
3883 for (int i=0; i<get_max_osd(); i++) {
3884 if (exists(i)) {
3885 f->open_object_section("xinfo");
3886 f->dump_int("osd", i);
3887 osd_xinfo[i].dump(f);
3888 f->close_section();
3889 }
3890 }
3891 f->close_section();
3892
3893 f->open_array_section("pg_upmap");
3894 for (auto& p : pg_upmap) {
3895 f->open_object_section("mapping");
3896 f->dump_stream("pgid") << p.first;
3897 f->open_array_section("osds");
3898 for (auto q : p.second) {
3899 f->dump_int("osd", q);
3900 }
3901 f->close_section();
3902 f->close_section();
3903 }
3904 f->close_section();
3905
3906 f->open_array_section("pg_upmap_items");
3907 for (auto& [pgid, mappings] : pg_upmap_items) {
3908 f->open_object_section("mapping");
3909 f->dump_stream("pgid") << pgid;
3910 f->open_array_section("mappings");
3911 for (auto& [from, to] : mappings) {
3912 f->open_object_section("mapping");
3913 f->dump_int("from", from);
3914 f->dump_int("to", to);
3915 f->close_section();
3916 }
3917 f->close_section();
3918 f->close_section();
3919 }
3920 f->close_section();
3921
3922 f->open_array_section("pg_upmap_primaries");
3923 for (const auto& [pg, osd] : pg_upmap_primaries) {
3924 f->open_object_section("primary_mapping");
3925 f->dump_stream("pgid") << pg;
3926 f->dump_int("primary_osd", osd);
3927 f->close_section();
3928 }
3929 f->close_section(); // pg_upmap_primaries
3930
3931 f->open_array_section("pg_temp");
3932 pg_temp->dump(f);
3933 f->close_section();
3934
3935 f->open_array_section("primary_temp");
3936 for (const auto &pg : *primary_temp) {
3937 f->dump_stream("pgid") << pg.first;
3938 f->dump_int("osd", pg.second);
3939 }
3940 f->close_section(); // primary_temp
3941
3942 f->open_object_section("blocklist");
3943 for (const auto &addr : blocklist) {
3944 stringstream ss;
3945 ss << addr.first;
3946 f->dump_stream(ss.str().c_str()) << addr.second;
3947 }
3948 f->close_section();
3949 f->open_object_section("range_blocklist");
3950 for (const auto &addr : range_blocklist) {
3951 stringstream ss;
3952 ss << addr.first;
3953 f->dump_stream(ss.str().c_str()) << addr.second;
3954 }
3955 f->close_section();
3956
3957 dump_erasure_code_profiles(erasure_code_profiles, f);
3958
3959 f->open_array_section("removed_snaps_queue");
3960 for (auto& p : removed_snaps_queue) {
3961 f->open_object_section("pool");
3962 f->dump_int("pool", p.first);
3963 f->open_array_section("snaps");
3964 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3965 f->open_object_section("interval");
3966 f->dump_unsigned("begin", q.get_start());
3967 f->dump_unsigned("length", q.get_len());
3968 f->close_section();
3969 }
3970 f->close_section();
3971 f->close_section();
3972 }
3973 f->close_section();
3974 f->open_array_section("new_removed_snaps");
3975 for (auto& p : new_removed_snaps) {
3976 f->open_object_section("pool");
3977 f->dump_int("pool", p.first);
3978 f->open_array_section("snaps");
3979 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3980 f->open_object_section("interval");
3981 f->dump_unsigned("begin", q.get_start());
3982 f->dump_unsigned("length", q.get_len());
3983 f->close_section();
3984 }
3985 f->close_section();
3986 f->close_section();
3987 }
3988 f->close_section();
3989 f->open_array_section("new_purged_snaps");
3990 for (auto& p : new_purged_snaps) {
3991 f->open_object_section("pool");
3992 f->dump_int("pool", p.first);
3993 f->open_array_section("snaps");
3994 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3995 f->open_object_section("interval");
3996 f->dump_unsigned("begin", q.get_start());
3997 f->dump_unsigned("length", q.get_len());
3998 f->close_section();
3999 }
4000 f->close_section();
4001 f->close_section();
4002 }
4003 f->close_section();
4004 f->open_object_section("crush_node_flags");
4005 for (auto& i : crush_node_flags) {
4006 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
4007 : stringify(i.first);
4008 f->open_array_section(s.c_str());
4009 set<string> st;
4010 calc_state_set(i.second, st);
4011 for (auto& j : st) {
4012 f->dump_string("flag", j);
4013 }
4014 f->close_section();
4015 }
4016 f->close_section();
4017 f->open_object_section("device_class_flags");
4018 for (auto& i : device_class_flags) {
4019 const char* class_name = crush->get_class_name(i.first);
4020 string s = class_name ? class_name : stringify(i.first);
4021 f->open_array_section(s.c_str());
4022 set<string> st;
4023 calc_state_set(i.second, st);
4024 for (auto& j : st) {
4025 f->dump_string("flag", j);
4026 }
4027 f->close_section();
4028 }
4029 f->close_section();
4030 f->open_object_section("stretch_mode");
4031 {
4032 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
4033 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
4034 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
4035 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
4036 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
4037 }
4038 f->close_section();
4039 }
4040
4041 void OSDMap::generate_test_instances(list<OSDMap*>& o)
4042 {
4043 o.push_back(new OSDMap);
4044
4045 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
4046 o.push_back(new OSDMap);
4047 uuid_d fsid;
4048 o.back()->build_simple(cct, 1, fsid, 16);
4049 o.back()->created = o.back()->modified = utime_t(1, 2); // fixed timestamp, for deterministic test instances
4050 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
4051 cct->put();
4052 }
4053
4054 string OSDMap::get_flag_string(unsigned f)
4055 {
4056 string s;
4057 if (f & CEPH_OSDMAP_PAUSERD)
4058 s += ",pauserd";
4059 if (f & CEPH_OSDMAP_PAUSEWR)
4060 s += ",pausewr";
4061 if (f & CEPH_OSDMAP_PAUSEREC)
4062 s += ",pauserec";
4063 if (f & CEPH_OSDMAP_NOUP)
4064 s += ",noup";
4065 if (f & CEPH_OSDMAP_NODOWN)
4066 s += ",nodown";
4067 if (f & CEPH_OSDMAP_NOOUT)
4068 s += ",noout";
4069 if (f & CEPH_OSDMAP_NOIN)
4070 s += ",noin";
4071 if (f & CEPH_OSDMAP_NOBACKFILL)
4072 s += ",nobackfill";
4073 if (f & CEPH_OSDMAP_NOREBALANCE)
4074 s += ",norebalance";
4075 if (f & CEPH_OSDMAP_NORECOVER)
4076 s += ",norecover";
4077 if (f & CEPH_OSDMAP_NOSCRUB)
4078 s += ",noscrub";
4079 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
4080 s += ",nodeep-scrub";
4081 if (f & CEPH_OSDMAP_NOTIERAGENT)
4082 s += ",notieragent";
4083 if (f & CEPH_OSDMAP_NOSNAPTRIM)
4084 s += ",nosnaptrim";
4085 if (f & CEPH_OSDMAP_SORTBITWISE)
4086 s += ",sortbitwise";
4087 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
4088 s += ",require_jewel_osds";
4089 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
4090 s += ",require_kraken_osds";
4091 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
4092 s += ",require_luminous_osds";
4093 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
4094 s += ",recovery_deletes";
4095 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
4096 s += ",purged_snapdirs";
4097 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
4098 s += ",pglog_hardlimit";
4099 if (f & CEPH_OSDMAP_NOAUTOSCALE)
4100 s += ",noautoscale";
4101 if (s.length())
4102 s.erase(0, 1);
4103 return s;
4104 }
4105
4106 string OSDMap::get_flag_string() const
4107 {
4108 return get_flag_string(flags);
4109 }
4110
4111 void OSDMap::print_pools(CephContext *cct, ostream& out) const
4112 {
4113 for (const auto &[pid, pdata] : pools) {
4114 std::string name("<unknown>");
4115 const auto &pni = pool_name.find(pid);
4116 if (pni != pool_name.end())
4117 name = pni->second;
4118 char rb_score_str[32] = "";
4119 int rc = 0;
4120 read_balance_info_t rb_info;
4121 if (pdata.is_replicated()) {
4122 rc = calc_read_balance_score(cct, pid, &rb_info);
4123 if (rc >= 0)
4124 snprintf (rb_score_str, sizeof(rb_score_str),
4125 " read_balance_score %.2f", rb_info.acting_adj_score);
4126 }
4127
4128 out << "pool " << pid
4129 << " '" << name
4130 << "' " << pdata
4131 << rb_score_str << "\n";
4132 if (rb_info.err_msg.length() > 0) {
4133 out << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << "\n";
4134 }
4135
4136 //TODO - print error messages here.
4137
4138 for (const auto &snap : pdata.snaps)
4139 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
4140
4141 if (!pdata.removed_snaps.empty())
4142 out << "\tremoved_snaps " << pdata.removed_snaps << "\n";
4143 auto p = removed_snaps_queue.find(pid);
4144 if (p != removed_snaps_queue.end()) {
4145 out << "\tremoved_snaps_queue " << p->second << "\n";
4146 }
4147 }
4148 out << std::endl;
4149 }
4150
4151 void OSDMap::print_osds(ostream& out) const
4152 {
4153 for (int i=0; i<get_max_osd(); i++) {
4154 if (exists(i)) {
4155 print_osd(i, out);
4156 }
4157 }
4158 }
4159 void OSDMap::print_osd(int id, ostream& out) const
4160 {
4161 if (!exists(id)) {
4162 return;
4163 }
4164
4165 out << "osd." << id;
4166 out << (is_up(id) ? " up ":" down");
4167 out << (is_in(id) ? " in ":" out");
4168 out << " weight " << get_weightf(id);
4169 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
4170 out << " primary_affinity " << get_primary_affinityf(id);
4171 }
4172 const osd_info_t& info(get_info(id));
4173 out << " " << info;
4174 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
4175 set<string> st;
4176 get_state(id, st);
4177 out << " " << st;
4178 if (!get_uuid(id).is_zero()) {
4179 out << " " << get_uuid(id);
4180 }
4181 out << "\n";
4182 }
4183
4184 void OSDMap::print(CephContext *cct, ostream& out) const
4185 {
4186 out << "epoch " << get_epoch() << "\n"
4187 << "fsid " << get_fsid() << "\n"
4188 << "created " << get_created() << "\n"
4189 << "modified " << get_modified() << "\n";
4190
4191 out << "flags " << get_flag_string() << "\n";
4192 out << "crush_version " << get_crush_version() << "\n";
4193 out << "full_ratio " << full_ratio << "\n";
4194 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
4195 out << "nearfull_ratio " << nearfull_ratio << "\n";
4196 if (require_min_compat_client != ceph_release_t::unknown) {
4197 out << "require_min_compat_client "
4198 << require_min_compat_client << "\n";
4199 }
4200 out << "min_compat_client " << get_min_compat_client()
4201 << "\n";
4202 if (require_osd_release > ceph_release_t::unknown) {
4203 out << "require_osd_release " << require_osd_release
4204 << "\n";
4205 }
4206 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
4207 if (stretch_mode_enabled) {
4208 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
4209 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
4210 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
4211 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
4212 }
4213 if (get_cluster_snapshot().length())
4214 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
4215 if (allow_crimson) {
4216 out << "allow_crimson=true\n";
4217 }
4218 out << "\n";
4219
4220 print_pools(cct, out);
4221
4222 out << "max_osd " << get_max_osd() << "\n";
4223 print_osds(out);
4224 out << std::endl;
4225
4226 for (auto& p : pg_upmap) {
4227 out << "pg_upmap " << p.first << " " << p.second << "\n";
4228 }
4229 for (auto& p : pg_upmap_items) {
4230 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
4231 }
4232
4233 for (auto& [pg, osd] : pg_upmap_primaries) {
4234 out << "pg_upmap_primary " << pg << " " << osd << "\n";
4235 }
4236
4237 for (const auto& pg : *pg_temp)
4238 out << "pg_temp " << pg.first << " " << pg.second << "\n";
4239
4240 for (const auto& pg : *primary_temp)
4241 out << "primary_temp " << pg.first << " " << pg.second << "\n";
4242
4243 for (const auto &addr : blocklist)
4244 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
4245 for (const auto &addr : range_blocklist)
4246 out << "range blocklist " << addr.first << " expires " << addr.second << "\n";
4247 }
4248
4249 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
4250 public:
4251 typedef CrushTreeDumper::Dumper<TextTable> Parent;
4252
4253 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4254 unsigned f)
4255 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
4256
4257 bool should_dump_leaf(int i) const override {
4258 if (!filter) {
4259 return true; // normal case
4260 }
4261 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4262 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4263 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4264 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4265 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4266 return true;
4267 }
4268 return false;
4269 }
4270
4271 bool should_dump_empty_bucket() const override {
4272 return !filter;
4273 }
4274
4275 void init_table(TextTable *tbl) {
4276 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
4277 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
4278 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4279 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
4280 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
4281 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
4282 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
4283 }
4284 void dump(TextTable *tbl, string& bucket) {
4285 init_table(tbl);
4286
4287 if (!bucket.empty()) {
4288 set_root(bucket);
4289 Parent::dump(tbl);
4290 } else {
4291 Parent::dump(tbl);
4292 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4293 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
4294 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
4295 }
4296 }
4297 }
4298 }
4299
4300 protected:
4301 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
4302 const char *c = crush->get_item_class(qi.id);
4303 if (!c)
4304 c = "";
4305 *tbl << qi.id
4306 << c
4307 << weightf_t(qi.weight);
4308
4309 ostringstream name;
4310 for (int k = 0; k < qi.depth; k++)
4311 name << " ";
4312 if (qi.is_bucket()) {
4313 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
4314 << crush->get_item_name(qi.id);
4315 } else {
4316 name << "osd." << qi.id;
4317 }
4318 *tbl << name.str();
4319
4320 if (!qi.is_bucket()) {
4321 if (!osdmap->exists(qi.id)) {
4322 *tbl << "DNE"
4323 << 0;
4324 } else {
4325 string s;
4326 if (osdmap->is_up(qi.id)) {
4327 s = "up";
4328 } else if (osdmap->is_destroyed(qi.id)) {
4329 s = "destroyed";
4330 } else {
4331 s = "down";
4332 }
4333 *tbl << s
4334 << weightf_t(osdmap->get_weightf(qi.id))
4335 << weightf_t(osdmap->get_primary_affinityf(qi.id));
4336 }
4337 }
4338 *tbl << TextTable::endrow;
4339 }
4340
4341 private:
4342 const OSDMap *osdmap;
4343 const unsigned filter;
4344 };
4345
4346 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4347 public:
4348 typedef CrushTreeDumper::FormattingDumper Parent;
4349
4350 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4351 unsigned f)
4352 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
4353
4354 bool should_dump_leaf(int i) const override {
4355 if (!filter) {
4356 return true; // normal case
4357 }
4358 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4359 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4360 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4361 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4362 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4363 return true;
4364 }
4365 return false;
4366 }
4367
4368 bool should_dump_empty_bucket() const override {
4369 return !filter;
4370 }
4371
4372 void dump(Formatter *f, string& bucket) {
4373 if (!bucket.empty()) {
4374 set_root(bucket);
4375 f->open_array_section("nodes");
4376 Parent::dump(f);
4377 f->close_section();
4378 } else {
4379 f->open_array_section("nodes");
4380 Parent::dump(f);
4381 f->close_section();
4382 f->open_array_section("stray");
4383 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4384 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4385 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4386 }
4387 f->close_section();
4388 }
4389 }
4390
4391 protected:
4392 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4393 Parent::dump_item_fields(qi, f);
4394 if (!qi.is_bucket())
4395 {
4396 string s;
4397 if (osdmap->is_up(qi.id)) {
4398 s = "up";
4399 } else if (osdmap->is_destroyed(qi.id)) {
4400 s = "destroyed";
4401 } else {
4402 s = "down";
4403 }
4404 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
4405 f->dump_string("status", s);
4406 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4407 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4408 }
4409 }
4410
4411 private:
4412 const OSDMap *osdmap;
4413 const unsigned filter;
4414 };
4415
4416 void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
4417 {
4418 if (f) {
4419 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
4420 } else {
4421 ceph_assert(out);
4422 TextTable tbl;
4423 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
4424 *out << tbl;
4425 }
4426 }
4427
4428 void OSDMap::print_summary(Formatter *f, ostream& out,
4429 const string& prefix, bool extra) const
4430 {
4431 if (f) {
4432 f->dump_int("epoch", get_epoch());
4433 f->dump_int("num_osds", get_num_osds());
4434 f->dump_int("num_up_osds", get_num_up_osds());
4435 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
4436 f->dump_int("num_in_osds", get_num_in_osds());
4437 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
4438 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
4439 } else {
4440 utime_t now = ceph_clock_now();
4441 out << get_num_osds() << " osds: "
4442 << get_num_up_osds() << " up";
4443 if (last_up_change != utime_t()) {
4444 out << " (since " << utimespan_str(now - last_up_change) << ")";
4445 }
4446 out << ", " << get_num_in_osds() << " in";
4447 if (last_in_change != utime_t()) {
4448 out << " (since " << utimespan_str(now - last_in_change) << ")";
4449 }
4450 if (extra)
4451 out << "; epoch: e" << get_epoch();
4452 if (get_num_pg_temp())
4453 out << "; " << get_num_pg_temp() << " remapped pgs";
4454 out << "\n";
4455 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4456 if (important_flags)
4457 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
4458 }
4459 }
4460
4461 void OSDMap::print_oneline_summary(ostream& out) const
4462 {
4463 out << "e" << get_epoch() << ": "
4464 << get_num_osds() << " total, "
4465 << get_num_up_osds() << " up, "
4466 << get_num_in_osds() << " in";
4467 }
4468
4469 bool OSDMap::crush_rule_in_use(int rule_id) const
4470 {
4471 for (const auto &pool : pools) {
4472 if (pool.second.crush_rule == rule_id)
4473 return true;
4474 }
4475 return false;
4476 }
4477
4478 int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4479 ostream *ss) const
4480 {
4481 for (auto& i : pools) {
4482 auto& pool = i.second;
4483 int ruleno = pool.get_crush_rule();
4484 if (!newcrush->rule_exists(ruleno)) {
4485 *ss << "pool " << i.first << " references crush_rule " << ruleno
4486 << " but it is not present";
4487 return -EINVAL;
4488 }
4489 if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) {
4490 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4491 return -EINVAL;
4492 }
4493 }
4494 return 0;
4495 }
4496
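// Build a minimal OSDMap: when nosd >= 0 that many OSDs are created; when
// nosd < 0 the OSD ids are taken from the [osd.N] sections of the
// configuration. A simple CRUSH map is generated, the "default"
// erasure-code profile is registered, and, when default_pool is set, a
// default replicated "rbd" pool is created.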
4497 int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4498 int nosd, int pg_bits, int pgp_bits,
4499 bool default_pool)
4500 {
4501 ldout(cct, 10) << "build_simple on " << nosd
4502 << " osds" << dendl;
4503 epoch = e;
4504 set_fsid(fsid);
4505 created = modified = ceph_clock_now();
4506
4507 if (nosd >= 0) {
4508 set_max_osd(nosd);
4509 } else {
4510 // count osds
4511 int maxosd = 0;
4512 const auto& conf = cct->_conf;
4513 vector<string> sections;
4514 conf.get_all_sections(sections);
4515
4516 for (auto &section : sections) {
4517 if (section.find("osd.") != 0)
4518 continue;
4519
4520 const char *begin = section.c_str() + 4;
4521 char *end = (char*)begin;
4522 int o = strtol(begin, &end, 10);
4523 if (*end != '\0')
4524 continue;
4525
4526 if (o > cct->_conf->mon_max_osd) {
4527 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4528 return -ERANGE;
4529 }
4530
4531 if (o > maxosd)
4532 maxosd = o;
4533 }
4534
4535 set_max_osd(maxosd + 1);
4536 }
4537
4538
4539 stringstream ss;
4540 int r;
4541 if (nosd >= 0)
4542 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4543 else
4544 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
4545 ceph_assert(r == 0);
4546
4547 int poolbase = get_max_osd() ? get_max_osd() : 1;
4548
4549 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_rule(cct);
4550 ceph_assert(default_replicated_rule >= 0);
4551
4552 if (default_pool) {
4553 // pgp_num <= pg_num
4554 if (pgp_bits > pg_bits)
4555 pgp_bits = pg_bits;
4556
4557 vector<string> pool_names;
4558 pool_names.push_back("rbd");
4559 for (auto &plname : pool_names) {
4560 int64_t pool = ++pool_max;
4561 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4562 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4563 if (cct->_conf->osd_pool_default_flag_hashpspool)
4564 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4565 if (cct->_conf->osd_pool_default_flag_nodelete)
4566 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4567 if (cct->_conf->osd_pool_default_flag_nopgchange)
4568 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4569 if (cct->_conf->osd_pool_default_flag_nosizechange)
4570 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
4571 if (cct->_conf->osd_pool_default_flag_bulk)
4572 pools[pool].set_flag(pg_pool_t::FLAG_BULK);
4573 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4574 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4575 pools[pool].size);
4576 pools[pool].crush_rule = default_replicated_rule;
4577 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4578 pools[pool].set_pg_num(poolbase << pg_bits);
4579 pools[pool].set_pgp_num(poolbase << pgp_bits);
4580 pools[pool].set_pg_num_target(poolbase << pg_bits);
4581 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
4582 pools[pool].last_change = epoch;
4583 pools[pool].application_metadata.insert(
4584 {pg_pool_t::APPLICATION_NAME_RBD, {}});
4585 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4586 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4587 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4588 pools[pool].pg_autoscale_mode = m;
4589 } else {
4590 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4591 }
4592 pool_name[pool] = plname;
4593 name_pool[plname] = pool;
4594 }
4595 }
4596
4597 map<string,string> profile_map;
4598 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4599 if (r < 0) {
4600 lderr(cct) << ss.str() << dendl;
4601 return r;
4602 }
4603 set_erasure_code_profile("default", profile_map);
4604 return 0;
4605 }
4606
4607 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4608 map<string,string> &profile_map,
4609 ostream *ss)
4610 {
4611 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
4612 *ss,
4613 &profile_map);
4614 return r;
4615 }
4616
4617 int OSDMap::_build_crush_types(CrushWrapper& crush)
4618 {
4619 crush.set_type_name(0, "osd");
4620 crush.set_type_name(1, "host");
4621 crush.set_type_name(2, "chassis");
4622 crush.set_type_name(3, "rack");
4623 crush.set_type_name(4, "row");
4624 crush.set_type_name(5, "pdu");
4625 crush.set_type_name(6, "pod");
4626 crush.set_type_name(7, "room");
4627 crush.set_type_name(8, "datacenter");
4628 crush.set_type_name(9, "zone");
4629 crush.set_type_name(10, "region");
4630 crush.set_type_name(11, "root");
4631 return 11;
4632 }
4633
4634 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4635 int nosd, ostream *ss)
4636 {
4637 crush.create();
4638
4639 // root
4640 int root_type = _build_crush_types(crush);
4641 int rootid;
4642 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4643 root_type, 0, NULL, NULL, &rootid);
4644 ceph_assert(r == 0);
4645 crush.set_item_name(rootid, "default");
4646
4647 map<string,string> loc{
4648 {"host", "localhost"},
4649 {"rack", "localrack"},
4650 {"root", "default"}
4651 };
4652 for (int o=0; o<nosd; o++) {
4653 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4654 char name[32];
4655 snprintf(name, sizeof(name), "osd.%d", o);
4656 crush.insert_item(cct, o, 1.0, name, loc);
4657 }
4658
4659 build_simple_crush_rules(cct, crush, "default", ss);
4660
4661 crush.finalize();
4662
4663 return 0;
4664 }
4665
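// Like build_simple_crush_map(), but discovers the OSDs from the [osd.N]
// sections of the configuration and places each one according to its
// host/rack/row/room/datacenter settings (falling back to "unknownhost"
// and "unknownrack"), all under the "default" root.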
4666 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4667 CrushWrapper& crush,
4668 ostream *ss)
4669 {
4670 const auto& conf = cct->_conf;
4671
4672 crush.create();
4673
4674 // root
4675 int root_type = _build_crush_types(crush);
4676 int rootid;
4677 int r = crush.add_bucket(0, 0,
4678 CRUSH_HASH_DEFAULT,
4679 root_type, 0, NULL, NULL, &rootid);
4680 ceph_assert(r == 0);
4681 crush.set_item_name(rootid, "default");
4682
4683 // add osds
4684 vector<string> sections;
4685 conf.get_all_sections(sections);
4686
4687 for (auto &section : sections) {
4688 if (section.find("osd.") != 0)
4689 continue;
4690
4691 const char *begin = section.c_str() + 4;
4692 char *end = (char*)begin;
4693 int o = strtol(begin, &end, 10);
4694 if (*end != '\0')
4695 continue;
4696
4697 string host, rack, row, room, dc, pool;
4698 vector<string> sectiontmp;
4699 sectiontmp.push_back("osd");
4700 sectiontmp.push_back(section);
4701 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4702 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4703 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4704 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4705 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4706 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
4707
4708 if (host.length() == 0)
4709 host = "unknownhost";
4710 if (rack.length() == 0)
4711 rack = "unknownrack";
4712
4713 map<string,string> loc;
4714 loc["host"] = host;
4715 loc["rack"] = rack;
4716 if (row.size())
4717 loc["row"] = row;
4718 if (room.size())
4719 loc["room"] = room;
4720 if (dc.size())
4721 loc["datacenter"] = dc;
4722 loc["root"] = "default";
4723
4724 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4725 crush.insert_item(cct, o, 1.0, section, loc);
4726 }
4727
4728 build_simple_crush_rules(cct, crush, "default", ss);
4729
4730 crush.finalize();
4731
4732 return 0;
4733 }
4734
4735
4736 int OSDMap::build_simple_crush_rules(
4737 CephContext *cct,
4738 CrushWrapper& crush,
4739 const string& root,
4740 ostream *ss)
4741 {
4742 int crush_rule = crush.get_osd_pool_default_crush_replicated_rule(cct);
4743 string failure_domain =
4744 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4745
4746 int r;
4747 r = crush.add_simple_rule_at(
4748 "replicated_rule", root, failure_domain, "",
4749 "firstn", pg_pool_t::TYPE_REPLICATED,
4750 crush_rule, ss);
4751 if (r < 0)
4752 return r;
4753 // do not add an erasure rule by default or else we will implicitly
4754 // require the crush_v2 feature of clients
4755 return 0;
4756 }
4757
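// Summarize how PG placements map onto OSDs for the given pools (or all
// pools when 'pools' is null): the total number of PG placements, how many
// of them would move under 'newmap' (when one is supplied), and the
// average, standard deviation, minimum and maximum of per-OSD PG counts
// among up+in OSDs. Results go to 'out' and/or the Formatter.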
4758 int OSDMap::summarize_mapping_stats(
4759 OSDMap *newmap,
4760 const set<int64_t> *pools,
4761 std::string *out,
4762 Formatter *f) const
4763 {
4764 set<int64_t> ls;
4765 if (pools) {
4766 ls = *pools;
4767 } else {
4768 for (auto &p : get_pools())
4769 ls.insert(p.first);
4770 }
4771
4772 unsigned total_pg = 0;
4773 unsigned moved_pg = 0;
4774 vector<unsigned> base_by_osd(get_max_osd(), 0);
4775 vector<unsigned> new_by_osd(get_max_osd(), 0);
4776 for (int64_t pool_id : ls) {
4777 const pg_pool_t *pi = get_pg_pool(pool_id);
4778 vector<int> up, up2;
4779 int up_primary;
4780 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
4781 pg_t pgid(ps, pool_id);
4782 total_pg += pi->get_size();
4783 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
4784 for (int osd : up) {
4785 if (osd >= 0 && osd < get_max_osd())
4786 ++base_by_osd[osd];
4787 }
4788 if (newmap) {
4789 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
4790 for (int osd : up2) {
4791 if (osd >= 0 && osd < get_max_osd())
4792 ++new_by_osd[osd];
4793 }
4794 if (pi->is_erasure()) {
4795 for (unsigned i=0; i<up.size(); ++i) {
4796 if (up[i] != up2[i]) {
4797 ++moved_pg;
4798 }
4799 }
4800 } else if (pi->is_replicated()) {
4801 for (int osd : up) {
4802 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4803 ++moved_pg;
4804 }
4805 }
4806 } else {
4807 ceph_abort_msg("unhandled pool type");
4808 }
4809 }
4810 }
4811 }
4812
4813 unsigned num_up_in = 0;
4814 for (int osd = 0; osd < get_max_osd(); ++osd) {
4815 if (is_up(osd) && is_in(osd))
4816 ++num_up_in;
4817 }
4818 if (!num_up_in) {
4819 return -EINVAL;
4820 }
4821
4822 float avg_pg = (float)total_pg / (float)num_up_in;
4823 float base_stddev = 0, new_stddev = 0;
4824 int min = -1, max = -1;
4825 unsigned min_base_pg = 0, max_base_pg = 0;
4826 unsigned min_new_pg = 0, max_new_pg = 0;
4827 for (int osd = 0; osd < get_max_osd(); ++osd) {
4828 if (is_up(osd) && is_in(osd)) {
4829 float base_diff = (float)base_by_osd[osd] - avg_pg;
4830 base_stddev += base_diff * base_diff;
4831 float new_diff = (float)new_by_osd[osd] - avg_pg;
4832 new_stddev += new_diff * new_diff;
4833 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4834 min = osd;
4835 min_base_pg = base_by_osd[osd];
4836 min_new_pg = new_by_osd[osd];
4837 }
4838 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4839 max = osd;
4840 max_base_pg = base_by_osd[osd];
4841 max_new_pg = new_by_osd[osd];
4842 }
4843 }
4844 }
4845 base_stddev = sqrt(base_stddev / num_up_in);
4846 new_stddev = sqrt(new_stddev / num_up_in);
4847
4848 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4849
4850 ostringstream ss;
4851 if (f)
4852 f->open_object_section("utilization");
4853 if (newmap) {
4854 if (f) {
4855 f->dump_unsigned("moved_pgs", moved_pg);
4856 f->dump_unsigned("total_pgs", total_pg);
4857 } else {
4858 float percent = 0;
4859 if (total_pg)
4860 percent = (float)moved_pg * 100.0 / (float)total_pg;
4861 ss << "moved " << moved_pg << " / " << total_pg
4862 << " (" << percent << "%)\n";
4863 }
4864 }
4865 if (f) {
4866 f->dump_float("avg_pgs", avg_pg);
4867 f->dump_float("std_dev", base_stddev);
4868 f->dump_float("expected_baseline_std_dev", edev);
4869 if (newmap)
4870 f->dump_float("new_std_dev", new_stddev);
4871 } else {
4872 ss << "avg " << avg_pg << "\n";
4873 ss << "stddev " << base_stddev;
4874 if (newmap)
4875 ss << " -> " << new_stddev;
4876 ss << " (expected baseline " << edev << ")\n";
4877 }
4878 if (min >= 0) {
4879 if (f) {
4880 f->dump_unsigned("min_osd", min);
4881 f->dump_unsigned("min_osd_pgs", min_base_pg);
4882 if (newmap)
4883 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4884 } else {
4885 ss << "min osd." << min << " with " << min_base_pg;
4886 if (newmap)
4887 ss << " -> " << min_new_pg;
4888 ss << " pgs (" << (float)min_base_pg / avg_pg;
4889 if (newmap)
4890 ss << " -> " << (float)min_new_pg / avg_pg;
4891 ss << " * mean)\n";
4892 }
4893 }
4894 if (max >= 0) {
4895 if (f) {
4896 f->dump_unsigned("max_osd", max);
4897 f->dump_unsigned("max_osd_pgs", max_base_pg);
4898 if (newmap)
4899 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4900 } else {
4901 ss << "max osd." << max << " with " << max_base_pg;
4902 if (newmap)
4903 ss << " -> " << max_new_pg;
4904 ss << " pgs (" << (float)max_base_pg / avg_pg;
4905 if (newmap)
4906 ss << " -> " << (float)max_new_pg / avg_pg;
4907 ss << " * mean)\n";
4908 }
4909 }
4910 if (f)
4911 f->close_section();
4912 if (out)
4913 *out = ss.str();
4914 return 0;
4915 }
4916
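// Ask CRUSH to re-run the pool's rule for this PG while avoiding the
// overfull OSDs and preferring the underfull ones. Returns false right away
// when none of the original OSDs is overfull, and true only when a mapping
// different from 'orig' was found.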
4917 bool OSDMap::try_pg_upmap(
4918 CephContext *cct,
4919 pg_t pg, ///< pg to potentially remap
4920 const set<int>& overfull, ///< osds we'd want to evacuate
4921 const vector<int>& underfull, ///< osds to move to, in order of preference
4922 const vector<int>& more_underfull, ///< more osds only slightly underfull
4923 vector<int> *orig,
4924 vector<int> *out) ///< resulting alternative mapping
4925 {
4926 const pg_pool_t *pool = get_pg_pool(pg.pool());
4927 if (!pool)
4928 return false;
4929 int rule = pool->get_crush_rule();
4930 if (rule < 0)
4931 return false;
4932
4933 // make sure there is something there to remap
4934 bool any = false;
4935 for (auto osd : *orig) {
4936 if (overfull.count(osd)) {
4937 any = true;
4938 break;
4939 }
4940 }
4941 if (!any) {
4942 return false;
4943 }
4944
4945 int r = crush->try_remap_rule(
4946 cct,
4947 rule,
4948 pool->get_size(),
4949 overfull, underfull,
4950 more_underfull,
4951 *orig,
4952 out);
4953 if (r < 0)
4954 return false;
4955 if (*out == *orig)
4956 return false;
4957 return true;
4958 }
4959
4960
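// Read (primary) balancer for a single replicated pool: compute the desired
// number of primaries per OSD, then repeatedly move a PG's up primary to
// another OSD in its up set whenever that brings both OSDs closer to their
// desired counts. Proposed changes are recorded in
// pending_inc->new_pg_upmap_primary and applied to tmp_osd_map; the return
// value is the number of accepted changes (0 when the read balance score did
// not improve) or a negative error code.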
4961 int OSDMap::balance_primaries(
4962 CephContext *cct,
4963 int64_t pid,
4964 OSDMap::Incremental *pending_inc,
4965 OSDMap& tmp_osd_map) const
4966 {
4967 // This function only handles replicated pools.
4968 const pg_pool_t* pool = get_pg_pool(pid);
4969 if (! pool->is_replicated()) {
4970 ldout(cct, 10) << __func__ << " skipping erasure pool "
4971 << get_pool_name(pid) << dendl;
4972 return -EINVAL;
4973 }
4974
4975 // Info to be used in verify_upmap
4976 int pool_size = pool->get_size();
4977 int crush_rule = pool->get_crush_rule();
4978
4979 // Get pgs by osd (map of osd -> pgs)
4980 // Get primaries by osd (map of osd -> primary)
4981 map<uint64_t,set<pg_t>> pgs_by_osd;
4982 map<uint64_t,set<pg_t>> prim_pgs_by_osd;
4983 map<uint64_t,set<pg_t>> acting_prims_by_osd;
4984 pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pid, &prim_pgs_by_osd, &acting_prims_by_osd);
4985
4986 // Construct information about the pgs and osds we will consider in new primary mappings,
4987 // as well as a map of all pgs and their original primary osds.
4988 map<pg_t,bool> prim_pgs_to_check;
4989 vector<uint64_t> osds_to_check;
4990 map<pg_t, uint64_t> orig_prims;
4991 for (const auto & [osd, pgs] : prim_pgs_by_osd) {
4992 osds_to_check.push_back(osd);
4993 for (const auto & pg : pgs) {
4994 prim_pgs_to_check.insert({pg, false});
4995 orig_prims.insert({pg, osd});
4996 }
4997 }
4998
4999 // calculate desired primary distribution for each osd
5000 map<uint64_t,float> desired_prim_dist;
5001 int rc = 0;
5002 rc = calc_desired_primary_distribution(cct, pid, osds_to_check, desired_prim_dist);
5003 if (rc < 0) {
5004 ldout(cct, 10) << __func__ << " Error in calculating desired primary distribution" << dendl;
5005 return -EINVAL;
5006 }
5007 map<uint64_t,float> prim_dist_scores;
5008 float actual;
5009 float desired;
5010 for (auto osd : osds_to_check) {
5011 actual = prim_pgs_by_osd[osd].size();
5012 desired = desired_prim_dist[osd];
5013 prim_dist_scores[osd] = actual - desired;
5014 ldout(cct, 10) << __func__ << " desired distribution for osd." << osd << " " << desired << dendl;
5015 }
5016
5017 // get read balance score before balancing
5018 float read_balance_score_before = 0.0;
5019 read_balance_info_t rb_info;
5020 rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info);
5021 if (rc >= 0) {
5022 read_balance_score_before = rb_info.adjusted_score;
5023 }
5024 if (rb_info.err_msg.length() > 0) {
5025 ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl;
5026 return -EINVAL;
5027 }
5028
5029 // get ready to swap pgs
5030 while (true) {
5031 int curr_num_changes = 0;
5032 vector<int> up_osds;
5033 vector<int> acting_osds;
5034 int up_primary, acting_primary;
5035 for (const auto & [pg, mapped] : prim_pgs_to_check) {
5036 // fill in the up, up primary, acting, and acting primary for the current PG
5037 tmp_osd_map.pg_to_up_acting_osds(pg, &up_osds, &up_primary,
5038 &acting_osds, &acting_primary);
5039
5040 // find the OSD that would make the best swap based on its score
5041 // We start with the OSD that is currently primary for the PG we are checking.
5042 uint64_t curr_best_osd = up_primary;
5043 float prim_score = prim_dist_scores[up_primary];
5044 for (auto potential_osd : up_osds) {
5045 float potential_score = prim_dist_scores[potential_osd];
5046 if ((prim_score > 0) && // taking 1 pg from the prim would not make its score worse
5047 (potential_score < 0) && // adding 1 pg to the potential would not make its score worse
5048 ((prim_score - potential_score) > 1) && // swapping a pg would not just keep the scores the same
5049 (desired_prim_dist[potential_osd] > 0)) // the potential is not off limits (the primary affinity is above 0)
5050 {
5051 curr_best_osd = potential_osd;
5052 }
5053 }
5054
5055 // Make the swap only if:
5056 // 1. The swap is legal
5057 // 2. The balancer has chosen a new primary
5058 auto legal_swap = crush->verify_upmap(cct,
5059 crush_rule,
5060 pool_size,
5061 {(int)curr_best_osd});
5062 if (legal_swap >= 0 &&
5063 ((int)curr_best_osd != up_primary)) {
5064 // Update prim_dist_scores
5065 prim_dist_scores[curr_best_osd] += 1;
5066 prim_dist_scores[up_primary] -= 1;
5067
5068 // Update the mappings
5069 tmp_osd_map.pg_upmap_primaries[pg] = curr_best_osd;
5070 if (curr_best_osd == orig_prims[pg]) {
5071 pending_inc->new_pg_upmap_primary.erase(pg);
5072 prim_pgs_to_check[pg] = false;
5073 } else {
5074 pending_inc->new_pg_upmap_primary[pg] = curr_best_osd;
5075 prim_pgs_to_check[pg] = true; // mark that this pg changed mappings
5076 }
5077
5078 curr_num_changes++;
5079 }
5080 ldout(cct, 20) << __func__ << " curr_num_changes: " << curr_num_changes << dendl;
5081 }
5082 // If there are no changes after one pass through the pgs, then no further optimizations can be made.
5083 if (curr_num_changes == 0) {
5084 ldout(cct, 20) << __func__ << " curr_num_changes is 0; no further optimizations can be made." << dendl;
5085 break;
5086 }
5087 }
5088
5089 // get read balance score after balancing
5090 float read_balance_score_after = 0.0;
5091 rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info);
5092 if (rc >= 0) {
5093 read_balance_score_after = rb_info.adjusted_score;
5094 }
5095 if (rb_info.err_msg.length() > 0) {
5096 ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl;
5097 return -EINVAL;
5098 }
5099
5100 // Tally total number of changes
5101 int num_changes = 0;
5102 if (read_balance_score_after < read_balance_score_before) {
5103 for (auto [pg, mapped] : prim_pgs_to_check) {
5104 if (mapped) {
5105 num_changes++;
5106 }
5107 }
5108 }
5109
5110 ldout(cct, 10) << __func__ << " num_changes " << num_changes << dendl;
5111 return num_changes;
5112 }
5113
5114 int OSDMap::calc_desired_primary_distribution(
5115 CephContext *cct,
5116 int64_t pid,
5117 const vector<uint64_t> &osds,
5118 std::map<uint64_t, float>& desired_primary_distribution) const
5119 {
5120 // Returns an ideal (fractional) distribution of primaries per OSD,
5121 // without flooring each value.
5122 //
5123 // This function only handles replicated pools.
5124 const pg_pool_t* pool = get_pg_pool(pid);
5125 if (pool->is_replicated()) {
5126 ldout(cct, 20) << __func__ << " calculating distribution for replicated pool "
5127 << get_pool_name(pid) << dendl;
5128 uint64_t replica_count = pool->get_size();
5129
5130 map<uint64_t,set<pg_t>> pgs_by_osd;
5131 pgs_by_osd = get_pgs_by_osd(cct, pid);
5132
5133 // First calculate the distribution using primary affinity and tally up the sum
5134 auto distribution_sum = 0.0;
5135 for (const auto & osd : osds) {
5136 float osd_primary_count = ((float)pgs_by_osd[osd].size() / (float)replica_count) * get_primary_affinityf(osd);
5137 desired_primary_distribution.insert({osd, osd_primary_count});
5138 distribution_sum += osd_primary_count;
5139 }
5140 if (distribution_sum <= 0) {
5141 ldout(cct, 10) << __func__ << " Unable to calculate primary distribution, likely because primary affinity is"
5142 << " set to 0 on all OSDs." << dendl;
5143 return -EINVAL;
5144 }
5145
5146 // Then, stretch the values so they sum to pg_num (necessary when primary affinity is smaller than 1)
5147 float factor = (float)pool->get_pg_num() / (float)distribution_sum;
5148 float distribution_sum_desired = 0.0;
5149
5150 ceph_assert(factor >= 1.0);
5151 for (const auto & [osd, osd_primary_count] : desired_primary_distribution) {
5152 desired_primary_distribution[osd] *= factor;
5153 distribution_sum_desired += desired_primary_distribution[osd];
5154 }
5155 ceph_assert(fabs(distribution_sum_desired - pool->get_pg_num()) < 0.01);
5156 } else {
5157 ldout(cct, 10) << __func__ <<" skipping erasure pool "
5158 << get_pool_name(pid) << dendl;
5159 return -EINVAL;
5160 }
5161
5162 return 0;
5163 }
5164
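// Upmap balancer: compute each OSD's deviation from its target PG count
// (CRUSH weight * pgs_per_weight), then repeatedly move PGs from overfull to
// underfull OSDs via pg_upmap_items (or drop existing upmap entries) until
// the largest deviation is within max_deviation, 'max' rounds are used up,
// or no change that lowers the stddev can be found. Returns the number of
// changes recorded in pending_inc.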
5165 int OSDMap::calc_pg_upmaps(
5166 CephContext *cct,
5167 uint32_t max_deviation,
5168 int max,
5169 const set<int64_t>& only_pools,
5170 OSDMap::Incremental *pending_inc,
5171 std::random_device::result_type *p_seed)
5172 {
5173 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
5174 OSDMap tmp_osd_map;
5175 // Can't be less than 1 pg
5176 if (max_deviation < 1)
5177 max_deviation = 1;
5178 tmp_osd_map.deepish_copy_from(*this);
5179 int num_changed = 0;
5180 map<int,set<pg_t>> pgs_by_osd;
5181 int total_pgs = 0;
5182 float osd_weight_total = 0;
5183 map<int,float> osd_weight;
5184
5185 if (max <= 0) {
5186 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
5187 return 0;
5188 }
5189
5190 osd_weight_total = build_pool_pgs_info(cct, only_pools, tmp_osd_map,
5191 total_pgs, pgs_by_osd, osd_weight);
5192 if (osd_weight_total == 0) {
5193 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
5194 return 0;
5195 }
5196
5197 float pgs_per_weight = total_pgs / osd_weight_total;
5198 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
5199 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
5200
5201 float stddev = 0;
5202 map<int,float> osd_deviation; // osd, deviation(pgs)
5203 multimap<float,int> deviation_osd; // deviation(pgs), osd
5204 float cur_max_deviation = calc_deviations(cct, pgs_by_osd, osd_weight, pgs_per_weight,
5205 osd_deviation, deviation_osd, stddev);
5206
5207 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5208 if (cur_max_deviation <= max_deviation) {
5209 ldout(cct, 10) << __func__ << " distribution is almost perfect"
5210 << dendl;
5211 return 0;
5212 }
5213
5214 bool skip_overfull = false;
5215 auto aggressive =
5216 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
5217 auto fast_aggressive = aggressive &&
5218 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively_fast");
5219 auto local_fallback_retries =
5220 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
5221
5222 while (max--) {
5223 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
5224 // build overfull and underfull
5225 set<int> overfull;
5226 set<int> more_overfull;
5227 bool using_more_overfull = false;
5228 vector<int> underfull;
5229 vector<int> more_underfull;
5230 fill_overfull_underfull(cct, deviation_osd, max_deviation,
5231 overfull, more_overfull,
5232 underfull, more_underfull);
5233
5234 if (underfull.empty() && overfull.empty()) {
5235 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
5236 break;
5237 }
5238 if (overfull.empty() && !underfull.empty()) {
5239 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
5240 overfull = more_overfull;
5241 using_more_overfull = true;
5242 }
5243
5244 ldout(cct, 10) << " overfull " << overfull
5245 << " underfull " << underfull
5246 << dendl;
5247 set<pg_t> to_skip;
5248 uint64_t local_fallback_retried = 0;
5249
5250 // Used to skip some unsuccessful loop iterations (saves runtime).
5251 // If we can't find a change for an OSD we skip further iterations for that OSD.
5252 uint n_changes = 0, prev_n_changes = 0;
5253 set<int> osd_to_skip;
5254
5255 retry:
5256
5257 set<pg_t> to_unmap;
5258 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
5259 auto temp_pgs_by_osd = pgs_by_osd;
5260 // always start with fullest, break if we find any changes to make
5261 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
5262 if (skip_overfull && !underfull.empty()) {
5263 ldout(cct, 10) << " skipping overfull " << dendl;
5264 break; // fall through to check underfull
5265 }
5266 int osd = p->second;
5267 float deviation = p->first;
5268 if (fast_aggressive && osd_to_skip.count(osd)) {
5269 ldout(cct, 20) << " Fast aggressive mode: skipping osd " << osd
5270 << " osd_to_skip size = " << osd_to_skip.size() << dendl;
5271 continue;
5272 }
5273
5274 if (deviation < 0) {
5275 ldout(cct, 10) << " hitting underfull osds now"
5276 << " when trying to remap overfull osds"
5277 << dendl;
5278 break;
5279 }
5280 float target = osd_weight[osd] * pgs_per_weight;
5281 ldout(cct, 10) << " Overfull search osd." << osd
5282 << " target " << target
5283 << " deviation " << deviation
5284 << dendl;
5285 ceph_assert(target > 0);
5286 if (!using_more_overfull && deviation <= max_deviation) {
5287 ldout(cct, 10) << " osd." << osd
5288 << " target " << target
5289 << " deviation " << deviation
5290 << " < max deviation " << max_deviation
5291 << dendl;
5292 break;
5293 }
5294
5295 vector<pg_t> pgs;
5296 pgs.reserve(pgs_by_osd[osd].size());
5297 for (auto& pg : pgs_by_osd[osd]) {
5298 if (to_skip.count(pg))
5299 continue;
5300 pgs.push_back(pg);
5301 }
5302 if (aggressive) {
5303 // shuffle PG list so they all get equal (in)attention
5304 std::shuffle(pgs.begin(), pgs.end(), get_random_engine(cct, p_seed));
5305 }
5306 // look for remaps we can un-remap
5307 if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd,
5308 temp_pgs_by_osd, to_unmap, to_upmap))
5309 goto test_change;
5310
5311 // try upmap
5312 for (auto pg : pgs) {
5313 auto temp_it = tmp_osd_map.pg_upmap.find(pg);
5314 if (temp_it != tmp_osd_map.pg_upmap.end()) {
5315 // leave pg_upmap alone
5316 // it must be specified by admin since balancer does not
5317 // support pg_upmap yet
5318 ldout(cct, 10) << " " << pg << " already has pg_upmap "
5319 << temp_it->second << ", skipping"
5320 << dendl;
5321 continue;
5322 }
5323 auto pg_pool_size = tmp_osd_map.get_pg_pool_size(pg);
5324 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5325 set<int> existing;
5326 auto it = tmp_osd_map.pg_upmap_items.find(pg);
5327 if (it != tmp_osd_map.pg_upmap_items.end()) {
5328 auto& um_items = it->second;
5329 if (um_items.size() >= (size_t)pg_pool_size) {
5330 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
5331 << um_items << ", skipping"
5332 << dendl;
5333 continue;
5334 } else {
5335 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
5336 << um_items
5337 << dendl;
5338 new_upmap_items = um_items;
5339 // build existing too (for dedup)
5340 for (auto [um_from, um_to] : um_items) {
5341 existing.insert(um_from);
5342 existing.insert(um_to);
5343 }
5344 }
5345 // fall through
5346 // to see if we can append more remapping pairs
5347 }
5348 ldout(cct, 10) << " trying " << pg << dendl;
5349 vector<int> raw, orig, out;
5350 tmp_osd_map.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
5351 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
5352 continue;
5353 }
5354 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
5355 if (orig.size() != out.size()) {
5356 continue;
5357 }
5358 ceph_assert(orig != out);
5359 int pos = find_best_remap(cct, orig, out, existing, osd_deviation);
5360 if (pos != -1) {
5361 // append new remapping pairs slowly
5362 // This way we can make sure that each tiny change
5363 // keeps the distribution of PGs converging toward
5364 // the ideal state.
5365 add_remap_pair(cct, orig[pos], out[pos], pg, (size_t)pg_pool_size,
5366 osd, existing, temp_pgs_by_osd,
5367 new_upmap_items, to_upmap);
5368 goto test_change;
5369 }
5370 }
5371 if (fast_aggressive) {
5372 if (prev_n_changes == n_changes) { // no changes for prev OSD
5373 osd_to_skip.insert(osd);
5374 }
5375 else {
5376 prev_n_changes = n_changes;
5377 }
5378 }
5379
5380 }
5381
5382 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5383 ldout(cct, 10) << " failed to find any changes for overfull osds"
5384 << dendl;
5385 for (auto& [deviation, osd] : deviation_osd) {
5386 if (std::find(underfull.begin(), underfull.end(), osd) ==
5387 underfull.end())
5388 break;
5389 float target = osd_weight[osd] * pgs_per_weight;
5390 ceph_assert(target > 0);
5391 if (fabsf(deviation) < max_deviation) {
5392 // respect max_deviation too
5393 ldout(cct, 10) << " osd." << osd
5394 << " target " << target
5395 << " deviation " << deviation
5396 << " -> absolute " << fabsf(deviation)
5397 << " < max " << max_deviation
5398 << dendl;
5399 break;
5400 }
5401 // look for remaps we can un-remap
5402 candidates_t candidates = build_candidates(cct, tmp_osd_map, to_skip,
5403 only_pools, aggressive, p_seed);
5404 if (try_drop_remap_underfull(cct, candidates, osd, temp_pgs_by_osd,
5405 to_unmap, to_upmap)) {
5406 goto test_change;
5407 }
5408 }
5409
5410 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5411 ldout(cct, 10) << " failed to find any changes for underfull osds"
5412 << dendl;
5413 if (!aggressive) {
5414 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
5415 break;
5416 } else if (!skip_overfull) {
5417 // safe to quit because at this point we know
5418 // we've checked both overfull and underfull osds.
5419 ldout(cct, 10) << " break due to not being able to find any"
5420 << " further optimizations"
5421 << dendl;
5422 break;
5423 }
5424 // restart with fullest and do exhaustive searching
5425 skip_overfull = false;
5426 continue;
5427
5428 test_change:
5429
5430 // test change, apply if change is good
5431 ceph_assert(to_unmap.size() || to_upmap.size());
5432 float new_stddev = 0;
5433 map<int,float> temp_osd_deviation;
5434 multimap<float,int> temp_deviation_osd;
5435 float cur_max_deviation = calc_deviations(cct, temp_pgs_by_osd, osd_weight,
5436 pgs_per_weight, temp_osd_deviation,
5437 temp_deviation_osd, new_stddev);
5438 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
5439 if (new_stddev >= stddev) {
5440 if (!aggressive) {
5441 ldout(cct, 10) << " break because stddev is not decreasing"
5442 << " and aggressive mode is not enabled"
5443 << dendl;
5444 break;
5445 }
5446 local_fallback_retried++;
5447 if (local_fallback_retried >= local_fallback_retries) {
5448 // does not make progress
5449 // flip *skip_overfull* so both overfull and underfull
5450 // get equal (in)attention
5451 skip_overfull = !skip_overfull;
5452 ldout(cct, 10) << " hit local_fallback_retries "
5453 << local_fallback_retries
5454 << dendl;
5455 continue;
5456 }
5457 for (auto& i : to_unmap)
5458 to_skip.insert(i);
5459 for (auto& i : to_upmap)
5460 to_skip.insert(i.first);
5461 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
5462 << " to_skip " << to_skip
5463 << dendl;
5464 goto retry;
5465 }
5466
5467 // ready to go
5468 ceph_assert(new_stddev < stddev);
5469 stddev = new_stddev;
5470 pgs_by_osd = temp_pgs_by_osd;
5471 osd_deviation = temp_osd_deviation;
5472 deviation_osd = temp_deviation_osd;
5473 n_changes++;
5474
5475
5476 num_changed += pack_upmap_results(cct, to_unmap, to_upmap, tmp_osd_map, pending_inc);
5477
5478 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5479 if (cur_max_deviation <= max_deviation) {
5480 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
5481 << dendl;
5482 break;
5483 }
5484 }
5485 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
5486 return num_changed;
5487 }
5488
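// For one pool, map each OSD to the set of PGs whose up set contains it.
// Optionally also fill per-OSD sets of PGs for which the OSD is the up
// primary (p_primaries_by_osd) or the acting primary
// (p_acting_primaries_by_osd).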
5489 map<uint64_t,set<pg_t>> OSDMap::get_pgs_by_osd(
5490 CephContext *cct,
5491 int64_t pid,
5492 map<uint64_t, set<pg_t>> *p_primaries_by_osd,
5493 map<uint64_t, set<pg_t>> *p_acting_primaries_by_osd) const
5494 {
5495 // Set up the OSDMap
5496 OSDMap tmp_osd_map;
5497 tmp_osd_map.deepish_copy_from(*this);
5498
5499 // Get the pool from the provided pool id
5500 const pg_pool_t* pool = get_pg_pool(pid);
5501
5502 // build array of pgs from the pool
5503 map<uint64_t,set<pg_t>> pgs_by_osd;
5504 for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
5505 pg_t pg(ps, pid);
5506 vector<int> up;
5507 int primary;
5508 int acting_prim;
5509 tmp_osd_map.pg_to_up_acting_osds(pg, &up, &primary, nullptr, &acting_prim);
5510 if (cct != nullptr)
5511 ldout(cct, 20) << __func__ << " " << pg
5512 << " up " << up
5513 << " primary " << primary
5514 << " acting_primary " << acting_prim
5515 << dendl;
5516
5517 if (!up.empty()) { // up can be empty in test-generated files;
5518 // in that case, we return an empty result
5519 for (auto osd : up) {
5520 if (osd != CRUSH_ITEM_NONE)
5521 pgs_by_osd[osd].insert(pg);
5522 }
5523 if (p_primaries_by_osd != nullptr) {
5524 if (primary != CRUSH_ITEM_NONE)
5525 (*p_primaries_by_osd)[primary].insert(pg);
5526 }
5527 if (p_acting_primaries_by_osd != nullptr) {
5528 if (acting_prim != CRUSH_ITEM_NONE)
5529 (*p_acting_primaries_by_osd)[acting_prim].insert(pg);
5530 }
5531 }
5532 }
5533 return pgs_by_osd;
5534 }
5535
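// Accumulate into osds_weight each OSD's CRUSH weight under the pool's rule,
// scaled by the OSD's reweight value, and return the total weight added for
// this pool.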
5536 float OSDMap::get_osds_weight(
5537 CephContext *cct,
5538 const OSDMap& tmp_osd_map,
5539 int64_t pid,
5540 map<int,float>& osds_weight) const
5541 {
5542 map<int,float> pmap;
5543 ceph_assert(pools.count(pid));
5544 int ruleno = pools.at(pid).get_crush_rule();
5545 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap);
5546 ldout(cct,20) << __func__ << " pool " << pid
5547 << " ruleno " << ruleno
5548 << " weight-map " << pmap
5549 << dendl;
5550 float osds_weight_total = 0;
5551 for (auto [oid, oweight] : pmap) {
5552 auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight;
5553 if (adjusted_weight != 0) {
5554 osds_weight[oid] += adjusted_weight;
5555 osds_weight_total += adjusted_weight;
5556 }
5557 }
5558 return osds_weight_total;
5559 }
5560
5561 float OSDMap::build_pool_pgs_info (
5562 CephContext *cct,
5563 const std::set<int64_t>& only_pools, ///< [optional] restrict to pool
5564 const OSDMap& tmp_osd_map,
5565 int& total_pgs,
5566 map<int,set<pg_t>>& pgs_by_osd,
5567 map<int,float>& osds_weight)
5568 {
5569 //
5570 // This function builds some data structures that are used by calc_pg_upmaps.
5571 // Specifically it builds the pgs_by_osd and osds_weight maps, updates total_pgs,
5572 // and returns the total OSD weight.
5573 //
5574 float osds_weight_total = 0.0;
5575 for (auto& [pid, pdata] : pools) {
5576 if (!only_pools.empty() && !only_pools.count(pid))
5577 continue;
5578 for (unsigned ps = 0; ps < pdata.get_pg_num(); ++ps) {
5579 pg_t pg(ps, pid);
5580 vector<int> up;
5581 tmp_osd_map.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
5582 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
5583 for (auto osd : up) {
5584 if (osd != CRUSH_ITEM_NONE)
5585 pgs_by_osd[osd].insert(pg);
5586 }
5587 }
5588 total_pgs += pdata.get_size() * pdata.get_pg_num();
5589
5590 osds_weight_total = get_osds_weight(cct, tmp_osd_map, pid, osds_weight);
5591 }
5592 for (auto& [oid, oweight] : osds_weight) {
5593 int pgs = 0;
5594 auto p = pgs_by_osd.find(oid);
5595 if (p != pgs_by_osd.end())
5596 pgs = p->second.size();
5597 else
5598 pgs_by_osd.emplace(oid, set<pg_t>());
5599 ldout(cct, 20) << " osd." << oid << " weight " << oweight
5600 << " pgs " << pgs << dendl;
5601 }
5602 return osds_weight_total;
5603
5604 } // return total weight of all OSDs
5605
5606 float OSDMap::calc_deviations (
5607 CephContext *cct,
5608 const map<int,set<pg_t>>& pgs_by_osd,
5609 const map<int,float>& osd_weight,
5610 float pgs_per_weight,
5611 map<int,float>& osd_deviation,
5612 multimap<float,int>& deviation_osd,
5613 float& stddev) // return current max deviation
5614 {
5615 //
5616 // This function calculates the 2 maps osd_deviation and deviation_osd which
5617 // hold the deviation between the current number of PGs which map to an OSD
5618 // and the optimal number. It also calculates the stddev of the deviations and
5619 // returns the current max deviation.
5620 // NOTE - the calculation is not exactly stddev, it is actually stddev^2, but as
5621 // long as it is monotonic with stddev (and it is), it is sufficient for
5622 // the balancer code.
5623 //
5624 float cur_max_deviation = 0.0;
5625 stddev = 0.0;
5626 for (auto& [oid, opgs] : pgs_by_osd) {
5627 // make sure osd is still there (belongs to this crush-tree)
5628 ceph_assert(osd_weight.count(oid));
5629 float target = osd_weight.at(oid) * pgs_per_weight;
5630 float deviation = (float)opgs.size() - target;
5631 ldout(cct, 20) << " osd." << oid
5632 << "\tpgs " << opgs.size()
5633 << "\ttarget " << target
5634 << "\tdeviation " << deviation
5635 << dendl;
5636 osd_deviation[oid] = deviation;
5637 deviation_osd.insert(make_pair(deviation, oid));
5638 stddev += deviation * deviation;
5639 if (fabsf(deviation) > cur_max_deviation)
5640 cur_max_deviation = fabsf(deviation);
5641 }
5642 return cur_max_deviation;
5643 }
5644
5645 void OSDMap::fill_overfull_underfull (
5646 CephContext *cct,
5647 const std::multimap<float,int>& deviation_osd,
5648 int max_deviation,
5649 std::set<int>& overfull,
5650 std::set<int>& more_overfull,
5651 std::vector<int>& underfull,
5652 std::vector<int>& more_underfull)
5653 {
5654 //
5655 // This function just fills the overfull and underfull data structures for the
5656 // use of calc_pg_upmaps
5657 //
5658 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
5659 auto& odev = i->first;
5660 auto& oid = i->second;
5661 ldout(cct, 30) << " check " << odev << " <= " << max_deviation << dendl;
5662 if (odev <= 0)
5663 break;
5664 if (odev > max_deviation) {
5665 ldout(cct, 30) << " add overfull osd." << oid << dendl;
5666 overfull.insert(oid);
5667 } else {
5668 more_overfull.insert(oid);
5669 }
5670 }
5671
5672 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
5673 auto& odev = i->first;
5674 auto& oid = i->second;
5675 ldout(cct, 30) << " check " << odev << " >= " << -(int)max_deviation << dendl;
5676 if (odev >= 0)
5677 break;
5678 if (odev < -(int)max_deviation) {
5679 ldout(cct, 30) << " add underfull osd." << oid << dendl;
5680 underfull.push_back(oid);
5681 } else {
5682 more_underfull.push_back(oid);
5683 }
5684 }
5685 }
5686
5687 int OSDMap::pack_upmap_results(
5688 CephContext *cct,
5689 const std::set<pg_t>& to_unmap,
5690 const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap,
5691 OSDMap& tmp_osd_map,
5692 OSDMap::Incremental *pending_inc)
5693 {
5694 //
5695 // This function takes the input from the local variables to_unmap and to_upmap
5696 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5697 // (so that the results are visible outside calc_pg_upmaps)
5698 //
5699 int num_changed = 0;
5700 for (auto& i : to_unmap) {
5701 ldout(cct, 10) << " unmap pg " << i << dendl;
5702 ceph_assert(tmp_osd_map.pg_upmap_items.count(i));
5703 tmp_osd_map.pg_upmap_items.erase(i);
5704 pending_inc->old_pg_upmap_items.insert(i);
5705 ++num_changed;
5706 }
5707 for (auto& [pg, um_items] : to_upmap) {
5708 ldout(cct, 10) << " upmap pg " << pg
5709 << " new pg_upmap_items " << um_items
5710 << dendl;
5711 tmp_osd_map.pg_upmap_items[pg] = um_items;
5712 pending_inc->new_pg_upmap_items[pg] = um_items;
5713 ++num_changed;
5714 }
5715
5716 return num_changed;
5717 }
5718
5719 std::default_random_engine OSDMap::get_random_engine(
5720 CephContext *cct,
5721 std::random_device::result_type *p_seed)
5722 {
5723 //
5724 // This function creates a random_engine to be used for shuffling.
5725 // When p_seed == nullptr it generates a random engine seeded from /dev/random;
5726 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5727 // increments seed_set. This is used in order to create regression tests without
5728 // random effects on the results.
5729 //
5730 static std::random_device::result_type seed_set = 0;
5731 std::random_device::result_type seed;
5732 if (p_seed == nullptr) {
5733 std::random_device rd;
5734 seed = rd();
5735 }
5736 else {
5737 seed = *p_seed + seed_set;
5738 ldout(cct, 30) << " Starting random engine with seed "
5739 << seed << dendl;
5740 seed_set++;
5741 }
5742 return std::default_random_engine{seed};
5743 }
5744
5745 bool OSDMap::try_drop_remap_overfull(
5746 CephContext *cct,
5747 const std::vector<pg_t>& pgs,
5748 const OSDMap& tmp_osd_map,
5749 int osd,
5750 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5751 set<pg_t>& to_unmap,
5752 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5753 {
5754 //
5755 // This function tries to drop existing upmap items which map data to overfull
5756 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and returns true
5757 // if it found an item that can be dropped, false if not.
5758 //
5759 for (auto pg : pgs) {
5760 auto p = tmp_osd_map.pg_upmap_items.find(pg);
5761 if (p == tmp_osd_map.pg_upmap_items.end())
5762 continue;
5763 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5764 auto& pg_upmap_items = p->second;
5765 for (auto um_pair : pg_upmap_items) {
5766 auto& um_from = um_pair.first;
5767 auto& um_to = um_pair.second;
5768 if (um_to == osd) {
5769 ldout(cct, 10) << " will try dropping existing"
5770 << " remapping pair "
5771 << um_from << " -> " << um_to
5772 << " which remapped " << pg
5773 << " into overfull osd." << osd
5774 << dendl;
5775 temp_pgs_by_osd[um_to].erase(pg);
5776 temp_pgs_by_osd[um_from].insert(pg);
5777 } else {
5778 new_upmap_items.push_back(um_pair);
5779 }
5780 }
5781 if (new_upmap_items.empty()) {
5782 // drop whole item
5783 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5784 << " remapped " << pg << " into overfull osd." << osd
5785 << ", will try cancelling it entirely"
5786 << dendl;
5787 to_unmap.insert(pg);
5788 return true;
5789 } else if (new_upmap_items.size() != pg_upmap_items.size()) {
5790 // drop single remapping pair, updating
5791 ceph_assert(new_upmap_items.size() < pg_upmap_items.size());
5792 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5793 << " remapped " << pg << " into overfull osd." << osd
5794 << ", new_pg_upmap_items now " << new_upmap_items
5795 << dendl;
5796 to_upmap[pg] = new_upmap_items;
5797 return true;
5798 }
5799 }
5800 return false;
5801 }
5802
5803 bool OSDMap::try_drop_remap_underfull(
5804 CephContext *cct,
5805 const candidates_t& candidates,
5806 int osd,
5807 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5808 set<pg_t>& to_unmap,
5809 map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap)
5810 {
5811 //
5812 // This function tries to drop existing upmap items which map data from underfull
5813 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and returns true
5814 // if it found an item that can be dropped, false if not.
5815 //
5816 for (auto& [pg, um_pairs] : candidates) {
5817 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5818 for (auto& ump : um_pairs) {
5819 auto& um_from = ump.first;
5820 auto& um_to = ump.second;
5821 if (um_from == osd) {
5822 ldout(cct, 10) << " will try dropping existing"
5823 << " remapping pair "
5824 << um_from << " -> " << um_to
5825 << " which remapped " << pg
5826 << " out from underfull osd." << osd
5827 << dendl;
5828 temp_pgs_by_osd[um_to].erase(pg);
5829 temp_pgs_by_osd[um_from].insert(pg);
5830 } else {
5831 new_upmap_items.push_back(ump);
5832 }
5833 }
5834 if (new_upmap_items.empty()) {
5835 // drop whole item
5836 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5837 << " remapped " << pg
5838 << " out from underfull osd." << osd
5839 << ", will try cancelling it entirely"
5840 << dendl;
5841 to_unmap.insert(pg);
5842 return true;
5843 } else if (new_upmap_items.size() != um_pairs.size()) {
5844 // drop single remapping pair, updating
5845 ceph_assert(new_upmap_items.size() < um_pairs.size());
5846 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5847 << " remapped " << pg
5848 << " out from underfull osd." << osd
5849 << ", new_pg_upmap_items now " << new_upmap_items
5850 << dendl;
5851 to_upmap[pg] = new_upmap_items;
5852 return true;
5853 }
5854 }
5855 return false;
5856 }
5857
5858 void OSDMap::add_remap_pair(
5859 CephContext *cct,
5860 int orig,
5861 int out,
5862 pg_t pg,
5863 size_t pg_pool_size,
5864 int osd,
5865 set<int>& existing,
5866 map<int,set<pg_t>>& temp_pgs_by_osd,
5867 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items,
5868 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5869 {
5870 //
5871 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5872 // the relevant data structures
5873 //
5874 ldout(cct, 10) << " will try adding new remapping pair "
5875 << orig << " -> " << out << " for " << pg
5876 << (orig != osd ? " NOT selected osd" : "")
5877 << dendl;
5878 existing.insert(orig);
5879 existing.insert(out);
5880 temp_pgs_by_osd[orig].erase(pg);
5881 temp_pgs_by_osd[out].insert(pg);
5882 ceph_assert(new_upmap_items.size() < pg_pool_size);
5883 new_upmap_items.push_back(make_pair(orig, out));
5884 // Append new remapping pairs slowly, one pair at a time.
5885 // This way we can make sure that each tiny change keeps
5886 // the distribution of PGs converging toward
5887 // the optimal state.
5888 to_upmap[pg] = new_upmap_items;
5889
5890 }
5891
5892 int OSDMap::find_best_remap (
5893 CephContext *cct,
5894 const vector<int>& orig,
5895 const vector<int>& out,
5896 const set<int>& existing,
5897 const map<int,float> osd_deviation)
5898 {
5899 //
5900 // Find the best remap from the suggestions in orig and out - the best remap
5901 // is the one which maps from the OSD with the largest deviation (from the
5902 // OSDs which are part of orig)
5903 //
5904 int best_pos = -1;
5905 float max_dev = 0;
5906 for (unsigned i = 0; i < out.size(); ++i) {
5907 if (orig[i] == out[i])
5908 continue; // skip invalid remappings
5909 if (existing.count(orig[i]) || existing.count(out[i]))
5910 continue; // we want new remappings only!
5911 if (osd_deviation.at(orig[i]) > max_dev) {
5912 max_dev = osd_deviation.at(orig[i]);
5913 best_pos = i;
5914 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation.at(orig[i]) << dendl;
5915 }
5916 }
5917 return best_pos;
5918 }
5919
5920 OSDMap::candidates_t OSDMap::build_candidates(
5921 CephContext *cct,
5922 const OSDMap& tmp_osd_map,
5923 const set<pg_t> to_skip,
5924 const set<int64_t>& only_pools,
5925 bool aggressive,
5926 std::random_device::result_type *p_seed)
5927 {
5928 //
5929 // build the candidates data structure
5930 //
5931 candidates_t candidates;
5932 candidates.reserve(tmp_osd_map.pg_upmap_items.size());
5933 for (auto& [pg, um_pair] : tmp_osd_map.pg_upmap_items) {
5934 if (to_skip.count(pg))
5935 continue;
5936 if (!only_pools.empty() && !only_pools.count(pg.pool()))
5937 continue;
5938 candidates.push_back(make_pair(pg, um_pair));
5939 }
5940 if (aggressive) {
5941 // shuffle candidates so they all get equal (in)attention
5942 std::shuffle(candidates.begin(), candidates.end(), get_random_engine(cct, p_seed));
5943 }
5944 return candidates;
5945 }
5946
5947 // return -1 if all PGs are OK, else the first PG which includes only zero PA OSDs
5948 int64_t OSDMap::has_zero_pa_pgs(CephContext *cct, int64_t pool_id) const
5949 {
5950 const pg_pool_t* pool = get_pg_pool(pool_id);
5951 for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
5952 pg_t pg(ps, pool_id);
5953 vector<int> acting;
5954 pg_to_up_acting_osds(pg, nullptr, nullptr, &acting, nullptr);
5955 if (cct != nullptr) {
5956 ldout(cct, 30) << __func__ << " " << pg << " acting " << acting << dendl;
5957 }
5958 bool pg_zero_pa = true;
5959 for (auto osd : acting) {
5960 if (get_primary_affinityf(osd) != 0) {
5961 pg_zero_pa = false;
5962 break;
5963 }
5964 }
5965 if (pg_zero_pa) {
5966 if (cct != nullptr) {
5967 ldout(cct, 20) << __func__ << " " << pg << " - maps only to OSDs with primary affinity 0" << dendl;
5968 }
5969 return (int64_t)ps;
5970 }
5971 }
5972 return -1;
5973 }
5974
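// Reset every field of a read_balance_info_t to zero / empty so it can be
// (re)populated by set_rbi() or returned as an error result.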
5975 void OSDMap::zero_rbi(read_balance_info_t &rbi) const {
5976 rbi.pa_avg = 0.;
5977 rbi.pa_weighted = 0.;
5978 rbi.pa_weighted_avg = 0.;
5979 rbi.raw_score = 0.;
5980 rbi.optimal_score = 0.;
5981 rbi.adjusted_score = 0.;
5982 rbi.acting_raw_score = 0.;
5983 rbi.acting_adj_score = 0.;
5984 rbi.err_msg = "";
5985 }
5986
5987 int OSDMap::set_rbi(
5988 CephContext *cct,
5989 read_balance_info_t &rbi,
5990 int64_t pool_id,
5991 float total_w_pa,
5992 float pa_sum,
5993 int num_osds,
5994 int osd_pa_count,
5995 float total_osd_weight,
5996 uint max_prims_per_osd,
5997 uint max_acting_prims_per_osd,
5998 float avg_prims_per_osd,
5999 bool prim_on_zero_pa,
6000 bool acting_on_zero_pa,
6001 float max_osd_score) const
6002 {
6003 // put all the ugly code here, so the rest of the code is nicer.
6004 const pg_pool_t* pool = get_pg_pool(pool_id);
6005 zero_rbi(rbi);
6006
6007 if (total_w_pa / total_osd_weight < 1. / float(pool->get_size())) {
6008 ldout(cct, 20) << __func__ << " pool " << pool_id << " average primary affinity is lower than "
6009 << 1. / float(pool->get_size()) << dendl;
6010 rbi.err_msg = fmt::format(
6011 "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable",
6012 pool_id, 1. / float(pool->get_size()));
6013 return -EINVAL;
6014 }
6015 rbi.pa_weighted = total_w_pa;
6016
6017 // weighted_prim_affinity_avg
6018 rbi.pa_weighted_avg = rbi_round(rbi.pa_weighted / total_osd_weight); // in [0..1]
6019 // p_rbi->pa_weighted / osd_pa_count; // in [0..1]
6020
6021 rbi.raw_score = rbi_round((float)max_prims_per_osd / avg_prims_per_osd); // >=1
6022 if (acting_on_zero_pa) {
6023 rbi.acting_raw_score = rbi_round(max_osd_score);
6024 rbi.err_msg = fmt::format(
6025 "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate",
6026 pool_id);
6027 } else {
6028 rbi.acting_raw_score = rbi_round((float)max_acting_prims_per_osd / avg_prims_per_osd);
6029 }
6030
6031 if (osd_pa_count != 0) {
6032 // this implies that pa_sum > 0
6033 rbi.pa_avg = rbi_round(pa_sum / osd_pa_count); // in [0..1]
6034 } else {
6035 rbi.pa_avg = 0.;
6036 }
6037
6038 if (rbi.pa_avg != 0.) {
6039 int64_t zpg;
6040 if ((zpg = has_zero_pa_pgs(cct, pool_id)) >= 0) {
6041 pg_t pg(zpg, pool_id);
6042 std::stringstream ss;
6043 ss << pg;
6044 ldout(cct, 10) << __func__ << " pool " << pool_id << " has some PGs where all OSDs are with primary_affinity 0 (" << pg << ",...)" << dendl;
6045 rbi.err_msg = fmt::format(
6046 "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable",
6047 pool_id, ss.str());
6048 return -EINVAL;
6049 }
6050 rbi.optimal_score = rbi_round(float(num_osds) / float(osd_pa_count)); // >= 1
6051 // adjust the score to the primary affinity setting (if prim affinity is set
6052 // the raw score can't be 1 and the optimal (perfect) score is higher than 1)
6053 // When total system primary affinity is too low (average < 1 / pool replica count)
6054 // the score is negative in order to grab the user's attention.
6055 rbi.adjusted_score = rbi_round(rbi.raw_score / rbi.optimal_score); // >= 1 if PA is not low
6056 rbi.acting_adj_score = rbi_round(rbi.acting_raw_score / rbi.optimal_score); // >= 1 if PA is not low
6057
6058 } else {
6059 // We should never get here - this condition is checked before calling this function - this is just a sanity check.
6060 rbi.err_msg = fmt::format(
6061 "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score",
6062 pool_id);
6063 return -EINVAL;
6064 }
6065
6066 return 0;
6067 }
6068
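// Compute the read-balance score for a single pool: build per-OSD maps of
// primary and acting-primary PGs, aggregate crush weights and primary
// affinities, and hand the totals to set_rbi(), which fills *p_rbi.
// Returns 0 on success, or a negative errno (-EINVAL, -ENOENT, -ERANGE)
// when a reliable score cannot be produced.
//
// Hypothetical usage sketch (caller code is illustrative, not from this file):
//   read_balance_info_t rbi;
//   int r = osdmap.calc_read_balance_score(cct, pool_id, &rbi);
//   if (r >= 0) {
//     // rbi.acting_adj_score close to 1.0 means primaries are well balanced
//   }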
6069 int OSDMap::calc_read_balance_score(CephContext *cct, int64_t pool_id,
6070 read_balance_info_t *p_rbi) const
6071 {
6072 //BUG: wrong score with one PG replica 3 and 4 OSDs
6073 if (cct != nullptr)
6074 ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) << dendl;
6075
6076 OSDMap tmp_osd_map;
6077 tmp_osd_map.deepish_copy_from(*this);
6078 if (p_rbi == nullptr) {
6079 // The only case where error message is not set - this is not tested in the unit test.
6080 if (cct != nullptr)
6081 ldout(cct,30) << __func__ << " p_rbi is nullptr." << dendl;
6082 return -EINVAL;
6083 }
6084
6085 if (tmp_osd_map.pools.count(pool_id) == 0) {
6086 if (cct != nullptr)
6087 ldout(cct,30) << __func__ << " pool " << pool_id << " not found." << dendl;
6088 zero_rbi(*p_rbi);
6089 p_rbi->err_msg = fmt::format("pool {} not found", pool_id);
6090 return -ENOENT;
6091 }
6092 int rc = 0;
6093 const pg_pool_t* pool = tmp_osd_map.get_pg_pool(pool_id);
6094 auto num_pgs = pool->get_pg_num();
6095
6096 map<uint64_t,set<pg_t>> pgs_by_osd;
6097 map<uint64_t,set<pg_t>> prim_pgs_by_osd;
6098 map<uint64_t,set<pg_t>> acting_prims_by_osd;
6099
6100 pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pool_id, &prim_pgs_by_osd, &acting_prims_by_osd);
6101
6102 if (cct != nullptr)
6103 ldout(cct,30) << __func__ << " Primaries for pool: "
6104 << prim_pgs_by_osd << dendl;
6105
6106 if (pgs_by_osd.empty()) {
6107 //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id);
6108 return -EINVAL;
6109 }
6110 if (cct != nullptr) {
6111 for (auto& [osd,pgs] : prim_pgs_by_osd) {
6112 ldout(cct,20) << __func__ << " Pool " << pool_id << " OSD." << osd
6113 << " has " << pgs.size() << " primary PGs, "
6114 << acting_prims_by_osd[osd].size() << " acting primaries."
6115 << dendl;
6116 }
6117 }
6118
6119 auto num_osds = pgs_by_osd.size();
6120
6121 float avg_prims_per_osd = (float)num_pgs / (float)num_osds;
6122 uint64_t max_prims_per_osd = 0;
6123 uint64_t max_acting_prims_per_osd = 0;
6124 float max_osd_score = 0.;
6125 bool prim_on_zero_pa = false;
6126 bool acting_on_zero_pa = false;
6127
6128 float prim_affinity_sum = 0.;
6129 float total_osd_weight = 0.;
6130 float total_weighted_pa = 0.;
6131
6132 map<int,float> osds_crush_weight;
6133 // Look up the pool's crush rule and its per-OSD weight map
6134 int ruleno = tmp_osd_map.pools.at(pool_id).get_crush_rule();
6135 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &osds_crush_weight);
6136
6137 if (cct != nullptr) {
6138 ldout(cct,20) << __func__ << " pool " << pool_id
6139 << " ruleno " << ruleno
6140 << " weight-map " << osds_crush_weight
6141 << dendl;
6142 }
6143 uint osd_pa_count = 0;
6144
6145 for (auto [osd, oweight] : osds_crush_weight) { // loop over all OSDs
6146 total_osd_weight += oweight;
6147 float osd_pa = tmp_osd_map.get_primary_affinityf(osd);
6148 total_weighted_pa += oweight * osd_pa;
6149 if (osd_pa != 0.) {
6150 osd_pa_count++;
6151 }
6152 if (prim_pgs_by_osd.count(osd)) {
6153 auto n_prims = prim_pgs_by_osd.at(osd).size();
6154 max_prims_per_osd = std::max(max_prims_per_osd, n_prims);
6155 if (osd_pa == 0.) {
6156 prim_on_zero_pa = true;
6157 }
6158 }
6159 if (acting_prims_by_osd.count(osd)) {
6160 auto n_aprims = acting_prims_by_osd.at(osd).size();
6161 max_acting_prims_per_osd = std::max(max_acting_prims_per_osd, n_aprims);
6162 if (osd_pa != 0.) {
6163 max_osd_score = std::max(max_osd_score, float(n_aprims) / osd_pa);
6164 }
6165 else {
6166 acting_on_zero_pa = true;
6167 }
6168 }
6169
6170 prim_affinity_sum += osd_pa;
6171 if (cct != nullptr) {
6172 auto np = prim_pgs_by_osd.count(osd) ? prim_pgs_by_osd.at(osd).size() : 0;
6173 auto nap = acting_prims_by_osd.count(osd) ? acting_prims_by_osd.at(osd).size() : 0;
6174 auto wt = osds_crush_weight.count(osd) ? osds_crush_weight.at(osd) : 0.;
6175 ldout(cct,30) << __func__ << " OSD." << osd << " info: "
6176 << " num_primaries " << np
6177 << " num_acting_prims " << nap
6178 << " prim_affinity " << tmp_osd_map.get_primary_affinityf(osd)
6179 << " weight " << wt
6180 << dendl;
6181 }
6182 }
6183 if (cct != nullptr) {
6184 ldout(cct,30) << __func__ << " pool " << pool_id
6185 << " total_osd_weight " << total_osd_weight
6186 << " total_weighted_pa " << total_weighted_pa
6187 << dendl;
6188 }
6189
6190 if (prim_affinity_sum == 0.0) {
6191 if (cct != nullptr) {
6192 ldout(cct, 10) << __func__ << " pool " << pool_id
6193 << " has primary_affinity set to zero on all OSDs" << dendl;
6194 }
6195 zero_rbi(*p_rbi);
6196 p_rbi->err_msg = fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id);
6197
6198 return -ERANGE; // score has a different meaning now.
6199 }
6200 else {
6201 max_osd_score *= prim_affinity_sum / num_osds;
6202 }
6203
6204 rc = tmp_osd_map.set_rbi(cct, *p_rbi, pool_id, total_weighted_pa,
6205 prim_affinity_sum, num_osds, osd_pa_count,
6206 total_osd_weight, max_prims_per_osd,
6207 max_acting_prims_per_osd, avg_prims_per_osd,
6208 prim_on_zero_pa, acting_on_zero_pa, max_osd_score);
6209
6210 if (cct != nullptr) {
6211 ldout(cct,30) << __func__ << " pool " << get_pool_name(pool_id)
6212 << " pa_avg " << p_rbi->pa_avg
6213 << " pa_weighted " << p_rbi->pa_weighted
6214 << " pa_weighted_avg " << p_rbi->pa_weighted_avg
6215 << " optimal_score " << p_rbi->optimal_score
6216 << " adjusted_score " << p_rbi->adjusted_score
6217 << " acting_adj_score " << p_rbi->acting_adj_score
6218 << dendl;
6219 ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id)
6220 << " raw_score: " << p_rbi->raw_score
6221 << " acting_raw_score: " << p_rbi->acting_raw_score
6222 << dendl;
6223 ldout(cct,10) << __func__ << " pool " << get_pool_name(pool_id)
6224 << " wl_score: " << p_rbi->acting_adj_score << dendl;
6225 }
6226
6227 return rc;
6228 }
6229
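// Resolve a crush bucket name to the set of OSDs (leaves) beneath it.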
6230 int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
6231 {
6232 return crush->get_leaves(name, osds);
6233 }
6234
6235 // get pools whose crush rules might reference the given osd
6236 void OSDMap::get_pool_ids_by_osd(CephContext *cct,
6237 int osd,
6238 set<int64_t> *pool_ids) const
6239 {
6240 ceph_assert(pool_ids);
6241 set<int> raw_rules;
6242 int r = crush->get_rules_by_osd(osd, &raw_rules);
6243 if (r < 0) {
6244 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
6245 << dendl;
6246 ceph_assert(r >= 0);
6247 }
6248 set<int> rules;
6249 for (auto &i: raw_rules) {
6250 // exclude any dead rule
6251 if (crush_rule_in_use(i)) {
6252 rules.insert(i);
6253 }
6254 }
6255 for (auto &r: rules) {
6256 get_pool_ids_by_rule(r, pool_ids);
6257 }
6258 }
6259
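// Walks the crush tree (via CrushTreeDumper) and accumulates per-OSD and
// per-bucket utilization from PGMap statistics, optionally filtered by a
// crush node name, a device class, or a pool name. Subclasses render each
// item either as a plain-text table row or through a Formatter.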
6260 template <typename F>
6261 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
6262 public:
6263 typedef CrushTreeDumper::Dumper<F> Parent;
6264
6265 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
6266 const PGMap& pgmap_, bool tree_,
6267 const string& filter) :
6268 Parent(crush, osdmap_->get_pool_names()),
6269 osdmap(osdmap_),
6270 pgmap(pgmap_),
6271 tree(tree_),
6272 min_var(-1),
6273 max_var(-1),
6274 stddev(0),
6275 sum(0) {
6276 if (osdmap->crush->name_exists(filter)) {
6277 // filter by crush node
6278 auto item_id = osdmap->crush->get_item_id(filter);
6279 allowed.insert(item_id);
6280 osdmap->crush->get_all_children(item_id, &allowed);
6281 } else if (osdmap->crush->class_exists(filter)) {
6282 // filter by device class
6283 class_id = osdmap->crush->get_class_id(filter);
6284 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
6285 pool_id >= 0) {
6286 // filter by pool
6287 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
6288 set<int> roots;
6289 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
6290 allowed = roots;
6291 for (auto r : roots)
6292 osdmap->crush->get_all_children(r, &allowed);
6293 }
6294 average_util = average_utilization();
6295 }
6296
6297 protected:
6298
6299 bool should_dump(int id) const {
6300 if (!allowed.empty() && !allowed.count(id)) // filter by name
6301 return false;
6302 if (id >= 0 && class_id >= 0) {
6303 auto item_class_id = osdmap->crush->get_item_class_id(id);
6304 if (item_class_id < 0 || // not bound to a class yet
6305 item_class_id != class_id) // or already bound to a different class
6306 return false;
6307 }
6308 return true;
6309 }
6310
6311 set<int> get_dumped_osds() {
6312 if (allowed.empty() && class_id < 0) {
6313 // old way, all
6314 return {};
6315 }
6316 return dumped_osds;
6317 }
6318
6319 void dump_stray(F *f) {
6320 for (int i = 0; i < osdmap->get_max_osd(); i++) {
6321 if (osdmap->exists(i) && !this->is_touched(i))
6322 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
6323 }
6324 }
6325
6326 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
6327 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
6328 return;
6329 if (!should_dump(qi.id))
6330 return;
6331
6332 if (!qi.is_bucket())
6333 dumped_osds.insert(qi.id);
6334 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
6335 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
6336 kb_used_meta = 0, kb_avail = 0;
6337 double util = 0;
6338 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
6339 &kb_used_omap, &kb_used_meta, &kb_avail))
6340 if (kb_used && kb)
6341 util = 100.0 * (double)kb_used / (double)kb;
6342
6343 double var = 1.0;
6344 if (average_util)
6345 var = util / average_util;
6346
6347 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
6348
6349 dump_item(qi, reweight, kb, kb_used,
6350 kb_used_data, kb_used_omap, kb_used_meta,
6351 kb_avail, util, var, num_pgs, f);
6352
6353 if (!qi.is_bucket() && reweight > 0) {
6354 if (min_var < 0 || var < min_var)
6355 min_var = var;
6356 if (max_var < 0 || var > max_var)
6357 max_var = var;
6358
6359 double dev = util - average_util;
6360 dev *= dev;
6361 stddev += reweight * dev;
6362 sum += reweight;
6363 }
6364 }
6365
6366 virtual void dump_item(const CrushTreeDumper::Item &qi,
6367 float &reweight,
6368 int64_t kb,
6369 int64_t kb_used,
6370 int64_t kb_used_data,
6371 int64_t kb_used_omap,
6372 int64_t kb_used_meta,
6373 int64_t kb_avail,
6374 double& util,
6375 double& var,
6376 const size_t num_pgs,
6377 F *f) = 0;
6378
6379 double dev() {
6380 return sum > 0 ? sqrt(stddev / sum) : 0;
6381 }
6382
6383 double average_utilization() {
6384 int64_t kb = 0, kb_used = 0;
6385 for (int i = 0; i < osdmap->get_max_osd(); i++) {
6386 if (!osdmap->exists(i) ||
6387 osdmap->get_weight(i) == 0 ||
6388 !should_dump(i))
6389 continue;
6390 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
6391 kb_avail_i;
6392 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
6393 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
6394 kb += kb_i;
6395 kb_used += kb_used_i;
6396 }
6397 }
6398 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
6399 }
6400
6401 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
6402 int64_t* kb_used_data,
6403 int64_t* kb_used_omap,
6404 int64_t* kb_used_meta,
6405 int64_t* kb_avail) const {
6406 const osd_stat_t *p = pgmap.get_osd_stat(id);
6407 if (!p) return false;
6408 *kb = p->statfs.kb();
6409 *kb_used = p->statfs.kb_used_raw();
6410 *kb_used_data = p->statfs.kb_used_data();
6411 *kb_used_omap = p->statfs.kb_used_omap();
6412 *kb_used_meta = p->statfs.kb_used_internal_metadata();
6413 *kb_avail = p->statfs.kb_avail();
6414
6415 return true;
6416 }
6417
6418 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
6419 int64_t* kb_used_data,
6420 int64_t* kb_used_omap,
6421 int64_t* kb_used_meta,
6422 int64_t* kb_avail) const {
6423 if (id >= 0) {
6424 if (osdmap->is_out(id) || !should_dump(id)) {
6425 *kb = 0;
6426 *kb_used = 0;
6427 *kb_used_data = 0;
6428 *kb_used_omap = 0;
6429 *kb_used_meta = 0;
6430 *kb_avail = 0;
6431 return true;
6432 }
6433 return get_osd_utilization(id, kb, kb_used, kb_used_data,
6434 kb_used_omap, kb_used_meta, kb_avail);
6435 }
6436
6437 *kb = 0;
6438 *kb_used = 0;
6439 *kb_used_data = 0;
6440 *kb_used_omap = 0;
6441 *kb_used_meta = 0;
6442 *kb_avail = 0;
6443
6444 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
6445 int item = osdmap->crush->get_bucket_item(id, k);
6446 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
6447 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
6448 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
6449 &kb_used_data_i, &kb_used_omap_i,
6450 &kb_used_meta_i, &kb_avail_i))
6451 return false;
6452 *kb += kb_i;
6453 *kb_used += kb_used_i;
6454 *kb_used_data += kb_used_data_i;
6455 *kb_used_omap += kb_used_omap_i;
6456 *kb_used_meta += kb_used_meta_i;
6457 *kb_avail += kb_avail_i;
6458 }
6459 return true;
6460 }
6461
6462 protected:
6463 const OSDMap *osdmap;
6464 const PGMap& pgmap;
6465 bool tree;
6466 double average_util;
6467 double min_var;
6468 double max_var;
6469 double stddev;
6470 double sum;
6471 int class_id = -1;
6472 set<int> allowed;
6473 set<int> dumped_osds;
6474 };
6475
6476
6477 class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
6478 public:
6479 typedef OSDUtilizationDumper<TextTable> Parent;
6480
6481 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
6482 const PGMap& pgmap, bool tree,
6483 const string& filter) :
6484 Parent(crush, osdmap, pgmap, tree, filter) {}
6485
6486 void dump(TextTable *tbl) {
6487 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
6488 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
6489 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
6490 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
6491 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
6492 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
6493 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
6494 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
6495 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
6496 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
6497 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
6498 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
6499 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
6500 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
6501 if (tree)
6502 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
6503
6504 Parent::dump(tbl);
6505
6506 dump_stray(tbl);
6507
6508 auto sum = pgmap.get_osd_sum(get_dumped_osds());
6509 *tbl << ""
6510 << ""
6511 << "" << "TOTAL"
6512 << byte_u_t(sum.statfs.total)
6513 << byte_u_t(sum.statfs.get_used_raw())
6514 << byte_u_t(sum.statfs.allocated)
6515 << byte_u_t(sum.statfs.omap_allocated)
6516 << byte_u_t(sum.statfs.internal_metadata)
6517 << byte_u_t(sum.statfs.available)
6518 << lowprecision_t(average_util)
6519 << ""
6520 << TextTable::endrow;
6521 }
6522
6523 protected:
6524 struct lowprecision_t {
6525 float v;
6526 explicit lowprecision_t(float _v) : v(_v) {}
6527 };
6528 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
6529
6530 using OSDUtilizationDumper<TextTable>::dump_item;
6531 void dump_item(const CrushTreeDumper::Item &qi,
6532 float &reweight,
6533 int64_t kb,
6534 int64_t kb_used,
6535 int64_t kb_used_data,
6536 int64_t kb_used_omap,
6537 int64_t kb_used_meta,
6538 int64_t kb_avail,
6539 double& util,
6540 double& var,
6541 const size_t num_pgs,
6542 TextTable *tbl) override {
6543 const char *c = crush->get_item_class(qi.id);
6544 if (!c)
6545 c = "";
6546 *tbl << qi.id
6547 << c
6548 << weightf_t(qi.weight)
6549 << weightf_t(reweight)
6550 << byte_u_t(kb << 10)
6551 << byte_u_t(kb_used << 10)
6552 << byte_u_t(kb_used_data << 10)
6553 << byte_u_t(kb_used_omap << 10)
6554 << byte_u_t(kb_used_meta << 10)
6555 << byte_u_t(kb_avail << 10)
6556 << lowprecision_t(util)
6557 << lowprecision_t(var);
6558
6559 if (qi.is_bucket()) {
6560 *tbl << "-";
6561 *tbl << "";
6562 } else {
6563 *tbl << num_pgs;
6564 if (osdmap->is_up(qi.id)) {
6565 *tbl << "up";
6566 } else if (osdmap->is_destroyed(qi.id)) {
6567 *tbl << "destroyed";
6568 } else {
6569 *tbl << "down";
6570 }
6571 }
6572
6573 if (tree) {
6574 ostringstream name;
6575 for (int k = 0; k < qi.depth; k++)
6576 name << " ";
6577 if (qi.is_bucket()) {
6578 int type = crush->get_bucket_type(qi.id);
6579 name << crush->get_type_name(type) << " "
6580 << crush->get_item_name(qi.id);
6581 } else {
6582 name << "osd." << qi.id;
6583 }
6584 *tbl << name.str();
6585 }
6586
6587 *tbl << TextTable::endrow;
6588 }
6589
6590 public:
6591 string summary() {
6592 ostringstream out;
6593 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
6594 << "/" << lowprecision_t(max_var) << " "
6595 << "STDDEV: " << lowprecision_t(dev());
6596 return out.str();
6597 }
6598 };
6599
6600 ostream& operator<<(ostream& out,
6601 const OSDUtilizationPlainDumper::lowprecision_t& v)
6602 {
6603 if (v.v < -0.01) {
6604 return out << "-";
6605 } else if (v.v < 0.001) {
6606 return out << "0";
6607 } else {
6608 std::streamsize p = out.precision();
6609 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
6610 }
6611 }
6612
6613 class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
6614 public:
6615 typedef OSDUtilizationDumper<Formatter> Parent;
6616
6617 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
6618 const PGMap& pgmap, bool tree,
6619 const string& filter) :
6620 Parent(crush, osdmap, pgmap, tree, filter) {}
6621
6622 void dump(Formatter *f) {
6623 f->open_array_section("nodes");
6624 Parent::dump(f);
6625 f->close_section();
6626
6627 f->open_array_section("stray");
6628 dump_stray(f);
6629 f->close_section();
6630 }
6631
6632 protected:
6633 using OSDUtilizationDumper<Formatter>::dump_item;
6634 void dump_item(const CrushTreeDumper::Item &qi,
6635 float &reweight,
6636 int64_t kb,
6637 int64_t kb_used,
6638 int64_t kb_used_data,
6639 int64_t kb_used_omap,
6640 int64_t kb_used_meta,
6641 int64_t kb_avail,
6642 double& util,
6643 double& var,
6644 const size_t num_pgs,
6645 Formatter *f) override {
6646 f->open_object_section("item");
6647 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
6648 f->dump_float("reweight", reweight);
6649 f->dump_int("kb", kb);
6650 f->dump_int("kb_used", kb_used);
6651 f->dump_int("kb_used_data", kb_used_data);
6652 f->dump_int("kb_used_omap", kb_used_omap);
6653 f->dump_int("kb_used_meta", kb_used_meta);
6654 f->dump_int("kb_avail", kb_avail);
6655 f->dump_float("utilization", util);
6656 f->dump_float("var", var);
6657 f->dump_unsigned("pgs", num_pgs);
6658 if (!qi.is_bucket()) {
6659 if (osdmap->is_up(qi.id)) {
6660 f->dump_string("status", "up");
6661 } else if (osdmap->is_destroyed(qi.id)) {
6662 f->dump_string("status", "destroyed");
6663 } else {
6664 f->dump_string("status", "down");
6665 }
6666 }
6667 CrushTreeDumper::dump_bucket_children(crush, qi, f);
6668 f->close_section();
6669 }
6670
6671 public:
6672 void summary(Formatter *f) {
6673 f->open_object_section("summary");
6674 auto sum = pgmap.get_osd_sum(get_dumped_osds());
6675 auto& s = sum.statfs;
6676
6677 f->dump_int("total_kb", s.kb());
6678 f->dump_int("total_kb_used", s.kb_used_raw());
6679 f->dump_int("total_kb_used_data", s.kb_used_data());
6680 f->dump_int("total_kb_used_omap", s.kb_used_omap());
6681 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
6682 f->dump_int("total_kb_avail", s.kb_avail());
6683 f->dump_float("average_utilization", average_util);
6684 f->dump_float("min_var", min_var);
6685 f->dump_float("max_var", max_var);
6686 f->dump_float("dev", dev());
6687 f->close_section();
6688 }
6689 };
6690
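// Render OSD utilization either as structured output (when a Formatter is
// supplied) or as a plain-text table followed by a MIN/MAX VAR and STDDEV
// summary line.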
6691 void print_osd_utilization(const OSDMap& osdmap,
6692 const PGMap& pgmap,
6693 ostream& out,
6694 Formatter *f,
6695 bool tree,
6696 const string& filter)
6697 {
6698 const CrushWrapper *crush = osdmap.crush.get();
6699 if (f) {
6700 f->open_object_section("df");
6701 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
6702 d.dump(f);
6703 d.summary(f);
6704 f->close_section();
6705 f->flush(out);
6706 } else {
6707 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
6708 TextTable tbl;
6709 d.dump(&tbl);
6710 out << tbl << d.summary() << "\n";
6711 }
6712 }
6713
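// Populate health checks that can be derived purely from the OSDMap:
// down/orphan OSDs, out-of-order or exceeded full ratios, warning flags on
// the map, OSDs, crush nodes and device classes, per-pool scrub/full/pg_num/
// redundancy issues, legacy crush tunables, pending require_osd_release
// upgrades, and stretch-mode conditions.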
6714 void OSDMap::check_health(CephContext *cct,
6715 health_check_map_t *checks) const
6716 {
6717 int num_osds = get_num_osds();
6718
6719 // OSD_DOWN
6720 // OSD_$subtree_DOWN
6721 // OSD_ORPHAN
6722 if (num_osds >= 0) {
6723 int num_in_osds = 0;
6724 int num_down_in_osds = 0;
6725 set<int> osds;
6726 set<int> down_in_osds;
6727 set<int> up_in_osds;
6728 set<int> subtree_up;
6729 unordered_map<int, set<int> > subtree_type_down;
6730 unordered_map<int, int> num_osds_subtree;
6731 int max_type = crush->get_max_type_id();
6732
6733 for (int i = 0; i < get_max_osd(); i++) {
6734 if (!exists(i)) {
6735 if (crush->item_exists(i)) {
6736 osds.insert(i);
6737 }
6738 continue;
6739 }
6740 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
6741 continue;
6742 ++num_in_osds;
6743 if (down_in_osds.count(i) || up_in_osds.count(i))
6744 continue;
6745 if (!is_up(i)) {
6746 down_in_osds.insert(i);
6747 int parent_id = 0;
6748 int current = i;
6749 for (int type = 0; type <= max_type; type++) {
6750 if (!crush->get_type_name(type))
6751 continue;
6752 int r = crush->get_immediate_parent_id(current, &parent_id);
6753 if (r == -ENOENT)
6754 break;
6755 // break early if this parent is already marked as up
6756 if (subtree_up.count(parent_id))
6757 break;
6758 type = crush->get_bucket_type(parent_id);
6759 if (!subtree_type_is_down(
6760 cct, parent_id, type,
6761 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
6762 break;
6763 current = parent_id;
6764 }
6765 }
6766 }
6767
6768 // calculate the number of down osds in each down subtree and
6769 // store it in num_osds_subtree
6770 for (int type = 1; type <= max_type; type++) {
6771 if (!crush->get_type_name(type))
6772 continue;
6773 for (auto j = subtree_type_down[type].begin();
6774 j != subtree_type_down[type].end();
6775 ++j) {
6776 list<int> children;
6777 int num = 0;
6778 int num_children = crush->get_children(*j, &children);
6779 if (num_children == 0)
6780 continue;
6781 for (auto l = children.begin(); l != children.end(); ++l) {
6782 if (*l >= 0) {
6783 ++num;
6784 } else if (num_osds_subtree[*l] > 0) {
6785 num = num + num_osds_subtree[*l];
6786 }
6787 }
6788 num_osds_subtree[*j] = num;
6789 }
6790 }
6791 num_down_in_osds = down_in_osds.size();
6792 ceph_assert(num_down_in_osds <= num_in_osds);
6793 if (num_down_in_osds > 0) {
6794 // summary of down subtree types and osds
6795 for (int type = max_type; type > 0; type--) {
6796 if (!crush->get_type_name(type))
6797 continue;
6798 if (subtree_type_down[type].size() > 0) {
6799 ostringstream ss;
6800 ss << subtree_type_down[type].size() << " "
6801 << crush->get_type_name(type);
6802 if (subtree_type_down[type].size() > 1) {
6803 ss << "s";
6804 }
6805 int sum_down_osds = 0;
6806 for (auto j = subtree_type_down[type].begin();
6807 j != subtree_type_down[type].end();
6808 ++j) {
6809 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
6810 }
6811 ss << " (" << sum_down_osds << " osds) down";
6812 string err = string("OSD_") +
6813 string(crush->get_type_name(type)) + "_DOWN";
6814 boost::to_upper(err);
6815 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
6816 subtree_type_down[type].size());
6817 for (auto j = subtree_type_down[type].rbegin();
6818 j != subtree_type_down[type].rend();
6819 ++j) {
6820 ostringstream ss;
6821 ss << crush->get_type_name(type);
6822 ss << " ";
6823 ss << crush->get_item_name(*j);
6824 // at the top level, do not print location
6825 if (type != max_type) {
6826 ss << " (";
6827 ss << crush->get_full_location_ordered_string(*j);
6828 ss << ")";
6829 }
6830 int num = num_osds_subtree[*j];
6831 ss << " (" << num << " osds)";
6832 ss << " is down";
6833 d.detail.push_back(ss.str());
6834 }
6835 }
6836 }
6837 ostringstream ss;
6838 ss << down_in_osds.size() << " osds down";
6839 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
6840 down_in_osds.size());
6841 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
6842 ostringstream ss;
6843 ss << "osd." << *it << " (";
6844 ss << crush->get_full_location_ordered_string(*it);
6845 ss << ") is down";
6846 d.detail.push_back(ss.str());
6847 }
6848 }
6849
6850 if (!osds.empty()) {
6851 ostringstream ss;
6852 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
6853 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
6854 osds.size());
6855 for (auto osd : osds) {
6856 ostringstream ss;
6857 ss << "osd." << osd << " exists in crush map but not in osdmap";
6858 d.detail.push_back(ss.str());
6859 }
6860 }
6861 }
6862
6863 std::list<std::string> scrub_messages;
6864 bool noscrub = false, nodeepscrub = false;
6865 for (const auto &p : pools) {
6866 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
6867 ostringstream ss;
6868 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
6869 scrub_messages.push_back(ss.str());
6870 noscrub = true;
6871 }
6872 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
6873 ostringstream ss;
6874 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
6875 scrub_messages.push_back(ss.str());
6876 nodeepscrub = true;
6877 }
6878 }
6879 if (noscrub || nodeepscrub) {
6880 string out = "";
6881 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
6882 out += nodeepscrub ? "nodeep-scrub" : "";
6883 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
6884 "Some pool(s) have the " + out + " flag(s) set", 0);
6885 d.detail.splice(d.detail.end(), scrub_messages);
6886 }
6887
6888 // OSD_OUT_OF_ORDER_FULL
6889 {
6890 // An osd could configure its failsafe ratio to something different,
6891 // but for now assume it is the same here.
6892 float fsr = cct->_conf->osd_failsafe_full_ratio;
6893 if (fsr > 1.0) fsr /= 100;
6894 float fr = get_full_ratio();
6895 float br = get_backfillfull_ratio();
6896 float nr = get_nearfull_ratio();
6897
6898 list<string> detail;
6899 // These checks correspond to how OSDService::check_full_status() in an OSD
6900 // handles the improper setting of these values.
6901 if (br < nr) {
6902 ostringstream ss;
6903 ss << "backfillfull_ratio (" << br
6904 << ") < nearfull_ratio (" << nr << "), increased";
6905 detail.push_back(ss.str());
6906 br = nr;
6907 }
6908 if (fr < br) {
6909 ostringstream ss;
6910 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
6911 << "), increased";
6912 detail.push_back(ss.str());
6913 fr = br;
6914 }
6915 if (fsr < fr) {
6916 ostringstream ss;
6917 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
6918 << "), increased";
6919 detail.push_back(ss.str());
6920 }
6921 if (!detail.empty()) {
6922 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
6923 "full ratio(s) out of order", 0);
6924 d.detail.swap(detail);
6925 }
6926 }
6927
6928 // OSD_FULL
6929 // OSD_NEARFULL
6930 // OSD_BACKFILLFULL
6931 // OSD_FAILSAFE_FULL
6932 {
6933 set<int> full, backfillfull, nearfull;
6934 get_full_osd_counts(&full, &backfillfull, &nearfull);
6935 if (full.size()) {
6936 ostringstream ss;
6937 ss << full.size() << " full osd(s)";
6938 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
6939 for (auto& i: full) {
6940 ostringstream ss;
6941 ss << "osd." << i << " is full";
6942 d.detail.push_back(ss.str());
6943 }
6944 }
6945 if (backfillfull.size()) {
6946 ostringstream ss;
6947 ss << backfillfull.size() << " backfillfull osd(s)";
6948 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
6949 backfillfull.size());
6950 for (auto& i: backfillfull) {
6951 ostringstream ss;
6952 ss << "osd." << i << " is backfill full";
6953 d.detail.push_back(ss.str());
6954 }
6955 }
6956 if (nearfull.size()) {
6957 ostringstream ss;
6958 ss << nearfull.size() << " nearfull osd(s)";
6959 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
6960 for (auto& i: nearfull) {
6961 ostringstream ss;
6962 ss << "osd." << i << " is near full";
6963 d.detail.push_back(ss.str());
6964 }
6965 }
6966 }
6967
6968 // OSDMAP_FLAGS
6969 {
6970 // warn about flags
6971 uint64_t warn_flags =
6972 CEPH_OSDMAP_PAUSERD |
6973 CEPH_OSDMAP_PAUSEWR |
6974 CEPH_OSDMAP_PAUSEREC |
6975 CEPH_OSDMAP_NOUP |
6976 CEPH_OSDMAP_NODOWN |
6977 CEPH_OSDMAP_NOIN |
6978 CEPH_OSDMAP_NOOUT |
6979 CEPH_OSDMAP_NOBACKFILL |
6980 CEPH_OSDMAP_NORECOVER |
6981 CEPH_OSDMAP_NOSCRUB |
6982 CEPH_OSDMAP_NODEEP_SCRUB |
6983 CEPH_OSDMAP_NOTIERAGENT |
6984 CEPH_OSDMAP_NOSNAPTRIM |
6985 CEPH_OSDMAP_NOREBALANCE;
6986 if (test_flag(warn_flags)) {
6987 ostringstream ss;
6988 string s = get_flag_string(get_flags() & warn_flags);
6989 ss << s << " flag(s) set";
6990 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
6991 s.size() /* kludgey but sufficient */);
6992 }
6993 }
6994
6995 // OSD_FLAGS
6996 {
6997 list<string> detail;
6998 const unsigned flags =
6999 CEPH_OSD_NOUP |
7000 CEPH_OSD_NOIN |
7001 CEPH_OSD_NODOWN |
7002 CEPH_OSD_NOOUT;
7003 for (int i = 0; i < max_osd; ++i) {
7004 if (osd_state[i] & flags) {
7005 ostringstream ss;
7006 set<string> states;
7007 OSDMap::calc_state_set(osd_state[i] & flags, states);
7008 ss << "osd." << i << " has flags " << states;
7009 detail.push_back(ss.str());
7010 }
7011 }
7012 for (auto& i : crush_node_flags) {
7013 if (i.second && crush->item_exists(i.first)) {
7014 ostringstream ss;
7015 set<string> states;
7016 OSDMap::calc_state_set(i.second, states);
7017 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
7018 const char *tn = crush->get_type_name(t);
7019 ss << (tn ? tn : "node") << " "
7020 << crush->get_item_name(i.first) << " has flags " << states;
7021 detail.push_back(ss.str());
7022 }
7023 }
7024 for (auto& i : device_class_flags) {
7025 const char* class_name = crush->get_class_name(i.first);
7026 if (i.second && class_name) {
7027 ostringstream ss;
7028 set<string> states;
7029 OSDMap::calc_state_set(i.second, states);
7030 ss << "device class '" << class_name << "' has flags " << states;
7031 detail.push_back(ss.str());
7032 }
7033 }
7034 if (!detail.empty()) {
7035 ostringstream ss;
7036 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
7037 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
7038 d.detail.swap(detail);
7039 }
7040 }
7041
7042 // OLD_CRUSH_TUNABLES
7043 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
7044 string min = crush->get_min_required_version();
7045 if (min < cct->_conf->mon_crush_min_required_version) {
7046 ostringstream ss;
7047 ss << "crush map has legacy tunables (require " << min
7048 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
7049 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
7050 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7051 }
7052 }
7053
7054 // OLD_CRUSH_STRAW_CALC_VERSION
7055 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
7056 if (crush->get_straw_calc_version() == 0) {
7057 ostringstream ss;
7058 ss << "crush map has straw_calc_version=0";
7059 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
7060 d.detail.push_back(
7061 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7062 }
7063 }
7064
7065 // CACHE_POOL_NO_HIT_SET
7066 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
7067 list<string> detail;
7068 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
7069 const pg_pool_t& info = p->second;
7070 if (info.cache_mode_requires_hit_set() &&
7071 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
7072 ostringstream ss;
7073 ss << "pool '" << get_pool_name(p->first)
7074 << "' with cache_mode " << info.get_cache_mode_name()
7075 << " needs hit_set_type to be set but it is not";
7076 detail.push_back(ss.str());
7077 }
7078 }
7079 if (!detail.empty()) {
7080 ostringstream ss;
7081 ss << detail.size() << " cache pools are missing hit_sets";
7082 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
7083 detail.size());
7084 d.detail.swap(detail);
7085 }
7086 }
7087
7088 // OSD_NO_SORTBITWISE
7089 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
7090 ostringstream ss;
7091 ss << "'sortbitwise' flag is not set";
7092 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
7093 }
7094
7095 // OSD_UPGRADE_FINISHED
7096 if (auto require_release = pending_require_osd_release()) {
7097 ostringstream ss;
7098 ss << "all OSDs are running " << *require_release << " or later but"
7099 << " require_osd_release < " << *require_release;
7100 auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
7101 d.detail.push_back(ss.str());
7102 }
7103
7104 // POOL_NEARFULL/BACKFILLFULL/FULL
7105 {
7106 list<string> full_detail, backfillfull_detail, nearfull_detail;
7107 for (auto it : get_pools()) {
7108 const pg_pool_t &pool = it.second;
7109 const string& pool_name = get_pool_name(it.first);
7110 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
7111 stringstream ss;
7112 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7113 // may run out of space too,
7114 // but we want EQUOTA taking precedence
7115 ss << "pool '" << pool_name << "' is full (running out of quota)";
7116 } else {
7117 ss << "pool '" << pool_name << "' is full (no space)";
7118 }
7119 full_detail.push_back(ss.str());
7120 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
7121 stringstream ss;
7122 ss << "pool '" << pool_name << "' is backfillfull";
7123 backfillfull_detail.push_back(ss.str());
7124 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
7125 stringstream ss;
7126 ss << "pool '" << pool_name << "' is nearfull";
7127 nearfull_detail.push_back(ss.str());
7128 }
7129 }
7130 if (!full_detail.empty()) {
7131 ostringstream ss;
7132 ss << full_detail.size() << " pool(s) full";
7133 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
7134 d.detail.swap(full_detail);
7135 }
7136 if (!backfillfull_detail.empty()) {
7137 ostringstream ss;
7138 ss << backfillfull_detail.size() << " pool(s) backfillfull";
7139 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
7140 backfillfull_detail.size());
7141 d.detail.swap(backfillfull_detail);
7142 }
7143 if (!nearfull_detail.empty()) {
7144 ostringstream ss;
7145 ss << nearfull_detail.size() << " pool(s) nearfull";
7146 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
7147 nearfull_detail.size());
7148 d.detail.swap(nearfull_detail);
7149 }
7150 }
7151
7152 // POOL_PG_NUM_NOT_POWER_OF_TWO
7153 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
7154 list<string> detail;
7155 for (auto it : get_pools()) {
7156 if (!std::has_single_bit(it.second.get_pg_num_target())) {
7157 ostringstream ss;
7158 ss << "pool '" << get_pool_name(it.first)
7159 << "' pg_num " << it.second.get_pg_num_target()
7160 << " is not a power of two";
7161 detail.push_back(ss.str());
7162 }
7163 }
7164 if (!detail.empty()) {
7165 ostringstream ss;
7166 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
7167 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
7168 ss.str(), detail.size());
7169 d.detail.swap(detail);
7170 }
7171 }
7172
7173 // POOL_NO_REDUNDANCY
7174 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
7175 {
7176 list<string> detail;
7177 for (auto it : get_pools()) {
7178 if (it.second.get_size() == 1) {
7179 ostringstream ss;
7180 ss << "pool '" << get_pool_name(it.first)
7181 << "' has no replicas configured";
7182 detail.push_back(ss.str());
7183 }
7184 }
7185 if (!detail.empty()) {
7186 ostringstream ss;
7187 ss << detail.size() << " pool(s) have no replicas configured";
7188 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
7189 ss.str(), detail.size());
7190 d.detail.swap(detail);
7191 }
7192 }
7193
7194 // DEGRADED STRETCH MODE
7195 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
7196 if (recovering_stretch_mode) {
7197 stringstream ss;
7198 ss << "We are recovering stretch mode buckets, only requiring "
7199 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
7200 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
7201 ss.str(), 0);
7202 } else if (degraded_stretch_mode) {
7203 stringstream ss;
7204 ss << "We are missing stretch mode buckets, only requiring "
7205 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
7206 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
7207 ss.str(), 0);
7208 }
7209 }
7210 // UNEQUAL_WEIGHT
7211 if (stretch_mode_enabled) {
7212 vector<int> subtrees;
7213 crush->get_subtree_of_type(stretch_mode_bucket, &subtrees);
7214 if (subtrees.size() != 2) {
7215 stringstream ss;
7216 ss << "Stretch mode buckets != 2";
7217 checks->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
7218 return;
7219 }
7220 int weight1 = crush->get_item_weight(subtrees[0]);
7221 int weight2 = crush->get_item_weight(subtrees[1]);
7222 stringstream ss;
7223 if (weight1 != weight2) {
7224 ss << "Stretch mode buckets have different weights!";
7225 checks->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
7226 }
7227 }
7228 }
7229
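// Parse a list of OSD id strings into a set of ids. A leading "any", "all"
// or "*" selects every OSD; otherwise each entry must be a valid osd id,
// and -EINVAL is returned (with *ss describing the bad token) on failure.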
7230 int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
7231 ostream *ss) const
7232 {
7233 out->clear();
7234 for (auto i = ls.begin(); i != ls.end(); ++i) {
7235 if (i == ls.begin() &&
7236 (*i == "any" || *i == "all" || *i == "*")) {
7237 get_all_osds(*out);
7238 break;
7239 }
7240 long osd = ceph::common::parse_osd_id(i->c_str(), ss);
7241 if (osd < 0) {
7242 *ss << "invalid osd id '" << *i << "'";
7243 return -EINVAL;
7244 }
7245 out->insert(osd);
7246 }
7247 return 0;
7248 }
7249
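// Pick up to 'limit' up OSDs, at most one from each crush subtree of type
// 'subtree', skipping subtrees that contain 'n' (whoami) and any OSD listed
// in 'skip'. Subtrees are visited in random order; within a subtree the
// choice is derived from 'n' modulo the number of eligible OSDs.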
7250 void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
7251 string &subtree,
7252 int limit, // how many
7253 set<int> skip,
7254 set<int> *want) const {
7255 if (limit <= 0)
7256 return;
7257 int subtree_type = crush->get_type_id(subtree);
7258 if (subtree_type < 1)
7259 return;
7260 vector<int> subtrees;
7261 crush->get_subtree_of_type(subtree_type, &subtrees);
7262 std::random_device rd;
7263 std::default_random_engine rng{rd()};
7264 std::shuffle(subtrees.begin(), subtrees.end(), rng);
7265 for (auto s : subtrees) {
7266 if (limit <= 0)
7267 break;
7268 if (crush->subtree_contains(s, n))
7269 continue;
7270 vector<int> osds;
7271 crush->get_children_of_type(s, 0, &osds);
7272 if (osds.empty())
7273 continue;
7274 vector<int> up_osds;
7275 for (auto o : osds) {
7276 if (is_up(o) && !skip.count(o))
7277 up_osds.push_back(o);
7278 }
7279 if (up_osds.empty())
7280 continue;
7281 auto it = up_osds.begin();
7282 std::advance(it, (n % up_osds.size()));
7283 want->insert(*it);
7284 --limit;
7285 }
7286 }
7287
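// Raw-space multiplier for a pool: the replica count for replicated pools,
// (k + m) / k for erasure-coded pools (0.0 if the profile lacks k or m).
// Example: an EC profile with k=4, m=2 yields a rate of 6/4 = 1.5.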
7288 float OSDMap::pool_raw_used_rate(int64_t poolid) const
7289 {
7290 const pg_pool_t *pool = get_pg_pool(poolid);
7291 assert(pool != nullptr);
7292
7293 switch (pool->get_type()) {
7294 case pg_pool_t::TYPE_REPLICATED:
7295 return pool->get_size();
7296 case pg_pool_t::TYPE_ERASURE:
7297 {
7298 auto& ecp =
7299 get_erasure_code_profile(pool->erasure_code_profile);
7300 auto pm = ecp.find("m");
7301 auto pk = ecp.find("k");
7302 if (pm != ecp.end() && pk != ecp.end()) {
7303 int k = atoi(pk->second.c_str());
7304 int m = atoi(pm->second.c_str());
7305 int mk = m + k;
7306 ceph_assert(mk != 0);
7307 ceph_assert(k != 0);
7308 return (float)mk / k;
7309 } else {
7310 return 0.0;
7311 }
7312 }
7313 break;
7314 default:
7315 ceph_abort_msg("unrecognized pool type");
7316 }
7317 }
7318
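// OR together the crush_node_flags of every crush ancestor of the given OSD
// (derived from its full crush location), e.g. NOUP/NODOWN set on a host or
// rack that contains it.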
7319 unsigned OSDMap::get_osd_crush_node_flags(int osd) const
7320 {
7321 unsigned flags = 0;
7322 if (!crush_node_flags.empty()) {
7323 // the map will contain type -> name
7324 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
7325 for (auto& i : ploc) {
7326 int id = crush->get_item_id(i.second);
7327 auto p = crush_node_flags.find(id);
7328 if (p != crush_node_flags.end()) {
7329 flags |= p->second;
7330 }
7331 }
7332 }
7333 return flags;
7334 }
7335
7336 unsigned OSDMap::get_crush_node_flags(int id) const
7337 {
7338 unsigned flags = 0;
7339 auto it = crush_node_flags.find(id);
7340 if (it != crush_node_flags.end())
7341 flags = it->second;
7342 return flags;
7343 }
7344
7345 unsigned OSDMap::get_device_class_flags(int id) const
7346 {
7347 unsigned flags = 0;
7348 auto it = device_class_flags.find(id);
7349 if (it != device_class_flags.end())
7350 flags = it->second;
7351 return flags;
7352 }
7353
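// If every up OSD already supports a release newer than the map's current
// require_osd_release, return the name of the newest such release (checked
// from quincy down to nautilus); otherwise return std::nullopt.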
7354 std::optional<std::string> OSDMap::pending_require_osd_release() const
7355 {
7356 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY) &&
7357 require_osd_release < ceph_release_t::quincy) {
7358 return "quincy";
7359 }
7360 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
7361 require_osd_release < ceph_release_t::pacific) {
7362 return "pacific";
7363 }
7364 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
7365 require_osd_release < ceph_release_t::octopus) {
7366 return "octopus";
7367 }
7368 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
7369 require_osd_release < ceph_release_t::nautilus) {
7370 return "nautilus";
7371 }
7372
7373 return std::nullopt;
7374 }