]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2
TL
18#include <algorithm>
19#include <optional>
20#include <random>
21
224ce89b
WB
22#include <boost/algorithm/string.hpp>
23
7c673cae 24#include "OSDMap.h"
7c673cae 25#include "common/config.h"
3efd9988 26#include "common/errno.h"
7c673cae
FG
27#include "common/Formatter.h"
28#include "common/TextTable.h"
29#include "include/ceph_features.h"
9f95a23c 30#include "include/common_fwd.h"
7c673cae
FG
31#include "include/str_map.h"
32
33#include "common/code_environment.h"
224ce89b 34#include "mon/health_check.h"
7c673cae
FG
35
36#include "crush/CrushTreeDumper.h"
37#include "common/Clock.h"
11fdf7f2
TL
38#include "mon/PGMap.h"
39
9f95a23c
TL
40using std::list;
41using std::make_pair;
42using std::map;
43using std::multimap;
44using std::ostream;
45using std::ostringstream;
46using std::pair;
47using std::set;
48using std::string;
49using std::stringstream;
50using std::unordered_map;
51using std::vector;
52
53using ceph::decode;
54using ceph::encode;
55using ceph::Formatter;
56
7c673cae
FG
57#define dout_subsys ceph_subsys_osd
58
59MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
60MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
61
62
63// ----------------------------------
64// osd_info_t
65
66void osd_info_t::dump(Formatter *f) const
67{
68 f->dump_int("last_clean_begin", last_clean_begin);
69 f->dump_int("last_clean_end", last_clean_end);
70 f->dump_int("up_from", up_from);
71 f->dump_int("up_thru", up_thru);
72 f->dump_int("down_at", down_at);
73 f->dump_int("lost_at", lost_at);
74}
75
9f95a23c 76void osd_info_t::encode(ceph::buffer::list& bl) const
7c673cae 77{
11fdf7f2 78 using ceph::encode;
7c673cae 79 __u8 struct_v = 1;
11fdf7f2
TL
80 encode(struct_v, bl);
81 encode(last_clean_begin, bl);
82 encode(last_clean_end, bl);
83 encode(up_from, bl);
84 encode(up_thru, bl);
85 encode(down_at, bl);
86 encode(lost_at, bl);
7c673cae
FG
87}
88
9f95a23c 89void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 90{
11fdf7f2 91 using ceph::decode;
7c673cae 92 __u8 struct_v;
11fdf7f2
TL
93 decode(struct_v, bl);
94 decode(last_clean_begin, bl);
95 decode(last_clean_end, bl);
96 decode(up_from, bl);
97 decode(up_thru, bl);
98 decode(down_at, bl);
99 decode(lost_at, bl);
7c673cae
FG
100}
101
102void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
103{
104 o.push_back(new osd_info_t);
105 o.push_back(new osd_info_t);
106 o.back()->last_clean_begin = 1;
107 o.back()->last_clean_end = 2;
108 o.back()->up_from = 30;
109 o.back()->up_thru = 40;
110 o.back()->down_at = 5;
111 o.back()->lost_at = 6;
112}
113
114ostream& operator<<(ostream& out, const osd_info_t& info)
115{
116 out << "up_from " << info.up_from
117 << " up_thru " << info.up_thru
118 << " down_at " << info.down_at
119 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
120 if (info.lost_at)
121 out << " lost_at " << info.lost_at;
122 return out;
123}
124
125// ----------------------------------
126// osd_xinfo_t
127
128void osd_xinfo_t::dump(Formatter *f) const
129{
130 f->dump_stream("down_stamp") << down_stamp;
131 f->dump_float("laggy_probability", laggy_probability);
132 f->dump_int("laggy_interval", laggy_interval);
133 f->dump_int("features", features);
134 f->dump_unsigned("old_weight", old_weight);
9f95a23c
TL
135 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
136 f->dump_int("dead_epoch", dead_epoch);
7c673cae
FG
137}
138
9f95a23c 139void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
7c673cae 140{
9f95a23c
TL
141 uint8_t v = 4;
142 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
143 v = 3;
144 }
145 ENCODE_START(v, 1, bl);
11fdf7f2 146 encode(down_stamp, bl);
f67539c2 147 __u32 lp = laggy_probability * float(0xfffffffful);
11fdf7f2
TL
148 encode(lp, bl);
149 encode(laggy_interval, bl);
150 encode(features, bl);
151 encode(old_weight, bl);
9f95a23c
TL
152 if (v >= 4) {
153 encode(last_purged_snaps_scrub, bl);
154 encode(dead_epoch, bl);
155 }
7c673cae
FG
156 ENCODE_FINISH(bl);
157}
158
9f95a23c 159void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 160{
9f95a23c 161 DECODE_START(4, bl);
11fdf7f2 162 decode(down_stamp, bl);
7c673cae 163 __u32 lp;
11fdf7f2 164 decode(lp, bl);
7c673cae 165 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 166 decode(laggy_interval, bl);
7c673cae 167 if (struct_v >= 2)
11fdf7f2 168 decode(features, bl);
7c673cae
FG
169 else
170 features = 0;
171 if (struct_v >= 3)
11fdf7f2 172 decode(old_weight, bl);
7c673cae
FG
173 else
174 old_weight = 0;
9f95a23c
TL
175 if (struct_v >= 4) {
176 decode(last_purged_snaps_scrub, bl);
177 decode(dead_epoch, bl);
178 } else {
179 dead_epoch = 0;
180 }
7c673cae
FG
181 DECODE_FINISH(bl);
182}
183
184void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
185{
186 o.push_back(new osd_xinfo_t);
187 o.push_back(new osd_xinfo_t);
188 o.back()->down_stamp = utime_t(2, 3);
189 o.back()->laggy_probability = .123;
190 o.back()->laggy_interval = 123456;
191 o.back()->old_weight = 0x7fff;
192}
193
194ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
195{
196 return out << "down_stamp " << xi.down_stamp
197 << " laggy_probability " << xi.laggy_probability
198 << " laggy_interval " << xi.laggy_interval
9f95a23c
TL
199 << " old_weight " << xi.old_weight
200 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
201 << " dead_epoch " << xi.dead_epoch;
7c673cae
FG
202}
203
204// ----------------------------------
205// OSDMap::Incremental
206
207int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
208{
209 int n = 0;
210 for (auto &weight : new_weight) {
211 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
212 n++; // marked out
213 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
214 n--; // marked in
215 }
216 return n;
217}
218
219int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
220{
221 int n = 0;
222 for (auto &state : new_state) { //
223 if (state.second & CEPH_OSD_UP) {
224 if (previous->is_up(state.first))
225 n++; // marked down
226 else
227 n--; // marked up
228 }
229 }
230 return n;
231}
232
233int OSDMap::Incremental::identify_osd(uuid_d u) const
234{
235 for (auto &uuid : new_uuid)
236 if (uuid.second == u)
237 return uuid.first;
238 return -1;
239}
240
f67539c2
TL
241int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
242 const OSDMap& osdmap)
7c673cae 243{
11fdf7f2 244 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
245
246 for (auto &new_pool : new_pools) {
247 if (!new_pool.second.tiers.empty()) {
248 pg_pool_t& base = new_pool.second;
249
11fdf7f2
TL
250 auto new_rem_it = new_removed_snaps.find(new_pool.first);
251
7c673cae
FG
252 for (const auto &tier_pool : base.tiers) {
253 const auto &r = new_pools.find(tier_pool);
254 pg_pool_t *tier = 0;
255 if (r == new_pools.end()) {
256 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
257 if (!orig) {
258 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
259 return -EIO;
260 }
261 tier = get_new_pool(tier_pool, orig);
262 } else {
263 tier = &r->second;
264 }
265 if (tier->tier_of != new_pool.first) {
266 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
267 return -EIO;
268 }
269
270 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
271 << tier_pool << dendl;
272 tier->snap_seq = base.snap_seq;
273 tier->snap_epoch = base.snap_epoch;
274 tier->snaps = base.snaps;
275 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
276 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
277 pg_pool_t::FLAG_POOL_SNAPS);
278
279 if (new_rem_it != new_removed_snaps.end()) {
280 new_removed_snaps[tier_pool] = new_rem_it->second;
281 }
f67539c2
TL
282
283 tier->application_metadata = base.application_metadata;
7c673cae
FG
284 }
285 }
286 }
287 return 0;
288}
289
28e407b8
AA
290// ----------------------------------
291// OSDMap
7c673cae
FG
292
293bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
294{
295 if (id >= 0)
296 return is_down(id);
297
298 if (down_cache &&
299 down_cache->count(id)) {
300 return true;
301 }
302
303 list<int> children;
304 crush->get_children(id, &children);
305 for (const auto &child : children) {
306 if (!subtree_is_down(child, down_cache)) {
307 return false;
308 }
309 }
310 if (down_cache) {
311 down_cache->insert(id);
312 }
313 return true;
314}
315
316bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
317{
318 // use a stack-local down_cache if we didn't get one from the
319 // caller. then at least this particular call will avoid duplicated
320 // work.
321 set<int> local_down_cache;
322 if (!down_cache) {
323 down_cache = &local_down_cache;
324 }
325
326 int current = id;
327 while (true) {
328 int type;
329 if (current >= 0) {
330 type = 0;
331 } else {
332 type = crush->get_bucket_type(current);
333 }
11fdf7f2 334 ceph_assert(type >= 0);
7c673cae
FG
335
336 if (!subtree_is_down(current, down_cache)) {
337 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
338 return false;
339 }
340
341 // is this a big enough subtree to be marked as down?
342 if (type >= subtree_type) {
343 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
344 return true;
345 }
346
347 int r = crush->get_immediate_parent_id(current, &current);
348 if (r < 0) {
349 return false;
350 }
351 }
352}
353
224ce89b
WB
354bool OSDMap::subtree_type_is_down(
355 CephContext *cct,
356 int id,
357 int subtree_type,
358 set<int> *down_in_osds,
359 set<int> *up_in_osds,
360 set<int> *subtree_up,
361 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
362{
363 if (id >= 0) {
364 bool is_down_ret = is_down(id);
365 if (!is_out(id)) {
366 if (is_down_ret) {
367 down_in_osds->insert(id);
368 } else {
369 up_in_osds->insert(id);
370 }
371 }
372 return is_down_ret;
373 }
374
375 if (subtree_type_down &&
376 (*subtree_type_down)[subtree_type].count(id)) {
377 return true;
378 }
379
380 list<int> children;
381 crush->get_children(id, &children);
382 for (const auto &child : children) {
224ce89b
WB
383 if (!subtree_type_is_down(
384 cct, child, crush->get_bucket_type(child),
385 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
386 subtree_up->insert(id);
387 return false;
388 }
389 }
390 if (subtree_type_down) {
391 (*subtree_type_down)[subtree_type].insert(id);
392 }
393 return true;
394}
395
9f95a23c 396void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
7c673cae 397{
11fdf7f2 398 using ceph::encode;
7c673cae 399 __u16 v = 5;
11fdf7f2
TL
400 encode(v, bl);
401 encode(fsid, bl);
402 encode(epoch, bl);
403 encode(modified, bl);
7c673cae 404 int32_t new_t = new_pool_max;
11fdf7f2
TL
405 encode(new_t, bl);
406 encode(new_flags, bl);
407 encode(fullmap, bl);
408 encode(crush, bl);
7c673cae 409
11fdf7f2
TL
410 encode(new_max_osd, bl);
411 // for encode(new_pools, bl);
7c673cae 412 __u32 n = new_pools.size();
11fdf7f2 413 encode(n, bl);
7c673cae
FG
414 for (const auto &new_pool : new_pools) {
415 n = new_pool.first;
11fdf7f2
TL
416 encode(n, bl);
417 encode(new_pool.second, bl, 0);
7c673cae 418 }
11fdf7f2 419 // for encode(new_pool_names, bl);
7c673cae 420 n = new_pool_names.size();
11fdf7f2 421 encode(n, bl);
7c673cae
FG
422
423 for (const auto &new_pool_name : new_pool_names) {
424 n = new_pool_name.first;
11fdf7f2
TL
425 encode(n, bl);
426 encode(new_pool_name.second, bl);
7c673cae 427 }
11fdf7f2 428 // for encode(old_pools, bl);
7c673cae 429 n = old_pools.size();
11fdf7f2 430 encode(n, bl);
7c673cae
FG
431 for (auto &old_pool : old_pools) {
432 n = old_pool;
11fdf7f2 433 encode(n, bl);
7c673cae 434 }
11fdf7f2 435 encode(new_up_client, bl, 0);
31f18b77
FG
436 {
437 // legacy is map<int32_t,uint8_t>
9f95a23c 438 map<int32_t, uint8_t> os;
31f18b77 439 for (auto p : new_state) {
9f95a23c
TL
440 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
441 // that an old client could not understand.
442 // skip those!
443 uint8_t s = p.second;
444 if (p.second != 0 && s == 0)
445 continue;
446 os[p.first] = s;
447 }
448 uint32_t n = os.size();
449 encode(n, bl);
450 for (auto p : os) {
11fdf7f2 451 encode(p.first, bl);
9f95a23c 452 encode(p.second, bl);
31f18b77
FG
453 }
454 }
11fdf7f2
TL
455 encode(new_weight, bl);
456 // for encode(new_pg_temp, bl);
7c673cae 457 n = new_pg_temp.size();
11fdf7f2 458 encode(n, bl);
7c673cae
FG
459
460 for (const auto &pg_temp : new_pg_temp) {
461 old_pg_t opg = pg_temp.first.get_old_pg();
11fdf7f2
TL
462 encode(opg, bl);
463 encode(pg_temp.second, bl);
7c673cae
FG
464 }
465}
466
9f95a23c 467void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 468{
11fdf7f2 469 using ceph::encode;
7c673cae
FG
470 if ((features & CEPH_FEATURE_PGID64) == 0) {
471 encode_client_old(bl);
472 return;
473 }
474
475 // base
476 __u16 v = 6;
11fdf7f2
TL
477 encode(v, bl);
478 encode(fsid, bl);
479 encode(epoch, bl);
480 encode(modified, bl);
481 encode(new_pool_max, bl);
482 encode(new_flags, bl);
483 encode(fullmap, bl);
484 encode(crush, bl);
485
486 encode(new_max_osd, bl);
487 encode(new_pools, bl, features);
488 encode(new_pool_names, bl);
489 encode(old_pools, bl);
490 encode(new_up_client, bl, features);
31f18b77 491 {
9f95a23c 492 map<int32_t, uint8_t> os;
31f18b77 493 for (auto p : new_state) {
9f95a23c
TL
494 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
495 // that an old client could not understand.
496 // skip those!
497 uint8_t s = p.second;
498 if (p.second != 0 && s == 0)
499 continue;
500 os[p.first] = s;
501 }
502 uint32_t n = os.size();
503 encode(n, bl);
504 for (auto p : os) {
11fdf7f2 505 encode(p.first, bl);
9f95a23c 506 encode(p.second, bl);
31f18b77
FG
507 }
508 }
11fdf7f2
TL
509 encode(new_weight, bl);
510 encode(new_pg_temp, bl);
7c673cae
FG
511
512 // extended
513 __u16 ev = 10;
11fdf7f2
TL
514 encode(ev, bl);
515 encode(new_hb_back_up, bl, features);
516 encode(new_up_thru, bl);
517 encode(new_last_clean_interval, bl);
518 encode(new_lost, bl);
f67539c2
TL
519 encode(new_blocklist, bl, features);
520 encode(old_blocklist, bl, features);
11fdf7f2
TL
521 encode(new_up_cluster, bl, features);
522 encode(cluster_snapshot, bl);
523 encode(new_uuid, bl);
9f95a23c 524 encode(new_xinfo, bl, features);
11fdf7f2
TL
525 encode(new_hb_front_up, bl, features);
526}
527
528template<class T>
9f95a23c 529static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
530{
531 uint32_t n = m.size();
532 encode(n, bl);
533 for (auto& i : m) {
534 encode(i.first, bl);
535 encode(i.second.legacy_addr(), bl, f);
536 }
537}
538
539template<class T>
9f95a23c 540static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
541{
542 uint32_t n = m.size();
543 encode(n, bl);
544 for (auto& i : m) {
545 if (i) {
546 encode(i->legacy_addr(), bl, f);
547 } else {
548 encode(entity_addr_t(), bl, f);
549 }
550 }
7c673cae
FG
551}
552
11fdf7f2
TL
553/* for a description of osdmap incremental versions, and when they were
554 * introduced, please refer to
555 * doc/dev/osd_internals/osdmap_versions.txt
556 */
// Encode this incremental into bl using the modern, sectioned layout.
//
// Layout produced below, in order:
//   1. an outer "meta-encoding" wrapper (ENCODE_START(8, 7, bl))
//   2. a nested client-usable section whose version depends on `features`
//   3. a nested extended, osd-only section whose version depends on
//      `features` and on change_stretch_mode
//   4. a 4-byte hole that later receives the incremental's crc
//   5. full_crc
//
// When `features` lacks CEPH_FEATURE_OSDMAP_ENC the work is delegated to
// encode_classic(). Otherwise `features` must contain
// CEPH_FEATURE_RESERVED (asserted); that bit is stripped before use.
//
// After encoding, inc_crc is set to the crc32c of everything written
// from start_offset up to the hole plus everything after it, that value
// is copied into the hole, and have_crc is set to true.
9f95a23c 557void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 558{
11fdf7f2 559 using ceph::encode;
7c673cae
FG
// peers without the sectioned encoding get the classic layout
560 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
561 encode_classic(bl, features);
562 return;
563 }
564
565 // only a select set of callers should *ever* be encoding new
566 // OSDMaps. others should be passing around the canonical encoded
567 // buffers from on high. select out those callers by passing in an
568 // "impossible" feature bit.
11fdf7f2 569 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
570 features &= ~CEPH_FEATURE_RESERVED;
571
// offsets into bl used further down to delimit the crc'd regions
572 size_t start_offset = bl.length();
573 size_t tail_offset;
11fdf7f2 574 size_t crc_offset;
9f95a23c 575 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
576
577 // meta-encoding: how we include client-used and osd-specific data
578 ENCODE_START(8, 7, bl);
579
580 {
// client section version: 8 by default, lowered to 3 / 5 / 6 when the
// target lacks SERVER_LUMINOUS / SERVER_MIMIC / SERVER_NAUTILUS
11fdf7f2 581 uint8_t v = 8;
7c673cae
FG
582 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
583 v = 3;
11fdf7f2
TL
584 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
585 v = 5;
586 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
587 v = 6;
7c673cae
FG
588 }
589 ENCODE_START(v, 1, bl); // client-usable data
11fdf7f2
TL
590 encode(fsid, bl);
591 encode(epoch, bl);
592 encode(modified, bl);
593 encode(new_pool_max, bl);
594 encode(new_flags, bl);
595 encode(fullmap, bl);
596 encode(crush, bl);
597
598 encode(new_max_osd, bl);
599 encode(new_pools, bl, features);
600 encode(new_pool_names, bl);
601 encode(old_pools, bl);
// below v7 only the legacy address of each entry is written
602 if (v >= 7) {
603 encode(new_up_client, bl, features);
604 } else {
605 encode_addrvec_map_as_addr(new_up_client, bl, features);
606 }
// below v5 the state map is narrowed to map<int32_t,uint8_t>
31f18b77 607 if (v >= 5) {
11fdf7f2 608 encode(new_state, bl);
31f18b77 609 } else {
9f95a23c 610 map<int32_t, uint8_t> os;
31f18b77 611 for (auto p : new_state) {
9f95a23c
TL
612 // new_state may only include some new flags(e.g., CEPH_OSD_NOOUT)
613 // that an old client could not understand.
614 // skip those!
615 uint8_t s = p.second;
616 if (p.second != 0 && s == 0)
617 continue;
618 os[p.first] = s;
619 }
620 uint32_t n = os.size();
621 encode(n, bl);
622 for (auto p : os) {
623 encode(p.first, bl);
624 encode(p.second, bl);
31f18b77
FG
625 }
626 }
11fdf7f2
TL
627 encode(new_weight, bl);
628 encode(new_pg_temp, bl);
629 encode(new_primary_temp, bl);
630 encode(new_primary_affinity, bl);
631 encode(new_erasure_code_profiles, bl);
632 encode(old_erasure_code_profiles, bl);
// the remaining fields are gated on the section version chosen above
7c673cae 633 if (v >= 4) {
11fdf7f2
TL
634 encode(new_pg_upmap, bl);
635 encode(old_pg_upmap, bl);
636 encode(new_pg_upmap_items, bl);
637 encode(old_pg_upmap_items, bl);
638 }
639 if (v >= 6) {
640 encode(new_removed_snaps, bl);
641 encode(new_purged_snaps, bl);
642 }
643 if (v >= 8) {
644 encode(new_last_up_change, bl);
645 encode(new_last_in_change, bl);
7c673cae
FG
646 }
647 ENCODE_FINISH(bl); // client-usable data
648 }
649
650 {
// osd-only section version: 9 by default, lowered to 2 / 6 when the
// target lacks SERVER_LUMINOUS / SERVER_NAUTILUS, then raised to at
// least 10 when this incremental changes stretch mode
f67539c2 651 uint8_t target_v = 9; // if bumping this, be aware of stretch_mode target_v 10!
7c673cae
FG
652 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
653 target_v = 2;
11fdf7f2
TL
654 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
655 target_v = 6;
7c673cae 656 }
f67539c2 657 if (change_stretch_mode) {
f67539c2
TL
658 target_v = std::max((uint8_t)10, target_v);
659 }
7c673cae 660 ENCODE_START(target_v, 1, bl); // extended, osd-only data
// below v7 the address maps are written with legacy addresses only
11fdf7f2
TL
661 if (target_v < 7) {
662 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
663 } else {
664 encode(new_hb_back_up, bl, features);
665 }
666 encode(new_up_thru, bl);
667 encode(new_last_clean_interval, bl);
668 encode(new_lost, bl);
f67539c2
TL
669 encode(new_blocklist, bl, features);
670 encode(old_blocklist, bl, features);
11fdf7f2
TL
671 if (target_v < 7) {
672 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
673 } else {
674 encode(new_up_cluster, bl, features);
675 }
676 encode(cluster_snapshot, bl);
677 encode(new_uuid, bl);
9f95a23c 678 encode(new_xinfo, bl, features);
11fdf7f2
TL
679 if (target_v < 7) {
680 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
681 } else {
682 encode(new_hb_front_up, bl, features);
683 }
684 encode(features, bl); // NOTE: features arg, not the member
7c673cae 685 if (target_v >= 3) {
11fdf7f2
TL
686 encode(new_nearfull_ratio, bl);
687 encode(new_full_ratio, bl);
688 encode(new_backfillfull_ratio, bl);
31f18b77
FG
689 }
690 // 5 was string-based new_require_min_compat_client
691 if (target_v >= 6) {
11fdf7f2
TL
692 encode(new_require_min_compat_client, bl);
693 encode(new_require_osd_release, bl);
7c673cae 694 }
81eedcae
TL
695 if (target_v >= 8) {
696 encode(new_crush_node_flags, bl);
697 }
698 if (target_v >= 9) {
699 encode(new_device_class_flags, bl);
700 }
f67539c2
TL
701 if (target_v >= 10) {
702 encode(change_stretch_mode, bl);
703 encode(new_stretch_bucket_count, bl);
704 encode(new_degraded_stretch_mode, bl);
705 encode(new_recovering_stretch_mode, bl);
706 encode(new_stretch_mode_bucket, bl);
707 encode(stretch_mode_enabled, bl);
708 }
7c673cae
FG
709 ENCODE_FINISH(bl); // osd-only data
710 }
711
// reserve room for the crc; it is filled in once everything is encoded
11fdf7f2
TL
712 crc_offset = bl.length();
713 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
714 tail_offset = bl.length();
715
11fdf7f2 716 encode(full_crc, bl);
7c673cae
FG
717
718 ENCODE_FINISH(bl); // meta-encoding wrapper
719
720 // fill in crc
// the crc covers [start_offset, crc_offset) and [tail_offset, end),
// i.e. everything this call wrote except the hole itself
9f95a23c 721 ceph::buffer::list front;
11fdf7f2 722 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae 723 inc_crc = front.crc32c(-1);
9f95a23c 724 ceph::buffer::list tail;
7c673cae
FG
725 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
726 inc_crc = tail.crc32c(inc_crc);
727 ceph_le32 crc_le;
728 crc_le = inc_crc;
11fdf7f2 729 crc_filler->copy_in(4u, (char*)&crc_le);
7c673cae
FG
730 have_crc = true;
731}
732
9f95a23c 733void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
7c673cae 734{
11fdf7f2 735 using ceph::decode;
7c673cae
FG
736 __u32 n, t;
737 // base
738 __u16 v;
11fdf7f2
TL
739 decode(v, p);
740 decode(fsid, p);
741 decode(epoch, p);
742 decode(modified, p);
7c673cae 743 if (v == 4 || v == 5) {
11fdf7f2 744 decode(n, p);
7c673cae
FG
745 new_pool_max = n;
746 } else if (v >= 6)
11fdf7f2
TL
747 decode(new_pool_max, p);
748 decode(new_flags, p);
749 decode(fullmap, p);
750 decode(crush, p);
7c673cae 751
11fdf7f2 752 decode(new_max_osd, p);
7c673cae
FG
753 if (v < 6) {
754 new_pools.clear();
11fdf7f2 755 decode(n, p);
7c673cae 756 while (n--) {
11fdf7f2
TL
757 decode(t, p);
758 decode(new_pools[t], p);
7c673cae
FG
759 }
760 } else {
11fdf7f2 761 decode(new_pools, p);
7c673cae
FG
762 }
763 if (v == 5) {
764 new_pool_names.clear();
11fdf7f2 765 decode(n, p);
7c673cae 766 while (n--) {
11fdf7f2
TL
767 decode(t, p);
768 decode(new_pool_names[t], p);
7c673cae
FG
769 }
770 } else if (v >= 6) {
11fdf7f2 771 decode(new_pool_names, p);
7c673cae
FG
772 }
773 if (v < 6) {
774 old_pools.clear();
11fdf7f2 775 decode(n, p);
7c673cae 776 while (n--) {
11fdf7f2 777 decode(t, p);
7c673cae
FG
778 old_pools.insert(t);
779 }
780 } else {
11fdf7f2 781 decode(old_pools, p);
7c673cae 782 }
11fdf7f2 783 decode(new_up_client, p);
31f18b77
FG
784 {
785 map<int32_t,uint8_t> ns;
11fdf7f2 786 decode(ns, p);
31f18b77
FG
787 for (auto q : ns) {
788 new_state[q.first] = q.second;
789 }
790 }
11fdf7f2 791 decode(new_weight, p);
7c673cae
FG
792
793 if (v < 6) {
794 new_pg_temp.clear();
11fdf7f2 795 decode(n, p);
7c673cae
FG
796 while (n--) {
797 old_pg_t opg;
9f95a23c 798 ceph::decode_raw(opg, p);
11fdf7f2 799 decode(new_pg_temp[pg_t(opg)], p);
7c673cae
FG
800 }
801 } else {
11fdf7f2 802 decode(new_pg_temp, p);
7c673cae
FG
803 }
804
805 // decode short map, too.
806 if (v == 5 && p.end())
807 return;
808
809 // extended
810 __u16 ev = 0;
811 if (v >= 5)
11fdf7f2
TL
812 decode(ev, p);
813 decode(new_hb_back_up, p);
7c673cae 814 if (v < 5)
11fdf7f2
TL
815 decode(new_pool_names, p);
816 decode(new_up_thru, p);
817 decode(new_last_clean_interval, p);
818 decode(new_lost, p);
f67539c2
TL
819 decode(new_blocklist, p);
820 decode(old_blocklist, p);
7c673cae 821 if (ev >= 6)
11fdf7f2 822 decode(new_up_cluster, p);
7c673cae 823 if (ev >= 7)
11fdf7f2 824 decode(cluster_snapshot, p);
7c673cae 825 if (ev >= 8)
11fdf7f2 826 decode(new_uuid, p);
7c673cae 827 if (ev >= 9)
11fdf7f2 828 decode(new_xinfo, p);
7c673cae 829 if (ev >= 10)
11fdf7f2 830 decode(new_hb_front_up, p);
7c673cae
FG
831}
832
11fdf7f2
TL
833/* for a description of osdmap incremental versions, and when they were
834 * introduced, please refer to
835 * doc/dev/osd_internals/osdmap_versions.txt
836 */
// Decode an incremental from bl, accepting both the sectioned layout
// and the classic one.
//
// Steps, in order:
//   1. read the outer wrapper; a wrapper struct_v < 7 means a classic
//      encoding, so rewind to start_offset, run decode_classic(), set
//      encode_features from the wrapper version, and return
//   2. decode the nested client-usable section
//   3. decode the nested extended, osd-only section
//   4. for wrapper struct_v >= 8, read inc_crc and full_crc, then verify
//      inc_crc against the crc32c of the decoded bytes (excluding the
//      stored crc itself)
//
// Note that each DECODE_START scope has its own struct_v: inside the
// nested braces it refers to that section, outside to the wrapper.
//
// Throws ceph::buffer::malformed_input when the crc does not match.
9f95a23c 837void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 838{
11fdf7f2 839 using ceph::decode;
7c673cae
FG
840 /**
841 * Older encodings of the Incremental had a single struct_v which
842 * covered the whole encoding, and was prior to our modern
843 * stuff which includes a compatv and a size. So if we see
844 * a struct_v < 7, we must rewind to the beginning and use our
845 * classic decoder.
846 */
847 size_t start_offset = bl.get_off();
848 size_t tail_offset = 0;
9f95a23c 849 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
850
851 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
852 if (struct_v < 7) {
// classic encoding: start over from the first byte
11fdf7f2 853 bl.seek(start_offset);
7c673cae
FG
854 decode_classic(bl);
// record which features the encoder must have used
855 encode_features = 0;
856 if (struct_v >= 6)
857 encode_features = CEPH_FEATURE_PGID64;
858 else
859 encode_features = 0;
860 return;
861 }
862 {
11fdf7f2
TL
863 DECODE_START(8, bl); // client-usable data
864 decode(fsid, bl);
865 decode(epoch, bl);
866 decode(modified, bl);
867 decode(new_pool_max, bl);
868 decode(new_flags, bl);
869 decode(fullmap, bl);
870 decode(crush, bl);
871
872 decode(new_max_osd, bl);
873 decode(new_pools, bl);
874 decode(new_pool_names, bl);
875 decode(old_pools, bl);
876 decode(new_up_client, bl);
// before v5 the state map is map<int32_t,uint8_t> on the wire
31f18b77 877 if (struct_v >= 5) {
11fdf7f2 878 decode(new_state, bl);
31f18b77
FG
879 } else {
880 map<int32_t,uint8_t> ns;
11fdf7f2 881 decode(ns, bl);
31f18b77
FG
882 for (auto q : ns) {
883 new_state[q.first] = q.second;
884 }
885 }
11fdf7f2
TL
886 decode(new_weight, bl);
887 decode(new_pg_temp, bl);
888 decode(new_primary_temp, bl);
// fields added in later section versions; the v2 and v3 ones are
// cleared when the encoding predates them
7c673cae 889 if (struct_v >= 2)
11fdf7f2 890 decode(new_primary_affinity, bl);
7c673cae
FG
891 else
892 new_primary_affinity.clear();
893 if (struct_v >= 3) {
11fdf7f2
TL
894 decode(new_erasure_code_profiles, bl);
895 decode(old_erasure_code_profiles, bl);
7c673cae
FG
896 } else {
897 new_erasure_code_profiles.clear();
898 old_erasure_code_profiles.clear();
899 }
900 if (struct_v >= 4) {
11fdf7f2
TL
901 decode(new_pg_upmap, bl);
902 decode(old_pg_upmap, bl);
903 decode(new_pg_upmap_items, bl);
904 decode(old_pg_upmap_items, bl);
905 }
906 if (struct_v >= 6) {
907 decode(new_removed_snaps, bl);
908 decode(new_purged_snaps, bl);
909 }
910 if (struct_v >= 8) {
911 decode(new_last_up_change, bl);
912 decode(new_last_in_change, bl);
7c673cae
FG
913 }
914 DECODE_FINISH(bl); // client-usable data
915 }
916
917 {
f67539c2 918 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
919 decode(new_hb_back_up, bl);
920 decode(new_up_thru, bl);
921 decode(new_last_clean_interval, bl);
922 decode(new_lost, bl);
f67539c2
TL
923 decode(new_blocklist, bl);
924 decode(old_blocklist, bl);
11fdf7f2
TL
925 decode(new_up_cluster, bl);
926 decode(cluster_snapshot, bl);
927 decode(new_uuid, bl);
928 decode(new_xinfo, bl);
929 decode(new_hb_front_up, bl);
// v1 of this section did not record the encoder's features
7c673cae 930 if (struct_v >= 2)
11fdf7f2 931 decode(encode_features, bl);
7c673cae
FG
932 else
933 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
// ratios default to -1 when the encoding predates them
934 if (struct_v >= 3) {
11fdf7f2
TL
935 decode(new_nearfull_ratio, bl);
936 decode(new_full_ratio, bl);
7c673cae
FG
937 } else {
938 new_nearfull_ratio = -1;
939 new_full_ratio = -1;
940 }
941 if (struct_v >= 4) {
11fdf7f2 942 decode(new_backfillfull_ratio, bl);
7c673cae
FG
943 } else {
944 new_backfillfull_ratio = -1;
945 }
// v5 stored new_require_min_compat_client as a release name string
31f18b77
FG
946 if (struct_v == 5) {
947 string r;
11fdf7f2 948 decode(r, bl);
31f18b77 949 if (r.length()) {
9f95a23c 950 new_require_min_compat_client = ceph_release_from_name(r);
31f18b77
FG
951 }
952 }
953 if (struct_v >= 6) {
11fdf7f2
TL
954 decode(new_require_min_compat_client, bl);
955 decode(new_require_osd_release, bl);
31f18b77
FG
956 } else {
// before v6 the required osd release is derived from new_flags
957 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
958 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 959 new_require_osd_release = ceph_release_t::luminous;
31f18b77
FG
960 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
961 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
9f95a23c 962 new_require_osd_release = ceph_release_t::kraken;
31f18b77 963 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
9f95a23c 964 new_require_osd_release = ceph_release_t::jewel;
31f18b77 965 } else {
9f95a23c 966 new_require_osd_release = ceph_release_t::unknown;
31f18b77
FG
967 }
968 }
81eedcae
TL
969 if (struct_v >= 8) {
970 decode(new_crush_node_flags, bl);
971 }
972 if (struct_v >= 9) {
973 decode(new_device_class_flags, bl);
974 }
f67539c2
TL
975 if (struct_v >= 10) {
976 decode(change_stretch_mode, bl);
977 decode(new_stretch_bucket_count, bl);
978 decode(new_degraded_stretch_mode, bl);
979 decode(new_recovering_stretch_mode, bl);
980 decode(new_stretch_mode_bucket, bl);
981 decode(stretch_mode_enabled, bl);
982 }
983
7c673cae
FG
984 DECODE_FINISH(bl); // osd-only data
985 }
986
// struct_v is the wrapper's version again from here on
987 if (struct_v >= 8) {
988 have_crc = true;
// capture everything decoded so far, i.e. up to the stored crc
989 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 990 decode(inc_crc, bl);
7c673cae 991 tail_offset = bl.get_off();
11fdf7f2 992 decode(full_crc, bl);
7c673cae
FG
993 } else {
994 have_crc = false;
995 full_crc = 0;
996 inc_crc = 0;
997 }
998
999 DECODE_FINISH(bl); // wrapper
1000
1001 if (have_crc) {
1002 // verify crc
1003 uint32_t actual = crc_front.crc32c(-1);
// fold in whatever follows the stored crc, if anything
1004 if (tail_offset < bl.get_off()) {
9f95a23c 1005 ceph::buffer::list tail;
7c673cae
FG
1006 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
1007 actual = tail.crc32c(actual);
1008 }
1009 if (inc_crc != actual) {
1010 ostringstream ss;
1011 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
1012 string s = ss.str();
9f95a23c 1013 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
1014 }
1015 }
1016}
1017
1018void OSDMap::Incremental::dump(Formatter *f) const
1019{
1020 f->dump_int("epoch", epoch);
1021 f->dump_stream("fsid") << fsid;
1022 f->dump_stream("modified") << modified;
11fdf7f2
TL
1023 f->dump_stream("new_last_up_change") << new_last_up_change;
1024 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
1025 f->dump_int("new_pool_max", new_pool_max);
1026 f->dump_int("new_flags", new_flags);
1027 f->dump_float("new_full_ratio", new_full_ratio);
1028 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1029 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
f67539c2
TL
1030 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1031 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
7c673cae
FG
1032
1033 if (fullmap.length()) {
1034 f->open_object_section("full_map");
1035 OSDMap full;
9f95a23c 1036 ceph::buffer::list fbl = fullmap; // kludge around constness.
11fdf7f2 1037 auto p = fbl.cbegin();
7c673cae
FG
1038 full.decode(p);
1039 full.dump(f);
1040 f->close_section();
1041 }
1042 if (crush.length()) {
1043 f->open_object_section("crush");
1044 CrushWrapper c;
9f95a23c 1045 ceph::buffer::list tbl = crush; // kludge around constness.
11fdf7f2 1046 auto p = tbl.cbegin();
7c673cae
FG
1047 c.decode(p);
1048 c.dump(f);
1049 f->close_section();
1050 }
1051
1052 f->dump_int("new_max_osd", new_max_osd);
1053
1054 f->open_array_section("new_pools");
1055
1056 for (const auto &new_pool : new_pools) {
1057 f->open_object_section("pool");
1058 f->dump_int("pool", new_pool.first);
1059 new_pool.second.dump(f);
1060 f->close_section();
1061 }
1062 f->close_section();
1063 f->open_array_section("new_pool_names");
1064
1065 for (const auto &new_pool_name : new_pool_names) {
1066 f->open_object_section("pool_name");
1067 f->dump_int("pool", new_pool_name.first);
1068 f->dump_string("name", new_pool_name.second);
1069 f->close_section();
1070 }
1071 f->close_section();
1072 f->open_array_section("old_pools");
1073
1074 for (const auto &old_pool : old_pools)
1075 f->dump_int("pool", old_pool);
1076 f->close_section();
1077
1078 f->open_array_section("new_up_osds");
1079
1080 for (const auto &upclient : new_up_client) {
1081 f->open_object_section("osd");
1082 f->dump_int("osd", upclient.first);
11fdf7f2
TL
1083 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1084 f->dump_object("public_addrs", upclient.second);
1085 if (auto p = new_up_cluster.find(upclient.first);
1086 p != new_up_cluster.end()) {
1087 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1088 f->dump_object("cluster_addrs", p->second);
1089 }
1090 if (auto p = new_hb_back_up.find(upclient.first);
1091 p != new_hb_back_up.end()) {
1092 f->dump_object("heartbeat_back_addrs", p->second);
1093 }
1094 if (auto p = new_hb_front_up.find(upclient.first);
1095 p != new_hb_front_up.end()) {
1096 f->dump_object("heartbeat_front_addrs", p->second);
1097 }
7c673cae
FG
1098 f->close_section();
1099 }
1100 f->close_section();
1101
1102 f->open_array_section("new_weight");
1103
1104 for (const auto &weight : new_weight) {
1105 f->open_object_section("osd");
1106 f->dump_int("osd", weight.first);
1107 f->dump_int("weight", weight.second);
1108 f->close_section();
1109 }
1110 f->close_section();
1111
1112 f->open_array_section("osd_state_xor");
1113 for (const auto &ns : new_state) {
1114 f->open_object_section("osd");
1115 f->dump_int("osd", ns.first);
1116 set<string> st;
1117 calc_state_set(new_state.find(ns.first)->second, st);
1118 f->open_array_section("state_xor");
1119 for (auto &state : st)
1120 f->dump_string("state", state);
1121 f->close_section();
c07f9fc5 1122 f->close_section();
7c673cae
FG
1123 }
1124 f->close_section();
1125
1126 f->open_array_section("new_pg_temp");
1127
1128 for (const auto &pg_temp : new_pg_temp) {
1129 f->open_object_section("pg");
1130 f->dump_stream("pgid") << pg_temp.first;
1131 f->open_array_section("osds");
1132
1133 for (const auto &osd : pg_temp.second)
1134 f->dump_int("osd", osd);
1135 f->close_section();
1136 f->close_section();
1137 }
1138 f->close_section();
1139
1140 f->open_array_section("primary_temp");
1141
1142 for (const auto &primary_temp : new_primary_temp) {
1143 f->dump_stream("pgid") << primary_temp.first;
1144 f->dump_int("osd", primary_temp.second);
1145 }
1146 f->close_section(); // primary_temp
1147
1148 f->open_array_section("new_pg_upmap");
1149 for (auto& i : new_pg_upmap) {
1150 f->open_object_section("mapping");
1151 f->dump_stream("pgid") << i.first;
1152 f->open_array_section("osds");
1153 for (auto osd : i.second) {
1154 f->dump_int("osd", osd);
1155 }
1156 f->close_section();
1157 f->close_section();
1158 }
1159 f->close_section();
1160 f->open_array_section("old_pg_upmap");
1161 for (auto& i : old_pg_upmap) {
1162 f->dump_stream("pgid") << i;
1163 }
1164 f->close_section();
1165
1166 f->open_array_section("new_pg_upmap_items");
1167 for (auto& i : new_pg_upmap_items) {
1168 f->open_object_section("mapping");
1169 f->dump_stream("pgid") << i.first;
1170 f->open_array_section("mappings");
1171 for (auto& p : i.second) {
1172 f->open_object_section("mapping");
1173 f->dump_int("from", p.first);
1174 f->dump_int("to", p.second);
1175 f->close_section();
1176 }
1177 f->close_section();
1178 f->close_section();
1179 }
1180 f->close_section();
1181 f->open_array_section("old_pg_upmap_items");
1182 for (auto& i : old_pg_upmap_items) {
1183 f->dump_stream("pgid") << i;
1184 }
1185 f->close_section();
1186
1187 f->open_array_section("new_up_thru");
1188
1189 for (const auto &up_thru : new_up_thru) {
1190 f->open_object_section("osd");
1191 f->dump_int("osd", up_thru.first);
1192 f->dump_int("up_thru", up_thru.second);
1193 f->close_section();
1194 }
1195 f->close_section();
1196
1197 f->open_array_section("new_lost");
1198
1199 for (const auto &lost : new_lost) {
1200 f->open_object_section("osd");
1201 f->dump_int("osd", lost.first);
1202 f->dump_int("epoch_lost", lost.second);
1203 f->close_section();
1204 }
1205 f->close_section();
1206
1207 f->open_array_section("new_last_clean_interval");
1208
1209 for (const auto &last_clean_interval : new_last_clean_interval) {
1210 f->open_object_section("osd");
1211 f->dump_int("osd", last_clean_interval.first);
1212 f->dump_int("first", last_clean_interval.second.first);
1213 f->dump_int("last", last_clean_interval.second.second);
1214 f->close_section();
1215 }
1216 f->close_section();
1217
f67539c2
TL
1218 f->open_array_section("new_blocklist");
1219 for (const auto &blist : new_blocklist) {
7c673cae
FG
1220 stringstream ss;
1221 ss << blist.first;
1222 f->dump_stream(ss.str().c_str()) << blist.second;
1223 }
1224 f->close_section();
f67539c2
TL
1225 f->open_array_section("old_blocklist");
1226 for (const auto &blist : old_blocklist)
7c673cae
FG
1227 f->dump_stream("addr") << blist;
1228 f->close_section();
1229
1230 f->open_array_section("new_xinfo");
1231 for (const auto &xinfo : new_xinfo) {
1232 f->open_object_section("xinfo");
1233 f->dump_int("osd", xinfo.first);
1234 xinfo.second.dump(f);
1235 f->close_section();
1236 }
1237 f->close_section();
1238
1239 if (cluster_snapshot.size())
1240 f->dump_string("cluster_snapshot", cluster_snapshot);
1241
1242 f->open_array_section("new_uuid");
1243 for (const auto &uuid : new_uuid) {
1244 f->open_object_section("osd");
1245 f->dump_int("osd", uuid.first);
1246 f->dump_stream("uuid") << uuid.second;
1247 f->close_section();
1248 }
1249 f->close_section();
1250
1251 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1252 f->open_array_section("old_erasure_code_profiles");
1253 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
9f95a23c 1254 f->dump_string("old", erasure_code_profile);
7c673cae
FG
1255 }
1256 f->close_section();
11fdf7f2
TL
1257
1258 f->open_array_section("new_removed_snaps");
1259 for (auto& p : new_removed_snaps) {
1260 f->open_object_section("pool");
1261 f->dump_int("pool", p.first);
1262 f->open_array_section("snaps");
1263 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1264 f->open_object_section("interval");
1265 f->dump_unsigned("begin", q.get_start());
1266 f->dump_unsigned("length", q.get_len());
1267 f->close_section();
1268 }
1269 f->close_section();
1270 f->close_section();
1271 }
1272 f->close_section();
1273 f->open_array_section("new_purged_snaps");
1274 for (auto& p : new_purged_snaps) {
1275 f->open_object_section("pool");
1276 f->dump_int("pool", p.first);
1277 f->open_array_section("snaps");
1278 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1279 f->open_object_section("interval");
1280 f->dump_unsigned("begin", q.get_start());
1281 f->dump_unsigned("length", q.get_len());
1282 f->close_section();
1283 }
1284 f->close_section();
1285 f->close_section();
1286 }
81eedcae
TL
1287 f->open_array_section("new_crush_node_flags");
1288 for (auto& i : new_crush_node_flags) {
1289 f->open_object_section("node");
1290 f->dump_int("id", i.first);
1291 set<string> st;
1292 calc_state_set(i.second, st);
1293 for (auto& j : st) {
1294 f->dump_string("flag", j);
1295 }
1296 f->close_section();
1297 }
1298 f->close_section();
1299 f->open_array_section("new_device_class_flags");
1300 for (auto& i : new_device_class_flags) {
1301 f->open_object_section("device_class");
1302 f->dump_int("id", i.first);
1303 set<string> st;
1304 calc_state_set(i.second, st);
1305 for (auto& j : st) {
1306 f->dump_string("flag", j);
1307 }
1308 f->close_section();
1309 }
1310 f->close_section();
f67539c2
TL
1311 f->open_object_section("stretch_mode");
1312 {
1313 f->dump_bool("change_stretch_mode", change_stretch_mode);
1314 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1315 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1316 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1317 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1318 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1319 }
1320 f->close_section();
11fdf7f2 1321 f->close_section();
7c673cae
FG
1322}
1323
1324void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1325{
1326 o.push_back(new Incremental);
1327}
1328
1329// ----------------------------------
1330// OSDMap
1331
1332void OSDMap::set_epoch(epoch_t e)
1333{
1334 epoch = e;
1335 for (auto &pool : pools)
1336 pool.second.last_change = e;
1337}
1338
f67539c2 1339bool OSDMap::is_blocklisted(const entity_addr_t& orig) const
7c673cae 1340{
f67539c2 1341 if (blocklist.empty()) {
7c673cae 1342 return false;
11fdf7f2
TL
1343 }
1344
f67539c2 1345 // all blocklist entries are type ANY for nautilus+
11fdf7f2
TL
1346 // FIXME: avoid this copy!
1347 entity_addr_t a = orig;
9f95a23c 1348 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1349 a.set_type(entity_addr_t::TYPE_LEGACY);
1350 } else {
1351 a.set_type(entity_addr_t::TYPE_ANY);
1352 }
7c673cae
FG
1353
1354 // this specific instance?
f67539c2 1355 if (blocklist.count(a)) {
7c673cae 1356 return true;
11fdf7f2 1357 }
7c673cae 1358
f67539c2 1359 // is entire ip blocklisted?
7c673cae 1360 if (a.is_ip()) {
11fdf7f2
TL
1361 a.set_port(0);
1362 a.set_nonce(0);
f67539c2 1363 if (blocklist.count(a)) {
11fdf7f2
TL
1364 return true;
1365 }
1366 }
1367
1368 return false;
1369}
1370
f67539c2 1371bool OSDMap::is_blocklisted(const entity_addrvec_t& av) const
11fdf7f2 1372{
f67539c2 1373 if (blocklist.empty())
11fdf7f2
TL
1374 return false;
1375
1376 for (auto& a : av.v) {
f67539c2 1377 if (is_blocklisted(a)) {
7c673cae
FG
1378 return true;
1379 }
1380 }
1381
1382 return false;
1383}
1384
f67539c2 1385void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl) const
7c673cae 1386{
f67539c2 1387 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
7c673cae
FG
1388}
1389
f67539c2 1390void OSDMap::get_blocklist(std::set<entity_addr_t> *bl) const
31f18b77 1391{
f67539c2 1392 for (const auto &i : blocklist) {
31f18b77
FG
1393 bl->insert(i.first);
1394 }
1395}
1396
7c673cae
FG
1397void OSDMap::set_max_osd(int m)
1398{
7c673cae 1399 max_osd = m;
f67539c2
TL
1400 osd_state.resize(max_osd, 0);
1401 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1402 osd_info.resize(max_osd);
1403 osd_xinfo.resize(max_osd);
1404 osd_addrs->client_addrs.resize(max_osd);
1405 osd_addrs->cluster_addrs.resize(max_osd);
1406 osd_addrs->hb_back_addrs.resize(max_osd);
1407 osd_addrs->hb_front_addrs.resize(max_osd);
1408 osd_uuid->resize(max_osd);
7c673cae 1409 if (osd_primary_affinity)
f67539c2 1410 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
7c673cae
FG
1411
1412 calc_num_osds();
1413}
1414
1415int OSDMap::calc_num_osds()
1416{
1417 num_osd = 0;
1418 num_up_osd = 0;
1419 num_in_osd = 0;
1420 for (int i=0; i<max_osd; i++) {
1421 if (osd_state[i] & CEPH_OSD_EXISTS) {
1422 ++num_osd;
1423 if (osd_state[i] & CEPH_OSD_UP) {
1424 ++num_up_osd;
1425 }
1426 if (get_weight(i) != CEPH_OSD_OUT) {
1427 ++num_in_osd;
1428 }
1429 }
1430 }
1431 return num_osd;
1432}
1433
3efd9988
FG
1434void OSDMap::get_full_pools(CephContext *cct,
1435 set<int64_t> *full,
1436 set<int64_t> *backfillfull,
1437 set<int64_t> *nearfull) const
7c673cae 1438{
11fdf7f2
TL
1439 ceph_assert(full);
1440 ceph_assert(backfillfull);
1441 ceph_assert(nearfull);
3efd9988
FG
1442 full->clear();
1443 backfillfull->clear();
1444 nearfull->clear();
1445
1446 vector<int> full_osds;
1447 vector<int> backfillfull_osds;
1448 vector<int> nearfull_osds;
7c673cae
FG
1449 for (int i = 0; i < max_osd; ++i) {
1450 if (exists(i) && is_up(i) && is_in(i)) {
1451 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1452 full_osds.push_back(i);
7c673cae 1453 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1454 backfillfull_osds.push_back(i);
7c673cae 1455 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1456 nearfull_osds.push_back(i);
7c673cae
FG
1457 }
1458 }
3efd9988
FG
1459
1460 for (auto i: full_osds) {
1461 get_pool_ids_by_osd(cct, i, full);
1462 }
1463 for (auto i: backfillfull_osds) {
1464 get_pool_ids_by_osd(cct, i, backfillfull);
1465 }
1466 for (auto i: nearfull_osds) {
1467 get_pool_ids_by_osd(cct, i, nearfull);
1468 }
7c673cae
FG
1469}
1470
31f18b77
FG
1471void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1472 set<int> *nearfull) const
1473{
1474 full->clear();
1475 backfill->clear();
1476 nearfull->clear();
1477 for (int i = 0; i < max_osd; ++i) {
1478 if (exists(i) && is_up(i) && is_in(i)) {
1479 if (osd_state[i] & CEPH_OSD_FULL)
1480 full->emplace(i);
1481 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1482 backfill->emplace(i);
1483 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1484 nearfull->emplace(i);
1485 }
1486 }
1487}
1488
7c673cae
FG
1489void OSDMap::get_all_osds(set<int32_t>& ls) const
1490{
1491 for (int i=0; i<max_osd; i++)
1492 if (exists(i))
1493 ls.insert(i);
1494}
1495
1496void OSDMap::get_up_osds(set<int32_t>& ls) const
1497{
1498 for (int i = 0; i < max_osd; i++) {
1499 if (is_up(i))
1500 ls.insert(i);
1501 }
1502}
1503
81eedcae 1504void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1505{
1506 for (int i = 0; i < max_osd; i++) {
81eedcae 1507 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1508 ls.insert(i);
1509 }
1510}
1511
11fdf7f2
TL
1512void OSDMap::get_flag_set(set<string> *flagset) const
1513{
1514 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1515 if (flags & (1<<i)) {
1516 flagset->insert(get_flag_string(flags & (1<<i)));
1517 }
1518 }
1519}
1520
7c673cae
FG
1521void OSDMap::calc_state_set(int state, set<string>& st)
1522{
1523 unsigned t = state;
1524 for (unsigned s = 1; t; s <<= 1) {
1525 if (t & s) {
1526 t &= ~s;
1527 st.insert(ceph_osd_state_name(s));
1528 }
1529 }
1530}
1531
1532void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1533{
1534 float max = 0;
1535 for (const auto &weight : weights) {
1536 if (weight.second > max)
1537 max = weight.second;
1538 }
1539
1540 for (const auto &weight : weights) {
1541 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1542 }
1543}
1544
1545int OSDMap::identify_osd(const entity_addr_t& addr) const
1546{
1547 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1548 if (exists(i) && (get_addrs(i).contains(addr) ||
1549 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1550 return i;
1551 return -1;
1552}
1553
1554int OSDMap::identify_osd(const uuid_d& u) const
1555{
1556 for (int i=0; i<max_osd; i++)
1557 if (exists(i) && get_uuid(i) == u)
1558 return i;
1559 return -1;
1560}
1561
1562int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1563{
1564 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1565 if (exists(i) && (get_addrs(i).contains(addr) ||
1566 get_cluster_addrs(i).contains(addr) ||
1567 get_hb_back_addrs(i).contains(addr) ||
1568 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1569 return i;
1570 return -1;
1571}
1572
1573int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1574{
1575 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1576 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1577 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1578 return i;
1579 return -1;
1580}
1581
1582
1583uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1584{
1585 uint64_t features = 0; // things we actually have
1586 uint64_t mask = 0; // things we could have
1587
1588 if (crush->has_nondefault_tunables())
1589 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1590 if (crush->has_nondefault_tunables2())
1591 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1592 if (crush->has_nondefault_tunables3())
1593 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1594 if (crush->has_v4_buckets())
1595 features |= CEPH_FEATURE_CRUSH_V4;
1596 if (crush->has_nondefault_tunables5())
1597 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1598 if (crush->has_incompat_choose_args()) {
1599 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1600 }
7c673cae
FG
1601 mask |= CEPH_FEATURES_CRUSH;
1602
1603 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1604 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1605 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1606
1607 for (auto &pool: pools) {
1608 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1609 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1610 }
7c673cae
FG
1611 if (!pool.second.tiers.empty() ||
1612 pool.second.is_tier()) {
1613 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1614 }
20effc67 1615 int ruleid = pool.second.get_crush_rule();
7c673cae
FG
1616 if (ruleid >= 0) {
1617 if (crush->is_v2_rule(ruleid))
1618 features |= CEPH_FEATURE_CRUSH_V2;
1619 if (crush->is_v3_rule(ruleid))
1620 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1621 if (crush->is_v5_rule(ruleid))
1622 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1623 }
1624 }
7c673cae 1625 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1626
1627 if (osd_primary_affinity) {
1628 for (int i = 0; i < max_osd; ++i) {
1629 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1630 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1631 break;
1632 }
1633 }
1634 }
1635 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1636
1637 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1638 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
9f95a23c 1639 if (require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
1640 features |= jewel_features;
1641 }
1642 mask |= jewel_features;
1643
1644 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1645 | CEPH_FEATURE_MSG_ADDR2;
9f95a23c 1646 if (require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
1647 features |= kraken_features;
1648 }
1649 mask |= kraken_features;
f67539c2
TL
1650
1651 if (stretch_mode_enabled) {
1652 features |= CEPH_FEATUREMASK_STRETCH_MODE;
1653 mask |= CEPH_FEATUREMASK_STRETCH_MODE;
1654 }
7c673cae
FG
1655 }
1656
9f95a23c 1657 if (require_min_compat_client >= ceph_release_t::nautilus) {
11fdf7f2
TL
1658 // if min_compat_client is >= nautilus, require v2 cephx signatures
1659 // from everyone
1660 features |= CEPH_FEATUREMASK_CEPHX_V2;
9f95a23c 1661 } else if (require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
1662 entity_type == CEPH_ENTITY_TYPE_OSD) {
1663 // if osds are >= nautilus, at least require the signatures from them
1664 features |= CEPH_FEATUREMASK_CEPHX_V2;
1665 }
1666 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1667
7c673cae
FG
1668 if (pmask)
1669 *pmask = mask;
1670 return features;
1671}
1672
9f95a23c 1673ceph_release_t OSDMap::get_min_compat_client() const
7c673cae
FG
1674{
1675 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1676
1677 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77 1678 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
9f95a23c 1679 return ceph_release_t::luminous; // v12.2.0
7c673cae
FG
1680 }
1681 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
9f95a23c 1682 return ceph_release_t::jewel; // v10.2.0
7c673cae
FG
1683 }
1684 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
9f95a23c 1685 return ceph_release_t::hammer; // v0.94.0
7c673cae
FG
1686 }
1687 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1688 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1689 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
9f95a23c 1690 return ceph_release_t::firefly; // v0.80.0
7c673cae
FG
1691 }
1692 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1693 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
9f95a23c 1694 return ceph_release_t::dumpling; // v0.67.0
7c673cae
FG
1695 }
1696 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
9f95a23c 1697 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae 1698 }
9f95a23c 1699 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae
FG
1700}
1701
9f95a23c 1702ceph_release_t OSDMap::get_require_min_compat_client() const
11fdf7f2
TL
1703{
1704 return require_min_compat_client;
1705}
1706
7c673cae
FG
1707void OSDMap::_calc_up_osd_features()
1708{
1709 bool first = true;
1710 cached_up_osd_features = 0;
1711 for (int osd = 0; osd < max_osd; ++osd) {
1712 if (!is_up(osd))
1713 continue;
1714 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1715 if (xi.features == 0)
1716 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1717 if (first) {
1718 cached_up_osd_features = xi.features;
1719 first = false;
1720 } else {
1721 cached_up_osd_features &= xi.features;
1722 }
1723 }
1724}
1725
1726uint64_t OSDMap::get_up_osd_features() const
1727{
1728 return cached_up_osd_features;
1729}
1730
1731void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1732{
11fdf7f2 1733 using ceph::encode;
7c673cae
FG
1734 if (o->epoch == n->epoch)
1735 return;
1736
1737 int diff = 0;
1738
1739 // do addrs match?
1740 if (o->max_osd != n->max_osd)
1741 diff++;
1742 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1743 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1744 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1745 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1746 else
1747 diff++;
11fdf7f2
TL
1748 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1749 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1750 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1751 else
1752 diff++;
11fdf7f2
TL
1753 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1754 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1755 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1756 else
1757 diff++;
11fdf7f2
TL
1758 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1759 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1760 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1761 else
1762 diff++;
1763 }
1764 if (diff == 0) {
1765 // zoinks, no differences at all!
1766 n->osd_addrs = o->osd_addrs;
1767 }
1768
1769 // does crush match?
9f95a23c 1770 ceph::buffer::list oc, nc;
11fdf7f2
TL
1771 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1772 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1773 if (oc.contents_equal(nc)) {
1774 n->crush = o->crush;
1775 }
1776
1777 // does pg_temp match?
31f18b77
FG
1778 if (*o->pg_temp == *n->pg_temp)
1779 n->pg_temp = o->pg_temp;
7c673cae
FG
1780
1781 // does primary_temp match?
1782 if (o->primary_temp->size() == n->primary_temp->size()) {
1783 if (*o->primary_temp == *n->primary_temp)
1784 n->primary_temp = o->primary_temp;
1785 }
1786
1787 // do uuids match?
1788 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1789 *o->osd_uuid == *n->osd_uuid)
1790 n->osd_uuid = o->osd_uuid;
1791}
1792
1793void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1794 const OSDMap& oldmap,
1795 const OSDMap& nextmap,
1796 Incremental *pending_inc)
7c673cae
FG
1797{
1798 ldout(cct, 10) << __func__ << dendl;
7c673cae 1799
11fdf7f2 1800 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1801 // if pool does not exist, remove any existing pg_temps associated with
1802 // it. we don't care about pg_temps on the pending_inc either; if there
1803 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1804 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1805 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1806 << " for nonexistent pool " << pg.first.pool() << dendl;
1807 pending_inc->new_pg_temp[pg.first].clear();
1808 continue;
1809 }
20effc67
TL
1810 if (!nextmap.pg_exists(pg.first)) {
1811 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1812 << " for nonexistent pg " << dendl;
1813 pending_inc->new_pg_temp[pg.first].clear();
1814 continue;
1815 }
7c673cae
FG
1816 // all osds down?
1817 unsigned num_up = 0;
1818 for (auto o : pg.second) {
11fdf7f2 1819 if (!nextmap.is_down(o)) {
7c673cae
FG
1820 ++num_up;
1821 break;
1822 }
1823 }
1824 if (num_up == 0) {
1825 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1826 << " with all down osds" << pg.second << dendl;
1827 pending_inc->new_pg_temp[pg.first].clear();
1828 continue;
1829 }
1830 // redundant pg_temp?
1831 vector<int> raw_up;
1832 int primary;
11fdf7f2 1833 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1834 bool remove = false;
11fdf7f2 1835 if (raw_up == pg.second) {
7c673cae
FG
1836 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1837 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1838 remove = true;
1839 }
1840 // oversized pg_temp?
11fdf7f2 1841 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1842 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1843 << pg.second << " exceeds pool size" << dendl;
1844 remove = true;
1845 }
1846 if (remove) {
11fdf7f2 1847 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1848 pending_inc->new_pg_temp[pg.first].clear();
1849 else
1850 pending_inc->new_pg_temp.erase(pg.first);
1851 }
1852 }
1853
11fdf7f2 1854 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1855 // primary down?
11fdf7f2 1856 if (nextmap.is_down(pg.second)) {
7c673cae
FG
1857 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1858 << " to down " << pg.second << dendl;
1859 pending_inc->new_primary_temp[pg.first] = -1;
1860 continue;
1861 }
1862 // redundant primary_temp?
1863 vector<int> real_up, templess_up;
1864 int real_primary, templess_primary;
1865 pg_t pgid = pg.first;
11fdf7f2
TL
1866 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1867 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
1868 if (real_primary == templess_primary){
1869 ldout(cct, 10) << __func__ << " removing primary_temp "
1870 << pgid << " -> " << real_primary
1871 << " (unnecessary/redundant)" << dendl;
11fdf7f2 1872 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
1873 pending_inc->new_primary_temp[pgid] = -1;
1874 else
1875 pending_inc->new_primary_temp.erase(pgid);
1876 }
1877 }
1878}
1879
494da23a 1880void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 1881{
494da23a
TL
1882 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1883 for (auto& p : pg_upmap)
1884 upmap_pgs->push_back(p.first);
1885 for (auto& p : pg_upmap_items)
1886 upmap_pgs->push_back(p.first);
1887}
94b18763 1888
494da23a
TL
1889bool OSDMap::check_pg_upmaps(
1890 CephContext *cct,
1891 const vector<pg_t>& to_check,
1892 vector<pg_t> *to_cancel,
1893 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
1894{
1895 bool any_change = false;
1896 map<int, map<int, float>> rule_weight_map;
28e407b8 1897 for (auto& pg : to_check) {
494da23a 1898 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
1899 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
1900 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
1901 << dendl;
494da23a 1902 to_cancel->push_back(pg);
11fdf7f2
TL
1903 continue;
1904 }
1905 if (pi->is_pending_merge(pg, nullptr)) {
1906 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
1907 << dendl;
494da23a 1908 to_cancel->push_back(pg);
94b18763
FG
1909 continue;
1910 }
494da23a
TL
1911 vector<int> raw, up;
1912 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
1913 auto crush_rule = get_pg_pool_crush_rule(pg);
1914 auto r = crush->verify_upmap(cct,
1915 crush_rule,
1916 get_pg_pool_size(pg),
1917 up);
a8e16298
TL
1918 if (r < 0) {
1919 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
1920 << " returning " << r
1921 << dendl;
494da23a 1922 to_cancel->push_back(pg);
a8e16298
TL
1923 continue;
1924 }
1925 // below we check against crush-topology changing..
28e407b8
AA
1926 map<int, float> weight_map;
1927 auto it = rule_weight_map.find(crush_rule);
1928 if (it == rule_weight_map.end()) {
494da23a 1929 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
1930 if (r < 0) {
1931 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
1932 << "crush_rule " << crush_rule
1933 << dendl;
28e407b8
AA
1934 continue;
1935 }
1936 rule_weight_map[crush_rule] = weight_map;
1937 } else {
1938 weight_map = it->second;
1939 }
28e407b8 1940 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 1941 << " weight_map " << weight_map
94b18763 1942 << dendl;
a8e16298 1943 for (auto osd : up) {
28e407b8
AA
1944 auto it = weight_map.find(osd);
1945 if (it == weight_map.end()) {
92f5a8d4
TL
1946 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
1947 << "been moved out of the specific crush-tree"
1948 << dendl;
494da23a 1949 to_cancel->push_back(pg);
94b18763
FG
1950 break;
1951 }
494da23a 1952 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8 1953 if (adjusted_weight == 0) {
92f5a8d4
TL
1954 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
1955 << " is out/crush-out"
1956 << dendl;
494da23a 1957 to_cancel->push_back(pg);
94b18763
FG
1958 break;
1959 }
1960 }
eafe8130
TL
1961 if (!to_cancel->empty() && to_cancel->back() == pg)
1962 continue;
1963 // okay, upmap is valid
1964 // continue to check if it is still necessary
1965 auto i = pg_upmap.find(pg);
a4b75251
TL
1966 if (i != pg_upmap.end()) {
1967 if (i->second == raw) {
1968 ldout(cct, 10) << "removing redundant pg_upmap " << i->first << " "
1969 << i->second << dendl;
1970 to_cancel->push_back(pg);
1971 continue;
1972 }
1973 if ((int)i->second.size() != get_pg_pool_size(pg)) {
1974 ldout(cct, 10) << "removing pg_upmap " << i->first << " "
1975 << i->second << " != pool size " << get_pg_pool_size(pg)
1976 << dendl;
1977 to_cancel->push_back(pg);
1978 continue;
1979 }
eafe8130
TL
1980 }
1981 auto j = pg_upmap_items.find(pg);
1982 if (j != pg_upmap_items.end()) {
1983 mempool::osdmap::vector<pair<int,int>> newmap;
1984 for (auto& p : j->second) {
1985 if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
1986 // cancel mapping if source osd does not exist anymore
1987 continue;
1988 }
1989 if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
1990 p.second >= 0 && osd_weight[p.second] == 0) {
1991 // cancel mapping if target osd is out
1992 continue;
1993 }
1994 newmap.push_back(p);
1995 }
1996 if (newmap.empty()) {
1997 ldout(cct, 10) << " removing no-op pg_upmap_items "
1998 << j->first << " " << j->second
1999 << dendl;
2000 to_cancel->push_back(pg);
2001 } else if (newmap != j->second) {
2002 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
2003 << j->first << " " << j->second
2004 << " -> " << newmap
2005 << dendl;
2006 to_remap->insert({pg, newmap});
2007 any_change = true;
2008 }
2009 }
28e407b8 2010 }
494da23a
TL
2011 any_change = any_change || !to_cancel->empty();
2012 return any_change;
2013}
2014
2015void OSDMap::clean_pg_upmaps(
2016 CephContext *cct,
2017 Incremental *pending_inc,
2018 const vector<pg_t>& to_cancel,
2019 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2020{
28e407b8 2021 for (auto &pg: to_cancel) {
494da23a
TL
2022 auto i = pending_inc->new_pg_upmap.find(pg);
2023 if (i != pending_inc->new_pg_upmap.end()) {
2024 ldout(cct, 10) << __func__ << " cancel invalid pending "
2025 << "pg_upmap entry "
2026 << i->first << "->" << i->second
2027 << dendl;
2028 pending_inc->new_pg_upmap.erase(i);
94b18763 2029 }
494da23a
TL
2030 auto j = pg_upmap.find(pg);
2031 if (j != pg_upmap.end()) {
2032 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2033 << j->first << "->" << j->second
2034 << dendl;
2035 pending_inc->old_pg_upmap.insert(pg);
2036 }
2037 auto p = pending_inc->new_pg_upmap_items.find(pg);
2038 if (p != pending_inc->new_pg_upmap_items.end()) {
2039 ldout(cct, 10) << __func__ << " cancel invalid pending "
2040 << "pg_upmap_items entry "
2041 << p->first << "->" << p->second
2042 << dendl;
2043 pending_inc->new_pg_upmap_items.erase(p);
2044 }
2045 auto q = pg_upmap_items.find(pg);
2046 if (q != pg_upmap_items.end()) {
2047 ldout(cct, 10) << __func__ << " cancel invalid "
2048 << "pg_upmap_items entry "
2049 << q->first << "->" << q->second
2050 << dendl;
2051 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
2052 }
2053 }
494da23a
TL
2054 for (auto& i : to_remap)
2055 pending_inc->new_pg_upmap_items[i.first] = i.second;
2056}
2057
2058bool OSDMap::clean_pg_upmaps(
2059 CephContext *cct,
2060 Incremental *pending_inc) const
2061{
2062 ldout(cct, 10) << __func__ << dendl;
2063 vector<pg_t> to_check;
2064 vector<pg_t> to_cancel;
2065 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2066
2067 get_upmap_pgs(&to_check);
2068 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2069 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2070 return any_change;
94b18763
FG
2071}
2072
7c673cae
FG
2073int OSDMap::apply_incremental(const Incremental &inc)
2074{
f67539c2 2075 new_blocklist_entries = false;
7c673cae
FG
2076 if (inc.epoch == 1)
2077 fsid = inc.fsid;
2078 else if (inc.fsid != fsid)
2079 return -EINVAL;
2080
11fdf7f2 2081 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
2082
2083 epoch++;
2084 modified = inc.modified;
2085
2086 // full map?
2087 if (inc.fullmap.length()) {
9f95a23c 2088 ceph::buffer::list bl(inc.fullmap);
7c673cae
FG
2089 decode(bl);
2090 return 0;
2091 }
2092
2093 // nope, incremental.
31f18b77 2094 if (inc.new_flags >= 0) {
7c673cae 2095 flags = inc.new_flags;
31f18b77
FG
2096 // the below is just to cover a newly-upgraded luminous mon
2097 // cluster that has to set require_jewel_osds or
2098 // require_kraken_osds before the osds can be upgraded to
2099 // luminous.
2100 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c
TL
2101 if (require_osd_release < ceph_release_t::kraken) {
2102 require_osd_release = ceph_release_t::kraken;
31f18b77
FG
2103 }
2104 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c
TL
2105 if (require_osd_release < ceph_release_t::jewel) {
2106 require_osd_release = ceph_release_t::jewel;
31f18b77
FG
2107 }
2108 }
2109 }
7c673cae
FG
2110
2111 if (inc.new_max_osd >= 0)
2112 set_max_osd(inc.new_max_osd);
2113
2114 if (inc.new_pool_max != -1)
2115 pool_max = inc.new_pool_max;
2116
2117 for (const auto &pool : inc.new_pools) {
2118 pools[pool.first] = pool.second;
2119 pools[pool.first].last_change = epoch;
2120 }
2121
11fdf7f2
TL
2122 new_removed_snaps = inc.new_removed_snaps;
2123 new_purged_snaps = inc.new_purged_snaps;
2124 for (auto p = new_removed_snaps.begin();
2125 p != new_removed_snaps.end();
2126 ++p) {
2127 removed_snaps_queue[p->first].union_of(p->second);
2128 }
2129 for (auto p = new_purged_snaps.begin();
2130 p != new_purged_snaps.end();
2131 ++p) {
2132 auto q = removed_snaps_queue.find(p->first);
2133 ceph_assert(q != removed_snaps_queue.end());
2134 q->second.subtract(p->second);
2135 if (q->second.empty()) {
2136 removed_snaps_queue.erase(q);
2137 }
2138 }
2139
2140 if (inc.new_last_up_change != utime_t()) {
2141 last_up_change = inc.new_last_up_change;
2142 }
2143 if (inc.new_last_in_change != utime_t()) {
2144 last_in_change = inc.new_last_in_change;
2145 }
2146
7c673cae
FG
2147 for (const auto &pname : inc.new_pool_names) {
2148 auto pool_name_entry = pool_name.find(pname.first);
2149 if (pool_name_entry != pool_name.end()) {
2150 name_pool.erase(pool_name_entry->second);
2151 pool_name_entry->second = pname.second;
2152 } else {
2153 pool_name[pname.first] = pname.second;
2154 }
2155 name_pool[pname.second] = pname.first;
2156 }
2157
2158 for (const auto &pool : inc.old_pools) {
2159 pools.erase(pool);
2160 name_pool.erase(pool_name[pool]);
2161 pool_name.erase(pool);
2162 }
2163
2164 for (const auto &weight : inc.new_weight) {
2165 set_weight(weight.first, weight.second);
2166
2167 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2168 // xinfo old_weight.
2169 if (weight.second) {
2170 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2171 osd_xinfo[weight.first].old_weight = 0;
2172 }
2173 }
2174
2175 for (const auto &primary_affinity : inc.new_primary_affinity) {
2176 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2177 }
2178
2179 // erasure_code_profiles
2180 for (const auto &profile : inc.old_erasure_code_profiles)
2181 erasure_code_profiles.erase(profile);
2182
2183 for (const auto &profile : inc.new_erasure_code_profiles) {
2184 set_erasure_code_profile(profile.first, profile.second);
2185 }
2186
2187 // up/down
2188 for (const auto &state : inc.new_state) {
2189 const auto osd = state.first;
2190 int s = state.second ? state.second : CEPH_OSD_UP;
2191 if ((osd_state[osd] & CEPH_OSD_UP) &&
2192 (s & CEPH_OSD_UP)) {
2193 osd_info[osd].down_at = epoch;
2194 osd_xinfo[osd].down_stamp = modified;
2195 }
2196 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2197 (s & CEPH_OSD_EXISTS)) {
2198 // osd is destroyed; clear out anything interesting.
2199 (*osd_uuid)[osd] = uuid_d();
2200 osd_info[osd] = osd_info_t();
2201 osd_xinfo[osd] = osd_xinfo_t();
2202 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2203 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2204 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2205 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2206 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2207 osd_state[osd] = 0;
2208 } else {
2209 osd_state[osd] ^= s;
2210 }
2211 }
2212
2213 for (const auto &client : inc.new_up_client) {
2214 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
9f95a23c 2215 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
11fdf7f2
TL
2216 osd_addrs->client_addrs[client.first].reset(
2217 new entity_addrvec_t(client.second));
2218 osd_addrs->hb_back_addrs[client.first].reset(
2219 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2220 osd_addrs->hb_front_addrs[client.first].reset(
2221 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2222
2223 osd_info[client.first].up_from = epoch;
2224 }
2225
2226 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2227 osd_addrs->cluster_addrs[cluster.first].reset(
2228 new entity_addrvec_t(cluster.second));
7c673cae
FG
2229
2230 // info
2231 for (const auto &thru : inc.new_up_thru)
2232 osd_info[thru.first].up_thru = thru.second;
2233
2234 for (const auto &interval : inc.new_last_clean_interval) {
2235 osd_info[interval.first].last_clean_begin = interval.second.first;
2236 osd_info[interval.first].last_clean_end = interval.second.second;
2237 }
2238
2239 for (const auto &lost : inc.new_lost)
2240 osd_info[lost.first].lost_at = lost.second;
2241
2242 // xinfo
2243 for (const auto &xinfo : inc.new_xinfo)
2244 osd_xinfo[xinfo.first] = xinfo.second;
2245
2246 // uuid
2247 for (const auto &uuid : inc.new_uuid)
2248 (*osd_uuid)[uuid.first] = uuid.second;
2249
2250 // pg rebuild
2251 for (const auto &pg : inc.new_pg_temp) {
2252 if (pg.second.empty())
2253 pg_temp->erase(pg.first);
2254 else
31f18b77
FG
2255 pg_temp->set(pg.first, pg.second);
2256 }
2257 if (!inc.new_pg_temp.empty()) {
2258 // make sure pg_temp is efficiently stored
2259 pg_temp->rebuild();
7c673cae
FG
2260 }
2261
2262 for (const auto &pg : inc.new_primary_temp) {
2263 if (pg.second == -1)
2264 primary_temp->erase(pg.first);
2265 else
2266 (*primary_temp)[pg.first] = pg.second;
2267 }
2268
2269 for (auto& p : inc.new_pg_upmap) {
2270 pg_upmap[p.first] = p.second;
2271 }
2272 for (auto& pg : inc.old_pg_upmap) {
2273 pg_upmap.erase(pg);
2274 }
2275 for (auto& p : inc.new_pg_upmap_items) {
2276 pg_upmap_items[p.first] = p.second;
2277 }
2278 for (auto& pg : inc.old_pg_upmap_items) {
2279 pg_upmap_items.erase(pg);
2280 }
2281
f67539c2
TL
2282 // blocklist
2283 if (!inc.new_blocklist.empty()) {
2284 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2285 new_blocklist_entries = true;
7c673cae 2286 }
f67539c2
TL
2287 for (const auto &addr : inc.old_blocklist)
2288 blocklist.erase(addr);
7c673cae 2289
81eedcae
TL
2290 for (auto& i : inc.new_crush_node_flags) {
2291 if (i.second) {
2292 crush_node_flags[i.first] = i.second;
2293 } else {
2294 crush_node_flags.erase(i.first);
2295 }
2296 }
2297
2298 for (auto& i : inc.new_device_class_flags) {
2299 if (i.second) {
2300 device_class_flags[i.first] = i.second;
2301 } else {
2302 device_class_flags.erase(i.first);
2303 }
2304 }
2305
7c673cae
FG
2306 // cluster snapshot?
2307 if (inc.cluster_snapshot.length()) {
2308 cluster_snapshot = inc.cluster_snapshot;
2309 cluster_snapshot_epoch = inc.epoch;
2310 } else {
2311 cluster_snapshot.clear();
2312 cluster_snapshot_epoch = 0;
2313 }
2314
2315 if (inc.new_nearfull_ratio >= 0) {
2316 nearfull_ratio = inc.new_nearfull_ratio;
2317 }
2318 if (inc.new_backfillfull_ratio >= 0) {
2319 backfillfull_ratio = inc.new_backfillfull_ratio;
2320 }
2321 if (inc.new_full_ratio >= 0) {
2322 full_ratio = inc.new_full_ratio;
2323 }
9f95a23c 2324 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
7c673cae
FG
2325 require_min_compat_client = inc.new_require_min_compat_client;
2326 }
9f95a23c 2327 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
31f18b77 2328 require_osd_release = inc.new_require_osd_release;
9f95a23c 2329 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 2330 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2331 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2332 }
2333 }
7c673cae 2334
9f95a23c 2335 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
11fdf7f2 2336 require_osd_release = inc.new_require_osd_release;
9f95a23c 2337 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
2338 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2339 }
2340 }
7c673cae
FG
2341 // do new crush map last (after up/down stuff)
2342 if (inc.crush.length()) {
9f95a23c 2343 ceph::buffer::list bl(inc.crush);
11fdf7f2 2344 auto blp = bl.cbegin();
7c673cae
FG
2345 crush.reset(new CrushWrapper);
2346 crush->decode(blp);
9f95a23c 2347 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77
FG
2348 // only increment if this is a luminous-encoded osdmap, lest
2349 // the mon's crush_version diverge from what the osds or others
2350 // are decoding and applying on their end. if we won't encode
2351 // it in the canonical version, don't change it.
2352 ++crush_version;
2353 }
81eedcae
TL
2354 for (auto it = device_class_flags.begin();
2355 it != device_class_flags.end();) {
2356 const char* class_name = crush->get_class_name(it->first);
2357 if (!class_name) // device class is gone
2358 it = device_class_flags.erase(it);
2359 else
2360 it++;
2361 }
7c673cae
FG
2362 }
2363
f67539c2
TL
2364 if (inc.change_stretch_mode) {
2365 stretch_mode_enabled = inc.stretch_mode_enabled;
2366 stretch_bucket_count = inc.new_stretch_bucket_count;
2367 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2368 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2369 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2370 }
2371
7c673cae
FG
2372 calc_num_osds();
2373 _calc_up_osd_features();
2374 return 0;
2375}
2376
2377// mapping
2378int OSDMap::map_to_pg(
2379 int64_t poolid,
2380 const string& name,
2381 const string& key,
2382 const string& nspace,
2383 pg_t *pg) const
2384{
2385 // calculate ps (placement seed)
2386 const pg_pool_t *pool = get_pg_pool(poolid);
2387 if (!pool)
2388 return -ENOENT;
2389 ps_t ps;
2390 if (!key.empty())
2391 ps = pool->hash_key(key, nspace);
2392 else
2393 ps = pool->hash_key(name, nspace);
2394 *pg = pg_t(ps, poolid);
2395 return 0;
2396}
2397
2398int OSDMap::object_locator_to_pg(
2399 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2400{
2401 if (loc.hash >= 0) {
2402 if (!get_pg_pool(loc.get_pool())) {
2403 return -ENOENT;
2404 }
2405 pg = pg_t(loc.hash, loc.get_pool());
2406 return 0;
2407 }
2408 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2409}
2410
2411ceph_object_layout OSDMap::make_object_layout(
2412 object_t oid, int pg_pool, string nspace) const
2413{
2414 object_locator_t loc(pg_pool, nspace);
2415
2416 ceph_object_layout ol;
2417 pg_t pgid = object_locator_to_pg(oid, loc);
2418 ol.ol_pgid = pgid.get_old_pg().v;
2419 ol.ol_stripe_unit = 0;
2420 return ol;
2421}
2422
2423void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2424 vector<int>& osds) const
2425{
2426 if (pool.can_shift_osds()) {
2427 unsigned removed = 0;
2428 for (unsigned i = 0; i < osds.size(); i++) {
2429 if (!exists(osds[i])) {
2430 removed++;
2431 continue;
2432 }
2433 if (removed) {
2434 osds[i - removed] = osds[i];
2435 }
2436 }
2437 if (removed)
2438 osds.resize(osds.size() - removed);
2439 } else {
2440 for (auto& osd : osds) {
2441 if (!exists(osd))
2442 osd = CRUSH_ITEM_NONE;
2443 }
2444 }
2445}
2446
31f18b77 2447void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2448 const pg_pool_t& pool, pg_t pg,
2449 vector<int> *osds,
2450 ps_t *ppps) const
2451{
2452 // map to osds[]
2453 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2454 unsigned size = pool.get_size();
2455
2456 // what crush rule?
20effc67 2457 int ruleno = pool.get_crush_rule();
7c673cae
FG
2458 if (ruleno >= 0)
2459 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2460
2461 _remove_nonexistent_osds(pool, *osds);
2462
2463 if (ppps)
2464 *ppps = pps;
7c673cae
FG
2465}
2466
2467int OSDMap::_pick_primary(const vector<int>& osds) const
2468{
2469 for (auto osd : osds) {
2470 if (osd != CRUSH_ITEM_NONE) {
2471 return osd;
2472 }
2473 }
2474 return -1;
2475}
2476
224ce89b 2477void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2478{
2479 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2480 auto p = pg_upmap.find(pg);
2481 if (p != pg_upmap.end()) {
2482 // make sure targets aren't marked out
2483 for (auto osd : p->second) {
91327a77
AA
2484 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2485 osd_weight[osd] == 0) {
7c673cae
FG
2486 // reject/ignore the explicit mapping
2487 return;
2488 }
2489 }
2490 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2491 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2492 }
2493
2494 auto q = pg_upmap_items.find(pg);
2495 if (q != pg_upmap_items.end()) {
181888fb
FG
2496 // NOTE: this approach does not allow a bidirectional swap,
2497 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2498 for (auto& r : q->second) {
2499 // make sure the replacement value doesn't already appear
2500 bool exists = false;
2501 ssize_t pos = -1;
2502 for (unsigned i = 0; i < raw->size(); ++i) {
2503 int osd = (*raw)[i];
2504 if (osd == r.second) {
2505 exists = true;
2506 break;
2507 }
2508 // ignore mapping if target is marked out (or invalid osd id)
2509 if (osd == r.first &&
2510 pos < 0 &&
2511 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2512 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2513 pos = i;
2514 }
2515 }
2516 if (!exists && pos >= 0) {
2517 (*raw)[pos] = r.second;
7c673cae
FG
2518 }
2519 }
2520 }
2521}
2522
2523// pg -> (up osd list)
2524void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2525 vector<int> *up) const
2526{
2527 if (pool.can_shift_osds()) {
2528 // shift left
2529 up->clear();
2530 up->reserve(raw.size());
2531 for (unsigned i=0; i<raw.size(); i++) {
2532 if (!exists(raw[i]) || is_down(raw[i]))
2533 continue;
2534 up->push_back(raw[i]);
2535 }
2536 } else {
2537 // set down/dne devices to NONE
2538 up->resize(raw.size());
2539 for (int i = raw.size() - 1; i >= 0; --i) {
2540 if (!exists(raw[i]) || is_down(raw[i])) {
2541 (*up)[i] = CRUSH_ITEM_NONE;
2542 } else {
2543 (*up)[i] = raw[i];
2544 }
2545 }
2546 }
2547}
2548
2549void OSDMap::_apply_primary_affinity(ps_t seed,
2550 const pg_pool_t& pool,
2551 vector<int> *osds,
2552 int *primary) const
2553{
2554 // do we have any non-default primary_affinity values for these osds?
2555 if (!osd_primary_affinity)
2556 return;
2557
2558 bool any = false;
2559 for (const auto osd : *osds) {
2560 if (osd != CRUSH_ITEM_NONE &&
2561 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2562 any = true;
2563 break;
2564 }
2565 }
2566 if (!any)
2567 return;
2568
2569 // pick the primary. feed both the seed (for the pg) and the osd
2570 // into the hash/rng so that a proportional fraction of an osd's pgs
2571 // get rejected as primary.
2572 int pos = -1;
2573 for (unsigned i = 0; i < osds->size(); ++i) {
2574 int o = (*osds)[i];
2575 if (o == CRUSH_ITEM_NONE)
2576 continue;
2577 unsigned a = (*osd_primary_affinity)[o];
2578 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2579 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2580 seed, o) >> 16) >= a) {
2581 // we chose not to use this primary. note it anyway as a
2582 // fallback in case we don't pick anyone else, but keep looking.
2583 if (pos < 0)
2584 pos = i;
2585 } else {
2586 pos = i;
2587 break;
2588 }
2589 }
2590 if (pos < 0)
2591 return;
2592
2593 *primary = (*osds)[pos];
2594
2595 if (pool.can_shift_osds() && pos > 0) {
2596 // move the new primary to the front.
2597 for (int i = pos; i > 0; --i) {
2598 (*osds)[i] = (*osds)[i-1];
2599 }
2600 (*osds)[0] = *primary;
2601 }
2602}
2603
2604void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2605 vector<int> *temp_pg, int *temp_primary) const
2606{
2607 pg = pool.raw_pg_to_pg(pg);
2608 const auto p = pg_temp->find(pg);
2609 temp_pg->clear();
2610 if (p != pg_temp->end()) {
2611 for (unsigned i=0; i<p->second.size(); i++) {
2612 if (!exists(p->second[i]) || is_down(p->second[i])) {
2613 if (pool.can_shift_osds()) {
2614 continue;
2615 } else {
2616 temp_pg->push_back(CRUSH_ITEM_NONE);
2617 }
2618 } else {
2619 temp_pg->push_back(p->second[i]);
2620 }
2621 }
2622 }
2623 const auto &pp = primary_temp->find(pg);
2624 *temp_primary = -1;
2625 if (pp != primary_temp->end()) {
2626 *temp_primary = pp->second;
2627 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2628 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2629 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2630 *temp_primary = (*temp_pg)[i];
2631 break;
2632 }
2633 }
2634 }
2635}
2636
31f18b77 2637void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2638{
7c673cae 2639 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2640 if (!pool) {
2641 *primary = -1;
2642 raw->clear();
31f18b77 2643 return;
11fdf7f2 2644 }
31f18b77 2645 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2646 *primary = _pick_primary(*raw);
7c673cae
FG
2647}
2648
494da23a
TL
2649void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2650 vector<int> *raw_upmap) const
a8e16298
TL
2651{
2652 auto pool = get_pg_pool(pg.pool());
2653 if (!pool) {
2654 raw_upmap->clear();
2655 return;
2656 }
494da23a
TL
2657 _pg_to_raw_osds(*pool, pg, raw, NULL);
2658 *raw_upmap = *raw;
a8e16298
TL
2659 _apply_upmap(*pool, pg, raw_upmap);
2660}
2661
7c673cae
FG
2662void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2663{
2664 const pg_pool_t *pool = get_pg_pool(pg.pool());
2665 if (!pool) {
11fdf7f2
TL
2666 *primary = -1;
2667 up->clear();
7c673cae
FG
2668 return;
2669 }
2670 vector<int> raw;
2671 ps_t pps;
2672 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2673 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2674 _raw_to_up_osds(*pool, raw, up);
2675 *primary = _pick_primary(raw);
2676 _apply_primary_affinity(pps, *pool, up, primary);
2677}
31f18b77 2678
7c673cae
FG
2679void OSDMap::_pg_to_up_acting_osds(
2680 const pg_t& pg, vector<int> *up, int *up_primary,
2681 vector<int> *acting, int *acting_primary,
2682 bool raw_pg_to_pg) const
2683{
2684 const pg_pool_t *pool = get_pg_pool(pg.pool());
2685 if (!pool ||
2686 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2687 if (up)
2688 up->clear();
2689 if (up_primary)
2690 *up_primary = -1;
2691 if (acting)
2692 acting->clear();
2693 if (acting_primary)
2694 *acting_primary = -1;
2695 return;
2696 }
2697 vector<int> raw;
2698 vector<int> _up;
2699 vector<int> _acting;
2700 int _up_primary;
2701 int _acting_primary;
2702 ps_t pps;
2703 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2704 if (_acting.empty() || up || up_primary) {
2705 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2706 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2707 _raw_to_up_osds(*pool, raw, &_up);
2708 _up_primary = _pick_primary(_up);
2709 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2710 if (_acting.empty()) {
2711 _acting = _up;
2712 if (_acting_primary == -1) {
2713 _acting_primary = _up_primary;
2714 }
2715 }
2716
2717 if (up)
2718 up->swap(_up);
2719 if (up_primary)
2720 *up_primary = _up_primary;
2721 }
2722
2723 if (acting)
2724 acting->swap(_acting);
2725 if (acting_primary)
2726 *acting_primary = _acting_primary;
2727}
2728
9f95a23c 2729int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
7c673cae 2730{
9f95a23c
TL
2731 // This implementation is broken for EC PGs since the osd may appear
2732 // multiple times in the acting set. See
2733 // https://tracker.ceph.com/issues/43213
7c673cae
FG
2734 if (!nrep)
2735 nrep = acting.size();
2736 for (int i=0; i<nrep; i++)
2737 if (acting[i] == osd)
2738 return i;
2739 return -1;
2740}
2741
9f95a23c 2742int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
7c673cae 2743{
9f95a23c
TL
2744 int nrep = acting.size();
2745 if (who.shard == shard_id_t::NO_SHARD) {
2746 for (int i=0; i<nrep; i++) {
2747 if (acting[i] == who.osd) {
2748 return i;
2749 }
2750 }
2751 } else {
2752 if (who.shard < nrep && acting[who.shard] == who.osd) {
2753 return who.shard;
2754 }
2755 }
2756 return -1;
7c673cae
FG
2757}
2758
9f95a23c 2759bool OSDMap::primary_changed_broken(
7c673cae
FG
2760 int oldprimary,
2761 const vector<int> &oldacting,
2762 int newprimary,
2763 const vector<int> &newacting)
2764{
2765 if (oldacting.empty() && newacting.empty())
2766 return false; // both still empty
2767 if (oldacting.empty() ^ newacting.empty())
2768 return true; // was empty, now not, or vice versa
2769 if (oldprimary != newprimary)
2770 return true; // primary changed
9f95a23c
TL
2771 if (calc_pg_role_broken(oldprimary, oldacting) !=
2772 calc_pg_role_broken(newprimary, newacting))
7c673cae
FG
2773 return true;
2774 return false; // same primary (tho replicas may have changed)
2775}
2776
28e407b8
AA
2777uint64_t OSDMap::get_encoding_features() const
2778{
2779 uint64_t f = SIGNIFICANT_FEATURES;
9f95a23c
TL
2780 if (require_osd_release < ceph_release_t::octopus) {
2781 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2782 }
2783 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
2784 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2785 }
9f95a23c 2786 if (require_osd_release < ceph_release_t::mimic) {
11fdf7f2
TL
2787 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2788 }
9f95a23c 2789 if (require_osd_release < ceph_release_t::luminous) {
28e407b8
AA
2790 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2791 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2792 }
9f95a23c 2793 if (require_osd_release < ceph_release_t::kraken) {
28e407b8 2794 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2795 CEPH_FEATURE_MSG_ADDR2);
28e407b8 2796 }
9f95a23c 2797 if (require_osd_release < ceph_release_t::jewel) {
28e407b8 2798 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2799 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2800 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2801 }
2802 return f;
2803}
7c673cae
FG
2804
2805// serialize, unserialize
9f95a23c 2806void OSDMap::encode_client_old(ceph::buffer::list& bl) const
7c673cae 2807{
11fdf7f2 2808 using ceph::encode;
7c673cae 2809 __u16 v = 5;
11fdf7f2 2810 encode(v, bl);
7c673cae
FG
2811
2812 // base
11fdf7f2
TL
2813 encode(fsid, bl);
2814 encode(epoch, bl);
2815 encode(created, bl);
2816 encode(modified, bl);
7c673cae 2817
11fdf7f2 2818 // for encode(pools, bl);
7c673cae 2819 __u32 n = pools.size();
11fdf7f2 2820 encode(n, bl);
7c673cae
FG
2821
2822 for (const auto &pool : pools) {
2823 n = pool.first;
11fdf7f2
TL
2824 encode(n, bl);
2825 encode(pool.second, bl, 0);
7c673cae 2826 }
11fdf7f2 2827 // for encode(pool_name, bl);
7c673cae 2828 n = pool_name.size();
11fdf7f2 2829 encode(n, bl);
7c673cae
FG
2830 for (const auto &pname : pool_name) {
2831 n = pname.first;
11fdf7f2
TL
2832 encode(n, bl);
2833 encode(pname.second, bl);
7c673cae 2834 }
11fdf7f2 2835 // for encode(pool_max, bl);
7c673cae 2836 n = pool_max;
11fdf7f2 2837 encode(n, bl);
7c673cae 2838
11fdf7f2 2839 encode(flags, bl);
7c673cae 2840
11fdf7f2 2841 encode(max_osd, bl);
31f18b77
FG
2842 {
2843 uint32_t n = osd_state.size();
11fdf7f2 2844 encode(n, bl);
31f18b77 2845 for (auto s : osd_state) {
11fdf7f2 2846 encode((uint8_t)s, bl);
31f18b77
FG
2847 }
2848 }
11fdf7f2
TL
2849 encode(osd_weight, bl);
2850 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 2851
11fdf7f2 2852 // for encode(pg_temp, bl);
7c673cae 2853 n = pg_temp->size();
11fdf7f2 2854 encode(n, bl);
f67539c2 2855 for (const auto& pg : *pg_temp) {
7c673cae 2856 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
2857 encode(opg, bl);
2858 encode(pg.second, bl);
7c673cae
FG
2859 }
2860
2861 // crush
9f95a23c 2862 ceph::buffer::list cbl;
7c673cae 2863 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2864 encode(cbl, bl);
7c673cae
FG
2865}
2866
9f95a23c 2867void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2868{
11fdf7f2 2869 using ceph::encode;
7c673cae
FG
2870 if ((features & CEPH_FEATURE_PGID64) == 0) {
2871 encode_client_old(bl);
2872 return;
2873 }
2874
2875 __u16 v = 6;
11fdf7f2 2876 encode(v, bl);
7c673cae
FG
2877
2878 // base
11fdf7f2
TL
2879 encode(fsid, bl);
2880 encode(epoch, bl);
2881 encode(created, bl);
2882 encode(modified, bl);
7c673cae 2883
11fdf7f2
TL
2884 encode(pools, bl, features);
2885 encode(pool_name, bl);
2886 encode(pool_max, bl);
7c673cae 2887
11fdf7f2 2888 encode(flags, bl);
7c673cae 2889
11fdf7f2 2890 encode(max_osd, bl);
31f18b77
FG
2891 {
2892 uint32_t n = osd_state.size();
11fdf7f2 2893 encode(n, bl);
31f18b77 2894 for (auto s : osd_state) {
11fdf7f2 2895 encode((uint8_t)s, bl);
31f18b77
FG
2896 }
2897 }
11fdf7f2
TL
2898 encode(osd_weight, bl);
2899 encode(osd_addrs->client_addrs, bl, features);
7c673cae 2900
11fdf7f2 2901 encode(*pg_temp, bl);
7c673cae
FG
2902
2903 // crush
9f95a23c 2904 ceph::buffer::list cbl;
7c673cae 2905 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2906 encode(cbl, bl);
7c673cae
FG
2907
2908 // extended
2909 __u16 ev = 10;
11fdf7f2
TL
2910 encode(ev, bl);
2911 encode(osd_addrs->hb_back_addrs, bl, features);
2912 encode(osd_info, bl);
f67539c2 2913 encode(blocklist, bl, features);
11fdf7f2
TL
2914 encode(osd_addrs->cluster_addrs, bl, features);
2915 encode(cluster_snapshot_epoch, bl);
2916 encode(cluster_snapshot, bl);
2917 encode(*osd_uuid, bl);
9f95a23c 2918 encode(osd_xinfo, bl, features);
11fdf7f2 2919 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
2920}
2921
11fdf7f2
TL
2922/* for a description of osdmap versions, and when they were introduced, please
2923 * refer to
2924 * doc/dev/osd_internals/osdmap_versions.txt
2925 */
9f95a23c 2926void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2927{
11fdf7f2 2928 using ceph::encode;
7c673cae
FG
2929 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2930 encode_classic(bl, features);
2931 return;
2932 }
2933
2934 // only a select set of callers should *ever* be encoding new
2935 // OSDMaps. others should be passing around the canonical encoded
2936 // buffers from on high. select out those callers by passing in an
2937 // "impossible" feature bit.
11fdf7f2 2938 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
2939 features &= ~CEPH_FEATURE_RESERVED;
2940
2941 size_t start_offset = bl.length();
2942 size_t tail_offset;
11fdf7f2 2943 size_t crc_offset;
9f95a23c 2944 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
2945
2946 // meta-encoding: how we include client-used and osd-specific data
2947 ENCODE_START(8, 7, bl);
2948
2949 {
28e407b8
AA
2950 // NOTE: any new encoding dependencies must be reflected by
2951 // SIGNIFICANT_FEATURES
11fdf7f2 2952 uint8_t v = 9;
31f18b77 2953 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 2954 v = 3;
11fdf7f2
TL
2955 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2956 v = 6;
2957 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2958 v = 7;
7c673cae
FG
2959 }
2960 ENCODE_START(v, 1, bl); // client-usable data
2961 // base
11fdf7f2
TL
2962 encode(fsid, bl);
2963 encode(epoch, bl);
2964 encode(created, bl);
2965 encode(modified, bl);
7c673cae 2966
11fdf7f2
TL
2967 encode(pools, bl, features);
2968 encode(pool_name, bl);
2969 encode(pool_max, bl);
7c673cae 2970
31f18b77
FG
2971 if (v < 4) {
2972 decltype(flags) f = flags;
9f95a23c 2973 if (require_osd_release >= ceph_release_t::luminous)
c07f9fc5 2974 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
9f95a23c 2975 else if (require_osd_release == ceph_release_t::kraken)
31f18b77 2976 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
9f95a23c 2977 else if (require_osd_release == ceph_release_t::jewel)
31f18b77 2978 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 2979 encode(f, bl);
31f18b77 2980 } else {
11fdf7f2 2981 encode(flags, bl);
31f18b77 2982 }
7c673cae 2983
11fdf7f2 2984 encode(max_osd, bl);
31f18b77 2985 if (v >= 5) {
11fdf7f2 2986 encode(osd_state, bl);
31f18b77
FG
2987 } else {
2988 uint32_t n = osd_state.size();
11fdf7f2 2989 encode(n, bl);
31f18b77 2990 for (auto s : osd_state) {
11fdf7f2 2991 encode((uint8_t)s, bl);
31f18b77
FG
2992 }
2993 }
11fdf7f2
TL
2994 encode(osd_weight, bl);
2995 if (v >= 8) {
2996 encode(osd_addrs->client_addrs, bl, features);
2997 } else {
2998 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
2999 }
7c673cae 3000
11fdf7f2
TL
3001 encode(*pg_temp, bl);
3002 encode(*primary_temp, bl);
7c673cae 3003 if (osd_primary_affinity) {
11fdf7f2 3004 encode(*osd_primary_affinity, bl);
7c673cae
FG
3005 } else {
3006 vector<__u32> v;
11fdf7f2 3007 encode(v, bl);
7c673cae
FG
3008 }
3009
3010 // crush
9f95a23c 3011 ceph::buffer::list cbl;
7c673cae 3012 crush->encode(cbl, features);
11fdf7f2
TL
3013 encode(cbl, bl);
3014 encode(erasure_code_profiles, bl);
7c673cae
FG
3015
3016 if (v >= 4) {
11fdf7f2
TL
3017 encode(pg_upmap, bl);
3018 encode(pg_upmap_items, bl);
7c673cae 3019 } else {
11fdf7f2
TL
3020 ceph_assert(pg_upmap.empty());
3021 ceph_assert(pg_upmap_items.empty());
7c673cae 3022 }
31f18b77 3023 if (v >= 6) {
11fdf7f2
TL
3024 encode(crush_version, bl);
3025 }
3026 if (v >= 7) {
3027 encode(new_removed_snaps, bl);
3028 encode(new_purged_snaps, bl);
3029 }
3030 if (v >= 9) {
3031 encode(last_up_change, bl);
3032 encode(last_in_change, bl);
31f18b77 3033 }
7c673cae
FG
3034 ENCODE_FINISH(bl); // client-usable data
3035 }
3036
3037 {
28e407b8
AA
3038 // NOTE: any new encoding dependencies must be reflected by
3039 // SIGNIFICANT_FEATURES
f67539c2 3040 uint8_t target_v = 9; // when bumping this, be aware of stretch_mode target_v 10!
7c673cae
FG
3041 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3042 target_v = 1;
11fdf7f2
TL
3043 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3044 target_v = 5;
3045 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3046 target_v = 6;
7c673cae 3047 }
f67539c2
TL
3048 if (stretch_mode_enabled) {
3049 target_v = std::max((uint8_t)10, target_v);
3050 }
7c673cae 3051 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
3052 if (target_v < 7) {
3053 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
3054 } else {
3055 encode(osd_addrs->hb_back_addrs, bl, features);
3056 }
3057 encode(osd_info, bl);
7c673cae
FG
3058 {
3059 // put this in a sorted, ordered map<> so that we encode in a
3060 // deterministic order.
f67539c2
TL
3061 map<entity_addr_t,utime_t> blocklist_map;
3062 for (const auto &addr : blocklist)
3063 blocklist_map.insert(make_pair(addr.first, addr.second));
3064 encode(blocklist_map, bl, features);
11fdf7f2
TL
3065 }
3066 if (target_v < 7) {
3067 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3068 } else {
3069 encode(osd_addrs->cluster_addrs, bl, features);
3070 }
3071 encode(cluster_snapshot_epoch, bl);
3072 encode(cluster_snapshot, bl);
3073 encode(*osd_uuid, bl);
9f95a23c 3074 encode(osd_xinfo, bl, features);
11fdf7f2
TL
3075 if (target_v < 7) {
3076 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3077 } else {
3078 encode(osd_addrs->hb_front_addrs, bl, features);
3079 }
7c673cae 3080 if (target_v >= 2) {
11fdf7f2
TL
3081 encode(nearfull_ratio, bl);
3082 encode(full_ratio, bl);
3083 encode(backfillfull_ratio, bl);
31f18b77
FG
3084 }
3085 // 4 was string-based new_require_min_compat_client
3086 if (target_v >= 5) {
11fdf7f2
TL
3087 encode(require_min_compat_client, bl);
3088 encode(require_osd_release, bl);
3089 }
3090 if (target_v >= 6) {
3091 encode(removed_snaps_queue, bl);
7c673cae 3092 }
81eedcae
TL
3093 if (target_v >= 8) {
3094 encode(crush_node_flags, bl);
3095 }
3096 if (target_v >= 9) {
3097 encode(device_class_flags, bl);
3098 }
f67539c2
TL
3099 if (target_v >= 10) {
3100 encode(stretch_mode_enabled, bl);
3101 encode(stretch_bucket_count, bl);
3102 encode(degraded_stretch_mode, bl);
3103 encode(recovering_stretch_mode, bl);
3104 encode(stretch_mode_bucket, bl);
3105 }
7c673cae
FG
3106 ENCODE_FINISH(bl); // osd-only data
3107 }
3108
11fdf7f2
TL
3109 crc_offset = bl.length();
3110 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
3111 tail_offset = bl.length();
3112
3113 ENCODE_FINISH(bl); // meta-encoding wrapper
3114
3115 // fill in crc
9f95a23c 3116 ceph::buffer::list front;
11fdf7f2 3117 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
3118 crc = front.crc32c(-1);
3119 if (tail_offset < bl.length()) {
9f95a23c 3120 ceph::buffer::list tail;
7c673cae
FG
3121 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3122 crc = tail.crc32c(crc);
3123 }
3124 ceph_le32 crc_le;
3125 crc_le = crc;
11fdf7f2 3126 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
3127 crc_defined = true;
3128}
3129
11fdf7f2
TL
3130/* for a description of osdmap versions, and when they were introduced, please
3131 * refer to
3132 * doc/dev/osd_internals/osdmap_versions.txt
3133 */
9f95a23c 3134void OSDMap::decode(ceph::buffer::list& bl)
7c673cae 3135{
11fdf7f2 3136 auto p = bl.cbegin();
7c673cae
FG
3137 decode(p);
3138}
3139
9f95a23c 3140void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
7c673cae 3141{
11fdf7f2 3142 using ceph::decode;
7c673cae
FG
3143 __u32 n, t;
3144 __u16 v;
11fdf7f2 3145 decode(v, p);
7c673cae
FG
3146
3147 // base
11fdf7f2
TL
3148 decode(fsid, p);
3149 decode(epoch, p);
3150 decode(created, p);
3151 decode(modified, p);
7c673cae
FG
3152
3153 if (v < 6) {
3154 if (v < 4) {
3155 int32_t max_pools = 0;
11fdf7f2 3156 decode(max_pools, p);
7c673cae
FG
3157 pool_max = max_pools;
3158 }
3159 pools.clear();
11fdf7f2 3160 decode(n, p);
7c673cae 3161 while (n--) {
11fdf7f2
TL
3162 decode(t, p);
3163 decode(pools[t], p);
7c673cae
FG
3164 }
3165 if (v == 4) {
11fdf7f2 3166 decode(n, p);
7c673cae
FG
3167 pool_max = n;
3168 } else if (v == 5) {
3169 pool_name.clear();
11fdf7f2 3170 decode(n, p);
7c673cae 3171 while (n--) {
11fdf7f2
TL
3172 decode(t, p);
3173 decode(pool_name[t], p);
7c673cae 3174 }
11fdf7f2 3175 decode(n, p);
7c673cae
FG
3176 pool_max = n;
3177 }
3178 } else {
11fdf7f2
TL
3179 decode(pools, p);
3180 decode(pool_name, p);
3181 decode(pool_max, p);
7c673cae
FG
3182 }
3183 // kludge around some old bug that zeroed out pool_max (#2307)
3184 if (pools.size() && pool_max < pools.rbegin()->first) {
3185 pool_max = pools.rbegin()->first;
3186 }
3187
11fdf7f2 3188 decode(flags, p);
7c673cae 3189
11fdf7f2 3190 decode(max_osd, p);
31f18b77
FG
3191 {
3192 vector<uint8_t> os;
11fdf7f2 3193 decode(os, p);
31f18b77
FG
3194 osd_state.resize(os.size());
3195 for (unsigned i = 0; i < os.size(); ++i) {
3196 osd_state[i] = os[i];
3197 }
3198 }
11fdf7f2
TL
3199 decode(osd_weight, p);
3200 decode(osd_addrs->client_addrs, p);
7c673cae
FG
3201 if (v <= 5) {
3202 pg_temp->clear();
11fdf7f2 3203 decode(n, p);
7c673cae
FG
3204 while (n--) {
3205 old_pg_t opg;
9f95a23c 3206 ceph::decode_raw(opg, p);
31f18b77 3207 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3208 decode(v, p);
31f18b77 3209 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3210 }
3211 } else {
11fdf7f2 3212 decode(*pg_temp, p);
7c673cae
FG
3213 }
3214
3215 // crush
9f95a23c 3216 ceph::buffer::list cbl;
11fdf7f2
TL
3217 decode(cbl, p);
3218 auto cblp = cbl.cbegin();
7c673cae
FG
3219 crush->decode(cblp);
3220
3221 // extended
3222 __u16 ev = 0;
3223 if (v >= 5)
11fdf7f2
TL
3224 decode(ev, p);
3225 decode(osd_addrs->hb_back_addrs, p);
3226 decode(osd_info, p);
7c673cae 3227 if (v < 5)
11fdf7f2 3228 decode(pool_name, p);
7c673cae 3229
f67539c2 3230 decode(blocklist, p);
7c673cae 3231 if (ev >= 6)
11fdf7f2 3232 decode(osd_addrs->cluster_addrs, p);
7c673cae 3233 else
11fdf7f2 3234 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3235
3236 if (ev >= 7) {
11fdf7f2
TL
3237 decode(cluster_snapshot_epoch, p);
3238 decode(cluster_snapshot, p);
7c673cae
FG
3239 }
3240
3241 if (ev >= 8) {
11fdf7f2 3242 decode(*osd_uuid, p);
7c673cae
FG
3243 } else {
3244 osd_uuid->resize(max_osd);
3245 }
3246 if (ev >= 9)
11fdf7f2 3247 decode(osd_xinfo, p);
7c673cae
FG
3248 else
3249 osd_xinfo.resize(max_osd);
3250
3251 if (ev >= 10)
11fdf7f2 3252 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3253 else
11fdf7f2 3254 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3255
3256 osd_primary_affinity.reset();
3257
3258 post_decode();
3259}
3260
9f95a23c 3261void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 3262{
11fdf7f2 3263 using ceph::decode;
7c673cae
FG
3264 /**
3265 * Older encodings of the OSDMap had a single struct_v which
3266 * covered the whole encoding, and was prior to our modern
3267 * stuff which includes a compatv and a size. So if we see
3268 * a struct_v < 7, we must rewind to the beginning and use our
3269 * classic decoder.
3270 */
3271 size_t start_offset = bl.get_off();
3272 size_t tail_offset = 0;
9f95a23c 3273 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
3274
3275 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3276 if (struct_v < 7) {
11fdf7f2 3277 bl.seek(start_offset);
7c673cae
FG
3278 decode_classic(bl);
3279 return;
3280 }
3281 /**
3282 * Since we made it past that hurdle, we can use our normal paths.
3283 */
3284 {
11fdf7f2 3285 DECODE_START(9, bl); // client-usable data
7c673cae 3286 // base
11fdf7f2
TL
3287 decode(fsid, bl);
3288 decode(epoch, bl);
3289 decode(created, bl);
3290 decode(modified, bl);
7c673cae 3291
11fdf7f2
TL
3292 decode(pools, bl);
3293 decode(pool_name, bl);
3294 decode(pool_max, bl);
7c673cae 3295
11fdf7f2 3296 decode(flags, bl);
7c673cae 3297
11fdf7f2 3298 decode(max_osd, bl);
31f18b77 3299 if (struct_v >= 5) {
11fdf7f2 3300 decode(osd_state, bl);
31f18b77
FG
3301 } else {
3302 vector<uint8_t> os;
11fdf7f2 3303 decode(os, bl);
31f18b77
FG
3304 osd_state.resize(os.size());
3305 for (unsigned i = 0; i < os.size(); ++i) {
3306 osd_state[i] = os[i];
3307 }
3308 }
11fdf7f2
TL
3309 decode(osd_weight, bl);
3310 decode(osd_addrs->client_addrs, bl);
7c673cae 3311
11fdf7f2
TL
3312 decode(*pg_temp, bl);
3313 decode(*primary_temp, bl);
3314 // dates back to firefly. version increased from 2 to 3 still in firefly.
3315 // do we really still need to keep this around? even for old clients?
7c673cae
FG
3316 if (struct_v >= 2) {
3317 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
11fdf7f2 3318 decode(*osd_primary_affinity, bl);
7c673cae
FG
3319 if (osd_primary_affinity->empty())
3320 osd_primary_affinity.reset();
3321 } else {
3322 osd_primary_affinity.reset();
3323 }
3324
3325 // crush
9f95a23c 3326 ceph::buffer::list cbl;
11fdf7f2
TL
3327 decode(cbl, bl);
3328 auto cblp = cbl.cbegin();
7c673cae 3329 crush->decode(cblp);
11fdf7f2
TL
3330 // added in firefly; version increased in luminous, so it affects
3331 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3332 // alone until we require clients to be all luminous?
7c673cae 3333 if (struct_v >= 3) {
11fdf7f2 3334 decode(erasure_code_profiles, bl);
7c673cae
FG
3335 } else {
3336 erasure_code_profiles.clear();
3337 }
11fdf7f2
TL
3338 // version increased from 3 to 4 still in luminous, so same as above
3339 // applies.
7c673cae 3340 if (struct_v >= 4) {
11fdf7f2
TL
3341 decode(pg_upmap, bl);
3342 decode(pg_upmap_items, bl);
7c673cae
FG
3343 } else {
3344 pg_upmap.clear();
3345 pg_upmap_items.clear();
3346 }
11fdf7f2
TL
3347 // again, version increased from 5 to 6 still in luminous, so above
3348 // applies.
31f18b77 3349 if (struct_v >= 6) {
11fdf7f2
TL
3350 decode(crush_version, bl);
3351 }
3352 // version increase from 6 to 7 in mimic
3353 if (struct_v >= 7) {
3354 decode(new_removed_snaps, bl);
3355 decode(new_purged_snaps, bl);
3356 }
3357 // version increase from 7 to 8, 8 to 9, in nautilus.
3358 if (struct_v >= 9) {
3359 decode(last_up_change, bl);
3360 decode(last_in_change, bl);
31f18b77 3361 }
7c673cae
FG
3362 DECODE_FINISH(bl); // client-usable data
3363 }
3364
3365 {
f67539c2 3366 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
3367 decode(osd_addrs->hb_back_addrs, bl);
3368 decode(osd_info, bl);
f67539c2 3369 decode(blocklist, bl);
11fdf7f2
TL
3370 decode(osd_addrs->cluster_addrs, bl);
3371 decode(cluster_snapshot_epoch, bl);
3372 decode(cluster_snapshot, bl);
3373 decode(*osd_uuid, bl);
3374 decode(osd_xinfo, bl);
3375 decode(osd_addrs->hb_front_addrs, bl);
3376 //
7c673cae 3377 if (struct_v >= 2) {
11fdf7f2
TL
3378 decode(nearfull_ratio, bl);
3379 decode(full_ratio, bl);
7c673cae
FG
3380 } else {
3381 nearfull_ratio = 0;
3382 full_ratio = 0;
3383 }
3384 if (struct_v >= 3) {
11fdf7f2 3385 decode(backfillfull_ratio, bl);
7c673cae
FG
3386 } else {
3387 backfillfull_ratio = 0;
3388 }
31f18b77
FG
3389 if (struct_v == 4) {
3390 string r;
11fdf7f2 3391 decode(r, bl);
31f18b77
FG
3392 if (r.length())
3393 require_min_compat_client = ceph_release_from_name(r.c_str());
3394 }
3395 if (struct_v >= 5) {
11fdf7f2
TL
3396 decode(require_min_compat_client, bl);
3397 decode(require_osd_release, bl);
9f95a23c 3398 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
3399 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3400 }
9f95a23c 3401 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 3402 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3403 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
3404 }
3405 } else {
3406 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3407 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 3408 require_osd_release = ceph_release_t::luminous;
31f18b77 3409 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3410 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77 3411 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c 3412 require_osd_release = ceph_release_t::kraken;
31f18b77 3413 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c 3414 require_osd_release = ceph_release_t::jewel;
31f18b77 3415 } else {
9f95a23c 3416 require_osd_release = ceph_release_t::unknown;
31f18b77
FG
3417 }
3418 }
11fdf7f2
TL
3419 if (struct_v >= 6) {
3420 decode(removed_snaps_queue, bl);
3421 }
81eedcae
TL
3422 if (struct_v >= 8) {
3423 decode(crush_node_flags, bl);
3424 } else {
3425 crush_node_flags.clear();
3426 }
3427 if (struct_v >= 9) {
3428 decode(device_class_flags, bl);
3429 } else {
3430 device_class_flags.clear();
3431 }
f67539c2
TL
3432 if (struct_v >= 10) {
3433 decode(stretch_mode_enabled, bl);
3434 decode(stretch_bucket_count, bl);
3435 decode(degraded_stretch_mode, bl);
3436 decode(recovering_stretch_mode, bl);
3437 decode(stretch_mode_bucket, bl);
3438 } else {
3439 stretch_mode_enabled = false;
3440 stretch_bucket_count = 0;
3441 degraded_stretch_mode = 0;
3442 recovering_stretch_mode = 0;
3443 stretch_mode_bucket = 0;
3444 }
7c673cae
FG
3445 DECODE_FINISH(bl); // osd-only data
3446 }
3447
3448 if (struct_v >= 8) {
3449 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 3450 decode(crc, bl);
7c673cae
FG
3451 tail_offset = bl.get_off();
3452 crc_defined = true;
3453 } else {
3454 crc_defined = false;
3455 crc = 0;
3456 }
3457
3458 DECODE_FINISH(bl); // wrapper
3459
3460 if (tail_offset) {
3461 // verify crc
3462 uint32_t actual = crc_front.crc32c(-1);
3463 if (tail_offset < bl.get_off()) {
9f95a23c 3464 ceph::buffer::list tail;
7c673cae
FG
3465 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3466 actual = tail.crc32c(actual);
3467 }
3468 if (crc != actual) {
3469 ostringstream ss;
3470 ss << "bad crc, actual " << actual << " != expected " << crc;
3471 string s = ss.str();
9f95a23c 3472 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
3473 }
3474 }
3475
3476 post_decode();
3477}
3478
3479void OSDMap::post_decode()
3480{
3481 // index pool names
3482 name_pool.clear();
3483 for (const auto &pname : pool_name) {
3484 name_pool[pname.second] = pname.first;
3485 }
3486
3487 calc_num_osds();
3488 _calc_up_osd_features();
3489}
3490
3491void OSDMap::dump_erasure_code_profiles(
3492 const mempool::osdmap::map<string,map<string,string>>& profiles,
3493 Formatter *f)
3494{
3495 f->open_object_section("erasure_code_profiles");
3496 for (const auto &profile : profiles) {
3497 f->open_object_section(profile.first.c_str());
3498 for (const auto &profm : profile.second) {
9f95a23c 3499 f->dump_string(profm.first.c_str(), profm.second);
7c673cae
FG
3500 }
3501 f->close_section();
3502 }
3503 f->close_section();
3504}
3505
9f95a23c
TL
3506void OSDMap::dump_osds(Formatter *f) const
3507{
3508 f->open_array_section("osds");
3509 for (int i=0; i<get_max_osd(); i++) {
3510 if (exists(i)) {
3511 dump_osd(i, f);
3512 }
3513 }
3514 f->close_section();
3515}
3516
3517void OSDMap::dump_osd(int id, Formatter *f) const
3518{
3519 ceph_assert(f != nullptr);
3520 if (!exists(id)) {
3521 return;
3522 }
3523
3524 f->open_object_section("osd_info");
3525 f->dump_int("osd", id);
3526 f->dump_stream("uuid") << get_uuid(id);
3527 f->dump_int("up", is_up(id));
3528 f->dump_int("in", is_in(id));
3529 f->dump_float("weight", get_weightf(id));
3530 f->dump_float("primary_affinity", get_primary_affinityf(id));
3531 get_info(id).dump(f);
3532 f->dump_object("public_addrs", get_addrs(id));
3533 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3534 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3535 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3536 // compat
3537 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3538 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3539 f->dump_stream("heartbeat_back_addr")
3540 << get_hb_back_addrs(id).get_legacy_str();
3541 f->dump_stream("heartbeat_front_addr")
3542 << get_hb_front_addrs(id).get_legacy_str();
3543
3544 set<string> st;
3545 get_state(id, st);
3546 f->open_array_section("state");
3547 for (const auto &state : st)
3548 f->dump_string("state", state);
3549 f->close_section();
3550
3551 f->close_section();
3552}
3553
7c673cae
FG
3554void OSDMap::dump(Formatter *f) const
3555{
3556 f->dump_int("epoch", get_epoch());
3557 f->dump_stream("fsid") << get_fsid();
3558 f->dump_stream("created") << get_created();
3559 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3560 f->dump_stream("last_up_change") << last_up_change;
3561 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3562 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3563 f->dump_unsigned("flags_num", flags);
3564 f->open_array_section("flags_set");
3565 set<string> flagset;
3566 get_flag_set(&flagset);
3567 for (auto p : flagset) {
3568 f->dump_string("flag", p);
3569 }
3570 f->close_section();
31f18b77 3571 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3572 f->dump_float("full_ratio", full_ratio);
3573 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3574 f->dump_float("nearfull_ratio", nearfull_ratio);
3575 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3576 f->dump_int("pool_max", get_pool_max());
3577 f->dump_int("max_osd", get_max_osd());
31f18b77 3578 f->dump_string("require_min_compat_client",
f67539c2 3579 to_string(require_min_compat_client));
31f18b77 3580 f->dump_string("min_compat_client",
f67539c2 3581 to_string(get_min_compat_client()));
31f18b77 3582 f->dump_string("require_osd_release",
f67539c2 3583 to_string(require_osd_release));
7c673cae
FG
3584
3585 f->open_array_section("pools");
3586 for (const auto &pool : pools) {
3587 std::string name("<unknown>");
3588 const auto &pni = pool_name.find(pool.first);
3589 if (pni != pool_name.end())
3590 name = pni->second;
3591 f->open_object_section("pool");
3592 f->dump_int("pool", pool.first);
3593 f->dump_string("pool_name", name);
3594 pool.second.dump(f);
3595 f->close_section();
3596 }
3597 f->close_section();
3598
9f95a23c 3599 dump_osds(f);
7c673cae
FG
3600
3601 f->open_array_section("osd_xinfo");
3602 for (int i=0; i<get_max_osd(); i++) {
3603 if (exists(i)) {
3604 f->open_object_section("xinfo");
3605 f->dump_int("osd", i);
3606 osd_xinfo[i].dump(f);
3607 f->close_section();
3608 }
3609 }
3610 f->close_section();
3611
3612 f->open_array_section("pg_upmap");
3613 for (auto& p : pg_upmap) {
3614 f->open_object_section("mapping");
3615 f->dump_stream("pgid") << p.first;
3616 f->open_array_section("osds");
3617 for (auto q : p.second) {
3618 f->dump_int("osd", q);
3619 }
3620 f->close_section();
3621 f->close_section();
3622 }
3623 f->close_section();
3624 f->open_array_section("pg_upmap_items");
3625 for (auto& p : pg_upmap_items) {
3626 f->open_object_section("mapping");
3627 f->dump_stream("pgid") << p.first;
3628 f->open_array_section("mappings");
3629 for (auto& q : p.second) {
3630 f->open_object_section("mapping");
3631 f->dump_int("from", q.first);
3632 f->dump_int("to", q.second);
3633 f->close_section();
3634 }
3635 f->close_section();
3636 f->close_section();
3637 }
3638 f->close_section();
3639 f->open_array_section("pg_temp");
31f18b77 3640 pg_temp->dump(f);
7c673cae
FG
3641 f->close_section();
3642
3643 f->open_array_section("primary_temp");
3644 for (const auto &pg : *primary_temp) {
3645 f->dump_stream("pgid") << pg.first;
3646 f->dump_int("osd", pg.second);
3647 }
3648 f->close_section(); // primary_temp
3649
f67539c2
TL
3650 f->open_object_section("blocklist");
3651 for (const auto &addr : blocklist) {
7c673cae
FG
3652 stringstream ss;
3653 ss << addr.first;
3654 f->dump_stream(ss.str().c_str()) << addr.second;
3655 }
3656 f->close_section();
3657
3658 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3659
3660 f->open_array_section("removed_snaps_queue");
3661 for (auto& p : removed_snaps_queue) {
3662 f->open_object_section("pool");
3663 f->dump_int("pool", p.first);
3664 f->open_array_section("snaps");
3665 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3666 f->open_object_section("interval");
3667 f->dump_unsigned("begin", q.get_start());
3668 f->dump_unsigned("length", q.get_len());
3669 f->close_section();
3670 }
3671 f->close_section();
3672 f->close_section();
3673 }
3674 f->close_section();
3675 f->open_array_section("new_removed_snaps");
3676 for (auto& p : new_removed_snaps) {
3677 f->open_object_section("pool");
3678 f->dump_int("pool", p.first);
3679 f->open_array_section("snaps");
3680 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3681 f->open_object_section("interval");
3682 f->dump_unsigned("begin", q.get_start());
3683 f->dump_unsigned("length", q.get_len());
3684 f->close_section();
3685 }
3686 f->close_section();
3687 f->close_section();
3688 }
3689 f->close_section();
3690 f->open_array_section("new_purged_snaps");
3691 for (auto& p : new_purged_snaps) {
3692 f->open_object_section("pool");
3693 f->dump_int("pool", p.first);
3694 f->open_array_section("snaps");
3695 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3696 f->open_object_section("interval");
3697 f->dump_unsigned("begin", q.get_start());
3698 f->dump_unsigned("length", q.get_len());
3699 f->close_section();
3700 }
3701 f->close_section();
3702 f->close_section();
3703 }
3704 f->close_section();
81eedcae
TL
3705 f->open_object_section("crush_node_flags");
3706 for (auto& i : crush_node_flags) {
3707 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3708 : stringify(i.first);
3709 f->open_array_section(s.c_str());
3710 set<string> st;
3711 calc_state_set(i.second, st);
3712 for (auto& j : st) {
3713 f->dump_string("flag", j);
3714 }
3715 f->close_section();
3716 }
3717 f->close_section();
3718 f->open_object_section("device_class_flags");
3719 for (auto& i : device_class_flags) {
3720 const char* class_name = crush->get_class_name(i.first);
3721 string s = class_name ? class_name : stringify(i.first);
3722 f->open_array_section(s.c_str());
3723 set<string> st;
3724 calc_state_set(i.second, st);
3725 for (auto& j : st) {
3726 f->dump_string("flag", j);
3727 }
3728 f->close_section();
3729 }
3730 f->close_section();
f67539c2
TL
3731 f->open_object_section("stretch_mode");
3732 {
3733 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
3734 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
3735 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
3736 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
3737 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
3738 }
3739 f->close_section();
7c673cae
FG
3740}
3741
3742void OSDMap::generate_test_instances(list<OSDMap*>& o)
3743{
3744 o.push_back(new OSDMap);
3745
3746 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3747 o.push_back(new OSDMap);
3748 uuid_d fsid;
224ce89b 3749 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae 3750 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
f67539c2 3751 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
7c673cae
FG
3752 cct->put();
3753}
3754
3755string OSDMap::get_flag_string(unsigned f)
3756{
3757 string s;
7c673cae
FG
3758 if (f & CEPH_OSDMAP_PAUSERD)
3759 s += ",pauserd";
3760 if (f & CEPH_OSDMAP_PAUSEWR)
3761 s += ",pausewr";
3762 if (f & CEPH_OSDMAP_PAUSEREC)
3763 s += ",pauserec";
3764 if (f & CEPH_OSDMAP_NOUP)
3765 s += ",noup";
3766 if (f & CEPH_OSDMAP_NODOWN)
3767 s += ",nodown";
3768 if (f & CEPH_OSDMAP_NOOUT)
3769 s += ",noout";
3770 if (f & CEPH_OSDMAP_NOIN)
3771 s += ",noin";
3772 if (f & CEPH_OSDMAP_NOBACKFILL)
3773 s += ",nobackfill";
3774 if (f & CEPH_OSDMAP_NOREBALANCE)
3775 s += ",norebalance";
3776 if (f & CEPH_OSDMAP_NORECOVER)
3777 s += ",norecover";
3778 if (f & CEPH_OSDMAP_NOSCRUB)
3779 s += ",noscrub";
3780 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3781 s += ",nodeep-scrub";
3782 if (f & CEPH_OSDMAP_NOTIERAGENT)
3783 s += ",notieragent";
11fdf7f2
TL
3784 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3785 s += ",nosnaptrim";
7c673cae
FG
3786 if (f & CEPH_OSDMAP_SORTBITWISE)
3787 s += ",sortbitwise";
3788 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3789 s += ",require_jewel_osds";
3790 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3791 s += ",require_kraken_osds";
3792 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3793 s += ",require_luminous_osds";
c07f9fc5
FG
3794 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3795 s += ",recovery_deletes";
181888fb
FG
3796 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3797 s += ",purged_snapdirs";
f64942e4
AA
3798 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3799 s += ",pglog_hardlimit";
7c673cae
FG
3800 if (s.length())
3801 s.erase(0, 1);
3802 return s;
3803}
3804
3805string OSDMap::get_flag_string() const
3806{
3807 return get_flag_string(flags);
3808}
3809
7c673cae
FG
3810void OSDMap::print_pools(ostream& out) const
3811{
3812 for (const auto &pool : pools) {
3813 std::string name("<unknown>");
3814 const auto &pni = pool_name.find(pool.first);
3815 if (pni != pool_name.end())
3816 name = pni->second;
3817 out << "pool " << pool.first
3818 << " '" << name
3819 << "' " << pool.second << "\n";
3820
3821 for (const auto &snap : pool.second.snaps)
3822 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3823
3824 if (!pool.second.removed_snaps.empty())
3825 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
11fdf7f2
TL
3826 auto p = removed_snaps_queue.find(pool.first);
3827 if (p != removed_snaps_queue.end()) {
3828 out << "\tremoved_snaps_queue " << p->second << "\n";
3829 }
7c673cae
FG
3830 }
3831 out << std::endl;
3832}
3833
9f95a23c
TL
3834void OSDMap::print_osds(ostream& out) const
3835{
3836 for (int i=0; i<get_max_osd(); i++) {
3837 if (exists(i)) {
3838 print_osd(i, out);
3839 }
3840 }
3841}
3842void OSDMap::print_osd(int id, ostream& out) const
3843{
3844 if (!exists(id)) {
3845 return;
3846 }
3847
3848 out << "osd." << id;
3849 out << (is_up(id) ? " up ":" down");
3850 out << (is_in(id) ? " in ":" out");
3851 out << " weight " << get_weightf(id);
3852 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
3853 out << " primary_affinity " << get_primary_affinityf(id);
3854 }
3855 const osd_info_t& info(get_info(id));
3856 out << " " << info;
3857 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
3858 set<string> st;
3859 get_state(id, st);
3860 out << " " << st;
3861 if (!get_uuid(id).is_zero()) {
3862 out << " " << get_uuid(id);
3863 }
3864 out << "\n";
3865}
3866
7c673cae
FG
3867void OSDMap::print(ostream& out) const
3868{
3869 out << "epoch " << get_epoch() << "\n"
3870 << "fsid " << get_fsid() << "\n"
3871 << "created " << get_created() << "\n"
3872 << "modified " << get_modified() << "\n";
3873
3874 out << "flags " << get_flag_string() << "\n";
31f18b77 3875 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
3876 out << "full_ratio " << full_ratio << "\n";
3877 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3878 out << "nearfull_ratio " << nearfull_ratio << "\n";
9f95a23c 3879 if (require_min_compat_client != ceph_release_t::unknown) {
31f18b77 3880 out << "require_min_compat_client "
9f95a23c 3881 << require_min_compat_client << "\n";
7c673cae 3882 }
9f95a23c 3883 out << "min_compat_client " << get_min_compat_client()
31f18b77 3884 << "\n";
9f95a23c
TL
3885 if (require_osd_release > ceph_release_t::unknown) {
3886 out << "require_osd_release " << require_osd_release
224ce89b
WB
3887 << "\n";
3888 }
f67539c2
TL
3889 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
3890 if (stretch_mode_enabled) {
3891 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
3892 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
3893 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
3894 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
3895 }
7c673cae
FG
3896 if (get_cluster_snapshot().length())
3897 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3898 out << "\n";
3899
3900 print_pools(out);
3901
3902 out << "max_osd " << get_max_osd() << "\n";
9f95a23c 3903 print_osds(out);
7c673cae
FG
3904 out << std::endl;
3905
3906 for (auto& p : pg_upmap) {
3907 out << "pg_upmap " << p.first << " " << p.second << "\n";
3908 }
3909 for (auto& p : pg_upmap_items) {
3910 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3911 }
3912
f67539c2 3913 for (const auto& pg : *pg_temp)
7c673cae
FG
3914 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3915
f67539c2 3916 for (const auto& pg : *primary_temp)
7c673cae
FG
3917 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3918
f67539c2
TL
3919 for (const auto &addr : blocklist)
3920 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
3921}
3922
3923class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3924public:
3925 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
3926
3927 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3928 unsigned f)
c07f9fc5 3929 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3930
3931 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3932 if (!filter) {
3933 return true; // normal case
3934 }
3935 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3936 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3937 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3938 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3939 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3940 return true;
31f18b77 3941 }
c07f9fc5 3942 return false;
31f18b77
FG
3943 }
3944
3945 bool should_dump_empty_bucket() const override {
3946 return !filter;
3947 }
7c673cae 3948
11fdf7f2 3949 void init_table(TextTable *tbl) {
7c673cae 3950 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3951 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3952 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3953 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 3954 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 3955 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3956 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
3957 }
3958 void dump(TextTable *tbl, string& bucket) {
3959 init_table(tbl);
7c673cae 3960
11fdf7f2
TL
3961 if (!bucket.empty()) {
3962 set_root(bucket);
3963 Parent::dump(tbl);
3964 } else {
3965 Parent::dump(tbl);
3966 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3967 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3968 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3969 }
31f18b77 3970 }
7c673cae
FG
3971 }
3972 }
3973
3974protected:
3975 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
3976 const char *c = crush->get_item_class(qi.id);
3977 if (!c)
3978 c = "";
7c673cae 3979 *tbl << qi.id
224ce89b 3980 << c
7c673cae
FG
3981 << weightf_t(qi.weight);
3982
3983 ostringstream name;
3984 for (int k = 0; k < qi.depth; k++)
3985 name << " ";
3986 if (qi.is_bucket()) {
3987 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3988 << crush->get_item_name(qi.id);
3989 } else {
3990 name << "osd." << qi.id;
3991 }
3992 *tbl << name.str();
3993
3994 if (!qi.is_bucket()) {
3995 if (!osdmap->exists(qi.id)) {
3996 *tbl << "DNE"
3997 << 0;
3998 } else {
c07f9fc5
FG
3999 string s;
4000 if (osdmap->is_up(qi.id)) {
4001 s = "up";
4002 } else if (osdmap->is_destroyed(qi.id)) {
4003 s = "destroyed";
4004 } else {
4005 s = "down";
4006 }
4007 *tbl << s
7c673cae
FG
4008 << weightf_t(osdmap->get_weightf(qi.id))
4009 << weightf_t(osdmap->get_primary_affinityf(qi.id));
4010 }
4011 }
4012 *tbl << TextTable::endrow;
4013 }
4014
4015private:
4016 const OSDMap *osdmap;
31f18b77 4017 const unsigned filter;
7c673cae
FG
4018};
4019
4020class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4021public:
4022 typedef CrushTreeDumper::FormattingDumper Parent;
4023
31f18b77
FG
4024 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4025 unsigned f)
c07f9fc5 4026 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
4027
4028 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
4029 if (!filter) {
4030 return true; // normal case
4031 }
4032 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4033 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4034 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4035 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4036 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4037 return true;
31f18b77 4038 }
c07f9fc5 4039 return false;
31f18b77
FG
4040 }
4041
4042 bool should_dump_empty_bucket() const override {
4043 return !filter;
4044 }
7c673cae 4045
11fdf7f2
TL
4046 void dump(Formatter *f, string& bucket) {
4047 if (!bucket.empty()) {
4048 set_root(bucket);
4049 f->open_array_section("nodes");
4050 Parent::dump(f);
4051 f->close_section();
4052 } else {
4053 f->open_array_section("nodes");
4054 Parent::dump(f);
4055 f->close_section();
4056 f->open_array_section("stray");
4057 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4058 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4059 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4060 }
4061 f->close_section();
7c673cae 4062 }
7c673cae
FG
4063 }
4064
4065protected:
4066 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4067 Parent::dump_item_fields(qi, f);
4068 if (!qi.is_bucket())
4069 {
c07f9fc5
FG
4070 string s;
4071 if (osdmap->is_up(qi.id)) {
4072 s = "up";
4073 } else if (osdmap->is_destroyed(qi.id)) {
4074 s = "destroyed";
4075 } else {
4076 s = "down";
4077 }
7c673cae 4078 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 4079 f->dump_string("status", s);
7c673cae
FG
4080 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4081 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4082 }
4083 }
4084
4085private:
4086 const OSDMap *osdmap;
31f18b77 4087 const unsigned filter;
7c673cae
FG
4088};
4089
11fdf7f2 4090void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 4091{
31f18b77 4092 if (f) {
11fdf7f2 4093 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 4094 } else {
11fdf7f2 4095 ceph_assert(out);
7c673cae 4096 TextTable tbl;
11fdf7f2 4097 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
4098 *out << tbl;
4099 }
4100}
4101
224ce89b 4102void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 4103 const string& prefix, bool extra) const
7c673cae
FG
4104{
4105 if (f) {
7c673cae
FG
4106 f->dump_int("epoch", get_epoch());
4107 f->dump_int("num_osds", get_num_osds());
4108 f->dump_int("num_up_osds", get_num_up_osds());
9f95a23c 4109 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
7c673cae 4110 f->dump_int("num_in_osds", get_num_in_osds());
9f95a23c 4111 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
7c673cae 4112 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
7c673cae 4113 } else {
11fdf7f2 4114 utime_t now = ceph_clock_now();
31f18b77 4115 out << get_num_osds() << " osds: "
11fdf7f2
TL
4116 << get_num_up_osds() << " up";
4117 if (last_up_change != utime_t()) {
4118 out << " (since " << utimespan_str(now - last_up_change) << ")";
4119 }
4120 out << ", " << get_num_in_osds() << " in";
4121 if (last_in_change != utime_t()) {
4122 out << " (since " << utimespan_str(now - last_in_change) << ")";
4123 }
4124 if (extra)
4125 out << "; epoch: e" << get_epoch();
7c673cae
FG
4126 if (get_num_pg_temp())
4127 out << "; " << get_num_pg_temp() << " remapped pgs";
4128 out << "\n";
4129 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4130 if (important_flags)
224ce89b 4131 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
4132 }
4133}
4134
4135void OSDMap::print_oneline_summary(ostream& out) const
4136{
4137 out << "e" << get_epoch() << ": "
31f18b77 4138 << get_num_osds() << " total, "
7c673cae
FG
4139 << get_num_up_osds() << " up, "
4140 << get_num_in_osds() << " in";
7c673cae
FG
4141}
4142
3efd9988 4143bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
4144{
4145 for (const auto &pool : pools) {
3efd9988 4146 if (pool.second.crush_rule == rule_id)
7c673cae
FG
4147 return true;
4148 }
4149 return false;
4150}
4151
3efd9988
FG
4152int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4153 ostream *ss) const
4154{
4155 for (auto& i : pools) {
4156 auto& pool = i.second;
4157 int ruleno = pool.get_crush_rule();
4158 if (!newcrush->rule_exists(ruleno)) {
4159 *ss << "pool " << i.first << " references crush_rule " << ruleno
4160 << " but it is not present";
4161 return -EINVAL;
4162 }
20effc67 4163 if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) {
3efd9988
FG
4164 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4165 return -EINVAL;
4166 }
3efd9988
FG
4167 }
4168 return 0;
4169}
4170
224ce89b
WB
4171int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4172 int nosd, int pg_bits, int pgp_bits,
4173 bool default_pool)
7c673cae 4174{
224ce89b
WB
4175 ldout(cct, 10) << "build_simple on " << nosd
4176 << " osds" << dendl;
7c673cae
FG
4177 epoch = e;
4178 set_fsid(fsid);
4179 created = modified = ceph_clock_now();
4180
4181 if (nosd >= 0) {
4182 set_max_osd(nosd);
4183 } else {
4184 // count osds
4185 int maxosd = 0;
11fdf7f2 4186 const auto& conf = cct->_conf;
7c673cae 4187 vector<string> sections;
11fdf7f2 4188 conf.get_all_sections(sections);
7c673cae
FG
4189
4190 for (auto &section : sections) {
4191 if (section.find("osd.") != 0)
4192 continue;
4193
4194 const char *begin = section.c_str() + 4;
4195 char *end = (char*)begin;
4196 int o = strtol(begin, &end, 10);
4197 if (*end != '\0')
4198 continue;
4199
4200 if (o > cct->_conf->mon_max_osd) {
4201 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4202 return -ERANGE;
4203 }
4204
4205 if (o > maxosd)
4206 maxosd = o;
4207 }
4208
4209 set_max_osd(maxosd + 1);
4210 }
4211
7c673cae
FG
4212
4213 stringstream ss;
4214 int r;
4215 if (nosd >= 0)
4216 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4217 else
4218 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4219 ceph_assert(r == 0);
7c673cae
FG
4220
4221 int poolbase = get_max_osd() ? get_max_osd() : 1;
4222
20effc67 4223 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_rule(cct);
11fdf7f2 4224 ceph_assert(default_replicated_rule >= 0);
7c673cae 4225
224ce89b
WB
4226 if (default_pool) {
4227 // pgp_num <= pg_num
4228 if (pgp_bits > pg_bits)
4229 pgp_bits = pg_bits;
4230
4231 vector<string> pool_names;
4232 pool_names.push_back("rbd");
4233 for (auto &plname : pool_names) {
4234 int64_t pool = ++pool_max;
4235 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4236 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4237 if (cct->_conf->osd_pool_default_flag_hashpspool)
4238 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4239 if (cct->_conf->osd_pool_default_flag_nodelete)
4240 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4241 if (cct->_conf->osd_pool_default_flag_nopgchange)
4242 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4243 if (cct->_conf->osd_pool_default_flag_nosizechange)
4244 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
20effc67
TL
4245 if (cct->_conf->osd_pool_default_flag_bulk)
4246 pools[pool].set_flag(pg_pool_t::FLAG_BULK);
11fdf7f2
TL
4247 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4248 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4249 pools[pool].size);
224ce89b
WB
4250 pools[pool].crush_rule = default_replicated_rule;
4251 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4252 pools[pool].set_pg_num(poolbase << pg_bits);
4253 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4254 pools[pool].set_pg_num_target(poolbase << pg_bits);
4255 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4256 pools[pool].last_change = epoch;
c07f9fc5
FG
4257 pools[pool].application_metadata.insert(
4258 {pg_pool_t::APPLICATION_NAME_RBD, {}});
9f95a23c
TL
4259 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4260 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4261 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4262 pools[pool].pg_autoscale_mode = m;
4263 } else {
4264 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4265 }
224ce89b
WB
4266 pool_name[pool] = plname;
4267 name_pool[plname] = pool;
4268 }
7c673cae
FG
4269 }
4270
7c673cae
FG
4271 map<string,string> profile_map;
4272 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4273 if (r < 0) {
4274 lderr(cct) << ss.str() << dendl;
4275 return r;
4276 }
4277 set_erasure_code_profile("default", profile_map);
4278 return 0;
4279}
4280
4281int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4282 map<string,string> &profile_map,
4283 ostream *ss)
4284{
11fdf7f2 4285 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4286 *ss,
4287 &profile_map);
4288 return r;
4289}
4290
4291int OSDMap::_build_crush_types(CrushWrapper& crush)
4292{
4293 crush.set_type_name(0, "osd");
4294 crush.set_type_name(1, "host");
4295 crush.set_type_name(2, "chassis");
4296 crush.set_type_name(3, "rack");
4297 crush.set_type_name(4, "row");
4298 crush.set_type_name(5, "pdu");
4299 crush.set_type_name(6, "pod");
4300 crush.set_type_name(7, "room");
4301 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4302 crush.set_type_name(9, "zone");
4303 crush.set_type_name(10, "region");
4304 crush.set_type_name(11, "root");
4305 return 11;
7c673cae
FG
4306}
4307
4308int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4309 int nosd, ostream *ss)
4310{
4311 crush.create();
4312
4313 // root
4314 int root_type = _build_crush_types(crush);
4315 int rootid;
4316 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4317 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4318 ceph_assert(r == 0);
7c673cae
FG
4319 crush.set_item_name(rootid, "default");
4320
f67539c2
TL
4321 map<string,string> loc{
4322 {"host", "localhost"},
4323 {"rack", "localrack"},
4324 {"root", "default"}
4325 };
7c673cae 4326 for (int o=0; o<nosd; o++) {
7c673cae
FG
4327 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4328 char name[32];
4329 snprintf(name, sizeof(name), "osd.%d", o);
4330 crush.insert_item(cct, o, 1.0, name, loc);
4331 }
4332
31f18b77 4333 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4334
4335 crush.finalize();
4336
4337 return 0;
4338}
4339
4340int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4341 CrushWrapper& crush,
4342 ostream *ss)
4343{
11fdf7f2 4344 const auto& conf = cct->_conf;
7c673cae
FG
4345
4346 crush.create();
4347
4348 // root
4349 int root_type = _build_crush_types(crush);
4350 int rootid;
4351 int r = crush.add_bucket(0, 0,
4352 CRUSH_HASH_DEFAULT,
4353 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4354 ceph_assert(r == 0);
7c673cae
FG
4355 crush.set_item_name(rootid, "default");
4356
4357 // add osds
4358 vector<string> sections;
11fdf7f2 4359 conf.get_all_sections(sections);
7c673cae
FG
4360
4361 for (auto &section : sections) {
4362 if (section.find("osd.") != 0)
4363 continue;
4364
4365 const char *begin = section.c_str() + 4;
4366 char *end = (char*)begin;
4367 int o = strtol(begin, &end, 10);
4368 if (*end != '\0')
4369 continue;
4370
4371 string host, rack, row, room, dc, pool;
4372 vector<string> sectiontmp;
4373 sectiontmp.push_back("osd");
4374 sectiontmp.push_back(section);
11fdf7f2
TL
4375 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4376 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4377 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4378 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4379 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4380 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4381
4382 if (host.length() == 0)
4383 host = "unknownhost";
4384 if (rack.length() == 0)
4385 rack = "unknownrack";
4386
4387 map<string,string> loc;
4388 loc["host"] = host;
4389 loc["rack"] = rack;
4390 if (row.size())
4391 loc["row"] = row;
4392 if (room.size())
4393 loc["room"] = room;
4394 if (dc.size())
4395 loc["datacenter"] = dc;
4396 loc["root"] = "default";
4397
4398 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4399 crush.insert_item(cct, o, 1.0, section, loc);
4400 }
4401
31f18b77 4402 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4403
4404 crush.finalize();
4405
4406 return 0;
4407}
4408
4409
31f18b77
FG
4410int OSDMap::build_simple_crush_rules(
4411 CephContext *cct,
4412 CrushWrapper& crush,
4413 const string& root,
4414 ostream *ss)
7c673cae 4415{
20effc67 4416 int crush_rule = crush.get_osd_pool_default_crush_replicated_rule(cct);
7c673cae
FG
4417 string failure_domain =
4418 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4419
7c673cae 4420 int r;
31f18b77 4421 r = crush.add_simple_rule_at(
224ce89b 4422 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4423 "firstn", pg_pool_t::TYPE_REPLICATED,
4424 crush_rule, ss);
7c673cae
FG
4425 if (r < 0)
4426 return r;
4427 // do not add an erasure rule by default or else we will implicitly
4428 // require the crush_v2 feature of clients
4429 return 0;
4430}
4431
4432int OSDMap::summarize_mapping_stats(
4433 OSDMap *newmap,
4434 const set<int64_t> *pools,
4435 std::string *out,
4436 Formatter *f) const
4437{
4438 set<int64_t> ls;
4439 if (pools) {
4440 ls = *pools;
4441 } else {
4442 for (auto &p : get_pools())
4443 ls.insert(p.first);
4444 }
4445
4446 unsigned total_pg = 0;
4447 unsigned moved_pg = 0;
4448 vector<unsigned> base_by_osd(get_max_osd(), 0);
4449 vector<unsigned> new_by_osd(get_max_osd(), 0);
4450 for (int64_t pool_id : ls) {
4451 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4452 vector<int> up, up2;
4453 int up_primary;
7c673cae 4454 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4455 pg_t pgid(ps, pool_id);
7c673cae 4456 total_pg += pi->get_size();
31f18b77 4457 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4458 for (int osd : up) {
4459 if (osd >= 0 && osd < get_max_osd())
4460 ++base_by_osd[osd];
4461 }
4462 if (newmap) {
31f18b77 4463 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4464 for (int osd : up2) {
4465 if (osd >= 0 && osd < get_max_osd())
4466 ++new_by_osd[osd];
4467 }
4468 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4469 for (unsigned i=0; i<up.size(); ++i) {
4470 if (up[i] != up2[i]) {
4471 ++moved_pg;
4472 }
4473 }
4474 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4475 for (int osd : up) {
4476 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4477 ++moved_pg;
4478 }
4479 }
4480 } else {
11fdf7f2 4481 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4482 }
4483 }
4484 }
4485 }
4486
4487 unsigned num_up_in = 0;
4488 for (int osd = 0; osd < get_max_osd(); ++osd) {
4489 if (is_up(osd) && is_in(osd))
4490 ++num_up_in;
4491 }
4492 if (!num_up_in) {
4493 return -EINVAL;
4494 }
4495
4496 float avg_pg = (float)total_pg / (float)num_up_in;
4497 float base_stddev = 0, new_stddev = 0;
4498 int min = -1, max = -1;
4499 unsigned min_base_pg = 0, max_base_pg = 0;
4500 unsigned min_new_pg = 0, max_new_pg = 0;
4501 for (int osd = 0; osd < get_max_osd(); ++osd) {
4502 if (is_up(osd) && is_in(osd)) {
4503 float base_diff = (float)base_by_osd[osd] - avg_pg;
4504 base_stddev += base_diff * base_diff;
4505 float new_diff = (float)new_by_osd[osd] - avg_pg;
4506 new_stddev += new_diff * new_diff;
4507 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4508 min = osd;
4509 min_base_pg = base_by_osd[osd];
4510 min_new_pg = new_by_osd[osd];
4511 }
4512 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4513 max = osd;
4514 max_base_pg = base_by_osd[osd];
4515 max_new_pg = new_by_osd[osd];
4516 }
4517 }
4518 }
4519 base_stddev = sqrt(base_stddev / num_up_in);
4520 new_stddev = sqrt(new_stddev / num_up_in);
4521
4522 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4523
4524 ostringstream ss;
4525 if (f)
4526 f->open_object_section("utilization");
4527 if (newmap) {
4528 if (f) {
4529 f->dump_unsigned("moved_pgs", moved_pg);
4530 f->dump_unsigned("total_pgs", total_pg);
4531 } else {
4532 float percent = 0;
4533 if (total_pg)
4534 percent = (float)moved_pg * 100.0 / (float)total_pg;
4535 ss << "moved " << moved_pg << " / " << total_pg
4536 << " (" << percent << "%)\n";
4537 }
4538 }
4539 if (f) {
4540 f->dump_float("avg_pgs", avg_pg);
4541 f->dump_float("std_dev", base_stddev);
4542 f->dump_float("expected_baseline_std_dev", edev);
4543 if (newmap)
4544 f->dump_float("new_std_dev", new_stddev);
4545 } else {
4546 ss << "avg " << avg_pg << "\n";
4547 ss << "stddev " << base_stddev;
4548 if (newmap)
4549 ss << " -> " << new_stddev;
4550 ss << " (expected baseline " << edev << ")\n";
4551 }
4552 if (min >= 0) {
4553 if (f) {
4554 f->dump_unsigned("min_osd", min);
4555 f->dump_unsigned("min_osd_pgs", min_base_pg);
4556 if (newmap)
4557 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4558 } else {
4559 ss << "min osd." << min << " with " << min_base_pg;
4560 if (newmap)
4561 ss << " -> " << min_new_pg;
4562 ss << " pgs (" << (float)min_base_pg / avg_pg;
4563 if (newmap)
4564 ss << " -> " << (float)min_new_pg / avg_pg;
4565 ss << " * mean)\n";
4566 }
4567 }
4568 if (max >= 0) {
4569 if (f) {
4570 f->dump_unsigned("max_osd", max);
4571 f->dump_unsigned("max_osd_pgs", max_base_pg);
4572 if (newmap)
4573 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4574 } else {
4575 ss << "max osd." << max << " with " << max_base_pg;
4576 if (newmap)
4577 ss << " -> " << max_new_pg;
4578 ss << " pgs (" << (float)max_base_pg / avg_pg;
4579 if (newmap)
4580 ss << " -> " << (float)max_new_pg / avg_pg;
4581 ss << " * mean)\n";
4582 }
4583 }
4584 if (f)
4585 f->close_section();
4586 if (out)
4587 *out = ss.str();
4588 return 0;
4589}
4590
7c673cae
FG
4591bool OSDMap::try_pg_upmap(
4592 CephContext *cct,
4593 pg_t pg, ///< pg to potentially remap
4594 const set<int>& overfull, ///< osds we'd want to evacuate
4595 const vector<int>& underfull, ///< osds to move to, in order of preference
92f5a8d4 4596 const vector<int>& more_underfull, ///< more osds only slightly underfull
7c673cae
FG
4597 vector<int> *orig,
4598 vector<int> *out) ///< resulting alternative mapping
4599{
4600 const pg_pool_t *pool = get_pg_pool(pg.pool());
4601 if (!pool)
4602 return false;
20effc67 4603 int rule = pool->get_crush_rule();
7c673cae
FG
4604 if (rule < 0)
4605 return false;
4606
7c673cae
FG
4607 // make sure there is something there to remap
4608 bool any = false;
4609 for (auto osd : *orig) {
4610 if (overfull.count(osd)) {
4611 any = true;
4612 break;
4613 }
4614 }
4615 if (!any) {
4616 return false;
4617 }
4618
4619 int r = crush->try_remap_rule(
4620 cct,
4621 rule,
4622 pool->get_size(),
4623 overfull, underfull,
92f5a8d4 4624 more_underfull,
7c673cae
FG
4625 *orig,
4626 out);
4627 if (r < 0)
4628 return false;
4629 if (*out == *orig)
4630 return false;
4631 return true;
4632}
4633
4634int OSDMap::calc_pg_upmaps(
4635 CephContext *cct,
92f5a8d4 4636 uint32_t max_deviation,
7c673cae 4637 int max,
a8e16298 4638 const set<int64_t>& only_pools,
20effc67
TL
4639 OSDMap::Incremental *pending_inc,
4640 std::random_device::result_type *p_seed)
7c673cae 4641{
a8e16298 4642 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
20effc67 4643 OSDMap tmp_osd_map;
92f5a8d4
TL
4644 // Can't be less than 1 pg
4645 if (max_deviation < 1)
4646 max_deviation = 1;
20effc67 4647 tmp_osd_map.deepish_copy_from(*this);
7c673cae 4648 int num_changed = 0;
a8e16298
TL
4649 map<int,set<pg_t>> pgs_by_osd;
4650 int total_pgs = 0;
4651 float osd_weight_total = 0;
4652 map<int,float> osd_weight;
a8e16298 4653
20effc67
TL
4654 if (max <= 0) {
4655 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4656 return 0;
a8e16298 4657 }
20effc67
TL
4658
4659 osd_weight_total = build_pool_pgs_info(cct, only_pools, tmp_osd_map,
4660 total_pgs, pgs_by_osd, osd_weight);
a8e16298
TL
4661 if (osd_weight_total == 0) {
4662 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4663 return 0;
4664 }
20effc67 4665
a8e16298
TL
4666 float pgs_per_weight = total_pgs / osd_weight_total;
4667 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4668 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4669
a8e16298
TL
4670 float stddev = 0;
4671 map<int,float> osd_deviation; // osd, deviation(pgs)
4672 multimap<float,int> deviation_osd; // deviation(pgs), osd
20effc67
TL
4673 float cur_max_deviation = calc_deviations(cct, pgs_by_osd, osd_weight, pgs_per_weight,
4674 osd_deviation, deviation_osd, stddev);
4675
92f5a8d4
TL
4676 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4677 if (cur_max_deviation <= max_deviation) {
a8e16298
TL
4678 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4679 << dendl;
4680 return 0;
4681 }
20effc67 4682
a8e16298
TL
4683 bool skip_overfull = false;
4684 auto aggressive =
11fdf7f2 4685 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
a8e16298 4686 auto local_fallback_retries =
11fdf7f2 4687 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
20effc67 4688
a8e16298 4689 while (max--) {
92f5a8d4 4690 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
a8e16298
TL
4691 // build overfull and underfull
4692 set<int> overfull;
92f5a8d4
TL
4693 set<int> more_overfull;
4694 bool using_more_overfull = false;
a8e16298 4695 vector<int> underfull;
92f5a8d4 4696 vector<int> more_underfull;
20effc67
TL
4697 fill_overfull_underfull(cct, deviation_osd, max_deviation,
4698 overfull, more_overfull,
4699 underfull, more_underfull);
7c673cae 4700
92f5a8d4
TL
4701 if (underfull.empty() && overfull.empty()) {
4702 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
7c673cae 4703 break;
a8e16298 4704 }
92f5a8d4
TL
4705 if (overfull.empty() && !underfull.empty()) {
4706 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
4707 overfull = more_overfull;
4708 using_more_overfull = true;
4709 }
7c673cae 4710
a8e16298
TL
4711 ldout(cct, 10) << " overfull " << overfull
4712 << " underfull " << underfull
4713 << dendl;
4714 set<pg_t> to_skip;
4715 uint64_t local_fallback_retried = 0;
4716
4717 retry:
4718
4719 set<pg_t> to_unmap;
4720 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4721 auto temp_pgs_by_osd = pgs_by_osd;
4722 // always start with fullest, break if we find any changes to make
7c673cae 4723 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
92f5a8d4 4724 if (skip_overfull && !underfull.empty()) {
a8e16298
TL
4725 ldout(cct, 10) << " skipping overfull " << dendl;
4726 break; // fall through to check underfull
4727 }
7c673cae 4728 int osd = p->second;
31f18b77 4729 float deviation = p->first;
9f95a23c
TL
4730 if (deviation < 0) {
4731 ldout(cct, 10) << " hitting underfull osds now"
4732 << " when trying to remap overfull osds"
4733 << dendl;
4734 break;
4735 }
7c673cae 4736 float target = osd_weight[osd] * pgs_per_weight;
92f5a8d4
TL
4737 ldout(cct, 10) << " Overfull search osd." << osd
4738 << " target " << target
4739 << " deviation " << deviation
4740 << dendl;
a8e16298 4741 ceph_assert(target > 0);
92f5a8d4 4742 if (!using_more_overfull && deviation <= max_deviation) {
7c673cae 4743 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4744 << " target " << target
4745 << " deviation " << deviation
92f5a8d4 4746 << " < max deviation " << max_deviation
a8e16298 4747 << dendl;
7c673cae
FG
4748 break;
4749 }
7c673cae 4750
a8e16298
TL
4751 vector<pg_t> pgs;
4752 pgs.reserve(pgs_by_osd[osd].size());
4753 for (auto& pg : pgs_by_osd[osd]) {
4754 if (to_skip.count(pg))
4755 continue;
4756 pgs.push_back(pg);
4757 }
4758 if (aggressive) {
4759 // shuffle PG list so they all get equal (in)attention
20effc67 4760 std::shuffle(pgs.begin(), pgs.end(), get_random_engine(cct, p_seed));
a8e16298 4761 }
7c673cae 4762 // look for remaps we can un-remap
20effc67
TL
4763 if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd,
4764 temp_pgs_by_osd, to_unmap, to_upmap))
4765 goto test_change;
7c673cae 4766
a8e16298 4767 // try upmap
7c673cae 4768 for (auto pg : pgs) {
20effc67
TL
4769 auto temp_it = tmp_osd_map.pg_upmap.find(pg);
4770 if (temp_it != tmp_osd_map.pg_upmap.end()) {
a8e16298
TL
4771 // leave pg_upmap alone
4772 // it must be specified by admin since balancer does not
4773 // support pg_upmap yet
4774 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4775 << temp_it->second << ", skipping"
4776 << dendl;
7c673cae
FG
4777 continue;
4778 }
20effc67 4779 auto pg_pool_size = tmp_osd_map.get_pg_pool_size(pg);
a8e16298
TL
4780 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4781 set<int> existing;
20effc67
TL
4782 auto it = tmp_osd_map.pg_upmap_items.find(pg);
4783 if (it != tmp_osd_map.pg_upmap_items.end()) {
4784 auto& um_items = it->second;
4785 if (um_items.size() >= (size_t)pg_pool_size) {
4786 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4787 << um_items << ", skipping"
4788 << dendl;
4789 continue;
4790 } else {
4791 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4792 << um_items
4793 << dendl;
4794 new_upmap_items = um_items;
4795 // build existing too (for dedup)
4796 for (auto [um_from, um_to] : um_items) {
4797 existing.insert(um_from);
4798 existing.insert(um_to);
4799 }
4800 }
a8e16298
TL
4801 // fall through
4802 // to see if we can append more remapping pairs
20effc67 4803 }
a8e16298 4804 ldout(cct, 10) << " trying " << pg << dendl;
494da23a 4805 vector<int> raw, orig, out;
20effc67 4806 tmp_osd_map.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
92f5a8d4 4807 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
7c673cae
FG
4808 continue;
4809 }
a8e16298 4810 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4811 if (orig.size() != out.size()) {
4812 continue;
4813 }
a8e16298 4814 ceph_assert(orig != out);
20effc67 4815 int pos = find_best_remap(cct, orig, out, existing, osd_deviation);
92f5a8d4 4816 if (pos != -1) {
a8e16298
TL
4817 // append new remapping pairs slowly
4818 // This way we can make sure that each tiny change will
4819 // definitely make distribution of PGs converging to
4820 // the perfect status.
20effc67
TL
4821 add_remap_pair(cct, orig[pos], out[pos], pg, (size_t)pg_pool_size,
4822 osd, existing, temp_pgs_by_osd,
4823 new_upmap_items, to_upmap);
a8e16298 4824 goto test_change;
7c673cae 4825 }
a8e16298
TL
4826 }
4827 }
7c673cae 4828
a8e16298
TL
4829 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4830 ldout(cct, 10) << " failed to find any changes for overfull osds"
4831 << dendl;
20effc67
TL
4832 for (auto& [deviation, osd] : deviation_osd) {
4833 if (std::find(underfull.begin(), underfull.end(), osd) ==
a8e16298
TL
4834 underfull.end())
4835 break;
a8e16298
TL
4836 float target = osd_weight[osd] * pgs_per_weight;
4837 ceph_assert(target > 0);
92f5a8d4
TL
4838 if (fabsf(deviation) < max_deviation) {
4839 // respect max_deviation too
a8e16298
TL
4840 ldout(cct, 10) << " osd." << osd
4841 << " target " << target
4842 << " deviation " << deviation
92f5a8d4
TL
4843 << " -> absolute " << fabsf(deviation)
4844 << " < max " << max_deviation
a8e16298
TL
4845 << dendl;
4846 break;
4847 }
4848 // look for remaps we can un-remap
20effc67
TL
4849 candidates_t candidates = build_candidates(cct, tmp_osd_map, to_skip,
4850 only_pools, aggressive, p_seed);
4851 if (try_drop_remap_underfull(cct, candidates, osd, temp_pgs_by_osd,
4852 to_unmap, to_upmap)) {
4853 goto test_change;
a8e16298 4854 }
7c673cae 4855 }
a8e16298
TL
4856
4857 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4858 ldout(cct, 10) << " failed to find any changes for underfull osds"
4859 << dendl;
4860 if (!aggressive) {
4861 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
4862 break;
4863 } else if (!skip_overfull) {
4864 // safe to quit because below here we know
4865 // we've done checking both overfull and underfull osds..
4866 ldout(cct, 10) << " break due to not being able to find any"
4867 << " further optimizations"
4868 << dendl;
7c673cae
FG
4869 break;
4870 }
a8e16298
TL
4871 // restart with fullest and do exhaustive searching
4872 skip_overfull = false;
4873 continue;
4874
4875 test_change:
4876
4877 // test change, apply if change is good
4878 ceph_assert(to_unmap.size() || to_upmap.size());
4879 float new_stddev = 0;
4880 map<int,float> temp_osd_deviation;
4881 multimap<float,int> temp_deviation_osd;
20effc67
TL
4882 float cur_max_deviation = calc_deviations(cct, temp_pgs_by_osd, osd_weight,
4883 pgs_per_weight, temp_osd_deviation,
4884 temp_deviation_osd, new_stddev);
a8e16298
TL
4885 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
4886 if (new_stddev >= stddev) {
4887 if (!aggressive) {
4888 ldout(cct, 10) << " break because stddev is not decreasing"
4889 << " and aggressive mode is not enabled"
4890 << dendl;
4891 break;
4892 }
4893 local_fallback_retried++;
4894 if (local_fallback_retried >= local_fallback_retries) {
4895 // does not make progress
4896 // flip *skip_overfull* so both overfull and underfull
4897 // get equal (in)attention
4898 skip_overfull = !skip_overfull;
4899 ldout(cct, 10) << " hit local_fallback_retries "
4900 << local_fallback_retries
4901 << dendl;
4902 continue;
4903 }
4904 for (auto& i : to_unmap)
4905 to_skip.insert(i);
4906 for (auto& i : to_upmap)
4907 to_skip.insert(i.first);
4908 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
4909 << " to_skip " << to_skip
4910 << dendl;
4911 goto retry;
4912 }
4913
4914 // ready to go
4915 ceph_assert(new_stddev < stddev);
4916 stddev = new_stddev;
4917 pgs_by_osd = temp_pgs_by_osd;
4918 osd_deviation = temp_osd_deviation;
4919 deviation_osd = temp_deviation_osd;
20effc67
TL
4920
4921 num_changed += pack_upmap_results(cct, to_unmap, to_upmap, tmp_osd_map, pending_inc);
4922
92f5a8d4
TL
4923 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4924 if (cur_max_deviation <= max_deviation) {
4925 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
4926 << dendl;
4927 break;
4928 }
7c673cae 4929 }
a8e16298 4930 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
4931 return num_changed;
4932}
31f18b77 4933
20effc67
TL
4934float OSDMap::build_pool_pgs_info (
4935 CephContext *cct,
4936 const std::set<int64_t>& only_pools, ///< [optional] restrict to pool
4937 const OSDMap& tmp_osd_map,
4938 int& total_pgs,
4939 map<int,set<pg_t>>& pgs_by_osd,
4940 map<int,float>& osd_weight)
4941{
4942 //
4943 // This function builds some data structures that are used by calc_pg_upmaps.
4944 // Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs
4945 // and returns the osd_weight_total
4946 //
4947 float osd_weight_total = 0.0;
4948 for (auto& [pid, pdata] : pools) {
4949 if (!only_pools.empty() && !only_pools.count(pid))
4950 continue;
4951 for (unsigned ps = 0; ps < pdata.get_pg_num(); ++ps) {
4952 pg_t pg(ps, pid);
4953 vector<int> up;
4954 tmp_osd_map.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
4955 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
4956 for (auto osd : up) {
4957 if (osd != CRUSH_ITEM_NONE)
4958 pgs_by_osd[osd].insert(pg);
4959 }
4960 }
4961 total_pgs += pdata.get_size() * pdata.get_pg_num();
4962
4963 map<int,float> pmap;
4964 int ruleno = pdata.get_crush_rule();
4965 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap);
4966 ldout(cct,20) << __func__ << " pool " << pid
4967 << " ruleno " << ruleno
4968 << " weight-map " << pmap
4969 << dendl;
4970 for (auto [oid, oweight] : pmap) {
4971 auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight;
4972 if (adjusted_weight == 0) {
4973 continue;
4974 }
4975 osd_weight[oid] += adjusted_weight;
4976 osd_weight_total += adjusted_weight;
4977 }
4978 }
4979 for (auto& [oid, oweight] : osd_weight) {
4980 int pgs = 0;
4981 auto p = pgs_by_osd.find(oid);
4982 if (p != pgs_by_osd.end())
4983 pgs = p->second.size();
4984 else
4985 pgs_by_osd.emplace(oid, set<pg_t>());
4986 ldout(cct, 20) << " osd." << oid << " weight " << oweight
4987 << " pgs " << pgs << dendl;
4988 }
4989 return osd_weight_total;
4990
4991} // return total weight of all OSDs
4992
4993float OSDMap::calc_deviations (
4994 CephContext *cct,
4995 const map<int,set<pg_t>>& pgs_by_osd,
4996 const map<int,float>& osd_weight,
4997 float pgs_per_weight,
4998 map<int,float>& osd_deviation,
4999 multimap<float,int>& deviation_osd,
5000 float& stddev) // return current max deviation
5001{
5002 //
5003 // This function calculates the 2 maps osd_deviation and deviation_osd which
5004 // hold the deviation between the current number of PGs which map to an OSD
5005 // and the optimal number. Ot also calculates the stddev of the deviations and
5006 // returns the current max deviation.
5007 // NOTE - the calculation is not exactly stddev it is actually sttdev^2 but as
5008 // long as it is monotonic with stddev (and it is), it is sufficient for
5009 // the balancer code.
5010 //
5011 float cur_max_deviation = 0.0;
5012 stddev = 0.0;
5013 for (auto& [oid, opgs] : pgs_by_osd) {
5014 // make sure osd is still there (belongs to this crush-tree)
5015 ceph_assert(osd_weight.count(oid));
5016 float target = osd_weight.at(oid) * pgs_per_weight;
5017 float deviation = (float)opgs.size() - target;
5018 ldout(cct, 20) << " osd." << oid
5019 << "\tpgs " << opgs.size()
5020 << "\ttarget " << target
5021 << "\tdeviation " << deviation
5022 << dendl;
5023 osd_deviation[oid] = deviation;
5024 deviation_osd.insert(make_pair(deviation, oid));
5025 stddev += deviation * deviation;
5026 if (fabsf(deviation) > cur_max_deviation)
5027 cur_max_deviation = fabsf(deviation);
5028 }
5029 return cur_max_deviation;
5030}
5031
5032void OSDMap::fill_overfull_underfull (
5033 CephContext *cct,
5034 const std::multimap<float,int>& deviation_osd,
5035 int max_deviation,
5036 std::set<int>& overfull,
5037 std::set<int>& more_overfull,
5038 std::vector<int>& underfull,
5039 std::vector<int>& more_underfull)
5040{
5041 //
5042 // This function just fills the overfull and underfull data structures for the
5043 // use of calc_pg_upmaps
5044 //
5045 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
5046 auto& odev = i->first;
5047 auto& oid = i->second;
5048 ldout(cct, 30) << " check " << odev << " <= " << max_deviation << dendl;
5049 if (odev <= 0)
5050 break;
5051 if (odev > max_deviation) {
5052 ldout(cct, 30) << " add overfull osd." << oid << dendl;
5053 overfull.insert(oid);
5054 } else {
5055 more_overfull.insert(oid);
5056 }
5057 }
5058
5059 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
5060 auto& odev = i->first;
5061 auto& oid = i->second;
5062 ldout(cct, 30) << " check " << odev << " >= " << -(int)max_deviation << dendl;
5063 if (odev >= 0)
5064 break;
5065 if (odev < -(int)max_deviation) {
5066 ldout(cct, 30) << " add underfull osd." << oid << dendl;
5067 underfull.push_back(oid);
5068 } else {
5069 more_underfull.push_back(oid);
5070 }
5071 }
5072}
5073
5074int OSDMap::pack_upmap_results(
5075 CephContext *cct,
5076 const std::set<pg_t>& to_unmap,
5077 const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap,
5078 OSDMap& tmp_osd_map,
5079 OSDMap::Incremental *pending_inc)
5080{
5081 //
5082 // This function takes the input from the local variables to_unmap and to_upmap
5083 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5084 // (so that the results are visible outside calc_pg_upmaps)
5085 //
5086 int num_changed = 0;
5087 for (auto& i : to_unmap) {
5088 ldout(cct, 10) << " unmap pg " << i << dendl;
5089 ceph_assert(tmp_osd_map.pg_upmap_items.count(i));
5090 tmp_osd_map.pg_upmap_items.erase(i);
5091 pending_inc->old_pg_upmap_items.insert(i);
5092 ++num_changed;
5093 }
5094 for (auto& [pg, um_items] : to_upmap) {
5095 ldout(cct, 10) << " upmap pg " << pg
5096 << " new pg_upmap_items " << um_items
5097 << dendl;
5098 tmp_osd_map.pg_upmap_items[pg] = um_items;
5099 pending_inc->new_pg_upmap_items[pg] = um_items;
5100 ++num_changed;
5101 }
5102
5103 return num_changed;
5104}
5105
5106std::default_random_engine OSDMap::get_random_engine(
5107 CephContext *cct,
5108 std::random_device::result_type *p_seed)
5109{
5110 //
5111 // This function creates a random_engine to be used for shuffling.
5112 // When p_seed == nullptr it generates random engine with a seed from /dev/random
5113 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5114 // increments seed_set. This is used in order to craete regression test without
5115 // random effect on the results.
5116 //
5117 static std::random_device::result_type seed_set = 0;
5118 std::random_device::result_type seed;
5119 if (p_seed == nullptr) {
5120 std::random_device rd;
5121 seed = rd();
5122 }
5123 else {
5124 seed = *p_seed + seed_set;
5125 ldout(cct, 30) << " Starting random engine with seed "
5126 << seed << dendl;
5127 seed_set++;
5128 }
5129 return std::default_random_engine{seed};
5130}
5131
5132bool OSDMap::try_drop_remap_overfull(
5133 CephContext *cct,
5134 const std::vector<pg_t>& pgs,
5135 const OSDMap& tmp_osd_map,
5136 int osd,
5137 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5138 set<pg_t>& to_unmap,
5139 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5140{
5141 //
5142 // This function tries to drop existimg upmap items which map data to overfull
5143 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5144 // if it found an item that can be dropped, false if not.
5145 //
5146 for (auto pg : pgs) {
5147 auto p = tmp_osd_map.pg_upmap_items.find(pg);
5148 if (p == tmp_osd_map.pg_upmap_items.end())
5149 continue;
5150 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5151 auto& pg_upmap_items = p->second;
5152 for (auto um_pair : pg_upmap_items) {
5153 auto& um_from = um_pair.first;
5154 auto& um_to = um_pair.second;
5155 if (um_to == osd) {
5156 ldout(cct, 10) << " will try dropping existing"
5157 << " remapping pair "
5158 << um_from << " -> " << um_to
5159 << " which remapped " << pg
5160 << " into overfull osd." << osd
5161 << dendl;
5162 temp_pgs_by_osd[um_to].erase(pg);
5163 temp_pgs_by_osd[um_from].insert(pg);
5164 } else {
5165 new_upmap_items.push_back(um_pair);
5166 }
5167 }
5168 if (new_upmap_items.empty()) {
5169 // drop whole item
5170 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5171 << " remapped " << pg << " into overfull osd." << osd
5172 << ", will try cancelling it entirely"
5173 << dendl;
5174 to_unmap.insert(pg);
5175 return true;
5176 } else if (new_upmap_items.size() != pg_upmap_items.size()) {
5177 // drop single remapping pair, updating
5178 ceph_assert(new_upmap_items.size() < pg_upmap_items.size());
5179 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5180 << " remapped " << pg << " into overfull osd." << osd
5181 << ", new_pg_upmap_items now " << new_upmap_items
5182 << dendl;
5183 to_upmap[pg] = new_upmap_items;
5184 return true;
5185 }
5186 }
5187 return false;
5188}
5189
5190bool OSDMap::try_drop_remap_underfull(
5191 CephContext *cct,
5192 const candidates_t& candidates,
5193 int osd,
5194 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5195 set<pg_t>& to_unmap,
5196 map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap)
5197{
5198 //
5199 // This function tries to drop existimg upmap items which map data from underfull
5200 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5201 // if it found an item that can be dropped, false if not.
5202 //
5203 for (auto& [pg, um_pairs] : candidates) {
5204 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5205 for (auto& ump : um_pairs) {
5206 auto& um_from = ump.first;
5207 auto& um_to = ump.second;
5208 if (um_from == osd) {
5209 ldout(cct, 10) << " will try dropping existing"
5210 << " remapping pair "
5211 << um_from << " -> " << um_to
5212 << " which remapped " << pg
5213 << " out from underfull osd." << osd
5214 << dendl;
5215 temp_pgs_by_osd[um_to].erase(pg);
5216 temp_pgs_by_osd[um_from].insert(pg);
5217 } else {
5218 new_upmap_items.push_back(ump);
5219 }
5220 }
5221 if (new_upmap_items.empty()) {
5222 // drop whole item
5223 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5224 << " remapped " << pg
5225 << " out from underfull osd." << osd
5226 << ", will try cancelling it entirely"
5227 << dendl;
5228 to_unmap.insert(pg);
5229 return true;
5230 } else if (new_upmap_items.size() != um_pairs.size()) {
5231 // drop single remapping pair, updating
5232 ceph_assert(new_upmap_items.size() < um_pairs.size());
5233 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5234 << " remapped " << pg
5235 << " out from underfull osd." << osd
5236 << ", new_pg_upmap_items now " << new_upmap_items
5237 << dendl;
5238 to_upmap[pg] = new_upmap_items;
5239 return true;
5240 }
5241 }
5242 return false;
5243}
5244
5245void OSDMap::add_remap_pair(
5246 CephContext *cct,
5247 int orig,
5248 int out,
5249 pg_t pg,
5250 size_t pg_pool_size,
5251 int osd,
5252 set<int>& existing,
5253 map<int,set<pg_t>>& temp_pgs_by_osd,
5254 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items,
5255 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5256{
5257 //
5258 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5259 // the relevant data structures
5260 //
5261 ldout(cct, 10) << " will try adding new remapping pair "
5262 << orig << " -> " << out << " for " << pg
5263 << (orig != osd ? " NOT selected osd" : "")
5264 << dendl;
5265 existing.insert(orig);
5266 existing.insert(out);
5267 temp_pgs_by_osd[orig].erase(pg);
5268 temp_pgs_by_osd[out].insert(pg);
5269 ceph_assert(new_upmap_items.size() < pg_pool_size);
5270 new_upmap_items.push_back(make_pair(orig, out));
5271 // append new remapping pairs slowly
5272 // This way we can make sure that each tiny change will
5273 // definitely make distribution of PGs converging to
5274 // the perfect status.
5275 to_upmap[pg] = new_upmap_items;
5276
5277}
5278
5279int OSDMap::find_best_remap (
5280 CephContext *cct,
5281 const vector<int>& orig,
5282 const vector<int>& out,
5283 const set<int>& existing,
5284 const map<int,float> osd_deviation)
5285{
5286 //
5287 // Find the best remap from the suggestions in orig and out - the best remap
5288 // is the one which maps from the OSD with the largest deviatoion (from the
5289 // OSDs which are part of orig)
5290 //
5291 int best_pos = -1;
5292 float max_dev = 0;
5293 for (unsigned i = 0; i < out.size(); ++i) {
5294 if (orig[i] == out[i])
5295 continue; // skip invalid remappings
5296 if (existing.count(orig[i]) || existing.count(out[i]))
5297 continue; // we want new remappings only!
5298 if (osd_deviation.at(orig[i]) > max_dev) {
5299 max_dev = osd_deviation.at(orig[i]);
5300 best_pos = i;
5301 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation.at(orig[i]) << dendl;
5302 }
5303 }
5304 return best_pos;
5305}
5306
5307OSDMap::candidates_t OSDMap::build_candidates(
5308 CephContext *cct,
5309 const OSDMap& tmp_osd_map,
5310 const set<pg_t> to_skip,
5311 const set<int64_t>& only_pools,
5312 bool aggressive,
5313 std::random_device::result_type *p_seed)
5314{
5315 //
5316 // build the candidates data structure
5317 //
5318 candidates_t candidates;
5319 candidates.reserve(tmp_osd_map.pg_upmap_items.size());
5320 for (auto& [pg, um_pair] : tmp_osd_map.pg_upmap_items) {
5321 if (to_skip.count(pg))
5322 continue;
5323 if (!only_pools.empty() && !only_pools.count(pg.pool()))
5324 continue;
5325 candidates.push_back(make_pair(pg, um_pair));
5326 }
5327 if (aggressive) {
5328 // shuffle candidates so they all get equal (in)attention
5329 std::shuffle(candidates.begin(), candidates.end(), get_random_engine(cct, p_seed));
5330 }
5331 return candidates;
5332}
5333
31f18b77
FG
5334int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
5335{
5336 return crush->get_leaves(name, osds);
5337}
5338
3efd9988
FG
5339// get pools whose crush rules might reference the given osd
5340void OSDMap::get_pool_ids_by_osd(CephContext *cct,
5341 int osd,
5342 set<int64_t> *pool_ids) const
5343{
11fdf7f2 5344 ceph_assert(pool_ids);
3efd9988
FG
5345 set<int> raw_rules;
5346 int r = crush->get_rules_by_osd(osd, &raw_rules);
5347 if (r < 0) {
5348 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
5349 << dendl;
11fdf7f2 5350 ceph_assert(r >= 0);
3efd9988
FG
5351 }
5352 set<int> rules;
5353 for (auto &i: raw_rules) {
5354 // exclude any dead rule
5355 if (crush_rule_in_use(i)) {
5356 rules.insert(i);
5357 }
5358 }
5359 for (auto &r: rules) {
5360 get_pool_ids_by_rule(r, pool_ids);
5361 }
5362}
5363
31f18b77
FG
5364template <typename F>
5365class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
5366public:
5367 typedef CrushTreeDumper::Dumper<F> Parent;
5368
5369 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2 5370 const PGMap& pgmap_, bool tree_,
9f95a23c 5371 const string& filter) :
c07f9fc5 5372 Parent(crush, osdmap_->get_pool_names()),
31f18b77 5373 osdmap(osdmap_),
11fdf7f2 5374 pgmap(pgmap_),
31f18b77 5375 tree(tree_),
31f18b77
FG
5376 min_var(-1),
5377 max_var(-1),
5378 stddev(0),
5379 sum(0) {
9f95a23c
TL
5380 if (osdmap->crush->name_exists(filter)) {
5381 // filter by crush node
5382 auto item_id = osdmap->crush->get_item_id(filter);
11fdf7f2
TL
5383 allowed.insert(item_id);
5384 osdmap->crush->get_all_children(item_id, &allowed);
9f95a23c
TL
5385 } else if (osdmap->crush->class_exists(filter)) {
5386 // filter by device class
5387 class_id = osdmap->crush->get_class_id(filter);
5388 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
5389 pool_id >= 0) {
5390 // filter by pool
5391 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
5392 set<int> roots;
5393 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
5394 allowed = roots;
5395 for (auto r : roots)
5396 osdmap->crush->get_all_children(r, &allowed);
11fdf7f2
TL
5397 }
5398 average_util = average_utilization();
31f18b77
FG
5399 }
5400
5401protected:
11fdf7f2
TL
5402
5403 bool should_dump(int id) const {
5404 if (!allowed.empty() && !allowed.count(id)) // filter by name
5405 return false;
9f95a23c
TL
5406 if (id >= 0 && class_id >= 0) {
5407 auto item_class_id = osdmap->crush->get_item_class_id(id);
5408 if (item_class_id < 0 || // not bound to a class yet
5409 item_class_id != class_id) // or already bound to a different class
11fdf7f2
TL
5410 return false;
5411 }
5412 return true;
5413 }
5414
5415 set<int> get_dumped_osds() {
9f95a23c 5416 if (allowed.empty() && class_id < 0) {
11fdf7f2
TL
5417 // old way, all
5418 return {};
5419 }
5420 return dumped_osds;
5421 }
5422
31f18b77
FG
5423 void dump_stray(F *f) {
5424 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5425 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 5426 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
5427 }
5428 }
5429
5430 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
f67539c2 5431 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
31f18b77 5432 return;
11fdf7f2
TL
5433 if (!should_dump(qi.id))
5434 return;
31f18b77 5435
11fdf7f2
TL
5436 if (!qi.is_bucket())
5437 dumped_osds.insert(qi.id);
31f18b77 5438 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
5439 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5440 kb_used_meta = 0, kb_avail = 0;
31f18b77 5441 double util = 0;
11fdf7f2
TL
5442 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5443 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
5444 if (kb_used && kb)
5445 util = 100.0 * (double)kb_used / (double)kb;
5446
5447 double var = 1.0;
5448 if (average_util)
5449 var = util / average_util;
5450
11fdf7f2 5451 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 5452
11fdf7f2
TL
5453 dump_item(qi, reweight, kb, kb_used,
5454 kb_used_data, kb_used_omap, kb_used_meta,
5455 kb_avail, util, var, num_pgs, f);
31f18b77
FG
5456
5457 if (!qi.is_bucket() && reweight > 0) {
5458 if (min_var < 0 || var < min_var)
5459 min_var = var;
5460 if (max_var < 0 || var > max_var)
5461 max_var = var;
5462
5463 double dev = util - average_util;
5464 dev *= dev;
5465 stddev += reweight * dev;
5466 sum += reweight;
5467 }
5468 }
5469
5470 virtual void dump_item(const CrushTreeDumper::Item &qi,
5471 float &reweight,
5472 int64_t kb,
5473 int64_t kb_used,
11fdf7f2
TL
5474 int64_t kb_used_data,
5475 int64_t kb_used_omap,
5476 int64_t kb_used_meta,
31f18b77
FG
5477 int64_t kb_avail,
5478 double& util,
5479 double& var,
5480 const size_t num_pgs,
5481 F *f) = 0;
5482
5483 double dev() {
5484 return sum > 0 ? sqrt(stddev / sum) : 0;
5485 }
5486
5487 double average_utilization() {
5488 int64_t kb = 0, kb_used = 0;
5489 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
5490 if (!osdmap->exists(i) ||
5491 osdmap->get_weight(i) == 0 ||
5492 !should_dump(i))
31f18b77 5493 continue;
11fdf7f2
TL
5494 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5495 kb_avail_i;
5496 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5497 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
5498 kb += kb_i;
5499 kb_used += kb_used_i;
5500 }
5501 }
5502 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5503 }
5504
5505 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5506 int64_t* kb_used_data,
5507 int64_t* kb_used_omap,
5508 int64_t* kb_used_meta,
31f18b77 5509 int64_t* kb_avail) const {
11fdf7f2 5510 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 5511 if (!p) return false;
11fdf7f2
TL
5512 *kb = p->statfs.kb();
5513 *kb_used = p->statfs.kb_used_raw();
5514 *kb_used_data = p->statfs.kb_used_data();
5515 *kb_used_omap = p->statfs.kb_used_omap();
5516 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5517 *kb_avail = p->statfs.kb_avail();
5518
f67539c2 5519 return true;
31f18b77
FG
5520 }
5521
5522 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5523 int64_t* kb_used_data,
5524 int64_t* kb_used_omap,
5525 int64_t* kb_used_meta,
31f18b77
FG
5526 int64_t* kb_avail) const {
5527 if (id >= 0) {
11fdf7f2 5528 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
5529 *kb = 0;
5530 *kb_used = 0;
11fdf7f2
TL
5531 *kb_used_data = 0;
5532 *kb_used_omap = 0;
5533 *kb_used_meta = 0;
31f18b77
FG
5534 *kb_avail = 0;
5535 return true;
5536 }
11fdf7f2
TL
5537 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5538 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
5539 }
5540
5541 *kb = 0;
5542 *kb_used = 0;
11fdf7f2
TL
5543 *kb_used_data = 0;
5544 *kb_used_omap = 0;
5545 *kb_used_meta = 0;
31f18b77
FG
5546 *kb_avail = 0;
5547
5548 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5549 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
5550 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5551 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5552 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5553 &kb_used_data_i, &kb_used_omap_i,
5554 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
5555 return false;
5556 *kb += kb_i;
5557 *kb_used += kb_used_i;
11fdf7f2
TL
5558 *kb_used_data += kb_used_data_i;
5559 *kb_used_omap += kb_used_omap_i;
5560 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
5561 *kb_avail += kb_avail_i;
5562 }
f67539c2 5563 return true;
31f18b77
FG
5564 }
5565
5566protected:
5567 const OSDMap *osdmap;
11fdf7f2 5568 const PGMap& pgmap;
31f18b77
FG
5569 bool tree;
5570 double average_util;
5571 double min_var;
5572 double max_var;
5573 double stddev;
5574 double sum;
9f95a23c 5575 int class_id = -1;
11fdf7f2
TL
5576 set<int> allowed;
5577 set<int> dumped_osds;
31f18b77
FG
5578};
5579
5580
5581class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5582public:
5583 typedef OSDUtilizationDumper<TextTable> Parent;
5584
5585 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5586 const PGMap& pgmap, bool tree,
9f95a23c
TL
5587 const string& filter) :
5588 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5589
5590 void dump(TextTable *tbl) {
5591 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 5592 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5593 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5594 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5595 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
5596 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5597 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5598 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5599 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5600 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5601 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5602 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5603 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 5604 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5605 if (tree)
5606 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5607
5608 Parent::dump(tbl);
5609
5610 dump_stray(tbl);
5611
11fdf7f2 5612 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
5613 *tbl << ""
5614 << ""
5615 << "" << "TOTAL"
11fdf7f2
TL
5616 << byte_u_t(sum.statfs.total)
5617 << byte_u_t(sum.statfs.get_used_raw())
5618 << byte_u_t(sum.statfs.allocated)
5619 << byte_u_t(sum.statfs.omap_allocated)
5620 << byte_u_t(sum.statfs.internal_metadata)
5621 << byte_u_t(sum.statfs.available)
31f18b77
FG
5622 << lowprecision_t(average_util)
5623 << ""
5624 << TextTable::endrow;
5625 }
5626
5627protected:
5628 struct lowprecision_t {
5629 float v;
5630 explicit lowprecision_t(float _v) : v(_v) {}
5631 };
5632 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5633
5634 using OSDUtilizationDumper<TextTable>::dump_item;
5635 void dump_item(const CrushTreeDumper::Item &qi,
5636 float &reweight,
5637 int64_t kb,
5638 int64_t kb_used,
11fdf7f2
TL
5639 int64_t kb_used_data,
5640 int64_t kb_used_omap,
5641 int64_t kb_used_meta,
31f18b77
FG
5642 int64_t kb_avail,
5643 double& util,
5644 double& var,
5645 const size_t num_pgs,
5646 TextTable *tbl) override {
224ce89b
WB
5647 const char *c = crush->get_item_class(qi.id);
5648 if (!c)
5649 c = "";
31f18b77 5650 *tbl << qi.id
224ce89b 5651 << c
31f18b77
FG
5652 << weightf_t(qi.weight)
5653 << weightf_t(reweight)
1adf2230
AA
5654 << byte_u_t(kb << 10)
5655 << byte_u_t(kb_used << 10)
11fdf7f2
TL
5656 << byte_u_t(kb_used_data << 10)
5657 << byte_u_t(kb_used_omap << 10)
5658 << byte_u_t(kb_used_meta << 10)
1adf2230 5659 << byte_u_t(kb_avail << 10)
31f18b77
FG
5660 << lowprecision_t(util)
5661 << lowprecision_t(var);
5662
5663 if (qi.is_bucket()) {
5664 *tbl << "-";
11fdf7f2 5665 *tbl << "";
31f18b77
FG
5666 } else {
5667 *tbl << num_pgs;
11fdf7f2
TL
5668 if (osdmap->is_up(qi.id)) {
5669 *tbl << "up";
5670 } else if (osdmap->is_destroyed(qi.id)) {
5671 *tbl << "destroyed";
5672 } else {
5673 *tbl << "down";
5674 }
31f18b77
FG
5675 }
5676
5677 if (tree) {
5678 ostringstream name;
5679 for (int k = 0; k < qi.depth; k++)
5680 name << " ";
5681 if (qi.is_bucket()) {
5682 int type = crush->get_bucket_type(qi.id);
5683 name << crush->get_type_name(type) << " "
5684 << crush->get_item_name(qi.id);
5685 } else {
5686 name << "osd." << qi.id;
5687 }
5688 *tbl << name.str();
5689 }
5690
5691 *tbl << TextTable::endrow;
5692 }
5693
5694public:
5695 string summary() {
5696 ostringstream out;
5697 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5698 << "/" << lowprecision_t(max_var) << " "
5699 << "STDDEV: " << lowprecision_t(dev());
5700 return out.str();
5701 }
5702};
5703
5704ostream& operator<<(ostream& out,
5705 const OSDUtilizationPlainDumper::lowprecision_t& v)
5706{
5707 if (v.v < -0.01) {
5708 return out << "-";
5709 } else if (v.v < 0.001) {
5710 return out << "0";
5711 } else {
5712 std::streamsize p = out.precision();
5713 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5714 }
5715}
5716
5717class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5718public:
5719 typedef OSDUtilizationDumper<Formatter> Parent;
5720
5721 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5722 const PGMap& pgmap, bool tree,
9f95a23c
TL
5723 const string& filter) :
5724 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5725
5726 void dump(Formatter *f) {
5727 f->open_array_section("nodes");
5728 Parent::dump(f);
5729 f->close_section();
5730
5731 f->open_array_section("stray");
5732 dump_stray(f);
5733 f->close_section();
5734 }
5735
5736protected:
5737 using OSDUtilizationDumper<Formatter>::dump_item;
5738 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
5739 float &reweight,
5740 int64_t kb,
5741 int64_t kb_used,
5742 int64_t kb_used_data,
5743 int64_t kb_used_omap,
5744 int64_t kb_used_meta,
5745 int64_t kb_avail,
5746 double& util,
5747 double& var,
5748 const size_t num_pgs,
5749 Formatter *f) override {
31f18b77 5750 f->open_object_section("item");
c07f9fc5 5751 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
5752 f->dump_float("reweight", reweight);
5753 f->dump_int("kb", kb);
5754 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
5755 f->dump_int("kb_used_data", kb_used_data);
5756 f->dump_int("kb_used_omap", kb_used_omap);
5757 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
5758 f->dump_int("kb_avail", kb_avail);
5759 f->dump_float("utilization", util);
5760 f->dump_float("var", var);
5761 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
5762 if (!qi.is_bucket()) {
5763 if (osdmap->is_up(qi.id)) {
5764 f->dump_string("status", "up");
5765 } else if (osdmap->is_destroyed(qi.id)) {
5766 f->dump_string("status", "destroyed");
5767 } else {
5768 f->dump_string("status", "down");
5769 }
5770 }
31f18b77
FG
5771 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5772 f->close_section();
5773 }
5774
5775public:
5776 void summary(Formatter *f) {
5777 f->open_object_section("summary");
11fdf7f2
TL
5778 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5779 auto& s = sum.statfs;
5780
5781 f->dump_int("total_kb", s.kb());
5782 f->dump_int("total_kb_used", s.kb_used_raw());
5783 f->dump_int("total_kb_used_data", s.kb_used_data());
5784 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5785 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5786 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
5787 f->dump_float("average_utilization", average_util);
5788 f->dump_float("min_var", min_var);
5789 f->dump_float("max_var", max_var);
5790 f->dump_float("dev", dev());
5791 f->close_section();
5792 }
5793};
5794
5795void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
5796 const PGMap& pgmap,
5797 ostream& out,
5798 Formatter *f,
5799 bool tree,
9f95a23c 5800 const string& filter)
31f18b77
FG
5801{
5802 const CrushWrapper *crush = osdmap.crush.get();
5803 if (f) {
5804 f->open_object_section("df");
9f95a23c 5805 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5806 d.dump(f);
5807 d.summary(f);
5808 f->close_section();
5809 f->flush(out);
5810 } else {
9f95a23c 5811 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5812 TextTable tbl;
5813 d.dump(&tbl);
5814 out << tbl << d.summary() << "\n";
5815 }
5816}
224ce89b 5817
92f5a8d4
TL
5818void OSDMap::check_health(CephContext *cct,
5819 health_check_map_t *checks) const
224ce89b
WB
5820{
5821 int num_osds = get_num_osds();
5822
5823 // OSD_DOWN
5824 // OSD_$subtree_DOWN
5825 // OSD_ORPHAN
5826 if (num_osds >= 0) {
5827 int num_in_osds = 0;
5828 int num_down_in_osds = 0;
5829 set<int> osds;
5830 set<int> down_in_osds;
5831 set<int> up_in_osds;
5832 set<int> subtree_up;
5833 unordered_map<int, set<int> > subtree_type_down;
5834 unordered_map<int, int> num_osds_subtree;
5835 int max_type = crush->get_max_type_id();
5836
5837 for (int i = 0; i < get_max_osd(); i++) {
5838 if (!exists(i)) {
5839 if (crush->item_exists(i)) {
5840 osds.insert(i);
5841 }
5842 continue;
5843 }
f67539c2 5844 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
224ce89b
WB
5845 continue;
5846 ++num_in_osds;
5847 if (down_in_osds.count(i) || up_in_osds.count(i))
5848 continue;
5849 if (!is_up(i)) {
5850 down_in_osds.insert(i);
5851 int parent_id = 0;
5852 int current = i;
5853 for (int type = 0; type <= max_type; type++) {
5854 if (!crush->get_type_name(type))
5855 continue;
5856 int r = crush->get_immediate_parent_id(current, &parent_id);
5857 if (r == -ENOENT)
5858 break;
5859 // break early if this parent is already marked as up
5860 if (subtree_up.count(parent_id))
5861 break;
5862 type = crush->get_bucket_type(parent_id);
5863 if (!subtree_type_is_down(
92f5a8d4 5864 cct, parent_id, type,
224ce89b
WB
5865 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
5866 break;
5867 current = parent_id;
5868 }
5869 }
5870 }
5871
5872 // calculate the number of down osds in each down subtree and
5873 // store it in num_osds_subtree
5874 for (int type = 1; type <= max_type; type++) {
5875 if (!crush->get_type_name(type))
5876 continue;
5877 for (auto j = subtree_type_down[type].begin();
5878 j != subtree_type_down[type].end();
5879 ++j) {
5880 list<int> children;
5881 int num = 0;
5882 int num_children = crush->get_children(*j, &children);
5883 if (num_children == 0)
5884 continue;
5885 for (auto l = children.begin(); l != children.end(); ++l) {
5886 if (*l >= 0) {
5887 ++num;
5888 } else if (num_osds_subtree[*l] > 0) {
5889 num = num + num_osds_subtree[*l];
5890 }
5891 }
5892 num_osds_subtree[*j] = num;
5893 }
5894 }
5895 num_down_in_osds = down_in_osds.size();
11fdf7f2 5896 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
5897 if (num_down_in_osds > 0) {
5898 // summary of down subtree types and osds
5899 for (int type = max_type; type > 0; type--) {
5900 if (!crush->get_type_name(type))
5901 continue;
5902 if (subtree_type_down[type].size() > 0) {
5903 ostringstream ss;
5904 ss << subtree_type_down[type].size() << " "
5905 << crush->get_type_name(type);
5906 if (subtree_type_down[type].size() > 1) {
5907 ss << "s";
5908 }
5909 int sum_down_osds = 0;
5910 for (auto j = subtree_type_down[type].begin();
5911 j != subtree_type_down[type].end();
5912 ++j) {
5913 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
5914 }
5915 ss << " (" << sum_down_osds << " osds) down";
5916 string err = string("OSD_") +
5917 string(crush->get_type_name(type)) + "_DOWN";
5918 boost::to_upper(err);
9f95a23c
TL
5919 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
5920 subtree_type_down[type].size());
224ce89b
WB
5921 for (auto j = subtree_type_down[type].rbegin();
5922 j != subtree_type_down[type].rend();
5923 ++j) {
5924 ostringstream ss;
5925 ss << crush->get_type_name(type);
5926 ss << " ";
5927 ss << crush->get_item_name(*j);
5928 // at the top level, do not print location
5929 if (type != max_type) {
5930 ss << " (";
5931 ss << crush->get_full_location_ordered_string(*j);
5932 ss << ")";
5933 }
5934 int num = num_osds_subtree[*j];
5935 ss << " (" << num << " osds)";
5936 ss << " is down";
5937 d.detail.push_back(ss.str());
5938 }
5939 }
5940 }
5941 ostringstream ss;
5942 ss << down_in_osds.size() << " osds down";
9f95a23c
TL
5943 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
5944 down_in_osds.size());
224ce89b
WB
5945 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
5946 ostringstream ss;
5947 ss << "osd." << *it << " (";
5948 ss << crush->get_full_location_ordered_string(*it);
5949 ss << ") is down";
5950 d.detail.push_back(ss.str());
5951 }
5952 }
5953
5954 if (!osds.empty()) {
5955 ostringstream ss;
5956 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
9f95a23c
TL
5957 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
5958 osds.size());
224ce89b
WB
5959 for (auto osd : osds) {
5960 ostringstream ss;
5961 ss << "osd." << osd << " exists in crush map but not in osdmap";
5962 d.detail.push_back(ss.str());
5963 }
5964 }
5965 }
5966
eafe8130
TL
5967 std::list<std::string> scrub_messages;
5968 bool noscrub = false, nodeepscrub = false;
5969 for (const auto &p : pools) {
5970 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
5971 ostringstream ss;
5972 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
5973 scrub_messages.push_back(ss.str());
5974 noscrub = true;
5975 }
5976 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
5977 ostringstream ss;
5978 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
5979 scrub_messages.push_back(ss.str());
5980 nodeepscrub = true;
5981 }
5982 }
5983 if (noscrub || nodeepscrub) {
5984 string out = "";
5985 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
5986 out += nodeepscrub ? "nodeep-scrub" : "";
5987 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
9f95a23c 5988 "Some pool(s) have the " + out + " flag(s) set", 0);
eafe8130
TL
5989 d.detail.splice(d.detail.end(), scrub_messages);
5990 }
5991
224ce89b
WB
5992 // OSD_OUT_OF_ORDER_FULL
5993 {
5994 // An osd could configure failsafe ratio, to something different
5995 // but for now assume it is the same here.
92f5a8d4 5996 float fsr = cct->_conf->osd_failsafe_full_ratio;
224ce89b
WB
5997 if (fsr > 1.0) fsr /= 100;
5998 float fr = get_full_ratio();
5999 float br = get_backfillfull_ratio();
6000 float nr = get_nearfull_ratio();
6001
6002 list<string> detail;
6003 // These checks correspond to how OSDService::check_full_status() in an OSD
6004 // handles the improper setting of these values.
6005 if (br < nr) {
6006 ostringstream ss;
6007 ss << "backfillfull_ratio (" << br
6008 << ") < nearfull_ratio (" << nr << "), increased";
6009 detail.push_back(ss.str());
6010 br = nr;
6011 }
6012 if (fr < br) {
6013 ostringstream ss;
6014 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
6015 << "), increased";
6016 detail.push_back(ss.str());
6017 fr = br;
6018 }
6019 if (fsr < fr) {
6020 ostringstream ss;
6021 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
6022 << "), increased";
6023 detail.push_back(ss.str());
6024 }
6025 if (!detail.empty()) {
6026 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
9f95a23c 6027 "full ratio(s) out of order", 0);
224ce89b
WB
6028 d.detail.swap(detail);
6029 }
6030 }
6031
6032 // OSD_FULL
6033 // OSD_NEARFULL
6034 // OSD_BACKFILLFULL
6035 // OSD_FAILSAFE_FULL
6036 {
6037 set<int> full, backfillfull, nearfull;
6038 get_full_osd_counts(&full, &backfillfull, &nearfull);
6039 if (full.size()) {
6040 ostringstream ss;
6041 ss << full.size() << " full osd(s)";
9f95a23c 6042 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
224ce89b
WB
6043 for (auto& i: full) {
6044 ostringstream ss;
6045 ss << "osd." << i << " is full";
6046 d.detail.push_back(ss.str());
6047 }
6048 }
6049 if (backfillfull.size()) {
6050 ostringstream ss;
6051 ss << backfillfull.size() << " backfillfull osd(s)";
9f95a23c
TL
6052 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
6053 backfillfull.size());
224ce89b
WB
6054 for (auto& i: backfillfull) {
6055 ostringstream ss;
6056 ss << "osd." << i << " is backfill full";
6057 d.detail.push_back(ss.str());
6058 }
6059 }
6060 if (nearfull.size()) {
6061 ostringstream ss;
6062 ss << nearfull.size() << " nearfull osd(s)";
9f95a23c 6063 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
224ce89b
WB
6064 for (auto& i: nearfull) {
6065 ostringstream ss;
6066 ss << "osd." << i << " is near full";
6067 d.detail.push_back(ss.str());
6068 }
6069 }
6070 }
6071
6072 // OSDMAP_FLAGS
6073 {
6074 // warn about flags
6075 uint64_t warn_flags =
224ce89b
WB
6076 CEPH_OSDMAP_PAUSERD |
6077 CEPH_OSDMAP_PAUSEWR |
6078 CEPH_OSDMAP_PAUSEREC |
6079 CEPH_OSDMAP_NOUP |
6080 CEPH_OSDMAP_NODOWN |
6081 CEPH_OSDMAP_NOIN |
6082 CEPH_OSDMAP_NOOUT |
6083 CEPH_OSDMAP_NOBACKFILL |
6084 CEPH_OSDMAP_NORECOVER |
6085 CEPH_OSDMAP_NOSCRUB |
6086 CEPH_OSDMAP_NODEEP_SCRUB |
6087 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 6088 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
6089 CEPH_OSDMAP_NOREBALANCE;
6090 if (test_flag(warn_flags)) {
6091 ostringstream ss;
9f95a23c
TL
6092 string s = get_flag_string(get_flags() & warn_flags);
6093 ss << s << " flag(s) set";
6094 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
6095 s.size() /* kludgey but sufficient */);
224ce89b
WB
6096 }
6097 }
6098
6099 // OSD_FLAGS
6100 {
6101 list<string> detail;
6102 const unsigned flags =
6103 CEPH_OSD_NOUP |
6104 CEPH_OSD_NOIN |
6105 CEPH_OSD_NODOWN |
6106 CEPH_OSD_NOOUT;
6107 for (int i = 0; i < max_osd; ++i) {
6108 if (osd_state[i] & flags) {
6109 ostringstream ss;
6110 set<string> states;
6111 OSDMap::calc_state_set(osd_state[i] & flags, states);
6112 ss << "osd." << i << " has flags " << states;
6113 detail.push_back(ss.str());
6114 }
6115 }
81eedcae
TL
6116 for (auto& i : crush_node_flags) {
6117 if (i.second && crush->item_exists(i.first)) {
6118 ostringstream ss;
6119 set<string> states;
6120 OSDMap::calc_state_set(i.second, states);
6121 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
6122 const char *tn = crush->get_type_name(t);
6123 ss << (tn ? tn : "node") << " "
6124 << crush->get_item_name(i.first) << " has flags " << states;
6125 detail.push_back(ss.str());
6126 }
6127 }
6128 for (auto& i : device_class_flags) {
6129 const char* class_name = crush->get_class_name(i.first);
6130 if (i.second && class_name) {
6131 ostringstream ss;
6132 set<string> states;
6133 OSDMap::calc_state_set(i.second, states);
6134 ss << "device class '" << class_name << "' has flags " << states;
6135 detail.push_back(ss.str());
6136 }
6137 }
224ce89b
WB
6138 if (!detail.empty()) {
6139 ostringstream ss;
81eedcae 6140 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
9f95a23c 6141 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
224ce89b
WB
6142 d.detail.swap(detail);
6143 }
6144 }
6145
6146 // OLD_CRUSH_TUNABLES
92f5a8d4 6147 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
224ce89b 6148 string min = crush->get_min_required_version();
92f5a8d4 6149 if (min < cct->_conf->mon_crush_min_required_version) {
224ce89b
WB
6150 ostringstream ss;
6151 ss << "crush map has legacy tunables (require " << min
92f5a8d4 6152 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
9f95a23c 6153 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
f67539c2 6154 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
6155 }
6156 }
6157
6158 // OLD_CRUSH_STRAW_CALC_VERSION
92f5a8d4 6159 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
6160 if (crush->get_straw_calc_version() == 0) {
6161 ostringstream ss;
6162 ss << "crush map has straw_calc_version=0";
9f95a23c 6163 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
224ce89b 6164 d.detail.push_back(
f67539c2 6165 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
6166 }
6167 }
6168
6169 // CACHE_POOL_NO_HIT_SET
92f5a8d4 6170 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b 6171 list<string> detail;
9f95a23c 6172 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
224ce89b
WB
6173 const pg_pool_t& info = p->second;
6174 if (info.cache_mode_requires_hit_set() &&
6175 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
6176 ostringstream ss;
6177 ss << "pool '" << get_pool_name(p->first)
6178 << "' with cache_mode " << info.get_cache_mode_name()
6179 << " needs hit_set_type to be set but it is not";
6180 detail.push_back(ss.str());
6181 }
6182 }
6183 if (!detail.empty()) {
6184 ostringstream ss;
6185 ss << detail.size() << " cache pools are missing hit_sets";
9f95a23c
TL
6186 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
6187 detail.size());
224ce89b
WB
6188 d.detail.swap(detail);
6189 }
6190 }
6191
6192 // OSD_NO_SORTBITWISE
11fdf7f2 6193 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 6194 ostringstream ss;
11fdf7f2 6195 ss << "'sortbitwise' flag is not set";
9f95a23c 6196 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
6197 }
6198
6199 // OSD_UPGRADE_FINISHED
20effc67
TL
6200 if (auto require_release = pending_require_osd_release()) {
6201 ostringstream ss;
6202 ss << "all OSDs are running " << *require_release << " or later but"
6203 << " require_osd_release < " << *require_release;
6204 auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
6205 d.detail.push_back(ss.str());
6206 }
224ce89b 6207
3efd9988 6208 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 6209 {
3efd9988 6210 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
6211 for (auto it : get_pools()) {
6212 const pg_pool_t &pool = it.second;
3efd9988 6213 const string& pool_name = get_pool_name(it.first);
224ce89b 6214 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 6215 stringstream ss;
11fdf7f2 6216 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
6217 // may run out of space too,
6218 // but we want EQUOTA taking precedence
11fdf7f2 6219 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
6220 } else {
6221 ss << "pool '" << pool_name << "' is full (no space)";
6222 }
6223 full_detail.push_back(ss.str());
6224 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
6225 stringstream ss;
6226 ss << "pool '" << pool_name << "' is backfillfull";
6227 backfillfull_detail.push_back(ss.str());
6228 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
6229 stringstream ss;
6230 ss << "pool '" << pool_name << "' is nearfull";
6231 nearfull_detail.push_back(ss.str());
224ce89b
WB
6232 }
6233 }
3efd9988 6234 if (!full_detail.empty()) {
224ce89b 6235 ostringstream ss;
3efd9988 6236 ss << full_detail.size() << " pool(s) full";
9f95a23c 6237 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
3efd9988
FG
6238 d.detail.swap(full_detail);
6239 }
6240 if (!backfillfull_detail.empty()) {
6241 ostringstream ss;
6242 ss << backfillfull_detail.size() << " pool(s) backfillfull";
9f95a23c
TL
6243 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
6244 backfillfull_detail.size());
3efd9988
FG
6245 d.detail.swap(backfillfull_detail);
6246 }
6247 if (!nearfull_detail.empty()) {
6248 ostringstream ss;
6249 ss << nearfull_detail.size() << " pool(s) nearfull";
9f95a23c
TL
6250 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
6251 nearfull_detail.size());
3efd9988 6252 d.detail.swap(nearfull_detail);
224ce89b
WB
6253 }
6254 }
92f5a8d4
TL
6255
6256 // POOL_PG_NUM_NOT_POWER_OF_TWO
6257 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
6258 list<string> detail;
6259 for (auto it : get_pools()) {
6260 if (!isp2(it.second.get_pg_num_target())) {
6261 ostringstream ss;
6262 ss << "pool '" << get_pool_name(it.first)
6263 << "' pg_num " << it.second.get_pg_num_target()
6264 << " is not a power of two";
6265 detail.push_back(ss.str());
6266 }
6267 }
6268 if (!detail.empty()) {
6269 ostringstream ss;
6270 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
6271 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
9f95a23c
TL
6272 ss.str(), detail.size());
6273 d.detail.swap(detail);
6274 }
6275 }
6276
6277 // POOL_NO_REDUNDANCY
6278 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
6279 {
6280 list<string> detail;
6281 for (auto it : get_pools()) {
6282 if (it.second.get_size() == 1) {
6283 ostringstream ss;
6284 ss << "pool '" << get_pool_name(it.first)
6285 << "' has no replicas configured";
6286 detail.push_back(ss.str());
6287 }
6288 }
6289 if (!detail.empty()) {
6290 ostringstream ss;
6291 ss << detail.size() << " pool(s) have no replicas configured";
6292 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
6293 ss.str(), detail.size());
92f5a8d4
TL
6294 d.detail.swap(detail);
6295 }
6296 }
f67539c2
TL
6297
6298 // DEGRADED STRETCH MODE
6299 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
6300 if (recovering_stretch_mode) {
6301 stringstream ss;
6302 ss << "We are recovering stretch mode buckets, only requiring "
6303 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6304 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
6305 ss.str(), 0);
6306 } else if (degraded_stretch_mode) {
6307 stringstream ss;
6308 ss << "We are missing stretch mode buckets, only requiring "
6309 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6310 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
6311 ss.str(), 0);
6312 }
6313 }
224ce89b 6314}
35e4c445
FG
6315
6316int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
6317 ostream *ss) const
6318{
6319 out->clear();
6320 for (auto i = ls.begin(); i != ls.end(); ++i) {
6321 if (i == ls.begin() &&
6322 (*i == "any" || *i == "all" || *i == "*")) {
6323 get_all_osds(*out);
6324 break;
6325 }
9f95a23c 6326 long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
35e4c445
FG
6327 if (osd < 0) {
6328 *ss << "invalid osd id '" << *i << "'";
6329 return -EINVAL;
6330 }
6331 out->insert(osd);
6332 }
6333 return 0;
6334}
11fdf7f2
TL
6335
6336void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
6337 string &subtree,
6338 int limit, // how many
6339 set<int> skip,
6340 set<int> *want) const {
6341 if (limit <= 0)
6342 return;
6343 int subtree_type = crush->get_type_id(subtree);
6344 if (subtree_type < 1)
6345 return;
6346 vector<int> subtrees;
6347 crush->get_subtree_of_type(subtree_type, &subtrees);
6348 std::random_device rd;
6349 std::default_random_engine rng{rd()};
6350 std::shuffle(subtrees.begin(), subtrees.end(), rng);
6351 for (auto s : subtrees) {
6352 if (limit <= 0)
6353 break;
6354 if (crush->subtree_contains(s, n))
6355 continue;
6356 vector<int> osds;
6357 crush->get_children_of_type(s, 0, &osds);
6358 if (osds.empty())
6359 continue;
6360 vector<int> up_osds;
6361 for (auto o : osds) {
6362 if (is_up(o) && !skip.count(o))
6363 up_osds.push_back(o);
6364 }
6365 if (up_osds.empty())
6366 continue;
6367 auto it = up_osds.begin();
6368 std::advance(it, (n % up_osds.size()));
6369 want->insert(*it);
6370 --limit;
6371 }
6372}
6373
6374float OSDMap::pool_raw_used_rate(int64_t poolid) const
6375{
6376 const pg_pool_t *pool = get_pg_pool(poolid);
6377 assert(pool != nullptr);
6378
6379 switch (pool->get_type()) {
6380 case pg_pool_t::TYPE_REPLICATED:
6381 return pool->get_size();
11fdf7f2
TL
6382 case pg_pool_t::TYPE_ERASURE:
6383 {
6384 auto& ecp =
6385 get_erasure_code_profile(pool->erasure_code_profile);
6386 auto pm = ecp.find("m");
6387 auto pk = ecp.find("k");
6388 if (pm != ecp.end() && pk != ecp.end()) {
6389 int k = atoi(pk->second.c_str());
6390 int m = atoi(pm->second.c_str());
6391 int mk = m + k;
6392 ceph_assert(mk != 0);
6393 ceph_assert(k != 0);
6394 return (float)mk / k;
6395 } else {
6396 return 0.0;
6397 }
6398 }
6399 break;
6400 default:
6401 ceph_abort_msg("unrecognized pool type");
6402 }
6403}
81eedcae
TL
6404
6405unsigned OSDMap::get_osd_crush_node_flags(int osd) const
6406{
6407 unsigned flags = 0;
6408 if (!crush_node_flags.empty()) {
6409 // the map will contain type -> name
6410 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
6411 for (auto& i : ploc) {
6412 int id = crush->get_item_id(i.second);
6413 auto p = crush_node_flags.find(id);
6414 if (p != crush_node_flags.end()) {
6415 flags |= p->second;
6416 }
6417 }
6418 }
6419 return flags;
6420}
6421
6422unsigned OSDMap::get_crush_node_flags(int id) const
6423{
6424 unsigned flags = 0;
6425 auto it = crush_node_flags.find(id);
6426 if (it != crush_node_flags.end())
6427 flags = it->second;
6428 return flags;
6429}
6430
6431unsigned OSDMap::get_device_class_flags(int id) const
6432{
6433 unsigned flags = 0;
6434 auto it = device_class_flags.find(id);
6435 if (it != device_class_flags.end())
6436 flags = it->second;
6437 return flags;
6438}
20effc67
TL
6439
6440std::optional<std::string> OSDMap::pending_require_osd_release() const
6441{
6442 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY) &&
6443 require_osd_release < ceph_release_t::quincy) {
6444 return "quincy";
6445 }
6446 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
6447 require_osd_release < ceph_release_t::pacific) {
6448 return "pacific";
6449 }
6450 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
6451 require_osd_release < ceph_release_t::octopus) {
6452 return "octopus";
6453 }
6454 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
6455 require_osd_release < ceph_release_t::nautilus) {
6456 return "nautilus";
6457 }
6458
6459 return std::nullopt;
6460}