]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
import ceph quincy 17.2.1
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2
TL
18#include <algorithm>
19#include <optional>
20#include <random>
21
224ce89b
WB
22#include <boost/algorithm/string.hpp>
23
7c673cae 24#include "OSDMap.h"
7c673cae 25#include "common/config.h"
3efd9988 26#include "common/errno.h"
7c673cae
FG
27#include "common/Formatter.h"
28#include "common/TextTable.h"
29#include "include/ceph_features.h"
9f95a23c 30#include "include/common_fwd.h"
7c673cae
FG
31#include "include/str_map.h"
32
33#include "common/code_environment.h"
224ce89b 34#include "mon/health_check.h"
7c673cae
FG
35
36#include "crush/CrushTreeDumper.h"
37#include "common/Clock.h"
11fdf7f2
TL
38#include "mon/PGMap.h"
39
9f95a23c
TL
40using std::list;
41using std::make_pair;
42using std::map;
43using std::multimap;
44using std::ostream;
45using std::ostringstream;
46using std::pair;
47using std::set;
48using std::string;
49using std::stringstream;
50using std::unordered_map;
51using std::vector;
52
53using ceph::decode;
54using ceph::encode;
55using ceph::Formatter;
56
7c673cae
FG
57#define dout_subsys ceph_subsys_osd
58
59MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
60MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
61
62
63// ----------------------------------
64// osd_info_t
65
66void osd_info_t::dump(Formatter *f) const
67{
68 f->dump_int("last_clean_begin", last_clean_begin);
69 f->dump_int("last_clean_end", last_clean_end);
70 f->dump_int("up_from", up_from);
71 f->dump_int("up_thru", up_thru);
72 f->dump_int("down_at", down_at);
73 f->dump_int("lost_at", lost_at);
74}
75
9f95a23c 76void osd_info_t::encode(ceph::buffer::list& bl) const
7c673cae 77{
11fdf7f2 78 using ceph::encode;
7c673cae 79 __u8 struct_v = 1;
11fdf7f2
TL
80 encode(struct_v, bl);
81 encode(last_clean_begin, bl);
82 encode(last_clean_end, bl);
83 encode(up_from, bl);
84 encode(up_thru, bl);
85 encode(down_at, bl);
86 encode(lost_at, bl);
7c673cae
FG
87}
88
9f95a23c 89void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 90{
11fdf7f2 91 using ceph::decode;
7c673cae 92 __u8 struct_v;
11fdf7f2
TL
93 decode(struct_v, bl);
94 decode(last_clean_begin, bl);
95 decode(last_clean_end, bl);
96 decode(up_from, bl);
97 decode(up_thru, bl);
98 decode(down_at, bl);
99 decode(lost_at, bl);
7c673cae
FG
100}
101
102void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
103{
104 o.push_back(new osd_info_t);
105 o.push_back(new osd_info_t);
106 o.back()->last_clean_begin = 1;
107 o.back()->last_clean_end = 2;
108 o.back()->up_from = 30;
109 o.back()->up_thru = 40;
110 o.back()->down_at = 5;
111 o.back()->lost_at = 6;
112}
113
114ostream& operator<<(ostream& out, const osd_info_t& info)
115{
116 out << "up_from " << info.up_from
117 << " up_thru " << info.up_thru
118 << " down_at " << info.down_at
119 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
120 if (info.lost_at)
121 out << " lost_at " << info.lost_at;
122 return out;
123}
124
125// ----------------------------------
126// osd_xinfo_t
127
128void osd_xinfo_t::dump(Formatter *f) const
129{
130 f->dump_stream("down_stamp") << down_stamp;
131 f->dump_float("laggy_probability", laggy_probability);
132 f->dump_int("laggy_interval", laggy_interval);
133 f->dump_int("features", features);
134 f->dump_unsigned("old_weight", old_weight);
9f95a23c
TL
135 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
136 f->dump_int("dead_epoch", dead_epoch);
7c673cae
FG
137}
138
9f95a23c 139void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
7c673cae 140{
9f95a23c
TL
141 uint8_t v = 4;
142 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
143 v = 3;
144 }
145 ENCODE_START(v, 1, bl);
11fdf7f2 146 encode(down_stamp, bl);
f67539c2 147 __u32 lp = laggy_probability * float(0xfffffffful);
11fdf7f2
TL
148 encode(lp, bl);
149 encode(laggy_interval, bl);
150 encode(features, bl);
151 encode(old_weight, bl);
9f95a23c
TL
152 if (v >= 4) {
153 encode(last_purged_snaps_scrub, bl);
154 encode(dead_epoch, bl);
155 }
7c673cae
FG
156 ENCODE_FINISH(bl);
157}
158
9f95a23c 159void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 160{
9f95a23c 161 DECODE_START(4, bl);
11fdf7f2 162 decode(down_stamp, bl);
7c673cae 163 __u32 lp;
11fdf7f2 164 decode(lp, bl);
7c673cae 165 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 166 decode(laggy_interval, bl);
7c673cae 167 if (struct_v >= 2)
11fdf7f2 168 decode(features, bl);
7c673cae
FG
169 else
170 features = 0;
171 if (struct_v >= 3)
11fdf7f2 172 decode(old_weight, bl);
7c673cae
FG
173 else
174 old_weight = 0;
9f95a23c
TL
175 if (struct_v >= 4) {
176 decode(last_purged_snaps_scrub, bl);
177 decode(dead_epoch, bl);
178 } else {
179 dead_epoch = 0;
180 }
7c673cae
FG
181 DECODE_FINISH(bl);
182}
183
184void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
185{
186 o.push_back(new osd_xinfo_t);
187 o.push_back(new osd_xinfo_t);
188 o.back()->down_stamp = utime_t(2, 3);
189 o.back()->laggy_probability = .123;
190 o.back()->laggy_interval = 123456;
191 o.back()->old_weight = 0x7fff;
192}
193
194ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
195{
196 return out << "down_stamp " << xi.down_stamp
197 << " laggy_probability " << xi.laggy_probability
198 << " laggy_interval " << xi.laggy_interval
9f95a23c
TL
199 << " old_weight " << xi.old_weight
200 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
201 << " dead_epoch " << xi.dead_epoch;
7c673cae
FG
202}
203
204// ----------------------------------
205// OSDMap::Incremental
206
207int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
208{
209 int n = 0;
210 for (auto &weight : new_weight) {
211 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
212 n++; // marked out
213 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
214 n--; // marked in
215 }
216 return n;
217}
218
219int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
220{
221 int n = 0;
222 for (auto &state : new_state) { //
223 if (state.second & CEPH_OSD_UP) {
224 if (previous->is_up(state.first))
225 n++; // marked down
226 else
227 n--; // marked up
228 }
229 }
230 return n;
231}
232
233int OSDMap::Incremental::identify_osd(uuid_d u) const
234{
235 for (auto &uuid : new_uuid)
236 if (uuid.second == u)
237 return uuid.first;
238 return -1;
239}
240
f67539c2
TL
241int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
242 const OSDMap& osdmap)
7c673cae 243{
11fdf7f2 244 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
245
246 for (auto &new_pool : new_pools) {
247 if (!new_pool.second.tiers.empty()) {
248 pg_pool_t& base = new_pool.second;
249
11fdf7f2
TL
250 auto new_rem_it = new_removed_snaps.find(new_pool.first);
251
7c673cae
FG
252 for (const auto &tier_pool : base.tiers) {
253 const auto &r = new_pools.find(tier_pool);
254 pg_pool_t *tier = 0;
255 if (r == new_pools.end()) {
256 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
257 if (!orig) {
258 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
259 return -EIO;
260 }
261 tier = get_new_pool(tier_pool, orig);
262 } else {
263 tier = &r->second;
264 }
265 if (tier->tier_of != new_pool.first) {
266 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
267 return -EIO;
268 }
269
270 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
271 << tier_pool << dendl;
272 tier->snap_seq = base.snap_seq;
273 tier->snap_epoch = base.snap_epoch;
274 tier->snaps = base.snaps;
275 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
276 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
277 pg_pool_t::FLAG_POOL_SNAPS);
278
279 if (new_rem_it != new_removed_snaps.end()) {
280 new_removed_snaps[tier_pool] = new_rem_it->second;
281 }
f67539c2
TL
282
283 tier->application_metadata = base.application_metadata;
7c673cae
FG
284 }
285 }
286 }
287 return 0;
288}
289
28e407b8
AA
290// ----------------------------------
291// OSDMap
7c673cae
FG
292
293bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
294{
295 if (id >= 0)
296 return is_down(id);
297
298 if (down_cache &&
299 down_cache->count(id)) {
300 return true;
301 }
302
303 list<int> children;
304 crush->get_children(id, &children);
305 for (const auto &child : children) {
306 if (!subtree_is_down(child, down_cache)) {
307 return false;
308 }
309 }
310 if (down_cache) {
311 down_cache->insert(id);
312 }
313 return true;
314}
315
316bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
317{
318 // use a stack-local down_cache if we didn't get one from the
319 // caller. then at least this particular call will avoid duplicated
320 // work.
321 set<int> local_down_cache;
322 if (!down_cache) {
323 down_cache = &local_down_cache;
324 }
325
326 int current = id;
327 while (true) {
328 int type;
329 if (current >= 0) {
330 type = 0;
331 } else {
332 type = crush->get_bucket_type(current);
333 }
11fdf7f2 334 ceph_assert(type >= 0);
7c673cae
FG
335
336 if (!subtree_is_down(current, down_cache)) {
337 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
338 return false;
339 }
340
341 // is this a big enough subtree to be marked as down?
342 if (type >= subtree_type) {
343 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
344 return true;
345 }
346
347 int r = crush->get_immediate_parent_id(current, &current);
348 if (r < 0) {
349 return false;
350 }
351 }
352}
353
224ce89b
WB
354bool OSDMap::subtree_type_is_down(
355 CephContext *cct,
356 int id,
357 int subtree_type,
358 set<int> *down_in_osds,
359 set<int> *up_in_osds,
360 set<int> *subtree_up,
361 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
362{
363 if (id >= 0) {
364 bool is_down_ret = is_down(id);
365 if (!is_out(id)) {
366 if (is_down_ret) {
367 down_in_osds->insert(id);
368 } else {
369 up_in_osds->insert(id);
370 }
371 }
372 return is_down_ret;
373 }
374
375 if (subtree_type_down &&
376 (*subtree_type_down)[subtree_type].count(id)) {
377 return true;
378 }
379
380 list<int> children;
381 crush->get_children(id, &children);
382 for (const auto &child : children) {
224ce89b
WB
383 if (!subtree_type_is_down(
384 cct, child, crush->get_bucket_type(child),
385 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
386 subtree_up->insert(id);
387 return false;
388 }
389 }
390 if (subtree_type_down) {
391 (*subtree_type_down)[subtree_type].insert(id);
392 }
393 return true;
394}
395
// Encode this incremental in the oldest layout handled here (version 5).
// encode_classic() falls back to this when the peer lacks
// CEPH_FEATURE_PGID64.
//
// Several containers are written element by element with their keys
// narrowed to 32 bits, instead of using the generic container encoders;
// the "for encode(...)" comments mark which generic call each hand-rolled
// loop stands in for.  The statement order below defines the wire format
// and must not be changed.
void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
{
  using ceph::encode;
  __u16 v = 5;
  encode(v, bl);
  encode(fsid, bl);
  encode(epoch, bl);
  encode(modified, bl);
  // new_pool_max is written as a 32-bit value in this layout
  int32_t new_t = new_pool_max;
  encode(new_t, bl);
  encode(new_flags, bl);
  encode(fullmap, bl);
  encode(crush, bl);

  encode(new_max_osd, bl);
  // for encode(new_pools, bl);
  __u32 n = new_pools.size();
  encode(n, bl);
  for (const auto &new_pool : new_pools) {
    n = new_pool.first;   // pool id narrowed to 32 bits
    encode(n, bl);
    encode(new_pool.second, bl, 0);
  }
  // for encode(new_pool_names, bl);
  n = new_pool_names.size();
  encode(n, bl);

  for (const auto &new_pool_name : new_pool_names) {
    n = new_pool_name.first;
    encode(n, bl);
    encode(new_pool_name.second, bl);
  }
  // for encode(old_pools, bl);
  n = old_pools.size();
  encode(n, bl);
  for (auto &old_pool : old_pools) {
    n = old_pool;
    encode(n, bl);
  }
  encode(new_up_client, bl, 0);
  {
    // legacy is map<int32_t,uint8_t>
    map<int32_t, uint8_t> os;
    for (auto p : new_state) {
      // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
      // that an old client could not understand.
      // skip those!
      uint8_t s = p.second;   // keep only the low 8 bits
      if (p.second != 0 && s == 0)
        continue;             // nothing left after truncation
      os[p.first] = s;
    }
    uint32_t n = os.size();
    encode(n, bl);
    for (auto p : os) {
      encode(p.first, bl);
      encode(p.second, bl);
    }
  }
  encode(new_weight, bl);
  // for encode(new_pg_temp, bl);
  n = new_pg_temp.size();
  encode(n, bl);

  for (const auto &pg_temp : new_pg_temp) {
    // keys are written in the old pg id representation
    old_pg_t opg = pg_temp.first.get_old_pg();
    encode(opg, bl);
    encode(pg_temp.second, bl);
  }
}
466
// Encode this incremental in the pre-OSDMAP_ENC ("classic") layout:
// a base section tagged with version 6, followed by an extended section
// tagged with version 10.  There is no length/compat envelope here.
//
// Peers without CEPH_FEATURE_PGID64 get the older layout produced by
// encode_client_old() instead.  The statement order below defines the
// wire format and must not be changed.
void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGID64) == 0) {
    encode_client_old(bl);
    return;
  }

  // base
  __u16 v = 6;
  encode(v, bl);
  encode(fsid, bl);
  encode(epoch, bl);
  encode(modified, bl);
  encode(new_pool_max, bl);
  encode(new_flags, bl);
  encode(fullmap, bl);
  encode(crush, bl);

  encode(new_max_osd, bl);
  encode(new_pools, bl, features);
  encode(new_pool_names, bl);
  encode(old_pools, bl);
  encode(new_up_client, bl, features);
  {
    // state values are written as 8-bit values in this layout
    map<int32_t, uint8_t> os;
    for (auto p : new_state) {
      // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
      // that an old client could not understand.
      // skip those!
      uint8_t s = p.second;   // keep only the low 8 bits
      if (p.second != 0 && s == 0)
        continue;             // nothing left after truncation
      os[p.first] = s;
    }
    uint32_t n = os.size();
    encode(n, bl);
    for (auto p : os) {
      encode(p.first, bl);
      encode(p.second, bl);
    }
  }
  encode(new_weight, bl);
  encode(new_pg_temp, bl);

  // extended
  __u16 ev = 10;
  encode(ev, bl);
  encode(new_hb_back_up, bl, features);
  encode(new_up_thru, bl);
  encode(new_last_clean_interval, bl);
  encode(new_lost, bl);
  encode(new_blocklist, bl, features);
  encode(old_blocklist, bl, features);
  encode(new_up_cluster, bl, features);
  encode(cluster_snapshot, bl);
  encode(new_uuid, bl);
  encode(new_xinfo, bl, features);
  encode(new_hb_front_up, bl, features);
}
527
528template<class T>
9f95a23c 529static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
530{
531 uint32_t n = m.size();
532 encode(n, bl);
533 for (auto& i : m) {
534 encode(i.first, bl);
535 encode(i.second.legacy_addr(), bl, f);
536 }
537}
538
539template<class T>
9f95a23c 540static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
541{
542 uint32_t n = m.size();
543 encode(n, bl);
544 for (auto& i : m) {
545 if (i) {
546 encode(i->legacy_addr(), bl, f);
547 } else {
548 encode(entity_addr_t(), bl, f);
549 }
550 }
7c673cae
FG
551}
552
11fdf7f2
TL
553/* for a description of osdmap incremental versions, and when they were
554 * introduced, please refer to
555 * doc/dev/osd_internals/osdmap_versions.txt
556 */
// Encode this incremental into bl for a peer with the given feature bits.
//
// Layout of the modern encoding (used when CEPH_FEATURE_OSDMAP_ENC is
// present; otherwise encode_classic() is used):
//
//   wrapper (v8, compat 7)
//     client-usable section   (version chosen from features)
//     osd-only section        (version chosen from features and content)
//     inc_crc                 (4-byte hole, filled in at the end)
//     full_crc
//
// inc_crc covers everything from the start of this encoding up to the
// hole, plus everything after the hole.  On return inc_crc and have_crc
// are updated.
//
// The caller must set CEPH_FEATURE_RESERVED in features; it is asserted
// and then stripped before any feature-dependent decision is made.
void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
    encode_classic(bl, features);
    return;
  }

  // only a select set of callers should *ever* be encoding new
  // OSDMaps.  others should be passing around the canonical encoded
  // buffers from on high.  select out those callers by passing in an
  // "impossible" feature bit.
  ceph_assert(features & CEPH_FEATURE_RESERVED);
  features &= ~CEPH_FEATURE_RESERVED;

  // offsets into bl used to delimit the two crc'd regions
  size_t start_offset = bl.length();
  size_t tail_offset;
  size_t crc_offset;
  std::optional<ceph::buffer::list::contiguous_filler> crc_filler;

  // meta-encoding: how we include client-used and osd-specific data
  ENCODE_START(8, 7, bl);

  {
    // pick the client section version from the peer's release features
    uint8_t v = 8;
    if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
      v = 3;
    } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
      v = 5;
    } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
      v = 6;
    }
    ENCODE_START(v, 1, bl); // client-usable data
    encode(fsid, bl);
    encode(epoch, bl);
    encode(modified, bl);
    encode(new_pool_max, bl);
    encode(new_flags, bl);
    encode(fullmap, bl);
    encode(crush, bl);

    encode(new_max_osd, bl);
    encode(new_pools, bl, features);
    encode(new_pool_names, bl);
    encode(old_pools, bl);
    if (v >= 7) {
      encode(new_up_client, bl, features);
    } else {
      // older layout carries a single legacy address per entry
      encode_addrvec_map_as_addr(new_up_client, bl, features);
    }
    if (v >= 5) {
      encode(new_state, bl);
    } else {
      // older layout carries 8-bit state values
      map<int32_t, uint8_t> os;
      for (auto p : new_state) {
        // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
        // that an old client could not understand.
        // skip those!
        uint8_t s = p.second;   // keep only the low 8 bits
        if (p.second != 0 && s == 0)
          continue;             // nothing left after truncation
        os[p.first] = s;
      }
      uint32_t n = os.size();
      encode(n, bl);
      for (auto p : os) {
        encode(p.first, bl);
        encode(p.second, bl);
      }
    }
    encode(new_weight, bl);
    encode(new_pg_temp, bl);
    encode(new_primary_temp, bl);
    encode(new_primary_affinity, bl);
    encode(new_erasure_code_profiles, bl);
    encode(old_erasure_code_profiles, bl);
    if (v >= 4) {
      encode(new_pg_upmap, bl);
      encode(old_pg_upmap, bl);
      encode(new_pg_upmap_items, bl);
      encode(old_pg_upmap_items, bl);
    }
    if (v >= 6) {
      encode(new_removed_snaps, bl);
      encode(new_purged_snaps, bl);
    }
    if (v >= 8) {
      encode(new_last_up_change, bl);
      encode(new_last_in_change, bl);
    }
    ENCODE_FINISH(bl); // client-usable data
  }

  {
    uint8_t target_v = 9; // if bumping this, be aware of range_blocklist 11
    if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
      target_v = 2;
    } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
      target_v = 6;
    }
    // versions 10 and 11 are only used when the incremental actually
    // carries the corresponding content
    if (change_stretch_mode) {
      target_v = std::max((uint8_t)10, target_v);
    }
    if (!new_range_blocklist.empty() ||
        !old_range_blocklist.empty()) {
      target_v = std::max((uint8_t)11, target_v);
    }
    ENCODE_START(target_v, 1, bl); // extended, osd-only data
    if (target_v < 7) {
      encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
    } else {
      encode(new_hb_back_up, bl, features);
    }
    encode(new_up_thru, bl);
    encode(new_last_clean_interval, bl);
    encode(new_lost, bl);
    encode(new_blocklist, bl, features);
    encode(old_blocklist, bl, features);
    if (target_v < 7) {
      encode_addrvec_map_as_addr(new_up_cluster, bl, features);
    } else {
      encode(new_up_cluster, bl, features);
    }
    encode(cluster_snapshot, bl);
    encode(new_uuid, bl);
    encode(new_xinfo, bl, features);
    if (target_v < 7) {
      encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
    } else {
      encode(new_hb_front_up, bl, features);
    }
    encode(features, bl); // NOTE: features arg, not the member
    if (target_v >= 3) {
      encode(new_nearfull_ratio, bl);
      encode(new_full_ratio, bl);
      encode(new_backfillfull_ratio, bl);
    }
    // 5 was string-based new_require_min_compat_client
    if (target_v >= 6) {
      encode(new_require_min_compat_client, bl);
      encode(new_require_osd_release, bl);
    }
    if (target_v >= 8) {
      encode(new_crush_node_flags, bl);
    }
    if (target_v >= 9) {
      encode(new_device_class_flags, bl);
    }
    if (target_v >= 10) {
      encode(change_stretch_mode, bl);
      encode(new_stretch_bucket_count, bl);
      encode(new_degraded_stretch_mode, bl);
      encode(new_recovering_stretch_mode, bl);
      encode(new_stretch_mode_bucket, bl);
      encode(stretch_mode_enabled, bl);
    }
    if (target_v >= 11) {
      encode(new_range_blocklist, bl, features);
      encode(old_range_blocklist, bl, features);
    }
    ENCODE_FINISH(bl); // osd-only data
  }

  // reserve 4 bytes for inc_crc; the value is only known once the rest
  // of the encoding has been written
  crc_offset = bl.length();
  crc_filler = bl.append_hole(sizeof(uint32_t));
  tail_offset = bl.length();

  encode(full_crc, bl);

  ENCODE_FINISH(bl); // meta-encoding wrapper

  // fill in crc
  // front: [start_offset, crc_offset), tail: [tail_offset, end) -- the
  // hole itself is excluded
  ceph::buffer::list front;
  front.substr_of(bl, start_offset, crc_offset - start_offset);
  inc_crc = front.crc32c(-1);
  ceph::buffer::list tail;
  tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
  inc_crc = tail.crc32c(inc_crc);
  ceph_le32 crc_le;
  crc_le = inc_crc;
  crc_filler->copy_in(4u, (char*)&crc_le);
  have_crc = true;
}
740
// Decode an incremental written in a pre-envelope ("classic") layout,
// i.e. one whose leading 16-bit version is below 7.
//
// The layout consists of a base section versioned by v and, except for
// truncated v5 maps, an extended section versioned by ev.  Versions
// below 6 store several containers with 32-bit keys, so those are read
// element by element here.  The statement order below mirrors the wire
// format and must not be changed.
void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
{
  using ceph::decode;
  __u32 n, t;   // scratch: element count and 32-bit key
  // base
  __u16 v;
  decode(v, p);
  decode(fsid, p);
  decode(epoch, p);
  decode(modified, p);
  if (v == 4 || v == 5) {
    // 32-bit new_pool_max in these versions
    decode(n, p);
    new_pool_max = n;
  } else if (v >= 6)
    decode(new_pool_max, p);
  decode(new_flags, p);
  decode(fullmap, p);
  decode(crush, p);

  decode(new_max_osd, p);
  if (v < 6) {
    new_pools.clear();
    decode(n, p);
    while (n--) {
      decode(t, p);
      decode(new_pools[t], p);
    }
  } else {
    decode(new_pools, p);
  }
  if (v == 5) {
    new_pool_names.clear();
    decode(n, p);
    while (n--) {
      decode(t, p);
      decode(new_pool_names[t], p);
    }
  } else if (v >= 6) {
    decode(new_pool_names, p);
  }
  // for v < 5, new_pool_names is read from the extended section below
  if (v < 6) {
    old_pools.clear();
    decode(n, p);
    while (n--) {
      decode(t, p);
      old_pools.insert(t);
    }
  } else {
    decode(old_pools, p);
  }
  decode(new_up_client, p);
  {
    // state values are 8-bit on the wire in this layout
    map<int32_t,uint8_t> ns;
    decode(ns, p);
    for (auto q : ns) {
      new_state[q.first] = q.second;
    }
  }
  decode(new_weight, p);

  if (v < 6) {
    new_pg_temp.clear();
    decode(n, p);
    while (n--) {
      // keys are stored in the old pg id representation
      old_pg_t opg;
      ceph::decode_raw(opg, p);
      decode(new_pg_temp[pg_t(opg)], p);
    }
  } else {
    decode(new_pg_temp, p);
  }

  // decode short map, too.
  // (a v5 encoding may end here, without an extended section)
  if (v == 5 && p.end())
    return;

  // extended
  __u16 ev = 0;
  if (v >= 5)
    decode(ev, p);
  decode(new_hb_back_up, p);
  if (v < 5)
    decode(new_pool_names, p);
  decode(new_up_thru, p);
  decode(new_last_clean_interval, p);
  decode(new_lost, p);
  decode(new_blocklist, p);
  decode(old_blocklist, p);
  // the remaining fields were added one per extended version
  if (ev >= 6)
    decode(new_up_cluster, p);
  if (ev >= 7)
    decode(cluster_snapshot, p);
  if (ev >= 8)
    decode(new_uuid, p);
  if (ev >= 9)
    decode(new_xinfo, p);
  if (ev >= 10)
    decode(new_hb_front_up, p);
}
840
11fdf7f2
TL
841/* for a description of osdmap incremental versions, and when they were
842 * introduced, please refer to
843 * doc/dev/osd_internals/osdmap_versions.txt
844 */
// Decode an incremental from bl, accepting both the classic layout
// (leading version < 7, handed to decode_classic()) and the modern
// layout: a wrapper containing a client-usable section, an osd-only
// section and, for wrapper version >= 8, inc_crc and full_crc.
//
// When a crc is present it is recomputed over the bytes before and after
// the stored inc_crc and compared; a mismatch throws
// ceph::buffer::malformed_input.
void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
{
  using ceph::decode;
  /**
   * Older encodings of the Incremental had a single struct_v which
   * covered the whole encoding, and was prior to our modern
   * stuff which includes a compatv and a size. So if we see
   * a struct_v < 7, we must rewind to the beginning and use our
   * classic decoder.
   */
  size_t start_offset = bl.get_off();
  size_t tail_offset = 0;
  ceph::buffer::list crc_front, crc_tail;

  DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
  if (struct_v < 7) {
    bl.seek(start_offset);
    decode_classic(bl);
    // record which features the classic encoding implies
    encode_features = 0;
    if (struct_v >= 6)
      encode_features = CEPH_FEATURE_PGID64;
    else
      encode_features = 0;
    return;
  }
  {
    DECODE_START(8, bl); // client-usable data
    decode(fsid, bl);
    decode(epoch, bl);
    decode(modified, bl);
    decode(new_pool_max, bl);
    decode(new_flags, bl);
    decode(fullmap, bl);
    decode(crush, bl);

    decode(new_max_osd, bl);
    decode(new_pools, bl);
    decode(new_pool_names, bl);
    decode(old_pools, bl);
    decode(new_up_client, bl);
    if (struct_v >= 5) {
      decode(new_state, bl);
    } else {
      // state values are 8-bit on the wire before v5
      map<int32_t,uint8_t> ns;
      decode(ns, bl);
      for (auto q : ns) {
        new_state[q.first] = q.second;
      }
    }
    decode(new_weight, bl);
    decode(new_pg_temp, bl);
    decode(new_primary_temp, bl);
    if (struct_v >= 2)
      decode(new_primary_affinity, bl);
    else
      new_primary_affinity.clear();
    if (struct_v >= 3) {
      decode(new_erasure_code_profiles, bl);
      decode(old_erasure_code_profiles, bl);
    } else {
      new_erasure_code_profiles.clear();
      old_erasure_code_profiles.clear();
    }
    if (struct_v >= 4) {
      decode(new_pg_upmap, bl);
      decode(old_pg_upmap, bl);
      decode(new_pg_upmap_items, bl);
      decode(old_pg_upmap_items, bl);
    }
    if (struct_v >= 6) {
      decode(new_removed_snaps, bl);
      decode(new_purged_snaps, bl);
    }
    if (struct_v >= 8) {
      decode(new_last_up_change, bl);
      decode(new_last_in_change, bl);
    }
    DECODE_FINISH(bl); // client-usable data
  }

  {
    DECODE_START(10, bl); // extended, osd-only data
    decode(new_hb_back_up, bl);
    decode(new_up_thru, bl);
    decode(new_last_clean_interval, bl);
    decode(new_lost, bl);
    decode(new_blocklist, bl);
    decode(old_blocklist, bl);
    decode(new_up_cluster, bl);
    decode(cluster_snapshot, bl);
    decode(new_uuid, bl);
    decode(new_xinfo, bl);
    decode(new_hb_front_up, bl);
    if (struct_v >= 2)
      decode(encode_features, bl);
    else
      encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
    if (struct_v >= 3) {
      decode(new_nearfull_ratio, bl);
      decode(new_full_ratio, bl);
    } else {
      new_nearfull_ratio = -1;
      new_full_ratio = -1;
    }
    if (struct_v >= 4) {
      decode(new_backfillfull_ratio, bl);
    } else {
      new_backfillfull_ratio = -1;
    }
    if (struct_v == 5) {
      // v5 stored new_require_min_compat_client as a release name
      string r;
      decode(r, bl);
      if (r.length()) {
        new_require_min_compat_client = ceph_release_from_name(r);
      }
    }
    if (struct_v >= 6) {
      decode(new_require_min_compat_client, bl);
      decode(new_require_osd_release, bl);
    } else {
      // before v6 the required release is derived from new_flags
      if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
        // only for compat with post-kraken pre-luminous test clusters
        new_require_osd_release = ceph_release_t::luminous;
        new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
      } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
        new_require_osd_release = ceph_release_t::kraken;
      } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
        new_require_osd_release = ceph_release_t::jewel;
      } else {
        new_require_osd_release = ceph_release_t::unknown;
      }
    }
    if (struct_v >= 8) {
      decode(new_crush_node_flags, bl);
    }
    if (struct_v >= 9) {
      decode(new_device_class_flags, bl);
    }
    if (struct_v >= 10) {
      decode(change_stretch_mode, bl);
      decode(new_stretch_bucket_count, bl);
      decode(new_degraded_stretch_mode, bl);
      decode(new_recovering_stretch_mode, bl);
      decode(new_stretch_mode_bucket, bl);
      decode(stretch_mode_enabled, bl);
    }
    if (struct_v >= 11) {
      decode(new_range_blocklist, bl);
      decode(old_range_blocklist, bl);
    }
    DECODE_FINISH(bl); // osd-only data
  }

  // back in the wrapper's scope: struct_v is the wrapper version again
  if (struct_v >= 8) {
    have_crc = true;
    // everything from the start up to (not including) the stored crc
    crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
    decode(inc_crc, bl);
    tail_offset = bl.get_off();
    decode(full_crc, bl);
  } else {
    have_crc = false;
    full_crc = 0;
    inc_crc = 0;
  }

  DECODE_FINISH(bl); // wrapper

  if (have_crc) {
    // verify crc
    uint32_t actual = crc_front.crc32c(-1);
    if (tail_offset < bl.get_off()) {
      // fold in the bytes that follow the stored crc
      ceph::buffer::list tail;
      tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
      actual = tail.crc32c(actual);
    }
    if (inc_crc != actual) {
      ostringstream ss;
      ss << "bad crc, actual " << actual << " != expected " << inc_crc;
      string s = ss.str();
      throw ceph::buffer::malformed_input(s.c_str());
    }
  }
}
1028
1029void OSDMap::Incremental::dump(Formatter *f) const
1030{
1031 f->dump_int("epoch", epoch);
1032 f->dump_stream("fsid") << fsid;
1033 f->dump_stream("modified") << modified;
11fdf7f2
TL
1034 f->dump_stream("new_last_up_change") << new_last_up_change;
1035 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
1036 f->dump_int("new_pool_max", new_pool_max);
1037 f->dump_int("new_flags", new_flags);
1038 f->dump_float("new_full_ratio", new_full_ratio);
1039 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1040 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
f67539c2
TL
1041 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1042 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
7c673cae
FG
1043
1044 if (fullmap.length()) {
1045 f->open_object_section("full_map");
1046 OSDMap full;
9f95a23c 1047 ceph::buffer::list fbl = fullmap; // kludge around constness.
11fdf7f2 1048 auto p = fbl.cbegin();
7c673cae
FG
1049 full.decode(p);
1050 full.dump(f);
1051 f->close_section();
1052 }
1053 if (crush.length()) {
1054 f->open_object_section("crush");
1055 CrushWrapper c;
9f95a23c 1056 ceph::buffer::list tbl = crush; // kludge around constness.
11fdf7f2 1057 auto p = tbl.cbegin();
7c673cae
FG
1058 c.decode(p);
1059 c.dump(f);
1060 f->close_section();
1061 }
1062
1063 f->dump_int("new_max_osd", new_max_osd);
1064
1065 f->open_array_section("new_pools");
1066
1067 for (const auto &new_pool : new_pools) {
1068 f->open_object_section("pool");
1069 f->dump_int("pool", new_pool.first);
1070 new_pool.second.dump(f);
1071 f->close_section();
1072 }
1073 f->close_section();
1074 f->open_array_section("new_pool_names");
1075
1076 for (const auto &new_pool_name : new_pool_names) {
1077 f->open_object_section("pool_name");
1078 f->dump_int("pool", new_pool_name.first);
1079 f->dump_string("name", new_pool_name.second);
1080 f->close_section();
1081 }
1082 f->close_section();
1083 f->open_array_section("old_pools");
1084
1085 for (const auto &old_pool : old_pools)
1086 f->dump_int("pool", old_pool);
1087 f->close_section();
1088
1089 f->open_array_section("new_up_osds");
1090
1091 for (const auto &upclient : new_up_client) {
1092 f->open_object_section("osd");
1093 f->dump_int("osd", upclient.first);
11fdf7f2
TL
1094 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1095 f->dump_object("public_addrs", upclient.second);
1096 if (auto p = new_up_cluster.find(upclient.first);
1097 p != new_up_cluster.end()) {
1098 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1099 f->dump_object("cluster_addrs", p->second);
1100 }
1101 if (auto p = new_hb_back_up.find(upclient.first);
1102 p != new_hb_back_up.end()) {
1103 f->dump_object("heartbeat_back_addrs", p->second);
1104 }
1105 if (auto p = new_hb_front_up.find(upclient.first);
1106 p != new_hb_front_up.end()) {
1107 f->dump_object("heartbeat_front_addrs", p->second);
1108 }
7c673cae
FG
1109 f->close_section();
1110 }
1111 f->close_section();
1112
1113 f->open_array_section("new_weight");
1114
1115 for (const auto &weight : new_weight) {
1116 f->open_object_section("osd");
1117 f->dump_int("osd", weight.first);
1118 f->dump_int("weight", weight.second);
1119 f->close_section();
1120 }
1121 f->close_section();
1122
1123 f->open_array_section("osd_state_xor");
1124 for (const auto &ns : new_state) {
1125 f->open_object_section("osd");
1126 f->dump_int("osd", ns.first);
1127 set<string> st;
1128 calc_state_set(new_state.find(ns.first)->second, st);
1129 f->open_array_section("state_xor");
1130 for (auto &state : st)
1131 f->dump_string("state", state);
1132 f->close_section();
c07f9fc5 1133 f->close_section();
7c673cae
FG
1134 }
1135 f->close_section();
1136
1137 f->open_array_section("new_pg_temp");
1138
1139 for (const auto &pg_temp : new_pg_temp) {
1140 f->open_object_section("pg");
1141 f->dump_stream("pgid") << pg_temp.first;
1142 f->open_array_section("osds");
1143
1144 for (const auto &osd : pg_temp.second)
1145 f->dump_int("osd", osd);
1146 f->close_section();
1147 f->close_section();
1148 }
1149 f->close_section();
1150
1151 f->open_array_section("primary_temp");
1152
1153 for (const auto &primary_temp : new_primary_temp) {
1154 f->dump_stream("pgid") << primary_temp.first;
1155 f->dump_int("osd", primary_temp.second);
1156 }
1157 f->close_section(); // primary_temp
1158
1159 f->open_array_section("new_pg_upmap");
1160 for (auto& i : new_pg_upmap) {
1161 f->open_object_section("mapping");
1162 f->dump_stream("pgid") << i.first;
1163 f->open_array_section("osds");
1164 for (auto osd : i.second) {
1165 f->dump_int("osd", osd);
1166 }
1167 f->close_section();
1168 f->close_section();
1169 }
1170 f->close_section();
1171 f->open_array_section("old_pg_upmap");
1172 for (auto& i : old_pg_upmap) {
1173 f->dump_stream("pgid") << i;
1174 }
1175 f->close_section();
1176
1177 f->open_array_section("new_pg_upmap_items");
1178 for (auto& i : new_pg_upmap_items) {
1179 f->open_object_section("mapping");
1180 f->dump_stream("pgid") << i.first;
1181 f->open_array_section("mappings");
1182 for (auto& p : i.second) {
1183 f->open_object_section("mapping");
1184 f->dump_int("from", p.first);
1185 f->dump_int("to", p.second);
1186 f->close_section();
1187 }
1188 f->close_section();
1189 f->close_section();
1190 }
1191 f->close_section();
1192 f->open_array_section("old_pg_upmap_items");
1193 for (auto& i : old_pg_upmap_items) {
1194 f->dump_stream("pgid") << i;
1195 }
1196 f->close_section();
1197
1198 f->open_array_section("new_up_thru");
1199
1200 for (const auto &up_thru : new_up_thru) {
1201 f->open_object_section("osd");
1202 f->dump_int("osd", up_thru.first);
1203 f->dump_int("up_thru", up_thru.second);
1204 f->close_section();
1205 }
1206 f->close_section();
1207
1208 f->open_array_section("new_lost");
1209
1210 for (const auto &lost : new_lost) {
1211 f->open_object_section("osd");
1212 f->dump_int("osd", lost.first);
1213 f->dump_int("epoch_lost", lost.second);
1214 f->close_section();
1215 }
1216 f->close_section();
1217
1218 f->open_array_section("new_last_clean_interval");
1219
1220 for (const auto &last_clean_interval : new_last_clean_interval) {
1221 f->open_object_section("osd");
1222 f->dump_int("osd", last_clean_interval.first);
1223 f->dump_int("first", last_clean_interval.second.first);
1224 f->dump_int("last", last_clean_interval.second.second);
1225 f->close_section();
1226 }
1227 f->close_section();
1228
f67539c2
TL
1229 f->open_array_section("new_blocklist");
1230 for (const auto &blist : new_blocklist) {
7c673cae
FG
1231 stringstream ss;
1232 ss << blist.first;
1233 f->dump_stream(ss.str().c_str()) << blist.second;
1234 }
1235 f->close_section();
f67539c2
TL
1236 f->open_array_section("old_blocklist");
1237 for (const auto &blist : old_blocklist)
7c673cae
FG
1238 f->dump_stream("addr") << blist;
1239 f->close_section();
33c7a0ef
TL
1240 f->open_array_section("new_range_blocklist");
1241 for (const auto &blist : new_range_blocklist) {
1242 stringstream ss;
1243 ss << blist.first;
1244 f->dump_stream(ss.str().c_str()) << blist.second;
1245 }
1246 f->close_section();
1247 f->open_array_section("old_range_blocklist");
1248 for (const auto &blist : old_range_blocklist)
1249 f->dump_stream("addr") << blist;
1250 f->close_section();
7c673cae
FG
1251
1252 f->open_array_section("new_xinfo");
1253 for (const auto &xinfo : new_xinfo) {
1254 f->open_object_section("xinfo");
1255 f->dump_int("osd", xinfo.first);
1256 xinfo.second.dump(f);
1257 f->close_section();
1258 }
1259 f->close_section();
1260
1261 if (cluster_snapshot.size())
1262 f->dump_string("cluster_snapshot", cluster_snapshot);
1263
1264 f->open_array_section("new_uuid");
1265 for (const auto &uuid : new_uuid) {
1266 f->open_object_section("osd");
1267 f->dump_int("osd", uuid.first);
1268 f->dump_stream("uuid") << uuid.second;
1269 f->close_section();
1270 }
1271 f->close_section();
1272
1273 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1274 f->open_array_section("old_erasure_code_profiles");
1275 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
9f95a23c 1276 f->dump_string("old", erasure_code_profile);
7c673cae
FG
1277 }
1278 f->close_section();
11fdf7f2
TL
1279
1280 f->open_array_section("new_removed_snaps");
1281 for (auto& p : new_removed_snaps) {
1282 f->open_object_section("pool");
1283 f->dump_int("pool", p.first);
1284 f->open_array_section("snaps");
1285 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1286 f->open_object_section("interval");
1287 f->dump_unsigned("begin", q.get_start());
1288 f->dump_unsigned("length", q.get_len());
1289 f->close_section();
1290 }
1291 f->close_section();
1292 f->close_section();
1293 }
1294 f->close_section();
1295 f->open_array_section("new_purged_snaps");
1296 for (auto& p : new_purged_snaps) {
1297 f->open_object_section("pool");
1298 f->dump_int("pool", p.first);
1299 f->open_array_section("snaps");
1300 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1301 f->open_object_section("interval");
1302 f->dump_unsigned("begin", q.get_start());
1303 f->dump_unsigned("length", q.get_len());
1304 f->close_section();
1305 }
1306 f->close_section();
1307 f->close_section();
1308 }
81eedcae
TL
1309 f->open_array_section("new_crush_node_flags");
1310 for (auto& i : new_crush_node_flags) {
1311 f->open_object_section("node");
1312 f->dump_int("id", i.first);
1313 set<string> st;
1314 calc_state_set(i.second, st);
1315 for (auto& j : st) {
1316 f->dump_string("flag", j);
1317 }
1318 f->close_section();
1319 }
1320 f->close_section();
1321 f->open_array_section("new_device_class_flags");
1322 for (auto& i : new_device_class_flags) {
1323 f->open_object_section("device_class");
1324 f->dump_int("id", i.first);
1325 set<string> st;
1326 calc_state_set(i.second, st);
1327 for (auto& j : st) {
1328 f->dump_string("flag", j);
1329 }
1330 f->close_section();
1331 }
1332 f->close_section();
f67539c2
TL
1333 f->open_object_section("stretch_mode");
1334 {
1335 f->dump_bool("change_stretch_mode", change_stretch_mode);
1336 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1337 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1338 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1339 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1340 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1341 }
1342 f->close_section();
11fdf7f2 1343 f->close_section();
7c673cae
FG
1344}
1345
1346void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1347{
1348 o.push_back(new Incremental);
1349}
1350
1351// ----------------------------------
1352// OSDMap
1353
1354void OSDMap::set_epoch(epoch_t e)
1355{
1356 epoch = e;
1357 for (auto &pool : pools)
1358 pool.second.last_change = e;
1359}
1360
33c7a0ef
TL
1361OSDMap::range_bits::range_bits() : ipv6(false) {
1362 memset(&bits, 0, sizeof(bits));
1363}
1364
1365OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) {
1366 memset(&bits, 0, sizeof(bits));
1367 parse(addr);
1368}
1369
1370void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr,
1371 uint64_t *upper, uint64_t *lower)
1372{
1373 *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 |
1374 ((uint64_t)(ntohl(*(uint32_t*)(&addr[4]))));
1375 *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 |
1376 ((uint64_t)(ntohl(*(uint32_t*)(&addr[12]))));
1377}
1378
1379void OSDMap::range_bits::parse(const entity_addr_t& addr) {
1380 // parse it into meaningful data
1381 if (addr.is_ipv6()) {
1382 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr,
1383 &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits);
1384 int32_t lower_shift = std::min(128-
1385 static_cast<int32_t>(addr.get_nonce()), 64);
1386 int32_t upper_shift = std::max(64- //(128-b.first.get_nonce())-64
1387 static_cast<int32_t>(addr.get_nonce()), 0);
1388
1389 auto get_mask = [](int32_t shift) -> uint64_t {
1390 if (shift >= 0 && shift < 64) {
1391 return UINT64_MAX << shift;
1392 }
1393 return 0;
1394 };
1395
1396 bits.ipv6.lower_mask = get_mask(lower_shift);
1397 bits.ipv6.upper_mask = get_mask(upper_shift);
1398 ipv6 = true;
1399 } else if (addr.is_ipv4()) {
1400 bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr);
1401 if (addr.get_nonce() > 0) {
1402 bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce());
1403 } else {
1404 bits.ipv4.mask = 0;
1405 }
1406 } else {
1407 // uh...
1408 }
1409}
1410
1411bool OSDMap::range_bits::matches(const entity_addr_t& addr) const {
1412 if (addr.is_ipv4() && !ipv6) {
1413 return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) ==
1414 (bits.ipv4.ip_32_bits & bits.ipv4.mask));
1415 } else if (addr.is_ipv6() && ipv6) {
1416 uint64_t upper_64, lower_64;
1417 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64);
1418 return (((upper_64 & bits.ipv6.upper_mask) ==
1419 (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) &&
1420 ((lower_64 & bits.ipv6.lower_mask) ==
1421 (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask)));
1422 }
1423 return false;
1424}
1425
1426bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const
7c673cae 1427{
33c7a0ef
TL
1428 if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl;
1429 if (blocklist.empty() && range_blocklist.empty()) {
1430 if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl;
7c673cae 1431 return false;
11fdf7f2
TL
1432 }
1433
f67539c2 1434 // all blocklist entries are type ANY for nautilus+
11fdf7f2
TL
1435 // FIXME: avoid this copy!
1436 entity_addr_t a = orig;
9f95a23c 1437 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1438 a.set_type(entity_addr_t::TYPE_LEGACY);
1439 } else {
1440 a.set_type(entity_addr_t::TYPE_ANY);
1441 }
7c673cae
FG
1442
1443 // this specific instance?
f67539c2 1444 if (blocklist.count(a)) {
33c7a0ef 1445 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
7c673cae 1446 return true;
11fdf7f2 1447 }
7c673cae 1448
f67539c2 1449 // is entire ip blocklisted?
7c673cae 1450 if (a.is_ip()) {
11fdf7f2
TL
1451 a.set_port(0);
1452 a.set_nonce(0);
f67539c2 1453 if (blocklist.count(a)) {
33c7a0ef 1454 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
11fdf7f2
TL
1455 return true;
1456 }
1457 }
1458
33c7a0ef
TL
1459 // is it in a blocklisted range?
1460 for (const auto& i : calculated_ranges) {
1461 bool blocked = i.second.matches(a);
1462 if (blocked) {
1463 if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl;
1464 return true;
1465 }
1466 }
1467
1468 if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl;
11fdf7f2
TL
1469 return false;
1470}
1471
33c7a0ef 1472bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const
11fdf7f2 1473{
33c7a0ef 1474 if (blocklist.empty() && range_blocklist.empty())
11fdf7f2
TL
1475 return false;
1476
1477 for (auto& a : av.v) {
33c7a0ef 1478 if (is_blocklisted(a, cct)) {
7c673cae
FG
1479 return true;
1480 }
1481 }
1482
1483 return false;
1484}
1485
33c7a0ef
TL
1486void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl,
1487 std::list<std::pair<entity_addr_t,utime_t> > *rl) const
7c673cae 1488{
f67539c2 1489 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
33c7a0ef
TL
1490 std::copy(range_blocklist.begin(), range_blocklist.end(),
1491 std::back_inserter(*rl));
7c673cae
FG
1492}
1493
33c7a0ef
TL
1494void OSDMap::get_blocklist(std::set<entity_addr_t> *bl,
1495 std::set<entity_addr_t> *rl) const
31f18b77 1496{
f67539c2 1497 for (const auto &i : blocklist) {
31f18b77
FG
1498 bl->insert(i.first);
1499 }
33c7a0ef
TL
1500 for (const auto &i : range_blocklist) {
1501 rl->insert(i.first);
1502 }
31f18b77
FG
1503}
1504
7c673cae
FG
1505void OSDMap::set_max_osd(int m)
1506{
7c673cae 1507 max_osd = m;
f67539c2
TL
1508 osd_state.resize(max_osd, 0);
1509 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1510 osd_info.resize(max_osd);
1511 osd_xinfo.resize(max_osd);
1512 osd_addrs->client_addrs.resize(max_osd);
1513 osd_addrs->cluster_addrs.resize(max_osd);
1514 osd_addrs->hb_back_addrs.resize(max_osd);
1515 osd_addrs->hb_front_addrs.resize(max_osd);
1516 osd_uuid->resize(max_osd);
7c673cae 1517 if (osd_primary_affinity)
f67539c2 1518 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
7c673cae
FG
1519
1520 calc_num_osds();
1521}
1522
1523int OSDMap::calc_num_osds()
1524{
1525 num_osd = 0;
1526 num_up_osd = 0;
1527 num_in_osd = 0;
1528 for (int i=0; i<max_osd; i++) {
1529 if (osd_state[i] & CEPH_OSD_EXISTS) {
1530 ++num_osd;
1531 if (osd_state[i] & CEPH_OSD_UP) {
1532 ++num_up_osd;
1533 }
1534 if (get_weight(i) != CEPH_OSD_OUT) {
1535 ++num_in_osd;
1536 }
1537 }
1538 }
1539 return num_osd;
1540}
1541
3efd9988
FG
1542void OSDMap::get_full_pools(CephContext *cct,
1543 set<int64_t> *full,
1544 set<int64_t> *backfillfull,
1545 set<int64_t> *nearfull) const
7c673cae 1546{
11fdf7f2
TL
1547 ceph_assert(full);
1548 ceph_assert(backfillfull);
1549 ceph_assert(nearfull);
3efd9988
FG
1550 full->clear();
1551 backfillfull->clear();
1552 nearfull->clear();
1553
1554 vector<int> full_osds;
1555 vector<int> backfillfull_osds;
1556 vector<int> nearfull_osds;
7c673cae
FG
1557 for (int i = 0; i < max_osd; ++i) {
1558 if (exists(i) && is_up(i) && is_in(i)) {
1559 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1560 full_osds.push_back(i);
7c673cae 1561 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1562 backfillfull_osds.push_back(i);
7c673cae 1563 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1564 nearfull_osds.push_back(i);
7c673cae
FG
1565 }
1566 }
3efd9988
FG
1567
1568 for (auto i: full_osds) {
1569 get_pool_ids_by_osd(cct, i, full);
1570 }
1571 for (auto i: backfillfull_osds) {
1572 get_pool_ids_by_osd(cct, i, backfillfull);
1573 }
1574 for (auto i: nearfull_osds) {
1575 get_pool_ids_by_osd(cct, i, nearfull);
1576 }
7c673cae
FG
1577}
1578
31f18b77
FG
1579void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1580 set<int> *nearfull) const
1581{
1582 full->clear();
1583 backfill->clear();
1584 nearfull->clear();
1585 for (int i = 0; i < max_osd; ++i) {
1586 if (exists(i) && is_up(i) && is_in(i)) {
1587 if (osd_state[i] & CEPH_OSD_FULL)
1588 full->emplace(i);
1589 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1590 backfill->emplace(i);
1591 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1592 nearfull->emplace(i);
1593 }
1594 }
1595}
1596
7c673cae
FG
1597void OSDMap::get_all_osds(set<int32_t>& ls) const
1598{
1599 for (int i=0; i<max_osd; i++)
1600 if (exists(i))
1601 ls.insert(i);
1602}
1603
1604void OSDMap::get_up_osds(set<int32_t>& ls) const
1605{
1606 for (int i = 0; i < max_osd; i++) {
1607 if (is_up(i))
1608 ls.insert(i);
1609 }
1610}
1611
81eedcae 1612void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1613{
1614 for (int i = 0; i < max_osd; i++) {
81eedcae 1615 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1616 ls.insert(i);
1617 }
1618}
1619
11fdf7f2
TL
1620void OSDMap::get_flag_set(set<string> *flagset) const
1621{
1622 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1623 if (flags & (1<<i)) {
1624 flagset->insert(get_flag_string(flags & (1<<i)));
1625 }
1626 }
1627}
1628
7c673cae
FG
1629void OSDMap::calc_state_set(int state, set<string>& st)
1630{
1631 unsigned t = state;
1632 for (unsigned s = 1; t; s <<= 1) {
1633 if (t & s) {
1634 t &= ~s;
1635 st.insert(ceph_osd_state_name(s));
1636 }
1637 }
1638}
1639
1640void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1641{
1642 float max = 0;
1643 for (const auto &weight : weights) {
1644 if (weight.second > max)
1645 max = weight.second;
1646 }
1647
1648 for (const auto &weight : weights) {
1649 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1650 }
1651}
1652
1653int OSDMap::identify_osd(const entity_addr_t& addr) const
1654{
1655 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1656 if (exists(i) && (get_addrs(i).contains(addr) ||
1657 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1658 return i;
1659 return -1;
1660}
1661
1662int OSDMap::identify_osd(const uuid_d& u) const
1663{
1664 for (int i=0; i<max_osd; i++)
1665 if (exists(i) && get_uuid(i) == u)
1666 return i;
1667 return -1;
1668}
1669
1670int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1671{
1672 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1673 if (exists(i) && (get_addrs(i).contains(addr) ||
1674 get_cluster_addrs(i).contains(addr) ||
1675 get_hb_back_addrs(i).contains(addr) ||
1676 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1677 return i;
1678 return -1;
1679}
1680
1681int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1682{
1683 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1684 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1685 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1686 return i;
1687 return -1;
1688}
1689
1690
1691uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1692{
1693 uint64_t features = 0; // things we actually have
1694 uint64_t mask = 0; // things we could have
1695
1696 if (crush->has_nondefault_tunables())
1697 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1698 if (crush->has_nondefault_tunables2())
1699 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1700 if (crush->has_nondefault_tunables3())
1701 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1702 if (crush->has_v4_buckets())
1703 features |= CEPH_FEATURE_CRUSH_V4;
1704 if (crush->has_nondefault_tunables5())
1705 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1706 if (crush->has_incompat_choose_args()) {
1707 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1708 }
7c673cae
FG
1709 mask |= CEPH_FEATURES_CRUSH;
1710
1711 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1712 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1713 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1714
1715 for (auto &pool: pools) {
1716 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1717 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1718 }
7c673cae
FG
1719 if (!pool.second.tiers.empty() ||
1720 pool.second.is_tier()) {
1721 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1722 }
20effc67 1723 int ruleid = pool.second.get_crush_rule();
7c673cae
FG
1724 if (ruleid >= 0) {
1725 if (crush->is_v2_rule(ruleid))
1726 features |= CEPH_FEATURE_CRUSH_V2;
1727 if (crush->is_v3_rule(ruleid))
1728 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1729 if (crush->is_v5_rule(ruleid))
1730 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1731 }
1732 }
7c673cae 1733 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1734
1735 if (osd_primary_affinity) {
1736 for (int i = 0; i < max_osd; ++i) {
1737 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1738 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1739 break;
1740 }
1741 }
1742 }
1743 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1744
1745 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1746 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
9f95a23c 1747 if (require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
1748 features |= jewel_features;
1749 }
1750 mask |= jewel_features;
1751
1752 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1753 | CEPH_FEATURE_MSG_ADDR2;
9f95a23c 1754 if (require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
1755 features |= kraken_features;
1756 }
1757 mask |= kraken_features;
f67539c2
TL
1758
1759 if (stretch_mode_enabled) {
1760 features |= CEPH_FEATUREMASK_STRETCH_MODE;
1761 mask |= CEPH_FEATUREMASK_STRETCH_MODE;
1762 }
7c673cae
FG
1763 }
1764
9f95a23c 1765 if (require_min_compat_client >= ceph_release_t::nautilus) {
11fdf7f2
TL
1766 // if min_compat_client is >= nautilus, require v2 cephx signatures
1767 // from everyone
1768 features |= CEPH_FEATUREMASK_CEPHX_V2;
9f95a23c 1769 } else if (require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
1770 entity_type == CEPH_ENTITY_TYPE_OSD) {
1771 // if osds are >= nautilus, at least require the signatures from them
1772 features |= CEPH_FEATUREMASK_CEPHX_V2;
1773 }
1774 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1775
7c673cae
FG
1776 if (pmask)
1777 *pmask = mask;
1778 return features;
1779}
1780
9f95a23c 1781ceph_release_t OSDMap::get_min_compat_client() const
7c673cae
FG
1782{
1783 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1784
1785 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77 1786 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
9f95a23c 1787 return ceph_release_t::luminous; // v12.2.0
7c673cae
FG
1788 }
1789 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
9f95a23c 1790 return ceph_release_t::jewel; // v10.2.0
7c673cae
FG
1791 }
1792 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
9f95a23c 1793 return ceph_release_t::hammer; // v0.94.0
7c673cae
FG
1794 }
1795 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1796 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1797 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
9f95a23c 1798 return ceph_release_t::firefly; // v0.80.0
7c673cae
FG
1799 }
1800 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1801 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
9f95a23c 1802 return ceph_release_t::dumpling; // v0.67.0
7c673cae
FG
1803 }
1804 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
9f95a23c 1805 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae 1806 }
9f95a23c 1807 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae
FG
1808}
1809
9f95a23c 1810ceph_release_t OSDMap::get_require_min_compat_client() const
11fdf7f2
TL
1811{
1812 return require_min_compat_client;
1813}
1814
7c673cae
FG
1815void OSDMap::_calc_up_osd_features()
1816{
1817 bool first = true;
1818 cached_up_osd_features = 0;
1819 for (int osd = 0; osd < max_osd; ++osd) {
1820 if (!is_up(osd))
1821 continue;
1822 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1823 if (xi.features == 0)
1824 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1825 if (first) {
1826 cached_up_osd_features = xi.features;
1827 first = false;
1828 } else {
1829 cached_up_osd_features &= xi.features;
1830 }
1831 }
1832}
1833
1834uint64_t OSDMap::get_up_osd_features() const
1835{
1836 return cached_up_osd_features;
1837}
1838
1839void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1840{
11fdf7f2 1841 using ceph::encode;
7c673cae
FG
1842 if (o->epoch == n->epoch)
1843 return;
1844
1845 int diff = 0;
1846
1847 // do addrs match?
1848 if (o->max_osd != n->max_osd)
1849 diff++;
1850 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1851 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1852 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1853 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1854 else
1855 diff++;
11fdf7f2
TL
1856 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1857 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1858 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1859 else
1860 diff++;
11fdf7f2
TL
1861 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1862 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1863 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1864 else
1865 diff++;
11fdf7f2
TL
1866 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1867 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1868 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1869 else
1870 diff++;
1871 }
1872 if (diff == 0) {
1873 // zoinks, no differences at all!
1874 n->osd_addrs = o->osd_addrs;
1875 }
1876
1877 // does crush match?
9f95a23c 1878 ceph::buffer::list oc, nc;
11fdf7f2
TL
1879 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1880 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1881 if (oc.contents_equal(nc)) {
1882 n->crush = o->crush;
1883 }
1884
1885 // does pg_temp match?
31f18b77
FG
1886 if (*o->pg_temp == *n->pg_temp)
1887 n->pg_temp = o->pg_temp;
7c673cae
FG
1888
1889 // does primary_temp match?
1890 if (o->primary_temp->size() == n->primary_temp->size()) {
1891 if (*o->primary_temp == *n->primary_temp)
1892 n->primary_temp = o->primary_temp;
1893 }
1894
1895 // do uuids match?
1896 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1897 *o->osd_uuid == *n->osd_uuid)
1898 n->osd_uuid = o->osd_uuid;
1899}
1900
1901void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1902 const OSDMap& oldmap,
1903 const OSDMap& nextmap,
1904 Incremental *pending_inc)
7c673cae
FG
1905{
1906 ldout(cct, 10) << __func__ << dendl;
7c673cae 1907
11fdf7f2 1908 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1909 // if pool does not exist, remove any existing pg_temps associated with
1910 // it. we don't care about pg_temps on the pending_inc either; if there
1911 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1912 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1913 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1914 << " for nonexistent pool " << pg.first.pool() << dendl;
1915 pending_inc->new_pg_temp[pg.first].clear();
1916 continue;
1917 }
20effc67
TL
1918 if (!nextmap.pg_exists(pg.first)) {
1919 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1920 << " for nonexistent pg " << dendl;
1921 pending_inc->new_pg_temp[pg.first].clear();
1922 continue;
1923 }
7c673cae
FG
1924 // all osds down?
1925 unsigned num_up = 0;
1926 for (auto o : pg.second) {
11fdf7f2 1927 if (!nextmap.is_down(o)) {
7c673cae
FG
1928 ++num_up;
1929 break;
1930 }
1931 }
1932 if (num_up == 0) {
1933 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1934 << " with all down osds" << pg.second << dendl;
1935 pending_inc->new_pg_temp[pg.first].clear();
1936 continue;
1937 }
1938 // redundant pg_temp?
1939 vector<int> raw_up;
1940 int primary;
11fdf7f2 1941 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1942 bool remove = false;
11fdf7f2 1943 if (raw_up == pg.second) {
7c673cae
FG
1944 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1945 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1946 remove = true;
1947 }
1948 // oversized pg_temp?
11fdf7f2 1949 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1950 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1951 << pg.second << " exceeds pool size" << dendl;
1952 remove = true;
1953 }
1954 if (remove) {
11fdf7f2 1955 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1956 pending_inc->new_pg_temp[pg.first].clear();
1957 else
1958 pending_inc->new_pg_temp.erase(pg.first);
1959 }
1960 }
1961
11fdf7f2 1962 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1963 // primary down?
11fdf7f2 1964 if (nextmap.is_down(pg.second)) {
7c673cae
FG
1965 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1966 << " to down " << pg.second << dendl;
1967 pending_inc->new_primary_temp[pg.first] = -1;
1968 continue;
1969 }
1970 // redundant primary_temp?
1971 vector<int> real_up, templess_up;
1972 int real_primary, templess_primary;
1973 pg_t pgid = pg.first;
11fdf7f2
TL
1974 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1975 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
1976 if (real_primary == templess_primary){
1977 ldout(cct, 10) << __func__ << " removing primary_temp "
1978 << pgid << " -> " << real_primary
1979 << " (unnecessary/redundant)" << dendl;
11fdf7f2 1980 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
1981 pending_inc->new_primary_temp[pgid] = -1;
1982 else
1983 pending_inc->new_primary_temp.erase(pgid);
1984 }
1985 }
1986}
1987
494da23a 1988void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 1989{
494da23a
TL
1990 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1991 for (auto& p : pg_upmap)
1992 upmap_pgs->push_back(p.first);
1993 for (auto& p : pg_upmap_items)
1994 upmap_pgs->push_back(p.first);
1995}
94b18763 1996
494da23a
TL
1997bool OSDMap::check_pg_upmaps(
1998 CephContext *cct,
1999 const vector<pg_t>& to_check,
2000 vector<pg_t> *to_cancel,
2001 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
2002{
2003 bool any_change = false;
2004 map<int, map<int, float>> rule_weight_map;
28e407b8 2005 for (auto& pg : to_check) {
494da23a 2006 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
2007 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
2008 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
2009 << dendl;
494da23a 2010 to_cancel->push_back(pg);
11fdf7f2
TL
2011 continue;
2012 }
2013 if (pi->is_pending_merge(pg, nullptr)) {
2014 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
2015 << dendl;
494da23a 2016 to_cancel->push_back(pg);
94b18763
FG
2017 continue;
2018 }
494da23a
TL
2019 vector<int> raw, up;
2020 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
2021 auto crush_rule = get_pg_pool_crush_rule(pg);
2022 auto r = crush->verify_upmap(cct,
2023 crush_rule,
2024 get_pg_pool_size(pg),
2025 up);
a8e16298
TL
2026 if (r < 0) {
2027 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
2028 << " returning " << r
2029 << dendl;
494da23a 2030 to_cancel->push_back(pg);
a8e16298
TL
2031 continue;
2032 }
2033 // below we check against crush-topology changing..
28e407b8
AA
2034 map<int, float> weight_map;
2035 auto it = rule_weight_map.find(crush_rule);
2036 if (it == rule_weight_map.end()) {
494da23a 2037 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
2038 if (r < 0) {
2039 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
2040 << "crush_rule " << crush_rule
2041 << dendl;
28e407b8
AA
2042 continue;
2043 }
2044 rule_weight_map[crush_rule] = weight_map;
2045 } else {
2046 weight_map = it->second;
2047 }
28e407b8 2048 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 2049 << " weight_map " << weight_map
94b18763 2050 << dendl;
a8e16298 2051 for (auto osd : up) {
28e407b8
AA
2052 auto it = weight_map.find(osd);
2053 if (it == weight_map.end()) {
92f5a8d4
TL
2054 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
2055 << "been moved out of the specific crush-tree"
2056 << dendl;
494da23a 2057 to_cancel->push_back(pg);
94b18763
FG
2058 break;
2059 }
494da23a 2060 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8 2061 if (adjusted_weight == 0) {
92f5a8d4
TL
2062 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
2063 << " is out/crush-out"
2064 << dendl;
494da23a 2065 to_cancel->push_back(pg);
94b18763
FG
2066 break;
2067 }
2068 }
eafe8130
TL
2069 if (!to_cancel->empty() && to_cancel->back() == pg)
2070 continue;
2071 // okay, upmap is valid
2072 // continue to check if it is still necessary
2073 auto i = pg_upmap.find(pg);
a4b75251
TL
2074 if (i != pg_upmap.end()) {
2075 if (i->second == raw) {
2076 ldout(cct, 10) << "removing redundant pg_upmap " << i->first << " "
2077 << i->second << dendl;
2078 to_cancel->push_back(pg);
2079 continue;
2080 }
2081 if ((int)i->second.size() != get_pg_pool_size(pg)) {
2082 ldout(cct, 10) << "removing pg_upmap " << i->first << " "
2083 << i->second << " != pool size " << get_pg_pool_size(pg)
2084 << dendl;
2085 to_cancel->push_back(pg);
2086 continue;
2087 }
eafe8130
TL
2088 }
2089 auto j = pg_upmap_items.find(pg);
2090 if (j != pg_upmap_items.end()) {
2091 mempool::osdmap::vector<pair<int,int>> newmap;
2092 for (auto& p : j->second) {
2093 if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
2094 // cancel mapping if source osd does not exist anymore
2095 continue;
2096 }
2097 if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
2098 p.second >= 0 && osd_weight[p.second] == 0) {
2099 // cancel mapping if target osd is out
2100 continue;
2101 }
2102 newmap.push_back(p);
2103 }
2104 if (newmap.empty()) {
2105 ldout(cct, 10) << " removing no-op pg_upmap_items "
2106 << j->first << " " << j->second
2107 << dendl;
2108 to_cancel->push_back(pg);
2109 } else if (newmap != j->second) {
2110 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
2111 << j->first << " " << j->second
2112 << " -> " << newmap
2113 << dendl;
2114 to_remap->insert({pg, newmap});
2115 any_change = true;
2116 }
2117 }
28e407b8 2118 }
494da23a
TL
2119 any_change = any_change || !to_cancel->empty();
2120 return any_change;
2121}
2122
2123void OSDMap::clean_pg_upmaps(
2124 CephContext *cct,
2125 Incremental *pending_inc,
2126 const vector<pg_t>& to_cancel,
2127 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2128{
28e407b8 2129 for (auto &pg: to_cancel) {
494da23a
TL
2130 auto i = pending_inc->new_pg_upmap.find(pg);
2131 if (i != pending_inc->new_pg_upmap.end()) {
2132 ldout(cct, 10) << __func__ << " cancel invalid pending "
2133 << "pg_upmap entry "
2134 << i->first << "->" << i->second
2135 << dendl;
2136 pending_inc->new_pg_upmap.erase(i);
94b18763 2137 }
494da23a
TL
2138 auto j = pg_upmap.find(pg);
2139 if (j != pg_upmap.end()) {
2140 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2141 << j->first << "->" << j->second
2142 << dendl;
2143 pending_inc->old_pg_upmap.insert(pg);
2144 }
2145 auto p = pending_inc->new_pg_upmap_items.find(pg);
2146 if (p != pending_inc->new_pg_upmap_items.end()) {
2147 ldout(cct, 10) << __func__ << " cancel invalid pending "
2148 << "pg_upmap_items entry "
2149 << p->first << "->" << p->second
2150 << dendl;
2151 pending_inc->new_pg_upmap_items.erase(p);
2152 }
2153 auto q = pg_upmap_items.find(pg);
2154 if (q != pg_upmap_items.end()) {
2155 ldout(cct, 10) << __func__ << " cancel invalid "
2156 << "pg_upmap_items entry "
2157 << q->first << "->" << q->second
2158 << dendl;
2159 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
2160 }
2161 }
494da23a
TL
2162 for (auto& i : to_remap)
2163 pending_inc->new_pg_upmap_items[i.first] = i.second;
2164}
2165
2166bool OSDMap::clean_pg_upmaps(
2167 CephContext *cct,
2168 Incremental *pending_inc) const
2169{
2170 ldout(cct, 10) << __func__ << dendl;
2171 vector<pg_t> to_check;
2172 vector<pg_t> to_cancel;
2173 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2174
2175 get_upmap_pgs(&to_check);
2176 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2177 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2178 return any_change;
94b18763
FG
2179}
2180
7c673cae
FG
2181int OSDMap::apply_incremental(const Incremental &inc)
2182{
f67539c2 2183 new_blocklist_entries = false;
7c673cae
FG
2184 if (inc.epoch == 1)
2185 fsid = inc.fsid;
2186 else if (inc.fsid != fsid)
2187 return -EINVAL;
2188
11fdf7f2 2189 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
2190
2191 epoch++;
2192 modified = inc.modified;
2193
2194 // full map?
2195 if (inc.fullmap.length()) {
9f95a23c 2196 ceph::buffer::list bl(inc.fullmap);
7c673cae
FG
2197 decode(bl);
2198 return 0;
2199 }
2200
2201 // nope, incremental.
31f18b77 2202 if (inc.new_flags >= 0) {
7c673cae 2203 flags = inc.new_flags;
31f18b77
FG
2204 // the below is just to cover a newly-upgraded luminous mon
2205 // cluster that has to set require_jewel_osds or
2206 // require_kraken_osds before the osds can be upgraded to
2207 // luminous.
2208 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c
TL
2209 if (require_osd_release < ceph_release_t::kraken) {
2210 require_osd_release = ceph_release_t::kraken;
31f18b77
FG
2211 }
2212 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c
TL
2213 if (require_osd_release < ceph_release_t::jewel) {
2214 require_osd_release = ceph_release_t::jewel;
31f18b77
FG
2215 }
2216 }
2217 }
7c673cae
FG
2218
2219 if (inc.new_max_osd >= 0)
2220 set_max_osd(inc.new_max_osd);
2221
2222 if (inc.new_pool_max != -1)
2223 pool_max = inc.new_pool_max;
2224
2225 for (const auto &pool : inc.new_pools) {
2226 pools[pool.first] = pool.second;
2227 pools[pool.first].last_change = epoch;
2228 }
2229
11fdf7f2
TL
2230 new_removed_snaps = inc.new_removed_snaps;
2231 new_purged_snaps = inc.new_purged_snaps;
2232 for (auto p = new_removed_snaps.begin();
2233 p != new_removed_snaps.end();
2234 ++p) {
2235 removed_snaps_queue[p->first].union_of(p->second);
2236 }
2237 for (auto p = new_purged_snaps.begin();
2238 p != new_purged_snaps.end();
2239 ++p) {
2240 auto q = removed_snaps_queue.find(p->first);
2241 ceph_assert(q != removed_snaps_queue.end());
2242 q->second.subtract(p->second);
2243 if (q->second.empty()) {
2244 removed_snaps_queue.erase(q);
2245 }
2246 }
2247
2248 if (inc.new_last_up_change != utime_t()) {
2249 last_up_change = inc.new_last_up_change;
2250 }
2251 if (inc.new_last_in_change != utime_t()) {
2252 last_in_change = inc.new_last_in_change;
2253 }
2254
7c673cae
FG
2255 for (const auto &pname : inc.new_pool_names) {
2256 auto pool_name_entry = pool_name.find(pname.first);
2257 if (pool_name_entry != pool_name.end()) {
2258 name_pool.erase(pool_name_entry->second);
2259 pool_name_entry->second = pname.second;
2260 } else {
2261 pool_name[pname.first] = pname.second;
2262 }
2263 name_pool[pname.second] = pname.first;
2264 }
2265
2266 for (const auto &pool : inc.old_pools) {
2267 pools.erase(pool);
2268 name_pool.erase(pool_name[pool]);
2269 pool_name.erase(pool);
2270 }
2271
2272 for (const auto &weight : inc.new_weight) {
2273 set_weight(weight.first, weight.second);
2274
2275 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2276 // xinfo old_weight.
2277 if (weight.second) {
2278 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2279 osd_xinfo[weight.first].old_weight = 0;
2280 }
2281 }
2282
2283 for (const auto &primary_affinity : inc.new_primary_affinity) {
2284 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2285 }
2286
2287 // erasure_code_profiles
2288 for (const auto &profile : inc.old_erasure_code_profiles)
2289 erasure_code_profiles.erase(profile);
2290
2291 for (const auto &profile : inc.new_erasure_code_profiles) {
2292 set_erasure_code_profile(profile.first, profile.second);
2293 }
2294
2295 // up/down
2296 for (const auto &state : inc.new_state) {
2297 const auto osd = state.first;
2298 int s = state.second ? state.second : CEPH_OSD_UP;
2299 if ((osd_state[osd] & CEPH_OSD_UP) &&
2300 (s & CEPH_OSD_UP)) {
2301 osd_info[osd].down_at = epoch;
2302 osd_xinfo[osd].down_stamp = modified;
2303 }
2304 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2305 (s & CEPH_OSD_EXISTS)) {
2306 // osd is destroyed; clear out anything interesting.
2307 (*osd_uuid)[osd] = uuid_d();
2308 osd_info[osd] = osd_info_t();
2309 osd_xinfo[osd] = osd_xinfo_t();
2310 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2311 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2312 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2313 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2314 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2315 osd_state[osd] = 0;
2316 } else {
2317 osd_state[osd] ^= s;
2318 }
2319 }
2320
2321 for (const auto &client : inc.new_up_client) {
2322 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
9f95a23c 2323 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
11fdf7f2
TL
2324 osd_addrs->client_addrs[client.first].reset(
2325 new entity_addrvec_t(client.second));
2326 osd_addrs->hb_back_addrs[client.first].reset(
2327 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2328 osd_addrs->hb_front_addrs[client.first].reset(
2329 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2330
2331 osd_info[client.first].up_from = epoch;
2332 }
2333
2334 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2335 osd_addrs->cluster_addrs[cluster.first].reset(
2336 new entity_addrvec_t(cluster.second));
7c673cae
FG
2337
2338 // info
2339 for (const auto &thru : inc.new_up_thru)
2340 osd_info[thru.first].up_thru = thru.second;
2341
2342 for (const auto &interval : inc.new_last_clean_interval) {
2343 osd_info[interval.first].last_clean_begin = interval.second.first;
2344 osd_info[interval.first].last_clean_end = interval.second.second;
2345 }
2346
2347 for (const auto &lost : inc.new_lost)
2348 osd_info[lost.first].lost_at = lost.second;
2349
2350 // xinfo
2351 for (const auto &xinfo : inc.new_xinfo)
2352 osd_xinfo[xinfo.first] = xinfo.second;
2353
2354 // uuid
2355 for (const auto &uuid : inc.new_uuid)
2356 (*osd_uuid)[uuid.first] = uuid.second;
2357
2358 // pg rebuild
2359 for (const auto &pg : inc.new_pg_temp) {
2360 if (pg.second.empty())
2361 pg_temp->erase(pg.first);
2362 else
31f18b77
FG
2363 pg_temp->set(pg.first, pg.second);
2364 }
2365 if (!inc.new_pg_temp.empty()) {
2366 // make sure pg_temp is efficiently stored
2367 pg_temp->rebuild();
7c673cae
FG
2368 }
2369
2370 for (const auto &pg : inc.new_primary_temp) {
2371 if (pg.second == -1)
2372 primary_temp->erase(pg.first);
2373 else
2374 (*primary_temp)[pg.first] = pg.second;
2375 }
2376
2377 for (auto& p : inc.new_pg_upmap) {
2378 pg_upmap[p.first] = p.second;
2379 }
2380 for (auto& pg : inc.old_pg_upmap) {
2381 pg_upmap.erase(pg);
2382 }
2383 for (auto& p : inc.new_pg_upmap_items) {
2384 pg_upmap_items[p.first] = p.second;
2385 }
2386 for (auto& pg : inc.old_pg_upmap_items) {
2387 pg_upmap_items.erase(pg);
2388 }
2389
f67539c2
TL
2390 // blocklist
2391 if (!inc.new_blocklist.empty()) {
2392 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2393 new_blocklist_entries = true;
7c673cae 2394 }
f67539c2
TL
2395 for (const auto &addr : inc.old_blocklist)
2396 blocklist.erase(addr);
7c673cae 2397
33c7a0ef
TL
2398 for (const auto& addr_p : inc.new_range_blocklist) {
2399 range_blocklist.insert(addr_p);
2400 calculated_ranges.emplace(addr_p.first, addr_p.first);
2401 new_blocklist_entries = true;
2402 }
2403 for (const auto &addr : inc.old_range_blocklist) {
2404 calculated_ranges.erase(addr);
2405 range_blocklist.erase(addr);
2406 }
2407
81eedcae
TL
2408 for (auto& i : inc.new_crush_node_flags) {
2409 if (i.second) {
2410 crush_node_flags[i.first] = i.second;
2411 } else {
2412 crush_node_flags.erase(i.first);
2413 }
2414 }
2415
2416 for (auto& i : inc.new_device_class_flags) {
2417 if (i.second) {
2418 device_class_flags[i.first] = i.second;
2419 } else {
2420 device_class_flags.erase(i.first);
2421 }
2422 }
2423
7c673cae
FG
2424 // cluster snapshot?
2425 if (inc.cluster_snapshot.length()) {
2426 cluster_snapshot = inc.cluster_snapshot;
2427 cluster_snapshot_epoch = inc.epoch;
2428 } else {
2429 cluster_snapshot.clear();
2430 cluster_snapshot_epoch = 0;
2431 }
2432
2433 if (inc.new_nearfull_ratio >= 0) {
2434 nearfull_ratio = inc.new_nearfull_ratio;
2435 }
2436 if (inc.new_backfillfull_ratio >= 0) {
2437 backfillfull_ratio = inc.new_backfillfull_ratio;
2438 }
2439 if (inc.new_full_ratio >= 0) {
2440 full_ratio = inc.new_full_ratio;
2441 }
9f95a23c 2442 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
7c673cae
FG
2443 require_min_compat_client = inc.new_require_min_compat_client;
2444 }
9f95a23c 2445 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
31f18b77 2446 require_osd_release = inc.new_require_osd_release;
9f95a23c 2447 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 2448 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2449 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2450 }
2451 }
7c673cae 2452
9f95a23c 2453 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
11fdf7f2 2454 require_osd_release = inc.new_require_osd_release;
9f95a23c 2455 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
2456 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2457 }
2458 }
7c673cae
FG
2459 // do new crush map last (after up/down stuff)
2460 if (inc.crush.length()) {
9f95a23c 2461 ceph::buffer::list bl(inc.crush);
11fdf7f2 2462 auto blp = bl.cbegin();
7c673cae
FG
2463 crush.reset(new CrushWrapper);
2464 crush->decode(blp);
9f95a23c 2465 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77
FG
2466 // only increment if this is a luminous-encoded osdmap, lest
2467 // the mon's crush_version diverge from what the osds or others
2468 // are decoding and applying on their end. if we won't encode
2469 // it in the canonical version, don't change it.
2470 ++crush_version;
2471 }
81eedcae
TL
2472 for (auto it = device_class_flags.begin();
2473 it != device_class_flags.end();) {
2474 const char* class_name = crush->get_class_name(it->first);
2475 if (!class_name) // device class is gone
2476 it = device_class_flags.erase(it);
2477 else
2478 it++;
2479 }
7c673cae
FG
2480 }
2481
f67539c2
TL
2482 if (inc.change_stretch_mode) {
2483 stretch_mode_enabled = inc.stretch_mode_enabled;
2484 stretch_bucket_count = inc.new_stretch_bucket_count;
2485 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2486 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2487 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2488 }
2489
7c673cae
FG
2490 calc_num_osds();
2491 _calc_up_osd_features();
2492 return 0;
2493}
2494
2495// mapping
2496int OSDMap::map_to_pg(
2497 int64_t poolid,
2498 const string& name,
2499 const string& key,
2500 const string& nspace,
2501 pg_t *pg) const
2502{
2503 // calculate ps (placement seed)
2504 const pg_pool_t *pool = get_pg_pool(poolid);
2505 if (!pool)
2506 return -ENOENT;
2507 ps_t ps;
2508 if (!key.empty())
2509 ps = pool->hash_key(key, nspace);
2510 else
2511 ps = pool->hash_key(name, nspace);
2512 *pg = pg_t(ps, poolid);
2513 return 0;
2514}
2515
2516int OSDMap::object_locator_to_pg(
2517 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2518{
2519 if (loc.hash >= 0) {
2520 if (!get_pg_pool(loc.get_pool())) {
2521 return -ENOENT;
2522 }
2523 pg = pg_t(loc.hash, loc.get_pool());
2524 return 0;
2525 }
2526 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2527}
2528
2529ceph_object_layout OSDMap::make_object_layout(
2530 object_t oid, int pg_pool, string nspace) const
2531{
2532 object_locator_t loc(pg_pool, nspace);
2533
2534 ceph_object_layout ol;
2535 pg_t pgid = object_locator_to_pg(oid, loc);
2536 ol.ol_pgid = pgid.get_old_pg().v;
2537 ol.ol_stripe_unit = 0;
2538 return ol;
2539}
2540
2541void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2542 vector<int>& osds) const
2543{
2544 if (pool.can_shift_osds()) {
2545 unsigned removed = 0;
2546 for (unsigned i = 0; i < osds.size(); i++) {
2547 if (!exists(osds[i])) {
2548 removed++;
2549 continue;
2550 }
2551 if (removed) {
2552 osds[i - removed] = osds[i];
2553 }
2554 }
2555 if (removed)
2556 osds.resize(osds.size() - removed);
2557 } else {
2558 for (auto& osd : osds) {
2559 if (!exists(osd))
2560 osd = CRUSH_ITEM_NONE;
2561 }
2562 }
2563}
2564
31f18b77 2565void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2566 const pg_pool_t& pool, pg_t pg,
2567 vector<int> *osds,
2568 ps_t *ppps) const
2569{
2570 // map to osds[]
2571 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2572 unsigned size = pool.get_size();
2573
2574 // what crush rule?
20effc67 2575 int ruleno = pool.get_crush_rule();
7c673cae
FG
2576 if (ruleno >= 0)
2577 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2578
2579 _remove_nonexistent_osds(pool, *osds);
2580
2581 if (ppps)
2582 *ppps = pps;
7c673cae
FG
2583}
2584
2585int OSDMap::_pick_primary(const vector<int>& osds) const
2586{
2587 for (auto osd : osds) {
2588 if (osd != CRUSH_ITEM_NONE) {
2589 return osd;
2590 }
2591 }
2592 return -1;
2593}
2594
224ce89b 2595void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2596{
2597 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2598 auto p = pg_upmap.find(pg);
2599 if (p != pg_upmap.end()) {
2600 // make sure targets aren't marked out
2601 for (auto osd : p->second) {
91327a77
AA
2602 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2603 osd_weight[osd] == 0) {
7c673cae
FG
2604 // reject/ignore the explicit mapping
2605 return;
2606 }
2607 }
2608 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2609 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2610 }
2611
2612 auto q = pg_upmap_items.find(pg);
2613 if (q != pg_upmap_items.end()) {
181888fb
FG
2614 // NOTE: this approach does not allow a bidirectional swap,
2615 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2616 for (auto& r : q->second) {
2617 // make sure the replacement value doesn't already appear
2618 bool exists = false;
2619 ssize_t pos = -1;
2620 for (unsigned i = 0; i < raw->size(); ++i) {
2621 int osd = (*raw)[i];
2622 if (osd == r.second) {
2623 exists = true;
2624 break;
2625 }
2626 // ignore mapping if target is marked out (or invalid osd id)
2627 if (osd == r.first &&
2628 pos < 0 &&
2629 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2630 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2631 pos = i;
2632 }
2633 }
2634 if (!exists && pos >= 0) {
2635 (*raw)[pos] = r.second;
7c673cae
FG
2636 }
2637 }
2638 }
2639}
2640
2641// pg -> (up osd list)
2642void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2643 vector<int> *up) const
2644{
2645 if (pool.can_shift_osds()) {
2646 // shift left
2647 up->clear();
2648 up->reserve(raw.size());
2649 for (unsigned i=0; i<raw.size(); i++) {
2650 if (!exists(raw[i]) || is_down(raw[i]))
2651 continue;
2652 up->push_back(raw[i]);
2653 }
2654 } else {
2655 // set down/dne devices to NONE
2656 up->resize(raw.size());
2657 for (int i = raw.size() - 1; i >= 0; --i) {
2658 if (!exists(raw[i]) || is_down(raw[i])) {
2659 (*up)[i] = CRUSH_ITEM_NONE;
2660 } else {
2661 (*up)[i] = raw[i];
2662 }
2663 }
2664 }
2665}
2666
2667void OSDMap::_apply_primary_affinity(ps_t seed,
2668 const pg_pool_t& pool,
2669 vector<int> *osds,
2670 int *primary) const
2671{
2672 // do we have any non-default primary_affinity values for these osds?
2673 if (!osd_primary_affinity)
2674 return;
2675
2676 bool any = false;
2677 for (const auto osd : *osds) {
2678 if (osd != CRUSH_ITEM_NONE &&
2679 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2680 any = true;
2681 break;
2682 }
2683 }
2684 if (!any)
2685 return;
2686
2687 // pick the primary. feed both the seed (for the pg) and the osd
2688 // into the hash/rng so that a proportional fraction of an osd's pgs
2689 // get rejected as primary.
2690 int pos = -1;
2691 for (unsigned i = 0; i < osds->size(); ++i) {
2692 int o = (*osds)[i];
2693 if (o == CRUSH_ITEM_NONE)
2694 continue;
2695 unsigned a = (*osd_primary_affinity)[o];
2696 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2697 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2698 seed, o) >> 16) >= a) {
2699 // we chose not to use this primary. note it anyway as a
2700 // fallback in case we don't pick anyone else, but keep looking.
2701 if (pos < 0)
2702 pos = i;
2703 } else {
2704 pos = i;
2705 break;
2706 }
2707 }
2708 if (pos < 0)
2709 return;
2710
2711 *primary = (*osds)[pos];
2712
2713 if (pool.can_shift_osds() && pos > 0) {
2714 // move the new primary to the front.
2715 for (int i = pos; i > 0; --i) {
2716 (*osds)[i] = (*osds)[i-1];
2717 }
2718 (*osds)[0] = *primary;
2719 }
2720}
2721
2722void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2723 vector<int> *temp_pg, int *temp_primary) const
2724{
2725 pg = pool.raw_pg_to_pg(pg);
2726 const auto p = pg_temp->find(pg);
2727 temp_pg->clear();
2728 if (p != pg_temp->end()) {
2729 for (unsigned i=0; i<p->second.size(); i++) {
2730 if (!exists(p->second[i]) || is_down(p->second[i])) {
2731 if (pool.can_shift_osds()) {
2732 continue;
2733 } else {
2734 temp_pg->push_back(CRUSH_ITEM_NONE);
2735 }
2736 } else {
2737 temp_pg->push_back(p->second[i]);
2738 }
2739 }
2740 }
2741 const auto &pp = primary_temp->find(pg);
2742 *temp_primary = -1;
2743 if (pp != primary_temp->end()) {
2744 *temp_primary = pp->second;
2745 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2746 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2747 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2748 *temp_primary = (*temp_pg)[i];
2749 break;
2750 }
2751 }
2752 }
2753}
2754
31f18b77 2755void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2756{
7c673cae 2757 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2758 if (!pool) {
2759 *primary = -1;
2760 raw->clear();
31f18b77 2761 return;
11fdf7f2 2762 }
31f18b77 2763 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2764 *primary = _pick_primary(*raw);
7c673cae
FG
2765}
2766
494da23a
TL
2767void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2768 vector<int> *raw_upmap) const
a8e16298
TL
2769{
2770 auto pool = get_pg_pool(pg.pool());
2771 if (!pool) {
2772 raw_upmap->clear();
2773 return;
2774 }
494da23a
TL
2775 _pg_to_raw_osds(*pool, pg, raw, NULL);
2776 *raw_upmap = *raw;
a8e16298
TL
2777 _apply_upmap(*pool, pg, raw_upmap);
2778}
2779
7c673cae
FG
2780void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2781{
2782 const pg_pool_t *pool = get_pg_pool(pg.pool());
2783 if (!pool) {
11fdf7f2
TL
2784 *primary = -1;
2785 up->clear();
7c673cae
FG
2786 return;
2787 }
2788 vector<int> raw;
2789 ps_t pps;
2790 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2791 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2792 _raw_to_up_osds(*pool, raw, up);
2793 *primary = _pick_primary(raw);
2794 _apply_primary_affinity(pps, *pool, up, primary);
2795}
31f18b77 2796
7c673cae
FG
2797void OSDMap::_pg_to_up_acting_osds(
2798 const pg_t& pg, vector<int> *up, int *up_primary,
2799 vector<int> *acting, int *acting_primary,
2800 bool raw_pg_to_pg) const
2801{
2802 const pg_pool_t *pool = get_pg_pool(pg.pool());
2803 if (!pool ||
2804 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2805 if (up)
2806 up->clear();
2807 if (up_primary)
2808 *up_primary = -1;
2809 if (acting)
2810 acting->clear();
2811 if (acting_primary)
2812 *acting_primary = -1;
2813 return;
2814 }
2815 vector<int> raw;
2816 vector<int> _up;
2817 vector<int> _acting;
2818 int _up_primary;
2819 int _acting_primary;
2820 ps_t pps;
2821 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2822 if (_acting.empty() || up || up_primary) {
2823 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2824 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2825 _raw_to_up_osds(*pool, raw, &_up);
2826 _up_primary = _pick_primary(_up);
2827 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2828 if (_acting.empty()) {
2829 _acting = _up;
2830 if (_acting_primary == -1) {
2831 _acting_primary = _up_primary;
2832 }
2833 }
2834
2835 if (up)
2836 up->swap(_up);
2837 if (up_primary)
2838 *up_primary = _up_primary;
2839 }
2840
2841 if (acting)
2842 acting->swap(_acting);
2843 if (acting_primary)
2844 *acting_primary = _acting_primary;
2845}
2846
9f95a23c 2847int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
7c673cae 2848{
9f95a23c
TL
2849 // This implementation is broken for EC PGs since the osd may appear
2850 // multiple times in the acting set. See
2851 // https://tracker.ceph.com/issues/43213
7c673cae
FG
2852 if (!nrep)
2853 nrep = acting.size();
2854 for (int i=0; i<nrep; i++)
2855 if (acting[i] == osd)
2856 return i;
2857 return -1;
2858}
2859
9f95a23c 2860int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
7c673cae 2861{
9f95a23c
TL
2862 int nrep = acting.size();
2863 if (who.shard == shard_id_t::NO_SHARD) {
2864 for (int i=0; i<nrep; i++) {
2865 if (acting[i] == who.osd) {
2866 return i;
2867 }
2868 }
2869 } else {
2870 if (who.shard < nrep && acting[who.shard] == who.osd) {
2871 return who.shard;
2872 }
2873 }
2874 return -1;
7c673cae
FG
2875}
2876
9f95a23c 2877bool OSDMap::primary_changed_broken(
7c673cae
FG
2878 int oldprimary,
2879 const vector<int> &oldacting,
2880 int newprimary,
2881 const vector<int> &newacting)
2882{
2883 if (oldacting.empty() && newacting.empty())
2884 return false; // both still empty
2885 if (oldacting.empty() ^ newacting.empty())
2886 return true; // was empty, now not, or vice versa
2887 if (oldprimary != newprimary)
2888 return true; // primary changed
9f95a23c
TL
2889 if (calc_pg_role_broken(oldprimary, oldacting) !=
2890 calc_pg_role_broken(newprimary, newacting))
7c673cae
FG
2891 return true;
2892 return false; // same primary (tho replicas may have changed)
2893}
2894
28e407b8
AA
2895uint64_t OSDMap::get_encoding_features() const
2896{
2897 uint64_t f = SIGNIFICANT_FEATURES;
9f95a23c
TL
2898 if (require_osd_release < ceph_release_t::octopus) {
2899 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2900 }
2901 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
2902 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2903 }
9f95a23c 2904 if (require_osd_release < ceph_release_t::mimic) {
11fdf7f2
TL
2905 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2906 }
9f95a23c 2907 if (require_osd_release < ceph_release_t::luminous) {
28e407b8
AA
2908 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2909 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2910 }
9f95a23c 2911 if (require_osd_release < ceph_release_t::kraken) {
28e407b8 2912 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2913 CEPH_FEATURE_MSG_ADDR2);
28e407b8 2914 }
9f95a23c 2915 if (require_osd_release < ceph_release_t::jewel) {
28e407b8 2916 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2917 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2918 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2919 }
2920 return f;
2921}
7c673cae
FG
2922
2923// serialize, unserialize
9f95a23c 2924void OSDMap::encode_client_old(ceph::buffer::list& bl) const
7c673cae 2925{
11fdf7f2 2926 using ceph::encode;
7c673cae 2927 __u16 v = 5;
11fdf7f2 2928 encode(v, bl);
7c673cae
FG
2929
2930 // base
11fdf7f2
TL
2931 encode(fsid, bl);
2932 encode(epoch, bl);
2933 encode(created, bl);
2934 encode(modified, bl);
7c673cae 2935
11fdf7f2 2936 // for encode(pools, bl);
7c673cae 2937 __u32 n = pools.size();
11fdf7f2 2938 encode(n, bl);
7c673cae
FG
2939
2940 for (const auto &pool : pools) {
2941 n = pool.first;
11fdf7f2
TL
2942 encode(n, bl);
2943 encode(pool.second, bl, 0);
7c673cae 2944 }
11fdf7f2 2945 // for encode(pool_name, bl);
7c673cae 2946 n = pool_name.size();
11fdf7f2 2947 encode(n, bl);
7c673cae
FG
2948 for (const auto &pname : pool_name) {
2949 n = pname.first;
11fdf7f2
TL
2950 encode(n, bl);
2951 encode(pname.second, bl);
7c673cae 2952 }
11fdf7f2 2953 // for encode(pool_max, bl);
7c673cae 2954 n = pool_max;
11fdf7f2 2955 encode(n, bl);
7c673cae 2956
11fdf7f2 2957 encode(flags, bl);
7c673cae 2958
11fdf7f2 2959 encode(max_osd, bl);
31f18b77
FG
2960 {
2961 uint32_t n = osd_state.size();
11fdf7f2 2962 encode(n, bl);
31f18b77 2963 for (auto s : osd_state) {
11fdf7f2 2964 encode((uint8_t)s, bl);
31f18b77
FG
2965 }
2966 }
11fdf7f2
TL
2967 encode(osd_weight, bl);
2968 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 2969
11fdf7f2 2970 // for encode(pg_temp, bl);
7c673cae 2971 n = pg_temp->size();
11fdf7f2 2972 encode(n, bl);
f67539c2 2973 for (const auto& pg : *pg_temp) {
7c673cae 2974 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
2975 encode(opg, bl);
2976 encode(pg.second, bl);
7c673cae
FG
2977 }
2978
2979 // crush
9f95a23c 2980 ceph::buffer::list cbl;
7c673cae 2981 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2982 encode(cbl, bl);
7c673cae
FG
2983}
2984
9f95a23c 2985void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2986{
11fdf7f2 2987 using ceph::encode;
7c673cae
FG
2988 if ((features & CEPH_FEATURE_PGID64) == 0) {
2989 encode_client_old(bl);
2990 return;
2991 }
2992
2993 __u16 v = 6;
11fdf7f2 2994 encode(v, bl);
7c673cae
FG
2995
2996 // base
11fdf7f2
TL
2997 encode(fsid, bl);
2998 encode(epoch, bl);
2999 encode(created, bl);
3000 encode(modified, bl);
7c673cae 3001
11fdf7f2
TL
3002 encode(pools, bl, features);
3003 encode(pool_name, bl);
3004 encode(pool_max, bl);
7c673cae 3005
11fdf7f2 3006 encode(flags, bl);
7c673cae 3007
11fdf7f2 3008 encode(max_osd, bl);
31f18b77
FG
3009 {
3010 uint32_t n = osd_state.size();
11fdf7f2 3011 encode(n, bl);
31f18b77 3012 for (auto s : osd_state) {
11fdf7f2 3013 encode((uint8_t)s, bl);
31f18b77
FG
3014 }
3015 }
11fdf7f2
TL
3016 encode(osd_weight, bl);
3017 encode(osd_addrs->client_addrs, bl, features);
7c673cae 3018
11fdf7f2 3019 encode(*pg_temp, bl);
7c673cae
FG
3020
3021 // crush
9f95a23c 3022 ceph::buffer::list cbl;
7c673cae 3023 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 3024 encode(cbl, bl);
7c673cae
FG
3025
3026 // extended
3027 __u16 ev = 10;
11fdf7f2
TL
3028 encode(ev, bl);
3029 encode(osd_addrs->hb_back_addrs, bl, features);
3030 encode(osd_info, bl);
f67539c2 3031 encode(blocklist, bl, features);
11fdf7f2
TL
3032 encode(osd_addrs->cluster_addrs, bl, features);
3033 encode(cluster_snapshot_epoch, bl);
3034 encode(cluster_snapshot, bl);
3035 encode(*osd_uuid, bl);
9f95a23c 3036 encode(osd_xinfo, bl, features);
11fdf7f2 3037 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
3038}
3039
11fdf7f2
TL
3040/* for a description of osdmap versions, and when they were introduced, please
3041 * refer to
3042 * doc/dev/osd_internals/osdmap_versions.txt
3043 */
9f95a23c 3044void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 3045{
11fdf7f2 3046 using ceph::encode;
7c673cae
FG
3047 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
3048 encode_classic(bl, features);
3049 return;
3050 }
3051
3052 // only a select set of callers should *ever* be encoding new
3053 // OSDMaps. others should be passing around the canonical encoded
3054 // buffers from on high. select out those callers by passing in an
3055 // "impossible" feature bit.
11fdf7f2 3056 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
3057 features &= ~CEPH_FEATURE_RESERVED;
3058
3059 size_t start_offset = bl.length();
3060 size_t tail_offset;
11fdf7f2 3061 size_t crc_offset;
9f95a23c 3062 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
3063
3064 // meta-encoding: how we include client-used and osd-specific data
3065 ENCODE_START(8, 7, bl);
3066
3067 {
28e407b8
AA
3068 // NOTE: any new encoding dependencies must be reflected by
3069 // SIGNIFICANT_FEATURES
11fdf7f2 3070 uint8_t v = 9;
31f18b77 3071 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 3072 v = 3;
11fdf7f2
TL
3073 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3074 v = 6;
3075 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3076 v = 7;
7c673cae
FG
3077 }
3078 ENCODE_START(v, 1, bl); // client-usable data
3079 // base
11fdf7f2
TL
3080 encode(fsid, bl);
3081 encode(epoch, bl);
3082 encode(created, bl);
3083 encode(modified, bl);
7c673cae 3084
11fdf7f2
TL
3085 encode(pools, bl, features);
3086 encode(pool_name, bl);
3087 encode(pool_max, bl);
7c673cae 3088
31f18b77
FG
3089 if (v < 4) {
3090 decltype(flags) f = flags;
9f95a23c 3091 if (require_osd_release >= ceph_release_t::luminous)
c07f9fc5 3092 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
9f95a23c 3093 else if (require_osd_release == ceph_release_t::kraken)
31f18b77 3094 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
9f95a23c 3095 else if (require_osd_release == ceph_release_t::jewel)
31f18b77 3096 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 3097 encode(f, bl);
31f18b77 3098 } else {
11fdf7f2 3099 encode(flags, bl);
31f18b77 3100 }
7c673cae 3101
11fdf7f2 3102 encode(max_osd, bl);
31f18b77 3103 if (v >= 5) {
11fdf7f2 3104 encode(osd_state, bl);
31f18b77
FG
3105 } else {
3106 uint32_t n = osd_state.size();
11fdf7f2 3107 encode(n, bl);
31f18b77 3108 for (auto s : osd_state) {
11fdf7f2 3109 encode((uint8_t)s, bl);
31f18b77
FG
3110 }
3111 }
11fdf7f2
TL
3112 encode(osd_weight, bl);
3113 if (v >= 8) {
3114 encode(osd_addrs->client_addrs, bl, features);
3115 } else {
3116 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
3117 }
7c673cae 3118
11fdf7f2
TL
3119 encode(*pg_temp, bl);
3120 encode(*primary_temp, bl);
7c673cae 3121 if (osd_primary_affinity) {
11fdf7f2 3122 encode(*osd_primary_affinity, bl);
7c673cae
FG
3123 } else {
3124 vector<__u32> v;
11fdf7f2 3125 encode(v, bl);
7c673cae
FG
3126 }
3127
3128 // crush
9f95a23c 3129 ceph::buffer::list cbl;
7c673cae 3130 crush->encode(cbl, features);
11fdf7f2
TL
3131 encode(cbl, bl);
3132 encode(erasure_code_profiles, bl);
7c673cae
FG
3133
3134 if (v >= 4) {
11fdf7f2
TL
3135 encode(pg_upmap, bl);
3136 encode(pg_upmap_items, bl);
7c673cae 3137 } else {
11fdf7f2
TL
3138 ceph_assert(pg_upmap.empty());
3139 ceph_assert(pg_upmap_items.empty());
7c673cae 3140 }
31f18b77 3141 if (v >= 6) {
11fdf7f2
TL
3142 encode(crush_version, bl);
3143 }
3144 if (v >= 7) {
3145 encode(new_removed_snaps, bl);
3146 encode(new_purged_snaps, bl);
3147 }
3148 if (v >= 9) {
3149 encode(last_up_change, bl);
3150 encode(last_in_change, bl);
31f18b77 3151 }
7c673cae
FG
3152 ENCODE_FINISH(bl); // client-usable data
3153 }
3154
3155 {
28e407b8
AA
3156 // NOTE: any new encoding dependencies must be reflected by
3157 // SIGNIFICANT_FEATURES
33c7a0ef 3158 uint8_t target_v = 9; // when bumping this, be aware of range blocklist
7c673cae
FG
3159 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3160 target_v = 1;
11fdf7f2
TL
3161 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3162 target_v = 5;
3163 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3164 target_v = 6;
7c673cae 3165 }
f67539c2
TL
3166 if (stretch_mode_enabled) {
3167 target_v = std::max((uint8_t)10, target_v);
3168 }
33c7a0ef
TL
3169 if (!range_blocklist.empty()) {
3170 target_v = std::max((uint8_t)11, target_v);
3171 }
7c673cae 3172 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
3173 if (target_v < 7) {
3174 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
3175 } else {
3176 encode(osd_addrs->hb_back_addrs, bl, features);
3177 }
3178 encode(osd_info, bl);
7c673cae
FG
3179 {
3180 // put this in a sorted, ordered map<> so that we encode in a
3181 // deterministic order.
f67539c2
TL
3182 map<entity_addr_t,utime_t> blocklist_map;
3183 for (const auto &addr : blocklist)
3184 blocklist_map.insert(make_pair(addr.first, addr.second));
3185 encode(blocklist_map, bl, features);
11fdf7f2
TL
3186 }
3187 if (target_v < 7) {
3188 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3189 } else {
3190 encode(osd_addrs->cluster_addrs, bl, features);
3191 }
3192 encode(cluster_snapshot_epoch, bl);
3193 encode(cluster_snapshot, bl);
3194 encode(*osd_uuid, bl);
9f95a23c 3195 encode(osd_xinfo, bl, features);
11fdf7f2
TL
3196 if (target_v < 7) {
3197 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3198 } else {
3199 encode(osd_addrs->hb_front_addrs, bl, features);
3200 }
7c673cae 3201 if (target_v >= 2) {
11fdf7f2
TL
3202 encode(nearfull_ratio, bl);
3203 encode(full_ratio, bl);
3204 encode(backfillfull_ratio, bl);
31f18b77
FG
3205 }
3206 // 4 was string-based new_require_min_compat_client
3207 if (target_v >= 5) {
11fdf7f2
TL
3208 encode(require_min_compat_client, bl);
3209 encode(require_osd_release, bl);
3210 }
3211 if (target_v >= 6) {
3212 encode(removed_snaps_queue, bl);
7c673cae 3213 }
81eedcae
TL
3214 if (target_v >= 8) {
3215 encode(crush_node_flags, bl);
3216 }
3217 if (target_v >= 9) {
3218 encode(device_class_flags, bl);
3219 }
f67539c2
TL
3220 if (target_v >= 10) {
3221 encode(stretch_mode_enabled, bl);
3222 encode(stretch_bucket_count, bl);
3223 encode(degraded_stretch_mode, bl);
3224 encode(recovering_stretch_mode, bl);
3225 encode(stretch_mode_bucket, bl);
3226 }
33c7a0ef
TL
3227 if (target_v >= 11) {
3228 ::encode(range_blocklist, bl, features);
3229 }
7c673cae
FG
3230 ENCODE_FINISH(bl); // osd-only data
3231 }
3232
11fdf7f2
TL
3233 crc_offset = bl.length();
3234 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
3235 tail_offset = bl.length();
3236
3237 ENCODE_FINISH(bl); // meta-encoding wrapper
3238
3239 // fill in crc
9f95a23c 3240 ceph::buffer::list front;
11fdf7f2 3241 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
3242 crc = front.crc32c(-1);
3243 if (tail_offset < bl.length()) {
9f95a23c 3244 ceph::buffer::list tail;
7c673cae
FG
3245 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3246 crc = tail.crc32c(crc);
3247 }
3248 ceph_le32 crc_le;
3249 crc_le = crc;
11fdf7f2 3250 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
3251 crc_defined = true;
3252}
3253
11fdf7f2
TL
3254/* for a description of osdmap versions, and when they were introduced, please
3255 * refer to
3256 * doc/dev/osd_internals/osdmap_versions.txt
3257 */
9f95a23c 3258void OSDMap::decode(ceph::buffer::list& bl)
7c673cae 3259{
11fdf7f2 3260 auto p = bl.cbegin();
7c673cae
FG
3261 decode(p);
3262}
3263
9f95a23c 3264void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
7c673cae 3265{
11fdf7f2 3266 using ceph::decode;
7c673cae
FG
3267 __u32 n, t;
3268 __u16 v;
11fdf7f2 3269 decode(v, p);
7c673cae
FG
3270
3271 // base
11fdf7f2
TL
3272 decode(fsid, p);
3273 decode(epoch, p);
3274 decode(created, p);
3275 decode(modified, p);
7c673cae
FG
3276
3277 if (v < 6) {
3278 if (v < 4) {
3279 int32_t max_pools = 0;
11fdf7f2 3280 decode(max_pools, p);
7c673cae
FG
3281 pool_max = max_pools;
3282 }
3283 pools.clear();
11fdf7f2 3284 decode(n, p);
7c673cae 3285 while (n--) {
11fdf7f2
TL
3286 decode(t, p);
3287 decode(pools[t], p);
7c673cae
FG
3288 }
3289 if (v == 4) {
11fdf7f2 3290 decode(n, p);
7c673cae
FG
3291 pool_max = n;
3292 } else if (v == 5) {
3293 pool_name.clear();
11fdf7f2 3294 decode(n, p);
7c673cae 3295 while (n--) {
11fdf7f2
TL
3296 decode(t, p);
3297 decode(pool_name[t], p);
7c673cae 3298 }
11fdf7f2 3299 decode(n, p);
7c673cae
FG
3300 pool_max = n;
3301 }
3302 } else {
11fdf7f2
TL
3303 decode(pools, p);
3304 decode(pool_name, p);
3305 decode(pool_max, p);
7c673cae
FG
3306 }
3307 // kludge around some old bug that zeroed out pool_max (#2307)
3308 if (pools.size() && pool_max < pools.rbegin()->first) {
3309 pool_max = pools.rbegin()->first;
3310 }
3311
11fdf7f2 3312 decode(flags, p);
7c673cae 3313
11fdf7f2 3314 decode(max_osd, p);
31f18b77
FG
3315 {
3316 vector<uint8_t> os;
11fdf7f2 3317 decode(os, p);
31f18b77
FG
3318 osd_state.resize(os.size());
3319 for (unsigned i = 0; i < os.size(); ++i) {
3320 osd_state[i] = os[i];
3321 }
3322 }
11fdf7f2
TL
3323 decode(osd_weight, p);
3324 decode(osd_addrs->client_addrs, p);
7c673cae
FG
3325 if (v <= 5) {
3326 pg_temp->clear();
11fdf7f2 3327 decode(n, p);
7c673cae
FG
3328 while (n--) {
3329 old_pg_t opg;
9f95a23c 3330 ceph::decode_raw(opg, p);
31f18b77 3331 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3332 decode(v, p);
31f18b77 3333 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3334 }
3335 } else {
11fdf7f2 3336 decode(*pg_temp, p);
7c673cae
FG
3337 }
3338
3339 // crush
9f95a23c 3340 ceph::buffer::list cbl;
11fdf7f2
TL
3341 decode(cbl, p);
3342 auto cblp = cbl.cbegin();
7c673cae
FG
3343 crush->decode(cblp);
3344
3345 // extended
3346 __u16 ev = 0;
3347 if (v >= 5)
11fdf7f2
TL
3348 decode(ev, p);
3349 decode(osd_addrs->hb_back_addrs, p);
3350 decode(osd_info, p);
7c673cae 3351 if (v < 5)
11fdf7f2 3352 decode(pool_name, p);
7c673cae 3353
f67539c2 3354 decode(blocklist, p);
7c673cae 3355 if (ev >= 6)
11fdf7f2 3356 decode(osd_addrs->cluster_addrs, p);
7c673cae 3357 else
11fdf7f2 3358 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3359
3360 if (ev >= 7) {
11fdf7f2
TL
3361 decode(cluster_snapshot_epoch, p);
3362 decode(cluster_snapshot, p);
7c673cae
FG
3363 }
3364
3365 if (ev >= 8) {
11fdf7f2 3366 decode(*osd_uuid, p);
7c673cae
FG
3367 } else {
3368 osd_uuid->resize(max_osd);
3369 }
3370 if (ev >= 9)
11fdf7f2 3371 decode(osd_xinfo, p);
7c673cae
FG
3372 else
3373 osd_xinfo.resize(max_osd);
3374
3375 if (ev >= 10)
11fdf7f2 3376 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3377 else
11fdf7f2 3378 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3379
3380 osd_primary_affinity.reset();
3381
3382 post_decode();
3383}
3384
9f95a23c 3385void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 3386{
11fdf7f2 3387 using ceph::decode;
7c673cae
FG
3388 /**
3389 * Older encodings of the OSDMap had a single struct_v which
3390 * covered the whole encoding, and was prior to our modern
3391 * stuff which includes a compatv and a size. So if we see
3392 * a struct_v < 7, we must rewind to the beginning and use our
3393 * classic decoder.
3394 */
3395 size_t start_offset = bl.get_off();
3396 size_t tail_offset = 0;
9f95a23c 3397 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
3398
3399 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3400 if (struct_v < 7) {
11fdf7f2 3401 bl.seek(start_offset);
7c673cae
FG
3402 decode_classic(bl);
3403 return;
3404 }
3405 /**
3406 * Since we made it past that hurdle, we can use our normal paths.
3407 */
3408 {
11fdf7f2 3409 DECODE_START(9, bl); // client-usable data
7c673cae 3410 // base
11fdf7f2
TL
3411 decode(fsid, bl);
3412 decode(epoch, bl);
3413 decode(created, bl);
3414 decode(modified, bl);
7c673cae 3415
11fdf7f2
TL
3416 decode(pools, bl);
3417 decode(pool_name, bl);
3418 decode(pool_max, bl);
7c673cae 3419
11fdf7f2 3420 decode(flags, bl);
7c673cae 3421
11fdf7f2 3422 decode(max_osd, bl);
31f18b77 3423 if (struct_v >= 5) {
11fdf7f2 3424 decode(osd_state, bl);
31f18b77
FG
3425 } else {
3426 vector<uint8_t> os;
11fdf7f2 3427 decode(os, bl);
31f18b77
FG
3428 osd_state.resize(os.size());
3429 for (unsigned i = 0; i < os.size(); ++i) {
3430 osd_state[i] = os[i];
3431 }
3432 }
11fdf7f2
TL
3433 decode(osd_weight, bl);
3434 decode(osd_addrs->client_addrs, bl);
7c673cae 3435
11fdf7f2
TL
3436 decode(*pg_temp, bl);
3437 decode(*primary_temp, bl);
3438 // dates back to firefly. version increased from 2 to 3 still in firefly.
3439 // do we really still need to keep this around? even for old clients?
7c673cae
FG
3440 if (struct_v >= 2) {
3441 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
11fdf7f2 3442 decode(*osd_primary_affinity, bl);
7c673cae
FG
3443 if (osd_primary_affinity->empty())
3444 osd_primary_affinity.reset();
3445 } else {
3446 osd_primary_affinity.reset();
3447 }
3448
3449 // crush
9f95a23c 3450 ceph::buffer::list cbl;
11fdf7f2
TL
3451 decode(cbl, bl);
3452 auto cblp = cbl.cbegin();
7c673cae 3453 crush->decode(cblp);
11fdf7f2
TL
3454 // added in firefly; version increased in luminous, so it affects
3455 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3456 // alone until we require clients to be all luminous?
7c673cae 3457 if (struct_v >= 3) {
11fdf7f2 3458 decode(erasure_code_profiles, bl);
7c673cae
FG
3459 } else {
3460 erasure_code_profiles.clear();
3461 }
11fdf7f2
TL
3462 // version increased from 3 to 4 still in luminous, so same as above
3463 // applies.
7c673cae 3464 if (struct_v >= 4) {
11fdf7f2
TL
3465 decode(pg_upmap, bl);
3466 decode(pg_upmap_items, bl);
7c673cae
FG
3467 } else {
3468 pg_upmap.clear();
3469 pg_upmap_items.clear();
3470 }
11fdf7f2
TL
3471 // again, version increased from 5 to 6 still in luminous, so above
3472 // applies.
31f18b77 3473 if (struct_v >= 6) {
11fdf7f2
TL
3474 decode(crush_version, bl);
3475 }
3476 // version increase from 6 to 7 in mimic
3477 if (struct_v >= 7) {
3478 decode(new_removed_snaps, bl);
3479 decode(new_purged_snaps, bl);
3480 }
3481 // version increase from 7 to 8, 8 to 9, in nautilus.
3482 if (struct_v >= 9) {
3483 decode(last_up_change, bl);
3484 decode(last_in_change, bl);
31f18b77 3485 }
7c673cae
FG
3486 DECODE_FINISH(bl); // client-usable data
3487 }
3488
3489 {
f67539c2 3490 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
3491 decode(osd_addrs->hb_back_addrs, bl);
3492 decode(osd_info, bl);
f67539c2 3493 decode(blocklist, bl);
11fdf7f2
TL
3494 decode(osd_addrs->cluster_addrs, bl);
3495 decode(cluster_snapshot_epoch, bl);
3496 decode(cluster_snapshot, bl);
3497 decode(*osd_uuid, bl);
3498 decode(osd_xinfo, bl);
3499 decode(osd_addrs->hb_front_addrs, bl);
3500 //
7c673cae 3501 if (struct_v >= 2) {
11fdf7f2
TL
3502 decode(nearfull_ratio, bl);
3503 decode(full_ratio, bl);
7c673cae
FG
3504 } else {
3505 nearfull_ratio = 0;
3506 full_ratio = 0;
3507 }
3508 if (struct_v >= 3) {
11fdf7f2 3509 decode(backfillfull_ratio, bl);
7c673cae
FG
3510 } else {
3511 backfillfull_ratio = 0;
3512 }
31f18b77
FG
3513 if (struct_v == 4) {
3514 string r;
11fdf7f2 3515 decode(r, bl);
31f18b77
FG
3516 if (r.length())
3517 require_min_compat_client = ceph_release_from_name(r.c_str());
3518 }
3519 if (struct_v >= 5) {
11fdf7f2
TL
3520 decode(require_min_compat_client, bl);
3521 decode(require_osd_release, bl);
9f95a23c 3522 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
3523 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3524 }
9f95a23c 3525 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 3526 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3527 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
3528 }
3529 } else {
3530 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3531 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 3532 require_osd_release = ceph_release_t::luminous;
31f18b77 3533 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3534 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77 3535 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c 3536 require_osd_release = ceph_release_t::kraken;
31f18b77 3537 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c 3538 require_osd_release = ceph_release_t::jewel;
31f18b77 3539 } else {
9f95a23c 3540 require_osd_release = ceph_release_t::unknown;
31f18b77
FG
3541 }
3542 }
11fdf7f2
TL
3543 if (struct_v >= 6) {
3544 decode(removed_snaps_queue, bl);
3545 }
81eedcae
TL
3546 if (struct_v >= 8) {
3547 decode(crush_node_flags, bl);
3548 } else {
3549 crush_node_flags.clear();
3550 }
3551 if (struct_v >= 9) {
3552 decode(device_class_flags, bl);
3553 } else {
3554 device_class_flags.clear();
3555 }
f67539c2
TL
3556 if (struct_v >= 10) {
3557 decode(stretch_mode_enabled, bl);
3558 decode(stretch_bucket_count, bl);
3559 decode(degraded_stretch_mode, bl);
3560 decode(recovering_stretch_mode, bl);
3561 decode(stretch_mode_bucket, bl);
3562 } else {
3563 stretch_mode_enabled = false;
3564 stretch_bucket_count = 0;
3565 degraded_stretch_mode = 0;
3566 recovering_stretch_mode = 0;
3567 stretch_mode_bucket = 0;
3568 }
33c7a0ef
TL
3569 if (struct_v >= 11) {
3570 decode(range_blocklist, bl);
3571 calculated_ranges.clear();
3572 for (const auto& i : range_blocklist) {
3573 calculated_ranges.emplace(i.first, i.first);
3574 }
3575 }
7c673cae
FG
3576 DECODE_FINISH(bl); // osd-only data
3577 }
3578
3579 if (struct_v >= 8) {
3580 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 3581 decode(crc, bl);
7c673cae
FG
3582 tail_offset = bl.get_off();
3583 crc_defined = true;
3584 } else {
3585 crc_defined = false;
3586 crc = 0;
3587 }
3588
3589 DECODE_FINISH(bl); // wrapper
3590
3591 if (tail_offset) {
3592 // verify crc
3593 uint32_t actual = crc_front.crc32c(-1);
3594 if (tail_offset < bl.get_off()) {
9f95a23c 3595 ceph::buffer::list tail;
7c673cae
FG
3596 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3597 actual = tail.crc32c(actual);
3598 }
3599 if (crc != actual) {
3600 ostringstream ss;
3601 ss << "bad crc, actual " << actual << " != expected " << crc;
3602 string s = ss.str();
9f95a23c 3603 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
3604 }
3605 }
3606
3607 post_decode();
3608}
3609
3610void OSDMap::post_decode()
3611{
3612 // index pool names
3613 name_pool.clear();
3614 for (const auto &pname : pool_name) {
3615 name_pool[pname.second] = pname.first;
3616 }
3617
3618 calc_num_osds();
3619 _calc_up_osd_features();
3620}
3621
3622void OSDMap::dump_erasure_code_profiles(
3623 const mempool::osdmap::map<string,map<string,string>>& profiles,
3624 Formatter *f)
3625{
3626 f->open_object_section("erasure_code_profiles");
3627 for (const auto &profile : profiles) {
3628 f->open_object_section(profile.first.c_str());
3629 for (const auto &profm : profile.second) {
9f95a23c 3630 f->dump_string(profm.first.c_str(), profm.second);
7c673cae
FG
3631 }
3632 f->close_section();
3633 }
3634 f->close_section();
3635}
3636
9f95a23c
TL
3637void OSDMap::dump_osds(Formatter *f) const
3638{
3639 f->open_array_section("osds");
3640 for (int i=0; i<get_max_osd(); i++) {
3641 if (exists(i)) {
3642 dump_osd(i, f);
3643 }
3644 }
3645 f->close_section();
3646}
3647
3648void OSDMap::dump_osd(int id, Formatter *f) const
3649{
3650 ceph_assert(f != nullptr);
3651 if (!exists(id)) {
3652 return;
3653 }
3654
3655 f->open_object_section("osd_info");
3656 f->dump_int("osd", id);
3657 f->dump_stream("uuid") << get_uuid(id);
3658 f->dump_int("up", is_up(id));
3659 f->dump_int("in", is_in(id));
3660 f->dump_float("weight", get_weightf(id));
3661 f->dump_float("primary_affinity", get_primary_affinityf(id));
3662 get_info(id).dump(f);
3663 f->dump_object("public_addrs", get_addrs(id));
3664 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3665 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3666 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3667 // compat
3668 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3669 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3670 f->dump_stream("heartbeat_back_addr")
3671 << get_hb_back_addrs(id).get_legacy_str();
3672 f->dump_stream("heartbeat_front_addr")
3673 << get_hb_front_addrs(id).get_legacy_str();
3674
3675 set<string> st;
3676 get_state(id, st);
3677 f->open_array_section("state");
3678 for (const auto &state : st)
3679 f->dump_string("state", state);
3680 f->close_section();
3681
3682 f->close_section();
3683}
3684
7c673cae
FG
3685void OSDMap::dump(Formatter *f) const
3686{
3687 f->dump_int("epoch", get_epoch());
3688 f->dump_stream("fsid") << get_fsid();
3689 f->dump_stream("created") << get_created();
3690 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3691 f->dump_stream("last_up_change") << last_up_change;
3692 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3693 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3694 f->dump_unsigned("flags_num", flags);
3695 f->open_array_section("flags_set");
3696 set<string> flagset;
3697 get_flag_set(&flagset);
3698 for (auto p : flagset) {
3699 f->dump_string("flag", p);
3700 }
3701 f->close_section();
31f18b77 3702 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3703 f->dump_float("full_ratio", full_ratio);
3704 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3705 f->dump_float("nearfull_ratio", nearfull_ratio);
3706 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3707 f->dump_int("pool_max", get_pool_max());
3708 f->dump_int("max_osd", get_max_osd());
31f18b77 3709 f->dump_string("require_min_compat_client",
f67539c2 3710 to_string(require_min_compat_client));
31f18b77 3711 f->dump_string("min_compat_client",
f67539c2 3712 to_string(get_min_compat_client()));
31f18b77 3713 f->dump_string("require_osd_release",
f67539c2 3714 to_string(require_osd_release));
7c673cae
FG
3715
3716 f->open_array_section("pools");
3717 for (const auto &pool : pools) {
3718 std::string name("<unknown>");
3719 const auto &pni = pool_name.find(pool.first);
3720 if (pni != pool_name.end())
3721 name = pni->second;
3722 f->open_object_section("pool");
3723 f->dump_int("pool", pool.first);
3724 f->dump_string("pool_name", name);
3725 pool.second.dump(f);
3726 f->close_section();
3727 }
3728 f->close_section();
3729
9f95a23c 3730 dump_osds(f);
7c673cae
FG
3731
3732 f->open_array_section("osd_xinfo");
3733 for (int i=0; i<get_max_osd(); i++) {
3734 if (exists(i)) {
3735 f->open_object_section("xinfo");
3736 f->dump_int("osd", i);
3737 osd_xinfo[i].dump(f);
3738 f->close_section();
3739 }
3740 }
3741 f->close_section();
3742
3743 f->open_array_section("pg_upmap");
3744 for (auto& p : pg_upmap) {
3745 f->open_object_section("mapping");
3746 f->dump_stream("pgid") << p.first;
3747 f->open_array_section("osds");
3748 for (auto q : p.second) {
3749 f->dump_int("osd", q);
3750 }
3751 f->close_section();
3752 f->close_section();
3753 }
3754 f->close_section();
3755 f->open_array_section("pg_upmap_items");
3756 for (auto& p : pg_upmap_items) {
3757 f->open_object_section("mapping");
3758 f->dump_stream("pgid") << p.first;
3759 f->open_array_section("mappings");
3760 for (auto& q : p.second) {
3761 f->open_object_section("mapping");
3762 f->dump_int("from", q.first);
3763 f->dump_int("to", q.second);
3764 f->close_section();
3765 }
3766 f->close_section();
3767 f->close_section();
3768 }
3769 f->close_section();
3770 f->open_array_section("pg_temp");
31f18b77 3771 pg_temp->dump(f);
7c673cae
FG
3772 f->close_section();
3773
3774 f->open_array_section("primary_temp");
3775 for (const auto &pg : *primary_temp) {
3776 f->dump_stream("pgid") << pg.first;
3777 f->dump_int("osd", pg.second);
3778 }
3779 f->close_section(); // primary_temp
3780
f67539c2
TL
3781 f->open_object_section("blocklist");
3782 for (const auto &addr : blocklist) {
7c673cae
FG
3783 stringstream ss;
3784 ss << addr.first;
3785 f->dump_stream(ss.str().c_str()) << addr.second;
3786 }
3787 f->close_section();
33c7a0ef
TL
3788 f->open_object_section("range_blocklist");
3789 for (const auto &addr : range_blocklist) {
3790 stringstream ss;
3791 ss << addr.first;
3792 f->dump_stream(ss.str().c_str()) << addr.second;
3793 }
3794 f->close_section();
7c673cae
FG
3795
3796 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3797
3798 f->open_array_section("removed_snaps_queue");
3799 for (auto& p : removed_snaps_queue) {
3800 f->open_object_section("pool");
3801 f->dump_int("pool", p.first);
3802 f->open_array_section("snaps");
3803 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3804 f->open_object_section("interval");
3805 f->dump_unsigned("begin", q.get_start());
3806 f->dump_unsigned("length", q.get_len());
3807 f->close_section();
3808 }
3809 f->close_section();
3810 f->close_section();
3811 }
3812 f->close_section();
3813 f->open_array_section("new_removed_snaps");
3814 for (auto& p : new_removed_snaps) {
3815 f->open_object_section("pool");
3816 f->dump_int("pool", p.first);
3817 f->open_array_section("snaps");
3818 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3819 f->open_object_section("interval");
3820 f->dump_unsigned("begin", q.get_start());
3821 f->dump_unsigned("length", q.get_len());
3822 f->close_section();
3823 }
3824 f->close_section();
3825 f->close_section();
3826 }
3827 f->close_section();
3828 f->open_array_section("new_purged_snaps");
3829 for (auto& p : new_purged_snaps) {
3830 f->open_object_section("pool");
3831 f->dump_int("pool", p.first);
3832 f->open_array_section("snaps");
3833 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3834 f->open_object_section("interval");
3835 f->dump_unsigned("begin", q.get_start());
3836 f->dump_unsigned("length", q.get_len());
3837 f->close_section();
3838 }
3839 f->close_section();
3840 f->close_section();
3841 }
3842 f->close_section();
81eedcae
TL
3843 f->open_object_section("crush_node_flags");
3844 for (auto& i : crush_node_flags) {
3845 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3846 : stringify(i.first);
3847 f->open_array_section(s.c_str());
3848 set<string> st;
3849 calc_state_set(i.second, st);
3850 for (auto& j : st) {
3851 f->dump_string("flag", j);
3852 }
3853 f->close_section();
3854 }
3855 f->close_section();
3856 f->open_object_section("device_class_flags");
3857 for (auto& i : device_class_flags) {
3858 const char* class_name = crush->get_class_name(i.first);
3859 string s = class_name ? class_name : stringify(i.first);
3860 f->open_array_section(s.c_str());
3861 set<string> st;
3862 calc_state_set(i.second, st);
3863 for (auto& j : st) {
3864 f->dump_string("flag", j);
3865 }
3866 f->close_section();
3867 }
3868 f->close_section();
f67539c2
TL
3869 f->open_object_section("stretch_mode");
3870 {
3871 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
3872 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
3873 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
3874 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
3875 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
3876 }
3877 f->close_section();
7c673cae
FG
3878}
3879
3880void OSDMap::generate_test_instances(list<OSDMap*>& o)
3881{
3882 o.push_back(new OSDMap);
3883
3884 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3885 o.push_back(new OSDMap);
3886 uuid_d fsid;
224ce89b 3887 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae 3888 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
f67539c2 3889 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
7c673cae
FG
3890 cct->put();
3891}
3892
3893string OSDMap::get_flag_string(unsigned f)
3894{
3895 string s;
7c673cae
FG
3896 if (f & CEPH_OSDMAP_PAUSERD)
3897 s += ",pauserd";
3898 if (f & CEPH_OSDMAP_PAUSEWR)
3899 s += ",pausewr";
3900 if (f & CEPH_OSDMAP_PAUSEREC)
3901 s += ",pauserec";
3902 if (f & CEPH_OSDMAP_NOUP)
3903 s += ",noup";
3904 if (f & CEPH_OSDMAP_NODOWN)
3905 s += ",nodown";
3906 if (f & CEPH_OSDMAP_NOOUT)
3907 s += ",noout";
3908 if (f & CEPH_OSDMAP_NOIN)
3909 s += ",noin";
3910 if (f & CEPH_OSDMAP_NOBACKFILL)
3911 s += ",nobackfill";
3912 if (f & CEPH_OSDMAP_NOREBALANCE)
3913 s += ",norebalance";
3914 if (f & CEPH_OSDMAP_NORECOVER)
3915 s += ",norecover";
3916 if (f & CEPH_OSDMAP_NOSCRUB)
3917 s += ",noscrub";
3918 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3919 s += ",nodeep-scrub";
3920 if (f & CEPH_OSDMAP_NOTIERAGENT)
3921 s += ",notieragent";
11fdf7f2
TL
3922 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3923 s += ",nosnaptrim";
7c673cae
FG
3924 if (f & CEPH_OSDMAP_SORTBITWISE)
3925 s += ",sortbitwise";
3926 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3927 s += ",require_jewel_osds";
3928 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3929 s += ",require_kraken_osds";
3930 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3931 s += ",require_luminous_osds";
c07f9fc5
FG
3932 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3933 s += ",recovery_deletes";
181888fb
FG
3934 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3935 s += ",purged_snapdirs";
f64942e4
AA
3936 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3937 s += ",pglog_hardlimit";
7c673cae
FG
3938 if (s.length())
3939 s.erase(0, 1);
3940 return s;
3941}
3942
3943string OSDMap::get_flag_string() const
3944{
3945 return get_flag_string(flags);
3946}
3947
7c673cae
FG
3948void OSDMap::print_pools(ostream& out) const
3949{
3950 for (const auto &pool : pools) {
3951 std::string name("<unknown>");
3952 const auto &pni = pool_name.find(pool.first);
3953 if (pni != pool_name.end())
3954 name = pni->second;
3955 out << "pool " << pool.first
3956 << " '" << name
3957 << "' " << pool.second << "\n";
3958
3959 for (const auto &snap : pool.second.snaps)
3960 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3961
3962 if (!pool.second.removed_snaps.empty())
3963 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
11fdf7f2
TL
3964 auto p = removed_snaps_queue.find(pool.first);
3965 if (p != removed_snaps_queue.end()) {
3966 out << "\tremoved_snaps_queue " << p->second << "\n";
3967 }
7c673cae
FG
3968 }
3969 out << std::endl;
3970}
3971
9f95a23c
TL
3972void OSDMap::print_osds(ostream& out) const
3973{
3974 for (int i=0; i<get_max_osd(); i++) {
3975 if (exists(i)) {
3976 print_osd(i, out);
3977 }
3978 }
3979}
3980void OSDMap::print_osd(int id, ostream& out) const
3981{
3982 if (!exists(id)) {
3983 return;
3984 }
3985
3986 out << "osd." << id;
3987 out << (is_up(id) ? " up ":" down");
3988 out << (is_in(id) ? " in ":" out");
3989 out << " weight " << get_weightf(id);
3990 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
3991 out << " primary_affinity " << get_primary_affinityf(id);
3992 }
3993 const osd_info_t& info(get_info(id));
3994 out << " " << info;
3995 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
3996 set<string> st;
3997 get_state(id, st);
3998 out << " " << st;
3999 if (!get_uuid(id).is_zero()) {
4000 out << " " << get_uuid(id);
4001 }
4002 out << "\n";
4003}
4004
7c673cae
FG
4005void OSDMap::print(ostream& out) const
4006{
4007 out << "epoch " << get_epoch() << "\n"
4008 << "fsid " << get_fsid() << "\n"
4009 << "created " << get_created() << "\n"
4010 << "modified " << get_modified() << "\n";
4011
4012 out << "flags " << get_flag_string() << "\n";
31f18b77 4013 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
4014 out << "full_ratio " << full_ratio << "\n";
4015 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
4016 out << "nearfull_ratio " << nearfull_ratio << "\n";
9f95a23c 4017 if (require_min_compat_client != ceph_release_t::unknown) {
31f18b77 4018 out << "require_min_compat_client "
9f95a23c 4019 << require_min_compat_client << "\n";
7c673cae 4020 }
9f95a23c 4021 out << "min_compat_client " << get_min_compat_client()
31f18b77 4022 << "\n";
9f95a23c
TL
4023 if (require_osd_release > ceph_release_t::unknown) {
4024 out << "require_osd_release " << require_osd_release
224ce89b
WB
4025 << "\n";
4026 }
f67539c2
TL
4027 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
4028 if (stretch_mode_enabled) {
4029 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
4030 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
4031 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
4032 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
4033 }
7c673cae
FG
4034 if (get_cluster_snapshot().length())
4035 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
4036 out << "\n";
4037
4038 print_pools(out);
4039
4040 out << "max_osd " << get_max_osd() << "\n";
9f95a23c 4041 print_osds(out);
7c673cae
FG
4042 out << std::endl;
4043
4044 for (auto& p : pg_upmap) {
4045 out << "pg_upmap " << p.first << " " << p.second << "\n";
4046 }
4047 for (auto& p : pg_upmap_items) {
4048 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
4049 }
4050
f67539c2 4051 for (const auto& pg : *pg_temp)
7c673cae
FG
4052 out << "pg_temp " << pg.first << " " << pg.second << "\n";
4053
f67539c2 4054 for (const auto& pg : *primary_temp)
7c673cae
FG
4055 out << "primary_temp " << pg.first << " " << pg.second << "\n";
4056
f67539c2
TL
4057 for (const auto &addr : blocklist)
4058 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
33c7a0ef
TL
4059 for (const auto &addr : range_blocklist)
4060 out << "range blocklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
4061}
4062
4063class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
4064public:
4065 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
4066
4067 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4068 unsigned f)
c07f9fc5 4069 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
4070
4071 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
4072 if (!filter) {
4073 return true; // normal case
4074 }
4075 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4076 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4077 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4078 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4079 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4080 return true;
31f18b77 4081 }
c07f9fc5 4082 return false;
31f18b77
FG
4083 }
4084
4085 bool should_dump_empty_bucket() const override {
4086 return !filter;
4087 }
7c673cae 4088
11fdf7f2 4089 void init_table(TextTable *tbl) {
7c673cae 4090 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 4091 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
4092 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4093 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 4094 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 4095 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 4096 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
4097 }
4098 void dump(TextTable *tbl, string& bucket) {
4099 init_table(tbl);
7c673cae 4100
11fdf7f2
TL
4101 if (!bucket.empty()) {
4102 set_root(bucket);
4103 Parent::dump(tbl);
4104 } else {
4105 Parent::dump(tbl);
4106 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4107 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
4108 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
4109 }
31f18b77 4110 }
7c673cae
FG
4111 }
4112 }
4113
4114protected:
4115 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
4116 const char *c = crush->get_item_class(qi.id);
4117 if (!c)
4118 c = "";
7c673cae 4119 *tbl << qi.id
224ce89b 4120 << c
7c673cae
FG
4121 << weightf_t(qi.weight);
4122
4123 ostringstream name;
4124 for (int k = 0; k < qi.depth; k++)
4125 name << " ";
4126 if (qi.is_bucket()) {
4127 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
4128 << crush->get_item_name(qi.id);
4129 } else {
4130 name << "osd." << qi.id;
4131 }
4132 *tbl << name.str();
4133
4134 if (!qi.is_bucket()) {
4135 if (!osdmap->exists(qi.id)) {
4136 *tbl << "DNE"
4137 << 0;
4138 } else {
c07f9fc5
FG
4139 string s;
4140 if (osdmap->is_up(qi.id)) {
4141 s = "up";
4142 } else if (osdmap->is_destroyed(qi.id)) {
4143 s = "destroyed";
4144 } else {
4145 s = "down";
4146 }
4147 *tbl << s
7c673cae
FG
4148 << weightf_t(osdmap->get_weightf(qi.id))
4149 << weightf_t(osdmap->get_primary_affinityf(qi.id));
4150 }
4151 }
4152 *tbl << TextTable::endrow;
4153 }
4154
4155private:
4156 const OSDMap *osdmap;
31f18b77 4157 const unsigned filter;
7c673cae
FG
4158};
4159
4160class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4161public:
4162 typedef CrushTreeDumper::FormattingDumper Parent;
4163
31f18b77
FG
4164 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4165 unsigned f)
c07f9fc5 4166 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
4167
4168 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
4169 if (!filter) {
4170 return true; // normal case
4171 }
4172 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4173 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4174 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4175 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4176 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4177 return true;
31f18b77 4178 }
c07f9fc5 4179 return false;
31f18b77
FG
4180 }
4181
4182 bool should_dump_empty_bucket() const override {
4183 return !filter;
4184 }
7c673cae 4185
11fdf7f2
TL
4186 void dump(Formatter *f, string& bucket) {
4187 if (!bucket.empty()) {
4188 set_root(bucket);
4189 f->open_array_section("nodes");
4190 Parent::dump(f);
4191 f->close_section();
4192 } else {
4193 f->open_array_section("nodes");
4194 Parent::dump(f);
4195 f->close_section();
4196 f->open_array_section("stray");
4197 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4198 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4199 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4200 }
4201 f->close_section();
7c673cae 4202 }
7c673cae
FG
4203 }
4204
4205protected:
4206 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4207 Parent::dump_item_fields(qi, f);
4208 if (!qi.is_bucket())
4209 {
c07f9fc5
FG
4210 string s;
4211 if (osdmap->is_up(qi.id)) {
4212 s = "up";
4213 } else if (osdmap->is_destroyed(qi.id)) {
4214 s = "destroyed";
4215 } else {
4216 s = "down";
4217 }
7c673cae 4218 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 4219 f->dump_string("status", s);
7c673cae
FG
4220 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4221 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4222 }
4223 }
4224
4225private:
4226 const OSDMap *osdmap;
31f18b77 4227 const unsigned filter;
7c673cae
FG
4228};
4229
11fdf7f2 4230void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 4231{
31f18b77 4232 if (f) {
11fdf7f2 4233 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 4234 } else {
11fdf7f2 4235 ceph_assert(out);
7c673cae 4236 TextTable tbl;
11fdf7f2 4237 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
4238 *out << tbl;
4239 }
4240}
4241
224ce89b 4242void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 4243 const string& prefix, bool extra) const
7c673cae
FG
4244{
4245 if (f) {
7c673cae
FG
4246 f->dump_int("epoch", get_epoch());
4247 f->dump_int("num_osds", get_num_osds());
4248 f->dump_int("num_up_osds", get_num_up_osds());
9f95a23c 4249 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
7c673cae 4250 f->dump_int("num_in_osds", get_num_in_osds());
9f95a23c 4251 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
7c673cae 4252 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
7c673cae 4253 } else {
11fdf7f2 4254 utime_t now = ceph_clock_now();
31f18b77 4255 out << get_num_osds() << " osds: "
11fdf7f2
TL
4256 << get_num_up_osds() << " up";
4257 if (last_up_change != utime_t()) {
4258 out << " (since " << utimespan_str(now - last_up_change) << ")";
4259 }
4260 out << ", " << get_num_in_osds() << " in";
4261 if (last_in_change != utime_t()) {
4262 out << " (since " << utimespan_str(now - last_in_change) << ")";
4263 }
4264 if (extra)
4265 out << "; epoch: e" << get_epoch();
7c673cae
FG
4266 if (get_num_pg_temp())
4267 out << "; " << get_num_pg_temp() << " remapped pgs";
4268 out << "\n";
4269 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4270 if (important_flags)
224ce89b 4271 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
4272 }
4273}
4274
4275void OSDMap::print_oneline_summary(ostream& out) const
4276{
4277 out << "e" << get_epoch() << ": "
31f18b77 4278 << get_num_osds() << " total, "
7c673cae
FG
4279 << get_num_up_osds() << " up, "
4280 << get_num_in_osds() << " in";
7c673cae
FG
4281}
4282
3efd9988 4283bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
4284{
4285 for (const auto &pool : pools) {
3efd9988 4286 if (pool.second.crush_rule == rule_id)
7c673cae
FG
4287 return true;
4288 }
4289 return false;
4290}
4291
3efd9988
FG
4292int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4293 ostream *ss) const
4294{
4295 for (auto& i : pools) {
4296 auto& pool = i.second;
4297 int ruleno = pool.get_crush_rule();
4298 if (!newcrush->rule_exists(ruleno)) {
4299 *ss << "pool " << i.first << " references crush_rule " << ruleno
4300 << " but it is not present";
4301 return -EINVAL;
4302 }
20effc67 4303 if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) {
3efd9988
FG
4304 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4305 return -EINVAL;
4306 }
3efd9988
FG
4307 }
4308 return 0;
4309}
4310
224ce89b
WB
4311int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4312 int nosd, int pg_bits, int pgp_bits,
4313 bool default_pool)
7c673cae 4314{
224ce89b
WB
4315 ldout(cct, 10) << "build_simple on " << nosd
4316 << " osds" << dendl;
7c673cae
FG
4317 epoch = e;
4318 set_fsid(fsid);
4319 created = modified = ceph_clock_now();
4320
4321 if (nosd >= 0) {
4322 set_max_osd(nosd);
4323 } else {
4324 // count osds
4325 int maxosd = 0;
11fdf7f2 4326 const auto& conf = cct->_conf;
7c673cae 4327 vector<string> sections;
11fdf7f2 4328 conf.get_all_sections(sections);
7c673cae
FG
4329
4330 for (auto &section : sections) {
4331 if (section.find("osd.") != 0)
4332 continue;
4333
4334 const char *begin = section.c_str() + 4;
4335 char *end = (char*)begin;
4336 int o = strtol(begin, &end, 10);
4337 if (*end != '\0')
4338 continue;
4339
4340 if (o > cct->_conf->mon_max_osd) {
4341 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4342 return -ERANGE;
4343 }
4344
4345 if (o > maxosd)
4346 maxosd = o;
4347 }
4348
4349 set_max_osd(maxosd + 1);
4350 }
4351
7c673cae
FG
4352
4353 stringstream ss;
4354 int r;
4355 if (nosd >= 0)
4356 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4357 else
4358 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4359 ceph_assert(r == 0);
7c673cae
FG
4360
4361 int poolbase = get_max_osd() ? get_max_osd() : 1;
4362
20effc67 4363 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_rule(cct);
11fdf7f2 4364 ceph_assert(default_replicated_rule >= 0);
7c673cae 4365
224ce89b
WB
4366 if (default_pool) {
4367 // pgp_num <= pg_num
4368 if (pgp_bits > pg_bits)
4369 pgp_bits = pg_bits;
4370
4371 vector<string> pool_names;
4372 pool_names.push_back("rbd");
4373 for (auto &plname : pool_names) {
4374 int64_t pool = ++pool_max;
4375 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4376 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4377 if (cct->_conf->osd_pool_default_flag_hashpspool)
4378 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4379 if (cct->_conf->osd_pool_default_flag_nodelete)
4380 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4381 if (cct->_conf->osd_pool_default_flag_nopgchange)
4382 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4383 if (cct->_conf->osd_pool_default_flag_nosizechange)
4384 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
20effc67
TL
4385 if (cct->_conf->osd_pool_default_flag_bulk)
4386 pools[pool].set_flag(pg_pool_t::FLAG_BULK);
11fdf7f2
TL
4387 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4388 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4389 pools[pool].size);
224ce89b
WB
4390 pools[pool].crush_rule = default_replicated_rule;
4391 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4392 pools[pool].set_pg_num(poolbase << pg_bits);
4393 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4394 pools[pool].set_pg_num_target(poolbase << pg_bits);
4395 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4396 pools[pool].last_change = epoch;
c07f9fc5
FG
4397 pools[pool].application_metadata.insert(
4398 {pg_pool_t::APPLICATION_NAME_RBD, {}});
9f95a23c
TL
4399 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4400 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4401 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4402 pools[pool].pg_autoscale_mode = m;
4403 } else {
4404 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4405 }
224ce89b
WB
4406 pool_name[pool] = plname;
4407 name_pool[plname] = pool;
4408 }
7c673cae
FG
4409 }
4410
7c673cae
FG
4411 map<string,string> profile_map;
4412 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4413 if (r < 0) {
4414 lderr(cct) << ss.str() << dendl;
4415 return r;
4416 }
4417 set_erasure_code_profile("default", profile_map);
4418 return 0;
4419}
4420
4421int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4422 map<string,string> &profile_map,
4423 ostream *ss)
4424{
11fdf7f2 4425 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4426 *ss,
4427 &profile_map);
4428 return r;
4429}
4430
4431int OSDMap::_build_crush_types(CrushWrapper& crush)
4432{
4433 crush.set_type_name(0, "osd");
4434 crush.set_type_name(1, "host");
4435 crush.set_type_name(2, "chassis");
4436 crush.set_type_name(3, "rack");
4437 crush.set_type_name(4, "row");
4438 crush.set_type_name(5, "pdu");
4439 crush.set_type_name(6, "pod");
4440 crush.set_type_name(7, "room");
4441 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4442 crush.set_type_name(9, "zone");
4443 crush.set_type_name(10, "region");
4444 crush.set_type_name(11, "root");
4445 return 11;
7c673cae
FG
4446}
4447
4448int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4449 int nosd, ostream *ss)
4450{
4451 crush.create();
4452
4453 // root
4454 int root_type = _build_crush_types(crush);
4455 int rootid;
4456 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4457 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4458 ceph_assert(r == 0);
7c673cae
FG
4459 crush.set_item_name(rootid, "default");
4460
f67539c2
TL
4461 map<string,string> loc{
4462 {"host", "localhost"},
4463 {"rack", "localrack"},
4464 {"root", "default"}
4465 };
7c673cae 4466 for (int o=0; o<nosd; o++) {
7c673cae
FG
4467 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4468 char name[32];
4469 snprintf(name, sizeof(name), "osd.%d", o);
4470 crush.insert_item(cct, o, 1.0, name, loc);
4471 }
4472
31f18b77 4473 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4474
4475 crush.finalize();
4476
4477 return 0;
4478}
4479
4480int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4481 CrushWrapper& crush,
4482 ostream *ss)
4483{
11fdf7f2 4484 const auto& conf = cct->_conf;
7c673cae
FG
4485
4486 crush.create();
4487
4488 // root
4489 int root_type = _build_crush_types(crush);
4490 int rootid;
4491 int r = crush.add_bucket(0, 0,
4492 CRUSH_HASH_DEFAULT,
4493 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4494 ceph_assert(r == 0);
7c673cae
FG
4495 crush.set_item_name(rootid, "default");
4496
4497 // add osds
4498 vector<string> sections;
11fdf7f2 4499 conf.get_all_sections(sections);
7c673cae
FG
4500
4501 for (auto &section : sections) {
4502 if (section.find("osd.") != 0)
4503 continue;
4504
4505 const char *begin = section.c_str() + 4;
4506 char *end = (char*)begin;
4507 int o = strtol(begin, &end, 10);
4508 if (*end != '\0')
4509 continue;
4510
4511 string host, rack, row, room, dc, pool;
4512 vector<string> sectiontmp;
4513 sectiontmp.push_back("osd");
4514 sectiontmp.push_back(section);
11fdf7f2
TL
4515 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4516 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4517 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4518 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4519 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4520 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4521
4522 if (host.length() == 0)
4523 host = "unknownhost";
4524 if (rack.length() == 0)
4525 rack = "unknownrack";
4526
4527 map<string,string> loc;
4528 loc["host"] = host;
4529 loc["rack"] = rack;
4530 if (row.size())
4531 loc["row"] = row;
4532 if (room.size())
4533 loc["room"] = room;
4534 if (dc.size())
4535 loc["datacenter"] = dc;
4536 loc["root"] = "default";
4537
4538 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4539 crush.insert_item(cct, o, 1.0, section, loc);
4540 }
4541
31f18b77 4542 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4543
4544 crush.finalize();
4545
4546 return 0;
4547}
4548
4549
31f18b77
FG
4550int OSDMap::build_simple_crush_rules(
4551 CephContext *cct,
4552 CrushWrapper& crush,
4553 const string& root,
4554 ostream *ss)
7c673cae 4555{
20effc67 4556 int crush_rule = crush.get_osd_pool_default_crush_replicated_rule(cct);
7c673cae
FG
4557 string failure_domain =
4558 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4559
7c673cae 4560 int r;
31f18b77 4561 r = crush.add_simple_rule_at(
224ce89b 4562 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4563 "firstn", pg_pool_t::TYPE_REPLICATED,
4564 crush_rule, ss);
7c673cae
FG
4565 if (r < 0)
4566 return r;
4567 // do not add an erasure rule by default or else we will implicitly
4568 // require the crush_v2 feature of clients
4569 return 0;
4570}
4571
4572int OSDMap::summarize_mapping_stats(
4573 OSDMap *newmap,
4574 const set<int64_t> *pools,
4575 std::string *out,
4576 Formatter *f) const
4577{
4578 set<int64_t> ls;
4579 if (pools) {
4580 ls = *pools;
4581 } else {
4582 for (auto &p : get_pools())
4583 ls.insert(p.first);
4584 }
4585
4586 unsigned total_pg = 0;
4587 unsigned moved_pg = 0;
4588 vector<unsigned> base_by_osd(get_max_osd(), 0);
4589 vector<unsigned> new_by_osd(get_max_osd(), 0);
4590 for (int64_t pool_id : ls) {
4591 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4592 vector<int> up, up2;
4593 int up_primary;
7c673cae 4594 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4595 pg_t pgid(ps, pool_id);
7c673cae 4596 total_pg += pi->get_size();
31f18b77 4597 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4598 for (int osd : up) {
4599 if (osd >= 0 && osd < get_max_osd())
4600 ++base_by_osd[osd];
4601 }
4602 if (newmap) {
31f18b77 4603 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4604 for (int osd : up2) {
4605 if (osd >= 0 && osd < get_max_osd())
4606 ++new_by_osd[osd];
4607 }
4608 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4609 for (unsigned i=0; i<up.size(); ++i) {
4610 if (up[i] != up2[i]) {
4611 ++moved_pg;
4612 }
4613 }
4614 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4615 for (int osd : up) {
4616 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4617 ++moved_pg;
4618 }
4619 }
4620 } else {
11fdf7f2 4621 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4622 }
4623 }
4624 }
4625 }
4626
4627 unsigned num_up_in = 0;
4628 for (int osd = 0; osd < get_max_osd(); ++osd) {
4629 if (is_up(osd) && is_in(osd))
4630 ++num_up_in;
4631 }
4632 if (!num_up_in) {
4633 return -EINVAL;
4634 }
4635
4636 float avg_pg = (float)total_pg / (float)num_up_in;
4637 float base_stddev = 0, new_stddev = 0;
4638 int min = -1, max = -1;
4639 unsigned min_base_pg = 0, max_base_pg = 0;
4640 unsigned min_new_pg = 0, max_new_pg = 0;
4641 for (int osd = 0; osd < get_max_osd(); ++osd) {
4642 if (is_up(osd) && is_in(osd)) {
4643 float base_diff = (float)base_by_osd[osd] - avg_pg;
4644 base_stddev += base_diff * base_diff;
4645 float new_diff = (float)new_by_osd[osd] - avg_pg;
4646 new_stddev += new_diff * new_diff;
4647 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4648 min = osd;
4649 min_base_pg = base_by_osd[osd];
4650 min_new_pg = new_by_osd[osd];
4651 }
4652 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4653 max = osd;
4654 max_base_pg = base_by_osd[osd];
4655 max_new_pg = new_by_osd[osd];
4656 }
4657 }
4658 }
4659 base_stddev = sqrt(base_stddev / num_up_in);
4660 new_stddev = sqrt(new_stddev / num_up_in);
4661
4662 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4663
4664 ostringstream ss;
4665 if (f)
4666 f->open_object_section("utilization");
4667 if (newmap) {
4668 if (f) {
4669 f->dump_unsigned("moved_pgs", moved_pg);
4670 f->dump_unsigned("total_pgs", total_pg);
4671 } else {
4672 float percent = 0;
4673 if (total_pg)
4674 percent = (float)moved_pg * 100.0 / (float)total_pg;
4675 ss << "moved " << moved_pg << " / " << total_pg
4676 << " (" << percent << "%)\n";
4677 }
4678 }
4679 if (f) {
4680 f->dump_float("avg_pgs", avg_pg);
4681 f->dump_float("std_dev", base_stddev);
4682 f->dump_float("expected_baseline_std_dev", edev);
4683 if (newmap)
4684 f->dump_float("new_std_dev", new_stddev);
4685 } else {
4686 ss << "avg " << avg_pg << "\n";
4687 ss << "stddev " << base_stddev;
4688 if (newmap)
4689 ss << " -> " << new_stddev;
4690 ss << " (expected baseline " << edev << ")\n";
4691 }
4692 if (min >= 0) {
4693 if (f) {
4694 f->dump_unsigned("min_osd", min);
4695 f->dump_unsigned("min_osd_pgs", min_base_pg);
4696 if (newmap)
4697 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4698 } else {
4699 ss << "min osd." << min << " with " << min_base_pg;
4700 if (newmap)
4701 ss << " -> " << min_new_pg;
4702 ss << " pgs (" << (float)min_base_pg / avg_pg;
4703 if (newmap)
4704 ss << " -> " << (float)min_new_pg / avg_pg;
4705 ss << " * mean)\n";
4706 }
4707 }
4708 if (max >= 0) {
4709 if (f) {
4710 f->dump_unsigned("max_osd", max);
4711 f->dump_unsigned("max_osd_pgs", max_base_pg);
4712 if (newmap)
4713 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4714 } else {
4715 ss << "max osd." << max << " with " << max_base_pg;
4716 if (newmap)
4717 ss << " -> " << max_new_pg;
4718 ss << " pgs (" << (float)max_base_pg / avg_pg;
4719 if (newmap)
4720 ss << " -> " << (float)max_new_pg / avg_pg;
4721 ss << " * mean)\n";
4722 }
4723 }
4724 if (f)
4725 f->close_section();
4726 if (out)
4727 *out = ss.str();
4728 return 0;
4729}
4730
7c673cae
FG
4731bool OSDMap::try_pg_upmap(
4732 CephContext *cct,
4733 pg_t pg, ///< pg to potentially remap
4734 const set<int>& overfull, ///< osds we'd want to evacuate
4735 const vector<int>& underfull, ///< osds to move to, in order of preference
92f5a8d4 4736 const vector<int>& more_underfull, ///< more osds only slightly underfull
7c673cae
FG
4737 vector<int> *orig,
4738 vector<int> *out) ///< resulting alternative mapping
4739{
4740 const pg_pool_t *pool = get_pg_pool(pg.pool());
4741 if (!pool)
4742 return false;
20effc67 4743 int rule = pool->get_crush_rule();
7c673cae
FG
4744 if (rule < 0)
4745 return false;
4746
7c673cae
FG
4747 // make sure there is something there to remap
4748 bool any = false;
4749 for (auto osd : *orig) {
4750 if (overfull.count(osd)) {
4751 any = true;
4752 break;
4753 }
4754 }
4755 if (!any) {
4756 return false;
4757 }
4758
4759 int r = crush->try_remap_rule(
4760 cct,
4761 rule,
4762 pool->get_size(),
4763 overfull, underfull,
92f5a8d4 4764 more_underfull,
7c673cae
FG
4765 *orig,
4766 out);
4767 if (r < 0)
4768 return false;
4769 if (*out == *orig)
4770 return false;
4771 return true;
4772}
4773
4774int OSDMap::calc_pg_upmaps(
4775 CephContext *cct,
92f5a8d4 4776 uint32_t max_deviation,
7c673cae 4777 int max,
a8e16298 4778 const set<int64_t>& only_pools,
20effc67
TL
4779 OSDMap::Incremental *pending_inc,
4780 std::random_device::result_type *p_seed)
7c673cae 4781{
a8e16298 4782 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
20effc67 4783 OSDMap tmp_osd_map;
92f5a8d4
TL
4784 // Can't be less than 1 pg
4785 if (max_deviation < 1)
4786 max_deviation = 1;
20effc67 4787 tmp_osd_map.deepish_copy_from(*this);
7c673cae 4788 int num_changed = 0;
a8e16298
TL
4789 map<int,set<pg_t>> pgs_by_osd;
4790 int total_pgs = 0;
4791 float osd_weight_total = 0;
4792 map<int,float> osd_weight;
a8e16298 4793
20effc67
TL
4794 if (max <= 0) {
4795 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4796 return 0;
a8e16298 4797 }
20effc67
TL
4798
4799 osd_weight_total = build_pool_pgs_info(cct, only_pools, tmp_osd_map,
4800 total_pgs, pgs_by_osd, osd_weight);
a8e16298
TL
4801 if (osd_weight_total == 0) {
4802 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4803 return 0;
4804 }
20effc67 4805
a8e16298
TL
4806 float pgs_per_weight = total_pgs / osd_weight_total;
4807 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4808 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4809
a8e16298
TL
4810 float stddev = 0;
4811 map<int,float> osd_deviation; // osd, deviation(pgs)
4812 multimap<float,int> deviation_osd; // deviation(pgs), osd
20effc67
TL
4813 float cur_max_deviation = calc_deviations(cct, pgs_by_osd, osd_weight, pgs_per_weight,
4814 osd_deviation, deviation_osd, stddev);
4815
92f5a8d4
TL
4816 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4817 if (cur_max_deviation <= max_deviation) {
a8e16298
TL
4818 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4819 << dendl;
4820 return 0;
4821 }
20effc67 4822
a8e16298
TL
4823 bool skip_overfull = false;
4824 auto aggressive =
11fdf7f2 4825 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
a8e16298 4826 auto local_fallback_retries =
11fdf7f2 4827 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
20effc67 4828
a8e16298 4829 while (max--) {
92f5a8d4 4830 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
a8e16298
TL
4831 // build overfull and underfull
4832 set<int> overfull;
92f5a8d4
TL
4833 set<int> more_overfull;
4834 bool using_more_overfull = false;
a8e16298 4835 vector<int> underfull;
92f5a8d4 4836 vector<int> more_underfull;
20effc67
TL
4837 fill_overfull_underfull(cct, deviation_osd, max_deviation,
4838 overfull, more_overfull,
4839 underfull, more_underfull);
7c673cae 4840
92f5a8d4
TL
4841 if (underfull.empty() && overfull.empty()) {
4842 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
7c673cae 4843 break;
a8e16298 4844 }
92f5a8d4
TL
4845 if (overfull.empty() && !underfull.empty()) {
4846 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
4847 overfull = more_overfull;
4848 using_more_overfull = true;
4849 }
7c673cae 4850
a8e16298
TL
4851 ldout(cct, 10) << " overfull " << overfull
4852 << " underfull " << underfull
4853 << dendl;
4854 set<pg_t> to_skip;
4855 uint64_t local_fallback_retried = 0;
4856
4857 retry:
4858
4859 set<pg_t> to_unmap;
4860 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4861 auto temp_pgs_by_osd = pgs_by_osd;
4862 // always start with fullest, break if we find any changes to make
7c673cae 4863 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
92f5a8d4 4864 if (skip_overfull && !underfull.empty()) {
a8e16298
TL
4865 ldout(cct, 10) << " skipping overfull " << dendl;
4866 break; // fall through to check underfull
4867 }
7c673cae 4868 int osd = p->second;
31f18b77 4869 float deviation = p->first;
9f95a23c
TL
4870 if (deviation < 0) {
4871 ldout(cct, 10) << " hitting underfull osds now"
4872 << " when trying to remap overfull osds"
4873 << dendl;
4874 break;
4875 }
7c673cae 4876 float target = osd_weight[osd] * pgs_per_weight;
92f5a8d4
TL
4877 ldout(cct, 10) << " Overfull search osd." << osd
4878 << " target " << target
4879 << " deviation " << deviation
4880 << dendl;
a8e16298 4881 ceph_assert(target > 0);
92f5a8d4 4882 if (!using_more_overfull && deviation <= max_deviation) {
7c673cae 4883 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4884 << " target " << target
4885 << " deviation " << deviation
92f5a8d4 4886 << " < max deviation " << max_deviation
a8e16298 4887 << dendl;
7c673cae
FG
4888 break;
4889 }
7c673cae 4890
a8e16298
TL
4891 vector<pg_t> pgs;
4892 pgs.reserve(pgs_by_osd[osd].size());
4893 for (auto& pg : pgs_by_osd[osd]) {
4894 if (to_skip.count(pg))
4895 continue;
4896 pgs.push_back(pg);
4897 }
4898 if (aggressive) {
4899 // shuffle PG list so they all get equal (in)attention
20effc67 4900 std::shuffle(pgs.begin(), pgs.end(), get_random_engine(cct, p_seed));
a8e16298 4901 }
7c673cae 4902 // look for remaps we can un-remap
20effc67
TL
4903 if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd,
4904 temp_pgs_by_osd, to_unmap, to_upmap))
4905 goto test_change;
7c673cae 4906
a8e16298 4907 // try upmap
7c673cae 4908 for (auto pg : pgs) {
20effc67
TL
4909 auto temp_it = tmp_osd_map.pg_upmap.find(pg);
4910 if (temp_it != tmp_osd_map.pg_upmap.end()) {
a8e16298
TL
4911 // leave pg_upmap alone
4912 // it must be specified by admin since balancer does not
4913 // support pg_upmap yet
4914 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4915 << temp_it->second << ", skipping"
4916 << dendl;
7c673cae
FG
4917 continue;
4918 }
20effc67 4919 auto pg_pool_size = tmp_osd_map.get_pg_pool_size(pg);
a8e16298
TL
4920 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4921 set<int> existing;
20effc67
TL
4922 auto it = tmp_osd_map.pg_upmap_items.find(pg);
4923 if (it != tmp_osd_map.pg_upmap_items.end()) {
4924 auto& um_items = it->second;
4925 if (um_items.size() >= (size_t)pg_pool_size) {
4926 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4927 << um_items << ", skipping"
4928 << dendl;
4929 continue;
4930 } else {
4931 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4932 << um_items
4933 << dendl;
4934 new_upmap_items = um_items;
4935 // build existing too (for dedup)
4936 for (auto [um_from, um_to] : um_items) {
4937 existing.insert(um_from);
4938 existing.insert(um_to);
4939 }
4940 }
a8e16298
TL
4941 // fall through
4942 // to see if we can append more remapping pairs
20effc67 4943 }
a8e16298 4944 ldout(cct, 10) << " trying " << pg << dendl;
494da23a 4945 vector<int> raw, orig, out;
20effc67 4946 tmp_osd_map.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
92f5a8d4 4947 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
7c673cae
FG
4948 continue;
4949 }
a8e16298 4950 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4951 if (orig.size() != out.size()) {
4952 continue;
4953 }
a8e16298 4954 ceph_assert(orig != out);
20effc67 4955 int pos = find_best_remap(cct, orig, out, existing, osd_deviation);
92f5a8d4 4956 if (pos != -1) {
a8e16298
TL
4957 // append new remapping pairs slowly
4958 // This way we can make sure that each tiny change will
4959 // definitely make distribution of PGs converging to
4960 // the perfect status.
20effc67
TL
4961 add_remap_pair(cct, orig[pos], out[pos], pg, (size_t)pg_pool_size,
4962 osd, existing, temp_pgs_by_osd,
4963 new_upmap_items, to_upmap);
a8e16298 4964 goto test_change;
7c673cae 4965 }
a8e16298
TL
4966 }
4967 }
7c673cae 4968
a8e16298
TL
4969 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4970 ldout(cct, 10) << " failed to find any changes for overfull osds"
4971 << dendl;
20effc67
TL
4972 for (auto& [deviation, osd] : deviation_osd) {
4973 if (std::find(underfull.begin(), underfull.end(), osd) ==
a8e16298
TL
4974 underfull.end())
4975 break;
a8e16298
TL
4976 float target = osd_weight[osd] * pgs_per_weight;
4977 ceph_assert(target > 0);
92f5a8d4
TL
4978 if (fabsf(deviation) < max_deviation) {
4979 // respect max_deviation too
a8e16298
TL
4980 ldout(cct, 10) << " osd." << osd
4981 << " target " << target
4982 << " deviation " << deviation
92f5a8d4
TL
4983 << " -> absolute " << fabsf(deviation)
4984 << " < max " << max_deviation
a8e16298
TL
4985 << dendl;
4986 break;
4987 }
4988 // look for remaps we can un-remap
20effc67
TL
4989 candidates_t candidates = build_candidates(cct, tmp_osd_map, to_skip,
4990 only_pools, aggressive, p_seed);
4991 if (try_drop_remap_underfull(cct, candidates, osd, temp_pgs_by_osd,
4992 to_unmap, to_upmap)) {
4993 goto test_change;
a8e16298 4994 }
7c673cae 4995 }
a8e16298
TL
4996
4997 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4998 ldout(cct, 10) << " failed to find any changes for underfull osds"
4999 << dendl;
5000 if (!aggressive) {
5001 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
5002 break;
5003 } else if (!skip_overfull) {
5004 // safe to quit because below here we know
5005 // we've done checking both overfull and underfull osds..
5006 ldout(cct, 10) << " break due to not being able to find any"
5007 << " further optimizations"
5008 << dendl;
7c673cae
FG
5009 break;
5010 }
a8e16298
TL
5011 // restart with fullest and do exhaustive searching
5012 skip_overfull = false;
5013 continue;
5014
5015 test_change:
5016
5017 // test change, apply if change is good
5018 ceph_assert(to_unmap.size() || to_upmap.size());
5019 float new_stddev = 0;
5020 map<int,float> temp_osd_deviation;
5021 multimap<float,int> temp_deviation_osd;
20effc67
TL
5022 float cur_max_deviation = calc_deviations(cct, temp_pgs_by_osd, osd_weight,
5023 pgs_per_weight, temp_osd_deviation,
5024 temp_deviation_osd, new_stddev);
a8e16298
TL
5025 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
5026 if (new_stddev >= stddev) {
5027 if (!aggressive) {
5028 ldout(cct, 10) << " break because stddev is not decreasing"
5029 << " and aggressive mode is not enabled"
5030 << dendl;
5031 break;
5032 }
5033 local_fallback_retried++;
5034 if (local_fallback_retried >= local_fallback_retries) {
5035 // does not make progress
5036 // flip *skip_overfull* so both overfull and underfull
5037 // get equal (in)attention
5038 skip_overfull = !skip_overfull;
5039 ldout(cct, 10) << " hit local_fallback_retries "
5040 << local_fallback_retries
5041 << dendl;
5042 continue;
5043 }
5044 for (auto& i : to_unmap)
5045 to_skip.insert(i);
5046 for (auto& i : to_upmap)
5047 to_skip.insert(i.first);
5048 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
5049 << " to_skip " << to_skip
5050 << dendl;
5051 goto retry;
5052 }
5053
5054 // ready to go
5055 ceph_assert(new_stddev < stddev);
5056 stddev = new_stddev;
5057 pgs_by_osd = temp_pgs_by_osd;
5058 osd_deviation = temp_osd_deviation;
5059 deviation_osd = temp_deviation_osd;
20effc67
TL
5060
5061 num_changed += pack_upmap_results(cct, to_unmap, to_upmap, tmp_osd_map, pending_inc);
5062
92f5a8d4
TL
5063 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5064 if (cur_max_deviation <= max_deviation) {
5065 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
5066 << dendl;
5067 break;
5068 }
7c673cae 5069 }
a8e16298 5070 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
5071 return num_changed;
5072}
31f18b77 5073
20effc67
TL
5074float OSDMap::build_pool_pgs_info (
5075 CephContext *cct,
5076 const std::set<int64_t>& only_pools, ///< [optional] restrict to pool
5077 const OSDMap& tmp_osd_map,
5078 int& total_pgs,
5079 map<int,set<pg_t>>& pgs_by_osd,
5080 map<int,float>& osd_weight)
5081{
5082 //
5083 // This function builds some data structures that are used by calc_pg_upmaps.
5084 // Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs
5085 // and returns the osd_weight_total
5086 //
5087 float osd_weight_total = 0.0;
5088 for (auto& [pid, pdata] : pools) {
5089 if (!only_pools.empty() && !only_pools.count(pid))
5090 continue;
5091 for (unsigned ps = 0; ps < pdata.get_pg_num(); ++ps) {
5092 pg_t pg(ps, pid);
5093 vector<int> up;
5094 tmp_osd_map.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
5095 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
5096 for (auto osd : up) {
5097 if (osd != CRUSH_ITEM_NONE)
5098 pgs_by_osd[osd].insert(pg);
5099 }
5100 }
5101 total_pgs += pdata.get_size() * pdata.get_pg_num();
5102
5103 map<int,float> pmap;
5104 int ruleno = pdata.get_crush_rule();
5105 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap);
5106 ldout(cct,20) << __func__ << " pool " << pid
5107 << " ruleno " << ruleno
5108 << " weight-map " << pmap
5109 << dendl;
5110 for (auto [oid, oweight] : pmap) {
5111 auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight;
5112 if (adjusted_weight == 0) {
5113 continue;
5114 }
5115 osd_weight[oid] += adjusted_weight;
5116 osd_weight_total += adjusted_weight;
5117 }
5118 }
5119 for (auto& [oid, oweight] : osd_weight) {
5120 int pgs = 0;
5121 auto p = pgs_by_osd.find(oid);
5122 if (p != pgs_by_osd.end())
5123 pgs = p->second.size();
5124 else
5125 pgs_by_osd.emplace(oid, set<pg_t>());
5126 ldout(cct, 20) << " osd." << oid << " weight " << oweight
5127 << " pgs " << pgs << dendl;
5128 }
5129 return osd_weight_total;
5130
5131} // return total weight of all OSDs
5132
5133float OSDMap::calc_deviations (
5134 CephContext *cct,
5135 const map<int,set<pg_t>>& pgs_by_osd,
5136 const map<int,float>& osd_weight,
5137 float pgs_per_weight,
5138 map<int,float>& osd_deviation,
5139 multimap<float,int>& deviation_osd,
5140 float& stddev) // return current max deviation
5141{
5142 //
5143 // This function calculates the 2 maps osd_deviation and deviation_osd which
5144 // hold the deviation between the current number of PGs which map to an OSD
5145 // and the optimal number. Ot also calculates the stddev of the deviations and
5146 // returns the current max deviation.
5147 // NOTE - the calculation is not exactly stddev it is actually sttdev^2 but as
5148 // long as it is monotonic with stddev (and it is), it is sufficient for
5149 // the balancer code.
5150 //
5151 float cur_max_deviation = 0.0;
5152 stddev = 0.0;
5153 for (auto& [oid, opgs] : pgs_by_osd) {
5154 // make sure osd is still there (belongs to this crush-tree)
5155 ceph_assert(osd_weight.count(oid));
5156 float target = osd_weight.at(oid) * pgs_per_weight;
5157 float deviation = (float)opgs.size() - target;
5158 ldout(cct, 20) << " osd." << oid
5159 << "\tpgs " << opgs.size()
5160 << "\ttarget " << target
5161 << "\tdeviation " << deviation
5162 << dendl;
5163 osd_deviation[oid] = deviation;
5164 deviation_osd.insert(make_pair(deviation, oid));
5165 stddev += deviation * deviation;
5166 if (fabsf(deviation) > cur_max_deviation)
5167 cur_max_deviation = fabsf(deviation);
5168 }
5169 return cur_max_deviation;
5170}
5171
5172void OSDMap::fill_overfull_underfull (
5173 CephContext *cct,
5174 const std::multimap<float,int>& deviation_osd,
5175 int max_deviation,
5176 std::set<int>& overfull,
5177 std::set<int>& more_overfull,
5178 std::vector<int>& underfull,
5179 std::vector<int>& more_underfull)
5180{
5181 //
5182 // This function just fills the overfull and underfull data structures for the
5183 // use of calc_pg_upmaps
5184 //
5185 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
5186 auto& odev = i->first;
5187 auto& oid = i->second;
5188 ldout(cct, 30) << " check " << odev << " <= " << max_deviation << dendl;
5189 if (odev <= 0)
5190 break;
5191 if (odev > max_deviation) {
5192 ldout(cct, 30) << " add overfull osd." << oid << dendl;
5193 overfull.insert(oid);
5194 } else {
5195 more_overfull.insert(oid);
5196 }
5197 }
5198
5199 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
5200 auto& odev = i->first;
5201 auto& oid = i->second;
5202 ldout(cct, 30) << " check " << odev << " >= " << -(int)max_deviation << dendl;
5203 if (odev >= 0)
5204 break;
5205 if (odev < -(int)max_deviation) {
5206 ldout(cct, 30) << " add underfull osd." << oid << dendl;
5207 underfull.push_back(oid);
5208 } else {
5209 more_underfull.push_back(oid);
5210 }
5211 }
5212}
5213
5214int OSDMap::pack_upmap_results(
5215 CephContext *cct,
5216 const std::set<pg_t>& to_unmap,
5217 const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap,
5218 OSDMap& tmp_osd_map,
5219 OSDMap::Incremental *pending_inc)
5220{
5221 //
5222 // This function takes the input from the local variables to_unmap and to_upmap
5223 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5224 // (so that the results are visible outside calc_pg_upmaps)
5225 //
5226 int num_changed = 0;
5227 for (auto& i : to_unmap) {
5228 ldout(cct, 10) << " unmap pg " << i << dendl;
5229 ceph_assert(tmp_osd_map.pg_upmap_items.count(i));
5230 tmp_osd_map.pg_upmap_items.erase(i);
5231 pending_inc->old_pg_upmap_items.insert(i);
5232 ++num_changed;
5233 }
5234 for (auto& [pg, um_items] : to_upmap) {
5235 ldout(cct, 10) << " upmap pg " << pg
5236 << " new pg_upmap_items " << um_items
5237 << dendl;
5238 tmp_osd_map.pg_upmap_items[pg] = um_items;
5239 pending_inc->new_pg_upmap_items[pg] = um_items;
5240 ++num_changed;
5241 }
5242
5243 return num_changed;
5244}
5245
5246std::default_random_engine OSDMap::get_random_engine(
5247 CephContext *cct,
5248 std::random_device::result_type *p_seed)
5249{
5250 //
5251 // This function creates a random_engine to be used for shuffling.
5252 // When p_seed == nullptr it generates random engine with a seed from /dev/random
5253 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5254 // increments seed_set. This is used in order to craete regression test without
5255 // random effect on the results.
5256 //
5257 static std::random_device::result_type seed_set = 0;
5258 std::random_device::result_type seed;
5259 if (p_seed == nullptr) {
5260 std::random_device rd;
5261 seed = rd();
5262 }
5263 else {
5264 seed = *p_seed + seed_set;
5265 ldout(cct, 30) << " Starting random engine with seed "
5266 << seed << dendl;
5267 seed_set++;
5268 }
5269 return std::default_random_engine{seed};
5270}
5271
5272bool OSDMap::try_drop_remap_overfull(
5273 CephContext *cct,
5274 const std::vector<pg_t>& pgs,
5275 const OSDMap& tmp_osd_map,
5276 int osd,
5277 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5278 set<pg_t>& to_unmap,
5279 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5280{
5281 //
5282 // This function tries to drop existimg upmap items which map data to overfull
5283 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5284 // if it found an item that can be dropped, false if not.
5285 //
5286 for (auto pg : pgs) {
5287 auto p = tmp_osd_map.pg_upmap_items.find(pg);
5288 if (p == tmp_osd_map.pg_upmap_items.end())
5289 continue;
5290 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5291 auto& pg_upmap_items = p->second;
5292 for (auto um_pair : pg_upmap_items) {
5293 auto& um_from = um_pair.first;
5294 auto& um_to = um_pair.second;
5295 if (um_to == osd) {
5296 ldout(cct, 10) << " will try dropping existing"
5297 << " remapping pair "
5298 << um_from << " -> " << um_to
5299 << " which remapped " << pg
5300 << " into overfull osd." << osd
5301 << dendl;
5302 temp_pgs_by_osd[um_to].erase(pg);
5303 temp_pgs_by_osd[um_from].insert(pg);
5304 } else {
5305 new_upmap_items.push_back(um_pair);
5306 }
5307 }
5308 if (new_upmap_items.empty()) {
5309 // drop whole item
5310 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5311 << " remapped " << pg << " into overfull osd." << osd
5312 << ", will try cancelling it entirely"
5313 << dendl;
5314 to_unmap.insert(pg);
5315 return true;
5316 } else if (new_upmap_items.size() != pg_upmap_items.size()) {
5317 // drop single remapping pair, updating
5318 ceph_assert(new_upmap_items.size() < pg_upmap_items.size());
5319 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5320 << " remapped " << pg << " into overfull osd." << osd
5321 << ", new_pg_upmap_items now " << new_upmap_items
5322 << dendl;
5323 to_upmap[pg] = new_upmap_items;
5324 return true;
5325 }
5326 }
5327 return false;
5328}
5329
5330bool OSDMap::try_drop_remap_underfull(
5331 CephContext *cct,
5332 const candidates_t& candidates,
5333 int osd,
5334 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5335 set<pg_t>& to_unmap,
5336 map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap)
5337{
5338 //
5339 // This function tries to drop existimg upmap items which map data from underfull
5340 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5341 // if it found an item that can be dropped, false if not.
5342 //
5343 for (auto& [pg, um_pairs] : candidates) {
5344 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5345 for (auto& ump : um_pairs) {
5346 auto& um_from = ump.first;
5347 auto& um_to = ump.second;
5348 if (um_from == osd) {
5349 ldout(cct, 10) << " will try dropping existing"
5350 << " remapping pair "
5351 << um_from << " -> " << um_to
5352 << " which remapped " << pg
5353 << " out from underfull osd." << osd
5354 << dendl;
5355 temp_pgs_by_osd[um_to].erase(pg);
5356 temp_pgs_by_osd[um_from].insert(pg);
5357 } else {
5358 new_upmap_items.push_back(ump);
5359 }
5360 }
5361 if (new_upmap_items.empty()) {
5362 // drop whole item
5363 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5364 << " remapped " << pg
5365 << " out from underfull osd." << osd
5366 << ", will try cancelling it entirely"
5367 << dendl;
5368 to_unmap.insert(pg);
5369 return true;
5370 } else if (new_upmap_items.size() != um_pairs.size()) {
5371 // drop single remapping pair, updating
5372 ceph_assert(new_upmap_items.size() < um_pairs.size());
5373 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5374 << " remapped " << pg
5375 << " out from underfull osd." << osd
5376 << ", new_pg_upmap_items now " << new_upmap_items
5377 << dendl;
5378 to_upmap[pg] = new_upmap_items;
5379 return true;
5380 }
5381 }
5382 return false;
5383}
5384
5385void OSDMap::add_remap_pair(
5386 CephContext *cct,
5387 int orig,
5388 int out,
5389 pg_t pg,
5390 size_t pg_pool_size,
5391 int osd,
5392 set<int>& existing,
5393 map<int,set<pg_t>>& temp_pgs_by_osd,
5394 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items,
5395 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5396{
5397 //
5398 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5399 // the relevant data structures
5400 //
5401 ldout(cct, 10) << " will try adding new remapping pair "
5402 << orig << " -> " << out << " for " << pg
5403 << (orig != osd ? " NOT selected osd" : "")
5404 << dendl;
5405 existing.insert(orig);
5406 existing.insert(out);
5407 temp_pgs_by_osd[orig].erase(pg);
5408 temp_pgs_by_osd[out].insert(pg);
5409 ceph_assert(new_upmap_items.size() < pg_pool_size);
5410 new_upmap_items.push_back(make_pair(orig, out));
5411 // append new remapping pairs slowly
5412 // This way we can make sure that each tiny change will
5413 // definitely make distribution of PGs converging to
5414 // the perfect status.
5415 to_upmap[pg] = new_upmap_items;
5416
5417}
5418
5419int OSDMap::find_best_remap (
5420 CephContext *cct,
5421 const vector<int>& orig,
5422 const vector<int>& out,
5423 const set<int>& existing,
5424 const map<int,float> osd_deviation)
5425{
5426 //
5427 // Find the best remap from the suggestions in orig and out - the best remap
5428 // is the one which maps from the OSD with the largest deviatoion (from the
5429 // OSDs which are part of orig)
5430 //
5431 int best_pos = -1;
5432 float max_dev = 0;
5433 for (unsigned i = 0; i < out.size(); ++i) {
5434 if (orig[i] == out[i])
5435 continue; // skip invalid remappings
5436 if (existing.count(orig[i]) || existing.count(out[i]))
5437 continue; // we want new remappings only!
5438 if (osd_deviation.at(orig[i]) > max_dev) {
5439 max_dev = osd_deviation.at(orig[i]);
5440 best_pos = i;
5441 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation.at(orig[i]) << dendl;
5442 }
5443 }
5444 return best_pos;
5445}
5446
5447OSDMap::candidates_t OSDMap::build_candidates(
5448 CephContext *cct,
5449 const OSDMap& tmp_osd_map,
5450 const set<pg_t> to_skip,
5451 const set<int64_t>& only_pools,
5452 bool aggressive,
5453 std::random_device::result_type *p_seed)
5454{
5455 //
5456 // build the candidates data structure
5457 //
5458 candidates_t candidates;
5459 candidates.reserve(tmp_osd_map.pg_upmap_items.size());
5460 for (auto& [pg, um_pair] : tmp_osd_map.pg_upmap_items) {
5461 if (to_skip.count(pg))
5462 continue;
5463 if (!only_pools.empty() && !only_pools.count(pg.pool()))
5464 continue;
5465 candidates.push_back(make_pair(pg, um_pair));
5466 }
5467 if (aggressive) {
5468 // shuffle candidates so they all get equal (in)attention
5469 std::shuffle(candidates.begin(), candidates.end(), get_random_engine(cct, p_seed));
5470 }
5471 return candidates;
5472}
5473
31f18b77
FG
5474int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
5475{
5476 return crush->get_leaves(name, osds);
5477}
5478
3efd9988
FG
5479// get pools whose crush rules might reference the given osd
5480void OSDMap::get_pool_ids_by_osd(CephContext *cct,
5481 int osd,
5482 set<int64_t> *pool_ids) const
5483{
11fdf7f2 5484 ceph_assert(pool_ids);
3efd9988
FG
5485 set<int> raw_rules;
5486 int r = crush->get_rules_by_osd(osd, &raw_rules);
5487 if (r < 0) {
5488 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
5489 << dendl;
11fdf7f2 5490 ceph_assert(r >= 0);
3efd9988
FG
5491 }
5492 set<int> rules;
5493 for (auto &i: raw_rules) {
5494 // exclude any dead rule
5495 if (crush_rule_in_use(i)) {
5496 rules.insert(i);
5497 }
5498 }
5499 for (auto &r: rules) {
5500 get_pool_ids_by_rule(r, pool_ids);
5501 }
5502}
5503
31f18b77
FG
5504template <typename F>
5505class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
5506public:
5507 typedef CrushTreeDumper::Dumper<F> Parent;
5508
5509 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2 5510 const PGMap& pgmap_, bool tree_,
9f95a23c 5511 const string& filter) :
c07f9fc5 5512 Parent(crush, osdmap_->get_pool_names()),
31f18b77 5513 osdmap(osdmap_),
11fdf7f2 5514 pgmap(pgmap_),
31f18b77 5515 tree(tree_),
31f18b77
FG
5516 min_var(-1),
5517 max_var(-1),
5518 stddev(0),
5519 sum(0) {
9f95a23c
TL
5520 if (osdmap->crush->name_exists(filter)) {
5521 // filter by crush node
5522 auto item_id = osdmap->crush->get_item_id(filter);
11fdf7f2
TL
5523 allowed.insert(item_id);
5524 osdmap->crush->get_all_children(item_id, &allowed);
9f95a23c
TL
5525 } else if (osdmap->crush->class_exists(filter)) {
5526 // filter by device class
5527 class_id = osdmap->crush->get_class_id(filter);
5528 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
5529 pool_id >= 0) {
5530 // filter by pool
5531 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
5532 set<int> roots;
5533 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
5534 allowed = roots;
5535 for (auto r : roots)
5536 osdmap->crush->get_all_children(r, &allowed);
11fdf7f2
TL
5537 }
5538 average_util = average_utilization();
31f18b77
FG
5539 }
5540
5541protected:
11fdf7f2
TL
5542
5543 bool should_dump(int id) const {
5544 if (!allowed.empty() && !allowed.count(id)) // filter by name
5545 return false;
9f95a23c
TL
5546 if (id >= 0 && class_id >= 0) {
5547 auto item_class_id = osdmap->crush->get_item_class_id(id);
5548 if (item_class_id < 0 || // not bound to a class yet
5549 item_class_id != class_id) // or already bound to a different class
11fdf7f2
TL
5550 return false;
5551 }
5552 return true;
5553 }
5554
5555 set<int> get_dumped_osds() {
9f95a23c 5556 if (allowed.empty() && class_id < 0) {
11fdf7f2
TL
5557 // old way, all
5558 return {};
5559 }
5560 return dumped_osds;
5561 }
5562
31f18b77
FG
5563 void dump_stray(F *f) {
5564 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5565 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 5566 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
5567 }
5568 }
5569
5570 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
f67539c2 5571 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
31f18b77 5572 return;
11fdf7f2
TL
5573 if (!should_dump(qi.id))
5574 return;
31f18b77 5575
11fdf7f2
TL
5576 if (!qi.is_bucket())
5577 dumped_osds.insert(qi.id);
31f18b77 5578 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
5579 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5580 kb_used_meta = 0, kb_avail = 0;
31f18b77 5581 double util = 0;
11fdf7f2
TL
5582 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5583 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
5584 if (kb_used && kb)
5585 util = 100.0 * (double)kb_used / (double)kb;
5586
5587 double var = 1.0;
5588 if (average_util)
5589 var = util / average_util;
5590
11fdf7f2 5591 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 5592
11fdf7f2
TL
5593 dump_item(qi, reweight, kb, kb_used,
5594 kb_used_data, kb_used_omap, kb_used_meta,
5595 kb_avail, util, var, num_pgs, f);
31f18b77
FG
5596
5597 if (!qi.is_bucket() && reweight > 0) {
5598 if (min_var < 0 || var < min_var)
5599 min_var = var;
5600 if (max_var < 0 || var > max_var)
5601 max_var = var;
5602
5603 double dev = util - average_util;
5604 dev *= dev;
5605 stddev += reweight * dev;
5606 sum += reweight;
5607 }
5608 }
5609
5610 virtual void dump_item(const CrushTreeDumper::Item &qi,
5611 float &reweight,
5612 int64_t kb,
5613 int64_t kb_used,
11fdf7f2
TL
5614 int64_t kb_used_data,
5615 int64_t kb_used_omap,
5616 int64_t kb_used_meta,
31f18b77
FG
5617 int64_t kb_avail,
5618 double& util,
5619 double& var,
5620 const size_t num_pgs,
5621 F *f) = 0;
5622
5623 double dev() {
5624 return sum > 0 ? sqrt(stddev / sum) : 0;
5625 }
5626
5627 double average_utilization() {
5628 int64_t kb = 0, kb_used = 0;
5629 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
5630 if (!osdmap->exists(i) ||
5631 osdmap->get_weight(i) == 0 ||
5632 !should_dump(i))
31f18b77 5633 continue;
11fdf7f2
TL
5634 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5635 kb_avail_i;
5636 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5637 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
5638 kb += kb_i;
5639 kb_used += kb_used_i;
5640 }
5641 }
5642 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5643 }
5644
5645 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5646 int64_t* kb_used_data,
5647 int64_t* kb_used_omap,
5648 int64_t* kb_used_meta,
31f18b77 5649 int64_t* kb_avail) const {
11fdf7f2 5650 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 5651 if (!p) return false;
11fdf7f2
TL
5652 *kb = p->statfs.kb();
5653 *kb_used = p->statfs.kb_used_raw();
5654 *kb_used_data = p->statfs.kb_used_data();
5655 *kb_used_omap = p->statfs.kb_used_omap();
5656 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5657 *kb_avail = p->statfs.kb_avail();
5658
f67539c2 5659 return true;
31f18b77
FG
5660 }
5661
5662 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5663 int64_t* kb_used_data,
5664 int64_t* kb_used_omap,
5665 int64_t* kb_used_meta,
31f18b77
FG
5666 int64_t* kb_avail) const {
5667 if (id >= 0) {
11fdf7f2 5668 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
5669 *kb = 0;
5670 *kb_used = 0;
11fdf7f2
TL
5671 *kb_used_data = 0;
5672 *kb_used_omap = 0;
5673 *kb_used_meta = 0;
31f18b77
FG
5674 *kb_avail = 0;
5675 return true;
5676 }
11fdf7f2
TL
5677 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5678 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
5679 }
5680
5681 *kb = 0;
5682 *kb_used = 0;
11fdf7f2
TL
5683 *kb_used_data = 0;
5684 *kb_used_omap = 0;
5685 *kb_used_meta = 0;
31f18b77
FG
5686 *kb_avail = 0;
5687
5688 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5689 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
5690 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5691 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5692 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5693 &kb_used_data_i, &kb_used_omap_i,
5694 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
5695 return false;
5696 *kb += kb_i;
5697 *kb_used += kb_used_i;
11fdf7f2
TL
5698 *kb_used_data += kb_used_data_i;
5699 *kb_used_omap += kb_used_omap_i;
5700 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
5701 *kb_avail += kb_avail_i;
5702 }
f67539c2 5703 return true;
31f18b77
FG
5704 }
5705
5706protected:
5707 const OSDMap *osdmap;
11fdf7f2 5708 const PGMap& pgmap;
31f18b77
FG
5709 bool tree;
5710 double average_util;
5711 double min_var;
5712 double max_var;
5713 double stddev;
5714 double sum;
9f95a23c 5715 int class_id = -1;
11fdf7f2
TL
5716 set<int> allowed;
5717 set<int> dumped_osds;
31f18b77
FG
5718};
5719
5720
5721class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5722public:
5723 typedef OSDUtilizationDumper<TextTable> Parent;
5724
5725 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5726 const PGMap& pgmap, bool tree,
9f95a23c
TL
5727 const string& filter) :
5728 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5729
5730 void dump(TextTable *tbl) {
5731 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 5732 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5733 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5734 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5735 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
5736 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5737 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5738 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5739 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5740 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5741 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5742 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5743 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 5744 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5745 if (tree)
5746 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5747
5748 Parent::dump(tbl);
5749
5750 dump_stray(tbl);
5751
11fdf7f2 5752 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
5753 *tbl << ""
5754 << ""
5755 << "" << "TOTAL"
11fdf7f2
TL
5756 << byte_u_t(sum.statfs.total)
5757 << byte_u_t(sum.statfs.get_used_raw())
5758 << byte_u_t(sum.statfs.allocated)
5759 << byte_u_t(sum.statfs.omap_allocated)
5760 << byte_u_t(sum.statfs.internal_metadata)
5761 << byte_u_t(sum.statfs.available)
31f18b77
FG
5762 << lowprecision_t(average_util)
5763 << ""
5764 << TextTable::endrow;
5765 }
5766
5767protected:
5768 struct lowprecision_t {
5769 float v;
5770 explicit lowprecision_t(float _v) : v(_v) {}
5771 };
5772 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5773
5774 using OSDUtilizationDumper<TextTable>::dump_item;
5775 void dump_item(const CrushTreeDumper::Item &qi,
5776 float &reweight,
5777 int64_t kb,
5778 int64_t kb_used,
11fdf7f2
TL
5779 int64_t kb_used_data,
5780 int64_t kb_used_omap,
5781 int64_t kb_used_meta,
31f18b77
FG
5782 int64_t kb_avail,
5783 double& util,
5784 double& var,
5785 const size_t num_pgs,
5786 TextTable *tbl) override {
224ce89b
WB
5787 const char *c = crush->get_item_class(qi.id);
5788 if (!c)
5789 c = "";
31f18b77 5790 *tbl << qi.id
224ce89b 5791 << c
31f18b77
FG
5792 << weightf_t(qi.weight)
5793 << weightf_t(reweight)
1adf2230
AA
5794 << byte_u_t(kb << 10)
5795 << byte_u_t(kb_used << 10)
11fdf7f2
TL
5796 << byte_u_t(kb_used_data << 10)
5797 << byte_u_t(kb_used_omap << 10)
5798 << byte_u_t(kb_used_meta << 10)
1adf2230 5799 << byte_u_t(kb_avail << 10)
31f18b77
FG
5800 << lowprecision_t(util)
5801 << lowprecision_t(var);
5802
5803 if (qi.is_bucket()) {
5804 *tbl << "-";
11fdf7f2 5805 *tbl << "";
31f18b77
FG
5806 } else {
5807 *tbl << num_pgs;
11fdf7f2
TL
5808 if (osdmap->is_up(qi.id)) {
5809 *tbl << "up";
5810 } else if (osdmap->is_destroyed(qi.id)) {
5811 *tbl << "destroyed";
5812 } else {
5813 *tbl << "down";
5814 }
31f18b77
FG
5815 }
5816
5817 if (tree) {
5818 ostringstream name;
5819 for (int k = 0; k < qi.depth; k++)
5820 name << " ";
5821 if (qi.is_bucket()) {
5822 int type = crush->get_bucket_type(qi.id);
5823 name << crush->get_type_name(type) << " "
5824 << crush->get_item_name(qi.id);
5825 } else {
5826 name << "osd." << qi.id;
5827 }
5828 *tbl << name.str();
5829 }
5830
5831 *tbl << TextTable::endrow;
5832 }
5833
5834public:
5835 string summary() {
5836 ostringstream out;
5837 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5838 << "/" << lowprecision_t(max_var) << " "
5839 << "STDDEV: " << lowprecision_t(dev());
5840 return out.str();
5841 }
5842};
5843
5844ostream& operator<<(ostream& out,
5845 const OSDUtilizationPlainDumper::lowprecision_t& v)
5846{
5847 if (v.v < -0.01) {
5848 return out << "-";
5849 } else if (v.v < 0.001) {
5850 return out << "0";
5851 } else {
5852 std::streamsize p = out.precision();
5853 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5854 }
5855}
5856
5857class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5858public:
5859 typedef OSDUtilizationDumper<Formatter> Parent;
5860
5861 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5862 const PGMap& pgmap, bool tree,
9f95a23c
TL
5863 const string& filter) :
5864 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5865
5866 void dump(Formatter *f) {
5867 f->open_array_section("nodes");
5868 Parent::dump(f);
5869 f->close_section();
5870
5871 f->open_array_section("stray");
5872 dump_stray(f);
5873 f->close_section();
5874 }
5875
5876protected:
5877 using OSDUtilizationDumper<Formatter>::dump_item;
5878 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
5879 float &reweight,
5880 int64_t kb,
5881 int64_t kb_used,
5882 int64_t kb_used_data,
5883 int64_t kb_used_omap,
5884 int64_t kb_used_meta,
5885 int64_t kb_avail,
5886 double& util,
5887 double& var,
5888 const size_t num_pgs,
5889 Formatter *f) override {
31f18b77 5890 f->open_object_section("item");
c07f9fc5 5891 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
5892 f->dump_float("reweight", reweight);
5893 f->dump_int("kb", kb);
5894 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
5895 f->dump_int("kb_used_data", kb_used_data);
5896 f->dump_int("kb_used_omap", kb_used_omap);
5897 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
5898 f->dump_int("kb_avail", kb_avail);
5899 f->dump_float("utilization", util);
5900 f->dump_float("var", var);
5901 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
5902 if (!qi.is_bucket()) {
5903 if (osdmap->is_up(qi.id)) {
5904 f->dump_string("status", "up");
5905 } else if (osdmap->is_destroyed(qi.id)) {
5906 f->dump_string("status", "destroyed");
5907 } else {
5908 f->dump_string("status", "down");
5909 }
5910 }
31f18b77
FG
5911 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5912 f->close_section();
5913 }
5914
5915public:
5916 void summary(Formatter *f) {
5917 f->open_object_section("summary");
11fdf7f2
TL
5918 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5919 auto& s = sum.statfs;
5920
5921 f->dump_int("total_kb", s.kb());
5922 f->dump_int("total_kb_used", s.kb_used_raw());
5923 f->dump_int("total_kb_used_data", s.kb_used_data());
5924 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5925 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5926 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
5927 f->dump_float("average_utilization", average_util);
5928 f->dump_float("min_var", min_var);
5929 f->dump_float("max_var", max_var);
5930 f->dump_float("dev", dev());
5931 f->close_section();
5932 }
5933};
5934
5935void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
5936 const PGMap& pgmap,
5937 ostream& out,
5938 Formatter *f,
5939 bool tree,
9f95a23c 5940 const string& filter)
31f18b77
FG
5941{
5942 const CrushWrapper *crush = osdmap.crush.get();
5943 if (f) {
5944 f->open_object_section("df");
9f95a23c 5945 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5946 d.dump(f);
5947 d.summary(f);
5948 f->close_section();
5949 f->flush(out);
5950 } else {
9f95a23c 5951 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5952 TextTable tbl;
5953 d.dump(&tbl);
5954 out << tbl << d.summary() << "\n";
5955 }
5956}
224ce89b 5957
92f5a8d4
TL
5958void OSDMap::check_health(CephContext *cct,
5959 health_check_map_t *checks) const
224ce89b
WB
5960{
5961 int num_osds = get_num_osds();
5962
5963 // OSD_DOWN
5964 // OSD_$subtree_DOWN
5965 // OSD_ORPHAN
5966 if (num_osds >= 0) {
5967 int num_in_osds = 0;
5968 int num_down_in_osds = 0;
5969 set<int> osds;
5970 set<int> down_in_osds;
5971 set<int> up_in_osds;
5972 set<int> subtree_up;
5973 unordered_map<int, set<int> > subtree_type_down;
5974 unordered_map<int, int> num_osds_subtree;
5975 int max_type = crush->get_max_type_id();
5976
5977 for (int i = 0; i < get_max_osd(); i++) {
5978 if (!exists(i)) {
5979 if (crush->item_exists(i)) {
5980 osds.insert(i);
5981 }
5982 continue;
5983 }
f67539c2 5984 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
224ce89b
WB
5985 continue;
5986 ++num_in_osds;
5987 if (down_in_osds.count(i) || up_in_osds.count(i))
5988 continue;
5989 if (!is_up(i)) {
5990 down_in_osds.insert(i);
5991 int parent_id = 0;
5992 int current = i;
5993 for (int type = 0; type <= max_type; type++) {
5994 if (!crush->get_type_name(type))
5995 continue;
5996 int r = crush->get_immediate_parent_id(current, &parent_id);
5997 if (r == -ENOENT)
5998 break;
5999 // break early if this parent is already marked as up
6000 if (subtree_up.count(parent_id))
6001 break;
6002 type = crush->get_bucket_type(parent_id);
6003 if (!subtree_type_is_down(
92f5a8d4 6004 cct, parent_id, type,
224ce89b
WB
6005 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
6006 break;
6007 current = parent_id;
6008 }
6009 }
6010 }
6011
6012 // calculate the number of down osds in each down subtree and
6013 // store it in num_osds_subtree
6014 for (int type = 1; type <= max_type; type++) {
6015 if (!crush->get_type_name(type))
6016 continue;
6017 for (auto j = subtree_type_down[type].begin();
6018 j != subtree_type_down[type].end();
6019 ++j) {
6020 list<int> children;
6021 int num = 0;
6022 int num_children = crush->get_children(*j, &children);
6023 if (num_children == 0)
6024 continue;
6025 for (auto l = children.begin(); l != children.end(); ++l) {
6026 if (*l >= 0) {
6027 ++num;
6028 } else if (num_osds_subtree[*l] > 0) {
6029 num = num + num_osds_subtree[*l];
6030 }
6031 }
6032 num_osds_subtree[*j] = num;
6033 }
6034 }
6035 num_down_in_osds = down_in_osds.size();
11fdf7f2 6036 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
6037 if (num_down_in_osds > 0) {
6038 // summary of down subtree types and osds
6039 for (int type = max_type; type > 0; type--) {
6040 if (!crush->get_type_name(type))
6041 continue;
6042 if (subtree_type_down[type].size() > 0) {
6043 ostringstream ss;
6044 ss << subtree_type_down[type].size() << " "
6045 << crush->get_type_name(type);
6046 if (subtree_type_down[type].size() > 1) {
6047 ss << "s";
6048 }
6049 int sum_down_osds = 0;
6050 for (auto j = subtree_type_down[type].begin();
6051 j != subtree_type_down[type].end();
6052 ++j) {
6053 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
6054 }
6055 ss << " (" << sum_down_osds << " osds) down";
6056 string err = string("OSD_") +
6057 string(crush->get_type_name(type)) + "_DOWN";
6058 boost::to_upper(err);
9f95a23c
TL
6059 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
6060 subtree_type_down[type].size());
224ce89b
WB
6061 for (auto j = subtree_type_down[type].rbegin();
6062 j != subtree_type_down[type].rend();
6063 ++j) {
6064 ostringstream ss;
6065 ss << crush->get_type_name(type);
6066 ss << " ";
6067 ss << crush->get_item_name(*j);
6068 // at the top level, do not print location
6069 if (type != max_type) {
6070 ss << " (";
6071 ss << crush->get_full_location_ordered_string(*j);
6072 ss << ")";
6073 }
6074 int num = num_osds_subtree[*j];
6075 ss << " (" << num << " osds)";
6076 ss << " is down";
6077 d.detail.push_back(ss.str());
6078 }
6079 }
6080 }
6081 ostringstream ss;
6082 ss << down_in_osds.size() << " osds down";
9f95a23c
TL
6083 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
6084 down_in_osds.size());
224ce89b
WB
6085 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
6086 ostringstream ss;
6087 ss << "osd." << *it << " (";
6088 ss << crush->get_full_location_ordered_string(*it);
6089 ss << ") is down";
6090 d.detail.push_back(ss.str());
6091 }
6092 }
6093
6094 if (!osds.empty()) {
6095 ostringstream ss;
6096 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
9f95a23c
TL
6097 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
6098 osds.size());
224ce89b
WB
6099 for (auto osd : osds) {
6100 ostringstream ss;
6101 ss << "osd." << osd << " exists in crush map but not in osdmap";
6102 d.detail.push_back(ss.str());
6103 }
6104 }
6105 }
6106
eafe8130
TL
6107 std::list<std::string> scrub_messages;
6108 bool noscrub = false, nodeepscrub = false;
6109 for (const auto &p : pools) {
6110 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
6111 ostringstream ss;
6112 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
6113 scrub_messages.push_back(ss.str());
6114 noscrub = true;
6115 }
6116 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
6117 ostringstream ss;
6118 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
6119 scrub_messages.push_back(ss.str());
6120 nodeepscrub = true;
6121 }
6122 }
6123 if (noscrub || nodeepscrub) {
6124 string out = "";
6125 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
6126 out += nodeepscrub ? "nodeep-scrub" : "";
6127 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
9f95a23c 6128 "Some pool(s) have the " + out + " flag(s) set", 0);
eafe8130
TL
6129 d.detail.splice(d.detail.end(), scrub_messages);
6130 }
6131
224ce89b
WB
6132 // OSD_OUT_OF_ORDER_FULL
6133 {
6134 // An osd could configure failsafe ratio, to something different
6135 // but for now assume it is the same here.
92f5a8d4 6136 float fsr = cct->_conf->osd_failsafe_full_ratio;
224ce89b
WB
6137 if (fsr > 1.0) fsr /= 100;
6138 float fr = get_full_ratio();
6139 float br = get_backfillfull_ratio();
6140 float nr = get_nearfull_ratio();
6141
6142 list<string> detail;
6143 // These checks correspond to how OSDService::check_full_status() in an OSD
6144 // handles the improper setting of these values.
6145 if (br < nr) {
6146 ostringstream ss;
6147 ss << "backfillfull_ratio (" << br
6148 << ") < nearfull_ratio (" << nr << "), increased";
6149 detail.push_back(ss.str());
6150 br = nr;
6151 }
6152 if (fr < br) {
6153 ostringstream ss;
6154 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
6155 << "), increased";
6156 detail.push_back(ss.str());
6157 fr = br;
6158 }
6159 if (fsr < fr) {
6160 ostringstream ss;
6161 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
6162 << "), increased";
6163 detail.push_back(ss.str());
6164 }
6165 if (!detail.empty()) {
6166 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
9f95a23c 6167 "full ratio(s) out of order", 0);
224ce89b
WB
6168 d.detail.swap(detail);
6169 }
6170 }
6171
6172 // OSD_FULL
6173 // OSD_NEARFULL
6174 // OSD_BACKFILLFULL
6175 // OSD_FAILSAFE_FULL
6176 {
6177 set<int> full, backfillfull, nearfull;
6178 get_full_osd_counts(&full, &backfillfull, &nearfull);
6179 if (full.size()) {
6180 ostringstream ss;
6181 ss << full.size() << " full osd(s)";
9f95a23c 6182 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
224ce89b
WB
6183 for (auto& i: full) {
6184 ostringstream ss;
6185 ss << "osd." << i << " is full";
6186 d.detail.push_back(ss.str());
6187 }
6188 }
6189 if (backfillfull.size()) {
6190 ostringstream ss;
6191 ss << backfillfull.size() << " backfillfull osd(s)";
9f95a23c
TL
6192 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
6193 backfillfull.size());
224ce89b
WB
6194 for (auto& i: backfillfull) {
6195 ostringstream ss;
6196 ss << "osd." << i << " is backfill full";
6197 d.detail.push_back(ss.str());
6198 }
6199 }
6200 if (nearfull.size()) {
6201 ostringstream ss;
6202 ss << nearfull.size() << " nearfull osd(s)";
9f95a23c 6203 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
224ce89b
WB
6204 for (auto& i: nearfull) {
6205 ostringstream ss;
6206 ss << "osd." << i << " is near full";
6207 d.detail.push_back(ss.str());
6208 }
6209 }
6210 }
6211
6212 // OSDMAP_FLAGS
6213 {
6214 // warn about flags
6215 uint64_t warn_flags =
224ce89b
WB
6216 CEPH_OSDMAP_PAUSERD |
6217 CEPH_OSDMAP_PAUSEWR |
6218 CEPH_OSDMAP_PAUSEREC |
6219 CEPH_OSDMAP_NOUP |
6220 CEPH_OSDMAP_NODOWN |
6221 CEPH_OSDMAP_NOIN |
6222 CEPH_OSDMAP_NOOUT |
6223 CEPH_OSDMAP_NOBACKFILL |
6224 CEPH_OSDMAP_NORECOVER |
6225 CEPH_OSDMAP_NOSCRUB |
6226 CEPH_OSDMAP_NODEEP_SCRUB |
6227 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 6228 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
6229 CEPH_OSDMAP_NOREBALANCE;
6230 if (test_flag(warn_flags)) {
6231 ostringstream ss;
9f95a23c
TL
6232 string s = get_flag_string(get_flags() & warn_flags);
6233 ss << s << " flag(s) set";
6234 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
6235 s.size() /* kludgey but sufficient */);
224ce89b
WB
6236 }
6237 }
6238
6239 // OSD_FLAGS
6240 {
6241 list<string> detail;
6242 const unsigned flags =
6243 CEPH_OSD_NOUP |
6244 CEPH_OSD_NOIN |
6245 CEPH_OSD_NODOWN |
6246 CEPH_OSD_NOOUT;
6247 for (int i = 0; i < max_osd; ++i) {
6248 if (osd_state[i] & flags) {
6249 ostringstream ss;
6250 set<string> states;
6251 OSDMap::calc_state_set(osd_state[i] & flags, states);
6252 ss << "osd." << i << " has flags " << states;
6253 detail.push_back(ss.str());
6254 }
6255 }
81eedcae
TL
6256 for (auto& i : crush_node_flags) {
6257 if (i.second && crush->item_exists(i.first)) {
6258 ostringstream ss;
6259 set<string> states;
6260 OSDMap::calc_state_set(i.second, states);
6261 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
6262 const char *tn = crush->get_type_name(t);
6263 ss << (tn ? tn : "node") << " "
6264 << crush->get_item_name(i.first) << " has flags " << states;
6265 detail.push_back(ss.str());
6266 }
6267 }
6268 for (auto& i : device_class_flags) {
6269 const char* class_name = crush->get_class_name(i.first);
6270 if (i.second && class_name) {
6271 ostringstream ss;
6272 set<string> states;
6273 OSDMap::calc_state_set(i.second, states);
6274 ss << "device class '" << class_name << "' has flags " << states;
6275 detail.push_back(ss.str());
6276 }
6277 }
224ce89b
WB
6278 if (!detail.empty()) {
6279 ostringstream ss;
81eedcae 6280 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
9f95a23c 6281 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
224ce89b
WB
6282 d.detail.swap(detail);
6283 }
6284 }
6285
6286 // OLD_CRUSH_TUNABLES
92f5a8d4 6287 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
224ce89b 6288 string min = crush->get_min_required_version();
92f5a8d4 6289 if (min < cct->_conf->mon_crush_min_required_version) {
224ce89b
WB
6290 ostringstream ss;
6291 ss << "crush map has legacy tunables (require " << min
92f5a8d4 6292 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
9f95a23c 6293 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
f67539c2 6294 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
6295 }
6296 }
6297
6298 // OLD_CRUSH_STRAW_CALC_VERSION
92f5a8d4 6299 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
6300 if (crush->get_straw_calc_version() == 0) {
6301 ostringstream ss;
6302 ss << "crush map has straw_calc_version=0";
9f95a23c 6303 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
224ce89b 6304 d.detail.push_back(
f67539c2 6305 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
6306 }
6307 }
6308
6309 // CACHE_POOL_NO_HIT_SET
92f5a8d4 6310 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b 6311 list<string> detail;
9f95a23c 6312 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
224ce89b
WB
6313 const pg_pool_t& info = p->second;
6314 if (info.cache_mode_requires_hit_set() &&
6315 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
6316 ostringstream ss;
6317 ss << "pool '" << get_pool_name(p->first)
6318 << "' with cache_mode " << info.get_cache_mode_name()
6319 << " needs hit_set_type to be set but it is not";
6320 detail.push_back(ss.str());
6321 }
6322 }
6323 if (!detail.empty()) {
6324 ostringstream ss;
6325 ss << detail.size() << " cache pools are missing hit_sets";
9f95a23c
TL
6326 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
6327 detail.size());
224ce89b
WB
6328 d.detail.swap(detail);
6329 }
6330 }
6331
6332 // OSD_NO_SORTBITWISE
11fdf7f2 6333 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 6334 ostringstream ss;
11fdf7f2 6335 ss << "'sortbitwise' flag is not set";
9f95a23c 6336 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
6337 }
6338
6339 // OSD_UPGRADE_FINISHED
20effc67
TL
6340 if (auto require_release = pending_require_osd_release()) {
6341 ostringstream ss;
6342 ss << "all OSDs are running " << *require_release << " or later but"
6343 << " require_osd_release < " << *require_release;
6344 auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
6345 d.detail.push_back(ss.str());
6346 }
224ce89b 6347
3efd9988 6348 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 6349 {
3efd9988 6350 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
6351 for (auto it : get_pools()) {
6352 const pg_pool_t &pool = it.second;
3efd9988 6353 const string& pool_name = get_pool_name(it.first);
224ce89b 6354 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 6355 stringstream ss;
11fdf7f2 6356 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
6357 // may run out of space too,
6358 // but we want EQUOTA taking precedence
11fdf7f2 6359 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
6360 } else {
6361 ss << "pool '" << pool_name << "' is full (no space)";
6362 }
6363 full_detail.push_back(ss.str());
6364 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
6365 stringstream ss;
6366 ss << "pool '" << pool_name << "' is backfillfull";
6367 backfillfull_detail.push_back(ss.str());
6368 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
6369 stringstream ss;
6370 ss << "pool '" << pool_name << "' is nearfull";
6371 nearfull_detail.push_back(ss.str());
224ce89b
WB
6372 }
6373 }
3efd9988 6374 if (!full_detail.empty()) {
224ce89b 6375 ostringstream ss;
3efd9988 6376 ss << full_detail.size() << " pool(s) full";
9f95a23c 6377 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
3efd9988
FG
6378 d.detail.swap(full_detail);
6379 }
6380 if (!backfillfull_detail.empty()) {
6381 ostringstream ss;
6382 ss << backfillfull_detail.size() << " pool(s) backfillfull";
9f95a23c
TL
6383 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
6384 backfillfull_detail.size());
3efd9988
FG
6385 d.detail.swap(backfillfull_detail);
6386 }
6387 if (!nearfull_detail.empty()) {
6388 ostringstream ss;
6389 ss << nearfull_detail.size() << " pool(s) nearfull";
9f95a23c
TL
6390 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
6391 nearfull_detail.size());
3efd9988 6392 d.detail.swap(nearfull_detail);
224ce89b
WB
6393 }
6394 }
92f5a8d4
TL
6395
6396 // POOL_PG_NUM_NOT_POWER_OF_TWO
6397 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
6398 list<string> detail;
6399 for (auto it : get_pools()) {
6400 if (!isp2(it.second.get_pg_num_target())) {
6401 ostringstream ss;
6402 ss << "pool '" << get_pool_name(it.first)
6403 << "' pg_num " << it.second.get_pg_num_target()
6404 << " is not a power of two";
6405 detail.push_back(ss.str());
6406 }
6407 }
6408 if (!detail.empty()) {
6409 ostringstream ss;
6410 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
6411 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
9f95a23c
TL
6412 ss.str(), detail.size());
6413 d.detail.swap(detail);
6414 }
6415 }
6416
6417 // POOL_NO_REDUNDANCY
6418 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
6419 {
6420 list<string> detail;
6421 for (auto it : get_pools()) {
6422 if (it.second.get_size() == 1) {
6423 ostringstream ss;
6424 ss << "pool '" << get_pool_name(it.first)
6425 << "' has no replicas configured";
6426 detail.push_back(ss.str());
6427 }
6428 }
6429 if (!detail.empty()) {
6430 ostringstream ss;
6431 ss << detail.size() << " pool(s) have no replicas configured";
6432 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
6433 ss.str(), detail.size());
92f5a8d4
TL
6434 d.detail.swap(detail);
6435 }
6436 }
f67539c2
TL
6437
6438 // DEGRADED STRETCH MODE
6439 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
6440 if (recovering_stretch_mode) {
6441 stringstream ss;
6442 ss << "We are recovering stretch mode buckets, only requiring "
6443 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6444 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
6445 ss.str(), 0);
6446 } else if (degraded_stretch_mode) {
6447 stringstream ss;
6448 ss << "We are missing stretch mode buckets, only requiring "
6449 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6450 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
6451 ss.str(), 0);
6452 }
6453 }
224ce89b 6454}
35e4c445
FG
6455
6456int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
6457 ostream *ss) const
6458{
6459 out->clear();
6460 for (auto i = ls.begin(); i != ls.end(); ++i) {
6461 if (i == ls.begin() &&
6462 (*i == "any" || *i == "all" || *i == "*")) {
6463 get_all_osds(*out);
6464 break;
6465 }
9f95a23c 6466 long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
35e4c445
FG
6467 if (osd < 0) {
6468 *ss << "invalid osd id '" << *i << "'";
6469 return -EINVAL;
6470 }
6471 out->insert(osd);
6472 }
6473 return 0;
6474}
11fdf7f2
TL
6475
6476void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
6477 string &subtree,
6478 int limit, // how many
6479 set<int> skip,
6480 set<int> *want) const {
6481 if (limit <= 0)
6482 return;
6483 int subtree_type = crush->get_type_id(subtree);
6484 if (subtree_type < 1)
6485 return;
6486 vector<int> subtrees;
6487 crush->get_subtree_of_type(subtree_type, &subtrees);
6488 std::random_device rd;
6489 std::default_random_engine rng{rd()};
6490 std::shuffle(subtrees.begin(), subtrees.end(), rng);
6491 for (auto s : subtrees) {
6492 if (limit <= 0)
6493 break;
6494 if (crush->subtree_contains(s, n))
6495 continue;
6496 vector<int> osds;
6497 crush->get_children_of_type(s, 0, &osds);
6498 if (osds.empty())
6499 continue;
6500 vector<int> up_osds;
6501 for (auto o : osds) {
6502 if (is_up(o) && !skip.count(o))
6503 up_osds.push_back(o);
6504 }
6505 if (up_osds.empty())
6506 continue;
6507 auto it = up_osds.begin();
6508 std::advance(it, (n % up_osds.size()));
6509 want->insert(*it);
6510 --limit;
6511 }
6512}
6513
6514float OSDMap::pool_raw_used_rate(int64_t poolid) const
6515{
6516 const pg_pool_t *pool = get_pg_pool(poolid);
6517 assert(pool != nullptr);
6518
6519 switch (pool->get_type()) {
6520 case pg_pool_t::TYPE_REPLICATED:
6521 return pool->get_size();
11fdf7f2
TL
6522 case pg_pool_t::TYPE_ERASURE:
6523 {
6524 auto& ecp =
6525 get_erasure_code_profile(pool->erasure_code_profile);
6526 auto pm = ecp.find("m");
6527 auto pk = ecp.find("k");
6528 if (pm != ecp.end() && pk != ecp.end()) {
6529 int k = atoi(pk->second.c_str());
6530 int m = atoi(pm->second.c_str());
6531 int mk = m + k;
6532 ceph_assert(mk != 0);
6533 ceph_assert(k != 0);
6534 return (float)mk / k;
6535 } else {
6536 return 0.0;
6537 }
6538 }
6539 break;
6540 default:
6541 ceph_abort_msg("unrecognized pool type");
6542 }
6543}
81eedcae
TL
6544
6545unsigned OSDMap::get_osd_crush_node_flags(int osd) const
6546{
6547 unsigned flags = 0;
6548 if (!crush_node_flags.empty()) {
6549 // the map will contain type -> name
6550 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
6551 for (auto& i : ploc) {
6552 int id = crush->get_item_id(i.second);
6553 auto p = crush_node_flags.find(id);
6554 if (p != crush_node_flags.end()) {
6555 flags |= p->second;
6556 }
6557 }
6558 }
6559 return flags;
6560}
6561
6562unsigned OSDMap::get_crush_node_flags(int id) const
6563{
6564 unsigned flags = 0;
6565 auto it = crush_node_flags.find(id);
6566 if (it != crush_node_flags.end())
6567 flags = it->second;
6568 return flags;
6569}
6570
6571unsigned OSDMap::get_device_class_flags(int id) const
6572{
6573 unsigned flags = 0;
6574 auto it = device_class_flags.find(id);
6575 if (it != device_class_flags.end())
6576 flags = it->second;
6577 return flags;
6578}
20effc67
TL
6579
6580std::optional<std::string> OSDMap::pending_require_osd_release() const
6581{
6582 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY) &&
6583 require_osd_release < ceph_release_t::quincy) {
6584 return "quincy";
6585 }
6586 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
6587 require_osd_release < ceph_release_t::pacific) {
6588 return "pacific";
6589 }
6590 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
6591 require_osd_release < ceph_release_t::octopus) {
6592 return "octopus";
6593 }
6594 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
6595 require_osd_release < ceph_release_t::nautilus) {
6596 return "nautilus";
6597 }
6598
6599 return std::nullopt;
6600}