]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
import ceph reef 18.2.2
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2 18#include <algorithm>
1e59de90 19#include <bit>
11fdf7f2
TL
20#include <optional>
21#include <random>
1e59de90 22#include <fmt/format.h>
11fdf7f2 23
224ce89b
WB
24#include <boost/algorithm/string.hpp>
25
7c673cae 26#include "OSDMap.h"
7c673cae 27#include "common/config.h"
3efd9988 28#include "common/errno.h"
7c673cae
FG
29#include "common/Formatter.h"
30#include "common/TextTable.h"
31#include "include/ceph_features.h"
9f95a23c 32#include "include/common_fwd.h"
7c673cae
FG
33#include "include/str_map.h"
34
35#include "common/code_environment.h"
224ce89b 36#include "mon/health_check.h"
7c673cae
FG
37
38#include "crush/CrushTreeDumper.h"
39#include "common/Clock.h"
11fdf7f2
TL
40#include "mon/PGMap.h"
41
9f95a23c
TL
42using std::list;
43using std::make_pair;
44using std::map;
45using std::multimap;
46using std::ostream;
47using std::ostringstream;
48using std::pair;
49using std::set;
50using std::string;
51using std::stringstream;
52using std::unordered_map;
53using std::vector;
54
55using ceph::decode;
56using ceph::encode;
57using ceph::Formatter;
58
7c673cae
FG
59#define dout_subsys ceph_subsys_osd
60
61MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
62MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
63
64
65// ----------------------------------
66// osd_info_t
67
68void osd_info_t::dump(Formatter *f) const
69{
70 f->dump_int("last_clean_begin", last_clean_begin);
71 f->dump_int("last_clean_end", last_clean_end);
72 f->dump_int("up_from", up_from);
73 f->dump_int("up_thru", up_thru);
74 f->dump_int("down_at", down_at);
75 f->dump_int("lost_at", lost_at);
76}
77
9f95a23c 78void osd_info_t::encode(ceph::buffer::list& bl) const
7c673cae 79{
11fdf7f2 80 using ceph::encode;
7c673cae 81 __u8 struct_v = 1;
11fdf7f2
TL
82 encode(struct_v, bl);
83 encode(last_clean_begin, bl);
84 encode(last_clean_end, bl);
85 encode(up_from, bl);
86 encode(up_thru, bl);
87 encode(down_at, bl);
88 encode(lost_at, bl);
7c673cae
FG
89}
90
9f95a23c 91void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 92{
11fdf7f2 93 using ceph::decode;
7c673cae 94 __u8 struct_v;
11fdf7f2
TL
95 decode(struct_v, bl);
96 decode(last_clean_begin, bl);
97 decode(last_clean_end, bl);
98 decode(up_from, bl);
99 decode(up_thru, bl);
100 decode(down_at, bl);
101 decode(lost_at, bl);
7c673cae
FG
102}
103
104void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
105{
106 o.push_back(new osd_info_t);
107 o.push_back(new osd_info_t);
108 o.back()->last_clean_begin = 1;
109 o.back()->last_clean_end = 2;
110 o.back()->up_from = 30;
111 o.back()->up_thru = 40;
112 o.back()->down_at = 5;
113 o.back()->lost_at = 6;
114}
115
116ostream& operator<<(ostream& out, const osd_info_t& info)
117{
118 out << "up_from " << info.up_from
119 << " up_thru " << info.up_thru
120 << " down_at " << info.down_at
121 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
122 if (info.lost_at)
123 out << " lost_at " << info.lost_at;
124 return out;
125}
126
127// ----------------------------------
128// osd_xinfo_t
129
130void osd_xinfo_t::dump(Formatter *f) const
131{
132 f->dump_stream("down_stamp") << down_stamp;
133 f->dump_float("laggy_probability", laggy_probability);
134 f->dump_int("laggy_interval", laggy_interval);
135 f->dump_int("features", features);
136 f->dump_unsigned("old_weight", old_weight);
9f95a23c
TL
137 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
138 f->dump_int("dead_epoch", dead_epoch);
7c673cae
FG
139}
140
9f95a23c 141void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
7c673cae 142{
9f95a23c
TL
143 uint8_t v = 4;
144 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
145 v = 3;
146 }
147 ENCODE_START(v, 1, bl);
11fdf7f2 148 encode(down_stamp, bl);
f67539c2 149 __u32 lp = laggy_probability * float(0xfffffffful);
11fdf7f2
TL
150 encode(lp, bl);
151 encode(laggy_interval, bl);
152 encode(features, bl);
153 encode(old_weight, bl);
9f95a23c
TL
154 if (v >= 4) {
155 encode(last_purged_snaps_scrub, bl);
156 encode(dead_epoch, bl);
157 }
7c673cae
FG
158 ENCODE_FINISH(bl);
159}
160
9f95a23c 161void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 162{
9f95a23c 163 DECODE_START(4, bl);
11fdf7f2 164 decode(down_stamp, bl);
7c673cae 165 __u32 lp;
11fdf7f2 166 decode(lp, bl);
7c673cae 167 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 168 decode(laggy_interval, bl);
7c673cae 169 if (struct_v >= 2)
11fdf7f2 170 decode(features, bl);
7c673cae
FG
171 else
172 features = 0;
173 if (struct_v >= 3)
11fdf7f2 174 decode(old_weight, bl);
7c673cae
FG
175 else
176 old_weight = 0;
9f95a23c
TL
177 if (struct_v >= 4) {
178 decode(last_purged_snaps_scrub, bl);
179 decode(dead_epoch, bl);
180 } else {
181 dead_epoch = 0;
182 }
7c673cae
FG
183 DECODE_FINISH(bl);
184}
185
186void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
187{
188 o.push_back(new osd_xinfo_t);
189 o.push_back(new osd_xinfo_t);
190 o.back()->down_stamp = utime_t(2, 3);
191 o.back()->laggy_probability = .123;
192 o.back()->laggy_interval = 123456;
193 o.back()->old_weight = 0x7fff;
194}
195
196ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
197{
198 return out << "down_stamp " << xi.down_stamp
199 << " laggy_probability " << xi.laggy_probability
200 << " laggy_interval " << xi.laggy_interval
9f95a23c
TL
201 << " old_weight " << xi.old_weight
202 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
203 << " dead_epoch " << xi.dead_epoch;
7c673cae
FG
204}
205
206// ----------------------------------
207// OSDMap::Incremental
208
209int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
210{
211 int n = 0;
212 for (auto &weight : new_weight) {
213 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
214 n++; // marked out
215 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
216 n--; // marked in
217 }
218 return n;
219}
220
221int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
222{
223 int n = 0;
224 for (auto &state : new_state) { //
225 if (state.second & CEPH_OSD_UP) {
226 if (previous->is_up(state.first))
227 n++; // marked down
228 else
229 n--; // marked up
230 }
231 }
232 return n;
233}
234
235int OSDMap::Incremental::identify_osd(uuid_d u) const
236{
237 for (auto &uuid : new_uuid)
238 if (uuid.second == u)
239 return uuid.first;
240 return -1;
241}
242
f67539c2
TL
243int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
244 const OSDMap& osdmap)
7c673cae 245{
11fdf7f2 246 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
247
248 for (auto &new_pool : new_pools) {
249 if (!new_pool.second.tiers.empty()) {
250 pg_pool_t& base = new_pool.second;
251
11fdf7f2
TL
252 auto new_rem_it = new_removed_snaps.find(new_pool.first);
253
7c673cae
FG
254 for (const auto &tier_pool : base.tiers) {
255 const auto &r = new_pools.find(tier_pool);
256 pg_pool_t *tier = 0;
257 if (r == new_pools.end()) {
258 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
259 if (!orig) {
260 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
261 return -EIO;
262 }
263 tier = get_new_pool(tier_pool, orig);
264 } else {
265 tier = &r->second;
266 }
267 if (tier->tier_of != new_pool.first) {
268 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
269 return -EIO;
270 }
271
272 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
273 << tier_pool << dendl;
274 tier->snap_seq = base.snap_seq;
275 tier->snap_epoch = base.snap_epoch;
276 tier->snaps = base.snaps;
277 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
278 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
279 pg_pool_t::FLAG_POOL_SNAPS);
280
281 if (new_rem_it != new_removed_snaps.end()) {
282 new_removed_snaps[tier_pool] = new_rem_it->second;
283 }
f67539c2
TL
284
285 tier->application_metadata = base.application_metadata;
7c673cae
FG
286 }
287 }
288 }
289 return 0;
290}
291
28e407b8
AA
292// ----------------------------------
293// OSDMap
7c673cae
FG
294
295bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
296{
297 if (id >= 0)
298 return is_down(id);
299
300 if (down_cache &&
301 down_cache->count(id)) {
302 return true;
303 }
304
305 list<int> children;
306 crush->get_children(id, &children);
307 for (const auto &child : children) {
308 if (!subtree_is_down(child, down_cache)) {
309 return false;
310 }
311 }
312 if (down_cache) {
313 down_cache->insert(id);
314 }
315 return true;
316}
317
318bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
319{
320 // use a stack-local down_cache if we didn't get one from the
321 // caller. then at least this particular call will avoid duplicated
322 // work.
323 set<int> local_down_cache;
324 if (!down_cache) {
325 down_cache = &local_down_cache;
326 }
327
328 int current = id;
329 while (true) {
330 int type;
331 if (current >= 0) {
332 type = 0;
333 } else {
334 type = crush->get_bucket_type(current);
335 }
11fdf7f2 336 ceph_assert(type >= 0);
7c673cae
FG
337
338 if (!subtree_is_down(current, down_cache)) {
339 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
340 return false;
341 }
342
343 // is this a big enough subtree to be marked as down?
344 if (type >= subtree_type) {
345 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
346 return true;
347 }
348
349 int r = crush->get_immediate_parent_id(current, &current);
350 if (r < 0) {
351 return false;
352 }
353 }
354}
355
224ce89b
WB
356bool OSDMap::subtree_type_is_down(
357 CephContext *cct,
358 int id,
359 int subtree_type,
360 set<int> *down_in_osds,
361 set<int> *up_in_osds,
362 set<int> *subtree_up,
363 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
364{
365 if (id >= 0) {
366 bool is_down_ret = is_down(id);
367 if (!is_out(id)) {
368 if (is_down_ret) {
369 down_in_osds->insert(id);
370 } else {
371 up_in_osds->insert(id);
372 }
373 }
374 return is_down_ret;
375 }
376
377 if (subtree_type_down &&
378 (*subtree_type_down)[subtree_type].count(id)) {
379 return true;
380 }
381
382 list<int> children;
383 crush->get_children(id, &children);
384 for (const auto &child : children) {
224ce89b
WB
385 if (!subtree_type_is_down(
386 cct, child, crush->get_bucket_type(child),
387 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
388 subtree_up->insert(id);
389 return false;
390 }
391 }
392 if (subtree_type_down) {
393 (*subtree_type_down)[subtree_type].insert(id);
394 }
395 return true;
396}
397
9f95a23c 398void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
7c673cae 399{
11fdf7f2 400 using ceph::encode;
7c673cae 401 __u16 v = 5;
11fdf7f2
TL
402 encode(v, bl);
403 encode(fsid, bl);
404 encode(epoch, bl);
405 encode(modified, bl);
7c673cae 406 int32_t new_t = new_pool_max;
11fdf7f2
TL
407 encode(new_t, bl);
408 encode(new_flags, bl);
409 encode(fullmap, bl);
410 encode(crush, bl);
7c673cae 411
11fdf7f2
TL
412 encode(new_max_osd, bl);
413 // for encode(new_pools, bl);
7c673cae 414 __u32 n = new_pools.size();
11fdf7f2 415 encode(n, bl);
7c673cae
FG
416 for (const auto &new_pool : new_pools) {
417 n = new_pool.first;
11fdf7f2
TL
418 encode(n, bl);
419 encode(new_pool.second, bl, 0);
7c673cae 420 }
11fdf7f2 421 // for encode(new_pool_names, bl);
7c673cae 422 n = new_pool_names.size();
11fdf7f2 423 encode(n, bl);
7c673cae
FG
424
425 for (const auto &new_pool_name : new_pool_names) {
426 n = new_pool_name.first;
11fdf7f2
TL
427 encode(n, bl);
428 encode(new_pool_name.second, bl);
7c673cae 429 }
11fdf7f2 430 // for encode(old_pools, bl);
7c673cae 431 n = old_pools.size();
11fdf7f2 432 encode(n, bl);
7c673cae
FG
433 for (auto &old_pool : old_pools) {
434 n = old_pool;
11fdf7f2 435 encode(n, bl);
7c673cae 436 }
11fdf7f2 437 encode(new_up_client, bl, 0);
31f18b77
FG
438 {
439 // legacy is map<int32_t,uint8_t>
9f95a23c 440 map<int32_t, uint8_t> os;
31f18b77 441 for (auto p : new_state) {
9f95a23c
TL
442 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
443 // that an old client could not understand.
444 // skip those!
445 uint8_t s = p.second;
446 if (p.second != 0 && s == 0)
447 continue;
448 os[p.first] = s;
449 }
450 uint32_t n = os.size();
451 encode(n, bl);
452 for (auto p : os) {
11fdf7f2 453 encode(p.first, bl);
9f95a23c 454 encode(p.second, bl);
31f18b77
FG
455 }
456 }
11fdf7f2
TL
457 encode(new_weight, bl);
458 // for encode(new_pg_temp, bl);
7c673cae 459 n = new_pg_temp.size();
11fdf7f2 460 encode(n, bl);
7c673cae
FG
461
462 for (const auto &pg_temp : new_pg_temp) {
463 old_pg_t opg = pg_temp.first.get_old_pg();
11fdf7f2
TL
464 encode(opg, bl);
465 encode(pg_temp.second, bl);
7c673cae
FG
466 }
467}
468
9f95a23c 469void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 470{
11fdf7f2 471 using ceph::encode;
7c673cae
FG
472 if ((features & CEPH_FEATURE_PGID64) == 0) {
473 encode_client_old(bl);
474 return;
475 }
476
477 // base
478 __u16 v = 6;
11fdf7f2
TL
479 encode(v, bl);
480 encode(fsid, bl);
481 encode(epoch, bl);
482 encode(modified, bl);
483 encode(new_pool_max, bl);
484 encode(new_flags, bl);
485 encode(fullmap, bl);
486 encode(crush, bl);
487
488 encode(new_max_osd, bl);
489 encode(new_pools, bl, features);
490 encode(new_pool_names, bl);
491 encode(old_pools, bl);
492 encode(new_up_client, bl, features);
31f18b77 493 {
9f95a23c 494 map<int32_t, uint8_t> os;
31f18b77 495 for (auto p : new_state) {
9f95a23c
TL
496 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
497 // that an old client could not understand.
498 // skip those!
499 uint8_t s = p.second;
500 if (p.second != 0 && s == 0)
501 continue;
502 os[p.first] = s;
503 }
504 uint32_t n = os.size();
505 encode(n, bl);
506 for (auto p : os) {
11fdf7f2 507 encode(p.first, bl);
9f95a23c 508 encode(p.second, bl);
31f18b77
FG
509 }
510 }
11fdf7f2
TL
511 encode(new_weight, bl);
512 encode(new_pg_temp, bl);
7c673cae
FG
513
514 // extended
515 __u16 ev = 10;
11fdf7f2
TL
516 encode(ev, bl);
517 encode(new_hb_back_up, bl, features);
518 encode(new_up_thru, bl);
519 encode(new_last_clean_interval, bl);
520 encode(new_lost, bl);
f67539c2
TL
521 encode(new_blocklist, bl, features);
522 encode(old_blocklist, bl, features);
11fdf7f2
TL
523 encode(new_up_cluster, bl, features);
524 encode(cluster_snapshot, bl);
525 encode(new_uuid, bl);
9f95a23c 526 encode(new_xinfo, bl, features);
11fdf7f2
TL
527 encode(new_hb_front_up, bl, features);
528}
529
530template<class T>
9f95a23c 531static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
532{
533 uint32_t n = m.size();
534 encode(n, bl);
535 for (auto& i : m) {
536 encode(i.first, bl);
537 encode(i.second.legacy_addr(), bl, f);
538 }
539}
540
541template<class T>
9f95a23c 542static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
543{
544 uint32_t n = m.size();
545 encode(n, bl);
546 for (auto& i : m) {
547 if (i) {
548 encode(i->legacy_addr(), bl, f);
549 } else {
550 encode(entity_addr_t(), bl, f);
551 }
552 }
7c673cae
FG
553}
554
11fdf7f2
TL
555/* for a description of osdmap incremental versions, and when they were
556 * introduced, please refer to
557 * doc/dev/osd_internals/osdmap_versions.txt
558 */
9f95a23c 559void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 560{
11fdf7f2 561 using ceph::encode;
7c673cae
FG
562 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
563 encode_classic(bl, features);
564 return;
565 }
566
567 // only a select set of callers should *ever* be encoding new
568 // OSDMaps. others should be passing around the canonical encoded
569 // buffers from on high. select out those callers by passing in an
570 // "impossible" feature bit.
11fdf7f2 571 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
572 features &= ~CEPH_FEATURE_RESERVED;
573
574 size_t start_offset = bl.length();
575 size_t tail_offset;
11fdf7f2 576 size_t crc_offset;
9f95a23c 577 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
578
579 // meta-encoding: how we include client-used and osd-specific data
580 ENCODE_START(8, 7, bl);
581
582 {
1e59de90 583 uint8_t v = 9;
7c673cae
FG
584 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
585 v = 3;
11fdf7f2
TL
586 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
587 v = 5;
588 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
589 v = 6;
df9f7d3d 590 } else if (!HAVE_FEATURE(features, SERVER_REEF)) {
1e59de90 591 v = 8;
df9f7d3d 592 }
7c673cae 593 ENCODE_START(v, 1, bl); // client-usable data
11fdf7f2
TL
594 encode(fsid, bl);
595 encode(epoch, bl);
596 encode(modified, bl);
597 encode(new_pool_max, bl);
598 encode(new_flags, bl);
599 encode(fullmap, bl);
600 encode(crush, bl);
601
602 encode(new_max_osd, bl);
603 encode(new_pools, bl, features);
604 encode(new_pool_names, bl);
605 encode(old_pools, bl);
606 if (v >= 7) {
607 encode(new_up_client, bl, features);
608 } else {
609 encode_addrvec_map_as_addr(new_up_client, bl, features);
610 }
31f18b77 611 if (v >= 5) {
11fdf7f2 612 encode(new_state, bl);
31f18b77 613 } else {
9f95a23c 614 map<int32_t, uint8_t> os;
31f18b77 615 for (auto p : new_state) {
9f95a23c
TL
616 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
617 // that an old client could not understand.
618 // skip those!
619 uint8_t s = p.second;
620 if (p.second != 0 && s == 0)
621 continue;
622 os[p.first] = s;
623 }
624 uint32_t n = os.size();
625 encode(n, bl);
626 for (auto p : os) {
627 encode(p.first, bl);
628 encode(p.second, bl);
31f18b77
FG
629 }
630 }
11fdf7f2
TL
631 encode(new_weight, bl);
632 encode(new_pg_temp, bl);
633 encode(new_primary_temp, bl);
634 encode(new_primary_affinity, bl);
635 encode(new_erasure_code_profiles, bl);
636 encode(old_erasure_code_profiles, bl);
7c673cae 637 if (v >= 4) {
11fdf7f2
TL
638 encode(new_pg_upmap, bl);
639 encode(old_pg_upmap, bl);
640 encode(new_pg_upmap_items, bl);
641 encode(old_pg_upmap_items, bl);
642 }
643 if (v >= 6) {
644 encode(new_removed_snaps, bl);
645 encode(new_purged_snaps, bl);
646 }
647 if (v >= 8) {
648 encode(new_last_up_change, bl);
649 encode(new_last_in_change, bl);
7c673cae 650 }
1e59de90
TL
651 if (v >= 9) {
652 encode(new_pg_upmap_primary, bl);
653 encode(old_pg_upmap_primary, bl);
654 }
7c673cae
FG
655 ENCODE_FINISH(bl); // client-usable data
656 }
657
658 {
1e59de90 659 uint8_t target_v = 9; // if bumping this, be aware of allow_crimson 12
7c673cae
FG
660 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
661 target_v = 2;
11fdf7f2
TL
662 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
663 target_v = 6;
7c673cae 664 }
f67539c2 665 if (change_stretch_mode) {
f67539c2
TL
666 target_v = std::max((uint8_t)10, target_v);
667 }
33c7a0ef
TL
668 if (!new_range_blocklist.empty() ||
669 !old_range_blocklist.empty()) {
670 target_v = std::max((uint8_t)11, target_v);
671 }
1e59de90
TL
672 if (mutate_allow_crimson != mutate_allow_crimson_t::NONE) {
673 target_v = std::max((uint8_t)12, target_v);
674 }
7c673cae 675 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
676 if (target_v < 7) {
677 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
678 } else {
679 encode(new_hb_back_up, bl, features);
680 }
681 encode(new_up_thru, bl);
682 encode(new_last_clean_interval, bl);
683 encode(new_lost, bl);
f67539c2
TL
684 encode(new_blocklist, bl, features);
685 encode(old_blocklist, bl, features);
11fdf7f2
TL
686 if (target_v < 7) {
687 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
688 } else {
689 encode(new_up_cluster, bl, features);
690 }
691 encode(cluster_snapshot, bl);
692 encode(new_uuid, bl);
9f95a23c 693 encode(new_xinfo, bl, features);
11fdf7f2
TL
694 if (target_v < 7) {
695 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
696 } else {
697 encode(new_hb_front_up, bl, features);
698 }
699 encode(features, bl); // NOTE: features arg, not the member
7c673cae 700 if (target_v >= 3) {
11fdf7f2
TL
701 encode(new_nearfull_ratio, bl);
702 encode(new_full_ratio, bl);
703 encode(new_backfillfull_ratio, bl);
31f18b77
FG
704 }
705 // 5 was string-based new_require_min_compat_client
706 if (target_v >= 6) {
11fdf7f2
TL
707 encode(new_require_min_compat_client, bl);
708 encode(new_require_osd_release, bl);
7c673cae 709 }
81eedcae
TL
710 if (target_v >= 8) {
711 encode(new_crush_node_flags, bl);
712 }
713 if (target_v >= 9) {
714 encode(new_device_class_flags, bl);
715 }
f67539c2
TL
716 if (target_v >= 10) {
717 encode(change_stretch_mode, bl);
718 encode(new_stretch_bucket_count, bl);
719 encode(new_degraded_stretch_mode, bl);
720 encode(new_recovering_stretch_mode, bl);
721 encode(new_stretch_mode_bucket, bl);
722 encode(stretch_mode_enabled, bl);
723 }
33c7a0ef
TL
724 if (target_v >= 11) {
725 encode(new_range_blocklist, bl, features);
726 encode(old_range_blocklist, bl, features);
727 }
1e59de90
TL
728 if (target_v >= 12) {
729 encode(mutate_allow_crimson, bl);
730 }
7c673cae
FG
731 ENCODE_FINISH(bl); // osd-only data
732 }
733
11fdf7f2
TL
734 crc_offset = bl.length();
735 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
736 tail_offset = bl.length();
737
11fdf7f2 738 encode(full_crc, bl);
7c673cae
FG
739
740 ENCODE_FINISH(bl); // meta-encoding wrapper
741
742 // fill in crc
9f95a23c 743 ceph::buffer::list front;
11fdf7f2 744 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae 745 inc_crc = front.crc32c(-1);
9f95a23c 746 ceph::buffer::list tail;
7c673cae
FG
747 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
748 inc_crc = tail.crc32c(inc_crc);
749 ceph_le32 crc_le;
750 crc_le = inc_crc;
11fdf7f2 751 crc_filler->copy_in(4u, (char*)&crc_le);
7c673cae
FG
752 have_crc = true;
753}
754
9f95a23c 755void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
7c673cae 756{
11fdf7f2 757 using ceph::decode;
7c673cae
FG
758 __u32 n, t;
759 // base
760 __u16 v;
11fdf7f2
TL
761 decode(v, p);
762 decode(fsid, p);
763 decode(epoch, p);
764 decode(modified, p);
7c673cae 765 if (v == 4 || v == 5) {
11fdf7f2 766 decode(n, p);
7c673cae
FG
767 new_pool_max = n;
768 } else if (v >= 6)
11fdf7f2
TL
769 decode(new_pool_max, p);
770 decode(new_flags, p);
771 decode(fullmap, p);
772 decode(crush, p);
7c673cae 773
11fdf7f2 774 decode(new_max_osd, p);
7c673cae
FG
775 if (v < 6) {
776 new_pools.clear();
11fdf7f2 777 decode(n, p);
7c673cae 778 while (n--) {
11fdf7f2
TL
779 decode(t, p);
780 decode(new_pools[t], p);
7c673cae
FG
781 }
782 } else {
11fdf7f2 783 decode(new_pools, p);
7c673cae
FG
784 }
785 if (v == 5) {
786 new_pool_names.clear();
11fdf7f2 787 decode(n, p);
7c673cae 788 while (n--) {
11fdf7f2
TL
789 decode(t, p);
790 decode(new_pool_names[t], p);
7c673cae
FG
791 }
792 } else if (v >= 6) {
11fdf7f2 793 decode(new_pool_names, p);
7c673cae
FG
794 }
795 if (v < 6) {
796 old_pools.clear();
11fdf7f2 797 decode(n, p);
7c673cae 798 while (n--) {
11fdf7f2 799 decode(t, p);
7c673cae
FG
800 old_pools.insert(t);
801 }
802 } else {
11fdf7f2 803 decode(old_pools, p);
7c673cae 804 }
11fdf7f2 805 decode(new_up_client, p);
31f18b77
FG
806 {
807 map<int32_t,uint8_t> ns;
11fdf7f2 808 decode(ns, p);
31f18b77
FG
809 for (auto q : ns) {
810 new_state[q.first] = q.second;
811 }
812 }
11fdf7f2 813 decode(new_weight, p);
7c673cae
FG
814
815 if (v < 6) {
816 new_pg_temp.clear();
11fdf7f2 817 decode(n, p);
7c673cae
FG
818 while (n--) {
819 old_pg_t opg;
9f95a23c 820 ceph::decode_raw(opg, p);
11fdf7f2 821 decode(new_pg_temp[pg_t(opg)], p);
7c673cae
FG
822 }
823 } else {
11fdf7f2 824 decode(new_pg_temp, p);
7c673cae
FG
825 }
826
827 // decode short map, too.
828 if (v == 5 && p.end())
829 return;
830
831 // extended
832 __u16 ev = 0;
833 if (v >= 5)
11fdf7f2
TL
834 decode(ev, p);
835 decode(new_hb_back_up, p);
7c673cae 836 if (v < 5)
11fdf7f2
TL
837 decode(new_pool_names, p);
838 decode(new_up_thru, p);
839 decode(new_last_clean_interval, p);
840 decode(new_lost, p);
f67539c2
TL
841 decode(new_blocklist, p);
842 decode(old_blocklist, p);
7c673cae 843 if (ev >= 6)
11fdf7f2 844 decode(new_up_cluster, p);
7c673cae 845 if (ev >= 7)
11fdf7f2 846 decode(cluster_snapshot, p);
7c673cae 847 if (ev >= 8)
11fdf7f2 848 decode(new_uuid, p);
7c673cae 849 if (ev >= 9)
11fdf7f2 850 decode(new_xinfo, p);
7c673cae 851 if (ev >= 10)
11fdf7f2 852 decode(new_hb_front_up, p);
7c673cae
FG
853}
854
11fdf7f2
TL
855/* for a description of osdmap incremental versions, and when they were
856 * introduced, please refer to
857 * doc/dev/osd_internals/osdmap_versions.txt
858 */
9f95a23c 859void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 860{
11fdf7f2 861 using ceph::decode;
7c673cae
FG
862 /**
863 * Older encodings of the Incremental had a single struct_v which
864 * covered the whole encoding, and was prior to our modern
865 * stuff which includes a compatv and a size. So if we see
866 * a struct_v < 7, we must rewind to the beginning and use our
867 * classic decoder.
868 */
869 size_t start_offset = bl.get_off();
870 size_t tail_offset = 0;
9f95a23c 871 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
872
873 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
874 if (struct_v < 7) {
11fdf7f2 875 bl.seek(start_offset);
7c673cae
FG
876 decode_classic(bl);
877 encode_features = 0;
878 if (struct_v >= 6)
879 encode_features = CEPH_FEATURE_PGID64;
880 else
881 encode_features = 0;
882 return;
883 }
884 {
11fdf7f2
TL
885 DECODE_START(8, bl); // client-usable data
886 decode(fsid, bl);
887 decode(epoch, bl);
888 decode(modified, bl);
889 decode(new_pool_max, bl);
890 decode(new_flags, bl);
891 decode(fullmap, bl);
892 decode(crush, bl);
893
894 decode(new_max_osd, bl);
895 decode(new_pools, bl);
896 decode(new_pool_names, bl);
897 decode(old_pools, bl);
898 decode(new_up_client, bl);
31f18b77 899 if (struct_v >= 5) {
11fdf7f2 900 decode(new_state, bl);
31f18b77
FG
901 } else {
902 map<int32_t,uint8_t> ns;
11fdf7f2 903 decode(ns, bl);
31f18b77
FG
904 for (auto q : ns) {
905 new_state[q.first] = q.second;
906 }
907 }
11fdf7f2
TL
908 decode(new_weight, bl);
909 decode(new_pg_temp, bl);
910 decode(new_primary_temp, bl);
7c673cae 911 if (struct_v >= 2)
11fdf7f2 912 decode(new_primary_affinity, bl);
7c673cae
FG
913 else
914 new_primary_affinity.clear();
915 if (struct_v >= 3) {
11fdf7f2
TL
916 decode(new_erasure_code_profiles, bl);
917 decode(old_erasure_code_profiles, bl);
7c673cae
FG
918 } else {
919 new_erasure_code_profiles.clear();
920 old_erasure_code_profiles.clear();
921 }
922 if (struct_v >= 4) {
11fdf7f2
TL
923 decode(new_pg_upmap, bl);
924 decode(old_pg_upmap, bl);
925 decode(new_pg_upmap_items, bl);
926 decode(old_pg_upmap_items, bl);
927 }
928 if (struct_v >= 6) {
929 decode(new_removed_snaps, bl);
930 decode(new_purged_snaps, bl);
931 }
932 if (struct_v >= 8) {
933 decode(new_last_up_change, bl);
934 decode(new_last_in_change, bl);
7c673cae
FG
935 }
936 DECODE_FINISH(bl); // client-usable data
937 }
938
939 {
f67539c2 940 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
941 decode(new_hb_back_up, bl);
942 decode(new_up_thru, bl);
943 decode(new_last_clean_interval, bl);
944 decode(new_lost, bl);
f67539c2
TL
945 decode(new_blocklist, bl);
946 decode(old_blocklist, bl);
11fdf7f2
TL
947 decode(new_up_cluster, bl);
948 decode(cluster_snapshot, bl);
949 decode(new_uuid, bl);
950 decode(new_xinfo, bl);
951 decode(new_hb_front_up, bl);
7c673cae 952 if (struct_v >= 2)
11fdf7f2 953 decode(encode_features, bl);
7c673cae
FG
954 else
955 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
956 if (struct_v >= 3) {
11fdf7f2
TL
957 decode(new_nearfull_ratio, bl);
958 decode(new_full_ratio, bl);
7c673cae
FG
959 } else {
960 new_nearfull_ratio = -1;
961 new_full_ratio = -1;
962 }
963 if (struct_v >= 4) {
11fdf7f2 964 decode(new_backfillfull_ratio, bl);
7c673cae
FG
965 } else {
966 new_backfillfull_ratio = -1;
967 }
31f18b77
FG
968 if (struct_v == 5) {
969 string r;
11fdf7f2 970 decode(r, bl);
31f18b77 971 if (r.length()) {
9f95a23c 972 new_require_min_compat_client = ceph_release_from_name(r);
31f18b77
FG
973 }
974 }
975 if (struct_v >= 6) {
11fdf7f2
TL
976 decode(new_require_min_compat_client, bl);
977 decode(new_require_osd_release, bl);
31f18b77
FG
978 } else {
979 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
980 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 981 new_require_osd_release = ceph_release_t::luminous;
31f18b77
FG
982 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
983 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
9f95a23c 984 new_require_osd_release = ceph_release_t::kraken;
31f18b77 985 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
9f95a23c 986 new_require_osd_release = ceph_release_t::jewel;
31f18b77 987 } else {
9f95a23c 988 new_require_osd_release = ceph_release_t::unknown;
31f18b77
FG
989 }
990 }
81eedcae
TL
991 if (struct_v >= 8) {
992 decode(new_crush_node_flags, bl);
993 }
994 if (struct_v >= 9) {
995 decode(new_device_class_flags, bl);
996 }
f67539c2
TL
997 if (struct_v >= 10) {
998 decode(change_stretch_mode, bl);
999 decode(new_stretch_bucket_count, bl);
1000 decode(new_degraded_stretch_mode, bl);
1001 decode(new_recovering_stretch_mode, bl);
1002 decode(new_stretch_mode_bucket, bl);
1003 decode(stretch_mode_enabled, bl);
1004 }
33c7a0ef
TL
1005 if (struct_v >= 11) {
1006 decode(new_range_blocklist, bl);
1007 decode(old_range_blocklist, bl);
1008 }
1e59de90
TL
1009 if (struct_v >= 12) {
1010 decode(mutate_allow_crimson, bl);
1011 }
7c673cae
FG
1012 DECODE_FINISH(bl); // osd-only data
1013 }
1014
1015 if (struct_v >= 8) {
1016 have_crc = true;
1017 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 1018 decode(inc_crc, bl);
7c673cae 1019 tail_offset = bl.get_off();
11fdf7f2 1020 decode(full_crc, bl);
7c673cae
FG
1021 } else {
1022 have_crc = false;
1023 full_crc = 0;
1024 inc_crc = 0;
1025 }
1026
1027 DECODE_FINISH(bl); // wrapper
1028
1029 if (have_crc) {
1030 // verify crc
1031 uint32_t actual = crc_front.crc32c(-1);
1032 if (tail_offset < bl.get_off()) {
9f95a23c 1033 ceph::buffer::list tail;
7c673cae
FG
1034 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
1035 actual = tail.crc32c(actual);
1036 }
1037 if (inc_crc != actual) {
1038 ostringstream ss;
1039 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
1040 string s = ss.str();
9f95a23c 1041 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
1042 }
1043 }
1044}
1045
1046void OSDMap::Incremental::dump(Formatter *f) const
1047{
1048 f->dump_int("epoch", epoch);
1049 f->dump_stream("fsid") << fsid;
1050 f->dump_stream("modified") << modified;
11fdf7f2
TL
1051 f->dump_stream("new_last_up_change") << new_last_up_change;
1052 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
1053 f->dump_int("new_pool_max", new_pool_max);
1054 f->dump_int("new_flags", new_flags);
1055 f->dump_float("new_full_ratio", new_full_ratio);
1056 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1057 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
f67539c2
TL
1058 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1059 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
1e59de90 1060 f->dump_unsigned("mutate_allow_crimson", static_cast<unsigned>(mutate_allow_crimson));
7c673cae
FG
1061
1062 if (fullmap.length()) {
1063 f->open_object_section("full_map");
1064 OSDMap full;
9f95a23c 1065 ceph::buffer::list fbl = fullmap; // kludge around constness.
11fdf7f2 1066 auto p = fbl.cbegin();
7c673cae
FG
1067 full.decode(p);
1068 full.dump(f);
1069 f->close_section();
1070 }
1071 if (crush.length()) {
1072 f->open_object_section("crush");
1073 CrushWrapper c;
9f95a23c 1074 ceph::buffer::list tbl = crush; // kludge around constness.
11fdf7f2 1075 auto p = tbl.cbegin();
7c673cae
FG
1076 c.decode(p);
1077 c.dump(f);
1078 f->close_section();
1079 }
1080
1081 f->dump_int("new_max_osd", new_max_osd);
1082
1083 f->open_array_section("new_pools");
1084
1085 for (const auto &new_pool : new_pools) {
1086 f->open_object_section("pool");
1087 f->dump_int("pool", new_pool.first);
1088 new_pool.second.dump(f);
1089 f->close_section();
1090 }
1091 f->close_section();
1092 f->open_array_section("new_pool_names");
1093
1094 for (const auto &new_pool_name : new_pool_names) {
1095 f->open_object_section("pool_name");
1096 f->dump_int("pool", new_pool_name.first);
1097 f->dump_string("name", new_pool_name.second);
1098 f->close_section();
1099 }
1100 f->close_section();
1101 f->open_array_section("old_pools");
1102
1103 for (const auto &old_pool : old_pools)
1104 f->dump_int("pool", old_pool);
1105 f->close_section();
1106
1107 f->open_array_section("new_up_osds");
1108
1109 for (const auto &upclient : new_up_client) {
1110 f->open_object_section("osd");
1111 f->dump_int("osd", upclient.first);
11fdf7f2
TL
1112 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1113 f->dump_object("public_addrs", upclient.second);
1114 if (auto p = new_up_cluster.find(upclient.first);
1115 p != new_up_cluster.end()) {
1116 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1117 f->dump_object("cluster_addrs", p->second);
1118 }
1119 if (auto p = new_hb_back_up.find(upclient.first);
1120 p != new_hb_back_up.end()) {
1121 f->dump_object("heartbeat_back_addrs", p->second);
1122 }
1123 if (auto p = new_hb_front_up.find(upclient.first);
1124 p != new_hb_front_up.end()) {
1125 f->dump_object("heartbeat_front_addrs", p->second);
1126 }
7c673cae
FG
1127 f->close_section();
1128 }
1129 f->close_section();
1130
1131 f->open_array_section("new_weight");
1132
1133 for (const auto &weight : new_weight) {
1134 f->open_object_section("osd");
1135 f->dump_int("osd", weight.first);
1136 f->dump_int("weight", weight.second);
1137 f->close_section();
1138 }
1139 f->close_section();
1140
1141 f->open_array_section("osd_state_xor");
1142 for (const auto &ns : new_state) {
1143 f->open_object_section("osd");
1144 f->dump_int("osd", ns.first);
1145 set<string> st;
1146 calc_state_set(new_state.find(ns.first)->second, st);
1147 f->open_array_section("state_xor");
1148 for (auto &state : st)
1149 f->dump_string("state", state);
1150 f->close_section();
c07f9fc5 1151 f->close_section();
7c673cae
FG
1152 }
1153 f->close_section();
1154
1155 f->open_array_section("new_pg_temp");
1156
1157 for (const auto &pg_temp : new_pg_temp) {
1158 f->open_object_section("pg");
1159 f->dump_stream("pgid") << pg_temp.first;
1160 f->open_array_section("osds");
1161
1162 for (const auto &osd : pg_temp.second)
1163 f->dump_int("osd", osd);
1164 f->close_section();
1165 f->close_section();
1166 }
1167 f->close_section();
1168
1169 f->open_array_section("primary_temp");
1170
1171 for (const auto &primary_temp : new_primary_temp) {
1172 f->dump_stream("pgid") << primary_temp.first;
1173 f->dump_int("osd", primary_temp.second);
1174 }
1175 f->close_section(); // primary_temp
1176
1177 f->open_array_section("new_pg_upmap");
1178 for (auto& i : new_pg_upmap) {
1179 f->open_object_section("mapping");
1180 f->dump_stream("pgid") << i.first;
1181 f->open_array_section("osds");
1182 for (auto osd : i.second) {
1183 f->dump_int("osd", osd);
1184 }
1185 f->close_section();
1186 f->close_section();
1187 }
1188 f->close_section();
1189 f->open_array_section("old_pg_upmap");
1190 for (auto& i : old_pg_upmap) {
1191 f->dump_stream("pgid") << i;
1192 }
1193 f->close_section();
1194
1195 f->open_array_section("new_pg_upmap_items");
1196 for (auto& i : new_pg_upmap_items) {
1197 f->open_object_section("mapping");
1198 f->dump_stream("pgid") << i.first;
1199 f->open_array_section("mappings");
1200 for (auto& p : i.second) {
1201 f->open_object_section("mapping");
1202 f->dump_int("from", p.first);
1203 f->dump_int("to", p.second);
1204 f->close_section();
1205 }
1206 f->close_section();
1207 f->close_section();
1208 }
1209 f->close_section();
1210 f->open_array_section("old_pg_upmap_items");
1211 for (auto& i : old_pg_upmap_items) {
1212 f->dump_stream("pgid") << i;
1213 }
1214 f->close_section();
1215
1e59de90
TL
1216 // dump upmap_primaries
1217 f->open_array_section("new_pg_upmap_primaries");
1218 for (auto& [pg, osd] : new_pg_upmap_primary) {
1219 f->open_object_section("primary_mapping");
1220 f->dump_stream("pgid") << pg;
1221 f->dump_int("primary_osd", osd);
1222 f->close_section();
1223 }
1224 f->close_section(); // new_pg_upmap_primaries
1225
1226 // dump old_pg_upmap_primaries (removed primary mappings)
1227 f->open_array_section("old_pg_upmap_primaries");
1228 for (auto& pg : old_pg_upmap_primary) {
1229 f->dump_stream("pgid") << pg;
1230 }
1231 f->close_section(); // old_pg_upmap_primaries
1232
7c673cae
FG
1233 f->open_array_section("new_up_thru");
1234
1235 for (const auto &up_thru : new_up_thru) {
1236 f->open_object_section("osd");
1237 f->dump_int("osd", up_thru.first);
1238 f->dump_int("up_thru", up_thru.second);
1239 f->close_section();
1240 }
1241 f->close_section();
1242
1243 f->open_array_section("new_lost");
1244
1245 for (const auto &lost : new_lost) {
1246 f->open_object_section("osd");
1247 f->dump_int("osd", lost.first);
1248 f->dump_int("epoch_lost", lost.second);
1249 f->close_section();
1250 }
1251 f->close_section();
1252
1253 f->open_array_section("new_last_clean_interval");
1254
1255 for (const auto &last_clean_interval : new_last_clean_interval) {
1256 f->open_object_section("osd");
1257 f->dump_int("osd", last_clean_interval.first);
1258 f->dump_int("first", last_clean_interval.second.first);
1259 f->dump_int("last", last_clean_interval.second.second);
1260 f->close_section();
1261 }
1262 f->close_section();
1263
f67539c2
TL
1264 f->open_array_section("new_blocklist");
1265 for (const auto &blist : new_blocklist) {
7c673cae
FG
1266 stringstream ss;
1267 ss << blist.first;
1268 f->dump_stream(ss.str().c_str()) << blist.second;
1269 }
1270 f->close_section();
f67539c2
TL
1271 f->open_array_section("old_blocklist");
1272 for (const auto &blist : old_blocklist)
7c673cae
FG
1273 f->dump_stream("addr") << blist;
1274 f->close_section();
33c7a0ef
TL
1275 f->open_array_section("new_range_blocklist");
1276 for (const auto &blist : new_range_blocklist) {
1277 stringstream ss;
1278 ss << blist.first;
1279 f->dump_stream(ss.str().c_str()) << blist.second;
1280 }
1281 f->close_section();
1282 f->open_array_section("old_range_blocklist");
1283 for (const auto &blist : old_range_blocklist)
1284 f->dump_stream("addr") << blist;
1285 f->close_section();
7c673cae
FG
1286
1287 f->open_array_section("new_xinfo");
1288 for (const auto &xinfo : new_xinfo) {
1289 f->open_object_section("xinfo");
1290 f->dump_int("osd", xinfo.first);
1291 xinfo.second.dump(f);
1292 f->close_section();
1293 }
1294 f->close_section();
1295
1296 if (cluster_snapshot.size())
1297 f->dump_string("cluster_snapshot", cluster_snapshot);
1298
1299 f->open_array_section("new_uuid");
1300 for (const auto &uuid : new_uuid) {
1301 f->open_object_section("osd");
1302 f->dump_int("osd", uuid.first);
1303 f->dump_stream("uuid") << uuid.second;
1304 f->close_section();
1305 }
1306 f->close_section();
1307
1308 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1309 f->open_array_section("old_erasure_code_profiles");
1310 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
9f95a23c 1311 f->dump_string("old", erasure_code_profile);
7c673cae
FG
1312 }
1313 f->close_section();
11fdf7f2
TL
1314
1315 f->open_array_section("new_removed_snaps");
1316 for (auto& p : new_removed_snaps) {
1317 f->open_object_section("pool");
1318 f->dump_int("pool", p.first);
1319 f->open_array_section("snaps");
1320 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1321 f->open_object_section("interval");
1322 f->dump_unsigned("begin", q.get_start());
1323 f->dump_unsigned("length", q.get_len());
1324 f->close_section();
1325 }
1326 f->close_section();
1327 f->close_section();
1328 }
1329 f->close_section();
1330 f->open_array_section("new_purged_snaps");
1331 for (auto& p : new_purged_snaps) {
1332 f->open_object_section("pool");
1333 f->dump_int("pool", p.first);
1334 f->open_array_section("snaps");
1335 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1336 f->open_object_section("interval");
1337 f->dump_unsigned("begin", q.get_start());
1338 f->dump_unsigned("length", q.get_len());
1339 f->close_section();
1340 }
1341 f->close_section();
1342 f->close_section();
1343 }
81eedcae
TL
1344 f->open_array_section("new_crush_node_flags");
1345 for (auto& i : new_crush_node_flags) {
1346 f->open_object_section("node");
1347 f->dump_int("id", i.first);
1348 set<string> st;
1349 calc_state_set(i.second, st);
1350 for (auto& j : st) {
1351 f->dump_string("flag", j);
1352 }
1353 f->close_section();
1354 }
1355 f->close_section();
1356 f->open_array_section("new_device_class_flags");
1357 for (auto& i : new_device_class_flags) {
1358 f->open_object_section("device_class");
1359 f->dump_int("id", i.first);
1360 set<string> st;
1361 calc_state_set(i.second, st);
1362 for (auto& j : st) {
1363 f->dump_string("flag", j);
1364 }
1365 f->close_section();
1366 }
1367 f->close_section();
f67539c2
TL
1368 f->open_object_section("stretch_mode");
1369 {
1370 f->dump_bool("change_stretch_mode", change_stretch_mode);
1371 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1372 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1373 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1374 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1375 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1376 }
1377 f->close_section();
11fdf7f2 1378 f->close_section();
7c673cae
FG
1379}
1380
1381void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1382{
1383 o.push_back(new Incremental);
1384}
1385
1386// ----------------------------------
1387// OSDMap
1388
1389void OSDMap::set_epoch(epoch_t e)
1390{
1391 epoch = e;
1392 for (auto &pool : pools)
1393 pool.second.last_change = e;
1394}
1395
33c7a0ef
TL
1396OSDMap::range_bits::range_bits() : ipv6(false) {
1397 memset(&bits, 0, sizeof(bits));
1398}
1399
1400OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) {
1401 memset(&bits, 0, sizeof(bits));
1402 parse(addr);
1403}
1404
1405void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr,
1406 uint64_t *upper, uint64_t *lower)
1407{
1408 *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 |
1409 ((uint64_t)(ntohl(*(uint32_t*)(&addr[4]))));
1410 *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 |
1411 ((uint64_t)(ntohl(*(uint32_t*)(&addr[12]))));
1412}
1413
1414void OSDMap::range_bits::parse(const entity_addr_t& addr) {
1415 // parse it into meaningful data
1416 if (addr.is_ipv6()) {
1417 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr,
1418 &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits);
1419 int32_t lower_shift = std::min(128-
1420 static_cast<int32_t>(addr.get_nonce()), 64);
1421 int32_t upper_shift = std::max(64- //(128-b.first.get_nonce())-64
1422 static_cast<int32_t>(addr.get_nonce()), 0);
1423
1424 auto get_mask = [](int32_t shift) -> uint64_t {
1425 if (shift >= 0 && shift < 64) {
1426 return UINT64_MAX << shift;
1427 }
1428 return 0;
1429 };
1430
1431 bits.ipv6.lower_mask = get_mask(lower_shift);
1432 bits.ipv6.upper_mask = get_mask(upper_shift);
1433 ipv6 = true;
1434 } else if (addr.is_ipv4()) {
1435 bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr);
1436 if (addr.get_nonce() > 0) {
1437 bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce());
1438 } else {
1439 bits.ipv4.mask = 0;
1440 }
1441 } else {
1442 // uh...
1443 }
1444}
1445
1446bool OSDMap::range_bits::matches(const entity_addr_t& addr) const {
1447 if (addr.is_ipv4() && !ipv6) {
1448 return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) ==
1449 (bits.ipv4.ip_32_bits & bits.ipv4.mask));
1450 } else if (addr.is_ipv6() && ipv6) {
1451 uint64_t upper_64, lower_64;
1452 get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64);
1453 return (((upper_64 & bits.ipv6.upper_mask) ==
1454 (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) &&
1455 ((lower_64 & bits.ipv6.lower_mask) ==
1456 (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask)));
1457 }
1458 return false;
1459}
1460
1461bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const
7c673cae 1462{
33c7a0ef
TL
1463 if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl;
1464 if (blocklist.empty() && range_blocklist.empty()) {
1465 if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl;
7c673cae 1466 return false;
11fdf7f2
TL
1467 }
1468
f67539c2 1469 // all blocklist entries are type ANY for nautilus+
11fdf7f2
TL
1470 // FIXME: avoid this copy!
1471 entity_addr_t a = orig;
9f95a23c 1472 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1473 a.set_type(entity_addr_t::TYPE_LEGACY);
1474 } else {
1475 a.set_type(entity_addr_t::TYPE_ANY);
1476 }
7c673cae
FG
1477
1478 // this specific instance?
f67539c2 1479 if (blocklist.count(a)) {
33c7a0ef 1480 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
7c673cae 1481 return true;
11fdf7f2 1482 }
7c673cae 1483
f67539c2 1484 // is entire ip blocklisted?
7c673cae 1485 if (a.is_ip()) {
11fdf7f2
TL
1486 a.set_port(0);
1487 a.set_nonce(0);
f67539c2 1488 if (blocklist.count(a)) {
33c7a0ef 1489 if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
11fdf7f2
TL
1490 return true;
1491 }
1492 }
1493
33c7a0ef
TL
1494 // is it in a blocklisted range?
1495 for (const auto& i : calculated_ranges) {
1496 bool blocked = i.second.matches(a);
1497 if (blocked) {
1498 if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl;
1499 return true;
1500 }
1501 }
1502
1503 if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl;
11fdf7f2
TL
1504 return false;
1505}
1506
33c7a0ef 1507bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const
11fdf7f2 1508{
33c7a0ef 1509 if (blocklist.empty() && range_blocklist.empty())
11fdf7f2
TL
1510 return false;
1511
1512 for (auto& a : av.v) {
33c7a0ef 1513 if (is_blocklisted(a, cct)) {
7c673cae
FG
1514 return true;
1515 }
1516 }
1517
1518 return false;
1519}
1520
33c7a0ef
TL
1521void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl,
1522 std::list<std::pair<entity_addr_t,utime_t> > *rl) const
7c673cae 1523{
f67539c2 1524 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
33c7a0ef
TL
1525 std::copy(range_blocklist.begin(), range_blocklist.end(),
1526 std::back_inserter(*rl));
7c673cae
FG
1527}
1528
33c7a0ef
TL
1529void OSDMap::get_blocklist(std::set<entity_addr_t> *bl,
1530 std::set<entity_addr_t> *rl) const
31f18b77 1531{
f67539c2 1532 for (const auto &i : blocklist) {
31f18b77
FG
1533 bl->insert(i.first);
1534 }
33c7a0ef
TL
1535 for (const auto &i : range_blocklist) {
1536 rl->insert(i.first);
1537 }
31f18b77
FG
1538}
1539
7c673cae
FG
1540void OSDMap::set_max_osd(int m)
1541{
7c673cae 1542 max_osd = m;
f67539c2
TL
1543 osd_state.resize(max_osd, 0);
1544 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1545 osd_info.resize(max_osd);
1546 osd_xinfo.resize(max_osd);
1547 osd_addrs->client_addrs.resize(max_osd);
1548 osd_addrs->cluster_addrs.resize(max_osd);
1549 osd_addrs->hb_back_addrs.resize(max_osd);
1550 osd_addrs->hb_front_addrs.resize(max_osd);
1551 osd_uuid->resize(max_osd);
7c673cae 1552 if (osd_primary_affinity)
f67539c2 1553 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
7c673cae
FG
1554
1555 calc_num_osds();
1556}
1557
1558int OSDMap::calc_num_osds()
1559{
1560 num_osd = 0;
1561 num_up_osd = 0;
1562 num_in_osd = 0;
1563 for (int i=0; i<max_osd; i++) {
1564 if (osd_state[i] & CEPH_OSD_EXISTS) {
1565 ++num_osd;
1566 if (osd_state[i] & CEPH_OSD_UP) {
1567 ++num_up_osd;
1568 }
1569 if (get_weight(i) != CEPH_OSD_OUT) {
1570 ++num_in_osd;
1571 }
1572 }
1573 }
1574 return num_osd;
1575}
1576
3efd9988
FG
1577void OSDMap::get_full_pools(CephContext *cct,
1578 set<int64_t> *full,
1579 set<int64_t> *backfillfull,
1580 set<int64_t> *nearfull) const
7c673cae 1581{
11fdf7f2
TL
1582 ceph_assert(full);
1583 ceph_assert(backfillfull);
1584 ceph_assert(nearfull);
3efd9988
FG
1585 full->clear();
1586 backfillfull->clear();
1587 nearfull->clear();
1588
1589 vector<int> full_osds;
1590 vector<int> backfillfull_osds;
1591 vector<int> nearfull_osds;
7c673cae
FG
1592 for (int i = 0; i < max_osd; ++i) {
1593 if (exists(i) && is_up(i) && is_in(i)) {
1594 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1595 full_osds.push_back(i);
7c673cae 1596 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1597 backfillfull_osds.push_back(i);
7c673cae 1598 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1599 nearfull_osds.push_back(i);
7c673cae
FG
1600 }
1601 }
3efd9988
FG
1602
1603 for (auto i: full_osds) {
1604 get_pool_ids_by_osd(cct, i, full);
1605 }
1606 for (auto i: backfillfull_osds) {
1607 get_pool_ids_by_osd(cct, i, backfillfull);
1608 }
1609 for (auto i: nearfull_osds) {
1610 get_pool_ids_by_osd(cct, i, nearfull);
1611 }
7c673cae
FG
1612}
1613
31f18b77
FG
1614void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1615 set<int> *nearfull) const
1616{
1617 full->clear();
1618 backfill->clear();
1619 nearfull->clear();
1620 for (int i = 0; i < max_osd; ++i) {
1621 if (exists(i) && is_up(i) && is_in(i)) {
1622 if (osd_state[i] & CEPH_OSD_FULL)
1623 full->emplace(i);
1624 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1625 backfill->emplace(i);
1626 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1627 nearfull->emplace(i);
1628 }
1629 }
1630}
1631
7c673cae
FG
1632void OSDMap::get_all_osds(set<int32_t>& ls) const
1633{
1634 for (int i=0; i<max_osd; i++)
1635 if (exists(i))
1636 ls.insert(i);
1637}
1638
1639void OSDMap::get_up_osds(set<int32_t>& ls) const
1640{
1641 for (int i = 0; i < max_osd; i++) {
1642 if (is_up(i))
1643 ls.insert(i);
1644 }
1645}
1646
81eedcae 1647void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1648{
1649 for (int i = 0; i < max_osd; i++) {
81eedcae 1650 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1651 ls.insert(i);
1652 }
1653}
1654
11fdf7f2
TL
1655void OSDMap::get_flag_set(set<string> *flagset) const
1656{
1657 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1658 if (flags & (1<<i)) {
1659 flagset->insert(get_flag_string(flags & (1<<i)));
1660 }
1661 }
1662}
1663
7c673cae
FG
1664void OSDMap::calc_state_set(int state, set<string>& st)
1665{
1666 unsigned t = state;
1667 for (unsigned s = 1; t; s <<= 1) {
1668 if (t & s) {
1669 t &= ~s;
1670 st.insert(ceph_osd_state_name(s));
1671 }
1672 }
1673}
1674
1675void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1676{
1677 float max = 0;
1678 for (const auto &weight : weights) {
1679 if (weight.second > max)
1680 max = weight.second;
1681 }
1682
1683 for (const auto &weight : weights) {
1684 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1685 }
1686}
1687
1688int OSDMap::identify_osd(const entity_addr_t& addr) const
1689{
1690 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1691 if (exists(i) && (get_addrs(i).contains(addr) ||
1692 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1693 return i;
1694 return -1;
1695}
1696
1697int OSDMap::identify_osd(const uuid_d& u) const
1698{
1699 for (int i=0; i<max_osd; i++)
1700 if (exists(i) && get_uuid(i) == u)
1701 return i;
1702 return -1;
1703}
1704
1705int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1706{
1707 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1708 if (exists(i) && (get_addrs(i).contains(addr) ||
1709 get_cluster_addrs(i).contains(addr) ||
1710 get_hb_back_addrs(i).contains(addr) ||
1711 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1712 return i;
1713 return -1;
1714}
1715
1716int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1717{
1718 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1719 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1720 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1721 return i;
1722 return -1;
1723}
1724
1725
1726uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1727{
1728 uint64_t features = 0; // things we actually have
1729 uint64_t mask = 0; // things we could have
1730
1731 if (crush->has_nondefault_tunables())
1732 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1733 if (crush->has_nondefault_tunables2())
1734 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1735 if (crush->has_nondefault_tunables3())
1736 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1737 if (crush->has_v4_buckets())
1738 features |= CEPH_FEATURE_CRUSH_V4;
1739 if (crush->has_nondefault_tunables5())
1740 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1741 if (crush->has_incompat_choose_args()) {
1742 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1743 }
7c673cae
FG
1744 mask |= CEPH_FEATURES_CRUSH;
1745
1e59de90 1746 if (!pg_upmap.empty() || !pg_upmap_items.empty() || !pg_upmap_primaries.empty())
7c673cae
FG
1747 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1748 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1749
1750 for (auto &pool: pools) {
1751 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1752 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1753 }
7c673cae
FG
1754 if (!pool.second.tiers.empty() ||
1755 pool.second.is_tier()) {
1756 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1757 }
20effc67 1758 int ruleid = pool.second.get_crush_rule();
7c673cae
FG
1759 if (ruleid >= 0) {
1760 if (crush->is_v2_rule(ruleid))
1761 features |= CEPH_FEATURE_CRUSH_V2;
1762 if (crush->is_v3_rule(ruleid))
1763 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1764 if (crush->is_v5_rule(ruleid))
1765 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1766 }
1767 }
7c673cae 1768 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1769
1770 if (osd_primary_affinity) {
1771 for (int i = 0; i < max_osd; ++i) {
1772 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1773 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1774 break;
1775 }
1776 }
1777 }
1778 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1779
1780 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1781 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
9f95a23c 1782 if (require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
1783 features |= jewel_features;
1784 }
1785 mask |= jewel_features;
1786
1787 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1788 | CEPH_FEATURE_MSG_ADDR2;
9f95a23c 1789 if (require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
1790 features |= kraken_features;
1791 }
1792 mask |= kraken_features;
f67539c2
TL
1793
1794 if (stretch_mode_enabled) {
1795 features |= CEPH_FEATUREMASK_STRETCH_MODE;
1796 mask |= CEPH_FEATUREMASK_STRETCH_MODE;
1797 }
7c673cae
FG
1798 }
1799
9f95a23c 1800 if (require_min_compat_client >= ceph_release_t::nautilus) {
11fdf7f2
TL
1801 // if min_compat_client is >= nautilus, require v2 cephx signatures
1802 // from everyone
1803 features |= CEPH_FEATUREMASK_CEPHX_V2;
9f95a23c 1804 } else if (require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
1805 entity_type == CEPH_ENTITY_TYPE_OSD) {
1806 // if osds are >= nautilus, at least require the signatures from them
1807 features |= CEPH_FEATUREMASK_CEPHX_V2;
1808 }
1809 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1810
7c673cae
FG
1811 if (pmask)
1812 *pmask = mask;
1813 return features;
1814}
1815
9f95a23c 1816ceph_release_t OSDMap::get_min_compat_client() const
7c673cae
FG
1817{
1818 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1819
1820 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77 1821 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
9f95a23c 1822 return ceph_release_t::luminous; // v12.2.0
7c673cae
FG
1823 }
1824 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
9f95a23c 1825 return ceph_release_t::jewel; // v10.2.0
7c673cae
FG
1826 }
1827 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
9f95a23c 1828 return ceph_release_t::hammer; // v0.94.0
7c673cae
FG
1829 }
1830 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1831 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1832 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
9f95a23c 1833 return ceph_release_t::firefly; // v0.80.0
7c673cae
FG
1834 }
1835 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1836 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
9f95a23c 1837 return ceph_release_t::dumpling; // v0.67.0
7c673cae
FG
1838 }
1839 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
9f95a23c 1840 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae 1841 }
9f95a23c 1842 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae
FG
1843}
1844
9f95a23c 1845ceph_release_t OSDMap::get_require_min_compat_client() const
11fdf7f2
TL
1846{
1847 return require_min_compat_client;
1848}
1849
7c673cae
FG
1850void OSDMap::_calc_up_osd_features()
1851{
1852 bool first = true;
1853 cached_up_osd_features = 0;
1854 for (int osd = 0; osd < max_osd; ++osd) {
1855 if (!is_up(osd))
1856 continue;
1857 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1858 if (xi.features == 0)
1859 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1860 if (first) {
1861 cached_up_osd_features = xi.features;
1862 first = false;
1863 } else {
1864 cached_up_osd_features &= xi.features;
1865 }
1866 }
1867}
1868
1869uint64_t OSDMap::get_up_osd_features() const
1870{
1871 return cached_up_osd_features;
1872}
1873
1874void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1875{
11fdf7f2 1876 using ceph::encode;
7c673cae
FG
1877 if (o->epoch == n->epoch)
1878 return;
1879
1880 int diff = 0;
1881
1882 // do addrs match?
1883 if (o->max_osd != n->max_osd)
1884 diff++;
1885 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1886 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1887 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1888 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1889 else
1890 diff++;
11fdf7f2
TL
1891 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1892 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1893 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1894 else
1895 diff++;
11fdf7f2
TL
1896 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1897 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1898 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1899 else
1900 diff++;
11fdf7f2
TL
1901 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1902 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1903 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1904 else
1905 diff++;
1906 }
1907 if (diff == 0) {
1908 // zoinks, no differences at all!
1909 n->osd_addrs = o->osd_addrs;
1910 }
1911
1912 // does crush match?
9f95a23c 1913 ceph::buffer::list oc, nc;
11fdf7f2
TL
1914 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1915 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1916 if (oc.contents_equal(nc)) {
1917 n->crush = o->crush;
1918 }
1919
1920 // does pg_temp match?
31f18b77
FG
1921 if (*o->pg_temp == *n->pg_temp)
1922 n->pg_temp = o->pg_temp;
7c673cae
FG
1923
1924 // does primary_temp match?
1925 if (o->primary_temp->size() == n->primary_temp->size()) {
1926 if (*o->primary_temp == *n->primary_temp)
1927 n->primary_temp = o->primary_temp;
1928 }
1929
1930 // do uuids match?
1931 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1932 *o->osd_uuid == *n->osd_uuid)
1933 n->osd_uuid = o->osd_uuid;
1934}
1935
1936void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1937 const OSDMap& oldmap,
1938 const OSDMap& nextmap,
1939 Incremental *pending_inc)
7c673cae
FG
1940{
1941 ldout(cct, 10) << __func__ << dendl;
7c673cae 1942
11fdf7f2 1943 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1944 // if pool does not exist, remove any existing pg_temps associated with
1945 // it. we don't care about pg_temps on the pending_inc either; if there
1946 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1947 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1948 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1949 << " for nonexistent pool " << pg.first.pool() << dendl;
1950 pending_inc->new_pg_temp[pg.first].clear();
1951 continue;
1952 }
20effc67
TL
1953 if (!nextmap.pg_exists(pg.first)) {
1954 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1955 << " for nonexistent pg " << dendl;
1956 pending_inc->new_pg_temp[pg.first].clear();
1957 continue;
1958 }
7c673cae
FG
1959 // all osds down?
1960 unsigned num_up = 0;
1961 for (auto o : pg.second) {
11fdf7f2 1962 if (!nextmap.is_down(o)) {
7c673cae
FG
1963 ++num_up;
1964 break;
1965 }
1966 }
1967 if (num_up == 0) {
1968 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1969 << " with all down osds" << pg.second << dendl;
1970 pending_inc->new_pg_temp[pg.first].clear();
1971 continue;
1972 }
1973 // redundant pg_temp?
1974 vector<int> raw_up;
1975 int primary;
11fdf7f2 1976 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1977 bool remove = false;
11fdf7f2 1978 if (raw_up == pg.second) {
7c673cae
FG
1979 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1980 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1981 remove = true;
1982 }
1983 // oversized pg_temp?
11fdf7f2 1984 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1985 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1986 << pg.second << " exceeds pool size" << dendl;
1987 remove = true;
1988 }
1989 if (remove) {
11fdf7f2 1990 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1991 pending_inc->new_pg_temp[pg.first].clear();
1992 else
1993 pending_inc->new_pg_temp.erase(pg.first);
1994 }
1995 }
1996
11fdf7f2 1997 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1998 // primary down?
11fdf7f2 1999 if (nextmap.is_down(pg.second)) {
7c673cae
FG
2000 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
2001 << " to down " << pg.second << dendl;
2002 pending_inc->new_primary_temp[pg.first] = -1;
2003 continue;
2004 }
2005 // redundant primary_temp?
2006 vector<int> real_up, templess_up;
2007 int real_primary, templess_primary;
2008 pg_t pgid = pg.first;
11fdf7f2
TL
2009 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
2010 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
2011 if (real_primary == templess_primary){
2012 ldout(cct, 10) << __func__ << " removing primary_temp "
2013 << pgid << " -> " << real_primary
2014 << " (unnecessary/redundant)" << dendl;
11fdf7f2 2015 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
2016 pending_inc->new_primary_temp[pgid] = -1;
2017 else
2018 pending_inc->new_primary_temp.erase(pgid);
2019 }
2020 }
2021}
2022
494da23a 2023void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 2024{
494da23a
TL
2025 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
2026 for (auto& p : pg_upmap)
2027 upmap_pgs->push_back(p.first);
2028 for (auto& p : pg_upmap_items)
2029 upmap_pgs->push_back(p.first);
2030}
94b18763 2031
494da23a
TL
2032bool OSDMap::check_pg_upmaps(
2033 CephContext *cct,
2034 const vector<pg_t>& to_check,
2035 vector<pg_t> *to_cancel,
2036 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
2037{
2038 bool any_change = false;
2039 map<int, map<int, float>> rule_weight_map;
28e407b8 2040 for (auto& pg : to_check) {
494da23a 2041 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
2042 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
2043 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
2044 << dendl;
494da23a 2045 to_cancel->push_back(pg);
11fdf7f2
TL
2046 continue;
2047 }
2048 if (pi->is_pending_merge(pg, nullptr)) {
2049 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
2050 << dendl;
494da23a 2051 to_cancel->push_back(pg);
94b18763
FG
2052 continue;
2053 }
494da23a
TL
2054 vector<int> raw, up;
2055 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
2056 auto crush_rule = get_pg_pool_crush_rule(pg);
2057 auto r = crush->verify_upmap(cct,
2058 crush_rule,
2059 get_pg_pool_size(pg),
2060 up);
a8e16298
TL
2061 if (r < 0) {
2062 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
2063 << " returning " << r
2064 << dendl;
494da23a 2065 to_cancel->push_back(pg);
a8e16298
TL
2066 continue;
2067 }
2068 // below we check against crush-topology changing..
28e407b8
AA
2069 map<int, float> weight_map;
2070 auto it = rule_weight_map.find(crush_rule);
2071 if (it == rule_weight_map.end()) {
494da23a 2072 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
2073 if (r < 0) {
2074 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
2075 << "crush_rule " << crush_rule
2076 << dendl;
28e407b8
AA
2077 continue;
2078 }
2079 rule_weight_map[crush_rule] = weight_map;
2080 } else {
2081 weight_map = it->second;
2082 }
28e407b8 2083 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 2084 << " weight_map " << weight_map
94b18763 2085 << dendl;
a8e16298 2086 for (auto osd : up) {
28e407b8
AA
2087 auto it = weight_map.find(osd);
2088 if (it == weight_map.end()) {
92f5a8d4
TL
2089 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
2090 << "been moved out of the specific crush-tree"
2091 << dendl;
494da23a 2092 to_cancel->push_back(pg);
94b18763
FG
2093 break;
2094 }
494da23a 2095 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8 2096 if (adjusted_weight == 0) {
92f5a8d4
TL
2097 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
2098 << " is out/crush-out"
2099 << dendl;
494da23a 2100 to_cancel->push_back(pg);
94b18763
FG
2101 break;
2102 }
2103 }
eafe8130
TL
2104 if (!to_cancel->empty() && to_cancel->back() == pg)
2105 continue;
2106 // okay, upmap is valid
2107 // continue to check if it is still necessary
2108 auto i = pg_upmap.find(pg);
a4b75251
TL
2109 if (i != pg_upmap.end()) {
2110 if (i->second == raw) {
1e59de90 2111 ldout(cct, 10) << __func__ << "removing redundant pg_upmap " << i->first << " "
a4b75251
TL
2112 << i->second << dendl;
2113 to_cancel->push_back(pg);
2114 continue;
2115 }
2116 if ((int)i->second.size() != get_pg_pool_size(pg)) {
1e59de90 2117 ldout(cct, 10) << __func__ << "removing pg_upmap " << i->first << " "
a4b75251
TL
2118 << i->second << " != pool size " << get_pg_pool_size(pg)
2119 << dendl;
2120 to_cancel->push_back(pg);
2121 continue;
2122 }
eafe8130
TL
2123 }
2124 auto j = pg_upmap_items.find(pg);
2125 if (j != pg_upmap_items.end()) {
2126 mempool::osdmap::vector<pair<int,int>> newmap;
2127 for (auto& p : j->second) {
1e59de90
TL
2128 auto osd_from = p.first;
2129 auto osd_to = p.second;
2130 if (std::find(raw.begin(), raw.end(), osd_from) == raw.end()) {
eafe8130 2131 // cancel mapping if source osd does not exist anymore
1e59de90 2132 ldout(cct, 20) << __func__ << " pg_upmap_items (source osd does not exist) " << pg_upmap_items << dendl;
eafe8130
TL
2133 continue;
2134 }
1e59de90
TL
2135 if (osd_to != CRUSH_ITEM_NONE && osd_to < max_osd &&
2136 osd_to >= 0 && osd_weight[osd_to] == 0) {
eafe8130 2137 // cancel mapping if target osd is out
1e59de90 2138 ldout(cct, 20) << __func__ << " pg_upmap_items (target osd is out) " << pg_upmap_items << dendl;
eafe8130
TL
2139 continue;
2140 }
2141 newmap.push_back(p);
2142 }
2143 if (newmap.empty()) {
1e59de90 2144 ldout(cct, 10) << __func__ << " removing no-op pg_upmap_items "
eafe8130
TL
2145 << j->first << " " << j->second
2146 << dendl;
2147 to_cancel->push_back(pg);
aee94f69
TL
2148 } else if (newmap != j->second) {
2149 // check partial no-op here.
1e59de90 2150 ldout(cct, 10) << __func__ << " simplifying partially no-op pg_upmap_items "
eafe8130
TL
2151 << j->first << " " << j->second
2152 << " -> " << newmap
2153 << dendl;
2154 to_remap->insert({pg, newmap});
2155 any_change = true;
2156 }
2157 }
28e407b8 2158 }
494da23a
TL
2159 any_change = any_change || !to_cancel->empty();
2160 return any_change;
2161}
2162
2163void OSDMap::clean_pg_upmaps(
2164 CephContext *cct,
2165 Incremental *pending_inc,
2166 const vector<pg_t>& to_cancel,
2167 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2168{
28e407b8 2169 for (auto &pg: to_cancel) {
494da23a
TL
2170 auto i = pending_inc->new_pg_upmap.find(pg);
2171 if (i != pending_inc->new_pg_upmap.end()) {
2172 ldout(cct, 10) << __func__ << " cancel invalid pending "
2173 << "pg_upmap entry "
2174 << i->first << "->" << i->second
2175 << dendl;
2176 pending_inc->new_pg_upmap.erase(i);
94b18763 2177 }
494da23a
TL
2178 auto j = pg_upmap.find(pg);
2179 if (j != pg_upmap.end()) {
2180 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2181 << j->first << "->" << j->second
2182 << dendl;
2183 pending_inc->old_pg_upmap.insert(pg);
2184 }
2185 auto p = pending_inc->new_pg_upmap_items.find(pg);
2186 if (p != pending_inc->new_pg_upmap_items.end()) {
2187 ldout(cct, 10) << __func__ << " cancel invalid pending "
2188 << "pg_upmap_items entry "
2189 << p->first << "->" << p->second
2190 << dendl;
2191 pending_inc->new_pg_upmap_items.erase(p);
2192 }
2193 auto q = pg_upmap_items.find(pg);
2194 if (q != pg_upmap_items.end()) {
2195 ldout(cct, 10) << __func__ << " cancel invalid "
2196 << "pg_upmap_items entry "
2197 << q->first << "->" << q->second
2198 << dendl;
2199 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
2200 }
2201 }
494da23a
TL
2202 for (auto& i : to_remap)
2203 pending_inc->new_pg_upmap_items[i.first] = i.second;
2204}
2205
2206bool OSDMap::clean_pg_upmaps(
2207 CephContext *cct,
2208 Incremental *pending_inc) const
2209{
2210 ldout(cct, 10) << __func__ << dendl;
2211 vector<pg_t> to_check;
2212 vector<pg_t> to_cancel;
2213 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2214
2215 get_upmap_pgs(&to_check);
2216 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2217 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
1e59de90
TL
2218 //TODO: Create these 3 functions for pg_upmap_primaries and so they can be checked
2219 // and cleaned in the same way as pg_upmap. This is not critical since invalid
2220 // pg_upmap_primaries are never applied, (the final check is in _apply_upmap).
494da23a 2221 return any_change;
94b18763
FG
2222}
2223
7c673cae
FG
2224int OSDMap::apply_incremental(const Incremental &inc)
2225{
f67539c2 2226 new_blocklist_entries = false;
7c673cae
FG
2227 if (inc.epoch == 1)
2228 fsid = inc.fsid;
2229 else if (inc.fsid != fsid)
2230 return -EINVAL;
2231
11fdf7f2 2232 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
2233
2234 epoch++;
2235 modified = inc.modified;
2236
2237 // full map?
2238 if (inc.fullmap.length()) {
9f95a23c 2239 ceph::buffer::list bl(inc.fullmap);
7c673cae
FG
2240 decode(bl);
2241 return 0;
2242 }
2243
2244 // nope, incremental.
31f18b77 2245 if (inc.new_flags >= 0) {
7c673cae 2246 flags = inc.new_flags;
31f18b77
FG
2247 // the below is just to cover a newly-upgraded luminous mon
2248 // cluster that has to set require_jewel_osds or
2249 // require_kraken_osds before the osds can be upgraded to
2250 // luminous.
2251 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c
TL
2252 if (require_osd_release < ceph_release_t::kraken) {
2253 require_osd_release = ceph_release_t::kraken;
31f18b77
FG
2254 }
2255 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c
TL
2256 if (require_osd_release < ceph_release_t::jewel) {
2257 require_osd_release = ceph_release_t::jewel;
31f18b77
FG
2258 }
2259 }
2260 }
7c673cae
FG
2261
2262 if (inc.new_max_osd >= 0)
2263 set_max_osd(inc.new_max_osd);
2264
2265 if (inc.new_pool_max != -1)
2266 pool_max = inc.new_pool_max;
2267
2268 for (const auto &pool : inc.new_pools) {
2269 pools[pool.first] = pool.second;
2270 pools[pool.first].last_change = epoch;
2271 }
2272
11fdf7f2
TL
2273 new_removed_snaps = inc.new_removed_snaps;
2274 new_purged_snaps = inc.new_purged_snaps;
2275 for (auto p = new_removed_snaps.begin();
2276 p != new_removed_snaps.end();
2277 ++p) {
2278 removed_snaps_queue[p->first].union_of(p->second);
2279 }
2280 for (auto p = new_purged_snaps.begin();
2281 p != new_purged_snaps.end();
2282 ++p) {
2283 auto q = removed_snaps_queue.find(p->first);
2284 ceph_assert(q != removed_snaps_queue.end());
2285 q->second.subtract(p->second);
2286 if (q->second.empty()) {
2287 removed_snaps_queue.erase(q);
2288 }
2289 }
2290
2291 if (inc.new_last_up_change != utime_t()) {
2292 last_up_change = inc.new_last_up_change;
2293 }
2294 if (inc.new_last_in_change != utime_t()) {
2295 last_in_change = inc.new_last_in_change;
2296 }
2297
7c673cae
FG
2298 for (const auto &pname : inc.new_pool_names) {
2299 auto pool_name_entry = pool_name.find(pname.first);
2300 if (pool_name_entry != pool_name.end()) {
2301 name_pool.erase(pool_name_entry->second);
2302 pool_name_entry->second = pname.second;
2303 } else {
2304 pool_name[pname.first] = pname.second;
2305 }
2306 name_pool[pname.second] = pname.first;
2307 }
2308
2309 for (const auto &pool : inc.old_pools) {
2310 pools.erase(pool);
2311 name_pool.erase(pool_name[pool]);
2312 pool_name.erase(pool);
2313 }
2314
2315 for (const auto &weight : inc.new_weight) {
2316 set_weight(weight.first, weight.second);
2317
2318 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2319 // xinfo old_weight.
2320 if (weight.second) {
2321 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2322 osd_xinfo[weight.first].old_weight = 0;
2323 }
2324 }
2325
2326 for (const auto &primary_affinity : inc.new_primary_affinity) {
2327 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2328 }
2329
2330 // erasure_code_profiles
2331 for (const auto &profile : inc.old_erasure_code_profiles)
2332 erasure_code_profiles.erase(profile);
2333
2334 for (const auto &profile : inc.new_erasure_code_profiles) {
2335 set_erasure_code_profile(profile.first, profile.second);
2336 }
2337
2338 // up/down
2339 for (const auto &state : inc.new_state) {
2340 const auto osd = state.first;
2341 int s = state.second ? state.second : CEPH_OSD_UP;
2342 if ((osd_state[osd] & CEPH_OSD_UP) &&
2343 (s & CEPH_OSD_UP)) {
2344 osd_info[osd].down_at = epoch;
2345 osd_xinfo[osd].down_stamp = modified;
2346 }
2347 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2348 (s & CEPH_OSD_EXISTS)) {
2349 // osd is destroyed; clear out anything interesting.
2350 (*osd_uuid)[osd] = uuid_d();
2351 osd_info[osd] = osd_info_t();
2352 osd_xinfo[osd] = osd_xinfo_t();
2353 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2354 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2355 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2356 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2357 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2358 osd_state[osd] = 0;
2359 } else {
2360 osd_state[osd] ^= s;
2361 }
2362 }
2363
2364 for (const auto &client : inc.new_up_client) {
2365 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
9f95a23c 2366 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
11fdf7f2
TL
2367 osd_addrs->client_addrs[client.first].reset(
2368 new entity_addrvec_t(client.second));
2369 osd_addrs->hb_back_addrs[client.first].reset(
2370 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2371 osd_addrs->hb_front_addrs[client.first].reset(
2372 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2373
2374 osd_info[client.first].up_from = epoch;
2375 }
2376
2377 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2378 osd_addrs->cluster_addrs[cluster.first].reset(
2379 new entity_addrvec_t(cluster.second));
7c673cae
FG
2380
2381 // info
2382 for (const auto &thru : inc.new_up_thru)
2383 osd_info[thru.first].up_thru = thru.second;
2384
2385 for (const auto &interval : inc.new_last_clean_interval) {
2386 osd_info[interval.first].last_clean_begin = interval.second.first;
2387 osd_info[interval.first].last_clean_end = interval.second.second;
2388 }
2389
2390 for (const auto &lost : inc.new_lost)
2391 osd_info[lost.first].lost_at = lost.second;
2392
2393 // xinfo
2394 for (const auto &xinfo : inc.new_xinfo)
2395 osd_xinfo[xinfo.first] = xinfo.second;
2396
2397 // uuid
2398 for (const auto &uuid : inc.new_uuid)
2399 (*osd_uuid)[uuid.first] = uuid.second;
2400
2401 // pg rebuild
2402 for (const auto &pg : inc.new_pg_temp) {
2403 if (pg.second.empty())
2404 pg_temp->erase(pg.first);
2405 else
31f18b77
FG
2406 pg_temp->set(pg.first, pg.second);
2407 }
2408 if (!inc.new_pg_temp.empty()) {
2409 // make sure pg_temp is efficiently stored
2410 pg_temp->rebuild();
7c673cae
FG
2411 }
2412
2413 for (const auto &pg : inc.new_primary_temp) {
2414 if (pg.second == -1)
2415 primary_temp->erase(pg.first);
2416 else
2417 (*primary_temp)[pg.first] = pg.second;
2418 }
2419
2420 for (auto& p : inc.new_pg_upmap) {
2421 pg_upmap[p.first] = p.second;
2422 }
2423 for (auto& pg : inc.old_pg_upmap) {
2424 pg_upmap.erase(pg);
2425 }
2426 for (auto& p : inc.new_pg_upmap_items) {
2427 pg_upmap_items[p.first] = p.second;
2428 }
2429 for (auto& pg : inc.old_pg_upmap_items) {
2430 pg_upmap_items.erase(pg);
2431 }
2432
1e59de90
TL
2433 for (auto& [pg, prim] : inc.new_pg_upmap_primary) {
2434 pg_upmap_primaries[pg] = prim;
2435 }
2436 for (auto& pg : inc.old_pg_upmap_primary) {
2437 pg_upmap_primaries.erase(pg);
2438 }
2439
f67539c2
TL
2440 // blocklist
2441 if (!inc.new_blocklist.empty()) {
2442 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2443 new_blocklist_entries = true;
7c673cae 2444 }
f67539c2
TL
2445 for (const auto &addr : inc.old_blocklist)
2446 blocklist.erase(addr);
7c673cae 2447
33c7a0ef
TL
2448 for (const auto& addr_p : inc.new_range_blocklist) {
2449 range_blocklist.insert(addr_p);
2450 calculated_ranges.emplace(addr_p.first, addr_p.first);
2451 new_blocklist_entries = true;
2452 }
2453 for (const auto &addr : inc.old_range_blocklist) {
2454 calculated_ranges.erase(addr);
2455 range_blocklist.erase(addr);
2456 }
2457
81eedcae
TL
2458 for (auto& i : inc.new_crush_node_flags) {
2459 if (i.second) {
2460 crush_node_flags[i.first] = i.second;
2461 } else {
2462 crush_node_flags.erase(i.first);
2463 }
2464 }
2465
2466 for (auto& i : inc.new_device_class_flags) {
2467 if (i.second) {
2468 device_class_flags[i.first] = i.second;
2469 } else {
2470 device_class_flags.erase(i.first);
2471 }
2472 }
2473
7c673cae
FG
2474 // cluster snapshot?
2475 if (inc.cluster_snapshot.length()) {
2476 cluster_snapshot = inc.cluster_snapshot;
2477 cluster_snapshot_epoch = inc.epoch;
2478 } else {
2479 cluster_snapshot.clear();
2480 cluster_snapshot_epoch = 0;
2481 }
2482
2483 if (inc.new_nearfull_ratio >= 0) {
2484 nearfull_ratio = inc.new_nearfull_ratio;
2485 }
2486 if (inc.new_backfillfull_ratio >= 0) {
2487 backfillfull_ratio = inc.new_backfillfull_ratio;
2488 }
2489 if (inc.new_full_ratio >= 0) {
2490 full_ratio = inc.new_full_ratio;
2491 }
9f95a23c 2492 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
7c673cae
FG
2493 require_min_compat_client = inc.new_require_min_compat_client;
2494 }
9f95a23c 2495 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
31f18b77 2496 require_osd_release = inc.new_require_osd_release;
9f95a23c 2497 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 2498 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2499 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2500 }
2501 }
7c673cae 2502
9f95a23c 2503 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
11fdf7f2 2504 require_osd_release = inc.new_require_osd_release;
9f95a23c 2505 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
2506 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2507 }
2508 }
7c673cae
FG
2509 // do new crush map last (after up/down stuff)
2510 if (inc.crush.length()) {
9f95a23c 2511 ceph::buffer::list bl(inc.crush);
11fdf7f2 2512 auto blp = bl.cbegin();
7c673cae
FG
2513 crush.reset(new CrushWrapper);
2514 crush->decode(blp);
9f95a23c 2515 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77
FG
2516 // only increment if this is a luminous-encoded osdmap, lest
2517 // the mon's crush_version diverge from what the osds or others
2518 // are decoding and applying on their end. if we won't encode
2519 // it in the canonical version, don't change it.
2520 ++crush_version;
2521 }
81eedcae
TL
2522 for (auto it = device_class_flags.begin();
2523 it != device_class_flags.end();) {
2524 const char* class_name = crush->get_class_name(it->first);
2525 if (!class_name) // device class is gone
2526 it = device_class_flags.erase(it);
2527 else
2528 it++;
2529 }
7c673cae
FG
2530 }
2531
f67539c2
TL
2532 if (inc.change_stretch_mode) {
2533 stretch_mode_enabled = inc.stretch_mode_enabled;
2534 stretch_bucket_count = inc.new_stretch_bucket_count;
2535 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2536 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2537 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2538 }
2539
1e59de90
TL
2540 switch (inc.mutate_allow_crimson) {
2541 case Incremental::mutate_allow_crimson_t::NONE:
2542 break;
2543 case Incremental::mutate_allow_crimson_t::SET:
2544 allow_crimson = true;
2545 break;
2546 case Incremental::mutate_allow_crimson_t::CLEAR:
2547 allow_crimson = false;
2548 break;
2549 }
2550
7c673cae
FG
2551 calc_num_osds();
2552 _calc_up_osd_features();
2553 return 0;
2554}
2555
2556// mapping
2557int OSDMap::map_to_pg(
2558 int64_t poolid,
2559 const string& name,
2560 const string& key,
2561 const string& nspace,
2562 pg_t *pg) const
2563{
2564 // calculate ps (placement seed)
2565 const pg_pool_t *pool = get_pg_pool(poolid);
2566 if (!pool)
2567 return -ENOENT;
2568 ps_t ps;
2569 if (!key.empty())
2570 ps = pool->hash_key(key, nspace);
2571 else
2572 ps = pool->hash_key(name, nspace);
2573 *pg = pg_t(ps, poolid);
2574 return 0;
2575}
2576
2577int OSDMap::object_locator_to_pg(
2578 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2579{
2580 if (loc.hash >= 0) {
2581 if (!get_pg_pool(loc.get_pool())) {
2582 return -ENOENT;
2583 }
2584 pg = pg_t(loc.hash, loc.get_pool());
2585 return 0;
2586 }
2587 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2588}
2589
2590ceph_object_layout OSDMap::make_object_layout(
2591 object_t oid, int pg_pool, string nspace) const
2592{
2593 object_locator_t loc(pg_pool, nspace);
2594
2595 ceph_object_layout ol;
2596 pg_t pgid = object_locator_to_pg(oid, loc);
2597 ol.ol_pgid = pgid.get_old_pg().v;
2598 ol.ol_stripe_unit = 0;
2599 return ol;
2600}
2601
2602void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2603 vector<int>& osds) const
2604{
2605 if (pool.can_shift_osds()) {
2606 unsigned removed = 0;
2607 for (unsigned i = 0; i < osds.size(); i++) {
2608 if (!exists(osds[i])) {
2609 removed++;
2610 continue;
2611 }
2612 if (removed) {
2613 osds[i - removed] = osds[i];
2614 }
2615 }
2616 if (removed)
2617 osds.resize(osds.size() - removed);
2618 } else {
2619 for (auto& osd : osds) {
2620 if (!exists(osd))
2621 osd = CRUSH_ITEM_NONE;
2622 }
2623 }
2624}
2625
31f18b77 2626void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2627 const pg_pool_t& pool, pg_t pg,
2628 vector<int> *osds,
2629 ps_t *ppps) const
2630{
2631 // map to osds[]
2632 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2633 unsigned size = pool.get_size();
2634
2635 // what crush rule?
20effc67 2636 int ruleno = pool.get_crush_rule();
7c673cae
FG
2637 if (ruleno >= 0)
2638 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2639
2640 _remove_nonexistent_osds(pool, *osds);
2641
2642 if (ppps)
2643 *ppps = pps;
7c673cae
FG
2644}
2645
2646int OSDMap::_pick_primary(const vector<int>& osds) const
2647{
2648 for (auto osd : osds) {
2649 if (osd != CRUSH_ITEM_NONE) {
2650 return osd;
2651 }
2652 }
2653 return -1;
2654}
2655
224ce89b 2656void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2657{
2658 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2659 auto p = pg_upmap.find(pg);
2660 if (p != pg_upmap.end()) {
2661 // make sure targets aren't marked out
2662 for (auto osd : p->second) {
91327a77
AA
2663 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2664 osd_weight[osd] == 0) {
7c673cae
FG
2665 // reject/ignore the explicit mapping
2666 return;
2667 }
2668 }
2669 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2670 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2671 }
2672
2673 auto q = pg_upmap_items.find(pg);
2674 if (q != pg_upmap_items.end()) {
181888fb
FG
2675 // NOTE: this approach does not allow a bidirectional swap,
2676 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
1e59de90
TL
2677 for (auto& [osd_from, osd_to] : q->second) {
2678 // A capcaity change upmap (repace osd in the pg with osd not in the pg)
181888fb
FG
2679 // make sure the replacement value doesn't already appear
2680 bool exists = false;
2681 ssize_t pos = -1;
2682 for (unsigned i = 0; i < raw->size(); ++i) {
2683 int osd = (*raw)[i];
1e59de90 2684 if (osd == osd_to) {
181888fb
FG
2685 exists = true;
2686 break;
2687 }
2688 // ignore mapping if target is marked out (or invalid osd id)
1e59de90 2689 if (osd == osd_from &&
181888fb 2690 pos < 0 &&
1e59de90
TL
2691 !(osd_to != CRUSH_ITEM_NONE && osd_to < max_osd &&
2692 osd_to >= 0 && osd_weight[osd_to] == 0)) {
181888fb 2693 pos = i;
1e59de90 2694 }
181888fb
FG
2695 }
2696 if (!exists && pos >= 0) {
1e59de90
TL
2697 (*raw)[pos] = osd_to;
2698 }
2699 }
2700 }
2701 auto r = pg_upmap_primaries.find(pg);
2702 if (r != pg_upmap_primaries.end()) {
2703 auto new_prim = r->second;
2704 // Apply mapping only if new primary is not marked out and valid osd id
2705 if (new_prim != CRUSH_ITEM_NONE && new_prim < max_osd && new_prim >= 0 &&
2706 osd_weight[new_prim] != 0) {
2707 int new_prim_idx = 0;
2708 for (int i = 1 ; i < (int)raw->size(); i++) { // start from 1 on purpose
2709 if ((*raw)[i] == new_prim) {
2710 new_prim_idx = i;
2711 break;
2712 }
2713 }
2714 if (new_prim_idx > 0) {
2715 // swap primary
2716 (*raw)[new_prim_idx] = (*raw)[0];
2717 (*raw)[0] = new_prim;
7c673cae
FG
2718 }
2719 }
2720 }
2721}
2722
2723// pg -> (up osd list)
2724void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2725 vector<int> *up) const
2726{
2727 if (pool.can_shift_osds()) {
2728 // shift left
2729 up->clear();
2730 up->reserve(raw.size());
2731 for (unsigned i=0; i<raw.size(); i++) {
2732 if (!exists(raw[i]) || is_down(raw[i]))
2733 continue;
2734 up->push_back(raw[i]);
2735 }
2736 } else {
2737 // set down/dne devices to NONE
2738 up->resize(raw.size());
2739 for (int i = raw.size() - 1; i >= 0; --i) {
2740 if (!exists(raw[i]) || is_down(raw[i])) {
2741 (*up)[i] = CRUSH_ITEM_NONE;
2742 } else {
2743 (*up)[i] = raw[i];
2744 }
2745 }
2746 }
2747}
2748
2749void OSDMap::_apply_primary_affinity(ps_t seed,
2750 const pg_pool_t& pool,
2751 vector<int> *osds,
2752 int *primary) const
2753{
2754 // do we have any non-default primary_affinity values for these osds?
2755 if (!osd_primary_affinity)
2756 return;
2757
2758 bool any = false;
2759 for (const auto osd : *osds) {
2760 if (osd != CRUSH_ITEM_NONE &&
2761 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2762 any = true;
2763 break;
2764 }
2765 }
2766 if (!any)
2767 return;
2768
2769 // pick the primary. feed both the seed (for the pg) and the osd
2770 // into the hash/rng so that a proportional fraction of an osd's pgs
2771 // get rejected as primary.
2772 int pos = -1;
2773 for (unsigned i = 0; i < osds->size(); ++i) {
2774 int o = (*osds)[i];
2775 if (o == CRUSH_ITEM_NONE)
2776 continue;
2777 unsigned a = (*osd_primary_affinity)[o];
2778 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2779 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2780 seed, o) >> 16) >= a) {
2781 // we chose not to use this primary. note it anyway as a
2782 // fallback in case we don't pick anyone else, but keep looking.
2783 if (pos < 0)
2784 pos = i;
2785 } else {
2786 pos = i;
2787 break;
2788 }
2789 }
2790 if (pos < 0)
2791 return;
2792
2793 *primary = (*osds)[pos];
2794
2795 if (pool.can_shift_osds() && pos > 0) {
2796 // move the new primary to the front.
2797 for (int i = pos; i > 0; --i) {
2798 (*osds)[i] = (*osds)[i-1];
2799 }
2800 (*osds)[0] = *primary;
2801 }
2802}
2803
2804void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2805 vector<int> *temp_pg, int *temp_primary) const
2806{
2807 pg = pool.raw_pg_to_pg(pg);
2808 const auto p = pg_temp->find(pg);
2809 temp_pg->clear();
2810 if (p != pg_temp->end()) {
2811 for (unsigned i=0; i<p->second.size(); i++) {
2812 if (!exists(p->second[i]) || is_down(p->second[i])) {
2813 if (pool.can_shift_osds()) {
2814 continue;
2815 } else {
2816 temp_pg->push_back(CRUSH_ITEM_NONE);
2817 }
2818 } else {
2819 temp_pg->push_back(p->second[i]);
2820 }
2821 }
2822 }
2823 const auto &pp = primary_temp->find(pg);
2824 *temp_primary = -1;
2825 if (pp != primary_temp->end()) {
2826 *temp_primary = pp->second;
2827 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2828 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2829 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2830 *temp_primary = (*temp_pg)[i];
2831 break;
2832 }
2833 }
2834 }
2835}
2836
31f18b77 2837void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2838{
7c673cae 2839 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2840 if (!pool) {
2841 *primary = -1;
2842 raw->clear();
31f18b77 2843 return;
11fdf7f2 2844 }
31f18b77 2845 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2846 *primary = _pick_primary(*raw);
7c673cae
FG
2847}
2848
494da23a
TL
2849void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2850 vector<int> *raw_upmap) const
a8e16298
TL
2851{
2852 auto pool = get_pg_pool(pg.pool());
2853 if (!pool) {
2854 raw_upmap->clear();
2855 return;
2856 }
494da23a
TL
2857 _pg_to_raw_osds(*pool, pg, raw, NULL);
2858 *raw_upmap = *raw;
a8e16298
TL
2859 _apply_upmap(*pool, pg, raw_upmap);
2860}
2861
7c673cae
FG
2862void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2863{
2864 const pg_pool_t *pool = get_pg_pool(pg.pool());
2865 if (!pool) {
11fdf7f2
TL
2866 *primary = -1;
2867 up->clear();
7c673cae
FG
2868 return;
2869 }
2870 vector<int> raw;
2871 ps_t pps;
2872 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2873 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2874 _raw_to_up_osds(*pool, raw, up);
2875 *primary = _pick_primary(raw);
2876 _apply_primary_affinity(pps, *pool, up, primary);
2877}
31f18b77 2878
7c673cae
FG
2879void OSDMap::_pg_to_up_acting_osds(
2880 const pg_t& pg, vector<int> *up, int *up_primary,
2881 vector<int> *acting, int *acting_primary,
2882 bool raw_pg_to_pg) const
2883{
2884 const pg_pool_t *pool = get_pg_pool(pg.pool());
2885 if (!pool ||
2886 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2887 if (up)
2888 up->clear();
2889 if (up_primary)
2890 *up_primary = -1;
2891 if (acting)
2892 acting->clear();
2893 if (acting_primary)
2894 *acting_primary = -1;
2895 return;
2896 }
2897 vector<int> raw;
2898 vector<int> _up;
2899 vector<int> _acting;
2900 int _up_primary;
2901 int _acting_primary;
2902 ps_t pps;
2903 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2904 if (_acting.empty() || up || up_primary) {
2905 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2906 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2907 _raw_to_up_osds(*pool, raw, &_up);
2908 _up_primary = _pick_primary(_up);
2909 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2910 if (_acting.empty()) {
2911 _acting = _up;
2912 if (_acting_primary == -1) {
2913 _acting_primary = _up_primary;
2914 }
2915 }
2916
2917 if (up)
2918 up->swap(_up);
2919 if (up_primary)
2920 *up_primary = _up_primary;
2921 }
2922
2923 if (acting)
2924 acting->swap(_acting);
2925 if (acting_primary)
2926 *acting_primary = _acting_primary;
2927}
2928
9f95a23c 2929int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
7c673cae 2930{
9f95a23c
TL
2931 // This implementation is broken for EC PGs since the osd may appear
2932 // multiple times in the acting set. See
2933 // https://tracker.ceph.com/issues/43213
7c673cae
FG
2934 if (!nrep)
2935 nrep = acting.size();
2936 for (int i=0; i<nrep; i++)
2937 if (acting[i] == osd)
2938 return i;
2939 return -1;
2940}
2941
9f95a23c 2942int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
7c673cae 2943{
9f95a23c
TL
2944 int nrep = acting.size();
2945 if (who.shard == shard_id_t::NO_SHARD) {
2946 for (int i=0; i<nrep; i++) {
2947 if (acting[i] == who.osd) {
2948 return i;
2949 }
2950 }
2951 } else {
2952 if (who.shard < nrep && acting[who.shard] == who.osd) {
2953 return who.shard;
2954 }
2955 }
2956 return -1;
7c673cae
FG
2957}
2958
9f95a23c 2959bool OSDMap::primary_changed_broken(
7c673cae
FG
2960 int oldprimary,
2961 const vector<int> &oldacting,
2962 int newprimary,
2963 const vector<int> &newacting)
2964{
2965 if (oldacting.empty() && newacting.empty())
2966 return false; // both still empty
2967 if (oldacting.empty() ^ newacting.empty())
2968 return true; // was empty, now not, or vice versa
2969 if (oldprimary != newprimary)
2970 return true; // primary changed
9f95a23c
TL
2971 if (calc_pg_role_broken(oldprimary, oldacting) !=
2972 calc_pg_role_broken(newprimary, newacting))
7c673cae
FG
2973 return true;
2974 return false; // same primary (tho replicas may have changed)
2975}
2976
28e407b8
AA
2977uint64_t OSDMap::get_encoding_features() const
2978{
2979 uint64_t f = SIGNIFICANT_FEATURES;
df9f7d3d
TL
2980 if (require_osd_release < ceph_release_t::reef) {
2981 f &= ~CEPH_FEATURE_SERVER_REEF;
2982 }
9f95a23c
TL
2983 if (require_osd_release < ceph_release_t::octopus) {
2984 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2985 }
2986 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
2987 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2988 }
9f95a23c 2989 if (require_osd_release < ceph_release_t::mimic) {
11fdf7f2
TL
2990 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2991 }
9f95a23c 2992 if (require_osd_release < ceph_release_t::luminous) {
28e407b8
AA
2993 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2994 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2995 }
9f95a23c 2996 if (require_osd_release < ceph_release_t::kraken) {
28e407b8 2997 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2998 CEPH_FEATURE_MSG_ADDR2);
28e407b8 2999 }
9f95a23c 3000 if (require_osd_release < ceph_release_t::jewel) {
28e407b8 3001 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
3002 CEPH_FEATURE_NEW_OSDOP_ENCODING |
3003 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
3004 }
3005 return f;
3006}
7c673cae
FG
3007
3008// serialize, unserialize
9f95a23c 3009void OSDMap::encode_client_old(ceph::buffer::list& bl) const
7c673cae 3010{
11fdf7f2 3011 using ceph::encode;
7c673cae 3012 __u16 v = 5;
11fdf7f2 3013 encode(v, bl);
7c673cae
FG
3014
3015 // base
11fdf7f2
TL
3016 encode(fsid, bl);
3017 encode(epoch, bl);
3018 encode(created, bl);
3019 encode(modified, bl);
7c673cae 3020
11fdf7f2 3021 // for encode(pools, bl);
7c673cae 3022 __u32 n = pools.size();
11fdf7f2 3023 encode(n, bl);
7c673cae
FG
3024
3025 for (const auto &pool : pools) {
3026 n = pool.first;
11fdf7f2
TL
3027 encode(n, bl);
3028 encode(pool.second, bl, 0);
7c673cae 3029 }
11fdf7f2 3030 // for encode(pool_name, bl);
7c673cae 3031 n = pool_name.size();
11fdf7f2 3032 encode(n, bl);
7c673cae
FG
3033 for (const auto &pname : pool_name) {
3034 n = pname.first;
11fdf7f2
TL
3035 encode(n, bl);
3036 encode(pname.second, bl);
7c673cae 3037 }
11fdf7f2 3038 // for encode(pool_max, bl);
7c673cae 3039 n = pool_max;
11fdf7f2 3040 encode(n, bl);
7c673cae 3041
11fdf7f2 3042 encode(flags, bl);
7c673cae 3043
11fdf7f2 3044 encode(max_osd, bl);
31f18b77
FG
3045 {
3046 uint32_t n = osd_state.size();
11fdf7f2 3047 encode(n, bl);
31f18b77 3048 for (auto s : osd_state) {
11fdf7f2 3049 encode((uint8_t)s, bl);
31f18b77
FG
3050 }
3051 }
11fdf7f2
TL
3052 encode(osd_weight, bl);
3053 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 3054
11fdf7f2 3055 // for encode(pg_temp, bl);
7c673cae 3056 n = pg_temp->size();
11fdf7f2 3057 encode(n, bl);
f67539c2 3058 for (const auto& pg : *pg_temp) {
7c673cae 3059 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
3060 encode(opg, bl);
3061 encode(pg.second, bl);
7c673cae
FG
3062 }
3063
3064 // crush
9f95a23c 3065 ceph::buffer::list cbl;
7c673cae 3066 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 3067 encode(cbl, bl);
7c673cae
FG
3068}
3069
9f95a23c 3070void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 3071{
11fdf7f2 3072 using ceph::encode;
7c673cae
FG
3073 if ((features & CEPH_FEATURE_PGID64) == 0) {
3074 encode_client_old(bl);
3075 return;
3076 }
3077
3078 __u16 v = 6;
11fdf7f2 3079 encode(v, bl);
7c673cae
FG
3080
3081 // base
11fdf7f2
TL
3082 encode(fsid, bl);
3083 encode(epoch, bl);
3084 encode(created, bl);
3085 encode(modified, bl);
7c673cae 3086
11fdf7f2
TL
3087 encode(pools, bl, features);
3088 encode(pool_name, bl);
3089 encode(pool_max, bl);
7c673cae 3090
11fdf7f2 3091 encode(flags, bl);
7c673cae 3092
11fdf7f2 3093 encode(max_osd, bl);
31f18b77
FG
3094 {
3095 uint32_t n = osd_state.size();
11fdf7f2 3096 encode(n, bl);
31f18b77 3097 for (auto s : osd_state) {
11fdf7f2 3098 encode((uint8_t)s, bl);
31f18b77
FG
3099 }
3100 }
11fdf7f2
TL
3101 encode(osd_weight, bl);
3102 encode(osd_addrs->client_addrs, bl, features);
7c673cae 3103
11fdf7f2 3104 encode(*pg_temp, bl);
7c673cae
FG
3105
3106 // crush
9f95a23c 3107 ceph::buffer::list cbl;
7c673cae 3108 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 3109 encode(cbl, bl);
7c673cae
FG
3110
3111 // extended
3112 __u16 ev = 10;
11fdf7f2
TL
3113 encode(ev, bl);
3114 encode(osd_addrs->hb_back_addrs, bl, features);
3115 encode(osd_info, bl);
f67539c2 3116 encode(blocklist, bl, features);
11fdf7f2
TL
3117 encode(osd_addrs->cluster_addrs, bl, features);
3118 encode(cluster_snapshot_epoch, bl);
3119 encode(cluster_snapshot, bl);
3120 encode(*osd_uuid, bl);
9f95a23c 3121 encode(osd_xinfo, bl, features);
11fdf7f2 3122 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
3123}
3124
11fdf7f2
TL
3125/* for a description of osdmap versions, and when they were introduced, please
3126 * refer to
3127 * doc/dev/osd_internals/osdmap_versions.txt
3128 */
9f95a23c 3129void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 3130{
11fdf7f2 3131 using ceph::encode;
7c673cae
FG
3132 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
3133 encode_classic(bl, features);
3134 return;
3135 }
3136
3137 // only a select set of callers should *ever* be encoding new
3138 // OSDMaps. others should be passing around the canonical encoded
3139 // buffers from on high. select out those callers by passing in an
3140 // "impossible" feature bit.
11fdf7f2 3141 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
3142 features &= ~CEPH_FEATURE_RESERVED;
3143
3144 size_t start_offset = bl.length();
3145 size_t tail_offset;
11fdf7f2 3146 size_t crc_offset;
9f95a23c 3147 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
3148
3149 // meta-encoding: how we include client-used and osd-specific data
3150 ENCODE_START(8, 7, bl);
3151
3152 {
28e407b8
AA
3153 // NOTE: any new encoding dependencies must be reflected by
3154 // SIGNIFICANT_FEATURES
1e59de90 3155 uint8_t v = 10;
31f18b77 3156 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 3157 v = 3;
11fdf7f2
TL
3158 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3159 v = 6;
3160 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3161 v = 7;
df9f7d3d 3162 } else if (!HAVE_FEATURE(features, SERVER_REEF)) {
1e59de90 3163 v = 9;
df9f7d3d 3164 }
7c673cae
FG
3165 ENCODE_START(v, 1, bl); // client-usable data
3166 // base
11fdf7f2
TL
3167 encode(fsid, bl);
3168 encode(epoch, bl);
3169 encode(created, bl);
3170 encode(modified, bl);
7c673cae 3171
11fdf7f2
TL
3172 encode(pools, bl, features);
3173 encode(pool_name, bl);
3174 encode(pool_max, bl);
7c673cae 3175
31f18b77
FG
3176 if (v < 4) {
3177 decltype(flags) f = flags;
9f95a23c 3178 if (require_osd_release >= ceph_release_t::luminous)
c07f9fc5 3179 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
9f95a23c 3180 else if (require_osd_release == ceph_release_t::kraken)
31f18b77 3181 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
9f95a23c 3182 else if (require_osd_release == ceph_release_t::jewel)
31f18b77 3183 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 3184 encode(f, bl);
31f18b77 3185 } else {
11fdf7f2 3186 encode(flags, bl);
31f18b77 3187 }
7c673cae 3188
11fdf7f2 3189 encode(max_osd, bl);
31f18b77 3190 if (v >= 5) {
11fdf7f2 3191 encode(osd_state, bl);
31f18b77
FG
3192 } else {
3193 uint32_t n = osd_state.size();
11fdf7f2 3194 encode(n, bl);
31f18b77 3195 for (auto s : osd_state) {
11fdf7f2 3196 encode((uint8_t)s, bl);
31f18b77
FG
3197 }
3198 }
11fdf7f2
TL
3199 encode(osd_weight, bl);
3200 if (v >= 8) {
3201 encode(osd_addrs->client_addrs, bl, features);
3202 } else {
3203 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
3204 }
7c673cae 3205
11fdf7f2
TL
3206 encode(*pg_temp, bl);
3207 encode(*primary_temp, bl);
7c673cae 3208 if (osd_primary_affinity) {
11fdf7f2 3209 encode(*osd_primary_affinity, bl);
7c673cae
FG
3210 } else {
3211 vector<__u32> v;
11fdf7f2 3212 encode(v, bl);
7c673cae
FG
3213 }
3214
3215 // crush
9f95a23c 3216 ceph::buffer::list cbl;
7c673cae 3217 crush->encode(cbl, features);
11fdf7f2
TL
3218 encode(cbl, bl);
3219 encode(erasure_code_profiles, bl);
7c673cae
FG
3220
3221 if (v >= 4) {
11fdf7f2
TL
3222 encode(pg_upmap, bl);
3223 encode(pg_upmap_items, bl);
7c673cae 3224 } else {
11fdf7f2
TL
3225 ceph_assert(pg_upmap.empty());
3226 ceph_assert(pg_upmap_items.empty());
7c673cae 3227 }
31f18b77 3228 if (v >= 6) {
11fdf7f2
TL
3229 encode(crush_version, bl);
3230 }
3231 if (v >= 7) {
3232 encode(new_removed_snaps, bl);
3233 encode(new_purged_snaps, bl);
3234 }
3235 if (v >= 9) {
3236 encode(last_up_change, bl);
3237 encode(last_in_change, bl);
31f18b77 3238 }
1e59de90
TL
3239 if (v >= 10) {
3240 encode(pg_upmap_primaries, bl);
3241 } else {
3242 ceph_assert(pg_upmap_primaries.empty());
3243 }
7c673cae
FG
3244 ENCODE_FINISH(bl); // client-usable data
3245 }
3246
3247 {
28e407b8
AA
3248 // NOTE: any new encoding dependencies must be reflected by
3249 // SIGNIFICANT_FEATURES
1e59de90 3250 uint8_t target_v = 9; // when bumping this, be aware of allow_crimson
7c673cae
FG
3251 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3252 target_v = 1;
11fdf7f2
TL
3253 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3254 target_v = 5;
3255 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3256 target_v = 6;
7c673cae 3257 }
f67539c2
TL
3258 if (stretch_mode_enabled) {
3259 target_v = std::max((uint8_t)10, target_v);
3260 }
33c7a0ef
TL
3261 if (!range_blocklist.empty()) {
3262 target_v = std::max((uint8_t)11, target_v);
3263 }
1e59de90
TL
3264 if (allow_crimson) {
3265 target_v = std::max((uint8_t)12, target_v);
3266 }
7c673cae 3267 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
3268 if (target_v < 7) {
3269 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
3270 } else {
3271 encode(osd_addrs->hb_back_addrs, bl, features);
3272 }
3273 encode(osd_info, bl);
7c673cae
FG
3274 {
3275 // put this in a sorted, ordered map<> so that we encode in a
3276 // deterministic order.
f67539c2
TL
3277 map<entity_addr_t,utime_t> blocklist_map;
3278 for (const auto &addr : blocklist)
3279 blocklist_map.insert(make_pair(addr.first, addr.second));
3280 encode(blocklist_map, bl, features);
11fdf7f2
TL
3281 }
3282 if (target_v < 7) {
3283 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3284 } else {
3285 encode(osd_addrs->cluster_addrs, bl, features);
3286 }
3287 encode(cluster_snapshot_epoch, bl);
3288 encode(cluster_snapshot, bl);
3289 encode(*osd_uuid, bl);
9f95a23c 3290 encode(osd_xinfo, bl, features);
11fdf7f2
TL
3291 if (target_v < 7) {
3292 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3293 } else {
3294 encode(osd_addrs->hb_front_addrs, bl, features);
3295 }
7c673cae 3296 if (target_v >= 2) {
11fdf7f2
TL
3297 encode(nearfull_ratio, bl);
3298 encode(full_ratio, bl);
3299 encode(backfillfull_ratio, bl);
31f18b77
FG
3300 }
3301 // 4 was string-based new_require_min_compat_client
3302 if (target_v >= 5) {
11fdf7f2
TL
3303 encode(require_min_compat_client, bl);
3304 encode(require_osd_release, bl);
3305 }
3306 if (target_v >= 6) {
3307 encode(removed_snaps_queue, bl);
7c673cae 3308 }
81eedcae
TL
3309 if (target_v >= 8) {
3310 encode(crush_node_flags, bl);
3311 }
3312 if (target_v >= 9) {
3313 encode(device_class_flags, bl);
3314 }
f67539c2
TL
3315 if (target_v >= 10) {
3316 encode(stretch_mode_enabled, bl);
3317 encode(stretch_bucket_count, bl);
3318 encode(degraded_stretch_mode, bl);
3319 encode(recovering_stretch_mode, bl);
3320 encode(stretch_mode_bucket, bl);
3321 }
33c7a0ef
TL
3322 if (target_v >= 11) {
3323 ::encode(range_blocklist, bl, features);
3324 }
1e59de90
TL
3325 if (target_v >= 12) {
3326 ::encode(allow_crimson, bl);
3327 }
7c673cae
FG
3328 ENCODE_FINISH(bl); // osd-only data
3329 }
3330
11fdf7f2
TL
3331 crc_offset = bl.length();
3332 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
3333 tail_offset = bl.length();
3334
3335 ENCODE_FINISH(bl); // meta-encoding wrapper
3336
3337 // fill in crc
9f95a23c 3338 ceph::buffer::list front;
11fdf7f2 3339 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
3340 crc = front.crc32c(-1);
3341 if (tail_offset < bl.length()) {
9f95a23c 3342 ceph::buffer::list tail;
7c673cae
FG
3343 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3344 crc = tail.crc32c(crc);
3345 }
3346 ceph_le32 crc_le;
3347 crc_le = crc;
11fdf7f2 3348 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
3349 crc_defined = true;
3350}
3351
11fdf7f2
TL
3352/* for a description of osdmap versions, and when they were introduced, please
3353 * refer to
3354 * doc/dev/osd_internals/osdmap_versions.txt
3355 */
9f95a23c 3356void OSDMap::decode(ceph::buffer::list& bl)
7c673cae 3357{
11fdf7f2 3358 auto p = bl.cbegin();
7c673cae
FG
3359 decode(p);
3360}
3361
9f95a23c 3362void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
7c673cae 3363{
11fdf7f2 3364 using ceph::decode;
7c673cae
FG
3365 __u32 n, t;
3366 __u16 v;
11fdf7f2 3367 decode(v, p);
7c673cae
FG
3368
3369 // base
11fdf7f2
TL
3370 decode(fsid, p);
3371 decode(epoch, p);
3372 decode(created, p);
3373 decode(modified, p);
7c673cae
FG
3374
3375 if (v < 6) {
3376 if (v < 4) {
3377 int32_t max_pools = 0;
11fdf7f2 3378 decode(max_pools, p);
7c673cae
FG
3379 pool_max = max_pools;
3380 }
3381 pools.clear();
11fdf7f2 3382 decode(n, p);
7c673cae 3383 while (n--) {
11fdf7f2
TL
3384 decode(t, p);
3385 decode(pools[t], p);
7c673cae
FG
3386 }
3387 if (v == 4) {
11fdf7f2 3388 decode(n, p);
7c673cae
FG
3389 pool_max = n;
3390 } else if (v == 5) {
3391 pool_name.clear();
11fdf7f2 3392 decode(n, p);
7c673cae 3393 while (n--) {
11fdf7f2
TL
3394 decode(t, p);
3395 decode(pool_name[t], p);
7c673cae 3396 }
11fdf7f2 3397 decode(n, p);
7c673cae
FG
3398 pool_max = n;
3399 }
3400 } else {
11fdf7f2
TL
3401 decode(pools, p);
3402 decode(pool_name, p);
3403 decode(pool_max, p);
7c673cae
FG
3404 }
3405 // kludge around some old bug that zeroed out pool_max (#2307)
3406 if (pools.size() && pool_max < pools.rbegin()->first) {
3407 pool_max = pools.rbegin()->first;
3408 }
3409
11fdf7f2 3410 decode(flags, p);
7c673cae 3411
11fdf7f2 3412 decode(max_osd, p);
31f18b77
FG
3413 {
3414 vector<uint8_t> os;
11fdf7f2 3415 decode(os, p);
31f18b77
FG
3416 osd_state.resize(os.size());
3417 for (unsigned i = 0; i < os.size(); ++i) {
3418 osd_state[i] = os[i];
3419 }
3420 }
11fdf7f2
TL
3421 decode(osd_weight, p);
3422 decode(osd_addrs->client_addrs, p);
7c673cae
FG
3423 if (v <= 5) {
3424 pg_temp->clear();
11fdf7f2 3425 decode(n, p);
7c673cae
FG
3426 while (n--) {
3427 old_pg_t opg;
9f95a23c 3428 ceph::decode_raw(opg, p);
31f18b77 3429 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3430 decode(v, p);
31f18b77 3431 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3432 }
3433 } else {
11fdf7f2 3434 decode(*pg_temp, p);
7c673cae
FG
3435 }
3436
3437 // crush
9f95a23c 3438 ceph::buffer::list cbl;
11fdf7f2
TL
3439 decode(cbl, p);
3440 auto cblp = cbl.cbegin();
7c673cae
FG
3441 crush->decode(cblp);
3442
3443 // extended
3444 __u16 ev = 0;
3445 if (v >= 5)
11fdf7f2
TL
3446 decode(ev, p);
3447 decode(osd_addrs->hb_back_addrs, p);
3448 decode(osd_info, p);
7c673cae 3449 if (v < 5)
11fdf7f2 3450 decode(pool_name, p);
7c673cae 3451
f67539c2 3452 decode(blocklist, p);
7c673cae 3453 if (ev >= 6)
11fdf7f2 3454 decode(osd_addrs->cluster_addrs, p);
7c673cae 3455 else
11fdf7f2 3456 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3457
3458 if (ev >= 7) {
11fdf7f2
TL
3459 decode(cluster_snapshot_epoch, p);
3460 decode(cluster_snapshot, p);
7c673cae
FG
3461 }
3462
3463 if (ev >= 8) {
11fdf7f2 3464 decode(*osd_uuid, p);
7c673cae
FG
3465 } else {
3466 osd_uuid->resize(max_osd);
3467 }
3468 if (ev >= 9)
11fdf7f2 3469 decode(osd_xinfo, p);
7c673cae
FG
3470 else
3471 osd_xinfo.resize(max_osd);
3472
3473 if (ev >= 10)
11fdf7f2 3474 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3475 else
11fdf7f2 3476 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3477
3478 osd_primary_affinity.reset();
3479
3480 post_decode();
3481}
3482
9f95a23c 3483void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 3484{
11fdf7f2 3485 using ceph::decode;
7c673cae
FG
3486 /**
3487 * Older encodings of the OSDMap had a single struct_v which
3488 * covered the whole encoding, and was prior to our modern
3489 * stuff which includes a compatv and a size. So if we see
3490 * a struct_v < 7, we must rewind to the beginning and use our
3491 * classic decoder.
3492 */
3493 size_t start_offset = bl.get_off();
3494 size_t tail_offset = 0;
9f95a23c 3495 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
3496
3497 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3498 if (struct_v < 7) {
11fdf7f2 3499 bl.seek(start_offset);
7c673cae
FG
3500 decode_classic(bl);
3501 return;
3502 }
3503 /**
3504 * Since we made it past that hurdle, we can use our normal paths.
3505 */
3506 {
11fdf7f2 3507 DECODE_START(9, bl); // client-usable data
7c673cae 3508 // base
11fdf7f2
TL
3509 decode(fsid, bl);
3510 decode(epoch, bl);
3511 decode(created, bl);
3512 decode(modified, bl);
7c673cae 3513
11fdf7f2
TL
3514 decode(pools, bl);
3515 decode(pool_name, bl);
3516 decode(pool_max, bl);
7c673cae 3517
11fdf7f2 3518 decode(flags, bl);
7c673cae 3519
11fdf7f2 3520 decode(max_osd, bl);
31f18b77 3521 if (struct_v >= 5) {
11fdf7f2 3522 decode(osd_state, bl);
31f18b77
FG
3523 } else {
3524 vector<uint8_t> os;
11fdf7f2 3525 decode(os, bl);
31f18b77
FG
3526 osd_state.resize(os.size());
3527 for (unsigned i = 0; i < os.size(); ++i) {
3528 osd_state[i] = os[i];
3529 }
3530 }
11fdf7f2
TL
3531 decode(osd_weight, bl);
3532 decode(osd_addrs->client_addrs, bl);
7c673cae 3533
11fdf7f2
TL
3534 decode(*pg_temp, bl);
3535 decode(*primary_temp, bl);
3536 // dates back to firefly. version increased from 2 to 3 still in firefly.
3537 // do we really still need to keep this around? even for old clients?
7c673cae
FG
3538 if (struct_v >= 2) {
3539 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
11fdf7f2 3540 decode(*osd_primary_affinity, bl);
7c673cae
FG
3541 if (osd_primary_affinity->empty())
3542 osd_primary_affinity.reset();
3543 } else {
3544 osd_primary_affinity.reset();
3545 }
3546
3547 // crush
9f95a23c 3548 ceph::buffer::list cbl;
11fdf7f2
TL
3549 decode(cbl, bl);
3550 auto cblp = cbl.cbegin();
7c673cae 3551 crush->decode(cblp);
11fdf7f2
TL
3552 // added in firefly; version increased in luminous, so it affects
3553 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3554 // alone until we require clients to be all luminous?
7c673cae 3555 if (struct_v >= 3) {
11fdf7f2 3556 decode(erasure_code_profiles, bl);
7c673cae
FG
3557 } else {
3558 erasure_code_profiles.clear();
3559 }
11fdf7f2
TL
3560 // version increased from 3 to 4 still in luminous, so same as above
3561 // applies.
7c673cae 3562 if (struct_v >= 4) {
11fdf7f2
TL
3563 decode(pg_upmap, bl);
3564 decode(pg_upmap_items, bl);
7c673cae
FG
3565 } else {
3566 pg_upmap.clear();
3567 pg_upmap_items.clear();
3568 }
11fdf7f2
TL
3569 // again, version increased from 5 to 6 still in luminous, so above
3570 // applies.
31f18b77 3571 if (struct_v >= 6) {
11fdf7f2
TL
3572 decode(crush_version, bl);
3573 }
3574 // version increase from 6 to 7 in mimic
3575 if (struct_v >= 7) {
3576 decode(new_removed_snaps, bl);
3577 decode(new_purged_snaps, bl);
3578 }
3579 // version increase from 7 to 8, 8 to 9, in nautilus.
3580 if (struct_v >= 9) {
3581 decode(last_up_change, bl);
3582 decode(last_in_change, bl);
31f18b77 3583 }
1e59de90
TL
3584 if (struct_v >= 10) {
3585 decode(pg_upmap_primaries, bl);
3586 } else {
3587 pg_upmap_primaries.clear();
3588 }
7c673cae
FG
3589 DECODE_FINISH(bl); // client-usable data
3590 }
3591
3592 {
f67539c2 3593 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
3594 decode(osd_addrs->hb_back_addrs, bl);
3595 decode(osd_info, bl);
f67539c2 3596 decode(blocklist, bl);
11fdf7f2
TL
3597 decode(osd_addrs->cluster_addrs, bl);
3598 decode(cluster_snapshot_epoch, bl);
3599 decode(cluster_snapshot, bl);
3600 decode(*osd_uuid, bl);
3601 decode(osd_xinfo, bl);
3602 decode(osd_addrs->hb_front_addrs, bl);
3603 //
7c673cae 3604 if (struct_v >= 2) {
11fdf7f2
TL
3605 decode(nearfull_ratio, bl);
3606 decode(full_ratio, bl);
7c673cae
FG
3607 } else {
3608 nearfull_ratio = 0;
3609 full_ratio = 0;
3610 }
3611 if (struct_v >= 3) {
11fdf7f2 3612 decode(backfillfull_ratio, bl);
7c673cae
FG
3613 } else {
3614 backfillfull_ratio = 0;
3615 }
31f18b77
FG
3616 if (struct_v == 4) {
3617 string r;
11fdf7f2 3618 decode(r, bl);
31f18b77
FG
3619 if (r.length())
3620 require_min_compat_client = ceph_release_from_name(r.c_str());
3621 }
3622 if (struct_v >= 5) {
11fdf7f2
TL
3623 decode(require_min_compat_client, bl);
3624 decode(require_osd_release, bl);
9f95a23c 3625 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
3626 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3627 }
9f95a23c 3628 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 3629 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3630 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
3631 }
3632 } else {
3633 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3634 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 3635 require_osd_release = ceph_release_t::luminous;
31f18b77 3636 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3637 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77 3638 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c 3639 require_osd_release = ceph_release_t::kraken;
31f18b77 3640 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c 3641 require_osd_release = ceph_release_t::jewel;
31f18b77 3642 } else {
9f95a23c 3643 require_osd_release = ceph_release_t::unknown;
31f18b77
FG
3644 }
3645 }
11fdf7f2
TL
3646 if (struct_v >= 6) {
3647 decode(removed_snaps_queue, bl);
3648 }
81eedcae
TL
3649 if (struct_v >= 8) {
3650 decode(crush_node_flags, bl);
3651 } else {
3652 crush_node_flags.clear();
3653 }
3654 if (struct_v >= 9) {
3655 decode(device_class_flags, bl);
3656 } else {
3657 device_class_flags.clear();
3658 }
f67539c2
TL
3659 if (struct_v >= 10) {
3660 decode(stretch_mode_enabled, bl);
3661 decode(stretch_bucket_count, bl);
3662 decode(degraded_stretch_mode, bl);
3663 decode(recovering_stretch_mode, bl);
3664 decode(stretch_mode_bucket, bl);
3665 } else {
3666 stretch_mode_enabled = false;
3667 stretch_bucket_count = 0;
3668 degraded_stretch_mode = 0;
3669 recovering_stretch_mode = 0;
3670 stretch_mode_bucket = 0;
3671 }
33c7a0ef
TL
3672 if (struct_v >= 11) {
3673 decode(range_blocklist, bl);
3674 calculated_ranges.clear();
3675 for (const auto& i : range_blocklist) {
3676 calculated_ranges.emplace(i.first, i.first);
3677 }
3678 }
1e59de90
TL
3679 if (struct_v >= 12) {
3680 decode(allow_crimson, bl);
3681 }
7c673cae
FG
3682 DECODE_FINISH(bl); // osd-only data
3683 }
3684
3685 if (struct_v >= 8) {
3686 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 3687 decode(crc, bl);
7c673cae
FG
3688 tail_offset = bl.get_off();
3689 crc_defined = true;
3690 } else {
3691 crc_defined = false;
3692 crc = 0;
3693 }
3694
3695 DECODE_FINISH(bl); // wrapper
3696
3697 if (tail_offset) {
3698 // verify crc
3699 uint32_t actual = crc_front.crc32c(-1);
3700 if (tail_offset < bl.get_off()) {
9f95a23c 3701 ceph::buffer::list tail;
7c673cae
FG
3702 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3703 actual = tail.crc32c(actual);
3704 }
3705 if (crc != actual) {
3706 ostringstream ss;
3707 ss << "bad crc, actual " << actual << " != expected " << crc;
3708 string s = ss.str();
9f95a23c 3709 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
3710 }
3711 }
3712
3713 post_decode();
3714}
3715
3716void OSDMap::post_decode()
3717{
3718 // index pool names
3719 name_pool.clear();
3720 for (const auto &pname : pool_name) {
3721 name_pool[pname.second] = pname.first;
3722 }
3723
3724 calc_num_osds();
3725 _calc_up_osd_features();
3726}
3727
3728void OSDMap::dump_erasure_code_profiles(
3729 const mempool::osdmap::map<string,map<string,string>>& profiles,
3730 Formatter *f)
3731{
3732 f->open_object_section("erasure_code_profiles");
3733 for (const auto &profile : profiles) {
3734 f->open_object_section(profile.first.c_str());
3735 for (const auto &profm : profile.second) {
9f95a23c 3736 f->dump_string(profm.first.c_str(), profm.second);
7c673cae
FG
3737 }
3738 f->close_section();
3739 }
3740 f->close_section();
3741}
3742
9f95a23c
TL
3743void OSDMap::dump_osds(Formatter *f) const
3744{
3745 f->open_array_section("osds");
3746 for (int i=0; i<get_max_osd(); i++) {
3747 if (exists(i)) {
3748 dump_osd(i, f);
3749 }
3750 }
3751 f->close_section();
3752}
3753
3754void OSDMap::dump_osd(int id, Formatter *f) const
3755{
3756 ceph_assert(f != nullptr);
3757 if (!exists(id)) {
3758 return;
3759 }
3760
3761 f->open_object_section("osd_info");
3762 f->dump_int("osd", id);
3763 f->dump_stream("uuid") << get_uuid(id);
3764 f->dump_int("up", is_up(id));
3765 f->dump_int("in", is_in(id));
3766 f->dump_float("weight", get_weightf(id));
3767 f->dump_float("primary_affinity", get_primary_affinityf(id));
3768 get_info(id).dump(f);
3769 f->dump_object("public_addrs", get_addrs(id));
3770 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3771 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3772 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3773 // compat
3774 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3775 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3776 f->dump_stream("heartbeat_back_addr")
3777 << get_hb_back_addrs(id).get_legacy_str();
3778 f->dump_stream("heartbeat_front_addr")
3779 << get_hb_front_addrs(id).get_legacy_str();
3780
3781 set<string> st;
3782 get_state(id, st);
3783 f->open_array_section("state");
3784 for (const auto &state : st)
3785 f->dump_string("state", state);
3786 f->close_section();
3787
3788 f->close_section();
3789}
3790
1e59de90
TL
3791void OSDMap::dump_pool(CephContext *cct,
3792 int64_t pid,
3793 const pg_pool_t &pdata,
3794 ceph::Formatter *f) const
3795{
3796 std::string name("<unknown>");
3797 const auto &pni = pool_name.find(pid);
3798 if (pni != pool_name.end())
3799 name = pni->second;
3800 f->open_object_section("pool");
3801 f->dump_int("pool", pid);
3802 f->dump_string("pool_name", name);
3803 pdata.dump(f);
3804 dump_read_balance_score(cct, pid, pdata, f);
3805 f->close_section(); // pool
3806}
3807
3808void OSDMap::dump_read_balance_score(CephContext *cct,
3809 int64_t pid,
3810 const pg_pool_t &pdata,
3811 ceph::Formatter *f) const
3812{
3813 if (pdata.is_replicated()) {
3814 // Add rb section with values for score, optimal score, raw score
3815 // // and primary_affinity average
3816 OSDMap::read_balance_info_t rb_info;
3817 auto rc = calc_read_balance_score(cct, pid, &rb_info);
3818 if (rc >= 0) {
3819 f->open_object_section("read_balance");
3820 f->dump_float("score_acting", rb_info.acting_adj_score);
3821 f->dump_float("score_stable", rb_info.adjusted_score);
3822 f->dump_float("optimal_score", rb_info.optimal_score);
3823 f->dump_float("raw_score_acting", rb_info.acting_raw_score);
3824 f->dump_float("raw_score_stable", rb_info.raw_score);
3825 f->dump_float("primary_affinity_weighted", rb_info.pa_weighted);
3826 f->dump_float("average_primary_affinity", rb_info.pa_avg);
3827 f->dump_float("average_primary_affinity_weighted", rb_info.pa_weighted_avg);
3828 if (rb_info.err_msg.length() > 0) {
3829 f->dump_string("error_message", rb_info.err_msg);
3830 }
3831 f->close_section(); // read_balance
3832 }
3833 else {
3834 if (rb_info.err_msg.length() > 0) {
3835 f->open_object_section("read_balance");
3836 f->dump_string("error_message", rb_info.err_msg);
3837 f->dump_float("score_acting", rb_info.acting_adj_score);
3838 f->dump_float("score_stable", rb_info.adjusted_score);
3839 f->close_section(); // read_balance
3840 }
3841 }
3842 }
3843}
3844
3845void OSDMap::dump(Formatter *f, CephContext *cct) const
7c673cae
FG
3846{
3847 f->dump_int("epoch", get_epoch());
3848 f->dump_stream("fsid") << get_fsid();
3849 f->dump_stream("created") << get_created();
3850 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3851 f->dump_stream("last_up_change") << last_up_change;
3852 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3853 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3854 f->dump_unsigned("flags_num", flags);
3855 f->open_array_section("flags_set");
3856 set<string> flagset;
3857 get_flag_set(&flagset);
3858 for (auto p : flagset) {
3859 f->dump_string("flag", p);
3860 }
3861 f->close_section();
31f18b77 3862 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3863 f->dump_float("full_ratio", full_ratio);
3864 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3865 f->dump_float("nearfull_ratio", nearfull_ratio);
3866 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3867 f->dump_int("pool_max", get_pool_max());
3868 f->dump_int("max_osd", get_max_osd());
31f18b77 3869 f->dump_string("require_min_compat_client",
f67539c2 3870 to_string(require_min_compat_client));
31f18b77 3871 f->dump_string("min_compat_client",
f67539c2 3872 to_string(get_min_compat_client()));
31f18b77 3873 f->dump_string("require_osd_release",
f67539c2 3874 to_string(require_osd_release));
7c673cae 3875
1e59de90 3876 f->dump_bool("allow_crimson", allow_crimson);
7c673cae 3877 f->open_array_section("pools");
1e59de90
TL
3878 for (const auto &[pid, pdata] : pools) {
3879 dump_pool(cct, pid, pdata, f);
7c673cae
FG
3880 }
3881 f->close_section();
3882
9f95a23c 3883 dump_osds(f);
7c673cae
FG
3884
3885 f->open_array_section("osd_xinfo");
3886 for (int i=0; i<get_max_osd(); i++) {
3887 if (exists(i)) {
3888 f->open_object_section("xinfo");
3889 f->dump_int("osd", i);
3890 osd_xinfo[i].dump(f);
3891 f->close_section();
3892 }
3893 }
3894 f->close_section();
3895
3896 f->open_array_section("pg_upmap");
3897 for (auto& p : pg_upmap) {
3898 f->open_object_section("mapping");
3899 f->dump_stream("pgid") << p.first;
3900 f->open_array_section("osds");
3901 for (auto q : p.second) {
3902 f->dump_int("osd", q);
3903 }
3904 f->close_section();
3905 f->close_section();
3906 }
3907 f->close_section();
1e59de90 3908
7c673cae 3909 f->open_array_section("pg_upmap_items");
1e59de90 3910 for (auto& [pgid, mappings] : pg_upmap_items) {
7c673cae 3911 f->open_object_section("mapping");
1e59de90 3912 f->dump_stream("pgid") << pgid;
7c673cae 3913 f->open_array_section("mappings");
1e59de90 3914 for (auto& [from, to] : mappings) {
7c673cae 3915 f->open_object_section("mapping");
1e59de90
TL
3916 f->dump_int("from", from);
3917 f->dump_int("to", to);
7c673cae
FG
3918 f->close_section();
3919 }
3920 f->close_section();
3921 f->close_section();
3922 }
3923 f->close_section();
1e59de90
TL
3924
3925 f->open_array_section("pg_upmap_primaries");
3926 for (const auto& [pg, osd] : pg_upmap_primaries) {
3927 f->open_object_section("primary_mapping");
3928 f->dump_stream("pgid") << pg;
3929 f->dump_int("primary_osd", osd);
3930 f->close_section();
3931 }
3932 f->close_section(); // primary_temp
3933
7c673cae 3934 f->open_array_section("pg_temp");
31f18b77 3935 pg_temp->dump(f);
7c673cae
FG
3936 f->close_section();
3937
3938 f->open_array_section("primary_temp");
3939 for (const auto &pg : *primary_temp) {
3940 f->dump_stream("pgid") << pg.first;
3941 f->dump_int("osd", pg.second);
3942 }
3943 f->close_section(); // primary_temp
3944
f67539c2
TL
3945 f->open_object_section("blocklist");
3946 for (const auto &addr : blocklist) {
7c673cae
FG
3947 stringstream ss;
3948 ss << addr.first;
3949 f->dump_stream(ss.str().c_str()) << addr.second;
3950 }
3951 f->close_section();
33c7a0ef
TL
3952 f->open_object_section("range_blocklist");
3953 for (const auto &addr : range_blocklist) {
3954 stringstream ss;
3955 ss << addr.first;
3956 f->dump_stream(ss.str().c_str()) << addr.second;
3957 }
3958 f->close_section();
7c673cae
FG
3959
3960 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3961
3962 f->open_array_section("removed_snaps_queue");
3963 for (auto& p : removed_snaps_queue) {
3964 f->open_object_section("pool");
3965 f->dump_int("pool", p.first);
3966 f->open_array_section("snaps");
3967 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3968 f->open_object_section("interval");
3969 f->dump_unsigned("begin", q.get_start());
3970 f->dump_unsigned("length", q.get_len());
3971 f->close_section();
3972 }
3973 f->close_section();
3974 f->close_section();
3975 }
3976 f->close_section();
3977 f->open_array_section("new_removed_snaps");
3978 for (auto& p : new_removed_snaps) {
3979 f->open_object_section("pool");
3980 f->dump_int("pool", p.first);
3981 f->open_array_section("snaps");
3982 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3983 f->open_object_section("interval");
3984 f->dump_unsigned("begin", q.get_start());
3985 f->dump_unsigned("length", q.get_len());
3986 f->close_section();
3987 }
3988 f->close_section();
3989 f->close_section();
3990 }
3991 f->close_section();
3992 f->open_array_section("new_purged_snaps");
3993 for (auto& p : new_purged_snaps) {
3994 f->open_object_section("pool");
3995 f->dump_int("pool", p.first);
3996 f->open_array_section("snaps");
3997 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3998 f->open_object_section("interval");
3999 f->dump_unsigned("begin", q.get_start());
4000 f->dump_unsigned("length", q.get_len());
4001 f->close_section();
4002 }
4003 f->close_section();
4004 f->close_section();
4005 }
4006 f->close_section();
81eedcae
TL
4007 f->open_object_section("crush_node_flags");
4008 for (auto& i : crush_node_flags) {
4009 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
4010 : stringify(i.first);
4011 f->open_array_section(s.c_str());
4012 set<string> st;
4013 calc_state_set(i.second, st);
4014 for (auto& j : st) {
4015 f->dump_string("flag", j);
4016 }
4017 f->close_section();
4018 }
4019 f->close_section();
4020 f->open_object_section("device_class_flags");
4021 for (auto& i : device_class_flags) {
4022 const char* class_name = crush->get_class_name(i.first);
4023 string s = class_name ? class_name : stringify(i.first);
4024 f->open_array_section(s.c_str());
4025 set<string> st;
4026 calc_state_set(i.second, st);
4027 for (auto& j : st) {
4028 f->dump_string("flag", j);
4029 }
4030 f->close_section();
4031 }
4032 f->close_section();
f67539c2
TL
4033 f->open_object_section("stretch_mode");
4034 {
4035 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
4036 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
4037 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
4038 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
4039 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
4040 }
4041 f->close_section();
7c673cae
FG
4042}
4043
4044void OSDMap::generate_test_instances(list<OSDMap*>& o)
4045{
4046 o.push_back(new OSDMap);
4047
4048 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
4049 o.push_back(new OSDMap);
4050 uuid_d fsid;
224ce89b 4051 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae 4052 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
f67539c2 4053 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
7c673cae
FG
4054 cct->put();
4055}
4056
4057string OSDMap::get_flag_string(unsigned f)
4058{
4059 string s;
7c673cae
FG
4060 if (f & CEPH_OSDMAP_PAUSERD)
4061 s += ",pauserd";
4062 if (f & CEPH_OSDMAP_PAUSEWR)
4063 s += ",pausewr";
4064 if (f & CEPH_OSDMAP_PAUSEREC)
4065 s += ",pauserec";
4066 if (f & CEPH_OSDMAP_NOUP)
4067 s += ",noup";
4068 if (f & CEPH_OSDMAP_NODOWN)
4069 s += ",nodown";
4070 if (f & CEPH_OSDMAP_NOOUT)
4071 s += ",noout";
4072 if (f & CEPH_OSDMAP_NOIN)
4073 s += ",noin";
4074 if (f & CEPH_OSDMAP_NOBACKFILL)
4075 s += ",nobackfill";
4076 if (f & CEPH_OSDMAP_NOREBALANCE)
4077 s += ",norebalance";
4078 if (f & CEPH_OSDMAP_NORECOVER)
4079 s += ",norecover";
4080 if (f & CEPH_OSDMAP_NOSCRUB)
4081 s += ",noscrub";
4082 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
4083 s += ",nodeep-scrub";
4084 if (f & CEPH_OSDMAP_NOTIERAGENT)
4085 s += ",notieragent";
11fdf7f2
TL
4086 if (f & CEPH_OSDMAP_NOSNAPTRIM)
4087 s += ",nosnaptrim";
7c673cae
FG
4088 if (f & CEPH_OSDMAP_SORTBITWISE)
4089 s += ",sortbitwise";
4090 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
4091 s += ",require_jewel_osds";
4092 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
4093 s += ",require_kraken_osds";
4094 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
4095 s += ",require_luminous_osds";
c07f9fc5
FG
4096 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
4097 s += ",recovery_deletes";
181888fb
FG
4098 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
4099 s += ",purged_snapdirs";
f64942e4
AA
4100 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
4101 s += ",pglog_hardlimit";
aee94f69
TL
4102 if (f & CEPH_OSDMAP_NOAUTOSCALE)
4103 s += ",noautoscale";
7c673cae
FG
4104 if (s.length())
4105 s.erase(0, 1);
4106 return s;
4107}
4108
4109string OSDMap::get_flag_string() const
4110{
4111 return get_flag_string(flags);
4112}
4113
1e59de90 4114void OSDMap::print_pools(CephContext *cct, ostream& out) const
7c673cae 4115{
1e59de90 4116 for (const auto &[pid, pdata] : pools) {
7c673cae 4117 std::string name("<unknown>");
1e59de90 4118 const auto &pni = pool_name.find(pid);
7c673cae
FG
4119 if (pni != pool_name.end())
4120 name = pni->second;
1e59de90
TL
4121 char rb_score_str[32] = "";
4122 int rc = 0;
4123 read_balance_info_t rb_info;
4124 if (pdata.is_replicated()) {
4125 rc = calc_read_balance_score(cct, pid, &rb_info);
4126 if (rc >= 0)
4127 snprintf (rb_score_str, sizeof(rb_score_str),
4128 " read_balance_score %.2f", rb_info.acting_adj_score);
4129 }
4130
4131 out << "pool " << pid
7c673cae 4132 << " '" << name
1e59de90
TL
4133 << "' " << pdata
4134 << rb_score_str << "\n";
4135 if (rb_info.err_msg.length() > 0) {
4136 out << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << "\n";
4137 }
4138
4139 //TODO - print error messages here.
7c673cae 4140
1e59de90 4141 for (const auto &snap : pdata.snaps)
7c673cae
FG
4142 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
4143
1e59de90
TL
4144 if (!pdata.removed_snaps.empty())
4145 out << "\tremoved_snaps " << pdata.removed_snaps << "\n";
4146 auto p = removed_snaps_queue.find(pid);
11fdf7f2
TL
4147 if (p != removed_snaps_queue.end()) {
4148 out << "\tremoved_snaps_queue " << p->second << "\n";
4149 }
7c673cae
FG
4150 }
4151 out << std::endl;
4152}
4153
9f95a23c
TL
4154void OSDMap::print_osds(ostream& out) const
4155{
4156 for (int i=0; i<get_max_osd(); i++) {
4157 if (exists(i)) {
4158 print_osd(i, out);
4159 }
4160 }
4161}
4162void OSDMap::print_osd(int id, ostream& out) const
4163{
4164 if (!exists(id)) {
4165 return;
4166 }
4167
4168 out << "osd." << id;
4169 out << (is_up(id) ? " up ":" down");
4170 out << (is_in(id) ? " in ":" out");
4171 out << " weight " << get_weightf(id);
4172 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
4173 out << " primary_affinity " << get_primary_affinityf(id);
4174 }
4175 const osd_info_t& info(get_info(id));
4176 out << " " << info;
4177 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
4178 set<string> st;
4179 get_state(id, st);
4180 out << " " << st;
4181 if (!get_uuid(id).is_zero()) {
4182 out << " " << get_uuid(id);
4183 }
4184 out << "\n";
4185}
4186
1e59de90 4187void OSDMap::print(CephContext *cct, ostream& out) const
7c673cae
FG
4188{
4189 out << "epoch " << get_epoch() << "\n"
4190 << "fsid " << get_fsid() << "\n"
4191 << "created " << get_created() << "\n"
4192 << "modified " << get_modified() << "\n";
4193
4194 out << "flags " << get_flag_string() << "\n";
31f18b77 4195 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
4196 out << "full_ratio " << full_ratio << "\n";
4197 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
4198 out << "nearfull_ratio " << nearfull_ratio << "\n";
9f95a23c 4199 if (require_min_compat_client != ceph_release_t::unknown) {
31f18b77 4200 out << "require_min_compat_client "
9f95a23c 4201 << require_min_compat_client << "\n";
7c673cae 4202 }
9f95a23c 4203 out << "min_compat_client " << get_min_compat_client()
31f18b77 4204 << "\n";
9f95a23c
TL
4205 if (require_osd_release > ceph_release_t::unknown) {
4206 out << "require_osd_release " << require_osd_release
224ce89b
WB
4207 << "\n";
4208 }
f67539c2
TL
4209 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
4210 if (stretch_mode_enabled) {
4211 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
4212 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
4213 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
4214 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
4215 }
7c673cae
FG
4216 if (get_cluster_snapshot().length())
4217 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
1e59de90
TL
4218 if (allow_crimson) {
4219 out << "allow_crimson=true\n";
4220 }
7c673cae
FG
4221 out << "\n";
4222
1e59de90 4223 print_pools(cct, out);
7c673cae
FG
4224
4225 out << "max_osd " << get_max_osd() << "\n";
9f95a23c 4226 print_osds(out);
7c673cae
FG
4227 out << std::endl;
4228
4229 for (auto& p : pg_upmap) {
4230 out << "pg_upmap " << p.first << " " << p.second << "\n";
4231 }
4232 for (auto& p : pg_upmap_items) {
4233 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
4234 }
4235
1e59de90
TL
4236 for (auto& [pg, osd] : pg_upmap_primaries) {
4237 out << "pg_upmap_primary " << pg << " " << osd << "\n";
4238 }
4239
f67539c2 4240 for (const auto& pg : *pg_temp)
7c673cae
FG
4241 out << "pg_temp " << pg.first << " " << pg.second << "\n";
4242
f67539c2 4243 for (const auto& pg : *primary_temp)
7c673cae
FG
4244 out << "primary_temp " << pg.first << " " << pg.second << "\n";
4245
f67539c2
TL
4246 for (const auto &addr : blocklist)
4247 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
33c7a0ef
TL
4248 for (const auto &addr : range_blocklist)
4249 out << "range blocklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
4250}
4251
4252class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
4253public:
4254 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
4255
4256 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4257 unsigned f)
c07f9fc5 4258 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
4259
4260 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
4261 if (!filter) {
4262 return true; // normal case
4263 }
4264 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4265 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4266 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4267 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4268 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4269 return true;
31f18b77 4270 }
c07f9fc5 4271 return false;
31f18b77
FG
4272 }
4273
4274 bool should_dump_empty_bucket() const override {
4275 return !filter;
4276 }
7c673cae 4277
11fdf7f2 4278 void init_table(TextTable *tbl) {
7c673cae 4279 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 4280 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
4281 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4282 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 4283 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 4284 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 4285 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
4286 }
4287 void dump(TextTable *tbl, string& bucket) {
4288 init_table(tbl);
7c673cae 4289
11fdf7f2
TL
4290 if (!bucket.empty()) {
4291 set_root(bucket);
4292 Parent::dump(tbl);
4293 } else {
4294 Parent::dump(tbl);
4295 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4296 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
4297 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
4298 }
31f18b77 4299 }
7c673cae
FG
4300 }
4301 }
4302
4303protected:
4304 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
4305 const char *c = crush->get_item_class(qi.id);
4306 if (!c)
4307 c = "";
7c673cae 4308 *tbl << qi.id
224ce89b 4309 << c
7c673cae
FG
4310 << weightf_t(qi.weight);
4311
4312 ostringstream name;
4313 for (int k = 0; k < qi.depth; k++)
4314 name << " ";
4315 if (qi.is_bucket()) {
4316 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
4317 << crush->get_item_name(qi.id);
4318 } else {
4319 name << "osd." << qi.id;
4320 }
4321 *tbl << name.str();
4322
4323 if (!qi.is_bucket()) {
4324 if (!osdmap->exists(qi.id)) {
4325 *tbl << "DNE"
4326 << 0;
4327 } else {
c07f9fc5
FG
4328 string s;
4329 if (osdmap->is_up(qi.id)) {
4330 s = "up";
4331 } else if (osdmap->is_destroyed(qi.id)) {
4332 s = "destroyed";
4333 } else {
4334 s = "down";
4335 }
4336 *tbl << s
7c673cae
FG
4337 << weightf_t(osdmap->get_weightf(qi.id))
4338 << weightf_t(osdmap->get_primary_affinityf(qi.id));
4339 }
4340 }
4341 *tbl << TextTable::endrow;
4342 }
4343
4344private:
4345 const OSDMap *osdmap;
31f18b77 4346 const unsigned filter;
7c673cae
FG
4347};
4348
4349class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4350public:
4351 typedef CrushTreeDumper::FormattingDumper Parent;
4352
31f18b77
FG
4353 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4354 unsigned f)
c07f9fc5 4355 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
4356
4357 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
4358 if (!filter) {
4359 return true; // normal case
4360 }
4361 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4362 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4363 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4364 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4365 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4366 return true;
31f18b77 4367 }
c07f9fc5 4368 return false;
31f18b77
FG
4369 }
4370
4371 bool should_dump_empty_bucket() const override {
4372 return !filter;
4373 }
7c673cae 4374
11fdf7f2
TL
4375 void dump(Formatter *f, string& bucket) {
4376 if (!bucket.empty()) {
4377 set_root(bucket);
4378 f->open_array_section("nodes");
4379 Parent::dump(f);
4380 f->close_section();
4381 } else {
4382 f->open_array_section("nodes");
4383 Parent::dump(f);
4384 f->close_section();
4385 f->open_array_section("stray");
4386 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4387 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4388 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4389 }
4390 f->close_section();
7c673cae 4391 }
7c673cae
FG
4392 }
4393
4394protected:
4395 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4396 Parent::dump_item_fields(qi, f);
4397 if (!qi.is_bucket())
4398 {
c07f9fc5
FG
4399 string s;
4400 if (osdmap->is_up(qi.id)) {
4401 s = "up";
4402 } else if (osdmap->is_destroyed(qi.id)) {
4403 s = "destroyed";
4404 } else {
4405 s = "down";
4406 }
7c673cae 4407 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 4408 f->dump_string("status", s);
7c673cae
FG
4409 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4410 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4411 }
4412 }
4413
4414private:
4415 const OSDMap *osdmap;
31f18b77 4416 const unsigned filter;
7c673cae
FG
4417};
4418
11fdf7f2 4419void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 4420{
31f18b77 4421 if (f) {
11fdf7f2 4422 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 4423 } else {
11fdf7f2 4424 ceph_assert(out);
7c673cae 4425 TextTable tbl;
11fdf7f2 4426 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
4427 *out << tbl;
4428 }
4429}
4430
224ce89b 4431void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 4432 const string& prefix, bool extra) const
7c673cae
FG
4433{
4434 if (f) {
7c673cae
FG
4435 f->dump_int("epoch", get_epoch());
4436 f->dump_int("num_osds", get_num_osds());
4437 f->dump_int("num_up_osds", get_num_up_osds());
9f95a23c 4438 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
7c673cae 4439 f->dump_int("num_in_osds", get_num_in_osds());
9f95a23c 4440 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
7c673cae 4441 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
7c673cae 4442 } else {
11fdf7f2 4443 utime_t now = ceph_clock_now();
31f18b77 4444 out << get_num_osds() << " osds: "
11fdf7f2
TL
4445 << get_num_up_osds() << " up";
4446 if (last_up_change != utime_t()) {
4447 out << " (since " << utimespan_str(now - last_up_change) << ")";
4448 }
4449 out << ", " << get_num_in_osds() << " in";
4450 if (last_in_change != utime_t()) {
4451 out << " (since " << utimespan_str(now - last_in_change) << ")";
4452 }
4453 if (extra)
4454 out << "; epoch: e" << get_epoch();
7c673cae
FG
4455 if (get_num_pg_temp())
4456 out << "; " << get_num_pg_temp() << " remapped pgs";
4457 out << "\n";
4458 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4459 if (important_flags)
224ce89b 4460 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
4461 }
4462}
4463
4464void OSDMap::print_oneline_summary(ostream& out) const
4465{
4466 out << "e" << get_epoch() << ": "
31f18b77 4467 << get_num_osds() << " total, "
7c673cae
FG
4468 << get_num_up_osds() << " up, "
4469 << get_num_in_osds() << " in";
7c673cae
FG
4470}
4471
3efd9988 4472bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
4473{
4474 for (const auto &pool : pools) {
3efd9988 4475 if (pool.second.crush_rule == rule_id)
7c673cae
FG
4476 return true;
4477 }
4478 return false;
4479}
4480
3efd9988
FG
4481int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4482 ostream *ss) const
4483{
4484 for (auto& i : pools) {
4485 auto& pool = i.second;
4486 int ruleno = pool.get_crush_rule();
4487 if (!newcrush->rule_exists(ruleno)) {
4488 *ss << "pool " << i.first << " references crush_rule " << ruleno
4489 << " but it is not present";
4490 return -EINVAL;
4491 }
20effc67 4492 if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) {
3efd9988
FG
4493 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4494 return -EINVAL;
4495 }
3efd9988
FG
4496 }
4497 return 0;
4498}
4499
224ce89b
WB
4500int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4501 int nosd, int pg_bits, int pgp_bits,
4502 bool default_pool)
7c673cae 4503{
224ce89b
WB
4504 ldout(cct, 10) << "build_simple on " << nosd
4505 << " osds" << dendl;
7c673cae
FG
4506 epoch = e;
4507 set_fsid(fsid);
4508 created = modified = ceph_clock_now();
4509
4510 if (nosd >= 0) {
4511 set_max_osd(nosd);
4512 } else {
4513 // count osds
4514 int maxosd = 0;
11fdf7f2 4515 const auto& conf = cct->_conf;
7c673cae 4516 vector<string> sections;
11fdf7f2 4517 conf.get_all_sections(sections);
7c673cae
FG
4518
4519 for (auto &section : sections) {
4520 if (section.find("osd.") != 0)
4521 continue;
4522
4523 const char *begin = section.c_str() + 4;
4524 char *end = (char*)begin;
4525 int o = strtol(begin, &end, 10);
4526 if (*end != '\0')
4527 continue;
4528
4529 if (o > cct->_conf->mon_max_osd) {
4530 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4531 return -ERANGE;
4532 }
4533
4534 if (o > maxosd)
4535 maxosd = o;
4536 }
4537
4538 set_max_osd(maxosd + 1);
4539 }
4540
7c673cae
FG
4541
4542 stringstream ss;
4543 int r;
4544 if (nosd >= 0)
4545 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4546 else
4547 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4548 ceph_assert(r == 0);
7c673cae
FG
4549
4550 int poolbase = get_max_osd() ? get_max_osd() : 1;
4551
20effc67 4552 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_rule(cct);
11fdf7f2 4553 ceph_assert(default_replicated_rule >= 0);
7c673cae 4554
224ce89b
WB
4555 if (default_pool) {
4556 // pgp_num <= pg_num
4557 if (pgp_bits > pg_bits)
4558 pgp_bits = pg_bits;
4559
4560 vector<string> pool_names;
4561 pool_names.push_back("rbd");
4562 for (auto &plname : pool_names) {
4563 int64_t pool = ++pool_max;
4564 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4565 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4566 if (cct->_conf->osd_pool_default_flag_hashpspool)
4567 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4568 if (cct->_conf->osd_pool_default_flag_nodelete)
4569 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4570 if (cct->_conf->osd_pool_default_flag_nopgchange)
4571 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4572 if (cct->_conf->osd_pool_default_flag_nosizechange)
4573 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
20effc67
TL
4574 if (cct->_conf->osd_pool_default_flag_bulk)
4575 pools[pool].set_flag(pg_pool_t::FLAG_BULK);
11fdf7f2
TL
4576 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4577 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4578 pools[pool].size);
224ce89b
WB
4579 pools[pool].crush_rule = default_replicated_rule;
4580 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4581 pools[pool].set_pg_num(poolbase << pg_bits);
4582 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4583 pools[pool].set_pg_num_target(poolbase << pg_bits);
4584 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4585 pools[pool].last_change = epoch;
c07f9fc5
FG
4586 pools[pool].application_metadata.insert(
4587 {pg_pool_t::APPLICATION_NAME_RBD, {}});
9f95a23c
TL
4588 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4589 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4590 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4591 pools[pool].pg_autoscale_mode = m;
4592 } else {
4593 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4594 }
224ce89b
WB
4595 pool_name[pool] = plname;
4596 name_pool[plname] = pool;
4597 }
7c673cae
FG
4598 }
4599
7c673cae
FG
4600 map<string,string> profile_map;
4601 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4602 if (r < 0) {
4603 lderr(cct) << ss.str() << dendl;
4604 return r;
4605 }
4606 set_erasure_code_profile("default", profile_map);
4607 return 0;
4608}
4609
4610int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4611 map<string,string> &profile_map,
4612 ostream *ss)
4613{
11fdf7f2 4614 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4615 *ss,
4616 &profile_map);
4617 return r;
4618}
4619
4620int OSDMap::_build_crush_types(CrushWrapper& crush)
4621{
4622 crush.set_type_name(0, "osd");
4623 crush.set_type_name(1, "host");
4624 crush.set_type_name(2, "chassis");
4625 crush.set_type_name(3, "rack");
4626 crush.set_type_name(4, "row");
4627 crush.set_type_name(5, "pdu");
4628 crush.set_type_name(6, "pod");
4629 crush.set_type_name(7, "room");
4630 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4631 crush.set_type_name(9, "zone");
4632 crush.set_type_name(10, "region");
4633 crush.set_type_name(11, "root");
4634 return 11;
7c673cae
FG
4635}
4636
4637int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4638 int nosd, ostream *ss)
4639{
4640 crush.create();
4641
4642 // root
4643 int root_type = _build_crush_types(crush);
4644 int rootid;
4645 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4646 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4647 ceph_assert(r == 0);
7c673cae
FG
4648 crush.set_item_name(rootid, "default");
4649
f67539c2
TL
4650 map<string,string> loc{
4651 {"host", "localhost"},
4652 {"rack", "localrack"},
4653 {"root", "default"}
4654 };
7c673cae 4655 for (int o=0; o<nosd; o++) {
7c673cae
FG
4656 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4657 char name[32];
4658 snprintf(name, sizeof(name), "osd.%d", o);
4659 crush.insert_item(cct, o, 1.0, name, loc);
4660 }
4661
31f18b77 4662 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4663
4664 crush.finalize();
4665
4666 return 0;
4667}
4668
4669int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4670 CrushWrapper& crush,
4671 ostream *ss)
4672{
11fdf7f2 4673 const auto& conf = cct->_conf;
7c673cae
FG
4674
4675 crush.create();
4676
4677 // root
4678 int root_type = _build_crush_types(crush);
4679 int rootid;
4680 int r = crush.add_bucket(0, 0,
4681 CRUSH_HASH_DEFAULT,
4682 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4683 ceph_assert(r == 0);
7c673cae
FG
4684 crush.set_item_name(rootid, "default");
4685
4686 // add osds
4687 vector<string> sections;
11fdf7f2 4688 conf.get_all_sections(sections);
7c673cae
FG
4689
4690 for (auto &section : sections) {
4691 if (section.find("osd.") != 0)
4692 continue;
4693
4694 const char *begin = section.c_str() + 4;
4695 char *end = (char*)begin;
4696 int o = strtol(begin, &end, 10);
4697 if (*end != '\0')
4698 continue;
4699
4700 string host, rack, row, room, dc, pool;
4701 vector<string> sectiontmp;
4702 sectiontmp.push_back("osd");
4703 sectiontmp.push_back(section);
11fdf7f2
TL
4704 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4705 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4706 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4707 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4708 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4709 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4710
4711 if (host.length() == 0)
4712 host = "unknownhost";
4713 if (rack.length() == 0)
4714 rack = "unknownrack";
4715
4716 map<string,string> loc;
4717 loc["host"] = host;
4718 loc["rack"] = rack;
4719 if (row.size())
4720 loc["row"] = row;
4721 if (room.size())
4722 loc["room"] = room;
4723 if (dc.size())
4724 loc["datacenter"] = dc;
4725 loc["root"] = "default";
4726
4727 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4728 crush.insert_item(cct, o, 1.0, section, loc);
4729 }
4730
31f18b77 4731 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4732
4733 crush.finalize();
4734
4735 return 0;
4736}
4737
4738
31f18b77
FG
4739int OSDMap::build_simple_crush_rules(
4740 CephContext *cct,
4741 CrushWrapper& crush,
4742 const string& root,
4743 ostream *ss)
7c673cae 4744{
20effc67 4745 int crush_rule = crush.get_osd_pool_default_crush_replicated_rule(cct);
7c673cae
FG
4746 string failure_domain =
4747 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4748
7c673cae 4749 int r;
31f18b77 4750 r = crush.add_simple_rule_at(
224ce89b 4751 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4752 "firstn", pg_pool_t::TYPE_REPLICATED,
4753 crush_rule, ss);
7c673cae
FG
4754 if (r < 0)
4755 return r;
4756 // do not add an erasure rule by default or else we will implicitly
4757 // require the crush_v2 feature of clients
4758 return 0;
4759}
4760
4761int OSDMap::summarize_mapping_stats(
4762 OSDMap *newmap,
4763 const set<int64_t> *pools,
4764 std::string *out,
4765 Formatter *f) const
4766{
4767 set<int64_t> ls;
4768 if (pools) {
4769 ls = *pools;
4770 } else {
4771 for (auto &p : get_pools())
4772 ls.insert(p.first);
4773 }
4774
4775 unsigned total_pg = 0;
4776 unsigned moved_pg = 0;
4777 vector<unsigned> base_by_osd(get_max_osd(), 0);
4778 vector<unsigned> new_by_osd(get_max_osd(), 0);
4779 for (int64_t pool_id : ls) {
4780 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4781 vector<int> up, up2;
4782 int up_primary;
7c673cae 4783 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4784 pg_t pgid(ps, pool_id);
7c673cae 4785 total_pg += pi->get_size();
31f18b77 4786 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4787 for (int osd : up) {
4788 if (osd >= 0 && osd < get_max_osd())
4789 ++base_by_osd[osd];
4790 }
4791 if (newmap) {
31f18b77 4792 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4793 for (int osd : up2) {
4794 if (osd >= 0 && osd < get_max_osd())
4795 ++new_by_osd[osd];
4796 }
1e59de90 4797 if (pi->is_erasure()) {
7c673cae
FG
4798 for (unsigned i=0; i<up.size(); ++i) {
4799 if (up[i] != up2[i]) {
4800 ++moved_pg;
4801 }
4802 }
1e59de90 4803 } else if (pi->is_replicated()) {
7c673cae
FG
4804 for (int osd : up) {
4805 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4806 ++moved_pg;
4807 }
4808 }
4809 } else {
11fdf7f2 4810 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4811 }
4812 }
4813 }
4814 }
4815
4816 unsigned num_up_in = 0;
4817 for (int osd = 0; osd < get_max_osd(); ++osd) {
4818 if (is_up(osd) && is_in(osd))
4819 ++num_up_in;
4820 }
4821 if (!num_up_in) {
4822 return -EINVAL;
4823 }
4824
4825 float avg_pg = (float)total_pg / (float)num_up_in;
4826 float base_stddev = 0, new_stddev = 0;
4827 int min = -1, max = -1;
4828 unsigned min_base_pg = 0, max_base_pg = 0;
4829 unsigned min_new_pg = 0, max_new_pg = 0;
4830 for (int osd = 0; osd < get_max_osd(); ++osd) {
4831 if (is_up(osd) && is_in(osd)) {
4832 float base_diff = (float)base_by_osd[osd] - avg_pg;
4833 base_stddev += base_diff * base_diff;
4834 float new_diff = (float)new_by_osd[osd] - avg_pg;
4835 new_stddev += new_diff * new_diff;
4836 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4837 min = osd;
4838 min_base_pg = base_by_osd[osd];
4839 min_new_pg = new_by_osd[osd];
4840 }
4841 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4842 max = osd;
4843 max_base_pg = base_by_osd[osd];
4844 max_new_pg = new_by_osd[osd];
4845 }
4846 }
4847 }
4848 base_stddev = sqrt(base_stddev / num_up_in);
4849 new_stddev = sqrt(new_stddev / num_up_in);
4850
4851 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4852
4853 ostringstream ss;
4854 if (f)
4855 f->open_object_section("utilization");
4856 if (newmap) {
4857 if (f) {
4858 f->dump_unsigned("moved_pgs", moved_pg);
4859 f->dump_unsigned("total_pgs", total_pg);
4860 } else {
4861 float percent = 0;
4862 if (total_pg)
4863 percent = (float)moved_pg * 100.0 / (float)total_pg;
4864 ss << "moved " << moved_pg << " / " << total_pg
4865 << " (" << percent << "%)\n";
4866 }
4867 }
4868 if (f) {
4869 f->dump_float("avg_pgs", avg_pg);
4870 f->dump_float("std_dev", base_stddev);
4871 f->dump_float("expected_baseline_std_dev", edev);
4872 if (newmap)
4873 f->dump_float("new_std_dev", new_stddev);
4874 } else {
4875 ss << "avg " << avg_pg << "\n";
4876 ss << "stddev " << base_stddev;
4877 if (newmap)
4878 ss << " -> " << new_stddev;
4879 ss << " (expected baseline " << edev << ")\n";
4880 }
4881 if (min >= 0) {
4882 if (f) {
4883 f->dump_unsigned("min_osd", min);
4884 f->dump_unsigned("min_osd_pgs", min_base_pg);
4885 if (newmap)
4886 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4887 } else {
4888 ss << "min osd." << min << " with " << min_base_pg;
4889 if (newmap)
4890 ss << " -> " << min_new_pg;
4891 ss << " pgs (" << (float)min_base_pg / avg_pg;
4892 if (newmap)
4893 ss << " -> " << (float)min_new_pg / avg_pg;
4894 ss << " * mean)\n";
4895 }
4896 }
4897 if (max >= 0) {
4898 if (f) {
4899 f->dump_unsigned("max_osd", max);
4900 f->dump_unsigned("max_osd_pgs", max_base_pg);
4901 if (newmap)
4902 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4903 } else {
4904 ss << "max osd." << max << " with " << max_base_pg;
4905 if (newmap)
4906 ss << " -> " << max_new_pg;
4907 ss << " pgs (" << (float)max_base_pg / avg_pg;
4908 if (newmap)
4909 ss << " -> " << (float)max_new_pg / avg_pg;
4910 ss << " * mean)\n";
4911 }
4912 }
4913 if (f)
4914 f->close_section();
4915 if (out)
4916 *out = ss.str();
4917 return 0;
4918}
4919
7c673cae
FG
4920bool OSDMap::try_pg_upmap(
4921 CephContext *cct,
4922 pg_t pg, ///< pg to potentially remap
4923 const set<int>& overfull, ///< osds we'd want to evacuate
4924 const vector<int>& underfull, ///< osds to move to, in order of preference
92f5a8d4 4925 const vector<int>& more_underfull, ///< more osds only slightly underfull
7c673cae
FG
4926 vector<int> *orig,
4927 vector<int> *out) ///< resulting alternative mapping
4928{
4929 const pg_pool_t *pool = get_pg_pool(pg.pool());
4930 if (!pool)
4931 return false;
20effc67 4932 int rule = pool->get_crush_rule();
7c673cae
FG
4933 if (rule < 0)
4934 return false;
4935
7c673cae
FG
4936 // make sure there is something there to remap
4937 bool any = false;
4938 for (auto osd : *orig) {
4939 if (overfull.count(osd)) {
4940 any = true;
4941 break;
4942 }
4943 }
4944 if (!any) {
4945 return false;
4946 }
4947
4948 int r = crush->try_remap_rule(
4949 cct,
4950 rule,
4951 pool->get_size(),
4952 overfull, underfull,
92f5a8d4 4953 more_underfull,
7c673cae
FG
4954 *orig,
4955 out);
4956 if (r < 0)
4957 return false;
4958 if (*out == *orig)
4959 return false;
4960 return true;
4961}
4962
1e59de90
TL
4963
4964int OSDMap::balance_primaries(
4965 CephContext *cct,
4966 int64_t pid,
4967 OSDMap::Incremental *pending_inc,
4968 OSDMap& tmp_osd_map) const
4969{
4970 // This function only handles replicated pools.
4971 const pg_pool_t* pool = get_pg_pool(pid);
4972 if (! pool->is_replicated()) {
4973 ldout(cct, 10) << __func__ << " skipping erasure pool "
4974 << get_pool_name(pid) << dendl;
4975 return -EINVAL;
4976 }
4977
4978 // Info to be used in verify_upmap
4979 int pool_size = pool->get_size();
4980 int crush_rule = pool->get_crush_rule();
4981
4982 // Get pgs by osd (map of osd -> pgs)
4983 // Get primaries by osd (map of osd -> primary)
4984 map<uint64_t,set<pg_t>> pgs_by_osd;
4985 map<uint64_t,set<pg_t>> prim_pgs_by_osd;
4986 map<uint64_t,set<pg_t>> acting_prims_by_osd;
4987 pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pid, &prim_pgs_by_osd, &acting_prims_by_osd);
4988
aee94f69
TL
4989 // Construct information about the pgs and osds we will consider in new primary mappings,
4990 // as well as a map of all pgs and their original primary osds.
1e59de90
TL
4991 map<pg_t,bool> prim_pgs_to_check;
4992 vector<uint64_t> osds_to_check;
aee94f69 4993 map<pg_t, uint64_t> orig_prims;
1e59de90
TL
4994 for (const auto & [osd, pgs] : prim_pgs_by_osd) {
4995 osds_to_check.push_back(osd);
4996 for (const auto & pg : pgs) {
4997 prim_pgs_to_check.insert({pg, false});
aee94f69 4998 orig_prims.insert({pg, osd});
1e59de90
TL
4999 }
5000 }
5001
5002 // calculate desired primary distribution for each osd
5003 map<uint64_t,float> desired_prim_dist;
5004 int rc = 0;
5005 rc = calc_desired_primary_distribution(cct, pid, osds_to_check, desired_prim_dist);
5006 if (rc < 0) {
5007 ldout(cct, 10) << __func__ << " Error in calculating desired primary distribution" << dendl;
5008 return -EINVAL;
5009 }
5010 map<uint64_t,float> prim_dist_scores;
5011 float actual;
5012 float desired;
5013 for (auto osd : osds_to_check) {
5014 actual = prim_pgs_by_osd[osd].size();
5015 desired = desired_prim_dist[osd];
5016 prim_dist_scores[osd] = actual - desired;
5017 ldout(cct, 10) << __func__ << " desired distribution for osd." << osd << " " << desired << dendl;
5018 }
5019
5020 // get read balance score before balancing
5021 float read_balance_score_before = 0.0;
5022 read_balance_info_t rb_info;
5023 rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info);
5024 if (rc >= 0) {
5025 read_balance_score_before = rb_info.adjusted_score;
5026 }
5027 if (rb_info.err_msg.length() > 0) {
5028 ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl;
5029 return -EINVAL;
5030 }
5031
5032 // get ready to swap pgs
5033 while (true) {
5034 int curr_num_changes = 0;
5035 vector<int> up_osds;
5036 vector<int> acting_osds;
5037 int up_primary, acting_primary;
5038 for (const auto & [pg, mapped] : prim_pgs_to_check) {
5039 // fill in the up, up primary, acting, and acting primary for the current PG
5040 tmp_osd_map.pg_to_up_acting_osds(pg, &up_osds, &up_primary,
5041 &acting_osds, &acting_primary);
5042
5043 // find the OSD that would make the best swap based on its score
5044 // We start by first testing the OSD that is currently primary for the PG we are checking.
5045 uint64_t curr_best_osd = up_primary;
5046 float prim_score = prim_dist_scores[up_primary];
5047 for (auto potential_osd : up_osds) {
5048 float potential_score = prim_dist_scores[potential_osd];
5049 if ((prim_score > 0) && // taking 1 pg from the prim would not make its score worse
5050 (potential_score < 0) && // adding 1 pg to the potential would not make its score worse
5051 ((prim_score - potential_score) > 1) && // swapping a pg would not just keep the scores the same
5052 (desired_prim_dist[potential_osd] > 0)) // the potential is not off limits (the primary affinity is above 0)
5053 {
5054 curr_best_osd = potential_osd;
5055 }
5056 }
5057
5058 // Make the swap only if:
5059 // 1. The swap is legal
5060 // 2. The balancer has chosen a new primary
5061 auto legal_swap = crush->verify_upmap(cct,
5062 crush_rule,
5063 pool_size,
5064 {(int)curr_best_osd});
5065 if (legal_swap >= 0 &&
5066 ((int)curr_best_osd != up_primary)) {
5067 // Update prim_dist_scores
5068 prim_dist_scores[curr_best_osd] += 1;
5069 prim_dist_scores[up_primary] -= 1;
5070
5071 // Update the mappings
1e59de90 5072 tmp_osd_map.pg_upmap_primaries[pg] = curr_best_osd;
aee94f69
TL
5073 if (curr_best_osd == orig_prims[pg]) {
5074 pending_inc->new_pg_upmap_primary.erase(pg);
5075 prim_pgs_to_check[pg] = false;
5076 } else {
5077 pending_inc->new_pg_upmap_primary[pg] = curr_best_osd;
5078 prim_pgs_to_check[pg] = true; // mark that this pg changed mappings
5079 }
1e59de90
TL
5080
5081 curr_num_changes++;
5082 }
5083 ldout(cct, 20) << __func__ << " curr_num_changes: " << curr_num_changes << dendl;
5084 }
5085 // If there are no changes after one pass through the pgs, then no further optimizations can be made.
5086 if (curr_num_changes == 0) {
5087 ldout(cct, 20) << __func__ << " curr_num_changes is 0; no further optimizations can be made." << dendl;
5088 break;
5089 }
5090 }
5091
5092 // get read balance score after balancing
5093 float read_balance_score_after = 0.0;
5094 rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info);
5095 if (rc >= 0) {
5096 read_balance_score_after = rb_info.adjusted_score;
5097 }
5098 if (rb_info.err_msg.length() > 0) {
5099 ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl;
5100 return -EINVAL;
5101 }
5102
5103 // Tally total number of changes
5104 int num_changes = 0;
5105 if (read_balance_score_after < read_balance_score_before) {
5106 for (auto [pg, mapped] : prim_pgs_to_check) {
5107 if (mapped) {
5108 num_changes++;
5109 }
5110 }
5111 }
5112
5113 ldout(cct, 10) << __func__ << " num_changes " << num_changes << dendl;
5114 return num_changes;
5115}
5116
5117int OSDMap::calc_desired_primary_distribution(
5118 CephContext *cct,
5119 int64_t pid,
5120 const vector<uint64_t> &osds,
5121 std::map<uint64_t, float>& desired_primary_distribution) const
5122{
5123 // will return a perfect distribution of floats
5124 // without calculating the floor of each value
5125 //
5126 // This function only handles replicated pools.
5127 const pg_pool_t* pool = get_pg_pool(pid);
5128 if (pool->is_replicated()) {
5129 ldout(cct, 20) << __func__ << " calculating distribution for replicated pool "
5130 << get_pool_name(pid) << dendl;
5131 uint64_t replica_count = pool->get_size();
5132
5133 map<uint64_t,set<pg_t>> pgs_by_osd;
5134 pgs_by_osd = get_pgs_by_osd(cct, pid);
5135
5136 // First calculate the distribution using primary affinity and tally up the sum
5137 auto distribution_sum = 0.0;
5138 for (const auto & osd : osds) {
5139 float osd_primary_count = ((float)pgs_by_osd[osd].size() / (float)replica_count) * get_primary_affinityf(osd);
5140 desired_primary_distribution.insert({osd, osd_primary_count});
5141 distribution_sum += osd_primary_count;
5142 }
5143 if (distribution_sum <= 0) {
5144 ldout(cct, 10) << __func__ << " Unable to calculate primary distribution, likely because primary affinity is"
5145 << " set to 0 on all OSDs." << dendl;
5146 return -EINVAL;
5147 }
5148
5149 // Then, stretch the value (necessary when primary affinity is smaller than 1)
5150 float factor = (float)pool->get_pg_num() / (float)distribution_sum;
5151 float distribution_sum_desired = 0.0;
5152
5153 ceph_assert(factor >= 1.0);
5154 for (const auto & [osd, osd_primary_count] : desired_primary_distribution) {
5155 desired_primary_distribution[osd] *= factor;
5156 distribution_sum_desired += desired_primary_distribution[osd];
5157 }
5158 ceph_assert(fabs(distribution_sum_desired - pool->get_pg_num()) < 0.01);
5159 } else {
5160 ldout(cct, 10) << __func__ <<" skipping erasure pool "
5161 << get_pool_name(pid) << dendl;
5162 return -EINVAL;
5163 }
5164
5165 return 0;
5166}
5167
7c673cae
FG
// Compute pg_upmap_items changes that even out the number of PGs per OSD.
//
// Parameters:
//   max_deviation - allowed deviation (in PGs) from an OSD target; values
//                   below 1 are raised to 1
//   max           - upper bound on optimization iterations; nothing is done
//                   when it is <= 0
//   only_pools    - pools handed to build_pool_pgs_info and build_candidates
//   pending_inc   - incremental that receives the accepted changes through
//                   pack_upmap_results
//   p_seed        - forwarded to get_random_engine and build_candidates
//
// Each iteration stages one candidate change (dropping an existing remap or
// adding one remap pair), recomputes the deviations on a scratch copy of the
// PG distribution, and keeps the change only when the stddev value returned
// by calc_deviations decreases.
//
// Returns the accumulated result of pack_upmap_results, or 0 when the
// function aborts early or the distribution is already within max_deviation.
5168int OSDMap::calc_pg_upmaps(
5169 CephContext *cct,
92f5a8d4 5170 uint32_t max_deviation,
7c673cae 5171 int max,
a8e16298 5172 const set<int64_t>& only_pools,
20effc67
TL
5173 OSDMap::Incremental *pending_inc,
5174 std::random_device::result_type *p_seed)
7c673cae 5175{
a8e16298 5176 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
20effc67 5177 OSDMap tmp_osd_map;
92f5a8d4
TL
5178 // Can't be less than 1 pg
5179 if (max_deviation < 1)
5180 max_deviation = 1;
// accepted changes are applied to this private copy, not to *this
20effc67 5181 tmp_osd_map.deepish_copy_from(*this);
7c673cae 5182 int num_changed = 0;
a8e16298
TL
5183 map<int,set<pg_t>> pgs_by_osd;
5184 int total_pgs = 0;
5185 float osd_weight_total = 0;
5186 map<int,float> osd_weight;
a8e16298 5187
20effc67
TL
5188 if (max <= 0) {
5189 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
5190 return 0;
a8e16298 5191 }
20effc67
TL
5192
5193 osd_weight_total = build_pool_pgs_info(cct, only_pools, tmp_osd_map,
5194 total_pgs, pgs_by_osd, osd_weight);
a8e16298
TL
5195 if (osd_weight_total == 0) {
5196 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
5197 return 0;
5198 }
20effc67 5199
// PGs per unit of weight; an OSD target is its weight times this value
a8e16298
TL
5200 float pgs_per_weight = total_pgs / osd_weight_total;
5201 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
5202 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 5203
a8e16298
TL
5204 float stddev = 0;
5205 map<int,float> osd_deviation; // osd, deviation(pgs)
5206 multimap<float,int> deviation_osd; // deviation(pgs), osd
20effc67
TL
5207 float cur_max_deviation = calc_deviations(cct, pgs_by_osd, osd_weight, pgs_per_weight,
5208 osd_deviation, deviation_osd, stddev);
5209
92f5a8d4
TL
5210 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5211 if (cur_max_deviation <= max_deviation) {
a8e16298
TL
5212 ldout(cct, 10) << __func__ << " distribution is almost perfect"
5213 << dendl;
5214 return 0;
5215 }
20effc67 5216
// skip_overfull is toggled further down once local_fallback_retries
// rejected changes have accumulated in one iteration
a8e16298
TL
5217 bool skip_overfull = false;
5218 auto aggressive =
11fdf7f2 5219 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
1e59de90
TL
5220 auto fast_aggressive = aggressive &&
5221 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively_fast");
a8e16298 5222 auto local_fallback_retries =
11fdf7f2 5223 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
20effc67 5224
a8e16298 5225 while (max--) {
92f5a8d4 5226 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
a8e16298
TL
5227 // build overfull and underfull
5228 set<int> overfull;
92f5a8d4
TL
5229 set<int> more_overfull;
5230 bool using_more_overfull = false;
a8e16298 5231 vector<int> underfull;
92f5a8d4 5232 vector<int> more_underfull;
20effc67
TL
5233 fill_overfull_underfull(cct, deviation_osd, max_deviation,
5234 overfull, more_overfull,
5235 underfull, more_underfull);
7c673cae 5236
92f5a8d4
TL
5237 if (underfull.empty() && overfull.empty()) {
5238 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
7c673cae 5239 break;
a8e16298 5240 }
92f5a8d4
TL
5241 if (overfull.empty() && !underfull.empty()) {
5242 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
5243 overfull = more_overfull;
5244 using_more_overfull = true;
5245 }
7c673cae 5246
a8e16298
TL
5247 ldout(cct, 10) << " overfull " << overfull
5248 << " underfull " << underfull
5249 << dendl;
// PGs whose staged change was rejected; filled just before goto retry
5250 set<pg_t> to_skip;
5251 uint64_t local_fallback_retried = 0;
5252
1e59de90
TL
5253 // Used to prevent some of the unsuccessful loop iterations (save runtime)
5254 // If we can't find a change per OSD we skip further iterations for this OSD
5255 uint n_changes = 0, prev_n_changes = 0;
5256 set<int> osd_to_skip;
5257
// Re-entered by goto retry after a rejected change. The three staging
// containers below are rebuilt, so the rejected change is discarded.
a8e16298
TL
5258 retry:
5259
5260 set<pg_t> to_unmap;
5261 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
5262 auto temp_pgs_by_osd = pgs_by_osd;
5263 // always start with fullest, break if we find any changes to make
7c673cae 5264 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
92f5a8d4 5265 if (skip_overfull && !underfull.empty()) {
a8e16298
TL
5266 ldout(cct, 10) << " skipping overfull " << dendl;
5267 break; // fall through to check underfull
5268 }
7c673cae 5269 int osd = p->second;
31f18b77 5270 float deviation = p->first;
1e59de90
TL
5271 if (fast_aggressive && osd_to_skip.count(osd)) {
5272 ldout(cct, 20) << " Fast aggressive mode: skipping osd " << osd
5273 << " osd_to_skip size = " << osd_to_skip.size() << dendl;
5274 continue;
5275 }
5276
9f95a23c
TL
5277 if (deviation < 0) {
5278 ldout(cct, 10) << " hitting underfull osds now"
5279 << " when trying to remap overfull osds"
5280 << dendl;
5281 break;
5282 }
7c673cae 5283 float target = osd_weight[osd] * pgs_per_weight;
92f5a8d4
TL
5284 ldout(cct, 10) << " Overfull search osd." << osd
5285 << " target " << target
5286 << " deviation " << deviation
5287 << dendl;
a8e16298 5288 ceph_assert(target > 0);
92f5a8d4 5289 if (!using_more_overfull && deviation <= max_deviation) {
7c673cae 5290 ldout(cct, 10) << " osd." << osd
a8e16298
TL
5291 << " target " << target
5292 << " deviation " << deviation
92f5a8d4 5293 << " < max deviation " << max_deviation
a8e16298 5294 << dendl;
7c673cae
FG
5295 break;
5296 }
7c673cae 5297
// candidate PGs on this osd, minus those already rejected (to_skip)
a8e16298
TL
5298 vector<pg_t> pgs;
5299 pgs.reserve(pgs_by_osd[osd].size());
5300 for (auto& pg : pgs_by_osd[osd]) {
5301 if (to_skip.count(pg))
5302 continue;
5303 pgs.push_back(pg);
5304 }
5305 if (aggressive) {
5306 // shuffle PG list so they all get equal (in)attention
20effc67 5307 std::shuffle(pgs.begin(), pgs.end(), get_random_engine(cct, p_seed));
a8e16298 5308 }
7c673cae 5309 // look for remaps we can un-remap
20effc67
TL
5310 if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd,
5311 temp_pgs_by_osd, to_unmap, to_upmap))
5312 goto test_change;
7c673cae 5313
a8e16298 5314 // try upmap
7c673cae 5315 for (auto pg : pgs) {
20effc67
TL
5316 auto temp_it = tmp_osd_map.pg_upmap.find(pg);
5317 if (temp_it != tmp_osd_map.pg_upmap.end()) {
a8e16298
TL
5318 // leave pg_upmap alone
5319 // it must be specified by admin since balancer does not
5320 // support pg_upmap yet
5321 ldout(cct, 10) << " " << pg << " already has pg_upmap "
5322 << temp_it->second << ", skipping"
5323 << dendl;
7c673cae
FG
5324 continue;
5325 }
20effc67 5326 auto pg_pool_size = tmp_osd_map.get_pg_pool_size(pg);
a8e16298
TL
5327 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5328 set<int> existing;
20effc67
TL
5329 auto it = tmp_osd_map.pg_upmap_items.find(pg);
5330 if (it != tmp_osd_map.pg_upmap_items.end()) {
5331 auto& um_items = it->second;
5332 if (um_items.size() >= (size_t)pg_pool_size) {
5333 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
5334 << um_items << ", skipping"
5335 << dendl;
5336 continue;
5337 } else {
5338 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
5339 << um_items
5340 << dendl;
5341 new_upmap_items = um_items;
5342 // build existing too (for dedup)
5343 for (auto [um_from, um_to] : um_items) {
5344 existing.insert(um_from);
5345 existing.insert(um_to);
5346 }
5347 }
a8e16298
TL
5348 // fall through
5349 // to see if we can append more remapping pairs
20effc67 5350 }
a8e16298 5351 ldout(cct, 10) << " trying " << pg << dendl;
494da23a 5352 vector<int> raw, orig, out;
20effc67 5353 tmp_osd_map.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
92f5a8d4 5354 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
7c673cae
FG
5355 continue;
5356 }
a8e16298 5357 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
5358 if (orig.size() != out.size()) {
5359 continue;
5360 }
a8e16298 5361 ceph_assert(orig != out);
20effc67 5362 int pos = find_best_remap(cct, orig, out, existing, osd_deviation);
92f5a8d4 5363 if (pos != -1) {
a8e16298
TL
5364 // append new remapping pairs slowly
5365 // This way we can make sure that each tiny change will
5366 // definitely make distribution of PGs converging to
5367 // the perfect status.
20effc67
TL
5368 add_remap_pair(cct, orig[pos], out[pos], pg, (size_t)pg_pool_size,
5369 osd, existing, temp_pgs_by_osd,
5370 new_upmap_items, to_upmap);
a8e16298 5371 goto test_change;
7c673cae 5372 }
a8e16298 5373 }
// reached only when no change was staged for this osd
1e59de90
TL
5374 if (fast_aggressive) {
5375 if (prev_n_changes == n_changes) { // no changes for prev OSD
5376 osd_to_skip.insert(osd);
5377 }
5378 else {
5379 prev_n_changes = n_changes;
5380 }
5381 }
5382
a8e16298 5383 }
7c673cae 5384
a8e16298
TL
5385 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5386 ldout(cct, 10) << " failed to find any changes for overfull osds"
5387 << dendl;
// second phase: walk osds from the lowest deviation upwards and stop at
// the first one that is not listed in underfull
20effc67
TL
5388 for (auto& [deviation, osd] : deviation_osd) {
5389 if (std::find(underfull.begin(), underfull.end(), osd) ==
a8e16298
TL
5390 underfull.end())
5391 break;
a8e16298
TL
5392 float target = osd_weight[osd] * pgs_per_weight;
5393 ceph_assert(target > 0);
92f5a8d4
TL
5394 if (fabsf(deviation) < max_deviation) {
5395 // respect max_deviation too
a8e16298
TL
5396 ldout(cct, 10) << " osd." << osd
5397 << " target " << target
5398 << " deviation " << deviation
92f5a8d4
TL
5399 << " -> absolute " << fabsf(deviation)
5400 << " < max " << max_deviation
a8e16298
TL
5401 << dendl;
5402 break;
5403 }
5404 // look for remaps we can un-remap
20effc67
TL
5405 candidates_t candidates = build_candidates(cct, tmp_osd_map, to_skip,
5406 only_pools, aggressive, p_seed);
5407 if (try_drop_remap_underfull(cct, candidates, osd, temp_pgs_by_osd,
5408 to_unmap, to_upmap)) {
5409 goto test_change;
a8e16298 5410 }
7c673cae 5411 }
a8e16298
TL
5412
5413 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5414 ldout(cct, 10) << " failed to find any changes for underfull osds"
5415 << dendl;
5416 if (!aggressive) {
5417 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
5418 break;
5419 } else if (!skip_overfull) {
5420 // safe to quit because below here we know
5421 // we've done checking both overfull and underfull osds..
5422 ldout(cct, 10) << " break due to not being able to find any"
5423 << " further optimizations"
5424 << dendl;
7c673cae
FG
5425 break;
5426 }
a8e16298
TL
5427 // restart with fullest and do exhaustive searching
5428 skip_overfull = false;
5429 continue;
5430
// Reached only by goto, after exactly one candidate change has been staged
// in to_unmap / to_upmap and mirrored in temp_pgs_by_osd.
5431 test_change:
5432
5433 // test change, apply if change is good
5434 ceph_assert(to_unmap.size() || to_upmap.size());
5435 float new_stddev = 0;
5436 map<int,float> temp_osd_deviation;
5437 multimap<float,int> temp_deviation_osd;
20effc67
TL
5438 float cur_max_deviation = calc_deviations(cct, temp_pgs_by_osd, osd_weight,
5439 pgs_per_weight, temp_osd_deviation,
5440 temp_deviation_osd, new_stddev);
a8e16298
TL
5441 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
5442 if (new_stddev >= stddev) {
5443 if (!aggressive) {
5444 ldout(cct, 10) << " break because stddev is not decreasing"
5445 << " and aggressive mode is not enabled"
5446 << dendl;
5447 break;
5448 }
5449 local_fallback_retried++;
5450 if (local_fallback_retried >= local_fallback_retries) {
5451 // does not make progress
5452 // flip *skip_overfull* so both overfull and underfull
5453 // get equal (in)attention
5454 skip_overfull = !skip_overfull;
5455 ldout(cct, 10) << " hit local_fallback_retries "
5456 << local_fallback_retries
5457 << dendl;
5458 continue;
5459 }
5460 for (auto& i : to_unmap)
5461 to_skip.insert(i);
5462 for (auto& i : to_upmap)
5463 to_skip.insert(i.first);
5464 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
5465 << " to_skip " << to_skip
5466 << dendl;
5467 goto retry;
5468 }
5469
// the change lowered stddev: promote the scratch state to the current state
5470 // ready to go
5471 ceph_assert(new_stddev < stddev);
5472 stddev = new_stddev;
5473 pgs_by_osd = temp_pgs_by_osd;
5474 osd_deviation = temp_osd_deviation;
5475 deviation_osd = temp_deviation_osd;
1e59de90
TL
5476 n_changes++;
5477
20effc67
TL
5478
5479 num_changed += pack_upmap_results(cct, to_unmap, to_upmap, tmp_osd_map, pending_inc);
5480
92f5a8d4
TL
5481 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5482 if (cur_max_deviation <= max_deviation) {
5483 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
5484 << dendl;
5485 break;
5486 }
7c673cae 5487 }
a8e16298 5488 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
5489 return num_changed;
5490}
31f18b77 5491
1e59de90
TL
5492map<uint64_t,set<pg_t>> OSDMap::get_pgs_by_osd(
5493 CephContext *cct,
5494 int64_t pid,
5495 map<uint64_t, set<pg_t>> *p_primaries_by_osd,
5496 map<uint64_t, set<pg_t>> *p_acting_primaries_by_osd) const
5497{
5498 // Set up the OSDMap
5499 OSDMap tmp_osd_map;
5500 tmp_osd_map.deepish_copy_from(*this);
5501
5502 // Get the pool from the provided pool id
5503 const pg_pool_t* pool = get_pg_pool(pid);
5504
5505 // build array of pgs from the pool
5506 map<uint64_t,set<pg_t>> pgs_by_osd;
5507 for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
5508 pg_t pg(ps, pid);
5509 vector<int> up;
5510 int primary;
5511 int acting_prim;
5512 tmp_osd_map.pg_to_up_acting_osds(pg, &up, &primary, nullptr, &acting_prim);
5513 if (cct != nullptr)
5514 ldout(cct, 20) << __func__ << " " << pg
5515 << " up " << up
5516 << " primary " << primary
5517 << " acting_primary " << acting_prim
5518 << dendl;
5519
5520 if (!up.empty()) { // up can be empty is test generated files
5521 // in this case, we return empty result
5522 for (auto osd : up) {
5523 if (osd != CRUSH_ITEM_NONE)
5524 pgs_by_osd[osd].insert(pg);
5525 }
5526 if (p_primaries_by_osd != nullptr) {
5527 if (primary != CRUSH_ITEM_NONE)
5528 (*p_primaries_by_osd)[primary].insert(pg);
5529 }
5530 if (p_acting_primaries_by_osd != nullptr) {
5531 if (acting_prim != CRUSH_ITEM_NONE)
5532 (*p_acting_primaries_by_osd)[acting_prim].insert(pg);
5533 }
5534 }
5535 }
5536 return pgs_by_osd;
5537}
5538
5539float OSDMap::get_osds_weight(
5540 CephContext *cct,
5541 const OSDMap& tmp_osd_map,
5542 int64_t pid,
5543 map<int,float>& osds_weight) const
5544{
5545 map<int,float> pmap;
5546 ceph_assert(pools.count(pid));
5547 int ruleno = pools.at(pid).get_crush_rule();
5548 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap);
5549 ldout(cct,20) << __func__ << " pool " << pid
5550 << " ruleno " << ruleno
5551 << " weight-map " << pmap
5552 << dendl;
5553 float osds_weight_total = 0;
5554 for (auto [oid, oweight] : pmap) {
5555 auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight;
5556 if (adjusted_weight != 0) {
5557 osds_weight[oid] += adjusted_weight;
5558 osds_weight_total += adjusted_weight;
5559 }
5560 }
5561 return osds_weight_total;
5562}
5563
20effc67
TL
5564float OSDMap::build_pool_pgs_info (
5565 CephContext *cct,
5566 const std::set<int64_t>& only_pools, ///< [optional] restrict to pool
5567 const OSDMap& tmp_osd_map,
5568 int& total_pgs,
5569 map<int,set<pg_t>>& pgs_by_osd,
1e59de90 5570 map<int,float>& osds_weight)
20effc67
TL
5571{
5572 //
5573 // This function builds some data structures that are used by calc_pg_upmaps.
5574 // Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs
5575 // and returns the osd_weight_total
5576 //
1e59de90 5577 float osds_weight_total = 0.0;
20effc67
TL
5578 for (auto& [pid, pdata] : pools) {
5579 if (!only_pools.empty() && !only_pools.count(pid))
5580 continue;
5581 for (unsigned ps = 0; ps < pdata.get_pg_num(); ++ps) {
5582 pg_t pg(ps, pid);
5583 vector<int> up;
5584 tmp_osd_map.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
5585 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
5586 for (auto osd : up) {
5587 if (osd != CRUSH_ITEM_NONE)
5588 pgs_by_osd[osd].insert(pg);
5589 }
5590 }
5591 total_pgs += pdata.get_size() * pdata.get_pg_num();
5592
1e59de90 5593 osds_weight_total = get_osds_weight(cct, tmp_osd_map, pid, osds_weight);
20effc67 5594 }
1e59de90 5595 for (auto& [oid, oweight] : osds_weight) {
20effc67
TL
5596 int pgs = 0;
5597 auto p = pgs_by_osd.find(oid);
5598 if (p != pgs_by_osd.end())
5599 pgs = p->second.size();
5600 else
5601 pgs_by_osd.emplace(oid, set<pg_t>());
5602 ldout(cct, 20) << " osd." << oid << " weight " << oweight
5603 << " pgs " << pgs << dendl;
5604 }
1e59de90 5605 return osds_weight_total;
20effc67
TL
5606
5607} // return total weight of all OSDs
5608
5609float OSDMap::calc_deviations (
5610 CephContext *cct,
5611 const map<int,set<pg_t>>& pgs_by_osd,
5612 const map<int,float>& osd_weight,
5613 float pgs_per_weight,
5614 map<int,float>& osd_deviation,
5615 multimap<float,int>& deviation_osd,
5616 float& stddev) // return current max deviation
5617{
5618 //
5619 // This function calculates the 2 maps osd_deviation and deviation_osd which
5620 // hold the deviation between the current number of PGs which map to an OSD
5621 // and the optimal number. Ot also calculates the stddev of the deviations and
5622 // returns the current max deviation.
5623 // NOTE - the calculation is not exactly stddev it is actually sttdev^2 but as
5624 // long as it is monotonic with stddev (and it is), it is sufficient for
5625 // the balancer code.
5626 //
5627 float cur_max_deviation = 0.0;
5628 stddev = 0.0;
5629 for (auto& [oid, opgs] : pgs_by_osd) {
5630 // make sure osd is still there (belongs to this crush-tree)
5631 ceph_assert(osd_weight.count(oid));
5632 float target = osd_weight.at(oid) * pgs_per_weight;
5633 float deviation = (float)opgs.size() - target;
5634 ldout(cct, 20) << " osd." << oid
5635 << "\tpgs " << opgs.size()
5636 << "\ttarget " << target
5637 << "\tdeviation " << deviation
5638 << dendl;
5639 osd_deviation[oid] = deviation;
5640 deviation_osd.insert(make_pair(deviation, oid));
5641 stddev += deviation * deviation;
5642 if (fabsf(deviation) > cur_max_deviation)
5643 cur_max_deviation = fabsf(deviation);
5644 }
5645 return cur_max_deviation;
5646}
5647
5648void OSDMap::fill_overfull_underfull (
5649 CephContext *cct,
5650 const std::multimap<float,int>& deviation_osd,
5651 int max_deviation,
5652 std::set<int>& overfull,
5653 std::set<int>& more_overfull,
5654 std::vector<int>& underfull,
5655 std::vector<int>& more_underfull)
5656{
5657 //
5658 // This function just fills the overfull and underfull data structures for the
5659 // use of calc_pg_upmaps
5660 //
5661 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
5662 auto& odev = i->first;
5663 auto& oid = i->second;
5664 ldout(cct, 30) << " check " << odev << " <= " << max_deviation << dendl;
5665 if (odev <= 0)
5666 break;
5667 if (odev > max_deviation) {
5668 ldout(cct, 30) << " add overfull osd." << oid << dendl;
5669 overfull.insert(oid);
5670 } else {
5671 more_overfull.insert(oid);
5672 }
5673 }
5674
5675 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
5676 auto& odev = i->first;
5677 auto& oid = i->second;
5678 ldout(cct, 30) << " check " << odev << " >= " << -(int)max_deviation << dendl;
5679 if (odev >= 0)
5680 break;
5681 if (odev < -(int)max_deviation) {
5682 ldout(cct, 30) << " add underfull osd." << oid << dendl;
5683 underfull.push_back(oid);
5684 } else {
5685 more_underfull.push_back(oid);
5686 }
5687 }
5688}
5689
5690int OSDMap::pack_upmap_results(
5691 CephContext *cct,
5692 const std::set<pg_t>& to_unmap,
5693 const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap,
5694 OSDMap& tmp_osd_map,
5695 OSDMap::Incremental *pending_inc)
5696{
5697 //
5698 // This function takes the input from the local variables to_unmap and to_upmap
5699 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5700 // (so that the results are visible outside calc_pg_upmaps)
5701 //
5702 int num_changed = 0;
5703 for (auto& i : to_unmap) {
5704 ldout(cct, 10) << " unmap pg " << i << dendl;
5705 ceph_assert(tmp_osd_map.pg_upmap_items.count(i));
5706 tmp_osd_map.pg_upmap_items.erase(i);
5707 pending_inc->old_pg_upmap_items.insert(i);
5708 ++num_changed;
5709 }
5710 for (auto& [pg, um_items] : to_upmap) {
5711 ldout(cct, 10) << " upmap pg " << pg
5712 << " new pg_upmap_items " << um_items
5713 << dendl;
5714 tmp_osd_map.pg_upmap_items[pg] = um_items;
5715 pending_inc->new_pg_upmap_items[pg] = um_items;
5716 ++num_changed;
5717 }
5718
5719 return num_changed;
5720}
5721
5722std::default_random_engine OSDMap::get_random_engine(
5723 CephContext *cct,
5724 std::random_device::result_type *p_seed)
5725{
5726 //
5727 // This function creates a random_engine to be used for shuffling.
5728 // When p_seed == nullptr it generates random engine with a seed from /dev/random
5729 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5730 // increments seed_set. This is used in order to craete regression test without
5731 // random effect on the results.
5732 //
5733 static std::random_device::result_type seed_set = 0;
5734 std::random_device::result_type seed;
5735 if (p_seed == nullptr) {
5736 std::random_device rd;
5737 seed = rd();
5738 }
5739 else {
5740 seed = *p_seed + seed_set;
5741 ldout(cct, 30) << " Starting random engine with seed "
5742 << seed << dendl;
5743 seed_set++;
5744 }
5745 return std::default_random_engine{seed};
5746}
5747
5748bool OSDMap::try_drop_remap_overfull(
5749 CephContext *cct,
5750 const std::vector<pg_t>& pgs,
5751 const OSDMap& tmp_osd_map,
5752 int osd,
5753 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5754 set<pg_t>& to_unmap,
5755 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5756{
5757 //
5758 // This function tries to drop existimg upmap items which map data to overfull
5759 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5760 // if it found an item that can be dropped, false if not.
5761 //
5762 for (auto pg : pgs) {
5763 auto p = tmp_osd_map.pg_upmap_items.find(pg);
5764 if (p == tmp_osd_map.pg_upmap_items.end())
5765 continue;
5766 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5767 auto& pg_upmap_items = p->second;
5768 for (auto um_pair : pg_upmap_items) {
5769 auto& um_from = um_pair.first;
5770 auto& um_to = um_pair.second;
5771 if (um_to == osd) {
5772 ldout(cct, 10) << " will try dropping existing"
5773 << " remapping pair "
5774 << um_from << " -> " << um_to
5775 << " which remapped " << pg
5776 << " into overfull osd." << osd
5777 << dendl;
5778 temp_pgs_by_osd[um_to].erase(pg);
5779 temp_pgs_by_osd[um_from].insert(pg);
5780 } else {
5781 new_upmap_items.push_back(um_pair);
5782 }
5783 }
5784 if (new_upmap_items.empty()) {
5785 // drop whole item
5786 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5787 << " remapped " << pg << " into overfull osd." << osd
5788 << ", will try cancelling it entirely"
5789 << dendl;
5790 to_unmap.insert(pg);
5791 return true;
5792 } else if (new_upmap_items.size() != pg_upmap_items.size()) {
5793 // drop single remapping pair, updating
5794 ceph_assert(new_upmap_items.size() < pg_upmap_items.size());
5795 ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items
5796 << " remapped " << pg << " into overfull osd." << osd
5797 << ", new_pg_upmap_items now " << new_upmap_items
5798 << dendl;
5799 to_upmap[pg] = new_upmap_items;
5800 return true;
5801 }
5802 }
5803 return false;
5804}
5805
5806bool OSDMap::try_drop_remap_underfull(
5807 CephContext *cct,
5808 const candidates_t& candidates,
5809 int osd,
5810 map<int,std::set<pg_t>>& temp_pgs_by_osd,
5811 set<pg_t>& to_unmap,
5812 map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap)
5813{
5814 //
5815 // This function tries to drop existimg upmap items which map data from underfull
5816 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5817 // if it found an item that can be dropped, false if not.
5818 //
5819 for (auto& [pg, um_pairs] : candidates) {
5820 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
5821 for (auto& ump : um_pairs) {
5822 auto& um_from = ump.first;
5823 auto& um_to = ump.second;
5824 if (um_from == osd) {
5825 ldout(cct, 10) << " will try dropping existing"
5826 << " remapping pair "
5827 << um_from << " -> " << um_to
5828 << " which remapped " << pg
5829 << " out from underfull osd." << osd
5830 << dendl;
5831 temp_pgs_by_osd[um_to].erase(pg);
5832 temp_pgs_by_osd[um_from].insert(pg);
5833 } else {
5834 new_upmap_items.push_back(ump);
5835 }
5836 }
5837 if (new_upmap_items.empty()) {
5838 // drop whole item
5839 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5840 << " remapped " << pg
5841 << " out from underfull osd." << osd
5842 << ", will try cancelling it entirely"
5843 << dendl;
5844 to_unmap.insert(pg);
5845 return true;
5846 } else if (new_upmap_items.size() != um_pairs.size()) {
5847 // drop single remapping pair, updating
5848 ceph_assert(new_upmap_items.size() < um_pairs.size());
5849 ldout(cct, 10) << " existing pg_upmap_items " << um_pairs
5850 << " remapped " << pg
5851 << " out from underfull osd." << osd
5852 << ", new_pg_upmap_items now " << new_upmap_items
5853 << dendl;
5854 to_upmap[pg] = new_upmap_items;
5855 return true;
5856 }
5857 }
5858 return false;
5859}
5860
5861void OSDMap::add_remap_pair(
5862 CephContext *cct,
5863 int orig,
5864 int out,
5865 pg_t pg,
5866 size_t pg_pool_size,
5867 int osd,
5868 set<int>& existing,
5869 map<int,set<pg_t>>& temp_pgs_by_osd,
5870 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items,
5871 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap)
5872{
5873 //
5874 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5875 // the relevant data structures
5876 //
5877 ldout(cct, 10) << " will try adding new remapping pair "
5878 << orig << " -> " << out << " for " << pg
5879 << (orig != osd ? " NOT selected osd" : "")
5880 << dendl;
5881 existing.insert(orig);
5882 existing.insert(out);
5883 temp_pgs_by_osd[orig].erase(pg);
5884 temp_pgs_by_osd[out].insert(pg);
5885 ceph_assert(new_upmap_items.size() < pg_pool_size);
5886 new_upmap_items.push_back(make_pair(orig, out));
5887 // append new remapping pairs slowly
5888 // This way we can make sure that each tiny change will
5889 // definitely make distribution of PGs converging to
5890 // the perfect status.
5891 to_upmap[pg] = new_upmap_items;
5892
5893}
5894
5895int OSDMap::find_best_remap (
5896 CephContext *cct,
5897 const vector<int>& orig,
5898 const vector<int>& out,
5899 const set<int>& existing,
5900 const map<int,float> osd_deviation)
5901{
5902 //
5903 // Find the best remap from the suggestions in orig and out - the best remap
5904 // is the one which maps from the OSD with the largest deviatoion (from the
5905 // OSDs which are part of orig)
5906 //
5907 int best_pos = -1;
5908 float max_dev = 0;
5909 for (unsigned i = 0; i < out.size(); ++i) {
5910 if (orig[i] == out[i])
5911 continue; // skip invalid remappings
5912 if (existing.count(orig[i]) || existing.count(out[i]))
5913 continue; // we want new remappings only!
5914 if (osd_deviation.at(orig[i]) > max_dev) {
5915 max_dev = osd_deviation.at(orig[i]);
5916 best_pos = i;
5917 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation.at(orig[i]) << dendl;
5918 }
5919 }
5920 return best_pos;
5921}
5922
5923OSDMap::candidates_t OSDMap::build_candidates(
5924 CephContext *cct,
5925 const OSDMap& tmp_osd_map,
5926 const set<pg_t> to_skip,
5927 const set<int64_t>& only_pools,
5928 bool aggressive,
5929 std::random_device::result_type *p_seed)
5930{
5931 //
5932 // build the candidates data structure
5933 //
5934 candidates_t candidates;
5935 candidates.reserve(tmp_osd_map.pg_upmap_items.size());
5936 for (auto& [pg, um_pair] : tmp_osd_map.pg_upmap_items) {
5937 if (to_skip.count(pg))
5938 continue;
5939 if (!only_pools.empty() && !only_pools.count(pg.pool()))
5940 continue;
5941 candidates.push_back(make_pair(pg, um_pair));
5942 }
5943 if (aggressive) {
5944 // shuffle candidates so they all get equal (in)attention
5945 std::shuffle(candidates.begin(), candidates.end(), get_random_engine(cct, p_seed));
5946 }
5947 return candidates;
5948}
5949
1e59de90
TL
5950// return -1 if all PGs are OK, else the first PG which includes only zero PA OSDs
5951int64_t OSDMap::has_zero_pa_pgs(CephContext *cct, int64_t pool_id) const
5952{
5953 const pg_pool_t* pool = get_pg_pool(pool_id);
5954 for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
5955 pg_t pg(ps, pool_id);
5956 vector<int> acting;
5957 pg_to_up_acting_osds(pg, nullptr, nullptr, &acting, nullptr);
5958 if (cct != nullptr) {
5959 ldout(cct, 30) << __func__ << " " << pg << " acting " << acting << dendl;
5960 }
5961 bool pg_zero_pa = true;
5962 for (auto osd : acting) {
5963 if (get_primary_affinityf(osd) != 0) {
5964 pg_zero_pa = false;
5965 break;
5966 }
5967 }
5968 if (pg_zero_pa) {
5969 if (cct != nullptr) {
5970 ldout(cct, 20) << __func__ << " " << pg << " - maps only to OSDs with primiary affinity 0" << dendl;
5971 }
5972 return (int64_t)ps;
5973 }
5974 }
5975 return -1;
5976}
5977
5978void OSDMap::zero_rbi(read_balance_info_t &rbi) const {
5979 rbi.pa_avg = 0.;
5980 rbi.pa_weighted = 0.;
5981 rbi.pa_weighted_avg = 0.;
5982 rbi.raw_score = 0.;
5983 rbi.optimal_score = 0.;
5984 rbi.adjusted_score = 0.;
5985 rbi.acting_raw_score = 0.;
5986 rbi.acting_adj_score = 0.;
5987 rbi.err_msg = "";
5988}
5989
5990int OSDMap::set_rbi(
5991 CephContext *cct,
5992 read_balance_info_t &rbi,
5993 int64_t pool_id,
5994 float total_w_pa,
5995 float pa_sum,
5996 int num_osds,
5997 int osd_pa_count,
5998 float total_osd_weight,
5999 uint max_prims_per_osd,
6000 uint max_acting_prims_per_osd,
6001 float avg_prims_per_osd,
6002 bool prim_on_zero_pa,
6003 bool acting_on_zero_pa,
6004 float max_osd_score) const
6005{
6006 // put all the ugly code here, so rest of code is nicer.
6007 const pg_pool_t* pool = get_pg_pool(pool_id);
6008 zero_rbi(rbi);
6009
6010 if (total_w_pa / total_osd_weight < 1. / float(pool->get_size())) {
6011 ldout(cct, 20) << __func__ << " pool " << pool_id << " average primary affinity is lower than"
6012 << 1. / float(pool->get_size()) << dendl;
6013 rbi.err_msg = fmt::format(
6014 "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable",
6015 pool_id, 1. / float(pool->get_size()));
6016 return -EINVAL;
6017 }
6018 rbi.pa_weighted = total_w_pa;
6019
6020 // weighted_prim_affinity_avg
6021 rbi.pa_weighted_avg = rbi_round(rbi.pa_weighted / total_osd_weight); // in [0..1]
6022 // p_rbi->pa_weighted / osd_pa_count; // in [0..1]
6023
6024 rbi.raw_score = rbi_round((float)max_prims_per_osd / avg_prims_per_osd); // >=1
6025 if (acting_on_zero_pa) {
6026 rbi.acting_raw_score = rbi_round(max_osd_score);
6027 rbi.err_msg = fmt::format(
6028 "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate",
6029 pool_id);
6030 } else {
6031 rbi.acting_raw_score = rbi_round((float)max_acting_prims_per_osd / avg_prims_per_osd);
6032 }
6033
6034 if (osd_pa_count != 0) {
6035 // this implies that pa_sum > 0
6036 rbi.pa_avg = rbi_round(pa_sum / osd_pa_count); // in [0..1]
6037 } else {
6038 rbi.pa_avg = 0.;
6039 }
6040
6041 if (rbi.pa_avg != 0.) {
6042 int64_t zpg;
6043 if ((zpg = has_zero_pa_pgs(cct, pool_id)) >= 0) {
6044 pg_t pg(zpg, pool_id);
6045 std::stringstream ss;
6046 ss << pg;
6047 ldout(cct, 10) << __func__ << " pool " << pool_id << " has some PGs where all OSDs are with primary_affinity 0 (" << pg << ",...)" << dendl;
6048 rbi.err_msg = fmt::format(
6049 "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable",
6050 pool_id, ss.str());
6051 return -EINVAL;
6052 }
6053 rbi.optimal_score = rbi_round(float(num_osds) / float(osd_pa_count)); // >= 1
6054 // adjust the score to the primary affinity setting (if prim affinity is set
6055 // the raw score can't be 1 and the optimal (perfect) score is hifgher than 1)
6056 // When total system primary affinity is too low (average < 1 / pool replica count)
6057 // the score is negative in order to grab the user's attention.
6058 rbi.adjusted_score = rbi_round(rbi.raw_score / rbi.optimal_score); // >= 1 if PA is not low
6059 rbi.acting_adj_score = rbi_round(rbi.acting_raw_score / rbi.optimal_score); // >= 1 if PA is not low
6060
6061 } else {
6062 // We should never get here - this condition is checked before calling this function - this is just sanity check code.
6063 rbi.err_msg = fmt::format(
6064 "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score",
6065 pool_id);
6066 return -EINVAL;
6067 }
6068
6069 return 0;
6070}
6071
6072int OSDMap::calc_read_balance_score(CephContext *cct, int64_t pool_id,
6073 read_balance_info_t *p_rbi) const
6074{
6075 //BUG: wrong score with one PG replica 3 and 4 OSDs
6076 if (cct != nullptr)
6077 ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) << dendl;
6078
6079 OSDMap tmp_osd_map;
6080 tmp_osd_map.deepish_copy_from(*this);
6081 if (p_rbi == nullptr) {
6082 // The only case where error message is not set - this is not tested in the unit test.
6083 if (cct != nullptr)
6084 ldout(cct,30) << __func__ << " p_rbi is nullptr." << dendl;
6085 return -EINVAL;
6086 }
6087
6088 if (tmp_osd_map.pools.count(pool_id) == 0) {
6089 if (cct != nullptr)
6090 ldout(cct,30) << __func__ << " pool " << pool_id << " not found." << dendl;
6091 zero_rbi(*p_rbi);
6092 p_rbi->err_msg = fmt::format("pool {} not found", pool_id);
6093 return -ENOENT;
6094 }
6095 int rc = 0;
6096 const pg_pool_t* pool = tmp_osd_map.get_pg_pool(pool_id);
6097 auto num_pgs = pool->get_pg_num();
6098
6099 map<uint64_t,set<pg_t>> pgs_by_osd;
6100 map<uint64_t,set<pg_t>> prim_pgs_by_osd;
6101 map<uint64_t,set<pg_t>> acting_prims_by_osd;
6102
6103 pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pool_id, &prim_pgs_by_osd, &acting_prims_by_osd);
6104
6105 if (cct != nullptr)
6106 ldout(cct,30) << __func__ << " Primaries for pool: "
6107 << prim_pgs_by_osd << dendl;
6108
6109 if (pgs_by_osd.empty()) {
6110 //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id);
6111 return -EINVAL;
6112 }
6113 if (cct != nullptr) {
6114 for (auto& [osd,pgs] : prim_pgs_by_osd) {
6115 ldout(cct,20) << __func__ << " Pool " << pool_id << " OSD." << osd
6116 << " has " << pgs.size() << " primary PGs, "
6117 << acting_prims_by_osd[osd].size() << " acting primaries."
6118 << dendl;
6119 }
6120 }
6121
6122 auto num_osds = pgs_by_osd.size();
6123
6124 float avg_prims_per_osd = (float)num_pgs / (float)num_osds;
6125 uint64_t max_prims_per_osd = 0;
6126 uint64_t max_acting_prims_per_osd = 0;
6127 float max_osd_score = 0.;
6128 bool prim_on_zero_pa = false;
6129 bool acting_on_zero_pa = false;
6130
6131 float prim_affinity_sum = 0.;
6132 float total_osd_weight = 0.;
6133 float total_weighted_pa = 0.;
6134
6135 map<int,float> osds_crush_weight;
6136 // Set up the OSDMap
6137 int ruleno = tmp_osd_map.pools.at(pool_id).get_crush_rule();
6138 tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &osds_crush_weight);
6139
6140 if (cct != nullptr) {
6141 ldout(cct,20) << __func__ << " pool " << pool_id
6142 << " ruleno " << ruleno
6143 << " weight-map " << osds_crush_weight
6144 << dendl;
6145 }
6146 uint osd_pa_count = 0;
6147
6148 for (auto [osd, oweight] : osds_crush_weight) { // loop over all OSDs
6149 total_osd_weight += oweight;
6150 float osd_pa = tmp_osd_map.get_primary_affinityf(osd);
6151 total_weighted_pa += oweight * osd_pa;
6152 if (osd_pa != 0.) {
6153 osd_pa_count++;
6154 }
6155 if (prim_pgs_by_osd.count(osd)) {
6156 auto n_prims = prim_pgs_by_osd.at(osd).size();
6157 max_prims_per_osd = std::max(max_prims_per_osd, n_prims);
6158 if (osd_pa == 0.) {
6159 prim_on_zero_pa = true;
6160 }
6161 }
6162 if (acting_prims_by_osd.count(osd)) {
6163 auto n_aprims = acting_prims_by_osd.at(osd).size();
6164 max_acting_prims_per_osd = std::max(max_acting_prims_per_osd, n_aprims);
6165 if (osd_pa != 0.) {
6166 max_osd_score = std::max(max_osd_score, float(n_aprims) / osd_pa);
6167 }
6168 else {
6169 acting_on_zero_pa = true;
6170 }
6171 }
6172
6173 prim_affinity_sum += osd_pa;
6174 if (cct != nullptr) {
6175 auto np = prim_pgs_by_osd.count(osd) ? prim_pgs_by_osd.at(osd).size() : 0;
6176 auto nap = acting_prims_by_osd.count(osd) ? acting_prims_by_osd.at(osd).size() : 0;
6177 auto wt = osds_crush_weight.count(osd) ? osds_crush_weight.at(osd) : 0.;
6178 ldout(cct,30) << __func__ << " OSD." << osd << " info: "
6179 << " num_primaries " << np
6180 << " num_acting_prims " << nap
6181 << " prim_affinity " << tmp_osd_map.get_primary_affinityf(osd)
6182 << " weight " << wt
6183 << dendl;
6184 }
6185 }
6186 if (cct != nullptr) {
6187 ldout(cct,30) << __func__ << " pool " << pool_id
6188 << " total_osd_weight " << total_osd_weight
6189 << " total_weighted_pa " << total_weighted_pa
6190 << dendl;
6191 }
6192
6193 if (prim_affinity_sum == 0.0) {
6194 if (cct != nullptr) {
6195 ldout(cct, 10) << __func__ << " pool " << pool_id
6196 << " has primary_affinity set to zero on all OSDs" << dendl;
6197 }
6198 zero_rbi(*p_rbi);
6199 p_rbi->err_msg = fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id);
6200
6201 return -ERANGE; // score has a different meaning now.
6202 }
6203 else {
6204 max_osd_score *= prim_affinity_sum / num_osds;
6205 }
6206
6207 rc = tmp_osd_map.set_rbi(cct, *p_rbi, pool_id, total_weighted_pa,
6208 prim_affinity_sum, num_osds, osd_pa_count,
6209 total_osd_weight, max_prims_per_osd,
6210 max_acting_prims_per_osd, avg_prims_per_osd,
6211 prim_on_zero_pa, acting_on_zero_pa, max_osd_score);
6212
6213 if (cct != nullptr) {
6214 ldout(cct,30) << __func__ << " pool " << get_pool_name(pool_id)
6215 << " pa_avg " << p_rbi->pa_avg
6216 << " pa_weighted " << p_rbi->pa_weighted
6217 << " pa_weighted_avg " << p_rbi->pa_weighted_avg
6218 << " optimal_score " << p_rbi->optimal_score
6219 << " adjusted_score " << p_rbi->adjusted_score
6220 << " acting_adj_score " << p_rbi->acting_adj_score
6221 << dendl;
6222 ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id)
6223 << " raw_score: " << p_rbi->raw_score
6224 << " acting_raw_score: " << p_rbi->acting_raw_score
6225 << dendl;
6226 ldout(cct,10) << __func__ << " pool " << get_pool_name(pool_id)
6227 << " wl_score: " << p_rbi->acting_adj_score << dendl;
6228 }
6229
6230 return rc;
6231}
6232
31f18b77
FG
6233int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
6234{
6235 return crush->get_leaves(name, osds);
6236}
6237
3efd9988
FG
6238// get pools whose crush rules might reference the given osd
6239void OSDMap::get_pool_ids_by_osd(CephContext *cct,
6240 int osd,
6241 set<int64_t> *pool_ids) const
6242{
11fdf7f2 6243 ceph_assert(pool_ids);
3efd9988
FG
6244 set<int> raw_rules;
6245 int r = crush->get_rules_by_osd(osd, &raw_rules);
6246 if (r < 0) {
6247 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
6248 << dendl;
11fdf7f2 6249 ceph_assert(r >= 0);
3efd9988
FG
6250 }
6251 set<int> rules;
6252 for (auto &i: raw_rules) {
6253 // exclude any dead rule
6254 if (crush_rule_in_use(i)) {
6255 rules.insert(i);
6256 }
6257 }
6258 for (auto &r: rules) {
6259 get_pool_ids_by_rule(r, pool_ids);
6260 }
6261}
6262
31f18b77
FG
6263template <typename F>
6264class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
6265public:
6266 typedef CrushTreeDumper::Dumper<F> Parent;
6267
6268 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2 6269 const PGMap& pgmap_, bool tree_,
9f95a23c 6270 const string& filter) :
c07f9fc5 6271 Parent(crush, osdmap_->get_pool_names()),
31f18b77 6272 osdmap(osdmap_),
11fdf7f2 6273 pgmap(pgmap_),
31f18b77 6274 tree(tree_),
31f18b77
FG
6275 min_var(-1),
6276 max_var(-1),
6277 stddev(0),
6278 sum(0) {
9f95a23c
TL
6279 if (osdmap->crush->name_exists(filter)) {
6280 // filter by crush node
6281 auto item_id = osdmap->crush->get_item_id(filter);
11fdf7f2
TL
6282 allowed.insert(item_id);
6283 osdmap->crush->get_all_children(item_id, &allowed);
9f95a23c
TL
6284 } else if (osdmap->crush->class_exists(filter)) {
6285 // filter by device class
6286 class_id = osdmap->crush->get_class_id(filter);
6287 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
6288 pool_id >= 0) {
6289 // filter by pool
6290 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
6291 set<int> roots;
6292 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
6293 allowed = roots;
6294 for (auto r : roots)
6295 osdmap->crush->get_all_children(r, &allowed);
11fdf7f2
TL
6296 }
6297 average_util = average_utilization();
31f18b77
FG
6298 }
6299
6300protected:
11fdf7f2
TL
6301
6302 bool should_dump(int id) const {
6303 if (!allowed.empty() && !allowed.count(id)) // filter by name
6304 return false;
9f95a23c
TL
6305 if (id >= 0 && class_id >= 0) {
6306 auto item_class_id = osdmap->crush->get_item_class_id(id);
6307 if (item_class_id < 0 || // not bound to a class yet
6308 item_class_id != class_id) // or already bound to a different class
11fdf7f2
TL
6309 return false;
6310 }
6311 return true;
6312 }
6313
6314 set<int> get_dumped_osds() {
9f95a23c 6315 if (allowed.empty() && class_id < 0) {
11fdf7f2
TL
6316 // old way, all
6317 return {};
6318 }
6319 return dumped_osds;
6320 }
6321
31f18b77
FG
6322 void dump_stray(F *f) {
6323 for (int i = 0; i < osdmap->get_max_osd(); i++) {
6324 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 6325 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
6326 }
6327 }
6328
6329 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
f67539c2 6330 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
31f18b77 6331 return;
11fdf7f2
TL
6332 if (!should_dump(qi.id))
6333 return;
31f18b77 6334
11fdf7f2
TL
6335 if (!qi.is_bucket())
6336 dumped_osds.insert(qi.id);
31f18b77 6337 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
6338 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
6339 kb_used_meta = 0, kb_avail = 0;
31f18b77 6340 double util = 0;
11fdf7f2
TL
6341 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
6342 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
6343 if (kb_used && kb)
6344 util = 100.0 * (double)kb_used / (double)kb;
6345
6346 double var = 1.0;
6347 if (average_util)
6348 var = util / average_util;
6349
11fdf7f2 6350 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 6351
11fdf7f2
TL
6352 dump_item(qi, reweight, kb, kb_used,
6353 kb_used_data, kb_used_omap, kb_used_meta,
6354 kb_avail, util, var, num_pgs, f);
31f18b77
FG
6355
6356 if (!qi.is_bucket() && reweight > 0) {
6357 if (min_var < 0 || var < min_var)
6358 min_var = var;
6359 if (max_var < 0 || var > max_var)
6360 max_var = var;
6361
6362 double dev = util - average_util;
6363 dev *= dev;
6364 stddev += reweight * dev;
6365 sum += reweight;
6366 }
6367 }
6368
6369 virtual void dump_item(const CrushTreeDumper::Item &qi,
6370 float &reweight,
6371 int64_t kb,
6372 int64_t kb_used,
11fdf7f2
TL
6373 int64_t kb_used_data,
6374 int64_t kb_used_omap,
6375 int64_t kb_used_meta,
31f18b77
FG
6376 int64_t kb_avail,
6377 double& util,
6378 double& var,
6379 const size_t num_pgs,
6380 F *f) = 0;
6381
6382 double dev() {
6383 return sum > 0 ? sqrt(stddev / sum) : 0;
6384 }
6385
6386 double average_utilization() {
6387 int64_t kb = 0, kb_used = 0;
6388 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
6389 if (!osdmap->exists(i) ||
6390 osdmap->get_weight(i) == 0 ||
6391 !should_dump(i))
31f18b77 6392 continue;
11fdf7f2
TL
6393 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
6394 kb_avail_i;
6395 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
6396 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
6397 kb += kb_i;
6398 kb_used += kb_used_i;
6399 }
6400 }
6401 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
6402 }
6403
6404 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
6405 int64_t* kb_used_data,
6406 int64_t* kb_used_omap,
6407 int64_t* kb_used_meta,
31f18b77 6408 int64_t* kb_avail) const {
11fdf7f2 6409 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 6410 if (!p) return false;
11fdf7f2
TL
6411 *kb = p->statfs.kb();
6412 *kb_used = p->statfs.kb_used_raw();
6413 *kb_used_data = p->statfs.kb_used_data();
6414 *kb_used_omap = p->statfs.kb_used_omap();
6415 *kb_used_meta = p->statfs.kb_used_internal_metadata();
6416 *kb_avail = p->statfs.kb_avail();
6417
f67539c2 6418 return true;
31f18b77
FG
6419 }
6420
6421 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
6422 int64_t* kb_used_data,
6423 int64_t* kb_used_omap,
6424 int64_t* kb_used_meta,
31f18b77
FG
6425 int64_t* kb_avail) const {
6426 if (id >= 0) {
11fdf7f2 6427 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
6428 *kb = 0;
6429 *kb_used = 0;
11fdf7f2
TL
6430 *kb_used_data = 0;
6431 *kb_used_omap = 0;
6432 *kb_used_meta = 0;
31f18b77
FG
6433 *kb_avail = 0;
6434 return true;
6435 }
11fdf7f2
TL
6436 return get_osd_utilization(id, kb, kb_used, kb_used_data,
6437 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
6438 }
6439
6440 *kb = 0;
6441 *kb_used = 0;
11fdf7f2
TL
6442 *kb_used_data = 0;
6443 *kb_used_omap = 0;
6444 *kb_used_meta = 0;
31f18b77
FG
6445 *kb_avail = 0;
6446
6447 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
6448 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
6449 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
6450 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
6451 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
6452 &kb_used_data_i, &kb_used_omap_i,
6453 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
6454 return false;
6455 *kb += kb_i;
6456 *kb_used += kb_used_i;
11fdf7f2
TL
6457 *kb_used_data += kb_used_data_i;
6458 *kb_used_omap += kb_used_omap_i;
6459 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
6460 *kb_avail += kb_avail_i;
6461 }
f67539c2 6462 return true;
31f18b77
FG
6463 }
6464
6465protected:
6466 const OSDMap *osdmap;
11fdf7f2 6467 const PGMap& pgmap;
31f18b77
FG
6468 bool tree;
6469 double average_util;
6470 double min_var;
6471 double max_var;
6472 double stddev;
6473 double sum;
9f95a23c 6474 int class_id = -1;
11fdf7f2
TL
6475 set<int> allowed;
6476 set<int> dumped_osds;
31f18b77
FG
6477};
6478
6479
6480class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
6481public:
6482 typedef OSDUtilizationDumper<TextTable> Parent;
6483
6484 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 6485 const PGMap& pgmap, bool tree,
9f95a23c
TL
6486 const string& filter) :
6487 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
6488
6489 void dump(TextTable *tbl) {
6490 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 6491 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
6492 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
6493 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
6494 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
6495 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
6496 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
6497 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
6498 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
6499 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
6500 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
6501 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
6502 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 6503 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
6504 if (tree)
6505 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
6506
6507 Parent::dump(tbl);
6508
6509 dump_stray(tbl);
6510
11fdf7f2 6511 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
6512 *tbl << ""
6513 << ""
6514 << "" << "TOTAL"
11fdf7f2
TL
6515 << byte_u_t(sum.statfs.total)
6516 << byte_u_t(sum.statfs.get_used_raw())
6517 << byte_u_t(sum.statfs.allocated)
6518 << byte_u_t(sum.statfs.omap_allocated)
6519 << byte_u_t(sum.statfs.internal_metadata)
6520 << byte_u_t(sum.statfs.available)
31f18b77
FG
6521 << lowprecision_t(average_util)
6522 << ""
6523 << TextTable::endrow;
6524 }
6525
6526protected:
6527 struct lowprecision_t {
6528 float v;
6529 explicit lowprecision_t(float _v) : v(_v) {}
6530 };
6531 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
6532
6533 using OSDUtilizationDumper<TextTable>::dump_item;
6534 void dump_item(const CrushTreeDumper::Item &qi,
6535 float &reweight,
6536 int64_t kb,
6537 int64_t kb_used,
11fdf7f2
TL
6538 int64_t kb_used_data,
6539 int64_t kb_used_omap,
6540 int64_t kb_used_meta,
31f18b77
FG
6541 int64_t kb_avail,
6542 double& util,
6543 double& var,
6544 const size_t num_pgs,
6545 TextTable *tbl) override {
224ce89b
WB
6546 const char *c = crush->get_item_class(qi.id);
6547 if (!c)
6548 c = "";
31f18b77 6549 *tbl << qi.id
224ce89b 6550 << c
31f18b77
FG
6551 << weightf_t(qi.weight)
6552 << weightf_t(reweight)
1adf2230
AA
6553 << byte_u_t(kb << 10)
6554 << byte_u_t(kb_used << 10)
11fdf7f2
TL
6555 << byte_u_t(kb_used_data << 10)
6556 << byte_u_t(kb_used_omap << 10)
6557 << byte_u_t(kb_used_meta << 10)
1adf2230 6558 << byte_u_t(kb_avail << 10)
31f18b77
FG
6559 << lowprecision_t(util)
6560 << lowprecision_t(var);
6561
6562 if (qi.is_bucket()) {
6563 *tbl << "-";
11fdf7f2 6564 *tbl << "";
31f18b77
FG
6565 } else {
6566 *tbl << num_pgs;
11fdf7f2
TL
6567 if (osdmap->is_up(qi.id)) {
6568 *tbl << "up";
6569 } else if (osdmap->is_destroyed(qi.id)) {
6570 *tbl << "destroyed";
6571 } else {
6572 *tbl << "down";
6573 }
31f18b77
FG
6574 }
6575
6576 if (tree) {
6577 ostringstream name;
6578 for (int k = 0; k < qi.depth; k++)
6579 name << " ";
6580 if (qi.is_bucket()) {
6581 int type = crush->get_bucket_type(qi.id);
6582 name << crush->get_type_name(type) << " "
6583 << crush->get_item_name(qi.id);
6584 } else {
6585 name << "osd." << qi.id;
6586 }
6587 *tbl << name.str();
6588 }
6589
6590 *tbl << TextTable::endrow;
6591 }
6592
6593public:
6594 string summary() {
6595 ostringstream out;
6596 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
6597 << "/" << lowprecision_t(max_var) << " "
6598 << "STDDEV: " << lowprecision_t(dev());
6599 return out.str();
6600 }
6601};
6602
6603ostream& operator<<(ostream& out,
6604 const OSDUtilizationPlainDumper::lowprecision_t& v)
6605{
6606 if (v.v < -0.01) {
6607 return out << "-";
6608 } else if (v.v < 0.001) {
6609 return out << "0";
6610 } else {
6611 std::streamsize p = out.precision();
6612 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
6613 }
6614}
6615
6616class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
6617public:
6618 typedef OSDUtilizationDumper<Formatter> Parent;
6619
6620 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 6621 const PGMap& pgmap, bool tree,
9f95a23c
TL
6622 const string& filter) :
6623 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
6624
6625 void dump(Formatter *f) {
6626 f->open_array_section("nodes");
6627 Parent::dump(f);
6628 f->close_section();
6629
6630 f->open_array_section("stray");
6631 dump_stray(f);
6632 f->close_section();
6633 }
6634
6635protected:
6636 using OSDUtilizationDumper<Formatter>::dump_item;
6637 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
6638 float &reweight,
6639 int64_t kb,
6640 int64_t kb_used,
6641 int64_t kb_used_data,
6642 int64_t kb_used_omap,
6643 int64_t kb_used_meta,
6644 int64_t kb_avail,
6645 double& util,
6646 double& var,
6647 const size_t num_pgs,
6648 Formatter *f) override {
31f18b77 6649 f->open_object_section("item");
c07f9fc5 6650 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
6651 f->dump_float("reweight", reweight);
6652 f->dump_int("kb", kb);
6653 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
6654 f->dump_int("kb_used_data", kb_used_data);
6655 f->dump_int("kb_used_omap", kb_used_omap);
6656 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
6657 f->dump_int("kb_avail", kb_avail);
6658 f->dump_float("utilization", util);
6659 f->dump_float("var", var);
6660 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
6661 if (!qi.is_bucket()) {
6662 if (osdmap->is_up(qi.id)) {
6663 f->dump_string("status", "up");
6664 } else if (osdmap->is_destroyed(qi.id)) {
6665 f->dump_string("status", "destroyed");
6666 } else {
6667 f->dump_string("status", "down");
6668 }
6669 }
31f18b77
FG
6670 CrushTreeDumper::dump_bucket_children(crush, qi, f);
6671 f->close_section();
6672 }
6673
6674public:
6675 void summary(Formatter *f) {
6676 f->open_object_section("summary");
11fdf7f2
TL
6677 auto sum = pgmap.get_osd_sum(get_dumped_osds());
6678 auto& s = sum.statfs;
6679
6680 f->dump_int("total_kb", s.kb());
6681 f->dump_int("total_kb_used", s.kb_used_raw());
6682 f->dump_int("total_kb_used_data", s.kb_used_data());
6683 f->dump_int("total_kb_used_omap", s.kb_used_omap());
6684 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
6685 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
6686 f->dump_float("average_utilization", average_util);
6687 f->dump_float("min_var", min_var);
6688 f->dump_float("max_var", max_var);
6689 f->dump_float("dev", dev());
6690 f->close_section();
6691 }
6692};
6693
6694void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
6695 const PGMap& pgmap,
6696 ostream& out,
6697 Formatter *f,
6698 bool tree,
9f95a23c 6699 const string& filter)
31f18b77
FG
6700{
6701 const CrushWrapper *crush = osdmap.crush.get();
6702 if (f) {
6703 f->open_object_section("df");
9f95a23c 6704 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
6705 d.dump(f);
6706 d.summary(f);
6707 f->close_section();
6708 f->flush(out);
6709 } else {
9f95a23c 6710 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
6711 TextTable tbl;
6712 d.dump(&tbl);
6713 out << tbl << d.summary() << "\n";
6714 }
6715}
224ce89b 6716
92f5a8d4
TL
6717void OSDMap::check_health(CephContext *cct,
6718 health_check_map_t *checks) const
224ce89b
WB
6719{
6720 int num_osds = get_num_osds();
6721
6722 // OSD_DOWN
6723 // OSD_$subtree_DOWN
6724 // OSD_ORPHAN
6725 if (num_osds >= 0) {
6726 int num_in_osds = 0;
6727 int num_down_in_osds = 0;
6728 set<int> osds;
6729 set<int> down_in_osds;
6730 set<int> up_in_osds;
6731 set<int> subtree_up;
6732 unordered_map<int, set<int> > subtree_type_down;
6733 unordered_map<int, int> num_osds_subtree;
6734 int max_type = crush->get_max_type_id();
6735
6736 for (int i = 0; i < get_max_osd(); i++) {
6737 if (!exists(i)) {
6738 if (crush->item_exists(i)) {
6739 osds.insert(i);
6740 }
6741 continue;
6742 }
f67539c2 6743 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
224ce89b
WB
6744 continue;
6745 ++num_in_osds;
6746 if (down_in_osds.count(i) || up_in_osds.count(i))
6747 continue;
6748 if (!is_up(i)) {
6749 down_in_osds.insert(i);
6750 int parent_id = 0;
6751 int current = i;
6752 for (int type = 0; type <= max_type; type++) {
6753 if (!crush->get_type_name(type))
6754 continue;
6755 int r = crush->get_immediate_parent_id(current, &parent_id);
6756 if (r == -ENOENT)
6757 break;
6758 // break early if this parent is already marked as up
6759 if (subtree_up.count(parent_id))
6760 break;
6761 type = crush->get_bucket_type(parent_id);
6762 if (!subtree_type_is_down(
92f5a8d4 6763 cct, parent_id, type,
224ce89b
WB
6764 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
6765 break;
6766 current = parent_id;
6767 }
6768 }
6769 }
6770
6771 // calculate the number of down osds in each down subtree and
6772 // store it in num_osds_subtree
6773 for (int type = 1; type <= max_type; type++) {
6774 if (!crush->get_type_name(type))
6775 continue;
6776 for (auto j = subtree_type_down[type].begin();
6777 j != subtree_type_down[type].end();
6778 ++j) {
6779 list<int> children;
6780 int num = 0;
6781 int num_children = crush->get_children(*j, &children);
6782 if (num_children == 0)
6783 continue;
6784 for (auto l = children.begin(); l != children.end(); ++l) {
6785 if (*l >= 0) {
6786 ++num;
6787 } else if (num_osds_subtree[*l] > 0) {
6788 num = num + num_osds_subtree[*l];
6789 }
6790 }
6791 num_osds_subtree[*j] = num;
6792 }
6793 }
6794 num_down_in_osds = down_in_osds.size();
11fdf7f2 6795 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
6796 if (num_down_in_osds > 0) {
6797 // summary of down subtree types and osds
6798 for (int type = max_type; type > 0; type--) {
6799 if (!crush->get_type_name(type))
6800 continue;
6801 if (subtree_type_down[type].size() > 0) {
6802 ostringstream ss;
6803 ss << subtree_type_down[type].size() << " "
6804 << crush->get_type_name(type);
6805 if (subtree_type_down[type].size() > 1) {
6806 ss << "s";
6807 }
6808 int sum_down_osds = 0;
6809 for (auto j = subtree_type_down[type].begin();
6810 j != subtree_type_down[type].end();
6811 ++j) {
6812 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
6813 }
6814 ss << " (" << sum_down_osds << " osds) down";
6815 string err = string("OSD_") +
6816 string(crush->get_type_name(type)) + "_DOWN";
6817 boost::to_upper(err);
9f95a23c
TL
6818 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
6819 subtree_type_down[type].size());
224ce89b
WB
6820 for (auto j = subtree_type_down[type].rbegin();
6821 j != subtree_type_down[type].rend();
6822 ++j) {
6823 ostringstream ss;
6824 ss << crush->get_type_name(type);
6825 ss << " ";
6826 ss << crush->get_item_name(*j);
6827 // at the top level, do not print location
6828 if (type != max_type) {
6829 ss << " (";
6830 ss << crush->get_full_location_ordered_string(*j);
6831 ss << ")";
6832 }
6833 int num = num_osds_subtree[*j];
6834 ss << " (" << num << " osds)";
6835 ss << " is down";
6836 d.detail.push_back(ss.str());
6837 }
6838 }
6839 }
6840 ostringstream ss;
6841 ss << down_in_osds.size() << " osds down";
9f95a23c
TL
6842 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
6843 down_in_osds.size());
224ce89b
WB
6844 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
6845 ostringstream ss;
6846 ss << "osd." << *it << " (";
6847 ss << crush->get_full_location_ordered_string(*it);
6848 ss << ") is down";
6849 d.detail.push_back(ss.str());
6850 }
6851 }
6852
6853 if (!osds.empty()) {
6854 ostringstream ss;
6855 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
9f95a23c
TL
6856 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
6857 osds.size());
224ce89b
WB
6858 for (auto osd : osds) {
6859 ostringstream ss;
6860 ss << "osd." << osd << " exists in crush map but not in osdmap";
6861 d.detail.push_back(ss.str());
6862 }
6863 }
6864 }
6865
eafe8130
TL
6866 std::list<std::string> scrub_messages;
6867 bool noscrub = false, nodeepscrub = false;
6868 for (const auto &p : pools) {
6869 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
6870 ostringstream ss;
6871 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
6872 scrub_messages.push_back(ss.str());
6873 noscrub = true;
6874 }
6875 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
6876 ostringstream ss;
6877 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
6878 scrub_messages.push_back(ss.str());
6879 nodeepscrub = true;
6880 }
6881 }
6882 if (noscrub || nodeepscrub) {
6883 string out = "";
6884 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
6885 out += nodeepscrub ? "nodeep-scrub" : "";
6886 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
9f95a23c 6887 "Some pool(s) have the " + out + " flag(s) set", 0);
eafe8130
TL
6888 d.detail.splice(d.detail.end(), scrub_messages);
6889 }
6890
224ce89b
WB
6891 // OSD_OUT_OF_ORDER_FULL
6892 {
6893 // An osd could configure failsafe ratio, to something different
6894 // but for now assume it is the same here.
92f5a8d4 6895 float fsr = cct->_conf->osd_failsafe_full_ratio;
224ce89b
WB
6896 if (fsr > 1.0) fsr /= 100;
6897 float fr = get_full_ratio();
6898 float br = get_backfillfull_ratio();
6899 float nr = get_nearfull_ratio();
6900
6901 list<string> detail;
6902 // These checks correspond to how OSDService::check_full_status() in an OSD
6903 // handles the improper setting of these values.
6904 if (br < nr) {
6905 ostringstream ss;
6906 ss << "backfillfull_ratio (" << br
6907 << ") < nearfull_ratio (" << nr << "), increased";
6908 detail.push_back(ss.str());
6909 br = nr;
6910 }
6911 if (fr < br) {
6912 ostringstream ss;
6913 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
6914 << "), increased";
6915 detail.push_back(ss.str());
6916 fr = br;
6917 }
6918 if (fsr < fr) {
6919 ostringstream ss;
6920 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
6921 << "), increased";
6922 detail.push_back(ss.str());
6923 }
6924 if (!detail.empty()) {
6925 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
9f95a23c 6926 "full ratio(s) out of order", 0);
224ce89b
WB
6927 d.detail.swap(detail);
6928 }
6929 }
6930
6931 // OSD_FULL
6932 // OSD_NEARFULL
6933 // OSD_BACKFILLFULL
6934 // OSD_FAILSAFE_FULL
6935 {
6936 set<int> full, backfillfull, nearfull;
6937 get_full_osd_counts(&full, &backfillfull, &nearfull);
6938 if (full.size()) {
6939 ostringstream ss;
6940 ss << full.size() << " full osd(s)";
9f95a23c 6941 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
224ce89b
WB
6942 for (auto& i: full) {
6943 ostringstream ss;
6944 ss << "osd." << i << " is full";
6945 d.detail.push_back(ss.str());
6946 }
6947 }
6948 if (backfillfull.size()) {
6949 ostringstream ss;
6950 ss << backfillfull.size() << " backfillfull osd(s)";
9f95a23c
TL
6951 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
6952 backfillfull.size());
224ce89b
WB
6953 for (auto& i: backfillfull) {
6954 ostringstream ss;
6955 ss << "osd." << i << " is backfill full";
6956 d.detail.push_back(ss.str());
6957 }
6958 }
6959 if (nearfull.size()) {
6960 ostringstream ss;
6961 ss << nearfull.size() << " nearfull osd(s)";
9f95a23c 6962 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
224ce89b
WB
6963 for (auto& i: nearfull) {
6964 ostringstream ss;
6965 ss << "osd." << i << " is near full";
6966 d.detail.push_back(ss.str());
6967 }
6968 }
6969 }
6970
6971 // OSDMAP_FLAGS
6972 {
6973 // warn about flags
6974 uint64_t warn_flags =
224ce89b
WB
6975 CEPH_OSDMAP_PAUSERD |
6976 CEPH_OSDMAP_PAUSEWR |
6977 CEPH_OSDMAP_PAUSEREC |
6978 CEPH_OSDMAP_NOUP |
6979 CEPH_OSDMAP_NODOWN |
6980 CEPH_OSDMAP_NOIN |
6981 CEPH_OSDMAP_NOOUT |
6982 CEPH_OSDMAP_NOBACKFILL |
6983 CEPH_OSDMAP_NORECOVER |
6984 CEPH_OSDMAP_NOSCRUB |
6985 CEPH_OSDMAP_NODEEP_SCRUB |
6986 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 6987 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
6988 CEPH_OSDMAP_NOREBALANCE;
6989 if (test_flag(warn_flags)) {
6990 ostringstream ss;
9f95a23c
TL
6991 string s = get_flag_string(get_flags() & warn_flags);
6992 ss << s << " flag(s) set";
6993 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
6994 s.size() /* kludgey but sufficient */);
224ce89b
WB
6995 }
6996 }
6997
6998 // OSD_FLAGS
6999 {
7000 list<string> detail;
7001 const unsigned flags =
7002 CEPH_OSD_NOUP |
7003 CEPH_OSD_NOIN |
7004 CEPH_OSD_NODOWN |
7005 CEPH_OSD_NOOUT;
7006 for (int i = 0; i < max_osd; ++i) {
7007 if (osd_state[i] & flags) {
7008 ostringstream ss;
7009 set<string> states;
7010 OSDMap::calc_state_set(osd_state[i] & flags, states);
7011 ss << "osd." << i << " has flags " << states;
7012 detail.push_back(ss.str());
7013 }
7014 }
81eedcae
TL
7015 for (auto& i : crush_node_flags) {
7016 if (i.second && crush->item_exists(i.first)) {
7017 ostringstream ss;
7018 set<string> states;
7019 OSDMap::calc_state_set(i.second, states);
7020 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
7021 const char *tn = crush->get_type_name(t);
7022 ss << (tn ? tn : "node") << " "
7023 << crush->get_item_name(i.first) << " has flags " << states;
7024 detail.push_back(ss.str());
7025 }
7026 }
7027 for (auto& i : device_class_flags) {
7028 const char* class_name = crush->get_class_name(i.first);
7029 if (i.second && class_name) {
7030 ostringstream ss;
7031 set<string> states;
7032 OSDMap::calc_state_set(i.second, states);
7033 ss << "device class '" << class_name << "' has flags " << states;
7034 detail.push_back(ss.str());
7035 }
7036 }
224ce89b
WB
7037 if (!detail.empty()) {
7038 ostringstream ss;
81eedcae 7039 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
9f95a23c 7040 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
224ce89b
WB
7041 d.detail.swap(detail);
7042 }
7043 }
7044
7045 // OLD_CRUSH_TUNABLES
92f5a8d4 7046 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
224ce89b 7047 string min = crush->get_min_required_version();
92f5a8d4 7048 if (min < cct->_conf->mon_crush_min_required_version) {
224ce89b
WB
7049 ostringstream ss;
7050 ss << "crush map has legacy tunables (require " << min
92f5a8d4 7051 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
9f95a23c 7052 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
f67539c2 7053 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
7054 }
7055 }
7056
7057 // OLD_CRUSH_STRAW_CALC_VERSION
92f5a8d4 7058 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
7059 if (crush->get_straw_calc_version() == 0) {
7060 ostringstream ss;
7061 ss << "crush map has straw_calc_version=0";
9f95a23c 7062 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
224ce89b 7063 d.detail.push_back(
f67539c2 7064 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
7065 }
7066 }
7067
7068 // CACHE_POOL_NO_HIT_SET
92f5a8d4 7069 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b 7070 list<string> detail;
9f95a23c 7071 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
224ce89b
WB
7072 const pg_pool_t& info = p->second;
7073 if (info.cache_mode_requires_hit_set() &&
7074 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
7075 ostringstream ss;
7076 ss << "pool '" << get_pool_name(p->first)
7077 << "' with cache_mode " << info.get_cache_mode_name()
7078 << " needs hit_set_type to be set but it is not";
7079 detail.push_back(ss.str());
7080 }
7081 }
7082 if (!detail.empty()) {
7083 ostringstream ss;
7084 ss << detail.size() << " cache pools are missing hit_sets";
9f95a23c
TL
7085 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
7086 detail.size());
224ce89b
WB
7087 d.detail.swap(detail);
7088 }
7089 }
7090
7091 // OSD_NO_SORTBITWISE
11fdf7f2 7092 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 7093 ostringstream ss;
11fdf7f2 7094 ss << "'sortbitwise' flag is not set";
9f95a23c 7095 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
7096 }
7097
7098 // OSD_UPGRADE_FINISHED
20effc67
TL
7099 if (auto require_release = pending_require_osd_release()) {
7100 ostringstream ss;
7101 ss << "all OSDs are running " << *require_release << " or later but"
7102 << " require_osd_release < " << *require_release;
7103 auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
7104 d.detail.push_back(ss.str());
7105 }
224ce89b 7106
3efd9988 7107 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 7108 {
3efd9988 7109 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
7110 for (auto it : get_pools()) {
7111 const pg_pool_t &pool = it.second;
3efd9988 7112 const string& pool_name = get_pool_name(it.first);
224ce89b 7113 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 7114 stringstream ss;
11fdf7f2 7115 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
7116 // may run out of space too,
7117 // but we want EQUOTA taking precedence
11fdf7f2 7118 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
7119 } else {
7120 ss << "pool '" << pool_name << "' is full (no space)";
7121 }
7122 full_detail.push_back(ss.str());
7123 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
7124 stringstream ss;
7125 ss << "pool '" << pool_name << "' is backfillfull";
7126 backfillfull_detail.push_back(ss.str());
7127 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
7128 stringstream ss;
7129 ss << "pool '" << pool_name << "' is nearfull";
7130 nearfull_detail.push_back(ss.str());
224ce89b
WB
7131 }
7132 }
3efd9988 7133 if (!full_detail.empty()) {
224ce89b 7134 ostringstream ss;
3efd9988 7135 ss << full_detail.size() << " pool(s) full";
9f95a23c 7136 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
3efd9988
FG
7137 d.detail.swap(full_detail);
7138 }
7139 if (!backfillfull_detail.empty()) {
7140 ostringstream ss;
7141 ss << backfillfull_detail.size() << " pool(s) backfillfull";
9f95a23c
TL
7142 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
7143 backfillfull_detail.size());
3efd9988
FG
7144 d.detail.swap(backfillfull_detail);
7145 }
7146 if (!nearfull_detail.empty()) {
7147 ostringstream ss;
7148 ss << nearfull_detail.size() << " pool(s) nearfull";
9f95a23c
TL
7149 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
7150 nearfull_detail.size());
3efd9988 7151 d.detail.swap(nearfull_detail);
224ce89b
WB
7152 }
7153 }
92f5a8d4
TL
7154
7155 // POOL_PG_NUM_NOT_POWER_OF_TWO
7156 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
7157 list<string> detail;
7158 for (auto it : get_pools()) {
1e59de90 7159 if (!std::has_single_bit(it.second.get_pg_num_target())) {
92f5a8d4
TL
7160 ostringstream ss;
7161 ss << "pool '" << get_pool_name(it.first)
7162 << "' pg_num " << it.second.get_pg_num_target()
7163 << " is not a power of two";
7164 detail.push_back(ss.str());
7165 }
7166 }
7167 if (!detail.empty()) {
7168 ostringstream ss;
7169 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
7170 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
9f95a23c
TL
7171 ss.str(), detail.size());
7172 d.detail.swap(detail);
7173 }
7174 }
7175
7176 // POOL_NO_REDUNDANCY
7177 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
7178 {
7179 list<string> detail;
7180 for (auto it : get_pools()) {
7181 if (it.second.get_size() == 1) {
7182 ostringstream ss;
7183 ss << "pool '" << get_pool_name(it.first)
7184 << "' has no replicas configured";
7185 detail.push_back(ss.str());
7186 }
7187 }
7188 if (!detail.empty()) {
7189 ostringstream ss;
7190 ss << detail.size() << " pool(s) have no replicas configured";
7191 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
7192 ss.str(), detail.size());
92f5a8d4
TL
7193 d.detail.swap(detail);
7194 }
7195 }
f67539c2
TL
7196
7197 // DEGRADED STRETCH MODE
7198 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
7199 if (recovering_stretch_mode) {
7200 stringstream ss;
7201 ss << "We are recovering stretch mode buckets, only requiring "
7202 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
7203 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
7204 ss.str(), 0);
7205 } else if (degraded_stretch_mode) {
7206 stringstream ss;
7207 ss << "We are missing stretch mode buckets, only requiring "
7208 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
7209 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
7210 ss.str(), 0);
7211 }
7212 }
aee94f69
TL
7213 // UNEQUAL_WEIGHT
7214 if (stretch_mode_enabled) {
7215 vector<int> subtrees;
7216 crush->get_subtree_of_type(stretch_mode_bucket, &subtrees);
7217 if (subtrees.size() != 2) {
7218 stringstream ss;
7219 ss << "Stretch mode buckets != 2";
7220 checks->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
7221 return;
7222 }
7223 int weight1 = crush->get_item_weight(subtrees[0]);
7224 int weight2 = crush->get_item_weight(subtrees[1]);
7225 stringstream ss;
7226 if (weight1 != weight2) {
7227 ss << "Stretch mode buckets have different weights!";
7228 checks->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0);
7229 }
7230 }
224ce89b 7231}
35e4c445
FG
7232
7233int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
7234 ostream *ss) const
7235{
7236 out->clear();
7237 for (auto i = ls.begin(); i != ls.end(); ++i) {
7238 if (i == ls.begin() &&
7239 (*i == "any" || *i == "all" || *i == "*")) {
7240 get_all_osds(*out);
7241 break;
7242 }
1e59de90 7243 long osd = ceph::common::parse_osd_id(i->c_str(), ss);
35e4c445
FG
7244 if (osd < 0) {
7245 *ss << "invalid osd id '" << *i << "'";
7246 return -EINVAL;
7247 }
7248 out->insert(osd);
7249 }
7250 return 0;
7251}
11fdf7f2
TL
7252
7253void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
7254 string &subtree,
7255 int limit, // how many
7256 set<int> skip,
7257 set<int> *want) const {
7258 if (limit <= 0)
7259 return;
7260 int subtree_type = crush->get_type_id(subtree);
7261 if (subtree_type < 1)
7262 return;
7263 vector<int> subtrees;
7264 crush->get_subtree_of_type(subtree_type, &subtrees);
7265 std::random_device rd;
7266 std::default_random_engine rng{rd()};
7267 std::shuffle(subtrees.begin(), subtrees.end(), rng);
7268 for (auto s : subtrees) {
7269 if (limit <= 0)
7270 break;
7271 if (crush->subtree_contains(s, n))
7272 continue;
7273 vector<int> osds;
7274 crush->get_children_of_type(s, 0, &osds);
7275 if (osds.empty())
7276 continue;
7277 vector<int> up_osds;
7278 for (auto o : osds) {
7279 if (is_up(o) && !skip.count(o))
7280 up_osds.push_back(o);
7281 }
7282 if (up_osds.empty())
7283 continue;
7284 auto it = up_osds.begin();
7285 std::advance(it, (n % up_osds.size()));
7286 want->insert(*it);
7287 --limit;
7288 }
7289}
7290
7291float OSDMap::pool_raw_used_rate(int64_t poolid) const
7292{
7293 const pg_pool_t *pool = get_pg_pool(poolid);
7294 assert(pool != nullptr);
7295
7296 switch (pool->get_type()) {
7297 case pg_pool_t::TYPE_REPLICATED:
7298 return pool->get_size();
11fdf7f2
TL
7299 case pg_pool_t::TYPE_ERASURE:
7300 {
7301 auto& ecp =
7302 get_erasure_code_profile(pool->erasure_code_profile);
7303 auto pm = ecp.find("m");
7304 auto pk = ecp.find("k");
7305 if (pm != ecp.end() && pk != ecp.end()) {
7306 int k = atoi(pk->second.c_str());
7307 int m = atoi(pm->second.c_str());
7308 int mk = m + k;
7309 ceph_assert(mk != 0);
7310 ceph_assert(k != 0);
7311 return (float)mk / k;
7312 } else {
7313 return 0.0;
7314 }
7315 }
7316 break;
7317 default:
7318 ceph_abort_msg("unrecognized pool type");
7319 }
7320}
81eedcae
TL
7321
7322unsigned OSDMap::get_osd_crush_node_flags(int osd) const
7323{
7324 unsigned flags = 0;
7325 if (!crush_node_flags.empty()) {
7326 // the map will contain type -> name
7327 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
7328 for (auto& i : ploc) {
7329 int id = crush->get_item_id(i.second);
7330 auto p = crush_node_flags.find(id);
7331 if (p != crush_node_flags.end()) {
7332 flags |= p->second;
7333 }
7334 }
7335 }
7336 return flags;
7337}
7338
7339unsigned OSDMap::get_crush_node_flags(int id) const
7340{
7341 unsigned flags = 0;
7342 auto it = crush_node_flags.find(id);
7343 if (it != crush_node_flags.end())
7344 flags = it->second;
7345 return flags;
7346}
7347
7348unsigned OSDMap::get_device_class_flags(int id) const
7349{
7350 unsigned flags = 0;
7351 auto it = device_class_flags.find(id);
7352 if (it != device_class_flags.end())
7353 flags = it->second;
7354 return flags;
7355}
20effc67
TL
7356
7357std::optional<std::string> OSDMap::pending_require_osd_release() const
7358{
7359 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY) &&
7360 require_osd_release < ceph_release_t::quincy) {
7361 return "quincy";
7362 }
7363 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
7364 require_osd_release < ceph_release_t::pacific) {
7365 return "pacific";
7366 }
7367 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
7368 require_osd_release < ceph_release_t::octopus) {
7369 return "octopus";
7370 }
7371 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
7372 require_osd_release < ceph_release_t::nautilus) {
7373 return "nautilus";
7374 }
7375
7376 return std::nullopt;
7377}