]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
bump version to 16.2.6-pve2
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2
TL
18#include <algorithm>
19#include <optional>
20#include <random>
21
224ce89b
WB
22#include <boost/algorithm/string.hpp>
23
7c673cae 24#include "OSDMap.h"
7c673cae 25#include "common/config.h"
3efd9988 26#include "common/errno.h"
7c673cae
FG
27#include "common/Formatter.h"
28#include "common/TextTable.h"
29#include "include/ceph_features.h"
9f95a23c 30#include "include/common_fwd.h"
7c673cae
FG
31#include "include/str_map.h"
32
33#include "common/code_environment.h"
224ce89b 34#include "mon/health_check.h"
7c673cae
FG
35
36#include "crush/CrushTreeDumper.h"
37#include "common/Clock.h"
11fdf7f2
TL
38#include "mon/PGMap.h"
39
9f95a23c
TL
40using std::list;
41using std::make_pair;
42using std::map;
43using std::multimap;
44using std::ostream;
45using std::ostringstream;
46using std::pair;
47using std::set;
48using std::string;
49using std::stringstream;
50using std::unordered_map;
51using std::vector;
52
53using ceph::decode;
54using ceph::encode;
55using ceph::Formatter;
56
7c673cae
FG
57#define dout_subsys ceph_subsys_osd
58
59MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
60MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
61
62
63// ----------------------------------
64// osd_info_t
65
66void osd_info_t::dump(Formatter *f) const
67{
68 f->dump_int("last_clean_begin", last_clean_begin);
69 f->dump_int("last_clean_end", last_clean_end);
70 f->dump_int("up_from", up_from);
71 f->dump_int("up_thru", up_thru);
72 f->dump_int("down_at", down_at);
73 f->dump_int("lost_at", lost_at);
74}
75
9f95a23c 76void osd_info_t::encode(ceph::buffer::list& bl) const
7c673cae 77{
11fdf7f2 78 using ceph::encode;
7c673cae 79 __u8 struct_v = 1;
11fdf7f2
TL
80 encode(struct_v, bl);
81 encode(last_clean_begin, bl);
82 encode(last_clean_end, bl);
83 encode(up_from, bl);
84 encode(up_thru, bl);
85 encode(down_at, bl);
86 encode(lost_at, bl);
7c673cae
FG
87}
88
9f95a23c 89void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 90{
11fdf7f2 91 using ceph::decode;
7c673cae 92 __u8 struct_v;
11fdf7f2
TL
93 decode(struct_v, bl);
94 decode(last_clean_begin, bl);
95 decode(last_clean_end, bl);
96 decode(up_from, bl);
97 decode(up_thru, bl);
98 decode(down_at, bl);
99 decode(lost_at, bl);
7c673cae
FG
100}
101
102void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
103{
104 o.push_back(new osd_info_t);
105 o.push_back(new osd_info_t);
106 o.back()->last_clean_begin = 1;
107 o.back()->last_clean_end = 2;
108 o.back()->up_from = 30;
109 o.back()->up_thru = 40;
110 o.back()->down_at = 5;
111 o.back()->lost_at = 6;
112}
113
114ostream& operator<<(ostream& out, const osd_info_t& info)
115{
116 out << "up_from " << info.up_from
117 << " up_thru " << info.up_thru
118 << " down_at " << info.down_at
119 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
120 if (info.lost_at)
121 out << " lost_at " << info.lost_at;
122 return out;
123}
124
125// ----------------------------------
126// osd_xinfo_t
127
128void osd_xinfo_t::dump(Formatter *f) const
129{
130 f->dump_stream("down_stamp") << down_stamp;
131 f->dump_float("laggy_probability", laggy_probability);
132 f->dump_int("laggy_interval", laggy_interval);
133 f->dump_int("features", features);
134 f->dump_unsigned("old_weight", old_weight);
9f95a23c
TL
135 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
136 f->dump_int("dead_epoch", dead_epoch);
7c673cae
FG
137}
138
9f95a23c 139void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
7c673cae 140{
9f95a23c
TL
141 uint8_t v = 4;
142 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
143 v = 3;
144 }
145 ENCODE_START(v, 1, bl);
11fdf7f2 146 encode(down_stamp, bl);
f67539c2 147 __u32 lp = laggy_probability * float(0xfffffffful);
11fdf7f2
TL
148 encode(lp, bl);
149 encode(laggy_interval, bl);
150 encode(features, bl);
151 encode(old_weight, bl);
9f95a23c
TL
152 if (v >= 4) {
153 encode(last_purged_snaps_scrub, bl);
154 encode(dead_epoch, bl);
155 }
7c673cae
FG
156 ENCODE_FINISH(bl);
157}
158
9f95a23c 159void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 160{
9f95a23c 161 DECODE_START(4, bl);
11fdf7f2 162 decode(down_stamp, bl);
7c673cae 163 __u32 lp;
11fdf7f2 164 decode(lp, bl);
7c673cae 165 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 166 decode(laggy_interval, bl);
7c673cae 167 if (struct_v >= 2)
11fdf7f2 168 decode(features, bl);
7c673cae
FG
169 else
170 features = 0;
171 if (struct_v >= 3)
11fdf7f2 172 decode(old_weight, bl);
7c673cae
FG
173 else
174 old_weight = 0;
9f95a23c
TL
175 if (struct_v >= 4) {
176 decode(last_purged_snaps_scrub, bl);
177 decode(dead_epoch, bl);
178 } else {
179 dead_epoch = 0;
180 }
7c673cae
FG
181 DECODE_FINISH(bl);
182}
183
184void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
185{
186 o.push_back(new osd_xinfo_t);
187 o.push_back(new osd_xinfo_t);
188 o.back()->down_stamp = utime_t(2, 3);
189 o.back()->laggy_probability = .123;
190 o.back()->laggy_interval = 123456;
191 o.back()->old_weight = 0x7fff;
192}
193
194ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
195{
196 return out << "down_stamp " << xi.down_stamp
197 << " laggy_probability " << xi.laggy_probability
198 << " laggy_interval " << xi.laggy_interval
9f95a23c
TL
199 << " old_weight " << xi.old_weight
200 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
201 << " dead_epoch " << xi.dead_epoch;
7c673cae
FG
202}
203
204// ----------------------------------
205// OSDMap::Incremental
206
207int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
208{
209 int n = 0;
210 for (auto &weight : new_weight) {
211 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
212 n++; // marked out
213 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
214 n--; // marked in
215 }
216 return n;
217}
218
219int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
220{
221 int n = 0;
222 for (auto &state : new_state) { //
223 if (state.second & CEPH_OSD_UP) {
224 if (previous->is_up(state.first))
225 n++; // marked down
226 else
227 n--; // marked up
228 }
229 }
230 return n;
231}
232
233int OSDMap::Incremental::identify_osd(uuid_d u) const
234{
235 for (auto &uuid : new_uuid)
236 if (uuid.second == u)
237 return uuid.first;
238 return -1;
239}
240
f67539c2
TL
241int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
242 const OSDMap& osdmap)
7c673cae 243{
11fdf7f2 244 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
245
246 for (auto &new_pool : new_pools) {
247 if (!new_pool.second.tiers.empty()) {
248 pg_pool_t& base = new_pool.second;
249
11fdf7f2
TL
250 auto new_rem_it = new_removed_snaps.find(new_pool.first);
251
7c673cae
FG
252 for (const auto &tier_pool : base.tiers) {
253 const auto &r = new_pools.find(tier_pool);
254 pg_pool_t *tier = 0;
255 if (r == new_pools.end()) {
256 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
257 if (!orig) {
258 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
259 return -EIO;
260 }
261 tier = get_new_pool(tier_pool, orig);
262 } else {
263 tier = &r->second;
264 }
265 if (tier->tier_of != new_pool.first) {
266 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
267 return -EIO;
268 }
269
270 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
271 << tier_pool << dendl;
272 tier->snap_seq = base.snap_seq;
273 tier->snap_epoch = base.snap_epoch;
274 tier->snaps = base.snaps;
275 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
276 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
277 pg_pool_t::FLAG_POOL_SNAPS);
278
279 if (new_rem_it != new_removed_snaps.end()) {
280 new_removed_snaps[tier_pool] = new_rem_it->second;
281 }
f67539c2
TL
282
283 tier->application_metadata = base.application_metadata;
7c673cae
FG
284 }
285 }
286 }
287 return 0;
288}
289
28e407b8
AA
290// ----------------------------------
291// OSDMap
7c673cae
FG
292
293bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
294{
295 if (id >= 0)
296 return is_down(id);
297
298 if (down_cache &&
299 down_cache->count(id)) {
300 return true;
301 }
302
303 list<int> children;
304 crush->get_children(id, &children);
305 for (const auto &child : children) {
306 if (!subtree_is_down(child, down_cache)) {
307 return false;
308 }
309 }
310 if (down_cache) {
311 down_cache->insert(id);
312 }
313 return true;
314}
315
316bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
317{
318 // use a stack-local down_cache if we didn't get one from the
319 // caller. then at least this particular call will avoid duplicated
320 // work.
321 set<int> local_down_cache;
322 if (!down_cache) {
323 down_cache = &local_down_cache;
324 }
325
326 int current = id;
327 while (true) {
328 int type;
329 if (current >= 0) {
330 type = 0;
331 } else {
332 type = crush->get_bucket_type(current);
333 }
11fdf7f2 334 ceph_assert(type >= 0);
7c673cae
FG
335
336 if (!subtree_is_down(current, down_cache)) {
337 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
338 return false;
339 }
340
341 // is this a big enough subtree to be marked as down?
342 if (type >= subtree_type) {
343 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
344 return true;
345 }
346
347 int r = crush->get_immediate_parent_id(current, &current);
348 if (r < 0) {
349 return false;
350 }
351 }
352}
353
224ce89b
WB
354bool OSDMap::subtree_type_is_down(
355 CephContext *cct,
356 int id,
357 int subtree_type,
358 set<int> *down_in_osds,
359 set<int> *up_in_osds,
360 set<int> *subtree_up,
361 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
362{
363 if (id >= 0) {
364 bool is_down_ret = is_down(id);
365 if (!is_out(id)) {
366 if (is_down_ret) {
367 down_in_osds->insert(id);
368 } else {
369 up_in_osds->insert(id);
370 }
371 }
372 return is_down_ret;
373 }
374
375 if (subtree_type_down &&
376 (*subtree_type_down)[subtree_type].count(id)) {
377 return true;
378 }
379
380 list<int> children;
381 crush->get_children(id, &children);
382 for (const auto &child : children) {
224ce89b
WB
383 if (!subtree_type_is_down(
384 cct, child, crush->get_bucket_type(child),
385 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
386 subtree_up->insert(id);
387 return false;
388 }
389 }
390 if (subtree_type_down) {
391 (*subtree_type_down)[subtree_type].insert(id);
392 }
393 return true;
394}
395
9f95a23c 396void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
7c673cae 397{
11fdf7f2 398 using ceph::encode;
7c673cae 399 __u16 v = 5;
11fdf7f2
TL
400 encode(v, bl);
401 encode(fsid, bl);
402 encode(epoch, bl);
403 encode(modified, bl);
7c673cae 404 int32_t new_t = new_pool_max;
11fdf7f2
TL
405 encode(new_t, bl);
406 encode(new_flags, bl);
407 encode(fullmap, bl);
408 encode(crush, bl);
7c673cae 409
11fdf7f2
TL
410 encode(new_max_osd, bl);
411 // for encode(new_pools, bl);
7c673cae 412 __u32 n = new_pools.size();
11fdf7f2 413 encode(n, bl);
7c673cae
FG
414 for (const auto &new_pool : new_pools) {
415 n = new_pool.first;
11fdf7f2
TL
416 encode(n, bl);
417 encode(new_pool.second, bl, 0);
7c673cae 418 }
11fdf7f2 419 // for encode(new_pool_names, bl);
7c673cae 420 n = new_pool_names.size();
11fdf7f2 421 encode(n, bl);
7c673cae
FG
422
423 for (const auto &new_pool_name : new_pool_names) {
424 n = new_pool_name.first;
11fdf7f2
TL
425 encode(n, bl);
426 encode(new_pool_name.second, bl);
7c673cae 427 }
11fdf7f2 428 // for encode(old_pools, bl);
7c673cae 429 n = old_pools.size();
11fdf7f2 430 encode(n, bl);
7c673cae
FG
431 for (auto &old_pool : old_pools) {
432 n = old_pool;
11fdf7f2 433 encode(n, bl);
7c673cae 434 }
11fdf7f2 435 encode(new_up_client, bl, 0);
31f18b77
FG
436 {
437 // legacy is map<int32_t,uint8_t>
9f95a23c 438 map<int32_t, uint8_t> os;
31f18b77 439 for (auto p : new_state) {
9f95a23c
TL
440 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
441 // that an old client could not understand.
442 // skip those!
443 uint8_t s = p.second;
444 if (p.second != 0 && s == 0)
445 continue;
446 os[p.first] = s;
447 }
448 uint32_t n = os.size();
449 encode(n, bl);
450 for (auto p : os) {
11fdf7f2 451 encode(p.first, bl);
9f95a23c 452 encode(p.second, bl);
31f18b77
FG
453 }
454 }
11fdf7f2
TL
455 encode(new_weight, bl);
456 // for encode(new_pg_temp, bl);
7c673cae 457 n = new_pg_temp.size();
11fdf7f2 458 encode(n, bl);
7c673cae
FG
459
460 for (const auto &pg_temp : new_pg_temp) {
461 old_pg_t opg = pg_temp.first.get_old_pg();
11fdf7f2
TL
462 encode(opg, bl);
463 encode(pg_temp.second, bl);
7c673cae
FG
464 }
465}
466
9f95a23c 467void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 468{
11fdf7f2 469 using ceph::encode;
7c673cae
FG
470 if ((features & CEPH_FEATURE_PGID64) == 0) {
471 encode_client_old(bl);
472 return;
473 }
474
475 // base
476 __u16 v = 6;
11fdf7f2
TL
477 encode(v, bl);
478 encode(fsid, bl);
479 encode(epoch, bl);
480 encode(modified, bl);
481 encode(new_pool_max, bl);
482 encode(new_flags, bl);
483 encode(fullmap, bl);
484 encode(crush, bl);
485
486 encode(new_max_osd, bl);
487 encode(new_pools, bl, features);
488 encode(new_pool_names, bl);
489 encode(old_pools, bl);
490 encode(new_up_client, bl, features);
31f18b77 491 {
9f95a23c 492 map<int32_t, uint8_t> os;
31f18b77 493 for (auto p : new_state) {
9f95a23c
TL
494 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
495 // that an old client could not understand.
496 // skip those!
497 uint8_t s = p.second;
498 if (p.second != 0 && s == 0)
499 continue;
500 os[p.first] = s;
501 }
502 uint32_t n = os.size();
503 encode(n, bl);
504 for (auto p : os) {
11fdf7f2 505 encode(p.first, bl);
9f95a23c 506 encode(p.second, bl);
31f18b77
FG
507 }
508 }
11fdf7f2
TL
509 encode(new_weight, bl);
510 encode(new_pg_temp, bl);
7c673cae
FG
511
512 // extended
513 __u16 ev = 10;
11fdf7f2
TL
514 encode(ev, bl);
515 encode(new_hb_back_up, bl, features);
516 encode(new_up_thru, bl);
517 encode(new_last_clean_interval, bl);
518 encode(new_lost, bl);
f67539c2
TL
519 encode(new_blocklist, bl, features);
520 encode(old_blocklist, bl, features);
11fdf7f2
TL
521 encode(new_up_cluster, bl, features);
522 encode(cluster_snapshot, bl);
523 encode(new_uuid, bl);
9f95a23c 524 encode(new_xinfo, bl, features);
11fdf7f2
TL
525 encode(new_hb_front_up, bl, features);
526}
527
528template<class T>
9f95a23c 529static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
530{
531 uint32_t n = m.size();
532 encode(n, bl);
533 for (auto& i : m) {
534 encode(i.first, bl);
535 encode(i.second.legacy_addr(), bl, f);
536 }
537}
538
539template<class T>
9f95a23c 540static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
541{
542 uint32_t n = m.size();
543 encode(n, bl);
544 for (auto& i : m) {
545 if (i) {
546 encode(i->legacy_addr(), bl, f);
547 } else {
548 encode(entity_addr_t(), bl, f);
549 }
550 }
7c673cae
FG
551}
552
11fdf7f2
TL
553/* for a description of osdmap incremental versions, and when they were
554 * introduced, please refer to
555 * doc/dev/osd_internals/osdmap_versions.txt
556 */
9f95a23c 557void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 558{
11fdf7f2 559 using ceph::encode;
7c673cae
FG
560 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
561 encode_classic(bl, features);
562 return;
563 }
564
565 // only a select set of callers should *ever* be encoding new
566 // OSDMaps. others should be passing around the canonical encoded
567 // buffers from on high. select out those callers by passing in an
568 // "impossible" feature bit.
11fdf7f2 569 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
570 features &= ~CEPH_FEATURE_RESERVED;
571
572 size_t start_offset = bl.length();
573 size_t tail_offset;
11fdf7f2 574 size_t crc_offset;
9f95a23c 575 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
576
577 // meta-encoding: how we include client-used and osd-specific data
578 ENCODE_START(8, 7, bl);
579
580 {
11fdf7f2 581 uint8_t v = 8;
7c673cae
FG
582 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
583 v = 3;
11fdf7f2
TL
584 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
585 v = 5;
586 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
587 v = 6;
7c673cae
FG
588 }
589 ENCODE_START(v, 1, bl); // client-usable data
11fdf7f2
TL
590 encode(fsid, bl);
591 encode(epoch, bl);
592 encode(modified, bl);
593 encode(new_pool_max, bl);
594 encode(new_flags, bl);
595 encode(fullmap, bl);
596 encode(crush, bl);
597
598 encode(new_max_osd, bl);
599 encode(new_pools, bl, features);
600 encode(new_pool_names, bl);
601 encode(old_pools, bl);
602 if (v >= 7) {
603 encode(new_up_client, bl, features);
604 } else {
605 encode_addrvec_map_as_addr(new_up_client, bl, features);
606 }
31f18b77 607 if (v >= 5) {
11fdf7f2 608 encode(new_state, bl);
31f18b77 609 } else {
9f95a23c 610 map<int32_t, uint8_t> os;
31f18b77 611 for (auto p : new_state) {
9f95a23c
TL
612 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
613 // that an old client could not understand.
614 // skip those!
615 uint8_t s = p.second;
616 if (p.second != 0 && s == 0)
617 continue;
618 os[p.first] = s;
619 }
620 uint32_t n = os.size();
621 encode(n, bl);
622 for (auto p : os) {
623 encode(p.first, bl);
624 encode(p.second, bl);
31f18b77
FG
625 }
626 }
11fdf7f2
TL
627 encode(new_weight, bl);
628 encode(new_pg_temp, bl);
629 encode(new_primary_temp, bl);
630 encode(new_primary_affinity, bl);
631 encode(new_erasure_code_profiles, bl);
632 encode(old_erasure_code_profiles, bl);
7c673cae 633 if (v >= 4) {
11fdf7f2
TL
634 encode(new_pg_upmap, bl);
635 encode(old_pg_upmap, bl);
636 encode(new_pg_upmap_items, bl);
637 encode(old_pg_upmap_items, bl);
638 }
639 if (v >= 6) {
640 encode(new_removed_snaps, bl);
641 encode(new_purged_snaps, bl);
642 }
643 if (v >= 8) {
644 encode(new_last_up_change, bl);
645 encode(new_last_in_change, bl);
7c673cae
FG
646 }
647 ENCODE_FINISH(bl); // client-usable data
648 }
649
650 {
f67539c2 651 uint8_t target_v = 9; // if bumping this, be aware of stretch_mode target_v 10!
7c673cae
FG
652 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
653 target_v = 2;
11fdf7f2
TL
654 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
655 target_v = 6;
7c673cae 656 }
f67539c2 657 if (change_stretch_mode) {
f67539c2
TL
658 target_v = std::max((uint8_t)10, target_v);
659 }
7c673cae 660 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
661 if (target_v < 7) {
662 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
663 } else {
664 encode(new_hb_back_up, bl, features);
665 }
666 encode(new_up_thru, bl);
667 encode(new_last_clean_interval, bl);
668 encode(new_lost, bl);
f67539c2
TL
669 encode(new_blocklist, bl, features);
670 encode(old_blocklist, bl, features);
11fdf7f2
TL
671 if (target_v < 7) {
672 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
673 } else {
674 encode(new_up_cluster, bl, features);
675 }
676 encode(cluster_snapshot, bl);
677 encode(new_uuid, bl);
9f95a23c 678 encode(new_xinfo, bl, features);
11fdf7f2
TL
679 if (target_v < 7) {
680 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
681 } else {
682 encode(new_hb_front_up, bl, features);
683 }
684 encode(features, bl); // NOTE: features arg, not the member
7c673cae 685 if (target_v >= 3) {
11fdf7f2
TL
686 encode(new_nearfull_ratio, bl);
687 encode(new_full_ratio, bl);
688 encode(new_backfillfull_ratio, bl);
31f18b77
FG
689 }
690 // 5 was string-based new_require_min_compat_client
691 if (target_v >= 6) {
11fdf7f2
TL
692 encode(new_require_min_compat_client, bl);
693 encode(new_require_osd_release, bl);
7c673cae 694 }
81eedcae
TL
695 if (target_v >= 8) {
696 encode(new_crush_node_flags, bl);
697 }
698 if (target_v >= 9) {
699 encode(new_device_class_flags, bl);
700 }
f67539c2
TL
701 if (target_v >= 10) {
702 encode(change_stretch_mode, bl);
703 encode(new_stretch_bucket_count, bl);
704 encode(new_degraded_stretch_mode, bl);
705 encode(new_recovering_stretch_mode, bl);
706 encode(new_stretch_mode_bucket, bl);
707 encode(stretch_mode_enabled, bl);
708 }
7c673cae
FG
709 ENCODE_FINISH(bl); // osd-only data
710 }
711
11fdf7f2
TL
712 crc_offset = bl.length();
713 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
714 tail_offset = bl.length();
715
11fdf7f2 716 encode(full_crc, bl);
7c673cae
FG
717
718 ENCODE_FINISH(bl); // meta-encoding wrapper
719
720 // fill in crc
9f95a23c 721 ceph::buffer::list front;
11fdf7f2 722 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae 723 inc_crc = front.crc32c(-1);
9f95a23c 724 ceph::buffer::list tail;
7c673cae
FG
725 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
726 inc_crc = tail.crc32c(inc_crc);
727 ceph_le32 crc_le;
728 crc_le = inc_crc;
11fdf7f2 729 crc_filler->copy_in(4u, (char*)&crc_le);
7c673cae
FG
730 have_crc = true;
731}
732
9f95a23c 733void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
7c673cae 734{
11fdf7f2 735 using ceph::decode;
7c673cae
FG
736 __u32 n, t;
737 // base
738 __u16 v;
11fdf7f2
TL
739 decode(v, p);
740 decode(fsid, p);
741 decode(epoch, p);
742 decode(modified, p);
7c673cae 743 if (v == 4 || v == 5) {
11fdf7f2 744 decode(n, p);
7c673cae
FG
745 new_pool_max = n;
746 } else if (v >= 6)
11fdf7f2
TL
747 decode(new_pool_max, p);
748 decode(new_flags, p);
749 decode(fullmap, p);
750 decode(crush, p);
7c673cae 751
11fdf7f2 752 decode(new_max_osd, p);
7c673cae
FG
753 if (v < 6) {
754 new_pools.clear();
11fdf7f2 755 decode(n, p);
7c673cae 756 while (n--) {
11fdf7f2
TL
757 decode(t, p);
758 decode(new_pools[t], p);
7c673cae
FG
759 }
760 } else {
11fdf7f2 761 decode(new_pools, p);
7c673cae
FG
762 }
763 if (v == 5) {
764 new_pool_names.clear();
11fdf7f2 765 decode(n, p);
7c673cae 766 while (n--) {
11fdf7f2
TL
767 decode(t, p);
768 decode(new_pool_names[t], p);
7c673cae
FG
769 }
770 } else if (v >= 6) {
11fdf7f2 771 decode(new_pool_names, p);
7c673cae
FG
772 }
773 if (v < 6) {
774 old_pools.clear();
11fdf7f2 775 decode(n, p);
7c673cae 776 while (n--) {
11fdf7f2 777 decode(t, p);
7c673cae
FG
778 old_pools.insert(t);
779 }
780 } else {
11fdf7f2 781 decode(old_pools, p);
7c673cae 782 }
11fdf7f2 783 decode(new_up_client, p);
31f18b77
FG
784 {
785 map<int32_t,uint8_t> ns;
11fdf7f2 786 decode(ns, p);
31f18b77
FG
787 for (auto q : ns) {
788 new_state[q.first] = q.second;
789 }
790 }
11fdf7f2 791 decode(new_weight, p);
7c673cae
FG
792
793 if (v < 6) {
794 new_pg_temp.clear();
11fdf7f2 795 decode(n, p);
7c673cae
FG
796 while (n--) {
797 old_pg_t opg;
9f95a23c 798 ceph::decode_raw(opg, p);
11fdf7f2 799 decode(new_pg_temp[pg_t(opg)], p);
7c673cae
FG
800 }
801 } else {
11fdf7f2 802 decode(new_pg_temp, p);
7c673cae
FG
803 }
804
805 // decode short map, too.
806 if (v == 5 && p.end())
807 return;
808
809 // extended
810 __u16 ev = 0;
811 if (v >= 5)
11fdf7f2
TL
812 decode(ev, p);
813 decode(new_hb_back_up, p);
7c673cae 814 if (v < 5)
11fdf7f2
TL
815 decode(new_pool_names, p);
816 decode(new_up_thru, p);
817 decode(new_last_clean_interval, p);
818 decode(new_lost, p);
f67539c2
TL
819 decode(new_blocklist, p);
820 decode(old_blocklist, p);
7c673cae 821 if (ev >= 6)
11fdf7f2 822 decode(new_up_cluster, p);
7c673cae 823 if (ev >= 7)
11fdf7f2 824 decode(cluster_snapshot, p);
7c673cae 825 if (ev >= 8)
11fdf7f2 826 decode(new_uuid, p);
7c673cae 827 if (ev >= 9)
11fdf7f2 828 decode(new_xinfo, p);
7c673cae 829 if (ev >= 10)
11fdf7f2 830 decode(new_hb_front_up, p);
7c673cae
FG
831}
832
11fdf7f2
TL
833/* for a description of osdmap incremental versions, and when they were
834 * introduced, please refer to
835 * doc/dev/osd_internals/osdmap_versions.txt
836 */
9f95a23c 837void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 838{
11fdf7f2 839 using ceph::decode;
7c673cae
FG
840 /**
841 * Older encodings of the Incremental had a single struct_v which
842 * covered the whole encoding, and was prior to our modern
843 * stuff which includes a compatv and a size. So if we see
844 * a struct_v < 7, we must rewind to the beginning and use our
845 * classic decoder.
846 */
847 size_t start_offset = bl.get_off();
848 size_t tail_offset = 0;
9f95a23c 849 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
850
851 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
852 if (struct_v < 7) {
11fdf7f2 853 bl.seek(start_offset);
7c673cae
FG
854 decode_classic(bl);
855 encode_features = 0;
856 if (struct_v >= 6)
857 encode_features = CEPH_FEATURE_PGID64;
858 else
859 encode_features = 0;
860 return;
861 }
862 {
11fdf7f2
TL
863 DECODE_START(8, bl); // client-usable data
864 decode(fsid, bl);
865 decode(epoch, bl);
866 decode(modified, bl);
867 decode(new_pool_max, bl);
868 decode(new_flags, bl);
869 decode(fullmap, bl);
870 decode(crush, bl);
871
872 decode(new_max_osd, bl);
873 decode(new_pools, bl);
874 decode(new_pool_names, bl);
875 decode(old_pools, bl);
876 decode(new_up_client, bl);
31f18b77 877 if (struct_v >= 5) {
11fdf7f2 878 decode(new_state, bl);
31f18b77
FG
879 } else {
880 map<int32_t,uint8_t> ns;
11fdf7f2 881 decode(ns, bl);
31f18b77
FG
882 for (auto q : ns) {
883 new_state[q.first] = q.second;
884 }
885 }
11fdf7f2
TL
886 decode(new_weight, bl);
887 decode(new_pg_temp, bl);
888 decode(new_primary_temp, bl);
7c673cae 889 if (struct_v >= 2)
11fdf7f2 890 decode(new_primary_affinity, bl);
7c673cae
FG
891 else
892 new_primary_affinity.clear();
893 if (struct_v >= 3) {
11fdf7f2
TL
894 decode(new_erasure_code_profiles, bl);
895 decode(old_erasure_code_profiles, bl);
7c673cae
FG
896 } else {
897 new_erasure_code_profiles.clear();
898 old_erasure_code_profiles.clear();
899 }
900 if (struct_v >= 4) {
11fdf7f2
TL
901 decode(new_pg_upmap, bl);
902 decode(old_pg_upmap, bl);
903 decode(new_pg_upmap_items, bl);
904 decode(old_pg_upmap_items, bl);
905 }
906 if (struct_v >= 6) {
907 decode(new_removed_snaps, bl);
908 decode(new_purged_snaps, bl);
909 }
910 if (struct_v >= 8) {
911 decode(new_last_up_change, bl);
912 decode(new_last_in_change, bl);
7c673cae
FG
913 }
914 DECODE_FINISH(bl); // client-usable data
915 }
916
917 {
f67539c2 918 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
919 decode(new_hb_back_up, bl);
920 decode(new_up_thru, bl);
921 decode(new_last_clean_interval, bl);
922 decode(new_lost, bl);
f67539c2
TL
923 decode(new_blocklist, bl);
924 decode(old_blocklist, bl);
11fdf7f2
TL
925 decode(new_up_cluster, bl);
926 decode(cluster_snapshot, bl);
927 decode(new_uuid, bl);
928 decode(new_xinfo, bl);
929 decode(new_hb_front_up, bl);
7c673cae 930 if (struct_v >= 2)
11fdf7f2 931 decode(encode_features, bl);
7c673cae
FG
932 else
933 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
934 if (struct_v >= 3) {
11fdf7f2
TL
935 decode(new_nearfull_ratio, bl);
936 decode(new_full_ratio, bl);
7c673cae
FG
937 } else {
938 new_nearfull_ratio = -1;
939 new_full_ratio = -1;
940 }
941 if (struct_v >= 4) {
11fdf7f2 942 decode(new_backfillfull_ratio, bl);
7c673cae
FG
943 } else {
944 new_backfillfull_ratio = -1;
945 }
31f18b77
FG
946 if (struct_v == 5) {
947 string r;
11fdf7f2 948 decode(r, bl);
31f18b77 949 if (r.length()) {
9f95a23c 950 new_require_min_compat_client = ceph_release_from_name(r);
31f18b77
FG
951 }
952 }
953 if (struct_v >= 6) {
11fdf7f2
TL
954 decode(new_require_min_compat_client, bl);
955 decode(new_require_osd_release, bl);
31f18b77
FG
956 } else {
957 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
958 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 959 new_require_osd_release = ceph_release_t::luminous;
31f18b77
FG
960 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
961 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
9f95a23c 962 new_require_osd_release = ceph_release_t::kraken;
31f18b77 963 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
9f95a23c 964 new_require_osd_release = ceph_release_t::jewel;
31f18b77 965 } else {
9f95a23c 966 new_require_osd_release = ceph_release_t::unknown;
31f18b77
FG
967 }
968 }
81eedcae
TL
969 if (struct_v >= 8) {
970 decode(new_crush_node_flags, bl);
971 }
972 if (struct_v >= 9) {
973 decode(new_device_class_flags, bl);
974 }
f67539c2
TL
975 if (struct_v >= 10) {
976 decode(change_stretch_mode, bl);
977 decode(new_stretch_bucket_count, bl);
978 decode(new_degraded_stretch_mode, bl);
979 decode(new_recovering_stretch_mode, bl);
980 decode(new_stretch_mode_bucket, bl);
981 decode(stretch_mode_enabled, bl);
982 }
983
7c673cae
FG
984 DECODE_FINISH(bl); // osd-only data
985 }
986
987 if (struct_v >= 8) {
988 have_crc = true;
989 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 990 decode(inc_crc, bl);
7c673cae 991 tail_offset = bl.get_off();
11fdf7f2 992 decode(full_crc, bl);
7c673cae
FG
993 } else {
994 have_crc = false;
995 full_crc = 0;
996 inc_crc = 0;
997 }
998
999 DECODE_FINISH(bl); // wrapper
1000
1001 if (have_crc) {
1002 // verify crc
1003 uint32_t actual = crc_front.crc32c(-1);
1004 if (tail_offset < bl.get_off()) {
9f95a23c 1005 ceph::buffer::list tail;
7c673cae
FG
1006 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
1007 actual = tail.crc32c(actual);
1008 }
1009 if (inc_crc != actual) {
1010 ostringstream ss;
1011 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
1012 string s = ss.str();
9f95a23c 1013 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
1014 }
1015 }
1016}
1017
1018void OSDMap::Incremental::dump(Formatter *f) const
1019{
1020 f->dump_int("epoch", epoch);
1021 f->dump_stream("fsid") << fsid;
1022 f->dump_stream("modified") << modified;
11fdf7f2
TL
1023 f->dump_stream("new_last_up_change") << new_last_up_change;
1024 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
1025 f->dump_int("new_pool_max", new_pool_max);
1026 f->dump_int("new_flags", new_flags);
1027 f->dump_float("new_full_ratio", new_full_ratio);
1028 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1029 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
f67539c2
TL
1030 f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
1031 f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
7c673cae
FG
1032
1033 if (fullmap.length()) {
1034 f->open_object_section("full_map");
1035 OSDMap full;
9f95a23c 1036 ceph::buffer::list fbl = fullmap; // kludge around constness.
11fdf7f2 1037 auto p = fbl.cbegin();
7c673cae
FG
1038 full.decode(p);
1039 full.dump(f);
1040 f->close_section();
1041 }
1042 if (crush.length()) {
1043 f->open_object_section("crush");
1044 CrushWrapper c;
9f95a23c 1045 ceph::buffer::list tbl = crush; // kludge around constness.
11fdf7f2 1046 auto p = tbl.cbegin();
7c673cae
FG
1047 c.decode(p);
1048 c.dump(f);
1049 f->close_section();
1050 }
1051
1052 f->dump_int("new_max_osd", new_max_osd);
1053
1054 f->open_array_section("new_pools");
1055
1056 for (const auto &new_pool : new_pools) {
1057 f->open_object_section("pool");
1058 f->dump_int("pool", new_pool.first);
1059 new_pool.second.dump(f);
1060 f->close_section();
1061 }
1062 f->close_section();
1063 f->open_array_section("new_pool_names");
1064
1065 for (const auto &new_pool_name : new_pool_names) {
1066 f->open_object_section("pool_name");
1067 f->dump_int("pool", new_pool_name.first);
1068 f->dump_string("name", new_pool_name.second);
1069 f->close_section();
1070 }
1071 f->close_section();
1072 f->open_array_section("old_pools");
1073
1074 for (const auto &old_pool : old_pools)
1075 f->dump_int("pool", old_pool);
1076 f->close_section();
1077
1078 f->open_array_section("new_up_osds");
1079
1080 for (const auto &upclient : new_up_client) {
1081 f->open_object_section("osd");
1082 f->dump_int("osd", upclient.first);
11fdf7f2
TL
1083 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1084 f->dump_object("public_addrs", upclient.second);
1085 if (auto p = new_up_cluster.find(upclient.first);
1086 p != new_up_cluster.end()) {
1087 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1088 f->dump_object("cluster_addrs", p->second);
1089 }
1090 if (auto p = new_hb_back_up.find(upclient.first);
1091 p != new_hb_back_up.end()) {
1092 f->dump_object("heartbeat_back_addrs", p->second);
1093 }
1094 if (auto p = new_hb_front_up.find(upclient.first);
1095 p != new_hb_front_up.end()) {
1096 f->dump_object("heartbeat_front_addrs", p->second);
1097 }
7c673cae
FG
1098 f->close_section();
1099 }
1100 f->close_section();
1101
1102 f->open_array_section("new_weight");
1103
1104 for (const auto &weight : new_weight) {
1105 f->open_object_section("osd");
1106 f->dump_int("osd", weight.first);
1107 f->dump_int("weight", weight.second);
1108 f->close_section();
1109 }
1110 f->close_section();
1111
1112 f->open_array_section("osd_state_xor");
1113 for (const auto &ns : new_state) {
1114 f->open_object_section("osd");
1115 f->dump_int("osd", ns.first);
1116 set<string> st;
1117 calc_state_set(new_state.find(ns.first)->second, st);
1118 f->open_array_section("state_xor");
1119 for (auto &state : st)
1120 f->dump_string("state", state);
1121 f->close_section();
c07f9fc5 1122 f->close_section();
7c673cae
FG
1123 }
1124 f->close_section();
1125
1126 f->open_array_section("new_pg_temp");
1127
1128 for (const auto &pg_temp : new_pg_temp) {
1129 f->open_object_section("pg");
1130 f->dump_stream("pgid") << pg_temp.first;
1131 f->open_array_section("osds");
1132
1133 for (const auto &osd : pg_temp.second)
1134 f->dump_int("osd", osd);
1135 f->close_section();
1136 f->close_section();
1137 }
1138 f->close_section();
1139
1140 f->open_array_section("primary_temp");
1141
1142 for (const auto &primary_temp : new_primary_temp) {
1143 f->dump_stream("pgid") << primary_temp.first;
1144 f->dump_int("osd", primary_temp.second);
1145 }
1146 f->close_section(); // primary_temp
1147
1148 f->open_array_section("new_pg_upmap");
1149 for (auto& i : new_pg_upmap) {
1150 f->open_object_section("mapping");
1151 f->dump_stream("pgid") << i.first;
1152 f->open_array_section("osds");
1153 for (auto osd : i.second) {
1154 f->dump_int("osd", osd);
1155 }
1156 f->close_section();
1157 f->close_section();
1158 }
1159 f->close_section();
1160 f->open_array_section("old_pg_upmap");
1161 for (auto& i : old_pg_upmap) {
1162 f->dump_stream("pgid") << i;
1163 }
1164 f->close_section();
1165
1166 f->open_array_section("new_pg_upmap_items");
1167 for (auto& i : new_pg_upmap_items) {
1168 f->open_object_section("mapping");
1169 f->dump_stream("pgid") << i.first;
1170 f->open_array_section("mappings");
1171 for (auto& p : i.second) {
1172 f->open_object_section("mapping");
1173 f->dump_int("from", p.first);
1174 f->dump_int("to", p.second);
1175 f->close_section();
1176 }
1177 f->close_section();
1178 f->close_section();
1179 }
1180 f->close_section();
1181 f->open_array_section("old_pg_upmap_items");
1182 for (auto& i : old_pg_upmap_items) {
1183 f->dump_stream("pgid") << i;
1184 }
1185 f->close_section();
1186
1187 f->open_array_section("new_up_thru");
1188
1189 for (const auto &up_thru : new_up_thru) {
1190 f->open_object_section("osd");
1191 f->dump_int("osd", up_thru.first);
1192 f->dump_int("up_thru", up_thru.second);
1193 f->close_section();
1194 }
1195 f->close_section();
1196
1197 f->open_array_section("new_lost");
1198
1199 for (const auto &lost : new_lost) {
1200 f->open_object_section("osd");
1201 f->dump_int("osd", lost.first);
1202 f->dump_int("epoch_lost", lost.second);
1203 f->close_section();
1204 }
1205 f->close_section();
1206
1207 f->open_array_section("new_last_clean_interval");
1208
1209 for (const auto &last_clean_interval : new_last_clean_interval) {
1210 f->open_object_section("osd");
1211 f->dump_int("osd", last_clean_interval.first);
1212 f->dump_int("first", last_clean_interval.second.first);
1213 f->dump_int("last", last_clean_interval.second.second);
1214 f->close_section();
1215 }
1216 f->close_section();
1217
f67539c2
TL
1218 f->open_array_section("new_blocklist");
1219 for (const auto &blist : new_blocklist) {
7c673cae
FG
1220 stringstream ss;
1221 ss << blist.first;
1222 f->dump_stream(ss.str().c_str()) << blist.second;
1223 }
1224 f->close_section();
f67539c2
TL
1225 f->open_array_section("old_blocklist");
1226 for (const auto &blist : old_blocklist)
7c673cae
FG
1227 f->dump_stream("addr") << blist;
1228 f->close_section();
1229
1230 f->open_array_section("new_xinfo");
1231 for (const auto &xinfo : new_xinfo) {
1232 f->open_object_section("xinfo");
1233 f->dump_int("osd", xinfo.first);
1234 xinfo.second.dump(f);
1235 f->close_section();
1236 }
1237 f->close_section();
1238
1239 if (cluster_snapshot.size())
1240 f->dump_string("cluster_snapshot", cluster_snapshot);
1241
1242 f->open_array_section("new_uuid");
1243 for (const auto &uuid : new_uuid) {
1244 f->open_object_section("osd");
1245 f->dump_int("osd", uuid.first);
1246 f->dump_stream("uuid") << uuid.second;
1247 f->close_section();
1248 }
1249 f->close_section();
1250
1251 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1252 f->open_array_section("old_erasure_code_profiles");
1253 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
9f95a23c 1254 f->dump_string("old", erasure_code_profile);
7c673cae
FG
1255 }
1256 f->close_section();
11fdf7f2
TL
1257
1258 f->open_array_section("new_removed_snaps");
1259 for (auto& p : new_removed_snaps) {
1260 f->open_object_section("pool");
1261 f->dump_int("pool", p.first);
1262 f->open_array_section("snaps");
1263 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1264 f->open_object_section("interval");
1265 f->dump_unsigned("begin", q.get_start());
1266 f->dump_unsigned("length", q.get_len());
1267 f->close_section();
1268 }
1269 f->close_section();
1270 f->close_section();
1271 }
1272 f->close_section();
1273 f->open_array_section("new_purged_snaps");
1274 for (auto& p : new_purged_snaps) {
1275 f->open_object_section("pool");
1276 f->dump_int("pool", p.first);
1277 f->open_array_section("snaps");
1278 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1279 f->open_object_section("interval");
1280 f->dump_unsigned("begin", q.get_start());
1281 f->dump_unsigned("length", q.get_len());
1282 f->close_section();
1283 }
1284 f->close_section();
1285 f->close_section();
1286 }
81eedcae
TL
1287 f->open_array_section("new_crush_node_flags");
1288 for (auto& i : new_crush_node_flags) {
1289 f->open_object_section("node");
1290 f->dump_int("id", i.first);
1291 set<string> st;
1292 calc_state_set(i.second, st);
1293 for (auto& j : st) {
1294 f->dump_string("flag", j);
1295 }
1296 f->close_section();
1297 }
1298 f->close_section();
1299 f->open_array_section("new_device_class_flags");
1300 for (auto& i : new_device_class_flags) {
1301 f->open_object_section("device_class");
1302 f->dump_int("id", i.first);
1303 set<string> st;
1304 calc_state_set(i.second, st);
1305 for (auto& j : st) {
1306 f->dump_string("flag", j);
1307 }
1308 f->close_section();
1309 }
1310 f->close_section();
f67539c2
TL
1311 f->open_object_section("stretch_mode");
1312 {
1313 f->dump_bool("change_stretch_mode", change_stretch_mode);
1314 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
1315 f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
1316 f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
1317 f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
1318 f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
1319 }
1320 f->close_section();
11fdf7f2 1321 f->close_section();
7c673cae
FG
1322}
1323
1324void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1325{
1326 o.push_back(new Incremental);
1327}
1328
1329// ----------------------------------
1330// OSDMap
1331
1332void OSDMap::set_epoch(epoch_t e)
1333{
1334 epoch = e;
1335 for (auto &pool : pools)
1336 pool.second.last_change = e;
1337}
1338
f67539c2 1339bool OSDMap::is_blocklisted(const entity_addr_t& orig) const
7c673cae 1340{
f67539c2 1341 if (blocklist.empty()) {
7c673cae 1342 return false;
11fdf7f2
TL
1343 }
1344
f67539c2 1345 // all blocklist entries are type ANY for nautilus+
11fdf7f2
TL
1346 // FIXME: avoid this copy!
1347 entity_addr_t a = orig;
9f95a23c 1348 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1349 a.set_type(entity_addr_t::TYPE_LEGACY);
1350 } else {
1351 a.set_type(entity_addr_t::TYPE_ANY);
1352 }
7c673cae
FG
1353
1354 // this specific instance?
f67539c2 1355 if (blocklist.count(a)) {
7c673cae 1356 return true;
11fdf7f2 1357 }
7c673cae 1358
f67539c2 1359 // is entire ip blocklisted?
7c673cae 1360 if (a.is_ip()) {
11fdf7f2
TL
1361 a.set_port(0);
1362 a.set_nonce(0);
f67539c2 1363 if (blocklist.count(a)) {
11fdf7f2
TL
1364 return true;
1365 }
1366 }
1367
1368 return false;
1369}
1370
f67539c2 1371bool OSDMap::is_blocklisted(const entity_addrvec_t& av) const
11fdf7f2 1372{
f67539c2 1373 if (blocklist.empty())
11fdf7f2
TL
1374 return false;
1375
1376 for (auto& a : av.v) {
f67539c2 1377 if (is_blocklisted(a)) {
7c673cae
FG
1378 return true;
1379 }
1380 }
1381
1382 return false;
1383}
1384
f67539c2 1385void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl) const
7c673cae 1386{
f67539c2 1387 std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
7c673cae
FG
1388}
1389
f67539c2 1390void OSDMap::get_blocklist(std::set<entity_addr_t> *bl) const
31f18b77 1391{
f67539c2 1392 for (const auto &i : blocklist) {
31f18b77
FG
1393 bl->insert(i.first);
1394 }
1395}
1396
7c673cae
FG
1397void OSDMap::set_max_osd(int m)
1398{
7c673cae 1399 max_osd = m;
f67539c2
TL
1400 osd_state.resize(max_osd, 0);
1401 osd_weight.resize(max_osd, CEPH_OSD_OUT);
1402 osd_info.resize(max_osd);
1403 osd_xinfo.resize(max_osd);
1404 osd_addrs->client_addrs.resize(max_osd);
1405 osd_addrs->cluster_addrs.resize(max_osd);
1406 osd_addrs->hb_back_addrs.resize(max_osd);
1407 osd_addrs->hb_front_addrs.resize(max_osd);
1408 osd_uuid->resize(max_osd);
7c673cae 1409 if (osd_primary_affinity)
f67539c2 1410 osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
7c673cae
FG
1411
1412 calc_num_osds();
1413}
1414
1415int OSDMap::calc_num_osds()
1416{
1417 num_osd = 0;
1418 num_up_osd = 0;
1419 num_in_osd = 0;
1420 for (int i=0; i<max_osd; i++) {
1421 if (osd_state[i] & CEPH_OSD_EXISTS) {
1422 ++num_osd;
1423 if (osd_state[i] & CEPH_OSD_UP) {
1424 ++num_up_osd;
1425 }
1426 if (get_weight(i) != CEPH_OSD_OUT) {
1427 ++num_in_osd;
1428 }
1429 }
1430 }
1431 return num_osd;
1432}
1433
3efd9988
FG
1434void OSDMap::get_full_pools(CephContext *cct,
1435 set<int64_t> *full,
1436 set<int64_t> *backfillfull,
1437 set<int64_t> *nearfull) const
7c673cae 1438{
11fdf7f2
TL
1439 ceph_assert(full);
1440 ceph_assert(backfillfull);
1441 ceph_assert(nearfull);
3efd9988
FG
1442 full->clear();
1443 backfillfull->clear();
1444 nearfull->clear();
1445
1446 vector<int> full_osds;
1447 vector<int> backfillfull_osds;
1448 vector<int> nearfull_osds;
7c673cae
FG
1449 for (int i = 0; i < max_osd; ++i) {
1450 if (exists(i) && is_up(i) && is_in(i)) {
1451 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1452 full_osds.push_back(i);
7c673cae 1453 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1454 backfillfull_osds.push_back(i);
7c673cae 1455 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1456 nearfull_osds.push_back(i);
7c673cae
FG
1457 }
1458 }
3efd9988
FG
1459
1460 for (auto i: full_osds) {
1461 get_pool_ids_by_osd(cct, i, full);
1462 }
1463 for (auto i: backfillfull_osds) {
1464 get_pool_ids_by_osd(cct, i, backfillfull);
1465 }
1466 for (auto i: nearfull_osds) {
1467 get_pool_ids_by_osd(cct, i, nearfull);
1468 }
7c673cae
FG
1469}
1470
31f18b77
FG
1471void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1472 set<int> *nearfull) const
1473{
1474 full->clear();
1475 backfill->clear();
1476 nearfull->clear();
1477 for (int i = 0; i < max_osd; ++i) {
1478 if (exists(i) && is_up(i) && is_in(i)) {
1479 if (osd_state[i] & CEPH_OSD_FULL)
1480 full->emplace(i);
1481 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1482 backfill->emplace(i);
1483 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1484 nearfull->emplace(i);
1485 }
1486 }
1487}
1488
7c673cae
FG
1489void OSDMap::get_all_osds(set<int32_t>& ls) const
1490{
1491 for (int i=0; i<max_osd; i++)
1492 if (exists(i))
1493 ls.insert(i);
1494}
1495
1496void OSDMap::get_up_osds(set<int32_t>& ls) const
1497{
1498 for (int i = 0; i < max_osd; i++) {
1499 if (is_up(i))
1500 ls.insert(i);
1501 }
1502}
1503
81eedcae 1504void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1505{
1506 for (int i = 0; i < max_osd; i++) {
81eedcae 1507 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1508 ls.insert(i);
1509 }
1510}
1511
11fdf7f2
TL
1512void OSDMap::get_flag_set(set<string> *flagset) const
1513{
1514 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1515 if (flags & (1<<i)) {
1516 flagset->insert(get_flag_string(flags & (1<<i)));
1517 }
1518 }
1519}
1520
7c673cae
FG
1521void OSDMap::calc_state_set(int state, set<string>& st)
1522{
1523 unsigned t = state;
1524 for (unsigned s = 1; t; s <<= 1) {
1525 if (t & s) {
1526 t &= ~s;
1527 st.insert(ceph_osd_state_name(s));
1528 }
1529 }
1530}
1531
1532void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1533{
1534 float max = 0;
1535 for (const auto &weight : weights) {
1536 if (weight.second > max)
1537 max = weight.second;
1538 }
1539
1540 for (const auto &weight : weights) {
1541 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1542 }
1543}
1544
1545int OSDMap::identify_osd(const entity_addr_t& addr) const
1546{
1547 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1548 if (exists(i) && (get_addrs(i).contains(addr) ||
1549 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1550 return i;
1551 return -1;
1552}
1553
1554int OSDMap::identify_osd(const uuid_d& u) const
1555{
1556 for (int i=0; i<max_osd; i++)
1557 if (exists(i) && get_uuid(i) == u)
1558 return i;
1559 return -1;
1560}
1561
1562int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1563{
1564 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1565 if (exists(i) && (get_addrs(i).contains(addr) ||
1566 get_cluster_addrs(i).contains(addr) ||
1567 get_hb_back_addrs(i).contains(addr) ||
1568 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1569 return i;
1570 return -1;
1571}
1572
1573int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1574{
1575 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1576 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1577 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1578 return i;
1579 return -1;
1580}
1581
1582
1583uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1584{
1585 uint64_t features = 0; // things we actually have
1586 uint64_t mask = 0; // things we could have
1587
1588 if (crush->has_nondefault_tunables())
1589 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1590 if (crush->has_nondefault_tunables2())
1591 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1592 if (crush->has_nondefault_tunables3())
1593 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1594 if (crush->has_v4_buckets())
1595 features |= CEPH_FEATURE_CRUSH_V4;
1596 if (crush->has_nondefault_tunables5())
1597 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1598 if (crush->has_incompat_choose_args()) {
1599 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1600 }
7c673cae
FG
1601 mask |= CEPH_FEATURES_CRUSH;
1602
1603 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1604 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1605 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1606
1607 for (auto &pool: pools) {
1608 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1609 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1610 }
7c673cae
FG
1611 if (!pool.second.tiers.empty() ||
1612 pool.second.is_tier()) {
1613 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1614 }
31f18b77 1615 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
7c673cae
FG
1616 pool.second.get_type(),
1617 pool.second.get_size());
1618 if (ruleid >= 0) {
1619 if (crush->is_v2_rule(ruleid))
1620 features |= CEPH_FEATURE_CRUSH_V2;
1621 if (crush->is_v3_rule(ruleid))
1622 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1623 if (crush->is_v5_rule(ruleid))
1624 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1625 }
1626 }
7c673cae 1627 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1628
1629 if (osd_primary_affinity) {
1630 for (int i = 0; i < max_osd; ++i) {
1631 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1632 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1633 break;
1634 }
1635 }
1636 }
1637 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1638
1639 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1640 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
9f95a23c 1641 if (require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
1642 features |= jewel_features;
1643 }
1644 mask |= jewel_features;
1645
1646 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1647 | CEPH_FEATURE_MSG_ADDR2;
9f95a23c 1648 if (require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
1649 features |= kraken_features;
1650 }
1651 mask |= kraken_features;
f67539c2
TL
1652
1653 if (stretch_mode_enabled) {
1654 features |= CEPH_FEATUREMASK_STRETCH_MODE;
1655 mask |= CEPH_FEATUREMASK_STRETCH_MODE;
1656 }
7c673cae
FG
1657 }
1658
9f95a23c 1659 if (require_min_compat_client >= ceph_release_t::nautilus) {
11fdf7f2
TL
1660 // if min_compat_client is >= nautilus, require v2 cephx signatures
1661 // from everyone
1662 features |= CEPH_FEATUREMASK_CEPHX_V2;
9f95a23c 1663 } else if (require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
1664 entity_type == CEPH_ENTITY_TYPE_OSD) {
1665 // if osds are >= nautilus, at least require the signatures from them
1666 features |= CEPH_FEATUREMASK_CEPHX_V2;
1667 }
1668 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1669
7c673cae
FG
1670 if (pmask)
1671 *pmask = mask;
1672 return features;
1673}
1674
9f95a23c 1675ceph_release_t OSDMap::get_min_compat_client() const
7c673cae
FG
1676{
1677 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1678
1679 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77 1680 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
9f95a23c 1681 return ceph_release_t::luminous; // v12.2.0
7c673cae
FG
1682 }
1683 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
9f95a23c 1684 return ceph_release_t::jewel; // v10.2.0
7c673cae
FG
1685 }
1686 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
9f95a23c 1687 return ceph_release_t::hammer; // v0.94.0
7c673cae
FG
1688 }
1689 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1690 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1691 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
9f95a23c 1692 return ceph_release_t::firefly; // v0.80.0
7c673cae
FG
1693 }
1694 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1695 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
9f95a23c 1696 return ceph_release_t::dumpling; // v0.67.0
7c673cae
FG
1697 }
1698 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
9f95a23c 1699 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae 1700 }
9f95a23c 1701 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae
FG
1702}
1703
9f95a23c 1704ceph_release_t OSDMap::get_require_min_compat_client() const
11fdf7f2
TL
1705{
1706 return require_min_compat_client;
1707}
1708
7c673cae
FG
1709void OSDMap::_calc_up_osd_features()
1710{
1711 bool first = true;
1712 cached_up_osd_features = 0;
1713 for (int osd = 0; osd < max_osd; ++osd) {
1714 if (!is_up(osd))
1715 continue;
1716 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1717 if (xi.features == 0)
1718 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1719 if (first) {
1720 cached_up_osd_features = xi.features;
1721 first = false;
1722 } else {
1723 cached_up_osd_features &= xi.features;
1724 }
1725 }
1726}
1727
1728uint64_t OSDMap::get_up_osd_features() const
1729{
1730 return cached_up_osd_features;
1731}
1732
1733void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1734{
11fdf7f2 1735 using ceph::encode;
7c673cae
FG
1736 if (o->epoch == n->epoch)
1737 return;
1738
1739 int diff = 0;
1740
1741 // do addrs match?
1742 if (o->max_osd != n->max_osd)
1743 diff++;
1744 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1745 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1746 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1747 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1748 else
1749 diff++;
11fdf7f2
TL
1750 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1751 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1752 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1753 else
1754 diff++;
11fdf7f2
TL
1755 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1756 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1757 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1758 else
1759 diff++;
11fdf7f2
TL
1760 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1761 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1762 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1763 else
1764 diff++;
1765 }
1766 if (diff == 0) {
1767 // zoinks, no differences at all!
1768 n->osd_addrs = o->osd_addrs;
1769 }
1770
1771 // does crush match?
9f95a23c 1772 ceph::buffer::list oc, nc;
11fdf7f2
TL
1773 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1774 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1775 if (oc.contents_equal(nc)) {
1776 n->crush = o->crush;
1777 }
1778
1779 // does pg_temp match?
31f18b77
FG
1780 if (*o->pg_temp == *n->pg_temp)
1781 n->pg_temp = o->pg_temp;
7c673cae
FG
1782
1783 // does primary_temp match?
1784 if (o->primary_temp->size() == n->primary_temp->size()) {
1785 if (*o->primary_temp == *n->primary_temp)
1786 n->primary_temp = o->primary_temp;
1787 }
1788
1789 // do uuids match?
1790 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1791 *o->osd_uuid == *n->osd_uuid)
1792 n->osd_uuid = o->osd_uuid;
1793}
1794
1795void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1796 const OSDMap& oldmap,
1797 const OSDMap& nextmap,
1798 Incremental *pending_inc)
7c673cae
FG
1799{
1800 ldout(cct, 10) << __func__ << dendl;
7c673cae 1801
11fdf7f2 1802 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1803 // if pool does not exist, remove any existing pg_temps associated with
1804 // it. we don't care about pg_temps on the pending_inc either; if there
1805 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1806 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1807 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1808 << " for nonexistent pool " << pg.first.pool() << dendl;
1809 pending_inc->new_pg_temp[pg.first].clear();
1810 continue;
1811 }
1812 // all osds down?
1813 unsigned num_up = 0;
1814 for (auto o : pg.second) {
11fdf7f2 1815 if (!nextmap.is_down(o)) {
7c673cae
FG
1816 ++num_up;
1817 break;
1818 }
1819 }
1820 if (num_up == 0) {
1821 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1822 << " with all down osds" << pg.second << dendl;
1823 pending_inc->new_pg_temp[pg.first].clear();
1824 continue;
1825 }
1826 // redundant pg_temp?
1827 vector<int> raw_up;
1828 int primary;
11fdf7f2 1829 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1830 bool remove = false;
11fdf7f2 1831 if (raw_up == pg.second) {
7c673cae
FG
1832 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1833 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1834 remove = true;
1835 }
1836 // oversized pg_temp?
11fdf7f2 1837 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1838 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1839 << pg.second << " exceeds pool size" << dendl;
1840 remove = true;
1841 }
1842 if (remove) {
11fdf7f2 1843 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1844 pending_inc->new_pg_temp[pg.first].clear();
1845 else
1846 pending_inc->new_pg_temp.erase(pg.first);
1847 }
1848 }
1849
11fdf7f2 1850 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1851 // primary down?
11fdf7f2 1852 if (nextmap.is_down(pg.second)) {
7c673cae
FG
1853 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1854 << " to down " << pg.second << dendl;
1855 pending_inc->new_primary_temp[pg.first] = -1;
1856 continue;
1857 }
1858 // redundant primary_temp?
1859 vector<int> real_up, templess_up;
1860 int real_primary, templess_primary;
1861 pg_t pgid = pg.first;
11fdf7f2
TL
1862 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1863 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
1864 if (real_primary == templess_primary){
1865 ldout(cct, 10) << __func__ << " removing primary_temp "
1866 << pgid << " -> " << real_primary
1867 << " (unnecessary/redundant)" << dendl;
11fdf7f2 1868 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
1869 pending_inc->new_primary_temp[pgid] = -1;
1870 else
1871 pending_inc->new_primary_temp.erase(pgid);
1872 }
1873 }
1874}
1875
494da23a 1876void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 1877{
494da23a
TL
1878 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1879 for (auto& p : pg_upmap)
1880 upmap_pgs->push_back(p.first);
1881 for (auto& p : pg_upmap_items)
1882 upmap_pgs->push_back(p.first);
1883}
94b18763 1884
494da23a
TL
1885bool OSDMap::check_pg_upmaps(
1886 CephContext *cct,
1887 const vector<pg_t>& to_check,
1888 vector<pg_t> *to_cancel,
1889 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
1890{
1891 bool any_change = false;
1892 map<int, map<int, float>> rule_weight_map;
28e407b8 1893 for (auto& pg : to_check) {
494da23a 1894 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
1895 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
1896 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
1897 << dendl;
494da23a 1898 to_cancel->push_back(pg);
11fdf7f2
TL
1899 continue;
1900 }
1901 if (pi->is_pending_merge(pg, nullptr)) {
1902 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
1903 << dendl;
494da23a 1904 to_cancel->push_back(pg);
94b18763
FG
1905 continue;
1906 }
494da23a
TL
1907 vector<int> raw, up;
1908 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
1909 auto crush_rule = get_pg_pool_crush_rule(pg);
1910 auto r = crush->verify_upmap(cct,
1911 crush_rule,
1912 get_pg_pool_size(pg),
1913 up);
a8e16298
TL
1914 if (r < 0) {
1915 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
1916 << " returning " << r
1917 << dendl;
494da23a 1918 to_cancel->push_back(pg);
a8e16298
TL
1919 continue;
1920 }
1921 // below we check against crush-topology changing..
28e407b8
AA
1922 map<int, float> weight_map;
1923 auto it = rule_weight_map.find(crush_rule);
1924 if (it == rule_weight_map.end()) {
494da23a 1925 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
1926 if (r < 0) {
1927 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
1928 << "crush_rule " << crush_rule
1929 << dendl;
28e407b8
AA
1930 continue;
1931 }
1932 rule_weight_map[crush_rule] = weight_map;
1933 } else {
1934 weight_map = it->second;
1935 }
28e407b8 1936 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 1937 << " weight_map " << weight_map
94b18763 1938 << dendl;
a8e16298 1939 for (auto osd : up) {
28e407b8
AA
1940 auto it = weight_map.find(osd);
1941 if (it == weight_map.end()) {
92f5a8d4
TL
1942 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
1943 << "been moved out of the specific crush-tree"
1944 << dendl;
494da23a 1945 to_cancel->push_back(pg);
94b18763
FG
1946 break;
1947 }
494da23a 1948 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8 1949 if (adjusted_weight == 0) {
92f5a8d4
TL
1950 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
1951 << " is out/crush-out"
1952 << dendl;
494da23a 1953 to_cancel->push_back(pg);
94b18763
FG
1954 break;
1955 }
1956 }
eafe8130
TL
1957 if (!to_cancel->empty() && to_cancel->back() == pg)
1958 continue;
1959 // okay, upmap is valid
1960 // continue to check if it is still necessary
1961 auto i = pg_upmap.find(pg);
1962 if (i != pg_upmap.end() && raw == i->second) {
1963 ldout(cct, 10) << " removing redundant pg_upmap "
1964 << i->first << " " << i->second
1965 << dendl;
1966 to_cancel->push_back(pg);
1967 continue;
1968 }
1969 auto j = pg_upmap_items.find(pg);
1970 if (j != pg_upmap_items.end()) {
1971 mempool::osdmap::vector<pair<int,int>> newmap;
1972 for (auto& p : j->second) {
1973 if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
1974 // cancel mapping if source osd does not exist anymore
1975 continue;
1976 }
1977 if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
1978 p.second >= 0 && osd_weight[p.second] == 0) {
1979 // cancel mapping if target osd is out
1980 continue;
1981 }
1982 newmap.push_back(p);
1983 }
1984 if (newmap.empty()) {
1985 ldout(cct, 10) << " removing no-op pg_upmap_items "
1986 << j->first << " " << j->second
1987 << dendl;
1988 to_cancel->push_back(pg);
1989 } else if (newmap != j->second) {
1990 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
1991 << j->first << " " << j->second
1992 << " -> " << newmap
1993 << dendl;
1994 to_remap->insert({pg, newmap});
1995 any_change = true;
1996 }
1997 }
28e407b8 1998 }
494da23a
TL
1999 any_change = any_change || !to_cancel->empty();
2000 return any_change;
2001}
2002
2003void OSDMap::clean_pg_upmaps(
2004 CephContext *cct,
2005 Incremental *pending_inc,
2006 const vector<pg_t>& to_cancel,
2007 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
2008{
28e407b8 2009 for (auto &pg: to_cancel) {
494da23a
TL
2010 auto i = pending_inc->new_pg_upmap.find(pg);
2011 if (i != pending_inc->new_pg_upmap.end()) {
2012 ldout(cct, 10) << __func__ << " cancel invalid pending "
2013 << "pg_upmap entry "
2014 << i->first << "->" << i->second
2015 << dendl;
2016 pending_inc->new_pg_upmap.erase(i);
94b18763 2017 }
494da23a
TL
2018 auto j = pg_upmap.find(pg);
2019 if (j != pg_upmap.end()) {
2020 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
2021 << j->first << "->" << j->second
2022 << dendl;
2023 pending_inc->old_pg_upmap.insert(pg);
2024 }
2025 auto p = pending_inc->new_pg_upmap_items.find(pg);
2026 if (p != pending_inc->new_pg_upmap_items.end()) {
2027 ldout(cct, 10) << __func__ << " cancel invalid pending "
2028 << "pg_upmap_items entry "
2029 << p->first << "->" << p->second
2030 << dendl;
2031 pending_inc->new_pg_upmap_items.erase(p);
2032 }
2033 auto q = pg_upmap_items.find(pg);
2034 if (q != pg_upmap_items.end()) {
2035 ldout(cct, 10) << __func__ << " cancel invalid "
2036 << "pg_upmap_items entry "
2037 << q->first << "->" << q->second
2038 << dendl;
2039 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
2040 }
2041 }
494da23a
TL
2042 for (auto& i : to_remap)
2043 pending_inc->new_pg_upmap_items[i.first] = i.second;
2044}
2045
2046bool OSDMap::clean_pg_upmaps(
2047 CephContext *cct,
2048 Incremental *pending_inc) const
2049{
2050 ldout(cct, 10) << __func__ << dendl;
2051 vector<pg_t> to_check;
2052 vector<pg_t> to_cancel;
2053 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2054
2055 get_upmap_pgs(&to_check);
2056 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2057 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2058 return any_change;
94b18763
FG
2059}
2060
7c673cae
FG
2061int OSDMap::apply_incremental(const Incremental &inc)
2062{
f67539c2 2063 new_blocklist_entries = false;
7c673cae
FG
2064 if (inc.epoch == 1)
2065 fsid = inc.fsid;
2066 else if (inc.fsid != fsid)
2067 return -EINVAL;
2068
11fdf7f2 2069 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
2070
2071 epoch++;
2072 modified = inc.modified;
2073
2074 // full map?
2075 if (inc.fullmap.length()) {
9f95a23c 2076 ceph::buffer::list bl(inc.fullmap);
7c673cae
FG
2077 decode(bl);
2078 return 0;
2079 }
2080
2081 // nope, incremental.
31f18b77 2082 if (inc.new_flags >= 0) {
7c673cae 2083 flags = inc.new_flags;
31f18b77
FG
2084 // the below is just to cover a newly-upgraded luminous mon
2085 // cluster that has to set require_jewel_osds or
2086 // require_kraken_osds before the osds can be upgraded to
2087 // luminous.
2088 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c
TL
2089 if (require_osd_release < ceph_release_t::kraken) {
2090 require_osd_release = ceph_release_t::kraken;
31f18b77
FG
2091 }
2092 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c
TL
2093 if (require_osd_release < ceph_release_t::jewel) {
2094 require_osd_release = ceph_release_t::jewel;
31f18b77
FG
2095 }
2096 }
2097 }
7c673cae
FG
2098
2099 if (inc.new_max_osd >= 0)
2100 set_max_osd(inc.new_max_osd);
2101
2102 if (inc.new_pool_max != -1)
2103 pool_max = inc.new_pool_max;
2104
2105 for (const auto &pool : inc.new_pools) {
2106 pools[pool.first] = pool.second;
2107 pools[pool.first].last_change = epoch;
2108 }
2109
11fdf7f2
TL
2110 new_removed_snaps = inc.new_removed_snaps;
2111 new_purged_snaps = inc.new_purged_snaps;
2112 for (auto p = new_removed_snaps.begin();
2113 p != new_removed_snaps.end();
2114 ++p) {
2115 removed_snaps_queue[p->first].union_of(p->second);
2116 }
2117 for (auto p = new_purged_snaps.begin();
2118 p != new_purged_snaps.end();
2119 ++p) {
2120 auto q = removed_snaps_queue.find(p->first);
2121 ceph_assert(q != removed_snaps_queue.end());
2122 q->second.subtract(p->second);
2123 if (q->second.empty()) {
2124 removed_snaps_queue.erase(q);
2125 }
2126 }
2127
2128 if (inc.new_last_up_change != utime_t()) {
2129 last_up_change = inc.new_last_up_change;
2130 }
2131 if (inc.new_last_in_change != utime_t()) {
2132 last_in_change = inc.new_last_in_change;
2133 }
2134
7c673cae
FG
2135 for (const auto &pname : inc.new_pool_names) {
2136 auto pool_name_entry = pool_name.find(pname.first);
2137 if (pool_name_entry != pool_name.end()) {
2138 name_pool.erase(pool_name_entry->second);
2139 pool_name_entry->second = pname.second;
2140 } else {
2141 pool_name[pname.first] = pname.second;
2142 }
2143 name_pool[pname.second] = pname.first;
2144 }
2145
2146 for (const auto &pool : inc.old_pools) {
2147 pools.erase(pool);
2148 name_pool.erase(pool_name[pool]);
2149 pool_name.erase(pool);
2150 }
2151
2152 for (const auto &weight : inc.new_weight) {
2153 set_weight(weight.first, weight.second);
2154
2155 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2156 // xinfo old_weight.
2157 if (weight.second) {
2158 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2159 osd_xinfo[weight.first].old_weight = 0;
2160 }
2161 }
2162
2163 for (const auto &primary_affinity : inc.new_primary_affinity) {
2164 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2165 }
2166
2167 // erasure_code_profiles
2168 for (const auto &profile : inc.old_erasure_code_profiles)
2169 erasure_code_profiles.erase(profile);
2170
2171 for (const auto &profile : inc.new_erasure_code_profiles) {
2172 set_erasure_code_profile(profile.first, profile.second);
2173 }
2174
2175 // up/down
2176 for (const auto &state : inc.new_state) {
2177 const auto osd = state.first;
2178 int s = state.second ? state.second : CEPH_OSD_UP;
2179 if ((osd_state[osd] & CEPH_OSD_UP) &&
2180 (s & CEPH_OSD_UP)) {
2181 osd_info[osd].down_at = epoch;
2182 osd_xinfo[osd].down_stamp = modified;
2183 }
2184 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2185 (s & CEPH_OSD_EXISTS)) {
2186 // osd is destroyed; clear out anything interesting.
2187 (*osd_uuid)[osd] = uuid_d();
2188 osd_info[osd] = osd_info_t();
2189 osd_xinfo[osd] = osd_xinfo_t();
2190 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2191 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2192 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2193 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2194 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2195 osd_state[osd] = 0;
2196 } else {
2197 osd_state[osd] ^= s;
2198 }
2199 }
2200
2201 for (const auto &client : inc.new_up_client) {
2202 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
9f95a23c 2203 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
11fdf7f2
TL
2204 osd_addrs->client_addrs[client.first].reset(
2205 new entity_addrvec_t(client.second));
2206 osd_addrs->hb_back_addrs[client.first].reset(
2207 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2208 osd_addrs->hb_front_addrs[client.first].reset(
2209 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2210
2211 osd_info[client.first].up_from = epoch;
2212 }
2213
2214 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2215 osd_addrs->cluster_addrs[cluster.first].reset(
2216 new entity_addrvec_t(cluster.second));
7c673cae
FG
2217
2218 // info
2219 for (const auto &thru : inc.new_up_thru)
2220 osd_info[thru.first].up_thru = thru.second;
2221
2222 for (const auto &interval : inc.new_last_clean_interval) {
2223 osd_info[interval.first].last_clean_begin = interval.second.first;
2224 osd_info[interval.first].last_clean_end = interval.second.second;
2225 }
2226
2227 for (const auto &lost : inc.new_lost)
2228 osd_info[lost.first].lost_at = lost.second;
2229
2230 // xinfo
2231 for (const auto &xinfo : inc.new_xinfo)
2232 osd_xinfo[xinfo.first] = xinfo.second;
2233
2234 // uuid
2235 for (const auto &uuid : inc.new_uuid)
2236 (*osd_uuid)[uuid.first] = uuid.second;
2237
2238 // pg rebuild
2239 for (const auto &pg : inc.new_pg_temp) {
2240 if (pg.second.empty())
2241 pg_temp->erase(pg.first);
2242 else
31f18b77
FG
2243 pg_temp->set(pg.first, pg.second);
2244 }
2245 if (!inc.new_pg_temp.empty()) {
2246 // make sure pg_temp is efficiently stored
2247 pg_temp->rebuild();
7c673cae
FG
2248 }
2249
2250 for (const auto &pg : inc.new_primary_temp) {
2251 if (pg.second == -1)
2252 primary_temp->erase(pg.first);
2253 else
2254 (*primary_temp)[pg.first] = pg.second;
2255 }
2256
2257 for (auto& p : inc.new_pg_upmap) {
2258 pg_upmap[p.first] = p.second;
2259 }
2260 for (auto& pg : inc.old_pg_upmap) {
2261 pg_upmap.erase(pg);
2262 }
2263 for (auto& p : inc.new_pg_upmap_items) {
2264 pg_upmap_items[p.first] = p.second;
2265 }
2266 for (auto& pg : inc.old_pg_upmap_items) {
2267 pg_upmap_items.erase(pg);
2268 }
2269
f67539c2
TL
2270 // blocklist
2271 if (!inc.new_blocklist.empty()) {
2272 blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
2273 new_blocklist_entries = true;
7c673cae 2274 }
f67539c2
TL
2275 for (const auto &addr : inc.old_blocklist)
2276 blocklist.erase(addr);
7c673cae 2277
81eedcae
TL
2278 for (auto& i : inc.new_crush_node_flags) {
2279 if (i.second) {
2280 crush_node_flags[i.first] = i.second;
2281 } else {
2282 crush_node_flags.erase(i.first);
2283 }
2284 }
2285
2286 for (auto& i : inc.new_device_class_flags) {
2287 if (i.second) {
2288 device_class_flags[i.first] = i.second;
2289 } else {
2290 device_class_flags.erase(i.first);
2291 }
2292 }
2293
7c673cae
FG
2294 // cluster snapshot?
2295 if (inc.cluster_snapshot.length()) {
2296 cluster_snapshot = inc.cluster_snapshot;
2297 cluster_snapshot_epoch = inc.epoch;
2298 } else {
2299 cluster_snapshot.clear();
2300 cluster_snapshot_epoch = 0;
2301 }
2302
2303 if (inc.new_nearfull_ratio >= 0) {
2304 nearfull_ratio = inc.new_nearfull_ratio;
2305 }
2306 if (inc.new_backfillfull_ratio >= 0) {
2307 backfillfull_ratio = inc.new_backfillfull_ratio;
2308 }
2309 if (inc.new_full_ratio >= 0) {
2310 full_ratio = inc.new_full_ratio;
2311 }
9f95a23c 2312 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
7c673cae
FG
2313 require_min_compat_client = inc.new_require_min_compat_client;
2314 }
9f95a23c 2315 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
31f18b77 2316 require_osd_release = inc.new_require_osd_release;
9f95a23c 2317 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 2318 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2319 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2320 }
2321 }
7c673cae 2322
9f95a23c 2323 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
11fdf7f2 2324 require_osd_release = inc.new_require_osd_release;
9f95a23c 2325 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
2326 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2327 }
2328 }
7c673cae
FG
2329 // do new crush map last (after up/down stuff)
2330 if (inc.crush.length()) {
9f95a23c 2331 ceph::buffer::list bl(inc.crush);
11fdf7f2 2332 auto blp = bl.cbegin();
7c673cae
FG
2333 crush.reset(new CrushWrapper);
2334 crush->decode(blp);
9f95a23c 2335 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77
FG
2336 // only increment if this is a luminous-encoded osdmap, lest
2337 // the mon's crush_version diverge from what the osds or others
2338 // are decoding and applying on their end. if we won't encode
2339 // it in the canonical version, don't change it.
2340 ++crush_version;
2341 }
81eedcae
TL
2342 for (auto it = device_class_flags.begin();
2343 it != device_class_flags.end();) {
2344 const char* class_name = crush->get_class_name(it->first);
2345 if (!class_name) // device class is gone
2346 it = device_class_flags.erase(it);
2347 else
2348 it++;
2349 }
7c673cae
FG
2350 }
2351
f67539c2
TL
2352 if (inc.change_stretch_mode) {
2353 stretch_mode_enabled = inc.stretch_mode_enabled;
2354 stretch_bucket_count = inc.new_stretch_bucket_count;
2355 degraded_stretch_mode = inc.new_degraded_stretch_mode;
2356 recovering_stretch_mode = inc.new_recovering_stretch_mode;
2357 stretch_mode_bucket = inc.new_stretch_mode_bucket;
2358 }
2359
7c673cae
FG
2360 calc_num_osds();
2361 _calc_up_osd_features();
2362 return 0;
2363}
2364
2365// mapping
2366int OSDMap::map_to_pg(
2367 int64_t poolid,
2368 const string& name,
2369 const string& key,
2370 const string& nspace,
2371 pg_t *pg) const
2372{
2373 // calculate ps (placement seed)
2374 const pg_pool_t *pool = get_pg_pool(poolid);
2375 if (!pool)
2376 return -ENOENT;
2377 ps_t ps;
2378 if (!key.empty())
2379 ps = pool->hash_key(key, nspace);
2380 else
2381 ps = pool->hash_key(name, nspace);
2382 *pg = pg_t(ps, poolid);
2383 return 0;
2384}
2385
2386int OSDMap::object_locator_to_pg(
2387 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2388{
2389 if (loc.hash >= 0) {
2390 if (!get_pg_pool(loc.get_pool())) {
2391 return -ENOENT;
2392 }
2393 pg = pg_t(loc.hash, loc.get_pool());
2394 return 0;
2395 }
2396 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2397}
2398
2399ceph_object_layout OSDMap::make_object_layout(
2400 object_t oid, int pg_pool, string nspace) const
2401{
2402 object_locator_t loc(pg_pool, nspace);
2403
2404 ceph_object_layout ol;
2405 pg_t pgid = object_locator_to_pg(oid, loc);
2406 ol.ol_pgid = pgid.get_old_pg().v;
2407 ol.ol_stripe_unit = 0;
2408 return ol;
2409}
2410
2411void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2412 vector<int>& osds) const
2413{
2414 if (pool.can_shift_osds()) {
2415 unsigned removed = 0;
2416 for (unsigned i = 0; i < osds.size(); i++) {
2417 if (!exists(osds[i])) {
2418 removed++;
2419 continue;
2420 }
2421 if (removed) {
2422 osds[i - removed] = osds[i];
2423 }
2424 }
2425 if (removed)
2426 osds.resize(osds.size() - removed);
2427 } else {
2428 for (auto& osd : osds) {
2429 if (!exists(osd))
2430 osd = CRUSH_ITEM_NONE;
2431 }
2432 }
2433}
2434
31f18b77 2435void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2436 const pg_pool_t& pool, pg_t pg,
2437 vector<int> *osds,
2438 ps_t *ppps) const
2439{
2440 // map to osds[]
2441 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2442 unsigned size = pool.get_size();
2443
2444 // what crush rule?
31f18b77 2445 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
7c673cae
FG
2446 if (ruleno >= 0)
2447 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2448
2449 _remove_nonexistent_osds(pool, *osds);
2450
2451 if (ppps)
2452 *ppps = pps;
7c673cae
FG
2453}
2454
2455int OSDMap::_pick_primary(const vector<int>& osds) const
2456{
2457 for (auto osd : osds) {
2458 if (osd != CRUSH_ITEM_NONE) {
2459 return osd;
2460 }
2461 }
2462 return -1;
2463}
2464
224ce89b 2465void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2466{
2467 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2468 auto p = pg_upmap.find(pg);
2469 if (p != pg_upmap.end()) {
2470 // make sure targets aren't marked out
2471 for (auto osd : p->second) {
91327a77
AA
2472 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2473 osd_weight[osd] == 0) {
7c673cae
FG
2474 // reject/ignore the explicit mapping
2475 return;
2476 }
2477 }
2478 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2479 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2480 }
2481
2482 auto q = pg_upmap_items.find(pg);
2483 if (q != pg_upmap_items.end()) {
181888fb
FG
2484 // NOTE: this approach does not allow a bidirectional swap,
2485 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2486 for (auto& r : q->second) {
2487 // make sure the replacement value doesn't already appear
2488 bool exists = false;
2489 ssize_t pos = -1;
2490 for (unsigned i = 0; i < raw->size(); ++i) {
2491 int osd = (*raw)[i];
2492 if (osd == r.second) {
2493 exists = true;
2494 break;
2495 }
2496 // ignore mapping if target is marked out (or invalid osd id)
2497 if (osd == r.first &&
2498 pos < 0 &&
2499 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2500 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2501 pos = i;
2502 }
2503 }
2504 if (!exists && pos >= 0) {
2505 (*raw)[pos] = r.second;
7c673cae
FG
2506 }
2507 }
2508 }
2509}
2510
2511// pg -> (up osd list)
2512void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2513 vector<int> *up) const
2514{
2515 if (pool.can_shift_osds()) {
2516 // shift left
2517 up->clear();
2518 up->reserve(raw.size());
2519 for (unsigned i=0; i<raw.size(); i++) {
2520 if (!exists(raw[i]) || is_down(raw[i]))
2521 continue;
2522 up->push_back(raw[i]);
2523 }
2524 } else {
2525 // set down/dne devices to NONE
2526 up->resize(raw.size());
2527 for (int i = raw.size() - 1; i >= 0; --i) {
2528 if (!exists(raw[i]) || is_down(raw[i])) {
2529 (*up)[i] = CRUSH_ITEM_NONE;
2530 } else {
2531 (*up)[i] = raw[i];
2532 }
2533 }
2534 }
2535}
2536
2537void OSDMap::_apply_primary_affinity(ps_t seed,
2538 const pg_pool_t& pool,
2539 vector<int> *osds,
2540 int *primary) const
2541{
2542 // do we have any non-default primary_affinity values for these osds?
2543 if (!osd_primary_affinity)
2544 return;
2545
2546 bool any = false;
2547 for (const auto osd : *osds) {
2548 if (osd != CRUSH_ITEM_NONE &&
2549 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2550 any = true;
2551 break;
2552 }
2553 }
2554 if (!any)
2555 return;
2556
2557 // pick the primary. feed both the seed (for the pg) and the osd
2558 // into the hash/rng so that a proportional fraction of an osd's pgs
2559 // get rejected as primary.
2560 int pos = -1;
2561 for (unsigned i = 0; i < osds->size(); ++i) {
2562 int o = (*osds)[i];
2563 if (o == CRUSH_ITEM_NONE)
2564 continue;
2565 unsigned a = (*osd_primary_affinity)[o];
2566 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2567 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2568 seed, o) >> 16) >= a) {
2569 // we chose not to use this primary. note it anyway as a
2570 // fallback in case we don't pick anyone else, but keep looking.
2571 if (pos < 0)
2572 pos = i;
2573 } else {
2574 pos = i;
2575 break;
2576 }
2577 }
2578 if (pos < 0)
2579 return;
2580
2581 *primary = (*osds)[pos];
2582
2583 if (pool.can_shift_osds() && pos > 0) {
2584 // move the new primary to the front.
2585 for (int i = pos; i > 0; --i) {
2586 (*osds)[i] = (*osds)[i-1];
2587 }
2588 (*osds)[0] = *primary;
2589 }
2590}
2591
2592void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2593 vector<int> *temp_pg, int *temp_primary) const
2594{
2595 pg = pool.raw_pg_to_pg(pg);
2596 const auto p = pg_temp->find(pg);
2597 temp_pg->clear();
2598 if (p != pg_temp->end()) {
2599 for (unsigned i=0; i<p->second.size(); i++) {
2600 if (!exists(p->second[i]) || is_down(p->second[i])) {
2601 if (pool.can_shift_osds()) {
2602 continue;
2603 } else {
2604 temp_pg->push_back(CRUSH_ITEM_NONE);
2605 }
2606 } else {
2607 temp_pg->push_back(p->second[i]);
2608 }
2609 }
2610 }
2611 const auto &pp = primary_temp->find(pg);
2612 *temp_primary = -1;
2613 if (pp != primary_temp->end()) {
2614 *temp_primary = pp->second;
2615 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2616 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2617 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2618 *temp_primary = (*temp_pg)[i];
2619 break;
2620 }
2621 }
2622 }
2623}
2624
31f18b77 2625void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2626{
7c673cae 2627 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2628 if (!pool) {
2629 *primary = -1;
2630 raw->clear();
31f18b77 2631 return;
11fdf7f2 2632 }
31f18b77 2633 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2634 *primary = _pick_primary(*raw);
7c673cae
FG
2635}
2636
494da23a
TL
2637void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2638 vector<int> *raw_upmap) const
a8e16298
TL
2639{
2640 auto pool = get_pg_pool(pg.pool());
2641 if (!pool) {
2642 raw_upmap->clear();
2643 return;
2644 }
494da23a
TL
2645 _pg_to_raw_osds(*pool, pg, raw, NULL);
2646 *raw_upmap = *raw;
a8e16298
TL
2647 _apply_upmap(*pool, pg, raw_upmap);
2648}
2649
7c673cae
FG
2650void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2651{
2652 const pg_pool_t *pool = get_pg_pool(pg.pool());
2653 if (!pool) {
11fdf7f2
TL
2654 *primary = -1;
2655 up->clear();
7c673cae
FG
2656 return;
2657 }
2658 vector<int> raw;
2659 ps_t pps;
2660 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2661 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2662 _raw_to_up_osds(*pool, raw, up);
2663 *primary = _pick_primary(raw);
2664 _apply_primary_affinity(pps, *pool, up, primary);
2665}
31f18b77 2666
7c673cae
FG
2667void OSDMap::_pg_to_up_acting_osds(
2668 const pg_t& pg, vector<int> *up, int *up_primary,
2669 vector<int> *acting, int *acting_primary,
2670 bool raw_pg_to_pg) const
2671{
2672 const pg_pool_t *pool = get_pg_pool(pg.pool());
2673 if (!pool ||
2674 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2675 if (up)
2676 up->clear();
2677 if (up_primary)
2678 *up_primary = -1;
2679 if (acting)
2680 acting->clear();
2681 if (acting_primary)
2682 *acting_primary = -1;
2683 return;
2684 }
2685 vector<int> raw;
2686 vector<int> _up;
2687 vector<int> _acting;
2688 int _up_primary;
2689 int _acting_primary;
2690 ps_t pps;
2691 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2692 if (_acting.empty() || up || up_primary) {
2693 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2694 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2695 _raw_to_up_osds(*pool, raw, &_up);
2696 _up_primary = _pick_primary(_up);
2697 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2698 if (_acting.empty()) {
2699 _acting = _up;
2700 if (_acting_primary == -1) {
2701 _acting_primary = _up_primary;
2702 }
2703 }
2704
2705 if (up)
2706 up->swap(_up);
2707 if (up_primary)
2708 *up_primary = _up_primary;
2709 }
2710
2711 if (acting)
2712 acting->swap(_acting);
2713 if (acting_primary)
2714 *acting_primary = _acting_primary;
2715}
2716
9f95a23c 2717int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
7c673cae 2718{
9f95a23c
TL
2719 // This implementation is broken for EC PGs since the osd may appear
2720 // multiple times in the acting set. See
2721 // https://tracker.ceph.com/issues/43213
7c673cae
FG
2722 if (!nrep)
2723 nrep = acting.size();
2724 for (int i=0; i<nrep; i++)
2725 if (acting[i] == osd)
2726 return i;
2727 return -1;
2728}
2729
9f95a23c 2730int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
7c673cae 2731{
9f95a23c
TL
2732 int nrep = acting.size();
2733 if (who.shard == shard_id_t::NO_SHARD) {
2734 for (int i=0; i<nrep; i++) {
2735 if (acting[i] == who.osd) {
2736 return i;
2737 }
2738 }
2739 } else {
2740 if (who.shard < nrep && acting[who.shard] == who.osd) {
2741 return who.shard;
2742 }
2743 }
2744 return -1;
7c673cae
FG
2745}
2746
9f95a23c 2747bool OSDMap::primary_changed_broken(
7c673cae
FG
2748 int oldprimary,
2749 const vector<int> &oldacting,
2750 int newprimary,
2751 const vector<int> &newacting)
2752{
2753 if (oldacting.empty() && newacting.empty())
2754 return false; // both still empty
2755 if (oldacting.empty() ^ newacting.empty())
2756 return true; // was empty, now not, or vice versa
2757 if (oldprimary != newprimary)
2758 return true; // primary changed
9f95a23c
TL
2759 if (calc_pg_role_broken(oldprimary, oldacting) !=
2760 calc_pg_role_broken(newprimary, newacting))
7c673cae
FG
2761 return true;
2762 return false; // same primary (tho replicas may have changed)
2763}
2764
28e407b8
AA
2765uint64_t OSDMap::get_encoding_features() const
2766{
2767 uint64_t f = SIGNIFICANT_FEATURES;
9f95a23c
TL
2768 if (require_osd_release < ceph_release_t::octopus) {
2769 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2770 }
2771 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
2772 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2773 }
9f95a23c 2774 if (require_osd_release < ceph_release_t::mimic) {
11fdf7f2
TL
2775 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2776 }
9f95a23c 2777 if (require_osd_release < ceph_release_t::luminous) {
28e407b8
AA
2778 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2779 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2780 }
9f95a23c 2781 if (require_osd_release < ceph_release_t::kraken) {
28e407b8 2782 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2783 CEPH_FEATURE_MSG_ADDR2);
28e407b8 2784 }
9f95a23c 2785 if (require_osd_release < ceph_release_t::jewel) {
28e407b8 2786 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2787 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2788 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2789 }
2790 return f;
2791}
7c673cae
FG
2792
2793// serialize, unserialize
9f95a23c 2794void OSDMap::encode_client_old(ceph::buffer::list& bl) const
7c673cae 2795{
11fdf7f2 2796 using ceph::encode;
7c673cae 2797 __u16 v = 5;
11fdf7f2 2798 encode(v, bl);
7c673cae
FG
2799
2800 // base
11fdf7f2
TL
2801 encode(fsid, bl);
2802 encode(epoch, bl);
2803 encode(created, bl);
2804 encode(modified, bl);
7c673cae 2805
11fdf7f2 2806 // for encode(pools, bl);
7c673cae 2807 __u32 n = pools.size();
11fdf7f2 2808 encode(n, bl);
7c673cae
FG
2809
2810 for (const auto &pool : pools) {
2811 n = pool.first;
11fdf7f2
TL
2812 encode(n, bl);
2813 encode(pool.second, bl, 0);
7c673cae 2814 }
11fdf7f2 2815 // for encode(pool_name, bl);
7c673cae 2816 n = pool_name.size();
11fdf7f2 2817 encode(n, bl);
7c673cae
FG
2818 for (const auto &pname : pool_name) {
2819 n = pname.first;
11fdf7f2
TL
2820 encode(n, bl);
2821 encode(pname.second, bl);
7c673cae 2822 }
11fdf7f2 2823 // for encode(pool_max, bl);
7c673cae 2824 n = pool_max;
11fdf7f2 2825 encode(n, bl);
7c673cae 2826
11fdf7f2 2827 encode(flags, bl);
7c673cae 2828
11fdf7f2 2829 encode(max_osd, bl);
31f18b77
FG
2830 {
2831 uint32_t n = osd_state.size();
11fdf7f2 2832 encode(n, bl);
31f18b77 2833 for (auto s : osd_state) {
11fdf7f2 2834 encode((uint8_t)s, bl);
31f18b77
FG
2835 }
2836 }
11fdf7f2
TL
2837 encode(osd_weight, bl);
2838 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 2839
11fdf7f2 2840 // for encode(pg_temp, bl);
7c673cae 2841 n = pg_temp->size();
11fdf7f2 2842 encode(n, bl);
f67539c2 2843 for (const auto& pg : *pg_temp) {
7c673cae 2844 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
2845 encode(opg, bl);
2846 encode(pg.second, bl);
7c673cae
FG
2847 }
2848
2849 // crush
9f95a23c 2850 ceph::buffer::list cbl;
7c673cae 2851 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2852 encode(cbl, bl);
7c673cae
FG
2853}
2854
9f95a23c 2855void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2856{
11fdf7f2 2857 using ceph::encode;
7c673cae
FG
2858 if ((features & CEPH_FEATURE_PGID64) == 0) {
2859 encode_client_old(bl);
2860 return;
2861 }
2862
2863 __u16 v = 6;
11fdf7f2 2864 encode(v, bl);
7c673cae
FG
2865
2866 // base
11fdf7f2
TL
2867 encode(fsid, bl);
2868 encode(epoch, bl);
2869 encode(created, bl);
2870 encode(modified, bl);
7c673cae 2871
11fdf7f2
TL
2872 encode(pools, bl, features);
2873 encode(pool_name, bl);
2874 encode(pool_max, bl);
7c673cae 2875
11fdf7f2 2876 encode(flags, bl);
7c673cae 2877
11fdf7f2 2878 encode(max_osd, bl);
31f18b77
FG
2879 {
2880 uint32_t n = osd_state.size();
11fdf7f2 2881 encode(n, bl);
31f18b77 2882 for (auto s : osd_state) {
11fdf7f2 2883 encode((uint8_t)s, bl);
31f18b77
FG
2884 }
2885 }
11fdf7f2
TL
2886 encode(osd_weight, bl);
2887 encode(osd_addrs->client_addrs, bl, features);
7c673cae 2888
11fdf7f2 2889 encode(*pg_temp, bl);
7c673cae
FG
2890
2891 // crush
9f95a23c 2892 ceph::buffer::list cbl;
7c673cae 2893 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2894 encode(cbl, bl);
7c673cae
FG
2895
2896 // extended
2897 __u16 ev = 10;
11fdf7f2
TL
2898 encode(ev, bl);
2899 encode(osd_addrs->hb_back_addrs, bl, features);
2900 encode(osd_info, bl);
f67539c2 2901 encode(blocklist, bl, features);
11fdf7f2
TL
2902 encode(osd_addrs->cluster_addrs, bl, features);
2903 encode(cluster_snapshot_epoch, bl);
2904 encode(cluster_snapshot, bl);
2905 encode(*osd_uuid, bl);
9f95a23c 2906 encode(osd_xinfo, bl, features);
11fdf7f2 2907 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
2908}
2909
11fdf7f2
TL
2910/* for a description of osdmap versions, and when they were introduced, please
2911 * refer to
2912 * doc/dev/osd_internals/osdmap_versions.txt
2913 */
9f95a23c 2914void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2915{
11fdf7f2 2916 using ceph::encode;
7c673cae
FG
2917 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2918 encode_classic(bl, features);
2919 return;
2920 }
2921
2922 // only a select set of callers should *ever* be encoding new
2923 // OSDMaps. others should be passing around the canonical encoded
2924 // buffers from on high. select out those callers by passing in an
2925 // "impossible" feature bit.
11fdf7f2 2926 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
2927 features &= ~CEPH_FEATURE_RESERVED;
2928
2929 size_t start_offset = bl.length();
2930 size_t tail_offset;
11fdf7f2 2931 size_t crc_offset;
9f95a23c 2932 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
2933
2934 // meta-encoding: how we include client-used and osd-specific data
2935 ENCODE_START(8, 7, bl);
2936
2937 {
28e407b8
AA
2938 // NOTE: any new encoding dependencies must be reflected by
2939 // SIGNIFICANT_FEATURES
11fdf7f2 2940 uint8_t v = 9;
31f18b77 2941 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 2942 v = 3;
11fdf7f2
TL
2943 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2944 v = 6;
2945 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2946 v = 7;
7c673cae
FG
2947 }
2948 ENCODE_START(v, 1, bl); // client-usable data
2949 // base
11fdf7f2
TL
2950 encode(fsid, bl);
2951 encode(epoch, bl);
2952 encode(created, bl);
2953 encode(modified, bl);
7c673cae 2954
11fdf7f2
TL
2955 encode(pools, bl, features);
2956 encode(pool_name, bl);
2957 encode(pool_max, bl);
7c673cae 2958
31f18b77
FG
2959 if (v < 4) {
2960 decltype(flags) f = flags;
9f95a23c 2961 if (require_osd_release >= ceph_release_t::luminous)
c07f9fc5 2962 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
9f95a23c 2963 else if (require_osd_release == ceph_release_t::kraken)
31f18b77 2964 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
9f95a23c 2965 else if (require_osd_release == ceph_release_t::jewel)
31f18b77 2966 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 2967 encode(f, bl);
31f18b77 2968 } else {
11fdf7f2 2969 encode(flags, bl);
31f18b77 2970 }
7c673cae 2971
11fdf7f2 2972 encode(max_osd, bl);
31f18b77 2973 if (v >= 5) {
11fdf7f2 2974 encode(osd_state, bl);
31f18b77
FG
2975 } else {
2976 uint32_t n = osd_state.size();
11fdf7f2 2977 encode(n, bl);
31f18b77 2978 for (auto s : osd_state) {
11fdf7f2 2979 encode((uint8_t)s, bl);
31f18b77
FG
2980 }
2981 }
11fdf7f2
TL
2982 encode(osd_weight, bl);
2983 if (v >= 8) {
2984 encode(osd_addrs->client_addrs, bl, features);
2985 } else {
2986 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
2987 }
7c673cae 2988
11fdf7f2
TL
2989 encode(*pg_temp, bl);
2990 encode(*primary_temp, bl);
7c673cae 2991 if (osd_primary_affinity) {
11fdf7f2 2992 encode(*osd_primary_affinity, bl);
7c673cae
FG
2993 } else {
2994 vector<__u32> v;
11fdf7f2 2995 encode(v, bl);
7c673cae
FG
2996 }
2997
2998 // crush
9f95a23c 2999 ceph::buffer::list cbl;
7c673cae 3000 crush->encode(cbl, features);
11fdf7f2
TL
3001 encode(cbl, bl);
3002 encode(erasure_code_profiles, bl);
7c673cae
FG
3003
3004 if (v >= 4) {
11fdf7f2
TL
3005 encode(pg_upmap, bl);
3006 encode(pg_upmap_items, bl);
7c673cae 3007 } else {
11fdf7f2
TL
3008 ceph_assert(pg_upmap.empty());
3009 ceph_assert(pg_upmap_items.empty());
7c673cae 3010 }
31f18b77 3011 if (v >= 6) {
11fdf7f2
TL
3012 encode(crush_version, bl);
3013 }
3014 if (v >= 7) {
3015 encode(new_removed_snaps, bl);
3016 encode(new_purged_snaps, bl);
3017 }
3018 if (v >= 9) {
3019 encode(last_up_change, bl);
3020 encode(last_in_change, bl);
31f18b77 3021 }
7c673cae
FG
3022 ENCODE_FINISH(bl); // client-usable data
3023 }
3024
3025 {
28e407b8
AA
3026 // NOTE: any new encoding dependencies must be reflected by
3027 // SIGNIFICANT_FEATURES
f67539c2 3028 uint8_t target_v = 9; // when bumping this, be aware of stretch_mode target_v 10!
7c673cae
FG
3029 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
3030 target_v = 1;
11fdf7f2
TL
3031 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
3032 target_v = 5;
3033 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
3034 target_v = 6;
7c673cae 3035 }
f67539c2
TL
3036 if (stretch_mode_enabled) {
3037 target_v = std::max((uint8_t)10, target_v);
3038 }
7c673cae 3039 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
3040 if (target_v < 7) {
3041 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
3042 } else {
3043 encode(osd_addrs->hb_back_addrs, bl, features);
3044 }
3045 encode(osd_info, bl);
7c673cae
FG
3046 {
3047 // put this in a sorted, ordered map<> so that we encode in a
3048 // deterministic order.
f67539c2
TL
3049 map<entity_addr_t,utime_t> blocklist_map;
3050 for (const auto &addr : blocklist)
3051 blocklist_map.insert(make_pair(addr.first, addr.second));
3052 encode(blocklist_map, bl, features);
11fdf7f2
TL
3053 }
3054 if (target_v < 7) {
3055 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3056 } else {
3057 encode(osd_addrs->cluster_addrs, bl, features);
3058 }
3059 encode(cluster_snapshot_epoch, bl);
3060 encode(cluster_snapshot, bl);
3061 encode(*osd_uuid, bl);
9f95a23c 3062 encode(osd_xinfo, bl, features);
11fdf7f2
TL
3063 if (target_v < 7) {
3064 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3065 } else {
3066 encode(osd_addrs->hb_front_addrs, bl, features);
3067 }
7c673cae 3068 if (target_v >= 2) {
11fdf7f2
TL
3069 encode(nearfull_ratio, bl);
3070 encode(full_ratio, bl);
3071 encode(backfillfull_ratio, bl);
31f18b77
FG
3072 }
3073 // 4 was string-based new_require_min_compat_client
3074 if (target_v >= 5) {
11fdf7f2
TL
3075 encode(require_min_compat_client, bl);
3076 encode(require_osd_release, bl);
3077 }
3078 if (target_v >= 6) {
3079 encode(removed_snaps_queue, bl);
7c673cae 3080 }
81eedcae
TL
3081 if (target_v >= 8) {
3082 encode(crush_node_flags, bl);
3083 }
3084 if (target_v >= 9) {
3085 encode(device_class_flags, bl);
3086 }
f67539c2
TL
3087 if (target_v >= 10) {
3088 encode(stretch_mode_enabled, bl);
3089 encode(stretch_bucket_count, bl);
3090 encode(degraded_stretch_mode, bl);
3091 encode(recovering_stretch_mode, bl);
3092 encode(stretch_mode_bucket, bl);
3093 }
7c673cae
FG
3094 ENCODE_FINISH(bl); // osd-only data
3095 }
3096
11fdf7f2
TL
3097 crc_offset = bl.length();
3098 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
3099 tail_offset = bl.length();
3100
3101 ENCODE_FINISH(bl); // meta-encoding wrapper
3102
3103 // fill in crc
9f95a23c 3104 ceph::buffer::list front;
11fdf7f2 3105 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
3106 crc = front.crc32c(-1);
3107 if (tail_offset < bl.length()) {
9f95a23c 3108 ceph::buffer::list tail;
7c673cae
FG
3109 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3110 crc = tail.crc32c(crc);
3111 }
3112 ceph_le32 crc_le;
3113 crc_le = crc;
11fdf7f2 3114 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
3115 crc_defined = true;
3116}
3117
11fdf7f2
TL
3118/* for a description of osdmap versions, and when they were introduced, please
3119 * refer to
3120 * doc/dev/osd_internals/osdmap_versions.txt
3121 */
9f95a23c 3122void OSDMap::decode(ceph::buffer::list& bl)
7c673cae 3123{
11fdf7f2 3124 auto p = bl.cbegin();
7c673cae
FG
3125 decode(p);
3126}
3127
9f95a23c 3128void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
7c673cae 3129{
11fdf7f2 3130 using ceph::decode;
7c673cae
FG
3131 __u32 n, t;
3132 __u16 v;
11fdf7f2 3133 decode(v, p);
7c673cae
FG
3134
3135 // base
11fdf7f2
TL
3136 decode(fsid, p);
3137 decode(epoch, p);
3138 decode(created, p);
3139 decode(modified, p);
7c673cae
FG
3140
3141 if (v < 6) {
3142 if (v < 4) {
3143 int32_t max_pools = 0;
11fdf7f2 3144 decode(max_pools, p);
7c673cae
FG
3145 pool_max = max_pools;
3146 }
3147 pools.clear();
11fdf7f2 3148 decode(n, p);
7c673cae 3149 while (n--) {
11fdf7f2
TL
3150 decode(t, p);
3151 decode(pools[t], p);
7c673cae
FG
3152 }
3153 if (v == 4) {
11fdf7f2 3154 decode(n, p);
7c673cae
FG
3155 pool_max = n;
3156 } else if (v == 5) {
3157 pool_name.clear();
11fdf7f2 3158 decode(n, p);
7c673cae 3159 while (n--) {
11fdf7f2
TL
3160 decode(t, p);
3161 decode(pool_name[t], p);
7c673cae 3162 }
11fdf7f2 3163 decode(n, p);
7c673cae
FG
3164 pool_max = n;
3165 }
3166 } else {
11fdf7f2
TL
3167 decode(pools, p);
3168 decode(pool_name, p);
3169 decode(pool_max, p);
7c673cae
FG
3170 }
3171 // kludge around some old bug that zeroed out pool_max (#2307)
3172 if (pools.size() && pool_max < pools.rbegin()->first) {
3173 pool_max = pools.rbegin()->first;
3174 }
3175
11fdf7f2 3176 decode(flags, p);
7c673cae 3177
11fdf7f2 3178 decode(max_osd, p);
31f18b77
FG
3179 {
3180 vector<uint8_t> os;
11fdf7f2 3181 decode(os, p);
31f18b77
FG
3182 osd_state.resize(os.size());
3183 for (unsigned i = 0; i < os.size(); ++i) {
3184 osd_state[i] = os[i];
3185 }
3186 }
11fdf7f2
TL
3187 decode(osd_weight, p);
3188 decode(osd_addrs->client_addrs, p);
7c673cae
FG
3189 if (v <= 5) {
3190 pg_temp->clear();
11fdf7f2 3191 decode(n, p);
7c673cae
FG
3192 while (n--) {
3193 old_pg_t opg;
9f95a23c 3194 ceph::decode_raw(opg, p);
31f18b77 3195 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3196 decode(v, p);
31f18b77 3197 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3198 }
3199 } else {
11fdf7f2 3200 decode(*pg_temp, p);
7c673cae
FG
3201 }
3202
3203 // crush
9f95a23c 3204 ceph::buffer::list cbl;
11fdf7f2
TL
3205 decode(cbl, p);
3206 auto cblp = cbl.cbegin();
7c673cae
FG
3207 crush->decode(cblp);
3208
3209 // extended
3210 __u16 ev = 0;
3211 if (v >= 5)
11fdf7f2
TL
3212 decode(ev, p);
3213 decode(osd_addrs->hb_back_addrs, p);
3214 decode(osd_info, p);
7c673cae 3215 if (v < 5)
11fdf7f2 3216 decode(pool_name, p);
7c673cae 3217
f67539c2 3218 decode(blocklist, p);
7c673cae 3219 if (ev >= 6)
11fdf7f2 3220 decode(osd_addrs->cluster_addrs, p);
7c673cae 3221 else
11fdf7f2 3222 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3223
3224 if (ev >= 7) {
11fdf7f2
TL
3225 decode(cluster_snapshot_epoch, p);
3226 decode(cluster_snapshot, p);
7c673cae
FG
3227 }
3228
3229 if (ev >= 8) {
11fdf7f2 3230 decode(*osd_uuid, p);
7c673cae
FG
3231 } else {
3232 osd_uuid->resize(max_osd);
3233 }
3234 if (ev >= 9)
11fdf7f2 3235 decode(osd_xinfo, p);
7c673cae
FG
3236 else
3237 osd_xinfo.resize(max_osd);
3238
3239 if (ev >= 10)
11fdf7f2 3240 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3241 else
11fdf7f2 3242 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3243
3244 osd_primary_affinity.reset();
3245
3246 post_decode();
3247}
3248
9f95a23c 3249void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 3250{
11fdf7f2 3251 using ceph::decode;
7c673cae
FG
3252 /**
3253 * Older encodings of the OSDMap had a single struct_v which
3254 * covered the whole encoding, and was prior to our modern
3255 * stuff which includes a compatv and a size. So if we see
3256 * a struct_v < 7, we must rewind to the beginning and use our
3257 * classic decoder.
3258 */
3259 size_t start_offset = bl.get_off();
3260 size_t tail_offset = 0;
9f95a23c 3261 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
3262
3263 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3264 if (struct_v < 7) {
11fdf7f2 3265 bl.seek(start_offset);
7c673cae
FG
3266 decode_classic(bl);
3267 return;
3268 }
3269 /**
3270 * Since we made it past that hurdle, we can use our normal paths.
3271 */
3272 {
11fdf7f2 3273 DECODE_START(9, bl); // client-usable data
7c673cae 3274 // base
11fdf7f2
TL
3275 decode(fsid, bl);
3276 decode(epoch, bl);
3277 decode(created, bl);
3278 decode(modified, bl);
7c673cae 3279
11fdf7f2
TL
3280 decode(pools, bl);
3281 decode(pool_name, bl);
3282 decode(pool_max, bl);
7c673cae 3283
11fdf7f2 3284 decode(flags, bl);
7c673cae 3285
11fdf7f2 3286 decode(max_osd, bl);
31f18b77 3287 if (struct_v >= 5) {
11fdf7f2 3288 decode(osd_state, bl);
31f18b77
FG
3289 } else {
3290 vector<uint8_t> os;
11fdf7f2 3291 decode(os, bl);
31f18b77
FG
3292 osd_state.resize(os.size());
3293 for (unsigned i = 0; i < os.size(); ++i) {
3294 osd_state[i] = os[i];
3295 }
3296 }
11fdf7f2
TL
3297 decode(osd_weight, bl);
3298 decode(osd_addrs->client_addrs, bl);
7c673cae 3299
11fdf7f2
TL
3300 decode(*pg_temp, bl);
3301 decode(*primary_temp, bl);
3302 // dates back to firefly. version increased from 2 to 3 still in firefly.
3303 // do we really still need to keep this around? even for old clients?
7c673cae
FG
3304 if (struct_v >= 2) {
3305 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
11fdf7f2 3306 decode(*osd_primary_affinity, bl);
7c673cae
FG
3307 if (osd_primary_affinity->empty())
3308 osd_primary_affinity.reset();
3309 } else {
3310 osd_primary_affinity.reset();
3311 }
3312
3313 // crush
9f95a23c 3314 ceph::buffer::list cbl;
11fdf7f2
TL
3315 decode(cbl, bl);
3316 auto cblp = cbl.cbegin();
7c673cae 3317 crush->decode(cblp);
11fdf7f2
TL
3318 // added in firefly; version increased in luminous, so it affects
3319 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3320 // alone until we require clients to be all luminous?
7c673cae 3321 if (struct_v >= 3) {
11fdf7f2 3322 decode(erasure_code_profiles, bl);
7c673cae
FG
3323 } else {
3324 erasure_code_profiles.clear();
3325 }
11fdf7f2
TL
3326 // version increased from 3 to 4 still in luminous, so same as above
3327 // applies.
7c673cae 3328 if (struct_v >= 4) {
11fdf7f2
TL
3329 decode(pg_upmap, bl);
3330 decode(pg_upmap_items, bl);
7c673cae
FG
3331 } else {
3332 pg_upmap.clear();
3333 pg_upmap_items.clear();
3334 }
11fdf7f2
TL
3335 // again, version increased from 5 to 6 still in luminous, so above
3336 // applies.
31f18b77 3337 if (struct_v >= 6) {
11fdf7f2
TL
3338 decode(crush_version, bl);
3339 }
3340 // version increase from 6 to 7 in mimic
3341 if (struct_v >= 7) {
3342 decode(new_removed_snaps, bl);
3343 decode(new_purged_snaps, bl);
3344 }
3345 // version increase from 7 to 8, 8 to 9, in nautilus.
3346 if (struct_v >= 9) {
3347 decode(last_up_change, bl);
3348 decode(last_in_change, bl);
31f18b77 3349 }
7c673cae
FG
3350 DECODE_FINISH(bl); // client-usable data
3351 }
3352
3353 {
f67539c2 3354 DECODE_START(10, bl); // extended, osd-only data
11fdf7f2
TL
3355 decode(osd_addrs->hb_back_addrs, bl);
3356 decode(osd_info, bl);
f67539c2 3357 decode(blocklist, bl);
11fdf7f2
TL
3358 decode(osd_addrs->cluster_addrs, bl);
3359 decode(cluster_snapshot_epoch, bl);
3360 decode(cluster_snapshot, bl);
3361 decode(*osd_uuid, bl);
3362 decode(osd_xinfo, bl);
3363 decode(osd_addrs->hb_front_addrs, bl);
3364 //
7c673cae 3365 if (struct_v >= 2) {
11fdf7f2
TL
3366 decode(nearfull_ratio, bl);
3367 decode(full_ratio, bl);
7c673cae
FG
3368 } else {
3369 nearfull_ratio = 0;
3370 full_ratio = 0;
3371 }
3372 if (struct_v >= 3) {
11fdf7f2 3373 decode(backfillfull_ratio, bl);
7c673cae
FG
3374 } else {
3375 backfillfull_ratio = 0;
3376 }
31f18b77
FG
3377 if (struct_v == 4) {
3378 string r;
11fdf7f2 3379 decode(r, bl);
31f18b77
FG
3380 if (r.length())
3381 require_min_compat_client = ceph_release_from_name(r.c_str());
3382 }
3383 if (struct_v >= 5) {
11fdf7f2
TL
3384 decode(require_min_compat_client, bl);
3385 decode(require_osd_release, bl);
9f95a23c 3386 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
3387 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3388 }
9f95a23c 3389 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 3390 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3391 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
3392 }
3393 } else {
3394 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3395 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 3396 require_osd_release = ceph_release_t::luminous;
31f18b77 3397 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3398 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77 3399 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c 3400 require_osd_release = ceph_release_t::kraken;
31f18b77 3401 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c 3402 require_osd_release = ceph_release_t::jewel;
31f18b77 3403 } else {
9f95a23c 3404 require_osd_release = ceph_release_t::unknown;
31f18b77
FG
3405 }
3406 }
11fdf7f2
TL
3407 if (struct_v >= 6) {
3408 decode(removed_snaps_queue, bl);
3409 }
81eedcae
TL
3410 if (struct_v >= 8) {
3411 decode(crush_node_flags, bl);
3412 } else {
3413 crush_node_flags.clear();
3414 }
3415 if (struct_v >= 9) {
3416 decode(device_class_flags, bl);
3417 } else {
3418 device_class_flags.clear();
3419 }
f67539c2
TL
3420 if (struct_v >= 10) {
3421 decode(stretch_mode_enabled, bl);
3422 decode(stretch_bucket_count, bl);
3423 decode(degraded_stretch_mode, bl);
3424 decode(recovering_stretch_mode, bl);
3425 decode(stretch_mode_bucket, bl);
3426 } else {
3427 stretch_mode_enabled = false;
3428 stretch_bucket_count = 0;
3429 degraded_stretch_mode = 0;
3430 recovering_stretch_mode = 0;
3431 stretch_mode_bucket = 0;
3432 }
7c673cae
FG
3433 DECODE_FINISH(bl); // osd-only data
3434 }
3435
3436 if (struct_v >= 8) {
3437 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 3438 decode(crc, bl);
7c673cae
FG
3439 tail_offset = bl.get_off();
3440 crc_defined = true;
3441 } else {
3442 crc_defined = false;
3443 crc = 0;
3444 }
3445
3446 DECODE_FINISH(bl); // wrapper
3447
3448 if (tail_offset) {
3449 // verify crc
3450 uint32_t actual = crc_front.crc32c(-1);
3451 if (tail_offset < bl.get_off()) {
9f95a23c 3452 ceph::buffer::list tail;
7c673cae
FG
3453 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3454 actual = tail.crc32c(actual);
3455 }
3456 if (crc != actual) {
3457 ostringstream ss;
3458 ss << "bad crc, actual " << actual << " != expected " << crc;
3459 string s = ss.str();
9f95a23c 3460 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
3461 }
3462 }
3463
3464 post_decode();
3465}
3466
3467void OSDMap::post_decode()
3468{
3469 // index pool names
3470 name_pool.clear();
3471 for (const auto &pname : pool_name) {
3472 name_pool[pname.second] = pname.first;
3473 }
3474
3475 calc_num_osds();
3476 _calc_up_osd_features();
3477}
3478
3479void OSDMap::dump_erasure_code_profiles(
3480 const mempool::osdmap::map<string,map<string,string>>& profiles,
3481 Formatter *f)
3482{
3483 f->open_object_section("erasure_code_profiles");
3484 for (const auto &profile : profiles) {
3485 f->open_object_section(profile.first.c_str());
3486 for (const auto &profm : profile.second) {
9f95a23c 3487 f->dump_string(profm.first.c_str(), profm.second);
7c673cae
FG
3488 }
3489 f->close_section();
3490 }
3491 f->close_section();
3492}
3493
9f95a23c
TL
3494void OSDMap::dump_osds(Formatter *f) const
3495{
3496 f->open_array_section("osds");
3497 for (int i=0; i<get_max_osd(); i++) {
3498 if (exists(i)) {
3499 dump_osd(i, f);
3500 }
3501 }
3502 f->close_section();
3503}
3504
3505void OSDMap::dump_osd(int id, Formatter *f) const
3506{
3507 ceph_assert(f != nullptr);
3508 if (!exists(id)) {
3509 return;
3510 }
3511
3512 f->open_object_section("osd_info");
3513 f->dump_int("osd", id);
3514 f->dump_stream("uuid") << get_uuid(id);
3515 f->dump_int("up", is_up(id));
3516 f->dump_int("in", is_in(id));
3517 f->dump_float("weight", get_weightf(id));
3518 f->dump_float("primary_affinity", get_primary_affinityf(id));
3519 get_info(id).dump(f);
3520 f->dump_object("public_addrs", get_addrs(id));
3521 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3522 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3523 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3524 // compat
3525 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3526 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3527 f->dump_stream("heartbeat_back_addr")
3528 << get_hb_back_addrs(id).get_legacy_str();
3529 f->dump_stream("heartbeat_front_addr")
3530 << get_hb_front_addrs(id).get_legacy_str();
3531
3532 set<string> st;
3533 get_state(id, st);
3534 f->open_array_section("state");
3535 for (const auto &state : st)
3536 f->dump_string("state", state);
3537 f->close_section();
3538
3539 f->close_section();
3540}
3541
7c673cae
FG
3542void OSDMap::dump(Formatter *f) const
3543{
3544 f->dump_int("epoch", get_epoch());
3545 f->dump_stream("fsid") << get_fsid();
3546 f->dump_stream("created") << get_created();
3547 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3548 f->dump_stream("last_up_change") << last_up_change;
3549 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3550 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3551 f->dump_unsigned("flags_num", flags);
3552 f->open_array_section("flags_set");
3553 set<string> flagset;
3554 get_flag_set(&flagset);
3555 for (auto p : flagset) {
3556 f->dump_string("flag", p);
3557 }
3558 f->close_section();
31f18b77 3559 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3560 f->dump_float("full_ratio", full_ratio);
3561 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3562 f->dump_float("nearfull_ratio", nearfull_ratio);
3563 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3564 f->dump_int("pool_max", get_pool_max());
3565 f->dump_int("max_osd", get_max_osd());
31f18b77 3566 f->dump_string("require_min_compat_client",
f67539c2 3567 to_string(require_min_compat_client));
31f18b77 3568 f->dump_string("min_compat_client",
f67539c2 3569 to_string(get_min_compat_client()));
31f18b77 3570 f->dump_string("require_osd_release",
f67539c2 3571 to_string(require_osd_release));
7c673cae
FG
3572
3573 f->open_array_section("pools");
3574 for (const auto &pool : pools) {
3575 std::string name("<unknown>");
3576 const auto &pni = pool_name.find(pool.first);
3577 if (pni != pool_name.end())
3578 name = pni->second;
3579 f->open_object_section("pool");
3580 f->dump_int("pool", pool.first);
3581 f->dump_string("pool_name", name);
3582 pool.second.dump(f);
3583 f->close_section();
3584 }
3585 f->close_section();
3586
9f95a23c 3587 dump_osds(f);
7c673cae
FG
3588
3589 f->open_array_section("osd_xinfo");
3590 for (int i=0; i<get_max_osd(); i++) {
3591 if (exists(i)) {
3592 f->open_object_section("xinfo");
3593 f->dump_int("osd", i);
3594 osd_xinfo[i].dump(f);
3595 f->close_section();
3596 }
3597 }
3598 f->close_section();
3599
3600 f->open_array_section("pg_upmap");
3601 for (auto& p : pg_upmap) {
3602 f->open_object_section("mapping");
3603 f->dump_stream("pgid") << p.first;
3604 f->open_array_section("osds");
3605 for (auto q : p.second) {
3606 f->dump_int("osd", q);
3607 }
3608 f->close_section();
3609 f->close_section();
3610 }
3611 f->close_section();
3612 f->open_array_section("pg_upmap_items");
3613 for (auto& p : pg_upmap_items) {
3614 f->open_object_section("mapping");
3615 f->dump_stream("pgid") << p.first;
3616 f->open_array_section("mappings");
3617 for (auto& q : p.second) {
3618 f->open_object_section("mapping");
3619 f->dump_int("from", q.first);
3620 f->dump_int("to", q.second);
3621 f->close_section();
3622 }
3623 f->close_section();
3624 f->close_section();
3625 }
3626 f->close_section();
3627 f->open_array_section("pg_temp");
31f18b77 3628 pg_temp->dump(f);
7c673cae
FG
3629 f->close_section();
3630
3631 f->open_array_section("primary_temp");
3632 for (const auto &pg : *primary_temp) {
3633 f->dump_stream("pgid") << pg.first;
3634 f->dump_int("osd", pg.second);
3635 }
3636 f->close_section(); // primary_temp
3637
f67539c2
TL
3638 f->open_object_section("blocklist");
3639 for (const auto &addr : blocklist) {
7c673cae
FG
3640 stringstream ss;
3641 ss << addr.first;
3642 f->dump_stream(ss.str().c_str()) << addr.second;
3643 }
3644 f->close_section();
3645
3646 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3647
3648 f->open_array_section("removed_snaps_queue");
3649 for (auto& p : removed_snaps_queue) {
3650 f->open_object_section("pool");
3651 f->dump_int("pool", p.first);
3652 f->open_array_section("snaps");
3653 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3654 f->open_object_section("interval");
3655 f->dump_unsigned("begin", q.get_start());
3656 f->dump_unsigned("length", q.get_len());
3657 f->close_section();
3658 }
3659 f->close_section();
3660 f->close_section();
3661 }
3662 f->close_section();
3663 f->open_array_section("new_removed_snaps");
3664 for (auto& p : new_removed_snaps) {
3665 f->open_object_section("pool");
3666 f->dump_int("pool", p.first);
3667 f->open_array_section("snaps");
3668 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3669 f->open_object_section("interval");
3670 f->dump_unsigned("begin", q.get_start());
3671 f->dump_unsigned("length", q.get_len());
3672 f->close_section();
3673 }
3674 f->close_section();
3675 f->close_section();
3676 }
3677 f->close_section();
3678 f->open_array_section("new_purged_snaps");
3679 for (auto& p : new_purged_snaps) {
3680 f->open_object_section("pool");
3681 f->dump_int("pool", p.first);
3682 f->open_array_section("snaps");
3683 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3684 f->open_object_section("interval");
3685 f->dump_unsigned("begin", q.get_start());
3686 f->dump_unsigned("length", q.get_len());
3687 f->close_section();
3688 }
3689 f->close_section();
3690 f->close_section();
3691 }
3692 f->close_section();
81eedcae
TL
3693 f->open_object_section("crush_node_flags");
3694 for (auto& i : crush_node_flags) {
3695 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3696 : stringify(i.first);
3697 f->open_array_section(s.c_str());
3698 set<string> st;
3699 calc_state_set(i.second, st);
3700 for (auto& j : st) {
3701 f->dump_string("flag", j);
3702 }
3703 f->close_section();
3704 }
3705 f->close_section();
3706 f->open_object_section("device_class_flags");
3707 for (auto& i : device_class_flags) {
3708 const char* class_name = crush->get_class_name(i.first);
3709 string s = class_name ? class_name : stringify(i.first);
3710 f->open_array_section(s.c_str());
3711 set<string> st;
3712 calc_state_set(i.second, st);
3713 for (auto& j : st) {
3714 f->dump_string("flag", j);
3715 }
3716 f->close_section();
3717 }
3718 f->close_section();
f67539c2
TL
3719 f->open_object_section("stretch_mode");
3720 {
3721 f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
3722 f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
3723 f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
3724 f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
3725 f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
3726 }
3727 f->close_section();
7c673cae
FG
3728}
3729
3730void OSDMap::generate_test_instances(list<OSDMap*>& o)
3731{
3732 o.push_back(new OSDMap);
3733
3734 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3735 o.push_back(new OSDMap);
3736 uuid_d fsid;
224ce89b 3737 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae 3738 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
f67539c2 3739 o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
7c673cae
FG
3740 cct->put();
3741}
3742
3743string OSDMap::get_flag_string(unsigned f)
3744{
3745 string s;
7c673cae
FG
3746 if (f & CEPH_OSDMAP_PAUSERD)
3747 s += ",pauserd";
3748 if (f & CEPH_OSDMAP_PAUSEWR)
3749 s += ",pausewr";
3750 if (f & CEPH_OSDMAP_PAUSEREC)
3751 s += ",pauserec";
3752 if (f & CEPH_OSDMAP_NOUP)
3753 s += ",noup";
3754 if (f & CEPH_OSDMAP_NODOWN)
3755 s += ",nodown";
3756 if (f & CEPH_OSDMAP_NOOUT)
3757 s += ",noout";
3758 if (f & CEPH_OSDMAP_NOIN)
3759 s += ",noin";
3760 if (f & CEPH_OSDMAP_NOBACKFILL)
3761 s += ",nobackfill";
3762 if (f & CEPH_OSDMAP_NOREBALANCE)
3763 s += ",norebalance";
3764 if (f & CEPH_OSDMAP_NORECOVER)
3765 s += ",norecover";
3766 if (f & CEPH_OSDMAP_NOSCRUB)
3767 s += ",noscrub";
3768 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3769 s += ",nodeep-scrub";
3770 if (f & CEPH_OSDMAP_NOTIERAGENT)
3771 s += ",notieragent";
11fdf7f2
TL
3772 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3773 s += ",nosnaptrim";
7c673cae
FG
3774 if (f & CEPH_OSDMAP_SORTBITWISE)
3775 s += ",sortbitwise";
3776 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3777 s += ",require_jewel_osds";
3778 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3779 s += ",require_kraken_osds";
3780 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3781 s += ",require_luminous_osds";
c07f9fc5
FG
3782 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3783 s += ",recovery_deletes";
181888fb
FG
3784 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3785 s += ",purged_snapdirs";
f64942e4
AA
3786 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3787 s += ",pglog_hardlimit";
7c673cae
FG
3788 if (s.length())
3789 s.erase(0, 1);
3790 return s;
3791}
3792
3793string OSDMap::get_flag_string() const
3794{
3795 return get_flag_string(flags);
3796}
3797
7c673cae
FG
3798void OSDMap::print_pools(ostream& out) const
3799{
3800 for (const auto &pool : pools) {
3801 std::string name("<unknown>");
3802 const auto &pni = pool_name.find(pool.first);
3803 if (pni != pool_name.end())
3804 name = pni->second;
3805 out << "pool " << pool.first
3806 << " '" << name
3807 << "' " << pool.second << "\n";
3808
3809 for (const auto &snap : pool.second.snaps)
3810 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3811
3812 if (!pool.second.removed_snaps.empty())
3813 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
11fdf7f2
TL
3814 auto p = removed_snaps_queue.find(pool.first);
3815 if (p != removed_snaps_queue.end()) {
3816 out << "\tremoved_snaps_queue " << p->second << "\n";
3817 }
7c673cae
FG
3818 }
3819 out << std::endl;
3820}
3821
9f95a23c
TL
3822void OSDMap::print_osds(ostream& out) const
3823{
3824 for (int i=0; i<get_max_osd(); i++) {
3825 if (exists(i)) {
3826 print_osd(i, out);
3827 }
3828 }
3829}
3830void OSDMap::print_osd(int id, ostream& out) const
3831{
3832 if (!exists(id)) {
3833 return;
3834 }
3835
3836 out << "osd." << id;
3837 out << (is_up(id) ? " up ":" down");
3838 out << (is_in(id) ? " in ":" out");
3839 out << " weight " << get_weightf(id);
3840 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
3841 out << " primary_affinity " << get_primary_affinityf(id);
3842 }
3843 const osd_info_t& info(get_info(id));
3844 out << " " << info;
3845 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
3846 set<string> st;
3847 get_state(id, st);
3848 out << " " << st;
3849 if (!get_uuid(id).is_zero()) {
3850 out << " " << get_uuid(id);
3851 }
3852 out << "\n";
3853}
3854
7c673cae
FG
3855void OSDMap::print(ostream& out) const
3856{
3857 out << "epoch " << get_epoch() << "\n"
3858 << "fsid " << get_fsid() << "\n"
3859 << "created " << get_created() << "\n"
3860 << "modified " << get_modified() << "\n";
3861
3862 out << "flags " << get_flag_string() << "\n";
31f18b77 3863 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
3864 out << "full_ratio " << full_ratio << "\n";
3865 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3866 out << "nearfull_ratio " << nearfull_ratio << "\n";
9f95a23c 3867 if (require_min_compat_client != ceph_release_t::unknown) {
31f18b77 3868 out << "require_min_compat_client "
9f95a23c 3869 << require_min_compat_client << "\n";
7c673cae 3870 }
9f95a23c 3871 out << "min_compat_client " << get_min_compat_client()
31f18b77 3872 << "\n";
9f95a23c
TL
3873 if (require_osd_release > ceph_release_t::unknown) {
3874 out << "require_osd_release " << require_osd_release
224ce89b
WB
3875 << "\n";
3876 }
f67539c2
TL
3877 out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
3878 if (stretch_mode_enabled) {
3879 out << "stretch_bucket_count " << stretch_bucket_count << "\n";
3880 out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
3881 out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
3882 out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
3883 }
7c673cae
FG
3884 if (get_cluster_snapshot().length())
3885 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3886 out << "\n";
3887
3888 print_pools(out);
3889
3890 out << "max_osd " << get_max_osd() << "\n";
9f95a23c 3891 print_osds(out);
7c673cae
FG
3892 out << std::endl;
3893
3894 for (auto& p : pg_upmap) {
3895 out << "pg_upmap " << p.first << " " << p.second << "\n";
3896 }
3897 for (auto& p : pg_upmap_items) {
3898 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3899 }
3900
f67539c2 3901 for (const auto& pg : *pg_temp)
7c673cae
FG
3902 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3903
f67539c2 3904 for (const auto& pg : *primary_temp)
7c673cae
FG
3905 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3906
f67539c2
TL
3907 for (const auto &addr : blocklist)
3908 out << "blocklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
3909}
3910
3911class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3912public:
3913 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
3914
3915 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3916 unsigned f)
c07f9fc5 3917 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3918
3919 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3920 if (!filter) {
3921 return true; // normal case
3922 }
3923 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3924 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3925 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3926 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3927 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3928 return true;
31f18b77 3929 }
c07f9fc5 3930 return false;
31f18b77
FG
3931 }
3932
3933 bool should_dump_empty_bucket() const override {
3934 return !filter;
3935 }
7c673cae 3936
11fdf7f2 3937 void init_table(TextTable *tbl) {
7c673cae 3938 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3939 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3940 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3941 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 3942 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 3943 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3944 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
3945 }
3946 void dump(TextTable *tbl, string& bucket) {
3947 init_table(tbl);
7c673cae 3948
11fdf7f2
TL
3949 if (!bucket.empty()) {
3950 set_root(bucket);
3951 Parent::dump(tbl);
3952 } else {
3953 Parent::dump(tbl);
3954 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3955 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3956 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3957 }
31f18b77 3958 }
7c673cae
FG
3959 }
3960 }
3961
3962protected:
3963 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
3964 const char *c = crush->get_item_class(qi.id);
3965 if (!c)
3966 c = "";
7c673cae 3967 *tbl << qi.id
224ce89b 3968 << c
7c673cae
FG
3969 << weightf_t(qi.weight);
3970
3971 ostringstream name;
3972 for (int k = 0; k < qi.depth; k++)
3973 name << " ";
3974 if (qi.is_bucket()) {
3975 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3976 << crush->get_item_name(qi.id);
3977 } else {
3978 name << "osd." << qi.id;
3979 }
3980 *tbl << name.str();
3981
3982 if (!qi.is_bucket()) {
3983 if (!osdmap->exists(qi.id)) {
3984 *tbl << "DNE"
3985 << 0;
3986 } else {
c07f9fc5
FG
3987 string s;
3988 if (osdmap->is_up(qi.id)) {
3989 s = "up";
3990 } else if (osdmap->is_destroyed(qi.id)) {
3991 s = "destroyed";
3992 } else {
3993 s = "down";
3994 }
3995 *tbl << s
7c673cae
FG
3996 << weightf_t(osdmap->get_weightf(qi.id))
3997 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3998 }
3999 }
4000 *tbl << TextTable::endrow;
4001 }
4002
4003private:
4004 const OSDMap *osdmap;
31f18b77 4005 const unsigned filter;
7c673cae
FG
4006};
4007
4008class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
4009public:
4010 typedef CrushTreeDumper::FormattingDumper Parent;
4011
31f18b77
FG
4012 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4013 unsigned f)
c07f9fc5 4014 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
4015
4016 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
4017 if (!filter) {
4018 return true; // normal case
4019 }
4020 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
4021 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
4022 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
4023 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
4024 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
4025 return true;
31f18b77 4026 }
c07f9fc5 4027 return false;
31f18b77
FG
4028 }
4029
4030 bool should_dump_empty_bucket() const override {
4031 return !filter;
4032 }
7c673cae 4033
11fdf7f2
TL
4034 void dump(Formatter *f, string& bucket) {
4035 if (!bucket.empty()) {
4036 set_root(bucket);
4037 f->open_array_section("nodes");
4038 Parent::dump(f);
4039 f->close_section();
4040 } else {
4041 f->open_array_section("nodes");
4042 Parent::dump(f);
4043 f->close_section();
4044 f->open_array_section("stray");
4045 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4046 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
4047 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
4048 }
4049 f->close_section();
7c673cae 4050 }
7c673cae
FG
4051 }
4052
4053protected:
4054 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
4055 Parent::dump_item_fields(qi, f);
4056 if (!qi.is_bucket())
4057 {
c07f9fc5
FG
4058 string s;
4059 if (osdmap->is_up(qi.id)) {
4060 s = "up";
4061 } else if (osdmap->is_destroyed(qi.id)) {
4062 s = "destroyed";
4063 } else {
4064 s = "down";
4065 }
7c673cae 4066 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 4067 f->dump_string("status", s);
7c673cae
FG
4068 f->dump_float("reweight", osdmap->get_weightf(qi.id));
4069 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
4070 }
4071 }
4072
4073private:
4074 const OSDMap *osdmap;
31f18b77 4075 const unsigned filter;
7c673cae
FG
4076};
4077
11fdf7f2 4078void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 4079{
31f18b77 4080 if (f) {
11fdf7f2 4081 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 4082 } else {
11fdf7f2 4083 ceph_assert(out);
7c673cae 4084 TextTable tbl;
11fdf7f2 4085 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
4086 *out << tbl;
4087 }
4088}
4089
224ce89b 4090void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 4091 const string& prefix, bool extra) const
7c673cae
FG
4092{
4093 if (f) {
7c673cae
FG
4094 f->dump_int("epoch", get_epoch());
4095 f->dump_int("num_osds", get_num_osds());
4096 f->dump_int("num_up_osds", get_num_up_osds());
9f95a23c 4097 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
7c673cae 4098 f->dump_int("num_in_osds", get_num_in_osds());
9f95a23c 4099 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
7c673cae 4100 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
7c673cae 4101 } else {
11fdf7f2 4102 utime_t now = ceph_clock_now();
31f18b77 4103 out << get_num_osds() << " osds: "
11fdf7f2
TL
4104 << get_num_up_osds() << " up";
4105 if (last_up_change != utime_t()) {
4106 out << " (since " << utimespan_str(now - last_up_change) << ")";
4107 }
4108 out << ", " << get_num_in_osds() << " in";
4109 if (last_in_change != utime_t()) {
4110 out << " (since " << utimespan_str(now - last_in_change) << ")";
4111 }
4112 if (extra)
4113 out << "; epoch: e" << get_epoch();
7c673cae
FG
4114 if (get_num_pg_temp())
4115 out << "; " << get_num_pg_temp() << " remapped pgs";
4116 out << "\n";
4117 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4118 if (important_flags)
224ce89b 4119 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
4120 }
4121}
4122
4123void OSDMap::print_oneline_summary(ostream& out) const
4124{
4125 out << "e" << get_epoch() << ": "
31f18b77 4126 << get_num_osds() << " total, "
7c673cae
FG
4127 << get_num_up_osds() << " up, "
4128 << get_num_in_osds() << " in";
7c673cae
FG
4129}
4130
3efd9988 4131bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
4132{
4133 for (const auto &pool : pools) {
3efd9988 4134 if (pool.second.crush_rule == rule_id)
7c673cae
FG
4135 return true;
4136 }
4137 return false;
4138}
4139
3efd9988
FG
4140int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4141 ostream *ss) const
4142{
4143 for (auto& i : pools) {
4144 auto& pool = i.second;
4145 int ruleno = pool.get_crush_rule();
4146 if (!newcrush->rule_exists(ruleno)) {
4147 *ss << "pool " << i.first << " references crush_rule " << ruleno
4148 << " but it is not present";
4149 return -EINVAL;
4150 }
4151 if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
4152 *ss << "rule " << ruleno << " mask ruleset does not match rule id";
4153 return -EINVAL;
4154 }
4155 if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
4156 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4157 return -EINVAL;
4158 }
11fdf7f2
TL
4159 int poolsize = pool.get_size();
4160 if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
4161 poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
4162 *ss << "pool " << i.first << " size " << poolsize << " does not"
3efd9988
FG
4163 << " fall within rule " << ruleno
4164 << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
4165 << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
4166 return -EINVAL;
4167 }
4168 }
4169 return 0;
4170}
4171
224ce89b
WB
4172int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4173 int nosd, int pg_bits, int pgp_bits,
4174 bool default_pool)
7c673cae 4175{
224ce89b
WB
4176 ldout(cct, 10) << "build_simple on " << nosd
4177 << " osds" << dendl;
7c673cae
FG
4178 epoch = e;
4179 set_fsid(fsid);
4180 created = modified = ceph_clock_now();
4181
4182 if (nosd >= 0) {
4183 set_max_osd(nosd);
4184 } else {
4185 // count osds
4186 int maxosd = 0;
11fdf7f2 4187 const auto& conf = cct->_conf;
7c673cae 4188 vector<string> sections;
11fdf7f2 4189 conf.get_all_sections(sections);
7c673cae
FG
4190
4191 for (auto &section : sections) {
4192 if (section.find("osd.") != 0)
4193 continue;
4194
4195 const char *begin = section.c_str() + 4;
4196 char *end = (char*)begin;
4197 int o = strtol(begin, &end, 10);
4198 if (*end != '\0')
4199 continue;
4200
4201 if (o > cct->_conf->mon_max_osd) {
4202 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4203 return -ERANGE;
4204 }
4205
4206 if (o > maxosd)
4207 maxosd = o;
4208 }
4209
4210 set_max_osd(maxosd + 1);
4211 }
4212
7c673cae
FG
4213
4214 stringstream ss;
4215 int r;
4216 if (nosd >= 0)
4217 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4218 else
4219 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4220 ceph_assert(r == 0);
7c673cae
FG
4221
4222 int poolbase = get_max_osd() ? get_max_osd() : 1;
4223
d2e6a577 4224 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
11fdf7f2 4225 ceph_assert(default_replicated_rule >= 0);
7c673cae 4226
224ce89b
WB
4227 if (default_pool) {
4228 // pgp_num <= pg_num
4229 if (pgp_bits > pg_bits)
4230 pgp_bits = pg_bits;
4231
4232 vector<string> pool_names;
4233 pool_names.push_back("rbd");
4234 for (auto &plname : pool_names) {
4235 int64_t pool = ++pool_max;
4236 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4237 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4238 if (cct->_conf->osd_pool_default_flag_hashpspool)
4239 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4240 if (cct->_conf->osd_pool_default_flag_nodelete)
4241 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4242 if (cct->_conf->osd_pool_default_flag_nopgchange)
4243 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4244 if (cct->_conf->osd_pool_default_flag_nosizechange)
4245 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
11fdf7f2
TL
4246 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4247 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4248 pools[pool].size);
224ce89b
WB
4249 pools[pool].crush_rule = default_replicated_rule;
4250 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4251 pools[pool].set_pg_num(poolbase << pg_bits);
4252 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4253 pools[pool].set_pg_num_target(poolbase << pg_bits);
4254 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4255 pools[pool].last_change = epoch;
c07f9fc5
FG
4256 pools[pool].application_metadata.insert(
4257 {pg_pool_t::APPLICATION_NAME_RBD, {}});
9f95a23c
TL
4258 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4259 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4260 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4261 pools[pool].pg_autoscale_mode = m;
4262 } else {
4263 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4264 }
224ce89b
WB
4265 pool_name[pool] = plname;
4266 name_pool[plname] = pool;
4267 }
7c673cae
FG
4268 }
4269
7c673cae
FG
4270 map<string,string> profile_map;
4271 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4272 if (r < 0) {
4273 lderr(cct) << ss.str() << dendl;
4274 return r;
4275 }
4276 set_erasure_code_profile("default", profile_map);
4277 return 0;
4278}
4279
4280int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4281 map<string,string> &profile_map,
4282 ostream *ss)
4283{
11fdf7f2 4284 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4285 *ss,
4286 &profile_map);
4287 return r;
4288}
4289
4290int OSDMap::_build_crush_types(CrushWrapper& crush)
4291{
4292 crush.set_type_name(0, "osd");
4293 crush.set_type_name(1, "host");
4294 crush.set_type_name(2, "chassis");
4295 crush.set_type_name(3, "rack");
4296 crush.set_type_name(4, "row");
4297 crush.set_type_name(5, "pdu");
4298 crush.set_type_name(6, "pod");
4299 crush.set_type_name(7, "room");
4300 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4301 crush.set_type_name(9, "zone");
4302 crush.set_type_name(10, "region");
4303 crush.set_type_name(11, "root");
4304 return 11;
7c673cae
FG
4305}
4306
4307int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4308 int nosd, ostream *ss)
4309{
4310 crush.create();
4311
4312 // root
4313 int root_type = _build_crush_types(crush);
4314 int rootid;
4315 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4316 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4317 ceph_assert(r == 0);
7c673cae
FG
4318 crush.set_item_name(rootid, "default");
4319
f67539c2
TL
4320 map<string,string> loc{
4321 {"host", "localhost"},
4322 {"rack", "localrack"},
4323 {"root", "default"}
4324 };
7c673cae 4325 for (int o=0; o<nosd; o++) {
7c673cae
FG
4326 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4327 char name[32];
4328 snprintf(name, sizeof(name), "osd.%d", o);
4329 crush.insert_item(cct, o, 1.0, name, loc);
4330 }
4331
31f18b77 4332 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4333
4334 crush.finalize();
4335
4336 return 0;
4337}
4338
4339int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4340 CrushWrapper& crush,
4341 ostream *ss)
4342{
11fdf7f2 4343 const auto& conf = cct->_conf;
7c673cae
FG
4344
4345 crush.create();
4346
4347 // root
4348 int root_type = _build_crush_types(crush);
4349 int rootid;
4350 int r = crush.add_bucket(0, 0,
4351 CRUSH_HASH_DEFAULT,
4352 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4353 ceph_assert(r == 0);
7c673cae
FG
4354 crush.set_item_name(rootid, "default");
4355
4356 // add osds
4357 vector<string> sections;
11fdf7f2 4358 conf.get_all_sections(sections);
7c673cae
FG
4359
4360 for (auto &section : sections) {
4361 if (section.find("osd.") != 0)
4362 continue;
4363
4364 const char *begin = section.c_str() + 4;
4365 char *end = (char*)begin;
4366 int o = strtol(begin, &end, 10);
4367 if (*end != '\0')
4368 continue;
4369
4370 string host, rack, row, room, dc, pool;
4371 vector<string> sectiontmp;
4372 sectiontmp.push_back("osd");
4373 sectiontmp.push_back(section);
11fdf7f2
TL
4374 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4375 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4376 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4377 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4378 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4379 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4380
4381 if (host.length() == 0)
4382 host = "unknownhost";
4383 if (rack.length() == 0)
4384 rack = "unknownrack";
4385
4386 map<string,string> loc;
4387 loc["host"] = host;
4388 loc["rack"] = rack;
4389 if (row.size())
4390 loc["row"] = row;
4391 if (room.size())
4392 loc["room"] = room;
4393 if (dc.size())
4394 loc["datacenter"] = dc;
4395 loc["root"] = "default";
4396
4397 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4398 crush.insert_item(cct, o, 1.0, section, loc);
4399 }
4400
31f18b77 4401 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4402
4403 crush.finalize();
4404
4405 return 0;
4406}
4407
4408
31f18b77
FG
4409int OSDMap::build_simple_crush_rules(
4410 CephContext *cct,
4411 CrushWrapper& crush,
4412 const string& root,
4413 ostream *ss)
7c673cae 4414{
31f18b77 4415 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
7c673cae
FG
4416 string failure_domain =
4417 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4418
7c673cae 4419 int r;
31f18b77 4420 r = crush.add_simple_rule_at(
224ce89b 4421 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4422 "firstn", pg_pool_t::TYPE_REPLICATED,
4423 crush_rule, ss);
7c673cae
FG
4424 if (r < 0)
4425 return r;
4426 // do not add an erasure rule by default or else we will implicitly
4427 // require the crush_v2 feature of clients
4428 return 0;
4429}
4430
4431int OSDMap::summarize_mapping_stats(
4432 OSDMap *newmap,
4433 const set<int64_t> *pools,
4434 std::string *out,
4435 Formatter *f) const
4436{
4437 set<int64_t> ls;
4438 if (pools) {
4439 ls = *pools;
4440 } else {
4441 for (auto &p : get_pools())
4442 ls.insert(p.first);
4443 }
4444
4445 unsigned total_pg = 0;
4446 unsigned moved_pg = 0;
4447 vector<unsigned> base_by_osd(get_max_osd(), 0);
4448 vector<unsigned> new_by_osd(get_max_osd(), 0);
4449 for (int64_t pool_id : ls) {
4450 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4451 vector<int> up, up2;
4452 int up_primary;
7c673cae 4453 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4454 pg_t pgid(ps, pool_id);
7c673cae 4455 total_pg += pi->get_size();
31f18b77 4456 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4457 for (int osd : up) {
4458 if (osd >= 0 && osd < get_max_osd())
4459 ++base_by_osd[osd];
4460 }
4461 if (newmap) {
31f18b77 4462 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4463 for (int osd : up2) {
4464 if (osd >= 0 && osd < get_max_osd())
4465 ++new_by_osd[osd];
4466 }
4467 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4468 for (unsigned i=0; i<up.size(); ++i) {
4469 if (up[i] != up2[i]) {
4470 ++moved_pg;
4471 }
4472 }
4473 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4474 for (int osd : up) {
4475 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4476 ++moved_pg;
4477 }
4478 }
4479 } else {
11fdf7f2 4480 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4481 }
4482 }
4483 }
4484 }
4485
4486 unsigned num_up_in = 0;
4487 for (int osd = 0; osd < get_max_osd(); ++osd) {
4488 if (is_up(osd) && is_in(osd))
4489 ++num_up_in;
4490 }
4491 if (!num_up_in) {
4492 return -EINVAL;
4493 }
4494
4495 float avg_pg = (float)total_pg / (float)num_up_in;
4496 float base_stddev = 0, new_stddev = 0;
4497 int min = -1, max = -1;
4498 unsigned min_base_pg = 0, max_base_pg = 0;
4499 unsigned min_new_pg = 0, max_new_pg = 0;
4500 for (int osd = 0; osd < get_max_osd(); ++osd) {
4501 if (is_up(osd) && is_in(osd)) {
4502 float base_diff = (float)base_by_osd[osd] - avg_pg;
4503 base_stddev += base_diff * base_diff;
4504 float new_diff = (float)new_by_osd[osd] - avg_pg;
4505 new_stddev += new_diff * new_diff;
4506 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4507 min = osd;
4508 min_base_pg = base_by_osd[osd];
4509 min_new_pg = new_by_osd[osd];
4510 }
4511 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4512 max = osd;
4513 max_base_pg = base_by_osd[osd];
4514 max_new_pg = new_by_osd[osd];
4515 }
4516 }
4517 }
4518 base_stddev = sqrt(base_stddev / num_up_in);
4519 new_stddev = sqrt(new_stddev / num_up_in);
4520
4521 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4522
4523 ostringstream ss;
4524 if (f)
4525 f->open_object_section("utilization");
4526 if (newmap) {
4527 if (f) {
4528 f->dump_unsigned("moved_pgs", moved_pg);
4529 f->dump_unsigned("total_pgs", total_pg);
4530 } else {
4531 float percent = 0;
4532 if (total_pg)
4533 percent = (float)moved_pg * 100.0 / (float)total_pg;
4534 ss << "moved " << moved_pg << " / " << total_pg
4535 << " (" << percent << "%)\n";
4536 }
4537 }
4538 if (f) {
4539 f->dump_float("avg_pgs", avg_pg);
4540 f->dump_float("std_dev", base_stddev);
4541 f->dump_float("expected_baseline_std_dev", edev);
4542 if (newmap)
4543 f->dump_float("new_std_dev", new_stddev);
4544 } else {
4545 ss << "avg " << avg_pg << "\n";
4546 ss << "stddev " << base_stddev;
4547 if (newmap)
4548 ss << " -> " << new_stddev;
4549 ss << " (expected baseline " << edev << ")\n";
4550 }
4551 if (min >= 0) {
4552 if (f) {
4553 f->dump_unsigned("min_osd", min);
4554 f->dump_unsigned("min_osd_pgs", min_base_pg);
4555 if (newmap)
4556 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4557 } else {
4558 ss << "min osd." << min << " with " << min_base_pg;
4559 if (newmap)
4560 ss << " -> " << min_new_pg;
4561 ss << " pgs (" << (float)min_base_pg / avg_pg;
4562 if (newmap)
4563 ss << " -> " << (float)min_new_pg / avg_pg;
4564 ss << " * mean)\n";
4565 }
4566 }
4567 if (max >= 0) {
4568 if (f) {
4569 f->dump_unsigned("max_osd", max);
4570 f->dump_unsigned("max_osd_pgs", max_base_pg);
4571 if (newmap)
4572 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4573 } else {
4574 ss << "max osd." << max << " with " << max_base_pg;
4575 if (newmap)
4576 ss << " -> " << max_new_pg;
4577 ss << " pgs (" << (float)max_base_pg / avg_pg;
4578 if (newmap)
4579 ss << " -> " << (float)max_new_pg / avg_pg;
4580 ss << " * mean)\n";
4581 }
4582 }
4583 if (f)
4584 f->close_section();
4585 if (out)
4586 *out = ss.str();
4587 return 0;
4588}
4589
7c673cae
FG
4590bool OSDMap::try_pg_upmap(
4591 CephContext *cct,
4592 pg_t pg, ///< pg to potentially remap
4593 const set<int>& overfull, ///< osds we'd want to evacuate
4594 const vector<int>& underfull, ///< osds to move to, in order of preference
92f5a8d4 4595 const vector<int>& more_underfull, ///< more osds only slightly underfull
7c673cae
FG
4596 vector<int> *orig,
4597 vector<int> *out) ///< resulting alternative mapping
4598{
4599 const pg_pool_t *pool = get_pg_pool(pg.pool());
4600 if (!pool)
4601 return false;
31f18b77 4602 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
7c673cae
FG
4603 pool->get_size());
4604 if (rule < 0)
4605 return false;
4606
7c673cae
FG
4607 // make sure there is something there to remap
4608 bool any = false;
4609 for (auto osd : *orig) {
4610 if (overfull.count(osd)) {
4611 any = true;
4612 break;
4613 }
4614 }
4615 if (!any) {
4616 return false;
4617 }
4618
4619 int r = crush->try_remap_rule(
4620 cct,
4621 rule,
4622 pool->get_size(),
4623 overfull, underfull,
92f5a8d4 4624 more_underfull,
7c673cae
FG
4625 *orig,
4626 out);
4627 if (r < 0)
4628 return false;
4629 if (*out == *orig)
4630 return false;
4631 return true;
4632}
4633
4634int OSDMap::calc_pg_upmaps(
4635 CephContext *cct,
92f5a8d4 4636 uint32_t max_deviation,
7c673cae 4637 int max,
a8e16298 4638 const set<int64_t>& only_pools,
7c673cae
FG
4639 OSDMap::Incremental *pending_inc)
4640{
a8e16298 4641 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
7c673cae 4642 OSDMap tmp;
92f5a8d4
TL
4643 // Can't be less than 1 pg
4644 if (max_deviation < 1)
4645 max_deviation = 1;
7c673cae
FG
4646 tmp.deepish_copy_from(*this);
4647 int num_changed = 0;
a8e16298
TL
4648 map<int,set<pg_t>> pgs_by_osd;
4649 int total_pgs = 0;
4650 float osd_weight_total = 0;
4651 map<int,float> osd_weight;
4652 for (auto& i : pools) {
4653 if (!only_pools.empty() && !only_pools.count(i.first))
4654 continue;
4655 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
4656 pg_t pg(ps, i.first);
4657 vector<int> up;
4658 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
4659 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
4660 for (auto osd : up) {
4661 if (osd != CRUSH_ITEM_NONE)
4662 pgs_by_osd[osd].insert(pg);
7c673cae 4663 }
a8e16298
TL
4664 }
4665 total_pgs += i.second.get_size() * i.second.get_pg_num();
4666
4667 map<int,float> pmap;
4668 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
4669 i.second.get_type(),
4670 i.second.get_size());
4671 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
4672 ldout(cct,20) << __func__ << " pool " << i.first
4673 << " ruleno " << ruleno
4674 << " weight-map " << pmap
4675 << dendl;
4676 for (auto p : pmap) {
4677 auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
4678 if (adjusted_weight == 0) {
4679 continue;
31f18b77 4680 }
a8e16298
TL
4681 osd_weight[p.first] += adjusted_weight;
4682 osd_weight_total += adjusted_weight;
7c673cae 4683 }
a8e16298
TL
4684 }
4685 for (auto& i : osd_weight) {
4686 int pgs = 0;
4687 auto p = pgs_by_osd.find(i.first);
4688 if (p != pgs_by_osd.end())
31f18b77 4689 pgs = p->second.size();
a8e16298 4690 else
31f18b77 4691 pgs_by_osd.emplace(i.first, set<pg_t>());
a8e16298 4692 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
31f18b77 4693 << " pgs " << pgs << dendl;
a8e16298
TL
4694 }
4695 if (osd_weight_total == 0) {
4696 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4697 return 0;
4698 }
4699 float pgs_per_weight = total_pgs / osd_weight_total;
4700 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4701 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4702
a8e16298
TL
4703 if (max <= 0) {
4704 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4705 return 0;
4706 }
a8e16298
TL
4707 float stddev = 0;
4708 map<int,float> osd_deviation; // osd, deviation(pgs)
4709 multimap<float,int> deviation_osd; // deviation(pgs), osd
92f5a8d4 4710 float cur_max_deviation = 0;
a8e16298
TL
4711 for (auto& i : pgs_by_osd) {
4712 // make sure osd is still there (belongs to this crush-tree)
4713 ceph_assert(osd_weight.count(i.first));
4714 float target = osd_weight[i.first] * pgs_per_weight;
4715 float deviation = (float)i.second.size() - target;
4716 ldout(cct, 20) << " osd." << i.first
4717 << "\tpgs " << i.second.size()
4718 << "\ttarget " << target
4719 << "\tdeviation " << deviation
4720 << dendl;
4721 osd_deviation[i.first] = deviation;
4722 deviation_osd.insert(make_pair(deviation, i.first));
4723 stddev += deviation * deviation;
92f5a8d4
TL
4724 if (fabsf(deviation) > cur_max_deviation)
4725 cur_max_deviation = fabsf(deviation);
a8e16298 4726 }
92f5a8d4
TL
4727 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4728 if (cur_max_deviation <= max_deviation) {
a8e16298
TL
4729 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4730 << dendl;
4731 return 0;
4732 }
4733 bool skip_overfull = false;
4734 auto aggressive =
11fdf7f2 4735 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
a8e16298 4736 auto local_fallback_retries =
11fdf7f2 4737 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
a8e16298 4738 while (max--) {
92f5a8d4 4739 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
a8e16298
TL
4740 // build overfull and underfull
4741 set<int> overfull;
92f5a8d4
TL
4742 set<int> more_overfull;
4743 bool using_more_overfull = false;
a8e16298 4744 vector<int> underfull;
92f5a8d4
TL
4745 vector<int> more_underfull;
4746 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
4747 ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
4748 if (i->first <= 0)
4749 break;
4750 if (i->first > max_deviation) {
4751 ldout(cct, 30) << " add overfull osd." << i->second << dendl;
a8e16298 4752 overfull.insert(i->second);
92f5a8d4
TL
4753 } else {
4754 more_overfull.insert(i->second);
4755 }
a8e16298 4756 }
7c673cae 4757
92f5a8d4
TL
4758 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
4759 ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
4760 if (i->first >= 0)
a8e16298 4761 break;
92f5a8d4
TL
4762 if (i->first < -(int)max_deviation) {
4763 ldout(cct, 30) << " add underfull osd." << i->second << dendl;
4764 underfull.push_back(i->second);
4765 } else {
4766 more_underfull.push_back(i->second);
4767 }
7c673cae 4768 }
92f5a8d4
TL
4769 if (underfull.empty() && overfull.empty()) {
4770 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
7c673cae 4771 break;
a8e16298 4772 }
92f5a8d4
TL
4773 if (overfull.empty() && !underfull.empty()) {
4774 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
4775 overfull = more_overfull;
4776 using_more_overfull = true;
4777 }
7c673cae 4778
a8e16298
TL
4779 ldout(cct, 10) << " overfull " << overfull
4780 << " underfull " << underfull
4781 << dendl;
4782 set<pg_t> to_skip;
4783 uint64_t local_fallback_retried = 0;
4784
4785 retry:
4786
4787 set<pg_t> to_unmap;
4788 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4789 auto temp_pgs_by_osd = pgs_by_osd;
4790 // always start with fullest, break if we find any changes to make
7c673cae 4791 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
92f5a8d4 4792 if (skip_overfull && !underfull.empty()) {
a8e16298
TL
4793 ldout(cct, 10) << " skipping overfull " << dendl;
4794 break; // fall through to check underfull
4795 }
7c673cae 4796 int osd = p->second;
31f18b77 4797 float deviation = p->first;
9f95a23c
TL
4798 if (deviation < 0) {
4799 ldout(cct, 10) << " hitting underfull osds now"
4800 << " when trying to remap overfull osds"
4801 << dendl;
4802 break;
4803 }
7c673cae 4804 float target = osd_weight[osd] * pgs_per_weight;
92f5a8d4
TL
4805 ldout(cct, 10) << " Overfull search osd." << osd
4806 << " target " << target
4807 << " deviation " << deviation
4808 << dendl;
a8e16298 4809 ceph_assert(target > 0);
92f5a8d4 4810 if (!using_more_overfull && deviation <= max_deviation) {
7c673cae 4811 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4812 << " target " << target
4813 << " deviation " << deviation
92f5a8d4 4814 << " < max deviation " << max_deviation
a8e16298 4815 << dendl;
7c673cae
FG
4816 break;
4817 }
7c673cae 4818
a8e16298
TL
4819 vector<pg_t> pgs;
4820 pgs.reserve(pgs_by_osd[osd].size());
4821 for (auto& pg : pgs_by_osd[osd]) {
4822 if (to_skip.count(pg))
4823 continue;
4824 pgs.push_back(pg);
4825 }
4826 if (aggressive) {
4827 // shuffle PG list so they all get equal (in)attention
4828 std::random_device rd;
4829 std::default_random_engine rng{rd()};
4830 std::shuffle(pgs.begin(), pgs.end(), rng);
4831 }
7c673cae
FG
4832 // look for remaps we can un-remap
4833 for (auto pg : pgs) {
4834 auto p = tmp.pg_upmap_items.find(pg);
a8e16298
TL
4835 if (p == tmp.pg_upmap_items.end())
4836 continue;
4837 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4838 for (auto q : p->second) {
4839 if (q.second == osd) {
4840 ldout(cct, 10) << " will try dropping existing"
4841 << " remapping pair "
4842 << q.first << " -> " << q.second
4843 << " which remapped " << pg
4844 << " into overfull osd." << osd
4845 << dendl;
4846 temp_pgs_by_osd[q.second].erase(pg);
4847 temp_pgs_by_osd[q.first].insert(pg);
4848 } else {
4849 new_upmap_items.push_back(q);
4850 }
4851 }
4852 if (new_upmap_items.empty()) {
4853 // drop whole item
4854 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4855 << " remapped " << pg << " into overfull osd." << osd
4856 << ", will try cancelling it entirely"
4857 << dendl;
4858 to_unmap.insert(pg);
4859 goto test_change;
4860 } else if (new_upmap_items.size() != p->second.size()) {
4861 // drop single remapping pair, updating
4862 ceph_assert(new_upmap_items.size() < p->second.size());
4863 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4864 << " remapped " << pg << " into overfull osd." << osd
4865 << ", new_pg_upmap_items now " << new_upmap_items
4866 << dendl;
4867 to_upmap[pg] = new_upmap_items;
4868 goto test_change;
4869 }
4870 }
7c673cae 4871
a8e16298 4872 // try upmap
7c673cae 4873 for (auto pg : pgs) {
a8e16298
TL
4874 auto temp_it = tmp.pg_upmap.find(pg);
4875 if (temp_it != tmp.pg_upmap.end()) {
4876 // leave pg_upmap alone
4877 // it must be specified by admin since balancer does not
4878 // support pg_upmap yet
4879 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4880 << temp_it->second << ", skipping"
4881 << dendl;
7c673cae
FG
4882 continue;
4883 }
a8e16298
TL
4884 auto pg_pool_size = tmp.get_pg_pool_size(pg);
4885 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4886 set<int> existing;
4887 auto it = tmp.pg_upmap_items.find(pg);
4888 if (it != tmp.pg_upmap_items.end() &&
4889 it->second.size() >= (size_t)pg_pool_size) {
4890 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4891 << it->second << ", skipping"
4892 << dendl;
4893 continue;
4894 } else if (it != tmp.pg_upmap_items.end()) {
4895 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4896 << it->second
4897 << dendl;
4898 new_upmap_items = it->second;
4899 // build existing too (for dedup)
4900 for (auto i : it->second) {
4901 existing.insert(i.first);
4902 existing.insert(i.second);
4903 }
4904 // fall through
4905 // to see if we can append more remapping pairs
4906 }
4907 ldout(cct, 10) << " trying " << pg << dendl;
494da23a
TL
4908 vector<int> raw, orig, out;
4909 tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
92f5a8d4 4910 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
7c673cae
FG
4911 continue;
4912 }
a8e16298 4913 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4914 if (orig.size() != out.size()) {
4915 continue;
4916 }
a8e16298 4917 ceph_assert(orig != out);
92f5a8d4
TL
4918 int pos = -1;
4919 float max_dev = 0;
7c673cae 4920 for (unsigned i = 0; i < out.size(); ++i) {
a8e16298
TL
4921 if (orig[i] == out[i])
4922 continue; // skip invalid remappings
4923 if (existing.count(orig[i]) || existing.count(out[i]))
4924 continue; // we want new remappings only!
92f5a8d4
TL
4925 if (osd_deviation[orig[i]] > max_dev) {
4926 max_dev = osd_deviation[orig[i]];
4927 pos = i;
4928 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
4929 }
4930 }
4931 if (pos != -1) {
4932 int i = pos;
a8e16298
TL
4933 ldout(cct, 10) << " will try adding new remapping pair "
4934 << orig[i] << " -> " << out[i] << " for " << pg
92f5a8d4 4935 << (orig[i] != osd ? " NOT selected osd" : "")
a8e16298
TL
4936 << dendl;
4937 existing.insert(orig[i]);
4938 existing.insert(out[i]);
4939 temp_pgs_by_osd[orig[i]].erase(pg);
4940 temp_pgs_by_osd[out[i]].insert(pg);
4941 ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
4942 new_upmap_items.push_back(make_pair(orig[i], out[i]));
4943 // append new remapping pairs slowly
4944 // This way we can make sure that each tiny change will
4945 // definitely make distribution of PGs converging to
4946 // the perfect status.
4947 to_upmap[pg] = new_upmap_items;
4948 goto test_change;
7c673cae 4949 }
a8e16298
TL
4950 }
4951 }
7c673cae 4952
a8e16298
TL
4953 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4954 ldout(cct, 10) << " failed to find any changes for overfull osds"
4955 << dendl;
4956 for (auto& p : deviation_osd) {
4957 if (std::find(underfull.begin(), underfull.end(), p.second) ==
4958 underfull.end())
4959 break;
4960 int osd = p.second;
4961 float deviation = p.first;
4962 float target = osd_weight[osd] * pgs_per_weight;
4963 ceph_assert(target > 0);
92f5a8d4
TL
4964 if (fabsf(deviation) < max_deviation) {
4965 // respect max_deviation too
a8e16298
TL
4966 ldout(cct, 10) << " osd." << osd
4967 << " target " << target
4968 << " deviation " << deviation
92f5a8d4
TL
4969 << " -> absolute " << fabsf(deviation)
4970 << " < max " << max_deviation
a8e16298
TL
4971 << dendl;
4972 break;
4973 }
4974 // look for remaps we can un-remap
4975 vector<pair<pg_t,
4976 mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
4977 candidates.reserve(tmp.pg_upmap_items.size());
4978 for (auto& i : tmp.pg_upmap_items) {
4979 if (to_skip.count(i.first))
4980 continue;
4981 if (!only_pools.empty() && !only_pools.count(i.first.pool()))
4982 continue;
4983 candidates.push_back(make_pair(i.first, i.second));
4984 }
4985 if (aggressive) {
4986 // shuffle candidates so they all get equal (in)attention
4987 std::random_device rd;
4988 std::default_random_engine rng{rd()};
4989 std::shuffle(candidates.begin(), candidates.end(), rng);
4990 }
4991 for (auto& i : candidates) {
4992 auto pg = i.first;
4993 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4994 for (auto& j : i.second) {
4995 if (j.first == osd) {
4996 ldout(cct, 10) << " will try dropping existing"
4997 << " remapping pair "
4998 << j.first << " -> " << j.second
4999 << " which remapped " << pg
5000 << " out from underfull osd." << osd
5001 << dendl;
5002 temp_pgs_by_osd[j.second].erase(pg);
5003 temp_pgs_by_osd[j.first].insert(pg);
5004 } else {
5005 new_upmap_items.push_back(j);
5006 }
5007 }
5008 if (new_upmap_items.empty()) {
5009 // drop whole item
5010 ldout(cct, 10) << " existing pg_upmap_items " << i.second
5011 << " remapped " << pg
5012 << " out from underfull osd." << osd
5013 << ", will try cancelling it entirely"
5014 << dendl;
5015 to_unmap.insert(pg);
5016 goto test_change;
5017 } else if (new_upmap_items.size() != i.second.size()) {
5018 // drop single remapping pair, updating
5019 ceph_assert(new_upmap_items.size() < i.second.size());
5020 ldout(cct, 10) << " existing pg_upmap_items " << i.second
5021 << " remapped " << pg
5022 << " out from underfull osd." << osd
5023 << ", new_pg_upmap_items now " << new_upmap_items
5024 << dendl;
5025 to_upmap[pg] = new_upmap_items;
5026 goto test_change;
5027 }
5028 }
7c673cae 5029 }
a8e16298
TL
5030
5031 ceph_assert(!(to_unmap.size() || to_upmap.size()));
5032 ldout(cct, 10) << " failed to find any changes for underfull osds"
5033 << dendl;
5034 if (!aggressive) {
5035 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
5036 break;
5037 } else if (!skip_overfull) {
5038 // safe to quit because below here we know
5039 // we've done checking both overfull and underfull osds..
5040 ldout(cct, 10) << " break due to not being able to find any"
5041 << " further optimizations"
5042 << dendl;
7c673cae
FG
5043 break;
5044 }
a8e16298
TL
5045 // restart with fullest and do exhaustive searching
5046 skip_overfull = false;
5047 continue;
5048
5049 test_change:
5050
5051 // test change, apply if change is good
5052 ceph_assert(to_unmap.size() || to_upmap.size());
5053 float new_stddev = 0;
5054 map<int,float> temp_osd_deviation;
5055 multimap<float,int> temp_deviation_osd;
92f5a8d4 5056 float cur_max_deviation = 0;
a8e16298
TL
5057 for (auto& i : temp_pgs_by_osd) {
5058 // make sure osd is still there (belongs to this crush-tree)
5059 ceph_assert(osd_weight.count(i.first));
5060 float target = osd_weight[i.first] * pgs_per_weight;
5061 float deviation = (float)i.second.size() - target;
5062 ldout(cct, 20) << " osd." << i.first
5063 << "\tpgs " << i.second.size()
5064 << "\ttarget " << target
5065 << "\tdeviation " << deviation
5066 << dendl;
5067 temp_osd_deviation[i.first] = deviation;
5068 temp_deviation_osd.insert(make_pair(deviation, i.first));
92f5a8d4
TL
5069 new_stddev += deviation * deviation;
5070 if (fabsf(deviation) > cur_max_deviation)
5071 cur_max_deviation = fabsf(deviation);
a8e16298
TL
5072 }
5073 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
5074 if (new_stddev >= stddev) {
5075 if (!aggressive) {
5076 ldout(cct, 10) << " break because stddev is not decreasing"
5077 << " and aggressive mode is not enabled"
5078 << dendl;
5079 break;
5080 }
5081 local_fallback_retried++;
5082 if (local_fallback_retried >= local_fallback_retries) {
5083 // does not make progress
5084 // flip *skip_overfull* so both overfull and underfull
5085 // get equal (in)attention
5086 skip_overfull = !skip_overfull;
5087 ldout(cct, 10) << " hit local_fallback_retries "
5088 << local_fallback_retries
5089 << dendl;
5090 continue;
5091 }
5092 for (auto& i : to_unmap)
5093 to_skip.insert(i);
5094 for (auto& i : to_upmap)
5095 to_skip.insert(i.first);
5096 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
5097 << " to_skip " << to_skip
5098 << dendl;
5099 goto retry;
5100 }
5101
5102 // ready to go
5103 ceph_assert(new_stddev < stddev);
5104 stddev = new_stddev;
5105 pgs_by_osd = temp_pgs_by_osd;
5106 osd_deviation = temp_osd_deviation;
5107 deviation_osd = temp_deviation_osd;
5108 for (auto& i : to_unmap) {
5109 ldout(cct, 10) << " unmap pg " << i << dendl;
5110 ceph_assert(tmp.pg_upmap_items.count(i));
5111 tmp.pg_upmap_items.erase(i);
5112 pending_inc->old_pg_upmap_items.insert(i);
5113 ++num_changed;
5114 }
5115 for (auto& i : to_upmap) {
5116 ldout(cct, 10) << " upmap pg " << i.first
5117 << " new pg_upmap_items " << i.second
5118 << dendl;
5119 tmp.pg_upmap_items[i.first] = i.second;
5120 pending_inc->new_pg_upmap_items[i.first] = i.second;
5121 ++num_changed;
5122 }
92f5a8d4
TL
5123 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5124 if (cur_max_deviation <= max_deviation) {
5125 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
5126 << dendl;
5127 break;
5128 }
7c673cae 5129 }
a8e16298 5130 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
5131 return num_changed;
5132}
31f18b77
FG
5133
5134int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
5135{
5136 return crush->get_leaves(name, osds);
5137}
5138
3efd9988
FG
5139// get pools whose crush rules might reference the given osd
5140void OSDMap::get_pool_ids_by_osd(CephContext *cct,
5141 int osd,
5142 set<int64_t> *pool_ids) const
5143{
11fdf7f2 5144 ceph_assert(pool_ids);
3efd9988
FG
5145 set<int> raw_rules;
5146 int r = crush->get_rules_by_osd(osd, &raw_rules);
5147 if (r < 0) {
5148 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
5149 << dendl;
11fdf7f2 5150 ceph_assert(r >= 0);
3efd9988
FG
5151 }
5152 set<int> rules;
5153 for (auto &i: raw_rules) {
5154 // exclude any dead rule
5155 if (crush_rule_in_use(i)) {
5156 rules.insert(i);
5157 }
5158 }
5159 for (auto &r: rules) {
5160 get_pool_ids_by_rule(r, pool_ids);
5161 }
5162}
5163
31f18b77
FG
5164template <typename F>
5165class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
5166public:
5167 typedef CrushTreeDumper::Dumper<F> Parent;
5168
5169 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2 5170 const PGMap& pgmap_, bool tree_,
9f95a23c 5171 const string& filter) :
c07f9fc5 5172 Parent(crush, osdmap_->get_pool_names()),
31f18b77 5173 osdmap(osdmap_),
11fdf7f2 5174 pgmap(pgmap_),
31f18b77 5175 tree(tree_),
31f18b77
FG
5176 min_var(-1),
5177 max_var(-1),
5178 stddev(0),
5179 sum(0) {
9f95a23c
TL
5180 if (osdmap->crush->name_exists(filter)) {
5181 // filter by crush node
5182 auto item_id = osdmap->crush->get_item_id(filter);
11fdf7f2
TL
5183 allowed.insert(item_id);
5184 osdmap->crush->get_all_children(item_id, &allowed);
9f95a23c
TL
5185 } else if (osdmap->crush->class_exists(filter)) {
5186 // filter by device class
5187 class_id = osdmap->crush->get_class_id(filter);
5188 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
5189 pool_id >= 0) {
5190 // filter by pool
5191 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
5192 set<int> roots;
5193 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
5194 allowed = roots;
5195 for (auto r : roots)
5196 osdmap->crush->get_all_children(r, &allowed);
11fdf7f2
TL
5197 }
5198 average_util = average_utilization();
31f18b77
FG
5199 }
5200
5201protected:
11fdf7f2
TL
5202
5203 bool should_dump(int id) const {
5204 if (!allowed.empty() && !allowed.count(id)) // filter by name
5205 return false;
9f95a23c
TL
5206 if (id >= 0 && class_id >= 0) {
5207 auto item_class_id = osdmap->crush->get_item_class_id(id);
5208 if (item_class_id < 0 || // not bound to a class yet
5209 item_class_id != class_id) // or already bound to a different class
11fdf7f2
TL
5210 return false;
5211 }
5212 return true;
5213 }
5214
5215 set<int> get_dumped_osds() {
9f95a23c 5216 if (allowed.empty() && class_id < 0) {
11fdf7f2
TL
5217 // old way, all
5218 return {};
5219 }
5220 return dumped_osds;
5221 }
5222
31f18b77
FG
5223 void dump_stray(F *f) {
5224 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5225 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 5226 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
5227 }
5228 }
5229
5230 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
f67539c2 5231 if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
31f18b77 5232 return;
11fdf7f2
TL
5233 if (!should_dump(qi.id))
5234 return;
31f18b77 5235
11fdf7f2
TL
5236 if (!qi.is_bucket())
5237 dumped_osds.insert(qi.id);
31f18b77 5238 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
5239 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5240 kb_used_meta = 0, kb_avail = 0;
31f18b77 5241 double util = 0;
11fdf7f2
TL
5242 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5243 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
5244 if (kb_used && kb)
5245 util = 100.0 * (double)kb_used / (double)kb;
5246
5247 double var = 1.0;
5248 if (average_util)
5249 var = util / average_util;
5250
11fdf7f2 5251 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 5252
11fdf7f2
TL
5253 dump_item(qi, reweight, kb, kb_used,
5254 kb_used_data, kb_used_omap, kb_used_meta,
5255 kb_avail, util, var, num_pgs, f);
31f18b77
FG
5256
5257 if (!qi.is_bucket() && reweight > 0) {
5258 if (min_var < 0 || var < min_var)
5259 min_var = var;
5260 if (max_var < 0 || var > max_var)
5261 max_var = var;
5262
5263 double dev = util - average_util;
5264 dev *= dev;
5265 stddev += reweight * dev;
5266 sum += reweight;
5267 }
5268 }
5269
5270 virtual void dump_item(const CrushTreeDumper::Item &qi,
5271 float &reweight,
5272 int64_t kb,
5273 int64_t kb_used,
11fdf7f2
TL
5274 int64_t kb_used_data,
5275 int64_t kb_used_omap,
5276 int64_t kb_used_meta,
31f18b77
FG
5277 int64_t kb_avail,
5278 double& util,
5279 double& var,
5280 const size_t num_pgs,
5281 F *f) = 0;
5282
5283 double dev() {
5284 return sum > 0 ? sqrt(stddev / sum) : 0;
5285 }
5286
5287 double average_utilization() {
5288 int64_t kb = 0, kb_used = 0;
5289 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
5290 if (!osdmap->exists(i) ||
5291 osdmap->get_weight(i) == 0 ||
5292 !should_dump(i))
31f18b77 5293 continue;
11fdf7f2
TL
5294 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5295 kb_avail_i;
5296 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5297 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
5298 kb += kb_i;
5299 kb_used += kb_used_i;
5300 }
5301 }
5302 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5303 }
5304
5305 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5306 int64_t* kb_used_data,
5307 int64_t* kb_used_omap,
5308 int64_t* kb_used_meta,
31f18b77 5309 int64_t* kb_avail) const {
11fdf7f2 5310 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 5311 if (!p) return false;
11fdf7f2
TL
5312 *kb = p->statfs.kb();
5313 *kb_used = p->statfs.kb_used_raw();
5314 *kb_used_data = p->statfs.kb_used_data();
5315 *kb_used_omap = p->statfs.kb_used_omap();
5316 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5317 *kb_avail = p->statfs.kb_avail();
5318
f67539c2 5319 return true;
31f18b77
FG
5320 }
5321
5322 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5323 int64_t* kb_used_data,
5324 int64_t* kb_used_omap,
5325 int64_t* kb_used_meta,
31f18b77
FG
5326 int64_t* kb_avail) const {
5327 if (id >= 0) {
11fdf7f2 5328 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
5329 *kb = 0;
5330 *kb_used = 0;
11fdf7f2
TL
5331 *kb_used_data = 0;
5332 *kb_used_omap = 0;
5333 *kb_used_meta = 0;
31f18b77
FG
5334 *kb_avail = 0;
5335 return true;
5336 }
11fdf7f2
TL
5337 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5338 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
5339 }
5340
5341 *kb = 0;
5342 *kb_used = 0;
11fdf7f2
TL
5343 *kb_used_data = 0;
5344 *kb_used_omap = 0;
5345 *kb_used_meta = 0;
31f18b77
FG
5346 *kb_avail = 0;
5347
5348 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5349 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
5350 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5351 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5352 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5353 &kb_used_data_i, &kb_used_omap_i,
5354 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
5355 return false;
5356 *kb += kb_i;
5357 *kb_used += kb_used_i;
11fdf7f2
TL
5358 *kb_used_data += kb_used_data_i;
5359 *kb_used_omap += kb_used_omap_i;
5360 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
5361 *kb_avail += kb_avail_i;
5362 }
f67539c2 5363 return true;
31f18b77
FG
5364 }
5365
5366protected:
5367 const OSDMap *osdmap;
11fdf7f2 5368 const PGMap& pgmap;
31f18b77
FG
5369 bool tree;
5370 double average_util;
5371 double min_var;
5372 double max_var;
5373 double stddev;
5374 double sum;
9f95a23c 5375 int class_id = -1;
11fdf7f2
TL
5376 set<int> allowed;
5377 set<int> dumped_osds;
31f18b77
FG
5378};
5379
5380
5381class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5382public:
5383 typedef OSDUtilizationDumper<TextTable> Parent;
5384
5385 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5386 const PGMap& pgmap, bool tree,
9f95a23c
TL
5387 const string& filter) :
5388 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5389
5390 void dump(TextTable *tbl) {
5391 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 5392 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5393 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5394 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5395 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
5396 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5397 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5398 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5399 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5400 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5401 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5402 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5403 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 5404 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5405 if (tree)
5406 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5407
5408 Parent::dump(tbl);
5409
5410 dump_stray(tbl);
5411
11fdf7f2 5412 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
5413 *tbl << ""
5414 << ""
5415 << "" << "TOTAL"
11fdf7f2
TL
5416 << byte_u_t(sum.statfs.total)
5417 << byte_u_t(sum.statfs.get_used_raw())
5418 << byte_u_t(sum.statfs.allocated)
5419 << byte_u_t(sum.statfs.omap_allocated)
5420 << byte_u_t(sum.statfs.internal_metadata)
5421 << byte_u_t(sum.statfs.available)
31f18b77
FG
5422 << lowprecision_t(average_util)
5423 << ""
5424 << TextTable::endrow;
5425 }
5426
5427protected:
5428 struct lowprecision_t {
5429 float v;
5430 explicit lowprecision_t(float _v) : v(_v) {}
5431 };
5432 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5433
5434 using OSDUtilizationDumper<TextTable>::dump_item;
5435 void dump_item(const CrushTreeDumper::Item &qi,
5436 float &reweight,
5437 int64_t kb,
5438 int64_t kb_used,
11fdf7f2
TL
5439 int64_t kb_used_data,
5440 int64_t kb_used_omap,
5441 int64_t kb_used_meta,
31f18b77
FG
5442 int64_t kb_avail,
5443 double& util,
5444 double& var,
5445 const size_t num_pgs,
5446 TextTable *tbl) override {
224ce89b
WB
5447 const char *c = crush->get_item_class(qi.id);
5448 if (!c)
5449 c = "";
31f18b77 5450 *tbl << qi.id
224ce89b 5451 << c
31f18b77
FG
5452 << weightf_t(qi.weight)
5453 << weightf_t(reweight)
1adf2230
AA
5454 << byte_u_t(kb << 10)
5455 << byte_u_t(kb_used << 10)
11fdf7f2
TL
5456 << byte_u_t(kb_used_data << 10)
5457 << byte_u_t(kb_used_omap << 10)
5458 << byte_u_t(kb_used_meta << 10)
1adf2230 5459 << byte_u_t(kb_avail << 10)
31f18b77
FG
5460 << lowprecision_t(util)
5461 << lowprecision_t(var);
5462
5463 if (qi.is_bucket()) {
5464 *tbl << "-";
11fdf7f2 5465 *tbl << "";
31f18b77
FG
5466 } else {
5467 *tbl << num_pgs;
11fdf7f2
TL
5468 if (osdmap->is_up(qi.id)) {
5469 *tbl << "up";
5470 } else if (osdmap->is_destroyed(qi.id)) {
5471 *tbl << "destroyed";
5472 } else {
5473 *tbl << "down";
5474 }
31f18b77
FG
5475 }
5476
5477 if (tree) {
5478 ostringstream name;
5479 for (int k = 0; k < qi.depth; k++)
5480 name << " ";
5481 if (qi.is_bucket()) {
5482 int type = crush->get_bucket_type(qi.id);
5483 name << crush->get_type_name(type) << " "
5484 << crush->get_item_name(qi.id);
5485 } else {
5486 name << "osd." << qi.id;
5487 }
5488 *tbl << name.str();
5489 }
5490
5491 *tbl << TextTable::endrow;
5492 }
5493
5494public:
5495 string summary() {
5496 ostringstream out;
5497 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5498 << "/" << lowprecision_t(max_var) << " "
5499 << "STDDEV: " << lowprecision_t(dev());
5500 return out.str();
5501 }
5502};
5503
5504ostream& operator<<(ostream& out,
5505 const OSDUtilizationPlainDumper::lowprecision_t& v)
5506{
5507 if (v.v < -0.01) {
5508 return out << "-";
5509 } else if (v.v < 0.001) {
5510 return out << "0";
5511 } else {
5512 std::streamsize p = out.precision();
5513 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5514 }
5515}
5516
5517class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5518public:
5519 typedef OSDUtilizationDumper<Formatter> Parent;
5520
5521 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5522 const PGMap& pgmap, bool tree,
9f95a23c
TL
5523 const string& filter) :
5524 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5525
5526 void dump(Formatter *f) {
5527 f->open_array_section("nodes");
5528 Parent::dump(f);
5529 f->close_section();
5530
5531 f->open_array_section("stray");
5532 dump_stray(f);
5533 f->close_section();
5534 }
5535
5536protected:
5537 using OSDUtilizationDumper<Formatter>::dump_item;
5538 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
5539 float &reweight,
5540 int64_t kb,
5541 int64_t kb_used,
5542 int64_t kb_used_data,
5543 int64_t kb_used_omap,
5544 int64_t kb_used_meta,
5545 int64_t kb_avail,
5546 double& util,
5547 double& var,
5548 const size_t num_pgs,
5549 Formatter *f) override {
31f18b77 5550 f->open_object_section("item");
c07f9fc5 5551 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
5552 f->dump_float("reweight", reweight);
5553 f->dump_int("kb", kb);
5554 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
5555 f->dump_int("kb_used_data", kb_used_data);
5556 f->dump_int("kb_used_omap", kb_used_omap);
5557 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
5558 f->dump_int("kb_avail", kb_avail);
5559 f->dump_float("utilization", util);
5560 f->dump_float("var", var);
5561 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
5562 if (!qi.is_bucket()) {
5563 if (osdmap->is_up(qi.id)) {
5564 f->dump_string("status", "up");
5565 } else if (osdmap->is_destroyed(qi.id)) {
5566 f->dump_string("status", "destroyed");
5567 } else {
5568 f->dump_string("status", "down");
5569 }
5570 }
31f18b77
FG
5571 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5572 f->close_section();
5573 }
5574
5575public:
5576 void summary(Formatter *f) {
5577 f->open_object_section("summary");
11fdf7f2
TL
5578 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5579 auto& s = sum.statfs;
5580
5581 f->dump_int("total_kb", s.kb());
5582 f->dump_int("total_kb_used", s.kb_used_raw());
5583 f->dump_int("total_kb_used_data", s.kb_used_data());
5584 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5585 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5586 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
5587 f->dump_float("average_utilization", average_util);
5588 f->dump_float("min_var", min_var);
5589 f->dump_float("max_var", max_var);
5590 f->dump_float("dev", dev());
5591 f->close_section();
5592 }
5593};
5594
5595void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
5596 const PGMap& pgmap,
5597 ostream& out,
5598 Formatter *f,
5599 bool tree,
9f95a23c 5600 const string& filter)
31f18b77
FG
5601{
5602 const CrushWrapper *crush = osdmap.crush.get();
5603 if (f) {
5604 f->open_object_section("df");
9f95a23c 5605 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5606 d.dump(f);
5607 d.summary(f);
5608 f->close_section();
5609 f->flush(out);
5610 } else {
9f95a23c 5611 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5612 TextTable tbl;
5613 d.dump(&tbl);
5614 out << tbl << d.summary() << "\n";
5615 }
5616}
224ce89b 5617
92f5a8d4
TL
5618void OSDMap::check_health(CephContext *cct,
5619 health_check_map_t *checks) const
224ce89b
WB
5620{
5621 int num_osds = get_num_osds();
5622
5623 // OSD_DOWN
5624 // OSD_$subtree_DOWN
5625 // OSD_ORPHAN
5626 if (num_osds >= 0) {
5627 int num_in_osds = 0;
5628 int num_down_in_osds = 0;
5629 set<int> osds;
5630 set<int> down_in_osds;
5631 set<int> up_in_osds;
5632 set<int> subtree_up;
5633 unordered_map<int, set<int> > subtree_type_down;
5634 unordered_map<int, int> num_osds_subtree;
5635 int max_type = crush->get_max_type_id();
5636
5637 for (int i = 0; i < get_max_osd(); i++) {
5638 if (!exists(i)) {
5639 if (crush->item_exists(i)) {
5640 osds.insert(i);
5641 }
5642 continue;
5643 }
f67539c2 5644 if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
224ce89b
WB
5645 continue;
5646 ++num_in_osds;
5647 if (down_in_osds.count(i) || up_in_osds.count(i))
5648 continue;
5649 if (!is_up(i)) {
5650 down_in_osds.insert(i);
5651 int parent_id = 0;
5652 int current = i;
5653 for (int type = 0; type <= max_type; type++) {
5654 if (!crush->get_type_name(type))
5655 continue;
5656 int r = crush->get_immediate_parent_id(current, &parent_id);
5657 if (r == -ENOENT)
5658 break;
5659 // break early if this parent is already marked as up
5660 if (subtree_up.count(parent_id))
5661 break;
5662 type = crush->get_bucket_type(parent_id);
5663 if (!subtree_type_is_down(
92f5a8d4 5664 cct, parent_id, type,
224ce89b
WB
5665 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
5666 break;
5667 current = parent_id;
5668 }
5669 }
5670 }
5671
5672 // calculate the number of down osds in each down subtree and
5673 // store it in num_osds_subtree
5674 for (int type = 1; type <= max_type; type++) {
5675 if (!crush->get_type_name(type))
5676 continue;
5677 for (auto j = subtree_type_down[type].begin();
5678 j != subtree_type_down[type].end();
5679 ++j) {
5680 list<int> children;
5681 int num = 0;
5682 int num_children = crush->get_children(*j, &children);
5683 if (num_children == 0)
5684 continue;
5685 for (auto l = children.begin(); l != children.end(); ++l) {
5686 if (*l >= 0) {
5687 ++num;
5688 } else if (num_osds_subtree[*l] > 0) {
5689 num = num + num_osds_subtree[*l];
5690 }
5691 }
5692 num_osds_subtree[*j] = num;
5693 }
5694 }
5695 num_down_in_osds = down_in_osds.size();
11fdf7f2 5696 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
5697 if (num_down_in_osds > 0) {
5698 // summary of down subtree types and osds
5699 for (int type = max_type; type > 0; type--) {
5700 if (!crush->get_type_name(type))
5701 continue;
5702 if (subtree_type_down[type].size() > 0) {
5703 ostringstream ss;
5704 ss << subtree_type_down[type].size() << " "
5705 << crush->get_type_name(type);
5706 if (subtree_type_down[type].size() > 1) {
5707 ss << "s";
5708 }
5709 int sum_down_osds = 0;
5710 for (auto j = subtree_type_down[type].begin();
5711 j != subtree_type_down[type].end();
5712 ++j) {
5713 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
5714 }
5715 ss << " (" << sum_down_osds << " osds) down";
5716 string err = string("OSD_") +
5717 string(crush->get_type_name(type)) + "_DOWN";
5718 boost::to_upper(err);
9f95a23c
TL
5719 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
5720 subtree_type_down[type].size());
224ce89b
WB
5721 for (auto j = subtree_type_down[type].rbegin();
5722 j != subtree_type_down[type].rend();
5723 ++j) {
5724 ostringstream ss;
5725 ss << crush->get_type_name(type);
5726 ss << " ";
5727 ss << crush->get_item_name(*j);
5728 // at the top level, do not print location
5729 if (type != max_type) {
5730 ss << " (";
5731 ss << crush->get_full_location_ordered_string(*j);
5732 ss << ")";
5733 }
5734 int num = num_osds_subtree[*j];
5735 ss << " (" << num << " osds)";
5736 ss << " is down";
5737 d.detail.push_back(ss.str());
5738 }
5739 }
5740 }
5741 ostringstream ss;
5742 ss << down_in_osds.size() << " osds down";
9f95a23c
TL
5743 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
5744 down_in_osds.size());
224ce89b
WB
5745 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
5746 ostringstream ss;
5747 ss << "osd." << *it << " (";
5748 ss << crush->get_full_location_ordered_string(*it);
5749 ss << ") is down";
5750 d.detail.push_back(ss.str());
5751 }
5752 }
5753
5754 if (!osds.empty()) {
5755 ostringstream ss;
5756 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
9f95a23c
TL
5757 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
5758 osds.size());
224ce89b
WB
5759 for (auto osd : osds) {
5760 ostringstream ss;
5761 ss << "osd." << osd << " exists in crush map but not in osdmap";
5762 d.detail.push_back(ss.str());
5763 }
5764 }
5765 }
5766
eafe8130
TL
5767 std::list<std::string> scrub_messages;
5768 bool noscrub = false, nodeepscrub = false;
5769 for (const auto &p : pools) {
5770 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
5771 ostringstream ss;
5772 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
5773 scrub_messages.push_back(ss.str());
5774 noscrub = true;
5775 }
5776 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
5777 ostringstream ss;
5778 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
5779 scrub_messages.push_back(ss.str());
5780 nodeepscrub = true;
5781 }
5782 }
5783 if (noscrub || nodeepscrub) {
5784 string out = "";
5785 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
5786 out += nodeepscrub ? "nodeep-scrub" : "";
5787 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
9f95a23c 5788 "Some pool(s) have the " + out + " flag(s) set", 0);
eafe8130
TL
5789 d.detail.splice(d.detail.end(), scrub_messages);
5790 }
5791
224ce89b
WB
5792 // OSD_OUT_OF_ORDER_FULL
5793 {
5794 // An osd could configure failsafe ratio, to something different
5795 // but for now assume it is the same here.
92f5a8d4 5796 float fsr = cct->_conf->osd_failsafe_full_ratio;
224ce89b
WB
5797 if (fsr > 1.0) fsr /= 100;
5798 float fr = get_full_ratio();
5799 float br = get_backfillfull_ratio();
5800 float nr = get_nearfull_ratio();
5801
5802 list<string> detail;
5803 // These checks correspond to how OSDService::check_full_status() in an OSD
5804 // handles the improper setting of these values.
5805 if (br < nr) {
5806 ostringstream ss;
5807 ss << "backfillfull_ratio (" << br
5808 << ") < nearfull_ratio (" << nr << "), increased";
5809 detail.push_back(ss.str());
5810 br = nr;
5811 }
5812 if (fr < br) {
5813 ostringstream ss;
5814 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
5815 << "), increased";
5816 detail.push_back(ss.str());
5817 fr = br;
5818 }
5819 if (fsr < fr) {
5820 ostringstream ss;
5821 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
5822 << "), increased";
5823 detail.push_back(ss.str());
5824 }
5825 if (!detail.empty()) {
5826 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
9f95a23c 5827 "full ratio(s) out of order", 0);
224ce89b
WB
5828 d.detail.swap(detail);
5829 }
5830 }
5831
5832 // OSD_FULL
5833 // OSD_NEARFULL
5834 // OSD_BACKFILLFULL
5835 // OSD_FAILSAFE_FULL
5836 {
5837 set<int> full, backfillfull, nearfull;
5838 get_full_osd_counts(&full, &backfillfull, &nearfull);
5839 if (full.size()) {
5840 ostringstream ss;
5841 ss << full.size() << " full osd(s)";
9f95a23c 5842 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
224ce89b
WB
5843 for (auto& i: full) {
5844 ostringstream ss;
5845 ss << "osd." << i << " is full";
5846 d.detail.push_back(ss.str());
5847 }
5848 }
5849 if (backfillfull.size()) {
5850 ostringstream ss;
5851 ss << backfillfull.size() << " backfillfull osd(s)";
9f95a23c
TL
5852 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
5853 backfillfull.size());
224ce89b
WB
5854 for (auto& i: backfillfull) {
5855 ostringstream ss;
5856 ss << "osd." << i << " is backfill full";
5857 d.detail.push_back(ss.str());
5858 }
5859 }
5860 if (nearfull.size()) {
5861 ostringstream ss;
5862 ss << nearfull.size() << " nearfull osd(s)";
9f95a23c 5863 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
224ce89b
WB
5864 for (auto& i: nearfull) {
5865 ostringstream ss;
5866 ss << "osd." << i << " is near full";
5867 d.detail.push_back(ss.str());
5868 }
5869 }
5870 }
5871
5872 // OSDMAP_FLAGS
5873 {
5874 // warn about flags
5875 uint64_t warn_flags =
224ce89b
WB
5876 CEPH_OSDMAP_PAUSERD |
5877 CEPH_OSDMAP_PAUSEWR |
5878 CEPH_OSDMAP_PAUSEREC |
5879 CEPH_OSDMAP_NOUP |
5880 CEPH_OSDMAP_NODOWN |
5881 CEPH_OSDMAP_NOIN |
5882 CEPH_OSDMAP_NOOUT |
5883 CEPH_OSDMAP_NOBACKFILL |
5884 CEPH_OSDMAP_NORECOVER |
5885 CEPH_OSDMAP_NOSCRUB |
5886 CEPH_OSDMAP_NODEEP_SCRUB |
5887 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 5888 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
5889 CEPH_OSDMAP_NOREBALANCE;
5890 if (test_flag(warn_flags)) {
5891 ostringstream ss;
9f95a23c
TL
5892 string s = get_flag_string(get_flags() & warn_flags);
5893 ss << s << " flag(s) set";
5894 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
5895 s.size() /* kludgey but sufficient */);
224ce89b
WB
5896 }
5897 }
5898
5899 // OSD_FLAGS
5900 {
5901 list<string> detail;
5902 const unsigned flags =
5903 CEPH_OSD_NOUP |
5904 CEPH_OSD_NOIN |
5905 CEPH_OSD_NODOWN |
5906 CEPH_OSD_NOOUT;
5907 for (int i = 0; i < max_osd; ++i) {
5908 if (osd_state[i] & flags) {
5909 ostringstream ss;
5910 set<string> states;
5911 OSDMap::calc_state_set(osd_state[i] & flags, states);
5912 ss << "osd." << i << " has flags " << states;
5913 detail.push_back(ss.str());
5914 }
5915 }
81eedcae
TL
5916 for (auto& i : crush_node_flags) {
5917 if (i.second && crush->item_exists(i.first)) {
5918 ostringstream ss;
5919 set<string> states;
5920 OSDMap::calc_state_set(i.second, states);
5921 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
5922 const char *tn = crush->get_type_name(t);
5923 ss << (tn ? tn : "node") << " "
5924 << crush->get_item_name(i.first) << " has flags " << states;
5925 detail.push_back(ss.str());
5926 }
5927 }
5928 for (auto& i : device_class_flags) {
5929 const char* class_name = crush->get_class_name(i.first);
5930 if (i.second && class_name) {
5931 ostringstream ss;
5932 set<string> states;
5933 OSDMap::calc_state_set(i.second, states);
5934 ss << "device class '" << class_name << "' has flags " << states;
5935 detail.push_back(ss.str());
5936 }
5937 }
224ce89b
WB
5938 if (!detail.empty()) {
5939 ostringstream ss;
81eedcae 5940 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
9f95a23c 5941 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
224ce89b
WB
5942 d.detail.swap(detail);
5943 }
5944 }
5945
5946 // OLD_CRUSH_TUNABLES
92f5a8d4 5947 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
224ce89b 5948 string min = crush->get_min_required_version();
92f5a8d4 5949 if (min < cct->_conf->mon_crush_min_required_version) {
224ce89b
WB
5950 ostringstream ss;
5951 ss << "crush map has legacy tunables (require " << min
92f5a8d4 5952 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
9f95a23c 5953 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
f67539c2 5954 d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
5955 }
5956 }
5957
5958 // OLD_CRUSH_STRAW_CALC_VERSION
92f5a8d4 5959 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
5960 if (crush->get_straw_calc_version() == 0) {
5961 ostringstream ss;
5962 ss << "crush map has straw_calc_version=0";
9f95a23c 5963 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
224ce89b 5964 d.detail.push_back(
f67539c2 5965 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
224ce89b
WB
5966 }
5967 }
5968
5969 // CACHE_POOL_NO_HIT_SET
92f5a8d4 5970 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b 5971 list<string> detail;
9f95a23c 5972 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
224ce89b
WB
5973 const pg_pool_t& info = p->second;
5974 if (info.cache_mode_requires_hit_set() &&
5975 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
5976 ostringstream ss;
5977 ss << "pool '" << get_pool_name(p->first)
5978 << "' with cache_mode " << info.get_cache_mode_name()
5979 << " needs hit_set_type to be set but it is not";
5980 detail.push_back(ss.str());
5981 }
5982 }
5983 if (!detail.empty()) {
5984 ostringstream ss;
5985 ss << detail.size() << " cache pools are missing hit_sets";
9f95a23c
TL
5986 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
5987 detail.size());
224ce89b
WB
5988 d.detail.swap(detail);
5989 }
5990 }
5991
5992 // OSD_NO_SORTBITWISE
11fdf7f2 5993 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 5994 ostringstream ss;
11fdf7f2 5995 ss << "'sortbitwise' flag is not set";
9f95a23c 5996 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
5997 }
5998
5999 // OSD_UPGRADE_FINISHED
6000 // none of these (yet) since we don't run until luminous upgrade is done.
6001
3efd9988 6002 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 6003 {
3efd9988 6004 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
6005 for (auto it : get_pools()) {
6006 const pg_pool_t &pool = it.second;
3efd9988 6007 const string& pool_name = get_pool_name(it.first);
224ce89b 6008 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 6009 stringstream ss;
11fdf7f2 6010 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
6011 // may run out of space too,
6012 // but we want EQUOTA taking precedence
11fdf7f2 6013 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
6014 } else {
6015 ss << "pool '" << pool_name << "' is full (no space)";
6016 }
6017 full_detail.push_back(ss.str());
6018 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
6019 stringstream ss;
6020 ss << "pool '" << pool_name << "' is backfillfull";
6021 backfillfull_detail.push_back(ss.str());
6022 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
6023 stringstream ss;
6024 ss << "pool '" << pool_name << "' is nearfull";
6025 nearfull_detail.push_back(ss.str());
224ce89b
WB
6026 }
6027 }
3efd9988 6028 if (!full_detail.empty()) {
224ce89b 6029 ostringstream ss;
3efd9988 6030 ss << full_detail.size() << " pool(s) full";
9f95a23c 6031 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
3efd9988
FG
6032 d.detail.swap(full_detail);
6033 }
6034 if (!backfillfull_detail.empty()) {
6035 ostringstream ss;
6036 ss << backfillfull_detail.size() << " pool(s) backfillfull";
9f95a23c
TL
6037 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
6038 backfillfull_detail.size());
3efd9988
FG
6039 d.detail.swap(backfillfull_detail);
6040 }
6041 if (!nearfull_detail.empty()) {
6042 ostringstream ss;
6043 ss << nearfull_detail.size() << " pool(s) nearfull";
9f95a23c
TL
6044 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
6045 nearfull_detail.size());
3efd9988 6046 d.detail.swap(nearfull_detail);
224ce89b
WB
6047 }
6048 }
92f5a8d4
TL
6049
6050 // POOL_PG_NUM_NOT_POWER_OF_TWO
6051 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
6052 list<string> detail;
6053 for (auto it : get_pools()) {
6054 if (!isp2(it.second.get_pg_num_target())) {
6055 ostringstream ss;
6056 ss << "pool '" << get_pool_name(it.first)
6057 << "' pg_num " << it.second.get_pg_num_target()
6058 << " is not a power of two";
6059 detail.push_back(ss.str());
6060 }
6061 }
6062 if (!detail.empty()) {
6063 ostringstream ss;
6064 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
6065 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
9f95a23c
TL
6066 ss.str(), detail.size());
6067 d.detail.swap(detail);
6068 }
6069 }
6070
6071 // POOL_NO_REDUNDANCY
6072 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
6073 {
6074 list<string> detail;
6075 for (auto it : get_pools()) {
6076 if (it.second.get_size() == 1) {
6077 ostringstream ss;
6078 ss << "pool '" << get_pool_name(it.first)
6079 << "' has no replicas configured";
6080 detail.push_back(ss.str());
6081 }
6082 }
6083 if (!detail.empty()) {
6084 ostringstream ss;
6085 ss << detail.size() << " pool(s) have no replicas configured";
6086 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
6087 ss.str(), detail.size());
92f5a8d4
TL
6088 d.detail.swap(detail);
6089 }
6090 }
f67539c2
TL
6091
6092 // DEGRADED STRETCH MODE
6093 if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
6094 if (recovering_stretch_mode) {
6095 stringstream ss;
6096 ss << "We are recovering stretch mode buckets, only requiring "
6097 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6098 checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
6099 ss.str(), 0);
6100 } else if (degraded_stretch_mode) {
6101 stringstream ss;
6102 ss << "We are missing stretch mode buckets, only requiring "
6103 << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
6104 checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
6105 ss.str(), 0);
6106 }
6107 }
224ce89b 6108}
35e4c445
FG
6109
6110int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
6111 ostream *ss) const
6112{
6113 out->clear();
6114 for (auto i = ls.begin(); i != ls.end(); ++i) {
6115 if (i == ls.begin() &&
6116 (*i == "any" || *i == "all" || *i == "*")) {
6117 get_all_osds(*out);
6118 break;
6119 }
9f95a23c 6120 long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
35e4c445
FG
6121 if (osd < 0) {
6122 *ss << "invalid osd id '" << *i << "'";
6123 return -EINVAL;
6124 }
6125 out->insert(osd);
6126 }
6127 return 0;
6128}
11fdf7f2
TL
6129
6130void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
6131 string &subtree,
6132 int limit, // how many
6133 set<int> skip,
6134 set<int> *want) const {
6135 if (limit <= 0)
6136 return;
6137 int subtree_type = crush->get_type_id(subtree);
6138 if (subtree_type < 1)
6139 return;
6140 vector<int> subtrees;
6141 crush->get_subtree_of_type(subtree_type, &subtrees);
6142 std::random_device rd;
6143 std::default_random_engine rng{rd()};
6144 std::shuffle(subtrees.begin(), subtrees.end(), rng);
6145 for (auto s : subtrees) {
6146 if (limit <= 0)
6147 break;
6148 if (crush->subtree_contains(s, n))
6149 continue;
6150 vector<int> osds;
6151 crush->get_children_of_type(s, 0, &osds);
6152 if (osds.empty())
6153 continue;
6154 vector<int> up_osds;
6155 for (auto o : osds) {
6156 if (is_up(o) && !skip.count(o))
6157 up_osds.push_back(o);
6158 }
6159 if (up_osds.empty())
6160 continue;
6161 auto it = up_osds.begin();
6162 std::advance(it, (n % up_osds.size()));
6163 want->insert(*it);
6164 --limit;
6165 }
6166}
6167
6168float OSDMap::pool_raw_used_rate(int64_t poolid) const
6169{
6170 const pg_pool_t *pool = get_pg_pool(poolid);
6171 assert(pool != nullptr);
6172
6173 switch (pool->get_type()) {
6174 case pg_pool_t::TYPE_REPLICATED:
6175 return pool->get_size();
11fdf7f2
TL
6176 case pg_pool_t::TYPE_ERASURE:
6177 {
6178 auto& ecp =
6179 get_erasure_code_profile(pool->erasure_code_profile);
6180 auto pm = ecp.find("m");
6181 auto pk = ecp.find("k");
6182 if (pm != ecp.end() && pk != ecp.end()) {
6183 int k = atoi(pk->second.c_str());
6184 int m = atoi(pm->second.c_str());
6185 int mk = m + k;
6186 ceph_assert(mk != 0);
6187 ceph_assert(k != 0);
6188 return (float)mk / k;
6189 } else {
6190 return 0.0;
6191 }
6192 }
6193 break;
6194 default:
6195 ceph_abort_msg("unrecognized pool type");
6196 }
6197}
81eedcae
TL
6198
6199unsigned OSDMap::get_osd_crush_node_flags(int osd) const
6200{
6201 unsigned flags = 0;
6202 if (!crush_node_flags.empty()) {
6203 // the map will contain type -> name
6204 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
6205 for (auto& i : ploc) {
6206 int id = crush->get_item_id(i.second);
6207 auto p = crush_node_flags.find(id);
6208 if (p != crush_node_flags.end()) {
6209 flags |= p->second;
6210 }
6211 }
6212 }
6213 return flags;
6214}
6215
6216unsigned OSDMap::get_crush_node_flags(int id) const
6217{
6218 unsigned flags = 0;
6219 auto it = crush_node_flags.find(id);
6220 if (it != crush_node_flags.end())
6221 flags = it->second;
6222 return flags;
6223}
6224
6225unsigned OSDMap::get_device_class_flags(int id) const
6226{
6227 unsigned flags = 0;
6228 auto it = device_class_flags.find(id);
6229 if (it != device_class_flags.end())
6230 flags = it->second;
6231 return flags;
6232}