]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2
TL
18#include <algorithm>
19#include <optional>
20#include <random>
21
224ce89b
WB
22#include <boost/algorithm/string.hpp>
23
7c673cae 24#include "OSDMap.h"
7c673cae 25#include "common/config.h"
3efd9988 26#include "common/errno.h"
7c673cae
FG
27#include "common/Formatter.h"
28#include "common/TextTable.h"
29#include "include/ceph_features.h"
9f95a23c 30#include "include/common_fwd.h"
7c673cae
FG
31#include "include/str_map.h"
32
33#include "common/code_environment.h"
224ce89b 34#include "mon/health_check.h"
7c673cae
FG
35
36#include "crush/CrushTreeDumper.h"
37#include "common/Clock.h"
11fdf7f2
TL
38#include "mon/PGMap.h"
39
9f95a23c
TL
40using std::list;
41using std::make_pair;
42using std::map;
43using std::multimap;
44using std::ostream;
45using std::ostringstream;
46using std::pair;
47using std::set;
48using std::string;
49using std::stringstream;
50using std::unordered_map;
51using std::vector;
52
53using ceph::decode;
54using ceph::encode;
55using ceph::Formatter;
56
7c673cae
FG
57#define dout_subsys ceph_subsys_osd
58
59MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
60MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
61
62
63// ----------------------------------
64// osd_info_t
65
66void osd_info_t::dump(Formatter *f) const
67{
68 f->dump_int("last_clean_begin", last_clean_begin);
69 f->dump_int("last_clean_end", last_clean_end);
70 f->dump_int("up_from", up_from);
71 f->dump_int("up_thru", up_thru);
72 f->dump_int("down_at", down_at);
73 f->dump_int("lost_at", lost_at);
74}
75
9f95a23c 76void osd_info_t::encode(ceph::buffer::list& bl) const
7c673cae 77{
11fdf7f2 78 using ceph::encode;
7c673cae 79 __u8 struct_v = 1;
11fdf7f2
TL
80 encode(struct_v, bl);
81 encode(last_clean_begin, bl);
82 encode(last_clean_end, bl);
83 encode(up_from, bl);
84 encode(up_thru, bl);
85 encode(down_at, bl);
86 encode(lost_at, bl);
7c673cae
FG
87}
88
9f95a23c 89void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 90{
11fdf7f2 91 using ceph::decode;
7c673cae 92 __u8 struct_v;
11fdf7f2
TL
93 decode(struct_v, bl);
94 decode(last_clean_begin, bl);
95 decode(last_clean_end, bl);
96 decode(up_from, bl);
97 decode(up_thru, bl);
98 decode(down_at, bl);
99 decode(lost_at, bl);
7c673cae
FG
100}
101
102void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
103{
104 o.push_back(new osd_info_t);
105 o.push_back(new osd_info_t);
106 o.back()->last_clean_begin = 1;
107 o.back()->last_clean_end = 2;
108 o.back()->up_from = 30;
109 o.back()->up_thru = 40;
110 o.back()->down_at = 5;
111 o.back()->lost_at = 6;
112}
113
114ostream& operator<<(ostream& out, const osd_info_t& info)
115{
116 out << "up_from " << info.up_from
117 << " up_thru " << info.up_thru
118 << " down_at " << info.down_at
119 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
120 if (info.lost_at)
121 out << " lost_at " << info.lost_at;
122 return out;
123}
124
125// ----------------------------------
126// osd_xinfo_t
127
128void osd_xinfo_t::dump(Formatter *f) const
129{
130 f->dump_stream("down_stamp") << down_stamp;
131 f->dump_float("laggy_probability", laggy_probability);
132 f->dump_int("laggy_interval", laggy_interval);
133 f->dump_int("features", features);
134 f->dump_unsigned("old_weight", old_weight);
9f95a23c
TL
135 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
136 f->dump_int("dead_epoch", dead_epoch);
7c673cae
FG
137}
138
9f95a23c 139void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
7c673cae 140{
9f95a23c
TL
141 uint8_t v = 4;
142 if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
143 v = 3;
144 }
145 ENCODE_START(v, 1, bl);
11fdf7f2 146 encode(down_stamp, bl);
7c673cae 147 __u32 lp = laggy_probability * 0xfffffffful;
11fdf7f2
TL
148 encode(lp, bl);
149 encode(laggy_interval, bl);
150 encode(features, bl);
151 encode(old_weight, bl);
9f95a23c
TL
152 if (v >= 4) {
153 encode(last_purged_snaps_scrub, bl);
154 encode(dead_epoch, bl);
155 }
7c673cae
FG
156 ENCODE_FINISH(bl);
157}
158
9f95a23c 159void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 160{
9f95a23c 161 DECODE_START(4, bl);
11fdf7f2 162 decode(down_stamp, bl);
7c673cae 163 __u32 lp;
11fdf7f2 164 decode(lp, bl);
7c673cae 165 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 166 decode(laggy_interval, bl);
7c673cae 167 if (struct_v >= 2)
11fdf7f2 168 decode(features, bl);
7c673cae
FG
169 else
170 features = 0;
171 if (struct_v >= 3)
11fdf7f2 172 decode(old_weight, bl);
7c673cae
FG
173 else
174 old_weight = 0;
9f95a23c
TL
175 if (struct_v >= 4) {
176 decode(last_purged_snaps_scrub, bl);
177 decode(dead_epoch, bl);
178 } else {
179 dead_epoch = 0;
180 }
7c673cae
FG
181 DECODE_FINISH(bl);
182}
183
184void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
185{
186 o.push_back(new osd_xinfo_t);
187 o.push_back(new osd_xinfo_t);
188 o.back()->down_stamp = utime_t(2, 3);
189 o.back()->laggy_probability = .123;
190 o.back()->laggy_interval = 123456;
191 o.back()->old_weight = 0x7fff;
192}
193
194ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
195{
196 return out << "down_stamp " << xi.down_stamp
197 << " laggy_probability " << xi.laggy_probability
198 << " laggy_interval " << xi.laggy_interval
9f95a23c
TL
199 << " old_weight " << xi.old_weight
200 << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
201 << " dead_epoch " << xi.dead_epoch;
7c673cae
FG
202}
203
204// ----------------------------------
205// OSDMap::Incremental
206
207int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
208{
209 int n = 0;
210 for (auto &weight : new_weight) {
211 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
212 n++; // marked out
213 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
214 n--; // marked in
215 }
216 return n;
217}
218
219int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
220{
221 int n = 0;
222 for (auto &state : new_state) { //
223 if (state.second & CEPH_OSD_UP) {
224 if (previous->is_up(state.first))
225 n++; // marked down
226 else
227 n--; // marked up
228 }
229 }
230 return n;
231}
232
233int OSDMap::Incremental::identify_osd(uuid_d u) const
234{
235 for (auto &uuid : new_uuid)
236 if (uuid.second == u)
237 return uuid.first;
238 return -1;
239}
240
241int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
242 const OSDMap& osdmap)
243{
11fdf7f2 244 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
245
246 for (auto &new_pool : new_pools) {
247 if (!new_pool.second.tiers.empty()) {
248 pg_pool_t& base = new_pool.second;
249
11fdf7f2
TL
250 auto new_rem_it = new_removed_snaps.find(new_pool.first);
251
7c673cae
FG
252 for (const auto &tier_pool : base.tiers) {
253 const auto &r = new_pools.find(tier_pool);
254 pg_pool_t *tier = 0;
255 if (r == new_pools.end()) {
256 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
257 if (!orig) {
258 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
259 return -EIO;
260 }
261 tier = get_new_pool(tier_pool, orig);
262 } else {
263 tier = &r->second;
264 }
265 if (tier->tier_of != new_pool.first) {
266 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
267 return -EIO;
268 }
269
270 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
271 << tier_pool << dendl;
272 tier->snap_seq = base.snap_seq;
273 tier->snap_epoch = base.snap_epoch;
274 tier->snaps = base.snaps;
275 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
276 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
277 pg_pool_t::FLAG_POOL_SNAPS);
278
279 if (new_rem_it != new_removed_snaps.end()) {
280 new_removed_snaps[tier_pool] = new_rem_it->second;
281 }
7c673cae
FG
282 }
283 }
284 }
285 return 0;
286}
287
28e407b8
AA
288// ----------------------------------
289// OSDMap
7c673cae
FG
290
291bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
292{
293 if (id >= 0)
294 return is_down(id);
295
296 if (down_cache &&
297 down_cache->count(id)) {
298 return true;
299 }
300
301 list<int> children;
302 crush->get_children(id, &children);
303 for (const auto &child : children) {
304 if (!subtree_is_down(child, down_cache)) {
305 return false;
306 }
307 }
308 if (down_cache) {
309 down_cache->insert(id);
310 }
311 return true;
312}
313
314bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
315{
316 // use a stack-local down_cache if we didn't get one from the
317 // caller. then at least this particular call will avoid duplicated
318 // work.
319 set<int> local_down_cache;
320 if (!down_cache) {
321 down_cache = &local_down_cache;
322 }
323
324 int current = id;
325 while (true) {
326 int type;
327 if (current >= 0) {
328 type = 0;
329 } else {
330 type = crush->get_bucket_type(current);
331 }
11fdf7f2 332 ceph_assert(type >= 0);
7c673cae
FG
333
334 if (!subtree_is_down(current, down_cache)) {
335 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
336 return false;
337 }
338
339 // is this a big enough subtree to be marked as down?
340 if (type >= subtree_type) {
341 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
342 return true;
343 }
344
345 int r = crush->get_immediate_parent_id(current, &current);
346 if (r < 0) {
347 return false;
348 }
349 }
350}
351
224ce89b
WB
352bool OSDMap::subtree_type_is_down(
353 CephContext *cct,
354 int id,
355 int subtree_type,
356 set<int> *down_in_osds,
357 set<int> *up_in_osds,
358 set<int> *subtree_up,
359 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
360{
361 if (id >= 0) {
362 bool is_down_ret = is_down(id);
363 if (!is_out(id)) {
364 if (is_down_ret) {
365 down_in_osds->insert(id);
366 } else {
367 up_in_osds->insert(id);
368 }
369 }
370 return is_down_ret;
371 }
372
373 if (subtree_type_down &&
374 (*subtree_type_down)[subtree_type].count(id)) {
375 return true;
376 }
377
378 list<int> children;
379 crush->get_children(id, &children);
380 for (const auto &child : children) {
224ce89b
WB
381 if (!subtree_type_is_down(
382 cct, child, crush->get_bucket_type(child),
383 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
384 subtree_up->insert(id);
385 return false;
386 }
387 }
388 if (subtree_type_down) {
389 (*subtree_type_down)[subtree_type].insert(id);
390 }
391 return true;
392}
393
9f95a23c 394void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
7c673cae 395{
11fdf7f2 396 using ceph::encode;
7c673cae 397 __u16 v = 5;
11fdf7f2
TL
398 encode(v, bl);
399 encode(fsid, bl);
400 encode(epoch, bl);
401 encode(modified, bl);
7c673cae 402 int32_t new_t = new_pool_max;
11fdf7f2
TL
403 encode(new_t, bl);
404 encode(new_flags, bl);
405 encode(fullmap, bl);
406 encode(crush, bl);
7c673cae 407
11fdf7f2
TL
408 encode(new_max_osd, bl);
409 // for encode(new_pools, bl);
7c673cae 410 __u32 n = new_pools.size();
11fdf7f2 411 encode(n, bl);
7c673cae
FG
412 for (const auto &new_pool : new_pools) {
413 n = new_pool.first;
11fdf7f2
TL
414 encode(n, bl);
415 encode(new_pool.second, bl, 0);
7c673cae 416 }
11fdf7f2 417 // for encode(new_pool_names, bl);
7c673cae 418 n = new_pool_names.size();
11fdf7f2 419 encode(n, bl);
7c673cae
FG
420
421 for (const auto &new_pool_name : new_pool_names) {
422 n = new_pool_name.first;
11fdf7f2
TL
423 encode(n, bl);
424 encode(new_pool_name.second, bl);
7c673cae 425 }
11fdf7f2 426 // for encode(old_pools, bl);
7c673cae 427 n = old_pools.size();
11fdf7f2 428 encode(n, bl);
7c673cae
FG
429 for (auto &old_pool : old_pools) {
430 n = old_pool;
11fdf7f2 431 encode(n, bl);
7c673cae 432 }
11fdf7f2 433 encode(new_up_client, bl, 0);
31f18b77
FG
434 {
435 // legacy is map<int32_t,uint8_t>
9f95a23c 436 map<int32_t, uint8_t> os;
31f18b77 437 for (auto p : new_state) {
9f95a23c
TL
438 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
439 // that an old client could not understand.
440 // skip those!
441 uint8_t s = p.second;
442 if (p.second != 0 && s == 0)
443 continue;
444 os[p.first] = s;
445 }
446 uint32_t n = os.size();
447 encode(n, bl);
448 for (auto p : os) {
11fdf7f2 449 encode(p.first, bl);
9f95a23c 450 encode(p.second, bl);
31f18b77
FG
451 }
452 }
11fdf7f2
TL
453 encode(new_weight, bl);
454 // for encode(new_pg_temp, bl);
7c673cae 455 n = new_pg_temp.size();
11fdf7f2 456 encode(n, bl);
7c673cae
FG
457
458 for (const auto &pg_temp : new_pg_temp) {
459 old_pg_t opg = pg_temp.first.get_old_pg();
11fdf7f2
TL
460 encode(opg, bl);
461 encode(pg_temp.second, bl);
7c673cae
FG
462 }
463}
464
9f95a23c 465void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 466{
11fdf7f2 467 using ceph::encode;
7c673cae
FG
468 if ((features & CEPH_FEATURE_PGID64) == 0) {
469 encode_client_old(bl);
470 return;
471 }
472
473 // base
474 __u16 v = 6;
11fdf7f2
TL
475 encode(v, bl);
476 encode(fsid, bl);
477 encode(epoch, bl);
478 encode(modified, bl);
479 encode(new_pool_max, bl);
480 encode(new_flags, bl);
481 encode(fullmap, bl);
482 encode(crush, bl);
483
484 encode(new_max_osd, bl);
485 encode(new_pools, bl, features);
486 encode(new_pool_names, bl);
487 encode(old_pools, bl);
488 encode(new_up_client, bl, features);
31f18b77 489 {
9f95a23c 490 map<int32_t, uint8_t> os;
31f18b77 491 for (auto p : new_state) {
9f95a23c
TL
492 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
493 // that an old client could not understand.
494 // skip those!
495 uint8_t s = p.second;
496 if (p.second != 0 && s == 0)
497 continue;
498 os[p.first] = s;
499 }
500 uint32_t n = os.size();
501 encode(n, bl);
502 for (auto p : os) {
11fdf7f2 503 encode(p.first, bl);
9f95a23c 504 encode(p.second, bl);
31f18b77
FG
505 }
506 }
11fdf7f2
TL
507 encode(new_weight, bl);
508 encode(new_pg_temp, bl);
7c673cae
FG
509
510 // extended
511 __u16 ev = 10;
11fdf7f2
TL
512 encode(ev, bl);
513 encode(new_hb_back_up, bl, features);
514 encode(new_up_thru, bl);
515 encode(new_last_clean_interval, bl);
516 encode(new_lost, bl);
517 encode(new_blacklist, bl, features);
518 encode(old_blacklist, bl, features);
519 encode(new_up_cluster, bl, features);
520 encode(cluster_snapshot, bl);
521 encode(new_uuid, bl);
9f95a23c 522 encode(new_xinfo, bl, features);
11fdf7f2
TL
523 encode(new_hb_front_up, bl, features);
524}
525
526template<class T>
9f95a23c 527static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
528{
529 uint32_t n = m.size();
530 encode(n, bl);
531 for (auto& i : m) {
532 encode(i.first, bl);
533 encode(i.second.legacy_addr(), bl, f);
534 }
535}
536
537template<class T>
9f95a23c 538static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
11fdf7f2
TL
539{
540 uint32_t n = m.size();
541 encode(n, bl);
542 for (auto& i : m) {
543 if (i) {
544 encode(i->legacy_addr(), bl, f);
545 } else {
546 encode(entity_addr_t(), bl, f);
547 }
548 }
7c673cae
FG
549}
550
11fdf7f2
TL
551/* for a description of osdmap incremental versions, and when they were
552 * introduced, please refer to
553 * doc/dev/osd_internals/osdmap_versions.txt
554 */
9f95a23c 555void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 556{
11fdf7f2 557 using ceph::encode;
7c673cae
FG
558 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
559 encode_classic(bl, features);
560 return;
561 }
562
563 // only a select set of callers should *ever* be encoding new
564 // OSDMaps. others should be passing around the canonical encoded
565 // buffers from on high. select out those callers by passing in an
566 // "impossible" feature bit.
11fdf7f2 567 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
568 features &= ~CEPH_FEATURE_RESERVED;
569
570 size_t start_offset = bl.length();
571 size_t tail_offset;
11fdf7f2 572 size_t crc_offset;
9f95a23c 573 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
574
575 // meta-encoding: how we include client-used and osd-specific data
576 ENCODE_START(8, 7, bl);
577
578 {
11fdf7f2 579 uint8_t v = 8;
7c673cae
FG
580 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
581 v = 3;
11fdf7f2
TL
582 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
583 v = 5;
584 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
585 v = 6;
7c673cae
FG
586 }
587 ENCODE_START(v, 1, bl); // client-usable data
11fdf7f2
TL
588 encode(fsid, bl);
589 encode(epoch, bl);
590 encode(modified, bl);
591 encode(new_pool_max, bl);
592 encode(new_flags, bl);
593 encode(fullmap, bl);
594 encode(crush, bl);
595
596 encode(new_max_osd, bl);
597 encode(new_pools, bl, features);
598 encode(new_pool_names, bl);
599 encode(old_pools, bl);
600 if (v >= 7) {
601 encode(new_up_client, bl, features);
602 } else {
603 encode_addrvec_map_as_addr(new_up_client, bl, features);
604 }
31f18b77 605 if (v >= 5) {
11fdf7f2 606 encode(new_state, bl);
31f18b77 607 } else {
9f95a23c 608 map<int32_t, uint8_t> os;
31f18b77 609 for (auto p : new_state) {
9f95a23c
TL
610 // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT)
611 // that an old client could not understand.
612 // skip those!
613 uint8_t s = p.second;
614 if (p.second != 0 && s == 0)
615 continue;
616 os[p.first] = s;
617 }
618 uint32_t n = os.size();
619 encode(n, bl);
620 for (auto p : os) {
621 encode(p.first, bl);
622 encode(p.second, bl);
31f18b77
FG
623 }
624 }
11fdf7f2
TL
625 encode(new_weight, bl);
626 encode(new_pg_temp, bl);
627 encode(new_primary_temp, bl);
628 encode(new_primary_affinity, bl);
629 encode(new_erasure_code_profiles, bl);
630 encode(old_erasure_code_profiles, bl);
7c673cae 631 if (v >= 4) {
11fdf7f2
TL
632 encode(new_pg_upmap, bl);
633 encode(old_pg_upmap, bl);
634 encode(new_pg_upmap_items, bl);
635 encode(old_pg_upmap_items, bl);
636 }
637 if (v >= 6) {
638 encode(new_removed_snaps, bl);
639 encode(new_purged_snaps, bl);
640 }
641 if (v >= 8) {
642 encode(new_last_up_change, bl);
643 encode(new_last_in_change, bl);
7c673cae
FG
644 }
645 ENCODE_FINISH(bl); // client-usable data
646 }
647
648 {
81eedcae 649 uint8_t target_v = 9;
7c673cae
FG
650 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
651 target_v = 2;
11fdf7f2
TL
652 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
653 target_v = 6;
7c673cae
FG
654 }
655 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
656 if (target_v < 7) {
657 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
658 } else {
659 encode(new_hb_back_up, bl, features);
660 }
661 encode(new_up_thru, bl);
662 encode(new_last_clean_interval, bl);
663 encode(new_lost, bl);
664 encode(new_blacklist, bl, features);
665 encode(old_blacklist, bl, features);
666 if (target_v < 7) {
667 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
668 } else {
669 encode(new_up_cluster, bl, features);
670 }
671 encode(cluster_snapshot, bl);
672 encode(new_uuid, bl);
9f95a23c 673 encode(new_xinfo, bl, features);
11fdf7f2
TL
674 if (target_v < 7) {
675 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
676 } else {
677 encode(new_hb_front_up, bl, features);
678 }
679 encode(features, bl); // NOTE: features arg, not the member
7c673cae 680 if (target_v >= 3) {
11fdf7f2
TL
681 encode(new_nearfull_ratio, bl);
682 encode(new_full_ratio, bl);
683 encode(new_backfillfull_ratio, bl);
31f18b77
FG
684 }
685 // 5 was string-based new_require_min_compat_client
686 if (target_v >= 6) {
11fdf7f2
TL
687 encode(new_require_min_compat_client, bl);
688 encode(new_require_osd_release, bl);
7c673cae 689 }
81eedcae
TL
690 if (target_v >= 8) {
691 encode(new_crush_node_flags, bl);
692 }
693 if (target_v >= 9) {
694 encode(new_device_class_flags, bl);
695 }
7c673cae
FG
696 ENCODE_FINISH(bl); // osd-only data
697 }
698
11fdf7f2
TL
699 crc_offset = bl.length();
700 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
701 tail_offset = bl.length();
702
11fdf7f2 703 encode(full_crc, bl);
7c673cae
FG
704
705 ENCODE_FINISH(bl); // meta-encoding wrapper
706
707 // fill in crc
9f95a23c 708 ceph::buffer::list front;
11fdf7f2 709 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae 710 inc_crc = front.crc32c(-1);
9f95a23c 711 ceph::buffer::list tail;
7c673cae
FG
712 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
713 inc_crc = tail.crc32c(inc_crc);
714 ceph_le32 crc_le;
715 crc_le = inc_crc;
11fdf7f2 716 crc_filler->copy_in(4u, (char*)&crc_le);
7c673cae
FG
717 have_crc = true;
718}
719
9f95a23c 720void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
7c673cae 721{
11fdf7f2 722 using ceph::decode;
7c673cae
FG
723 __u32 n, t;
724 // base
725 __u16 v;
11fdf7f2
TL
726 decode(v, p);
727 decode(fsid, p);
728 decode(epoch, p);
729 decode(modified, p);
7c673cae 730 if (v == 4 || v == 5) {
11fdf7f2 731 decode(n, p);
7c673cae
FG
732 new_pool_max = n;
733 } else if (v >= 6)
11fdf7f2
TL
734 decode(new_pool_max, p);
735 decode(new_flags, p);
736 decode(fullmap, p);
737 decode(crush, p);
7c673cae 738
11fdf7f2 739 decode(new_max_osd, p);
7c673cae
FG
740 if (v < 6) {
741 new_pools.clear();
11fdf7f2 742 decode(n, p);
7c673cae 743 while (n--) {
11fdf7f2
TL
744 decode(t, p);
745 decode(new_pools[t], p);
7c673cae
FG
746 }
747 } else {
11fdf7f2 748 decode(new_pools, p);
7c673cae
FG
749 }
750 if (v == 5) {
751 new_pool_names.clear();
11fdf7f2 752 decode(n, p);
7c673cae 753 while (n--) {
11fdf7f2
TL
754 decode(t, p);
755 decode(new_pool_names[t], p);
7c673cae
FG
756 }
757 } else if (v >= 6) {
11fdf7f2 758 decode(new_pool_names, p);
7c673cae
FG
759 }
760 if (v < 6) {
761 old_pools.clear();
11fdf7f2 762 decode(n, p);
7c673cae 763 while (n--) {
11fdf7f2 764 decode(t, p);
7c673cae
FG
765 old_pools.insert(t);
766 }
767 } else {
11fdf7f2 768 decode(old_pools, p);
7c673cae 769 }
11fdf7f2 770 decode(new_up_client, p);
31f18b77
FG
771 {
772 map<int32_t,uint8_t> ns;
11fdf7f2 773 decode(ns, p);
31f18b77
FG
774 for (auto q : ns) {
775 new_state[q.first] = q.second;
776 }
777 }
11fdf7f2 778 decode(new_weight, p);
7c673cae
FG
779
780 if (v < 6) {
781 new_pg_temp.clear();
11fdf7f2 782 decode(n, p);
7c673cae
FG
783 while (n--) {
784 old_pg_t opg;
9f95a23c 785 ceph::decode_raw(opg, p);
11fdf7f2 786 decode(new_pg_temp[pg_t(opg)], p);
7c673cae
FG
787 }
788 } else {
11fdf7f2 789 decode(new_pg_temp, p);
7c673cae
FG
790 }
791
792 // decode short map, too.
793 if (v == 5 && p.end())
794 return;
795
796 // extended
797 __u16 ev = 0;
798 if (v >= 5)
11fdf7f2
TL
799 decode(ev, p);
800 decode(new_hb_back_up, p);
7c673cae 801 if (v < 5)
11fdf7f2
TL
802 decode(new_pool_names, p);
803 decode(new_up_thru, p);
804 decode(new_last_clean_interval, p);
805 decode(new_lost, p);
806 decode(new_blacklist, p);
807 decode(old_blacklist, p);
7c673cae 808 if (ev >= 6)
11fdf7f2 809 decode(new_up_cluster, p);
7c673cae 810 if (ev >= 7)
11fdf7f2 811 decode(cluster_snapshot, p);
7c673cae 812 if (ev >= 8)
11fdf7f2 813 decode(new_uuid, p);
7c673cae 814 if (ev >= 9)
11fdf7f2 815 decode(new_xinfo, p);
7c673cae 816 if (ev >= 10)
11fdf7f2 817 decode(new_hb_front_up, p);
7c673cae
FG
818}
819
11fdf7f2
TL
820/* for a description of osdmap incremental versions, and when they were
821 * introduced, please refer to
822 * doc/dev/osd_internals/osdmap_versions.txt
823 */
9f95a23c 824void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 825{
11fdf7f2 826 using ceph::decode;
7c673cae
FG
827 /**
828 * Older encodings of the Incremental had a single struct_v which
829 * covered the whole encoding, and was prior to our modern
830 * stuff which includes a compatv and a size. So if we see
831 * a struct_v < 7, we must rewind to the beginning and use our
832 * classic decoder.
833 */
834 size_t start_offset = bl.get_off();
835 size_t tail_offset = 0;
9f95a23c 836 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
837
838 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
839 if (struct_v < 7) {
11fdf7f2 840 bl.seek(start_offset);
7c673cae
FG
841 decode_classic(bl);
842 encode_features = 0;
843 if (struct_v >= 6)
844 encode_features = CEPH_FEATURE_PGID64;
845 else
846 encode_features = 0;
847 return;
848 }
849 {
11fdf7f2
TL
850 DECODE_START(8, bl); // client-usable data
851 decode(fsid, bl);
852 decode(epoch, bl);
853 decode(modified, bl);
854 decode(new_pool_max, bl);
855 decode(new_flags, bl);
856 decode(fullmap, bl);
857 decode(crush, bl);
858
859 decode(new_max_osd, bl);
860 decode(new_pools, bl);
861 decode(new_pool_names, bl);
862 decode(old_pools, bl);
863 decode(new_up_client, bl);
31f18b77 864 if (struct_v >= 5) {
11fdf7f2 865 decode(new_state, bl);
31f18b77
FG
866 } else {
867 map<int32_t,uint8_t> ns;
11fdf7f2 868 decode(ns, bl);
31f18b77
FG
869 for (auto q : ns) {
870 new_state[q.first] = q.second;
871 }
872 }
11fdf7f2
TL
873 decode(new_weight, bl);
874 decode(new_pg_temp, bl);
875 decode(new_primary_temp, bl);
7c673cae 876 if (struct_v >= 2)
11fdf7f2 877 decode(new_primary_affinity, bl);
7c673cae
FG
878 else
879 new_primary_affinity.clear();
880 if (struct_v >= 3) {
11fdf7f2
TL
881 decode(new_erasure_code_profiles, bl);
882 decode(old_erasure_code_profiles, bl);
7c673cae
FG
883 } else {
884 new_erasure_code_profiles.clear();
885 old_erasure_code_profiles.clear();
886 }
887 if (struct_v >= 4) {
11fdf7f2
TL
888 decode(new_pg_upmap, bl);
889 decode(old_pg_upmap, bl);
890 decode(new_pg_upmap_items, bl);
891 decode(old_pg_upmap_items, bl);
892 }
893 if (struct_v >= 6) {
894 decode(new_removed_snaps, bl);
895 decode(new_purged_snaps, bl);
896 }
897 if (struct_v >= 8) {
898 decode(new_last_up_change, bl);
899 decode(new_last_in_change, bl);
7c673cae
FG
900 }
901 DECODE_FINISH(bl); // client-usable data
902 }
903
904 {
81eedcae 905 DECODE_START(9, bl); // extended, osd-only data
11fdf7f2
TL
906 decode(new_hb_back_up, bl);
907 decode(new_up_thru, bl);
908 decode(new_last_clean_interval, bl);
909 decode(new_lost, bl);
910 decode(new_blacklist, bl);
911 decode(old_blacklist, bl);
912 decode(new_up_cluster, bl);
913 decode(cluster_snapshot, bl);
914 decode(new_uuid, bl);
915 decode(new_xinfo, bl);
916 decode(new_hb_front_up, bl);
7c673cae 917 if (struct_v >= 2)
11fdf7f2 918 decode(encode_features, bl);
7c673cae
FG
919 else
920 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
921 if (struct_v >= 3) {
11fdf7f2
TL
922 decode(new_nearfull_ratio, bl);
923 decode(new_full_ratio, bl);
7c673cae
FG
924 } else {
925 new_nearfull_ratio = -1;
926 new_full_ratio = -1;
927 }
928 if (struct_v >= 4) {
11fdf7f2 929 decode(new_backfillfull_ratio, bl);
7c673cae
FG
930 } else {
931 new_backfillfull_ratio = -1;
932 }
31f18b77
FG
933 if (struct_v == 5) {
934 string r;
11fdf7f2 935 decode(r, bl);
31f18b77 936 if (r.length()) {
9f95a23c 937 new_require_min_compat_client = ceph_release_from_name(r);
31f18b77
FG
938 }
939 }
940 if (struct_v >= 6) {
11fdf7f2
TL
941 decode(new_require_min_compat_client, bl);
942 decode(new_require_osd_release, bl);
31f18b77
FG
943 } else {
944 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
945 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 946 new_require_osd_release = ceph_release_t::luminous;
31f18b77
FG
947 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
948 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
9f95a23c 949 new_require_osd_release = ceph_release_t::kraken;
31f18b77 950 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
9f95a23c 951 new_require_osd_release = ceph_release_t::jewel;
31f18b77 952 } else {
9f95a23c 953 new_require_osd_release = ceph_release_t::unknown;
31f18b77
FG
954 }
955 }
81eedcae
TL
956 if (struct_v >= 8) {
957 decode(new_crush_node_flags, bl);
958 }
959 if (struct_v >= 9) {
960 decode(new_device_class_flags, bl);
961 }
7c673cae
FG
962 DECODE_FINISH(bl); // osd-only data
963 }
964
965 if (struct_v >= 8) {
966 have_crc = true;
967 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 968 decode(inc_crc, bl);
7c673cae 969 tail_offset = bl.get_off();
11fdf7f2 970 decode(full_crc, bl);
7c673cae
FG
971 } else {
972 have_crc = false;
973 full_crc = 0;
974 inc_crc = 0;
975 }
976
977 DECODE_FINISH(bl); // wrapper
978
979 if (have_crc) {
980 // verify crc
981 uint32_t actual = crc_front.crc32c(-1);
982 if (tail_offset < bl.get_off()) {
9f95a23c 983 ceph::buffer::list tail;
7c673cae
FG
984 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
985 actual = tail.crc32c(actual);
986 }
987 if (inc_crc != actual) {
988 ostringstream ss;
989 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
990 string s = ss.str();
9f95a23c 991 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
992 }
993 }
994}
995
996void OSDMap::Incremental::dump(Formatter *f) const
997{
998 f->dump_int("epoch", epoch);
999 f->dump_stream("fsid") << fsid;
1000 f->dump_stream("modified") << modified;
11fdf7f2
TL
1001 f->dump_stream("new_last_up_change") << new_last_up_change;
1002 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
1003 f->dump_int("new_pool_max", new_pool_max);
1004 f->dump_int("new_flags", new_flags);
1005 f->dump_float("new_full_ratio", new_full_ratio);
1006 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
1007 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
9f95a23c
TL
1008 f->dump_int("new_require_min_compat_client", ceph::to_integer<int>(new_require_min_compat_client));
1009 f->dump_int("new_require_osd_release", ceph::to_integer<int>(new_require_osd_release));
7c673cae
FG
1010
1011 if (fullmap.length()) {
1012 f->open_object_section("full_map");
1013 OSDMap full;
9f95a23c 1014 ceph::buffer::list fbl = fullmap; // kludge around constness.
11fdf7f2 1015 auto p = fbl.cbegin();
7c673cae
FG
1016 full.decode(p);
1017 full.dump(f);
1018 f->close_section();
1019 }
1020 if (crush.length()) {
1021 f->open_object_section("crush");
1022 CrushWrapper c;
9f95a23c 1023 ceph::buffer::list tbl = crush; // kludge around constness.
11fdf7f2 1024 auto p = tbl.cbegin();
7c673cae
FG
1025 c.decode(p);
1026 c.dump(f);
1027 f->close_section();
1028 }
1029
1030 f->dump_int("new_max_osd", new_max_osd);
1031
1032 f->open_array_section("new_pools");
1033
1034 for (const auto &new_pool : new_pools) {
1035 f->open_object_section("pool");
1036 f->dump_int("pool", new_pool.first);
1037 new_pool.second.dump(f);
1038 f->close_section();
1039 }
1040 f->close_section();
1041 f->open_array_section("new_pool_names");
1042
1043 for (const auto &new_pool_name : new_pool_names) {
1044 f->open_object_section("pool_name");
1045 f->dump_int("pool", new_pool_name.first);
1046 f->dump_string("name", new_pool_name.second);
1047 f->close_section();
1048 }
1049 f->close_section();
1050 f->open_array_section("old_pools");
1051
1052 for (const auto &old_pool : old_pools)
1053 f->dump_int("pool", old_pool);
1054 f->close_section();
1055
1056 f->open_array_section("new_up_osds");
1057
1058 for (const auto &upclient : new_up_client) {
1059 f->open_object_section("osd");
1060 f->dump_int("osd", upclient.first);
11fdf7f2
TL
1061 f->dump_stream("public_addr") << upclient.second.legacy_addr();
1062 f->dump_object("public_addrs", upclient.second);
1063 if (auto p = new_up_cluster.find(upclient.first);
1064 p != new_up_cluster.end()) {
1065 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1066 f->dump_object("cluster_addrs", p->second);
1067 }
1068 if (auto p = new_hb_back_up.find(upclient.first);
1069 p != new_hb_back_up.end()) {
1070 f->dump_object("heartbeat_back_addrs", p->second);
1071 }
1072 if (auto p = new_hb_front_up.find(upclient.first);
1073 p != new_hb_front_up.end()) {
1074 f->dump_object("heartbeat_front_addrs", p->second);
1075 }
7c673cae
FG
1076 f->close_section();
1077 }
1078 f->close_section();
1079
1080 f->open_array_section("new_weight");
1081
1082 for (const auto &weight : new_weight) {
1083 f->open_object_section("osd");
1084 f->dump_int("osd", weight.first);
1085 f->dump_int("weight", weight.second);
1086 f->close_section();
1087 }
1088 f->close_section();
1089
1090 f->open_array_section("osd_state_xor");
1091 for (const auto &ns : new_state) {
1092 f->open_object_section("osd");
1093 f->dump_int("osd", ns.first);
1094 set<string> st;
1095 calc_state_set(new_state.find(ns.first)->second, st);
1096 f->open_array_section("state_xor");
1097 for (auto &state : st)
1098 f->dump_string("state", state);
1099 f->close_section();
c07f9fc5 1100 f->close_section();
7c673cae
FG
1101 }
1102 f->close_section();
1103
1104 f->open_array_section("new_pg_temp");
1105
1106 for (const auto &pg_temp : new_pg_temp) {
1107 f->open_object_section("pg");
1108 f->dump_stream("pgid") << pg_temp.first;
1109 f->open_array_section("osds");
1110
1111 for (const auto &osd : pg_temp.second)
1112 f->dump_int("osd", osd);
1113 f->close_section();
1114 f->close_section();
1115 }
1116 f->close_section();
1117
1118 f->open_array_section("primary_temp");
1119
1120 for (const auto &primary_temp : new_primary_temp) {
1121 f->dump_stream("pgid") << primary_temp.first;
1122 f->dump_int("osd", primary_temp.second);
1123 }
1124 f->close_section(); // primary_temp
1125
1126 f->open_array_section("new_pg_upmap");
1127 for (auto& i : new_pg_upmap) {
1128 f->open_object_section("mapping");
1129 f->dump_stream("pgid") << i.first;
1130 f->open_array_section("osds");
1131 for (auto osd : i.second) {
1132 f->dump_int("osd", osd);
1133 }
1134 f->close_section();
1135 f->close_section();
1136 }
1137 f->close_section();
1138 f->open_array_section("old_pg_upmap");
1139 for (auto& i : old_pg_upmap) {
1140 f->dump_stream("pgid") << i;
1141 }
1142 f->close_section();
1143
1144 f->open_array_section("new_pg_upmap_items");
1145 for (auto& i : new_pg_upmap_items) {
1146 f->open_object_section("mapping");
1147 f->dump_stream("pgid") << i.first;
1148 f->open_array_section("mappings");
1149 for (auto& p : i.second) {
1150 f->open_object_section("mapping");
1151 f->dump_int("from", p.first);
1152 f->dump_int("to", p.second);
1153 f->close_section();
1154 }
1155 f->close_section();
1156 f->close_section();
1157 }
1158 f->close_section();
1159 f->open_array_section("old_pg_upmap_items");
1160 for (auto& i : old_pg_upmap_items) {
1161 f->dump_stream("pgid") << i;
1162 }
1163 f->close_section();
1164
1165 f->open_array_section("new_up_thru");
1166
1167 for (const auto &up_thru : new_up_thru) {
1168 f->open_object_section("osd");
1169 f->dump_int("osd", up_thru.first);
1170 f->dump_int("up_thru", up_thru.second);
1171 f->close_section();
1172 }
1173 f->close_section();
1174
1175 f->open_array_section("new_lost");
1176
1177 for (const auto &lost : new_lost) {
1178 f->open_object_section("osd");
1179 f->dump_int("osd", lost.first);
1180 f->dump_int("epoch_lost", lost.second);
1181 f->close_section();
1182 }
1183 f->close_section();
1184
1185 f->open_array_section("new_last_clean_interval");
1186
1187 for (const auto &last_clean_interval : new_last_clean_interval) {
1188 f->open_object_section("osd");
1189 f->dump_int("osd", last_clean_interval.first);
1190 f->dump_int("first", last_clean_interval.second.first);
1191 f->dump_int("last", last_clean_interval.second.second);
1192 f->close_section();
1193 }
1194 f->close_section();
1195
1196 f->open_array_section("new_blacklist");
1197 for (const auto &blist : new_blacklist) {
1198 stringstream ss;
1199 ss << blist.first;
1200 f->dump_stream(ss.str().c_str()) << blist.second;
1201 }
1202 f->close_section();
1203 f->open_array_section("old_blacklist");
1204 for (const auto &blist : old_blacklist)
1205 f->dump_stream("addr") << blist;
1206 f->close_section();
1207
1208 f->open_array_section("new_xinfo");
1209 for (const auto &xinfo : new_xinfo) {
1210 f->open_object_section("xinfo");
1211 f->dump_int("osd", xinfo.first);
1212 xinfo.second.dump(f);
1213 f->close_section();
1214 }
1215 f->close_section();
1216
1217 if (cluster_snapshot.size())
1218 f->dump_string("cluster_snapshot", cluster_snapshot);
1219
1220 f->open_array_section("new_uuid");
1221 for (const auto &uuid : new_uuid) {
1222 f->open_object_section("osd");
1223 f->dump_int("osd", uuid.first);
1224 f->dump_stream("uuid") << uuid.second;
1225 f->close_section();
1226 }
1227 f->close_section();
1228
1229 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1230 f->open_array_section("old_erasure_code_profiles");
1231 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
9f95a23c 1232 f->dump_string("old", erasure_code_profile);
7c673cae
FG
1233 }
1234 f->close_section();
11fdf7f2
TL
1235
1236 f->open_array_section("new_removed_snaps");
1237 for (auto& p : new_removed_snaps) {
1238 f->open_object_section("pool");
1239 f->dump_int("pool", p.first);
1240 f->open_array_section("snaps");
1241 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1242 f->open_object_section("interval");
1243 f->dump_unsigned("begin", q.get_start());
1244 f->dump_unsigned("length", q.get_len());
1245 f->close_section();
1246 }
1247 f->close_section();
1248 f->close_section();
1249 }
1250 f->close_section();
1251 f->open_array_section("new_purged_snaps");
1252 for (auto& p : new_purged_snaps) {
1253 f->open_object_section("pool");
1254 f->dump_int("pool", p.first);
1255 f->open_array_section("snaps");
1256 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1257 f->open_object_section("interval");
1258 f->dump_unsigned("begin", q.get_start());
1259 f->dump_unsigned("length", q.get_len());
1260 f->close_section();
1261 }
1262 f->close_section();
1263 f->close_section();
1264 }
81eedcae
TL
1265 f->open_array_section("new_crush_node_flags");
1266 for (auto& i : new_crush_node_flags) {
1267 f->open_object_section("node");
1268 f->dump_int("id", i.first);
1269 set<string> st;
1270 calc_state_set(i.second, st);
1271 for (auto& j : st) {
1272 f->dump_string("flag", j);
1273 }
1274 f->close_section();
1275 }
1276 f->close_section();
1277 f->open_array_section("new_device_class_flags");
1278 for (auto& i : new_device_class_flags) {
1279 f->open_object_section("device_class");
1280 f->dump_int("id", i.first);
1281 set<string> st;
1282 calc_state_set(i.second, st);
1283 for (auto& j : st) {
1284 f->dump_string("flag", j);
1285 }
1286 f->close_section();
1287 }
1288 f->close_section();
11fdf7f2 1289 f->close_section();
7c673cae
FG
1290}
1291
1292void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1293{
1294 o.push_back(new Incremental);
1295}
1296
1297// ----------------------------------
1298// OSDMap
1299
1300void OSDMap::set_epoch(epoch_t e)
1301{
1302 epoch = e;
1303 for (auto &pool : pools)
1304 pool.second.last_change = e;
1305}
1306
11fdf7f2 1307bool OSDMap::is_blacklisted(const entity_addr_t& orig) const
7c673cae 1308{
11fdf7f2 1309 if (blacklist.empty()) {
7c673cae 1310 return false;
11fdf7f2
TL
1311 }
1312
1313 // all blacklist entries are type ANY for nautilus+
1314 // FIXME: avoid this copy!
1315 entity_addr_t a = orig;
9f95a23c 1316 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1317 a.set_type(entity_addr_t::TYPE_LEGACY);
1318 } else {
1319 a.set_type(entity_addr_t::TYPE_ANY);
1320 }
7c673cae
FG
1321
1322 // this specific instance?
11fdf7f2 1323 if (blacklist.count(a)) {
7c673cae 1324 return true;
11fdf7f2 1325 }
7c673cae
FG
1326
1327 // is entire ip blacklisted?
1328 if (a.is_ip()) {
11fdf7f2
TL
1329 a.set_port(0);
1330 a.set_nonce(0);
1331 if (blacklist.count(a)) {
1332 return true;
1333 }
1334 }
1335
1336 return false;
1337}
1338
1339bool OSDMap::is_blacklisted(const entity_addrvec_t& av) const
1340{
1341 if (blacklist.empty())
1342 return false;
1343
1344 for (auto& a : av.v) {
1345 if (is_blacklisted(a)) {
7c673cae
FG
1346 return true;
1347 }
1348 }
1349
1350 return false;
1351}
1352
1353void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1354{
1355 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1356}
1357
31f18b77
FG
1358void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1359{
1360 for (const auto &i : blacklist) {
1361 bl->insert(i.first);
1362 }
1363}
1364
7c673cae
FG
1365void OSDMap::set_max_osd(int m)
1366{
1367 int o = max_osd;
1368 max_osd = m;
1369 osd_state.resize(m);
1370 osd_weight.resize(m);
1371 for (; o<max_osd; o++) {
1372 osd_state[o] = 0;
1373 osd_weight[o] = CEPH_OSD_OUT;
1374 }
1375 osd_info.resize(m);
1376 osd_xinfo.resize(m);
11fdf7f2
TL
1377 osd_addrs->client_addrs.resize(m);
1378 osd_addrs->cluster_addrs.resize(m);
1379 osd_addrs->hb_back_addrs.resize(m);
1380 osd_addrs->hb_front_addrs.resize(m);
7c673cae
FG
1381 osd_uuid->resize(m);
1382 if (osd_primary_affinity)
1383 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1384
1385 calc_num_osds();
1386}
1387
1388int OSDMap::calc_num_osds()
1389{
1390 num_osd = 0;
1391 num_up_osd = 0;
1392 num_in_osd = 0;
1393 for (int i=0; i<max_osd; i++) {
1394 if (osd_state[i] & CEPH_OSD_EXISTS) {
1395 ++num_osd;
1396 if (osd_state[i] & CEPH_OSD_UP) {
1397 ++num_up_osd;
1398 }
1399 if (get_weight(i) != CEPH_OSD_OUT) {
1400 ++num_in_osd;
1401 }
1402 }
1403 }
1404 return num_osd;
1405}
1406
3efd9988
FG
1407void OSDMap::get_full_pools(CephContext *cct,
1408 set<int64_t> *full,
1409 set<int64_t> *backfillfull,
1410 set<int64_t> *nearfull) const
7c673cae 1411{
11fdf7f2
TL
1412 ceph_assert(full);
1413 ceph_assert(backfillfull);
1414 ceph_assert(nearfull);
3efd9988
FG
1415 full->clear();
1416 backfillfull->clear();
1417 nearfull->clear();
1418
1419 vector<int> full_osds;
1420 vector<int> backfillfull_osds;
1421 vector<int> nearfull_osds;
7c673cae
FG
1422 for (int i = 0; i < max_osd; ++i) {
1423 if (exists(i) && is_up(i) && is_in(i)) {
1424 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1425 full_osds.push_back(i);
7c673cae 1426 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1427 backfillfull_osds.push_back(i);
7c673cae 1428 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1429 nearfull_osds.push_back(i);
7c673cae
FG
1430 }
1431 }
3efd9988
FG
1432
1433 for (auto i: full_osds) {
1434 get_pool_ids_by_osd(cct, i, full);
1435 }
1436 for (auto i: backfillfull_osds) {
1437 get_pool_ids_by_osd(cct, i, backfillfull);
1438 }
1439 for (auto i: nearfull_osds) {
1440 get_pool_ids_by_osd(cct, i, nearfull);
1441 }
7c673cae
FG
1442}
1443
31f18b77
FG
1444void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1445 set<int> *nearfull) const
1446{
1447 full->clear();
1448 backfill->clear();
1449 nearfull->clear();
1450 for (int i = 0; i < max_osd; ++i) {
1451 if (exists(i) && is_up(i) && is_in(i)) {
1452 if (osd_state[i] & CEPH_OSD_FULL)
1453 full->emplace(i);
1454 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1455 backfill->emplace(i);
1456 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1457 nearfull->emplace(i);
1458 }
1459 }
1460}
1461
7c673cae
FG
1462void OSDMap::get_all_osds(set<int32_t>& ls) const
1463{
1464 for (int i=0; i<max_osd; i++)
1465 if (exists(i))
1466 ls.insert(i);
1467}
1468
1469void OSDMap::get_up_osds(set<int32_t>& ls) const
1470{
1471 for (int i = 0; i < max_osd; i++) {
1472 if (is_up(i))
1473 ls.insert(i);
1474 }
1475}
1476
81eedcae 1477void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1478{
1479 for (int i = 0; i < max_osd; i++) {
81eedcae 1480 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1481 ls.insert(i);
1482 }
1483}
1484
11fdf7f2
TL
1485void OSDMap::get_flag_set(set<string> *flagset) const
1486{
1487 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1488 if (flags & (1<<i)) {
1489 flagset->insert(get_flag_string(flags & (1<<i)));
1490 }
1491 }
1492}
1493
7c673cae
FG
1494void OSDMap::calc_state_set(int state, set<string>& st)
1495{
1496 unsigned t = state;
1497 for (unsigned s = 1; t; s <<= 1) {
1498 if (t & s) {
1499 t &= ~s;
1500 st.insert(ceph_osd_state_name(s));
1501 }
1502 }
1503}
1504
1505void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1506{
1507 float max = 0;
1508 for (const auto &weight : weights) {
1509 if (weight.second > max)
1510 max = weight.second;
1511 }
1512
1513 for (const auto &weight : weights) {
1514 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1515 }
1516}
1517
1518int OSDMap::identify_osd(const entity_addr_t& addr) const
1519{
1520 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1521 if (exists(i) && (get_addrs(i).contains(addr) ||
1522 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1523 return i;
1524 return -1;
1525}
1526
1527int OSDMap::identify_osd(const uuid_d& u) const
1528{
1529 for (int i=0; i<max_osd; i++)
1530 if (exists(i) && get_uuid(i) == u)
1531 return i;
1532 return -1;
1533}
1534
1535int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1536{
1537 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1538 if (exists(i) && (get_addrs(i).contains(addr) ||
1539 get_cluster_addrs(i).contains(addr) ||
1540 get_hb_back_addrs(i).contains(addr) ||
1541 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1542 return i;
1543 return -1;
1544}
1545
1546int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1547{
1548 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1549 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1550 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1551 return i;
1552 return -1;
1553}
1554
1555
1556uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1557{
1558 uint64_t features = 0; // things we actually have
1559 uint64_t mask = 0; // things we could have
1560
1561 if (crush->has_nondefault_tunables())
1562 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1563 if (crush->has_nondefault_tunables2())
1564 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1565 if (crush->has_nondefault_tunables3())
1566 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1567 if (crush->has_v4_buckets())
1568 features |= CEPH_FEATURE_CRUSH_V4;
1569 if (crush->has_nondefault_tunables5())
1570 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1571 if (crush->has_incompat_choose_args()) {
1572 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1573 }
7c673cae
FG
1574 mask |= CEPH_FEATURES_CRUSH;
1575
1576 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1577 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1578 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1579
1580 for (auto &pool: pools) {
1581 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1582 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1583 }
7c673cae
FG
1584 if (!pool.second.tiers.empty() ||
1585 pool.second.is_tier()) {
1586 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1587 }
31f18b77 1588 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
7c673cae
FG
1589 pool.second.get_type(),
1590 pool.second.get_size());
1591 if (ruleid >= 0) {
1592 if (crush->is_v2_rule(ruleid))
1593 features |= CEPH_FEATURE_CRUSH_V2;
1594 if (crush->is_v3_rule(ruleid))
1595 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1596 if (crush->is_v5_rule(ruleid))
1597 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1598 }
1599 }
7c673cae 1600 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1601
1602 if (osd_primary_affinity) {
1603 for (int i = 0; i < max_osd; ++i) {
1604 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1605 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1606 break;
1607 }
1608 }
1609 }
1610 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1611
1612 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1613 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
9f95a23c 1614 if (require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
1615 features |= jewel_features;
1616 }
1617 mask |= jewel_features;
1618
1619 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1620 | CEPH_FEATURE_MSG_ADDR2;
9f95a23c 1621 if (require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
1622 features |= kraken_features;
1623 }
1624 mask |= kraken_features;
1625 }
1626
9f95a23c 1627 if (require_min_compat_client >= ceph_release_t::nautilus) {
11fdf7f2
TL
1628 // if min_compat_client is >= nautilus, require v2 cephx signatures
1629 // from everyone
1630 features |= CEPH_FEATUREMASK_CEPHX_V2;
9f95a23c 1631 } else if (require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
1632 entity_type == CEPH_ENTITY_TYPE_OSD) {
1633 // if osds are >= nautilus, at least require the signatures from them
1634 features |= CEPH_FEATUREMASK_CEPHX_V2;
1635 }
1636 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1637
7c673cae
FG
1638 if (pmask)
1639 *pmask = mask;
1640 return features;
1641}
1642
9f95a23c 1643ceph_release_t OSDMap::get_min_compat_client() const
7c673cae
FG
1644{
1645 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1646
1647 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77 1648 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
9f95a23c 1649 return ceph_release_t::luminous; // v12.2.0
7c673cae
FG
1650 }
1651 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
9f95a23c 1652 return ceph_release_t::jewel; // v10.2.0
7c673cae
FG
1653 }
1654 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
9f95a23c 1655 return ceph_release_t::hammer; // v0.94.0
7c673cae
FG
1656 }
1657 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1658 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1659 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
9f95a23c 1660 return ceph_release_t::firefly; // v0.80.0
7c673cae
FG
1661 }
1662 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1663 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
9f95a23c 1664 return ceph_release_t::dumpling; // v0.67.0
7c673cae
FG
1665 }
1666 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
9f95a23c 1667 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae 1668 }
9f95a23c 1669 return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
7c673cae
FG
1670}
1671
9f95a23c 1672ceph_release_t OSDMap::get_require_min_compat_client() const
11fdf7f2
TL
1673{
1674 return require_min_compat_client;
1675}
1676
7c673cae
FG
1677void OSDMap::_calc_up_osd_features()
1678{
1679 bool first = true;
1680 cached_up_osd_features = 0;
1681 for (int osd = 0; osd < max_osd; ++osd) {
1682 if (!is_up(osd))
1683 continue;
1684 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1685 if (xi.features == 0)
1686 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1687 if (first) {
1688 cached_up_osd_features = xi.features;
1689 first = false;
1690 } else {
1691 cached_up_osd_features &= xi.features;
1692 }
1693 }
1694}
1695
1696uint64_t OSDMap::get_up_osd_features() const
1697{
1698 return cached_up_osd_features;
1699}
1700
1701void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1702{
11fdf7f2 1703 using ceph::encode;
7c673cae
FG
1704 if (o->epoch == n->epoch)
1705 return;
1706
1707 int diff = 0;
1708
1709 // do addrs match?
1710 if (o->max_osd != n->max_osd)
1711 diff++;
1712 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1713 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1714 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1715 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1716 else
1717 diff++;
11fdf7f2
TL
1718 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1719 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1720 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1721 else
1722 diff++;
11fdf7f2
TL
1723 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1724 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1725 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1726 else
1727 diff++;
11fdf7f2
TL
1728 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1729 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1730 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1731 else
1732 diff++;
1733 }
1734 if (diff == 0) {
1735 // zoinks, no differences at all!
1736 n->osd_addrs = o->osd_addrs;
1737 }
1738
1739 // does crush match?
9f95a23c 1740 ceph::buffer::list oc, nc;
11fdf7f2
TL
1741 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1742 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1743 if (oc.contents_equal(nc)) {
1744 n->crush = o->crush;
1745 }
1746
1747 // does pg_temp match?
31f18b77
FG
1748 if (*o->pg_temp == *n->pg_temp)
1749 n->pg_temp = o->pg_temp;
7c673cae
FG
1750
1751 // does primary_temp match?
1752 if (o->primary_temp->size() == n->primary_temp->size()) {
1753 if (*o->primary_temp == *n->primary_temp)
1754 n->primary_temp = o->primary_temp;
1755 }
1756
1757 // do uuids match?
1758 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1759 *o->osd_uuid == *n->osd_uuid)
1760 n->osd_uuid = o->osd_uuid;
1761}
1762
1763void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1764 const OSDMap& oldmap,
1765 const OSDMap& nextmap,
1766 Incremental *pending_inc)
7c673cae
FG
1767{
1768 ldout(cct, 10) << __func__ << dendl;
7c673cae 1769
11fdf7f2 1770 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1771 // if pool does not exist, remove any existing pg_temps associated with
1772 // it. we don't care about pg_temps on the pending_inc either; if there
1773 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1774 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1775 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1776 << " for nonexistent pool " << pg.first.pool() << dendl;
1777 pending_inc->new_pg_temp[pg.first].clear();
1778 continue;
1779 }
1780 // all osds down?
1781 unsigned num_up = 0;
1782 for (auto o : pg.second) {
11fdf7f2 1783 if (!nextmap.is_down(o)) {
7c673cae
FG
1784 ++num_up;
1785 break;
1786 }
1787 }
1788 if (num_up == 0) {
1789 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1790 << " with all down osds" << pg.second << dendl;
1791 pending_inc->new_pg_temp[pg.first].clear();
1792 continue;
1793 }
1794 // redundant pg_temp?
1795 vector<int> raw_up;
1796 int primary;
11fdf7f2 1797 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1798 bool remove = false;
11fdf7f2 1799 if (raw_up == pg.second) {
7c673cae
FG
1800 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1801 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1802 remove = true;
1803 }
1804 // oversized pg_temp?
11fdf7f2 1805 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1806 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1807 << pg.second << " exceeds pool size" << dendl;
1808 remove = true;
1809 }
1810 if (remove) {
11fdf7f2 1811 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1812 pending_inc->new_pg_temp[pg.first].clear();
1813 else
1814 pending_inc->new_pg_temp.erase(pg.first);
1815 }
1816 }
1817
11fdf7f2 1818 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1819 // primary down?
11fdf7f2 1820 if (nextmap.is_down(pg.second)) {
7c673cae
FG
1821 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1822 << " to down " << pg.second << dendl;
1823 pending_inc->new_primary_temp[pg.first] = -1;
1824 continue;
1825 }
1826 // redundant primary_temp?
1827 vector<int> real_up, templess_up;
1828 int real_primary, templess_primary;
1829 pg_t pgid = pg.first;
11fdf7f2
TL
1830 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1831 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
1832 if (real_primary == templess_primary){
1833 ldout(cct, 10) << __func__ << " removing primary_temp "
1834 << pgid << " -> " << real_primary
1835 << " (unnecessary/redundant)" << dendl;
11fdf7f2 1836 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
1837 pending_inc->new_primary_temp[pgid] = -1;
1838 else
1839 pending_inc->new_primary_temp.erase(pgid);
1840 }
1841 }
1842}
1843
494da23a 1844void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 1845{
494da23a
TL
1846 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1847 for (auto& p : pg_upmap)
1848 upmap_pgs->push_back(p.first);
1849 for (auto& p : pg_upmap_items)
1850 upmap_pgs->push_back(p.first);
1851}
94b18763 1852
494da23a
TL
1853bool OSDMap::check_pg_upmaps(
1854 CephContext *cct,
1855 const vector<pg_t>& to_check,
1856 vector<pg_t> *to_cancel,
1857 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
1858{
1859 bool any_change = false;
1860 map<int, map<int, float>> rule_weight_map;
28e407b8 1861 for (auto& pg : to_check) {
494da23a 1862 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
1863 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
1864 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
1865 << dendl;
494da23a 1866 to_cancel->push_back(pg);
11fdf7f2
TL
1867 continue;
1868 }
1869 if (pi->is_pending_merge(pg, nullptr)) {
1870 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
1871 << dendl;
494da23a 1872 to_cancel->push_back(pg);
94b18763
FG
1873 continue;
1874 }
494da23a
TL
1875 vector<int> raw, up;
1876 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
1877 auto crush_rule = get_pg_pool_crush_rule(pg);
1878 auto r = crush->verify_upmap(cct,
1879 crush_rule,
1880 get_pg_pool_size(pg),
1881 up);
a8e16298
TL
1882 if (r < 0) {
1883 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
1884 << " returning " << r
1885 << dendl;
494da23a 1886 to_cancel->push_back(pg);
a8e16298
TL
1887 continue;
1888 }
1889 // below we check against crush-topology changing..
28e407b8
AA
1890 map<int, float> weight_map;
1891 auto it = rule_weight_map.find(crush_rule);
1892 if (it == rule_weight_map.end()) {
494da23a 1893 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
1894 if (r < 0) {
1895 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
1896 << "crush_rule " << crush_rule
1897 << dendl;
28e407b8
AA
1898 continue;
1899 }
1900 rule_weight_map[crush_rule] = weight_map;
1901 } else {
1902 weight_map = it->second;
1903 }
28e407b8 1904 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 1905 << " weight_map " << weight_map
94b18763 1906 << dendl;
a8e16298 1907 for (auto osd : up) {
28e407b8
AA
1908 auto it = weight_map.find(osd);
1909 if (it == weight_map.end()) {
92f5a8d4
TL
1910 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
1911 << "been moved out of the specific crush-tree"
1912 << dendl;
494da23a 1913 to_cancel->push_back(pg);
94b18763
FG
1914 break;
1915 }
494da23a 1916 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8 1917 if (adjusted_weight == 0) {
92f5a8d4
TL
1918 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
1919 << " is out/crush-out"
1920 << dendl;
494da23a 1921 to_cancel->push_back(pg);
94b18763
FG
1922 break;
1923 }
1924 }
eafe8130
TL
1925 if (!to_cancel->empty() && to_cancel->back() == pg)
1926 continue;
1927 // okay, upmap is valid
1928 // continue to check if it is still necessary
1929 auto i = pg_upmap.find(pg);
1930 if (i != pg_upmap.end() && raw == i->second) {
1931 ldout(cct, 10) << " removing redundant pg_upmap "
1932 << i->first << " " << i->second
1933 << dendl;
1934 to_cancel->push_back(pg);
1935 continue;
1936 }
1937 auto j = pg_upmap_items.find(pg);
1938 if (j != pg_upmap_items.end()) {
1939 mempool::osdmap::vector<pair<int,int>> newmap;
1940 for (auto& p : j->second) {
1941 if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
1942 // cancel mapping if source osd does not exist anymore
1943 continue;
1944 }
1945 if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
1946 p.second >= 0 && osd_weight[p.second] == 0) {
1947 // cancel mapping if target osd is out
1948 continue;
1949 }
1950 newmap.push_back(p);
1951 }
1952 if (newmap.empty()) {
1953 ldout(cct, 10) << " removing no-op pg_upmap_items "
1954 << j->first << " " << j->second
1955 << dendl;
1956 to_cancel->push_back(pg);
1957 } else if (newmap != j->second) {
1958 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
1959 << j->first << " " << j->second
1960 << " -> " << newmap
1961 << dendl;
1962 to_remap->insert({pg, newmap});
1963 any_change = true;
1964 }
1965 }
28e407b8 1966 }
494da23a
TL
1967 any_change = any_change || !to_cancel->empty();
1968 return any_change;
1969}
1970
1971void OSDMap::clean_pg_upmaps(
1972 CephContext *cct,
1973 Incremental *pending_inc,
1974 const vector<pg_t>& to_cancel,
1975 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
1976{
28e407b8 1977 for (auto &pg: to_cancel) {
494da23a
TL
1978 auto i = pending_inc->new_pg_upmap.find(pg);
1979 if (i != pending_inc->new_pg_upmap.end()) {
1980 ldout(cct, 10) << __func__ << " cancel invalid pending "
1981 << "pg_upmap entry "
1982 << i->first << "->" << i->second
1983 << dendl;
1984 pending_inc->new_pg_upmap.erase(i);
94b18763 1985 }
494da23a
TL
1986 auto j = pg_upmap.find(pg);
1987 if (j != pg_upmap.end()) {
1988 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
1989 << j->first << "->" << j->second
1990 << dendl;
1991 pending_inc->old_pg_upmap.insert(pg);
1992 }
1993 auto p = pending_inc->new_pg_upmap_items.find(pg);
1994 if (p != pending_inc->new_pg_upmap_items.end()) {
1995 ldout(cct, 10) << __func__ << " cancel invalid pending "
1996 << "pg_upmap_items entry "
1997 << p->first << "->" << p->second
1998 << dendl;
1999 pending_inc->new_pg_upmap_items.erase(p);
2000 }
2001 auto q = pg_upmap_items.find(pg);
2002 if (q != pg_upmap_items.end()) {
2003 ldout(cct, 10) << __func__ << " cancel invalid "
2004 << "pg_upmap_items entry "
2005 << q->first << "->" << q->second
2006 << dendl;
2007 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
2008 }
2009 }
494da23a
TL
2010 for (auto& i : to_remap)
2011 pending_inc->new_pg_upmap_items[i.first] = i.second;
2012}
2013
2014bool OSDMap::clean_pg_upmaps(
2015 CephContext *cct,
2016 Incremental *pending_inc) const
2017{
2018 ldout(cct, 10) << __func__ << dendl;
2019 vector<pg_t> to_check;
2020 vector<pg_t> to_cancel;
2021 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
2022
2023 get_upmap_pgs(&to_check);
2024 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
2025 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
2026 return any_change;
94b18763
FG
2027}
2028
7c673cae
FG
// Apply one incremental update to this map, in place.
//
// inc.fsid is adopted when inc.epoch is 1; for any other epoch a differing
// fsid makes the call return -EINVAL before anything but
// new_blacklist_entries has been touched. inc.epoch must be exactly
// epoch + 1 (asserted). When inc.fullmap is non-empty the whole map is
// decoded from it and the rest of inc is ignored; otherwise every populated
// field of inc is merged below, with the crush map deliberately applied last.
//
// Returns 0 on success, -EINVAL on fsid mismatch.
2029int OSDMap::apply_incremental(const Incremental &inc)
2030{
2031 new_blacklist_entries = false;
2032 if (inc.epoch == 1)
2033 fsid = inc.fsid;
2034 else if (inc.fsid != fsid)
2035 return -EINVAL;
2036
11fdf7f2 2037 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
2038
2039 epoch++;
2040 modified = inc.modified;
2041
2042 // full map?
2043 if (inc.fullmap.length()) {
9f95a23c 2044 ceph::buffer::list bl(inc.fullmap);
7c673cae
FG
2045 decode(bl);
2046 return 0;
2047 }
2048
2049 // nope, incremental.
31f18b77 2050 if (inc.new_flags >= 0) {
7c673cae 2051 flags = inc.new_flags;
31f18b77
FG
2052 // the below is just to cover a newly-upgraded luminous mon
2053 // cluster that has to set require_jewel_osds or
2054 // require_kraken_osds before the osds can be upgraded to
2055 // luminous.
2056 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c
TL
2057 if (require_osd_release < ceph_release_t::kraken) {
2058 require_osd_release = ceph_release_t::kraken;
31f18b77
FG
2059 }
2060 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c
TL
2061 if (require_osd_release < ceph_release_t::jewel) {
2062 require_osd_release = ceph_release_t::jewel;
31f18b77
FG
2063 }
2064 }
2065 }
7c673cae
FG
2066
2067 if (inc.new_max_osd >= 0)
2068 set_max_osd(inc.new_max_osd);
2069
2070 if (inc.new_pool_max != -1)
2071 pool_max = inc.new_pool_max;
2072
// new or changed pools are stamped with the epoch being applied
2073 for (const auto &pool : inc.new_pools) {
2074 pools[pool.first] = pool.second;
2075 pools[pool.first].last_change = epoch;
2076 }
2077
// new_removed_snaps / new_purged_snaps are replaced wholesale by this
// incremental's values; removed_snaps_queue accumulates removals and has
// purged intervals subtracted (a purge for a pool with no queue entry
// trips the assert below).
11fdf7f2
TL
2078 new_removed_snaps = inc.new_removed_snaps;
2079 new_purged_snaps = inc.new_purged_snaps;
2080 for (auto p = new_removed_snaps.begin();
2081 p != new_removed_snaps.end();
2082 ++p) {
2083 removed_snaps_queue[p->first].union_of(p->second);
2084 }
2085 for (auto p = new_purged_snaps.begin();
2086 p != new_purged_snaps.end();
2087 ++p) {
2088 auto q = removed_snaps_queue.find(p->first);
2089 ceph_assert(q != removed_snaps_queue.end());
2090 q->second.subtract(p->second);
2091 if (q->second.empty()) {
2092 removed_snaps_queue.erase(q);
2093 }
2094 }
2095
// a default-constructed utime_t means "not set in this incremental"
2096 if (inc.new_last_up_change != utime_t()) {
2097 last_up_change = inc.new_last_up_change;
2098 }
2099 if (inc.new_last_in_change != utime_t()) {
2100 last_in_change = inc.new_last_in_change;
2101 }
2102
// keep pool_name (id -> name) and name_pool (name -> id) in sync; on a
// rename the old name's reverse entry is dropped first
7c673cae
FG
2103 for (const auto &pname : inc.new_pool_names) {
2104 auto pool_name_entry = pool_name.find(pname.first);
2105 if (pool_name_entry != pool_name.end()) {
2106 name_pool.erase(pool_name_entry->second);
2107 pool_name_entry->second = pname.second;
2108 } else {
2109 pool_name[pname.first] = pname.second;
2110 }
2111 name_pool[pname.second] = pname.first;
2112 }
2113
// NOTE(review): pool_name[pool] uses operator[]; presumably this inserts an
// empty name when the id is unknown, which the following erase removes again
// -- confirm against the container type.
2114 for (const auto &pool : inc.old_pools) {
2115 pools.erase(pool);
2116 name_pool.erase(pool_name[pool]);
2117 pool_name.erase(pool);
2118 }
2119
2120 for (const auto &weight : inc.new_weight) {
2121 set_weight(weight.first, weight.second);
2122
2123 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2124 // xinfo old_weight.
2125 if (weight.second) {
2126 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2127 osd_xinfo[weight.first].old_weight = 0;
2128 }
2129 }
2130
2131 for (const auto &primary_affinity : inc.new_primary_affinity) {
2132 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2133 }
2134
2135 // erasure_code_profiles
2136 for (const auto &profile : inc.old_erasure_code_profiles)
2137 erasure_code_profiles.erase(profile);
2138
2139 for (const auto &profile : inc.new_erasure_code_profiles) {
2140 set_erasure_code_profile(profile.first, profile.second);
2141 }
2142
// inc.new_state holds bits to XOR into osd_state (see the else branch); a
// zero value is treated as CEPH_OSD_UP. Toggling UP on an osd that is
// currently UP therefore marks it down, and toggling EXISTS on an osd that
// currently EXISTS destroys it.
2143 // up/down
2144 for (const auto &state : inc.new_state) {
2145 const auto osd = state.first;
2146 int s = state.second ? state.second : CEPH_OSD_UP;
2147 if ((osd_state[osd] & CEPH_OSD_UP) &&
2148 (s & CEPH_OSD_UP)) {
2149 osd_info[osd].down_at = epoch;
2150 osd_xinfo[osd].down_stamp = modified;
2151 }
2152 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2153 (s & CEPH_OSD_EXISTS)) {
2154 // osd is destroyed; clear out anything interesting.
2155 (*osd_uuid)[osd] = uuid_d();
2156 osd_info[osd] = osd_info_t();
2157 osd_xinfo[osd] = osd_xinfo_t();
2158 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2159 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2160 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2161 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2162 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2163 osd_state[osd] = 0;
2164 } else {
2165 osd_state[osd] ^= s;
2166 }
2167 }
2168
// NOTE(review): the find() results on new_hb_back_up / new_hb_front_up are
// dereferenced without an end() check; this assumes every osd listed in
// new_up_client also has an entry in both maps -- confirm with the producer
// of the incremental.
2169 for (const auto &client : inc.new_up_client) {
2170 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
9f95a23c 2171 osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
11fdf7f2
TL
2172 osd_addrs->client_addrs[client.first].reset(
2173 new entity_addrvec_t(client.second));
2174 osd_addrs->hb_back_addrs[client.first].reset(
2175 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2176 osd_addrs->hb_front_addrs[client.first].reset(
2177 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2178
2179 osd_info[client.first].up_from = epoch;
2180 }
2181
2182 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2183 osd_addrs->cluster_addrs[cluster.first].reset(
2184 new entity_addrvec_t(cluster.second));
7c673cae
FG
2185
2186 // info
2187 for (const auto &thru : inc.new_up_thru)
2188 osd_info[thru.first].up_thru = thru.second;
2189
2190 for (const auto &interval : inc.new_last_clean_interval) {
2191 osd_info[interval.first].last_clean_begin = interval.second.first;
2192 osd_info[interval.first].last_clean_end = interval.second.second;
2193 }
2194
2195 for (const auto &lost : inc.new_lost)
2196 osd_info[lost.first].lost_at = lost.second;
2197
2198 // xinfo
2199 for (const auto &xinfo : inc.new_xinfo)
2200 osd_xinfo[xinfo.first] = xinfo.second;
2201
2202 // uuid
2203 for (const auto &uuid : inc.new_uuid)
2204 (*osd_uuid)[uuid.first] = uuid.second;
2205
// an empty vector in new_pg_temp means "remove the pg_temp entry"
2206 // pg rebuild
2207 for (const auto &pg : inc.new_pg_temp) {
2208 if (pg.second.empty())
2209 pg_temp->erase(pg.first);
2210 else
31f18b77
FG
2211 pg_temp->set(pg.first, pg.second);
2212 }
2213 if (!inc.new_pg_temp.empty()) {
2214 // make sure pg_temp is efficiently stored
2215 pg_temp->rebuild();
7c673cae
FG
2216 }
2217
// -1 in new_primary_temp means "remove the primary_temp entry"
2218 for (const auto &pg : inc.new_primary_temp) {
2219 if (pg.second == -1)
2220 primary_temp->erase(pg.first);
2221 else
2222 (*primary_temp)[pg.first] = pg.second;
2223 }
2224
// additions are applied before removals, so a pg listed in both new_* and
// old_* ends up without an entry
2225 for (auto& p : inc.new_pg_upmap) {
2226 pg_upmap[p.first] = p.second;
2227 }
2228 for (auto& pg : inc.old_pg_upmap) {
2229 pg_upmap.erase(pg);
2230 }
2231 for (auto& p : inc.new_pg_upmap_items) {
2232 pg_upmap_items[p.first] = p.second;
2233 }
2234 for (auto& pg : inc.old_pg_upmap_items) {
2235 pg_upmap_items.erase(pg);
2236 }
2237
2238 // blacklist
2239 if (!inc.new_blacklist.empty()) {
2240 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
2241 new_blacklist_entries = true;
2242 }
2243 for (const auto &addr : inc.old_blacklist)
2244 blacklist.erase(addr);
2245
// a zero flag value removes the entry instead of storing it
81eedcae
TL
2246 for (auto& i : inc.new_crush_node_flags) {
2247 if (i.second) {
2248 crush_node_flags[i.first] = i.second;
2249 } else {
2250 crush_node_flags.erase(i.first);
2251 }
2252 }
2253
2254 for (auto& i : inc.new_device_class_flags) {
2255 if (i.second) {
2256 device_class_flags[i.first] = i.second;
2257 } else {
2258 device_class_flags.erase(i.first);
2259 }
2260 }
2261
// unlike most fields, the cluster snapshot is reset whenever the
// incremental does not carry one
7c673cae
FG
2262 // cluster snapshot?
2263 if (inc.cluster_snapshot.length()) {
2264 cluster_snapshot = inc.cluster_snapshot;
2265 cluster_snapshot_epoch = inc.epoch;
2266 } else {
2267 cluster_snapshot.clear();
2268 cluster_snapshot_epoch = 0;
2269 }
2270
// negative ratios mean "unchanged"
2271 if (inc.new_nearfull_ratio >= 0) {
2272 nearfull_ratio = inc.new_nearfull_ratio;
2273 }
2274 if (inc.new_backfillfull_ratio >= 0) {
2275 backfillfull_ratio = inc.new_backfillfull_ratio;
2276 }
2277 if (inc.new_full_ratio >= 0) {
2278 full_ratio = inc.new_full_ratio;
2279 }
9f95a23c 2280 if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
7c673cae
FG
2281 require_min_compat_client = inc.new_require_min_compat_client;
2282 }
// NOTE(review): this test uses >= unknown whereas the min_compat_client test
// above uses > unknown; whether an "unknown" value is meant to overwrite
// require_osd_release depends on how Incremental initialises the field --
// confirm.
9f95a23c 2283 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
31f18b77 2284 require_osd_release = inc.new_require_osd_release;
9f95a23c 2285 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 2286 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2287 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2288 }
2289 }
7c673cae 2290
// NOTE(review): same guard and same assignment as the block above; only the
// nautilus flag handling differs.
9f95a23c 2291 if (inc.new_require_osd_release >= ceph_release_t::unknown) {
11fdf7f2 2292 require_osd_release = inc.new_require_osd_release;
9f95a23c 2293 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
2294 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2295 }
2296 }
7c673cae
FG
2297 // do new crush map last (after up/down stuff)
2298 if (inc.crush.length()) {
9f95a23c 2299 ceph::buffer::list bl(inc.crush);
11fdf7f2 2300 auto blp = bl.cbegin();
7c673cae
FG
2301 crush.reset(new CrushWrapper);
2302 crush->decode(blp);
9f95a23c 2303 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77
FG
2304 // only increment if this is a luminous-encoded osdmap, lest
2305 // the mon's crush_version diverge from what the osds or others
2306 // are decoding and applying on their end. if we won't encode
2307 // it in the canonical version, don't change it.
2308 ++crush_version;
2309 }
// drop flags for device classes the freshly decoded crush map has no name for
81eedcae
TL
2310 for (auto it = device_class_flags.begin();
2311 it != device_class_flags.end();) {
2312 const char* class_name = crush->get_class_name(it->first);
2313 if (!class_name) // device class is gone
2314 it = device_class_flags.erase(it);
2315 else
2316 it++;
2317 }
7c673cae
FG
2318 }
2319
// recompute the derived values now that the update has been merged
2320 calc_num_osds();
2321 _calc_up_osd_features();
2322 return 0;
2323}
2324
2325// mapping
2326int OSDMap::map_to_pg(
2327 int64_t poolid,
2328 const string& name,
2329 const string& key,
2330 const string& nspace,
2331 pg_t *pg) const
2332{
2333 // calculate ps (placement seed)
2334 const pg_pool_t *pool = get_pg_pool(poolid);
2335 if (!pool)
2336 return -ENOENT;
2337 ps_t ps;
2338 if (!key.empty())
2339 ps = pool->hash_key(key, nspace);
2340 else
2341 ps = pool->hash_key(name, nspace);
2342 *pg = pg_t(ps, poolid);
2343 return 0;
2344}
2345
2346int OSDMap::object_locator_to_pg(
2347 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2348{
2349 if (loc.hash >= 0) {
2350 if (!get_pg_pool(loc.get_pool())) {
2351 return -ENOENT;
2352 }
2353 pg = pg_t(loc.hash, loc.get_pool());
2354 return 0;
2355 }
2356 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2357}
2358
2359ceph_object_layout OSDMap::make_object_layout(
2360 object_t oid, int pg_pool, string nspace) const
2361{
2362 object_locator_t loc(pg_pool, nspace);
2363
2364 ceph_object_layout ol;
2365 pg_t pgid = object_locator_to_pg(oid, loc);
2366 ol.ol_pgid = pgid.get_old_pg().v;
2367 ol.ol_stripe_unit = 0;
2368 return ol;
2369}
2370
2371void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2372 vector<int>& osds) const
2373{
2374 if (pool.can_shift_osds()) {
2375 unsigned removed = 0;
2376 for (unsigned i = 0; i < osds.size(); i++) {
2377 if (!exists(osds[i])) {
2378 removed++;
2379 continue;
2380 }
2381 if (removed) {
2382 osds[i - removed] = osds[i];
2383 }
2384 }
2385 if (removed)
2386 osds.resize(osds.size() - removed);
2387 } else {
2388 for (auto& osd : osds) {
2389 if (!exists(osd))
2390 osd = CRUSH_ITEM_NONE;
2391 }
2392 }
2393}
2394
31f18b77 2395void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2396 const pg_pool_t& pool, pg_t pg,
2397 vector<int> *osds,
2398 ps_t *ppps) const
2399{
2400 // map to osds[]
2401 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2402 unsigned size = pool.get_size();
2403
2404 // what crush rule?
31f18b77 2405 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
7c673cae
FG
2406 if (ruleno >= 0)
2407 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2408
2409 _remove_nonexistent_osds(pool, *osds);
2410
2411 if (ppps)
2412 *ppps = pps;
7c673cae
FG
2413}
2414
2415int OSDMap::_pick_primary(const vector<int>& osds) const
2416{
2417 for (auto osd : osds) {
2418 if (osd != CRUSH_ITEM_NONE) {
2419 return osd;
2420 }
2421 }
2422 return -1;
2423}
2424
224ce89b 2425void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2426{
2427 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2428 auto p = pg_upmap.find(pg);
2429 if (p != pg_upmap.end()) {
2430 // make sure targets aren't marked out
2431 for (auto osd : p->second) {
91327a77
AA
2432 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2433 osd_weight[osd] == 0) {
7c673cae
FG
2434 // reject/ignore the explicit mapping
2435 return;
2436 }
2437 }
2438 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2439 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2440 }
2441
2442 auto q = pg_upmap_items.find(pg);
2443 if (q != pg_upmap_items.end()) {
181888fb
FG
2444 // NOTE: this approach does not allow a bidirectional swap,
2445 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2446 for (auto& r : q->second) {
2447 // make sure the replacement value doesn't already appear
2448 bool exists = false;
2449 ssize_t pos = -1;
2450 for (unsigned i = 0; i < raw->size(); ++i) {
2451 int osd = (*raw)[i];
2452 if (osd == r.second) {
2453 exists = true;
2454 break;
2455 }
2456 // ignore mapping if target is marked out (or invalid osd id)
2457 if (osd == r.first &&
2458 pos < 0 &&
2459 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2460 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2461 pos = i;
2462 }
2463 }
2464 if (!exists && pos >= 0) {
2465 (*raw)[pos] = r.second;
7c673cae
FG
2466 }
2467 }
2468 }
2469}
2470
2471// pg -> (up osd list)
2472void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2473 vector<int> *up) const
2474{
2475 if (pool.can_shift_osds()) {
2476 // shift left
2477 up->clear();
2478 up->reserve(raw.size());
2479 for (unsigned i=0; i<raw.size(); i++) {
2480 if (!exists(raw[i]) || is_down(raw[i]))
2481 continue;
2482 up->push_back(raw[i]);
2483 }
2484 } else {
2485 // set down/dne devices to NONE
2486 up->resize(raw.size());
2487 for (int i = raw.size() - 1; i >= 0; --i) {
2488 if (!exists(raw[i]) || is_down(raw[i])) {
2489 (*up)[i] = CRUSH_ITEM_NONE;
2490 } else {
2491 (*up)[i] = raw[i];
2492 }
2493 }
2494 }
2495}
2496
2497void OSDMap::_apply_primary_affinity(ps_t seed,
2498 const pg_pool_t& pool,
2499 vector<int> *osds,
2500 int *primary) const
2501{
2502 // do we have any non-default primary_affinity values for these osds?
2503 if (!osd_primary_affinity)
2504 return;
2505
2506 bool any = false;
2507 for (const auto osd : *osds) {
2508 if (osd != CRUSH_ITEM_NONE &&
2509 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2510 any = true;
2511 break;
2512 }
2513 }
2514 if (!any)
2515 return;
2516
2517 // pick the primary. feed both the seed (for the pg) and the osd
2518 // into the hash/rng so that a proportional fraction of an osd's pgs
2519 // get rejected as primary.
2520 int pos = -1;
2521 for (unsigned i = 0; i < osds->size(); ++i) {
2522 int o = (*osds)[i];
2523 if (o == CRUSH_ITEM_NONE)
2524 continue;
2525 unsigned a = (*osd_primary_affinity)[o];
2526 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2527 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2528 seed, o) >> 16) >= a) {
2529 // we chose not to use this primary. note it anyway as a
2530 // fallback in case we don't pick anyone else, but keep looking.
2531 if (pos < 0)
2532 pos = i;
2533 } else {
2534 pos = i;
2535 break;
2536 }
2537 }
2538 if (pos < 0)
2539 return;
2540
2541 *primary = (*osds)[pos];
2542
2543 if (pool.can_shift_osds() && pos > 0) {
2544 // move the new primary to the front.
2545 for (int i = pos; i > 0; --i) {
2546 (*osds)[i] = (*osds)[i-1];
2547 }
2548 (*osds)[0] = *primary;
2549 }
2550}
2551
2552void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2553 vector<int> *temp_pg, int *temp_primary) const
2554{
2555 pg = pool.raw_pg_to_pg(pg);
2556 const auto p = pg_temp->find(pg);
2557 temp_pg->clear();
2558 if (p != pg_temp->end()) {
2559 for (unsigned i=0; i<p->second.size(); i++) {
2560 if (!exists(p->second[i]) || is_down(p->second[i])) {
2561 if (pool.can_shift_osds()) {
2562 continue;
2563 } else {
2564 temp_pg->push_back(CRUSH_ITEM_NONE);
2565 }
2566 } else {
2567 temp_pg->push_back(p->second[i]);
2568 }
2569 }
2570 }
2571 const auto &pp = primary_temp->find(pg);
2572 *temp_primary = -1;
2573 if (pp != primary_temp->end()) {
2574 *temp_primary = pp->second;
2575 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2576 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2577 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2578 *temp_primary = (*temp_pg)[i];
2579 break;
2580 }
2581 }
2582 }
2583}
2584
31f18b77 2585void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2586{
7c673cae 2587 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2588 if (!pool) {
2589 *primary = -1;
2590 raw->clear();
31f18b77 2591 return;
11fdf7f2 2592 }
31f18b77 2593 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2594 *primary = _pick_primary(*raw);
7c673cae
FG
2595}
2596
494da23a
TL
2597void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2598 vector<int> *raw_upmap) const
a8e16298
TL
2599{
2600 auto pool = get_pg_pool(pg.pool());
2601 if (!pool) {
2602 raw_upmap->clear();
2603 return;
2604 }
494da23a
TL
2605 _pg_to_raw_osds(*pool, pg, raw, NULL);
2606 *raw_upmap = *raw;
a8e16298
TL
2607 _apply_upmap(*pool, pg, raw_upmap);
2608}
2609
7c673cae
FG
2610void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2611{
2612 const pg_pool_t *pool = get_pg_pool(pg.pool());
2613 if (!pool) {
11fdf7f2
TL
2614 *primary = -1;
2615 up->clear();
7c673cae
FG
2616 return;
2617 }
2618 vector<int> raw;
2619 ps_t pps;
2620 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2621 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2622 _raw_to_up_osds(*pool, raw, up);
2623 *primary = _pick_primary(raw);
2624 _apply_primary_affinity(pps, *pool, up, primary);
2625}
31f18b77 2626
7c673cae
FG
2627void OSDMap::_pg_to_up_acting_osds(
2628 const pg_t& pg, vector<int> *up, int *up_primary,
2629 vector<int> *acting, int *acting_primary,
2630 bool raw_pg_to_pg) const
2631{
2632 const pg_pool_t *pool = get_pg_pool(pg.pool());
2633 if (!pool ||
2634 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2635 if (up)
2636 up->clear();
2637 if (up_primary)
2638 *up_primary = -1;
2639 if (acting)
2640 acting->clear();
2641 if (acting_primary)
2642 *acting_primary = -1;
2643 return;
2644 }
2645 vector<int> raw;
2646 vector<int> _up;
2647 vector<int> _acting;
2648 int _up_primary;
2649 int _acting_primary;
2650 ps_t pps;
2651 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2652 if (_acting.empty() || up || up_primary) {
2653 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2654 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2655 _raw_to_up_osds(*pool, raw, &_up);
2656 _up_primary = _pick_primary(_up);
2657 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2658 if (_acting.empty()) {
2659 _acting = _up;
2660 if (_acting_primary == -1) {
2661 _acting_primary = _up_primary;
2662 }
2663 }
2664
2665 if (up)
2666 up->swap(_up);
2667 if (up_primary)
2668 *up_primary = _up_primary;
2669 }
2670
2671 if (acting)
2672 acting->swap(_acting);
2673 if (acting_primary)
2674 *acting_primary = _acting_primary;
2675}
2676
9f95a23c 2677int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
7c673cae 2678{
9f95a23c
TL
2679 // This implementation is broken for EC PGs since the osd may appear
2680 // multiple times in the acting set. See
2681 // https://tracker.ceph.com/issues/43213
7c673cae
FG
2682 if (!nrep)
2683 nrep = acting.size();
2684 for (int i=0; i<nrep; i++)
2685 if (acting[i] == osd)
2686 return i;
2687 return -1;
2688}
2689
9f95a23c 2690int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
7c673cae 2691{
9f95a23c
TL
2692 int nrep = acting.size();
2693 if (who.shard == shard_id_t::NO_SHARD) {
2694 for (int i=0; i<nrep; i++) {
2695 if (acting[i] == who.osd) {
2696 return i;
2697 }
2698 }
2699 } else {
2700 if (who.shard < nrep && acting[who.shard] == who.osd) {
2701 return who.shard;
2702 }
2703 }
2704 return -1;
7c673cae
FG
2705}
2706
9f95a23c 2707bool OSDMap::primary_changed_broken(
7c673cae
FG
2708 int oldprimary,
2709 const vector<int> &oldacting,
2710 int newprimary,
2711 const vector<int> &newacting)
2712{
2713 if (oldacting.empty() && newacting.empty())
2714 return false; // both still empty
2715 if (oldacting.empty() ^ newacting.empty())
2716 return true; // was empty, now not, or vice versa
2717 if (oldprimary != newprimary)
2718 return true; // primary changed
9f95a23c
TL
2719 if (calc_pg_role_broken(oldprimary, oldacting) !=
2720 calc_pg_role_broken(newprimary, newacting))
7c673cae
FG
2721 return true;
2722 return false; // same primary (tho replicas may have changed)
2723}
2724
28e407b8
AA
2725uint64_t OSDMap::get_encoding_features() const
2726{
2727 uint64_t f = SIGNIFICANT_FEATURES;
9f95a23c
TL
2728 if (require_osd_release < ceph_release_t::octopus) {
2729 f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
2730 }
2731 if (require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
2732 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2733 }
9f95a23c 2734 if (require_osd_release < ceph_release_t::mimic) {
11fdf7f2
TL
2735 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2736 }
9f95a23c 2737 if (require_osd_release < ceph_release_t::luminous) {
28e407b8
AA
2738 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2739 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2740 }
9f95a23c 2741 if (require_osd_release < ceph_release_t::kraken) {
28e407b8 2742 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2743 CEPH_FEATURE_MSG_ADDR2);
28e407b8 2744 }
9f95a23c 2745 if (require_osd_release < ceph_release_t::jewel) {
28e407b8 2746 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2747 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2748 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2749 }
2750 return f;
2751}
7c673cae
FG
2752
2753// serialize, unserialize
// Append this map to bl in the oldest layout handled here (version tag 5).
//
// Containers keyed by pool id are written by hand so that counts and keys go
// out as 32-bit values, pools and client addresses are encoded with a
// feature mask of 0, pg_temp keys are written as old_pg_t, and the crush map
// is encoded with no features. The statement order below is the wire format.
9f95a23c 2754void OSDMap::encode_client_old(ceph::buffer::list& bl) const
7c673cae 2755{
11fdf7f2 2756 using ceph::encode;
7c673cae 2757 __u16 v = 5;
11fdf7f2 2758 encode(v, bl);
7c673cae
FG
2759
2760 // base
11fdf7f2
TL
2761 encode(fsid, bl);
2762 encode(epoch, bl);
2763 encode(created, bl);
2764 encode(modified, bl);
7c673cae 2765
// n is reused below as a 32-bit scratch value for each count and key
11fdf7f2 2766 // for encode(pools, bl);
7c673cae 2767 __u32 n = pools.size();
11fdf7f2 2768 encode(n, bl);
7c673cae
FG
2769
2770 for (const auto &pool : pools) {
2771 n = pool.first;
11fdf7f2
TL
2772 encode(n, bl);
2773 encode(pool.second, bl, 0);
7c673cae 2774 }
11fdf7f2 2775 // for encode(pool_name, bl);
7c673cae 2776 n = pool_name.size();
11fdf7f2 2777 encode(n, bl);
7c673cae
FG
2778 for (const auto &pname : pool_name) {
2779 n = pname.first;
11fdf7f2
TL
2780 encode(n, bl);
2781 encode(pname.second, bl);
7c673cae 2782 }
11fdf7f2 2783 // for encode(pool_max, bl);
7c673cae 2784 n = pool_max;
11fdf7f2 2785 encode(n, bl);
7c673cae 2786
11fdf7f2 2787 encode(flags, bl);
7c673cae 2788
11fdf7f2 2789 encode(max_osd, bl);
// osd_state is written as a count followed by one byte per osd; the inner n
// shadows the outer one only within this scope
31f18b77
FG
2790 {
2791 uint32_t n = osd_state.size();
11fdf7f2 2792 encode(n, bl);
31f18b77 2793 for (auto s : osd_state) {
11fdf7f2 2794 encode((uint8_t)s, bl);
31f18b77
FG
2795 }
2796 }
11fdf7f2
TL
2797 encode(osd_weight, bl);
2798 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 2799
11fdf7f2 2800 // for encode(pg_temp, bl);
7c673cae 2801 n = pg_temp->size();
11fdf7f2 2802 encode(n, bl);
7c673cae
FG
2803 for (const auto pg : *pg_temp) {
2804 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
2805 encode(opg, bl);
2806 encode(pg.second, bl);
7c673cae
FG
2807 }
2808
// the crush map is encoded into its own buffer, which is then appended
2809 // crush
9f95a23c 2810 ceph::buffer::list cbl;
7c673cae 2811 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2812 encode(cbl, bl);
7c673cae
FG
2813}
2814
// Append this map to bl in the pre-OSDMAP_ENC ("classic") layout.
//
// Without CEPH_FEATURE_PGID64 in features this defers entirely to
// encode_client_old(). Otherwise it writes version tag 6 and the base
// fields, followed by an "extended" section tagged 10. The crush map is
// still encoded with no features. The statement order is the wire format.
9f95a23c 2815void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2816{
11fdf7f2 2817 using ceph::encode;
7c673cae
FG
2818 if ((features & CEPH_FEATURE_PGID64) == 0) {
2819 encode_client_old(bl);
2820 return;
2821 }
2822
2823 __u16 v = 6;
11fdf7f2 2824 encode(v, bl);
7c673cae
FG
2825
2826 // base
11fdf7f2
TL
2827 encode(fsid, bl);
2828 encode(epoch, bl);
2829 encode(created, bl);
2830 encode(modified, bl);
7c673cae 2831
11fdf7f2
TL
2832 encode(pools, bl, features);
2833 encode(pool_name, bl);
2834 encode(pool_max, bl);
7c673cae 2835
11fdf7f2 2836 encode(flags, bl);
7c673cae 2837
11fdf7f2 2838 encode(max_osd, bl);
// osd_state goes out as a count followed by one byte per osd
31f18b77
FG
2839 {
2840 uint32_t n = osd_state.size();
11fdf7f2 2841 encode(n, bl);
31f18b77 2842 for (auto s : osd_state) {
11fdf7f2 2843 encode((uint8_t)s, bl);
31f18b77
FG
2844 }
2845 }
11fdf7f2
TL
2846 encode(osd_weight, bl);
2847 encode(osd_addrs->client_addrs, bl, features);
7c673cae 2848
11fdf7f2 2849 encode(*pg_temp, bl);
7c673cae
FG
2850
2851 // crush
9f95a23c 2852 ceph::buffer::list cbl;
7c673cae 2853 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2854 encode(cbl, bl);
7c673cae
FG
2855
// ev tags the extended section that follows the base fields
2856 // extended
2857 __u16 ev = 10;
11fdf7f2
TL
2858 encode(ev, bl);
2859 encode(osd_addrs->hb_back_addrs, bl, features);
2860 encode(osd_info, bl);
2861 encode(blacklist, bl, features);
2862 encode(osd_addrs->cluster_addrs, bl, features);
2863 encode(cluster_snapshot_epoch, bl);
2864 encode(cluster_snapshot, bl);
2865 encode(*osd_uuid, bl);
9f95a23c 2866 encode(osd_xinfo, bl, features);
11fdf7f2 2867 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
2868}
2869
11fdf7f2
TL
2870/* for a description of osdmap versions, and when they were introduced, please
2871 * refer to
2872 * doc/dev/osd_internals/osdmap_versions.txt
2873 */
// Append the canonical encoding of this map to bl.
//
// Without CEPH_FEATURE_OSDMAP_ENC in features this defers to
// encode_classic(). Otherwise the caller must have set
// CEPH_FEATURE_RESERVED (asserted, then stripped) and the output is a
// versioned wrapper holding, in order: a client-usable section, an osd-only
// section, and a 4-byte crc. The version of each inner section is lowered
// according to which SERVER_* feature bits are missing, and fields are only
// written when the chosen version includes them.
//
// The crc covers everything this call appended except the 4-byte crc slot
// itself. It is stored in the crc member and crc_defined is set, even though
// the method is const.
9f95a23c 2874void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
7c673cae 2875{
11fdf7f2 2876 using ceph::encode;
7c673cae
FG
2877 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2878 encode_classic(bl, features);
2879 return;
2880 }
2881
2882 // only a select set of callers should *ever* be encoding new
2883 // OSDMaps. others should be passing around the canonical encoded
2884 // buffers from on high. select out those callers by passing in an
2885 // "impossible" feature bit.
11fdf7f2 2886 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
2887 features &= ~CEPH_FEATURE_RESERVED;
2888
// offsets into bl that delimit the regions fed to the crc at the end
2889 size_t start_offset = bl.length();
2890 size_t tail_offset;
11fdf7f2 2891 size_t crc_offset;
9f95a23c 2892 std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
2893
2894 // meta-encoding: how we include client-used and osd-specific data
2895 ENCODE_START(8, 7, bl);
2896
2897 {
28e407b8
AA
2898 // NOTE: any new encoding dependencies must be reflected by
2899 // SIGNIFICANT_FEATURES
// client section version: 9 by default, 7 without SERVER_NAUTILUS, 6 without
// SERVER_MIMIC, 3 without SERVER_LUMINOUS
11fdf7f2 2900 uint8_t v = 9;
31f18b77 2901 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 2902 v = 3;
11fdf7f2
TL
2903 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2904 v = 6;
2905 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2906 v = 7;
7c673cae
FG
2907 }
2908 ENCODE_START(v, 1, bl); // client-usable data
2909 // base
11fdf7f2
TL
2910 encode(fsid, bl);
2911 encode(epoch, bl);
2912 encode(created, bl);
2913 encode(modified, bl);
7c673cae 2914
11fdf7f2
TL
2915 encode(pools, bl, features);
2916 encode(pool_name, bl);
2917 encode(pool_max, bl);
7c673cae 2918
// for v < 4 the require_osd_release value is folded back into the legacy
// REQUIRE_* flag bits of a local copy of flags; the member is not modified
31f18b77
FG
2919 if (v < 4) {
2920 decltype(flags) f = flags;
9f95a23c 2921 if (require_osd_release >= ceph_release_t::luminous)
c07f9fc5 2922 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
9f95a23c 2923 else if (require_osd_release == ceph_release_t::kraken)
31f18b77 2924 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
9f95a23c 2925 else if (require_osd_release == ceph_release_t::jewel)
31f18b77 2926 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 2927 encode(f, bl);
31f18b77 2928 } else {
11fdf7f2 2929 encode(flags, bl);
31f18b77 2930 }
7c673cae 2931
11fdf7f2 2932 encode(max_osd, bl);
// before v5 each osd_state entry is narrowed to a single byte
31f18b77 2933 if (v >= 5) {
11fdf7f2 2934 encode(osd_state, bl);
31f18b77
FG
2935 } else {
2936 uint32_t n = osd_state.size();
11fdf7f2 2937 encode(n, bl);
31f18b77 2938 for (auto s : osd_state) {
11fdf7f2 2939 encode((uint8_t)s, bl);
31f18b77
FG
2940 }
2941 }
11fdf7f2
TL
2942 encode(osd_weight, bl);
2943 if (v >= 8) {
2944 encode(osd_addrs->client_addrs, bl, features);
2945 } else {
2946 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
2947 }
7c673cae 2948
11fdf7f2
TL
2949 encode(*pg_temp, bl);
2950 encode(*primary_temp, bl);
// an absent affinity table is written as an empty vector (this local v
// shadows the version variable within the else scope)
7c673cae 2951 if (osd_primary_affinity) {
11fdf7f2 2952 encode(*osd_primary_affinity, bl);
7c673cae
FG
2953 } else {
2954 vector<__u32> v;
11fdf7f2 2955 encode(v, bl);
7c673cae
FG
2956 }
2957
2958 // crush
9f95a23c 2959 ceph::buffer::list cbl;
7c673cae 2960 crush->encode(cbl, features);
11fdf7f2
TL
2961 encode(cbl, bl);
2962 encode(erasure_code_profiles, bl);
7c673cae
FG
2963
// versions below 4 cannot carry upmaps, so there must be none to lose
2964 if (v >= 4) {
11fdf7f2
TL
2965 encode(pg_upmap, bl);
2966 encode(pg_upmap_items, bl);
7c673cae 2967 } else {
11fdf7f2
TL
2968 ceph_assert(pg_upmap.empty());
2969 ceph_assert(pg_upmap_items.empty());
7c673cae 2970 }
31f18b77 2971 if (v >= 6) {
11fdf7f2
TL
2972 encode(crush_version, bl);
2973 }
2974 if (v >= 7) {
2975 encode(new_removed_snaps, bl);
2976 encode(new_purged_snaps, bl);
2977 }
2978 if (v >= 9) {
2979 encode(last_up_change, bl);
2980 encode(last_in_change, bl);
31f18b77 2981 }
7c673cae
FG
2982 ENCODE_FINISH(bl); // client-usable data
2983 }
2984
2985 {
28e407b8
AA
2986 // NOTE: any new encoding dependencies must be reflected by
2987 // SIGNIFICANT_FEATURES
// osd-only section version: 9 by default, 6 without SERVER_NAUTILUS, 5
// without SERVER_MIMIC, 1 without SERVER_LUMINOUS
81eedcae 2988 uint8_t target_v = 9;
7c673cae
FG
2989 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2990 target_v = 1;
11fdf7f2
TL
2991 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2992 target_v = 5;
2993 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2994 target_v = 6;
7c673cae
FG
2995 }
2996 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
2997 if (target_v < 7) {
2998 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
2999 } else {
3000 encode(osd_addrs->hb_back_addrs, bl, features);
3001 }
3002 encode(osd_info, bl);
7c673cae
FG
3003 {
3004 // put this in a sorted, ordered map<> so that we encode in a
3005 // deterministic order.
3006 map<entity_addr_t,utime_t> blacklist_map;
3007 for (const auto &addr : blacklist)
3008 blacklist_map.insert(make_pair(addr.first, addr.second));
11fdf7f2
TL
3009 encode(blacklist_map, bl, features);
3010 }
3011 if (target_v < 7) {
3012 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
3013 } else {
3014 encode(osd_addrs->cluster_addrs, bl, features);
3015 }
3016 encode(cluster_snapshot_epoch, bl);
3017 encode(cluster_snapshot, bl);
3018 encode(*osd_uuid, bl);
9f95a23c 3019 encode(osd_xinfo, bl, features);
11fdf7f2
TL
3020 if (target_v < 7) {
3021 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
3022 } else {
3023 encode(osd_addrs->hb_front_addrs, bl, features);
3024 }
// note the on-wire order: nearfull, full, backfillfull
7c673cae 3025 if (target_v >= 2) {
11fdf7f2
TL
3026 encode(nearfull_ratio, bl);
3027 encode(full_ratio, bl);
3028 encode(backfillfull_ratio, bl);
31f18b77
FG
3029 }
3030 // 4 was string-based new_require_min_compat_client
3031 if (target_v >= 5) {
11fdf7f2
TL
3032 encode(require_min_compat_client, bl);
3033 encode(require_osd_release, bl);
3034 }
3035 if (target_v >= 6) {
3036 encode(removed_snaps_queue, bl);
7c673cae 3037 }
81eedcae
TL
3038 if (target_v >= 8) {
3039 encode(crush_node_flags, bl);
3040 }
3041 if (target_v >= 9) {
3042 encode(device_class_flags, bl);
3043 }
7c673cae
FG
3044 ENCODE_FINISH(bl); // osd-only data
3045 }
3046
// reserve 4 bytes for the crc now; they are filled in once the wrapper has
// been finished and the checksum can be computed
11fdf7f2
TL
3047 crc_offset = bl.length();
3048 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
3049 tail_offset = bl.length();
3050
3051 ENCODE_FINISH(bl); // meta-encoding wrapper
3052
// crc = crc32c over [start_offset, crc_offset), continued over anything that
// follows the hole
3053 // fill in crc
9f95a23c 3054 ceph::buffer::list front;
11fdf7f2 3055 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
3056 crc = front.crc32c(-1);
3057 if (tail_offset < bl.length()) {
9f95a23c 3058 ceph::buffer::list tail;
7c673cae
FG
3059 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
3060 crc = tail.crc32c(crc);
3061 }
3062 ceph_le32 crc_le;
3063 crc_le = crc;
11fdf7f2 3064 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
3065 crc_defined = true;
3066}
3067
11fdf7f2
TL
3068/* for a description of osdmap versions, and when they were introduced, please
3069 * refer to
3070 * doc/dev/osd_internals/osdmap_versions.txt
3071 */
9f95a23c 3072void OSDMap::decode(ceph::buffer::list& bl)
7c673cae 3073{
11fdf7f2 3074 auto p = bl.cbegin();
7c673cae
FG
3075 decode(p);
3076}
3077
9f95a23c 3078void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
7c673cae 3079{
11fdf7f2 3080 using ceph::decode;
7c673cae
FG
3081 __u32 n, t;
3082 __u16 v;
11fdf7f2 3083 decode(v, p);
7c673cae
FG
3084
3085 // base
11fdf7f2
TL
3086 decode(fsid, p);
3087 decode(epoch, p);
3088 decode(created, p);
3089 decode(modified, p);
7c673cae
FG
3090
3091 if (v < 6) {
3092 if (v < 4) {
3093 int32_t max_pools = 0;
11fdf7f2 3094 decode(max_pools, p);
7c673cae
FG
3095 pool_max = max_pools;
3096 }
3097 pools.clear();
11fdf7f2 3098 decode(n, p);
7c673cae 3099 while (n--) {
11fdf7f2
TL
3100 decode(t, p);
3101 decode(pools[t], p);
7c673cae
FG
3102 }
3103 if (v == 4) {
11fdf7f2 3104 decode(n, p);
7c673cae
FG
3105 pool_max = n;
3106 } else if (v == 5) {
3107 pool_name.clear();
11fdf7f2 3108 decode(n, p);
7c673cae 3109 while (n--) {
11fdf7f2
TL
3110 decode(t, p);
3111 decode(pool_name[t], p);
7c673cae 3112 }
11fdf7f2 3113 decode(n, p);
7c673cae
FG
3114 pool_max = n;
3115 }
3116 } else {
11fdf7f2
TL
3117 decode(pools, p);
3118 decode(pool_name, p);
3119 decode(pool_max, p);
7c673cae
FG
3120 }
3121 // kludge around some old bug that zeroed out pool_max (#2307)
3122 if (pools.size() && pool_max < pools.rbegin()->first) {
3123 pool_max = pools.rbegin()->first;
3124 }
3125
11fdf7f2 3126 decode(flags, p);
7c673cae 3127
11fdf7f2 3128 decode(max_osd, p);
31f18b77
FG
3129 {
3130 vector<uint8_t> os;
11fdf7f2 3131 decode(os, p);
31f18b77
FG
3132 osd_state.resize(os.size());
3133 for (unsigned i = 0; i < os.size(); ++i) {
3134 osd_state[i] = os[i];
3135 }
3136 }
11fdf7f2
TL
3137 decode(osd_weight, p);
3138 decode(osd_addrs->client_addrs, p);
7c673cae
FG
3139 if (v <= 5) {
3140 pg_temp->clear();
11fdf7f2 3141 decode(n, p);
7c673cae
FG
3142 while (n--) {
3143 old_pg_t opg;
9f95a23c 3144 ceph::decode_raw(opg, p);
31f18b77 3145 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3146 decode(v, p);
31f18b77 3147 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3148 }
3149 } else {
11fdf7f2 3150 decode(*pg_temp, p);
7c673cae
FG
3151 }
3152
3153 // crush
9f95a23c 3154 ceph::buffer::list cbl;
11fdf7f2
TL
3155 decode(cbl, p);
3156 auto cblp = cbl.cbegin();
7c673cae
FG
3157 crush->decode(cblp);
3158
3159 // extended
3160 __u16 ev = 0;
3161 if (v >= 5)
11fdf7f2
TL
3162 decode(ev, p);
3163 decode(osd_addrs->hb_back_addrs, p);
3164 decode(osd_info, p);
7c673cae 3165 if (v < 5)
11fdf7f2 3166 decode(pool_name, p);
7c673cae 3167
11fdf7f2 3168 decode(blacklist, p);
7c673cae 3169 if (ev >= 6)
11fdf7f2 3170 decode(osd_addrs->cluster_addrs, p);
7c673cae 3171 else
11fdf7f2 3172 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3173
3174 if (ev >= 7) {
11fdf7f2
TL
3175 decode(cluster_snapshot_epoch, p);
3176 decode(cluster_snapshot, p);
7c673cae
FG
3177 }
3178
3179 if (ev >= 8) {
11fdf7f2 3180 decode(*osd_uuid, p);
7c673cae
FG
3181 } else {
3182 osd_uuid->resize(max_osd);
3183 }
3184 if (ev >= 9)
11fdf7f2 3185 decode(osd_xinfo, p);
7c673cae
FG
3186 else
3187 osd_xinfo.resize(max_osd);
3188
3189 if (ev >= 10)
11fdf7f2 3190 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3191 else
11fdf7f2 3192 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3193
3194 osd_primary_affinity.reset();
3195
3196 post_decode();
3197}
3198
9f95a23c 3199void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
7c673cae 3200{
11fdf7f2 3201 using ceph::decode;
7c673cae
FG
3202 /**
3203 * Older encodings of the OSDMap had a single struct_v which
3204 * covered the whole encoding, and was prior to our modern
3205 * stuff which includes a compatv and a size. So if we see
3206 * a struct_v < 7, we must rewind to the beginning and use our
3207 * classic decoder.
3208 */
3209 size_t start_offset = bl.get_off();
3210 size_t tail_offset = 0;
9f95a23c 3211 ceph::buffer::list crc_front, crc_tail;
7c673cae
FG
3212
3213 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
3214 if (struct_v < 7) {
11fdf7f2 3215 bl.seek(start_offset);
7c673cae
FG
3216 decode_classic(bl);
3217 return;
3218 }
3219 /**
3220 * Since we made it past that hurdle, we can use our normal paths.
3221 */
3222 {
11fdf7f2 3223 DECODE_START(9, bl); // client-usable data
7c673cae 3224 // base
11fdf7f2
TL
3225 decode(fsid, bl);
3226 decode(epoch, bl);
3227 decode(created, bl);
3228 decode(modified, bl);
7c673cae 3229
11fdf7f2
TL
3230 decode(pools, bl);
3231 decode(pool_name, bl);
3232 decode(pool_max, bl);
7c673cae 3233
11fdf7f2 3234 decode(flags, bl);
7c673cae 3235
11fdf7f2 3236 decode(max_osd, bl);
31f18b77 3237 if (struct_v >= 5) {
11fdf7f2 3238 decode(osd_state, bl);
31f18b77
FG
3239 } else {
3240 vector<uint8_t> os;
11fdf7f2 3241 decode(os, bl);
31f18b77
FG
3242 osd_state.resize(os.size());
3243 for (unsigned i = 0; i < os.size(); ++i) {
3244 osd_state[i] = os[i];
3245 }
3246 }
11fdf7f2
TL
3247 decode(osd_weight, bl);
3248 decode(osd_addrs->client_addrs, bl);
7c673cae 3249
11fdf7f2
TL
3250 decode(*pg_temp, bl);
3251 decode(*primary_temp, bl);
3252 // dates back to firefly. version increased from 2 to 3 still in firefly.
3253 // do we really still need to keep this around? even for old clients?
7c673cae
FG
3254 if (struct_v >= 2) {
3255 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
11fdf7f2 3256 decode(*osd_primary_affinity, bl);
7c673cae
FG
3257 if (osd_primary_affinity->empty())
3258 osd_primary_affinity.reset();
3259 } else {
3260 osd_primary_affinity.reset();
3261 }
3262
3263 // crush
9f95a23c 3264 ceph::buffer::list cbl;
11fdf7f2
TL
3265 decode(cbl, bl);
3266 auto cblp = cbl.cbegin();
7c673cae 3267 crush->decode(cblp);
11fdf7f2
TL
3268 // added in firefly; version increased in luminous, so it affects
3269 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3270 // alone until we require clients to be all luminous?
7c673cae 3271 if (struct_v >= 3) {
11fdf7f2 3272 decode(erasure_code_profiles, bl);
7c673cae
FG
3273 } else {
3274 erasure_code_profiles.clear();
3275 }
11fdf7f2
TL
3276 // version increased from 3 to 4 still in luminous, so same as above
3277 // applies.
7c673cae 3278 if (struct_v >= 4) {
11fdf7f2
TL
3279 decode(pg_upmap, bl);
3280 decode(pg_upmap_items, bl);
7c673cae
FG
3281 } else {
3282 pg_upmap.clear();
3283 pg_upmap_items.clear();
3284 }
11fdf7f2
TL
3285 // again, version increased from 5 to 6 still in luminous, so above
3286 // applies.
31f18b77 3287 if (struct_v >= 6) {
11fdf7f2
TL
3288 decode(crush_version, bl);
3289 }
3290 // version increase from 6 to 7 in mimic
3291 if (struct_v >= 7) {
3292 decode(new_removed_snaps, bl);
3293 decode(new_purged_snaps, bl);
3294 }
3295 // version increase from 7 to 8, 8 to 9, in nautilus.
3296 if (struct_v >= 9) {
3297 decode(last_up_change, bl);
3298 decode(last_in_change, bl);
31f18b77 3299 }
7c673cae
FG
3300 DECODE_FINISH(bl); // client-usable data
3301 }
3302
3303 {
81eedcae 3304 DECODE_START(9, bl); // extended, osd-only data
11fdf7f2
TL
3305 decode(osd_addrs->hb_back_addrs, bl);
3306 decode(osd_info, bl);
3307 decode(blacklist, bl);
3308 decode(osd_addrs->cluster_addrs, bl);
3309 decode(cluster_snapshot_epoch, bl);
3310 decode(cluster_snapshot, bl);
3311 decode(*osd_uuid, bl);
3312 decode(osd_xinfo, bl);
3313 decode(osd_addrs->hb_front_addrs, bl);
3314 //
7c673cae 3315 if (struct_v >= 2) {
11fdf7f2
TL
3316 decode(nearfull_ratio, bl);
3317 decode(full_ratio, bl);
7c673cae
FG
3318 } else {
3319 nearfull_ratio = 0;
3320 full_ratio = 0;
3321 }
3322 if (struct_v >= 3) {
11fdf7f2 3323 decode(backfillfull_ratio, bl);
7c673cae
FG
3324 } else {
3325 backfillfull_ratio = 0;
3326 }
31f18b77
FG
3327 if (struct_v == 4) {
3328 string r;
11fdf7f2 3329 decode(r, bl);
31f18b77
FG
3330 if (r.length())
3331 require_min_compat_client = ceph_release_from_name(r.c_str());
3332 }
3333 if (struct_v >= 5) {
11fdf7f2
TL
3334 decode(require_min_compat_client, bl);
3335 decode(require_osd_release, bl);
9f95a23c 3336 if (require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
3337 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
3338 }
9f95a23c 3339 if (require_osd_release >= ceph_release_t::luminous) {
31f18b77 3340 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3341 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
3342 }
3343 } else {
3344 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
3345 // only for compat with post-kraken pre-luminous test clusters
9f95a23c 3346 require_osd_release = ceph_release_t::luminous;
31f18b77 3347 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 3348 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77 3349 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
9f95a23c 3350 require_osd_release = ceph_release_t::kraken;
31f18b77 3351 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
9f95a23c 3352 require_osd_release = ceph_release_t::jewel;
31f18b77 3353 } else {
9f95a23c 3354 require_osd_release = ceph_release_t::unknown;
31f18b77
FG
3355 }
3356 }
11fdf7f2
TL
3357 if (struct_v >= 6) {
3358 decode(removed_snaps_queue, bl);
3359 }
81eedcae
TL
3360 if (struct_v >= 8) {
3361 decode(crush_node_flags, bl);
3362 } else {
3363 crush_node_flags.clear();
3364 }
3365 if (struct_v >= 9) {
3366 decode(device_class_flags, bl);
3367 } else {
3368 device_class_flags.clear();
3369 }
7c673cae
FG
3370 DECODE_FINISH(bl); // osd-only data
3371 }
3372
3373 if (struct_v >= 8) {
3374 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 3375 decode(crc, bl);
7c673cae
FG
3376 tail_offset = bl.get_off();
3377 crc_defined = true;
3378 } else {
3379 crc_defined = false;
3380 crc = 0;
3381 }
3382
3383 DECODE_FINISH(bl); // wrapper
3384
3385 if (tail_offset) {
3386 // verify crc
3387 uint32_t actual = crc_front.crc32c(-1);
3388 if (tail_offset < bl.get_off()) {
9f95a23c 3389 ceph::buffer::list tail;
7c673cae
FG
3390 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
3391 actual = tail.crc32c(actual);
3392 }
3393 if (crc != actual) {
3394 ostringstream ss;
3395 ss << "bad crc, actual " << actual << " != expected " << crc;
3396 string s = ss.str();
9f95a23c 3397 throw ceph::buffer::malformed_input(s.c_str());
7c673cae
FG
3398 }
3399 }
3400
3401 post_decode();
3402}
3403
3404void OSDMap::post_decode()
3405{
3406 // index pool names
3407 name_pool.clear();
3408 for (const auto &pname : pool_name) {
3409 name_pool[pname.second] = pname.first;
3410 }
3411
3412 calc_num_osds();
3413 _calc_up_osd_features();
3414}
3415
3416void OSDMap::dump_erasure_code_profiles(
3417 const mempool::osdmap::map<string,map<string,string>>& profiles,
3418 Formatter *f)
3419{
3420 f->open_object_section("erasure_code_profiles");
3421 for (const auto &profile : profiles) {
3422 f->open_object_section(profile.first.c_str());
3423 for (const auto &profm : profile.second) {
9f95a23c 3424 f->dump_string(profm.first.c_str(), profm.second);
7c673cae
FG
3425 }
3426 f->close_section();
3427 }
3428 f->close_section();
3429}
3430
9f95a23c
TL
3431void OSDMap::dump_osds(Formatter *f) const
3432{
3433 f->open_array_section("osds");
3434 for (int i=0; i<get_max_osd(); i++) {
3435 if (exists(i)) {
3436 dump_osd(i, f);
3437 }
3438 }
3439 f->close_section();
3440}
3441
3442void OSDMap::dump_osd(int id, Formatter *f) const
3443{
3444 ceph_assert(f != nullptr);
3445 if (!exists(id)) {
3446 return;
3447 }
3448
3449 f->open_object_section("osd_info");
3450 f->dump_int("osd", id);
3451 f->dump_stream("uuid") << get_uuid(id);
3452 f->dump_int("up", is_up(id));
3453 f->dump_int("in", is_in(id));
3454 f->dump_float("weight", get_weightf(id));
3455 f->dump_float("primary_affinity", get_primary_affinityf(id));
3456 get_info(id).dump(f);
3457 f->dump_object("public_addrs", get_addrs(id));
3458 f->dump_object("cluster_addrs", get_cluster_addrs(id));
3459 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
3460 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
3461 // compat
3462 f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
3463 f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
3464 f->dump_stream("heartbeat_back_addr")
3465 << get_hb_back_addrs(id).get_legacy_str();
3466 f->dump_stream("heartbeat_front_addr")
3467 << get_hb_front_addrs(id).get_legacy_str();
3468
3469 set<string> st;
3470 get_state(id, st);
3471 f->open_array_section("state");
3472 for (const auto &state : st)
3473 f->dump_string("state", state);
3474 f->close_section();
3475
3476 f->close_section();
3477}
3478
7c673cae
FG
3479void OSDMap::dump(Formatter *f) const
3480{
3481 f->dump_int("epoch", get_epoch());
3482 f->dump_stream("fsid") << get_fsid();
3483 f->dump_stream("created") << get_created();
3484 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3485 f->dump_stream("last_up_change") << last_up_change;
3486 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3487 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3488 f->dump_unsigned("flags_num", flags);
3489 f->open_array_section("flags_set");
3490 set<string> flagset;
3491 get_flag_set(&flagset);
3492 for (auto p : flagset) {
3493 f->dump_string("flag", p);
3494 }
3495 f->close_section();
31f18b77 3496 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3497 f->dump_float("full_ratio", full_ratio);
3498 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3499 f->dump_float("nearfull_ratio", nearfull_ratio);
3500 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3501 f->dump_int("pool_max", get_pool_max());
3502 f->dump_int("max_osd", get_max_osd());
31f18b77 3503 f->dump_string("require_min_compat_client",
9f95a23c 3504 ceph::to_string(require_min_compat_client));
31f18b77 3505 f->dump_string("min_compat_client",
9f95a23c 3506 ceph::to_string(get_min_compat_client()));
31f18b77 3507 f->dump_string("require_osd_release",
9f95a23c 3508 ceph::to_string(require_osd_release));
7c673cae
FG
3509
3510 f->open_array_section("pools");
3511 for (const auto &pool : pools) {
3512 std::string name("<unknown>");
3513 const auto &pni = pool_name.find(pool.first);
3514 if (pni != pool_name.end())
3515 name = pni->second;
3516 f->open_object_section("pool");
3517 f->dump_int("pool", pool.first);
3518 f->dump_string("pool_name", name);
3519 pool.second.dump(f);
3520 f->close_section();
3521 }
3522 f->close_section();
3523
9f95a23c 3524 dump_osds(f);
7c673cae
FG
3525
3526 f->open_array_section("osd_xinfo");
3527 for (int i=0; i<get_max_osd(); i++) {
3528 if (exists(i)) {
3529 f->open_object_section("xinfo");
3530 f->dump_int("osd", i);
3531 osd_xinfo[i].dump(f);
3532 f->close_section();
3533 }
3534 }
3535 f->close_section();
3536
3537 f->open_array_section("pg_upmap");
3538 for (auto& p : pg_upmap) {
3539 f->open_object_section("mapping");
3540 f->dump_stream("pgid") << p.first;
3541 f->open_array_section("osds");
3542 for (auto q : p.second) {
3543 f->dump_int("osd", q);
3544 }
3545 f->close_section();
3546 f->close_section();
3547 }
3548 f->close_section();
3549 f->open_array_section("pg_upmap_items");
3550 for (auto& p : pg_upmap_items) {
3551 f->open_object_section("mapping");
3552 f->dump_stream("pgid") << p.first;
3553 f->open_array_section("mappings");
3554 for (auto& q : p.second) {
3555 f->open_object_section("mapping");
3556 f->dump_int("from", q.first);
3557 f->dump_int("to", q.second);
3558 f->close_section();
3559 }
3560 f->close_section();
3561 f->close_section();
3562 }
3563 f->close_section();
3564 f->open_array_section("pg_temp");
31f18b77 3565 pg_temp->dump(f);
7c673cae
FG
3566 f->close_section();
3567
3568 f->open_array_section("primary_temp");
3569 for (const auto &pg : *primary_temp) {
3570 f->dump_stream("pgid") << pg.first;
3571 f->dump_int("osd", pg.second);
3572 }
3573 f->close_section(); // primary_temp
3574
3575 f->open_object_section("blacklist");
3576 for (const auto &addr : blacklist) {
3577 stringstream ss;
3578 ss << addr.first;
3579 f->dump_stream(ss.str().c_str()) << addr.second;
3580 }
3581 f->close_section();
3582
3583 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3584
3585 f->open_array_section("removed_snaps_queue");
3586 for (auto& p : removed_snaps_queue) {
3587 f->open_object_section("pool");
3588 f->dump_int("pool", p.first);
3589 f->open_array_section("snaps");
3590 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3591 f->open_object_section("interval");
3592 f->dump_unsigned("begin", q.get_start());
3593 f->dump_unsigned("length", q.get_len());
3594 f->close_section();
3595 }
3596 f->close_section();
3597 f->close_section();
3598 }
3599 f->close_section();
3600 f->open_array_section("new_removed_snaps");
3601 for (auto& p : new_removed_snaps) {
3602 f->open_object_section("pool");
3603 f->dump_int("pool", p.first);
3604 f->open_array_section("snaps");
3605 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3606 f->open_object_section("interval");
3607 f->dump_unsigned("begin", q.get_start());
3608 f->dump_unsigned("length", q.get_len());
3609 f->close_section();
3610 }
3611 f->close_section();
3612 f->close_section();
3613 }
3614 f->close_section();
3615 f->open_array_section("new_purged_snaps");
3616 for (auto& p : new_purged_snaps) {
3617 f->open_object_section("pool");
3618 f->dump_int("pool", p.first);
3619 f->open_array_section("snaps");
3620 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3621 f->open_object_section("interval");
3622 f->dump_unsigned("begin", q.get_start());
3623 f->dump_unsigned("length", q.get_len());
3624 f->close_section();
3625 }
3626 f->close_section();
3627 f->close_section();
3628 }
3629 f->close_section();
81eedcae
TL
3630 f->open_object_section("crush_node_flags");
3631 for (auto& i : crush_node_flags) {
3632 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3633 : stringify(i.first);
3634 f->open_array_section(s.c_str());
3635 set<string> st;
3636 calc_state_set(i.second, st);
3637 for (auto& j : st) {
3638 f->dump_string("flag", j);
3639 }
3640 f->close_section();
3641 }
3642 f->close_section();
3643 f->open_object_section("device_class_flags");
3644 for (auto& i : device_class_flags) {
3645 const char* class_name = crush->get_class_name(i.first);
3646 string s = class_name ? class_name : stringify(i.first);
3647 f->open_array_section(s.c_str());
3648 set<string> st;
3649 calc_state_set(i.second, st);
3650 for (auto& j : st) {
3651 f->dump_string("flag", j);
3652 }
3653 f->close_section();
3654 }
3655 f->close_section();
7c673cae
FG
3656}
3657
3658void OSDMap::generate_test_instances(list<OSDMap*>& o)
3659{
3660 o.push_back(new OSDMap);
3661
3662 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3663 o.push_back(new OSDMap);
3664 uuid_d fsid;
224ce89b 3665 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae
FG
3666 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
3667 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
3668 cct->put();
3669}
3670
3671string OSDMap::get_flag_string(unsigned f)
3672{
3673 string s;
7c673cae
FG
3674 if (f & CEPH_OSDMAP_PAUSERD)
3675 s += ",pauserd";
3676 if (f & CEPH_OSDMAP_PAUSEWR)
3677 s += ",pausewr";
3678 if (f & CEPH_OSDMAP_PAUSEREC)
3679 s += ",pauserec";
3680 if (f & CEPH_OSDMAP_NOUP)
3681 s += ",noup";
3682 if (f & CEPH_OSDMAP_NODOWN)
3683 s += ",nodown";
3684 if (f & CEPH_OSDMAP_NOOUT)
3685 s += ",noout";
3686 if (f & CEPH_OSDMAP_NOIN)
3687 s += ",noin";
3688 if (f & CEPH_OSDMAP_NOBACKFILL)
3689 s += ",nobackfill";
3690 if (f & CEPH_OSDMAP_NOREBALANCE)
3691 s += ",norebalance";
3692 if (f & CEPH_OSDMAP_NORECOVER)
3693 s += ",norecover";
3694 if (f & CEPH_OSDMAP_NOSCRUB)
3695 s += ",noscrub";
3696 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3697 s += ",nodeep-scrub";
3698 if (f & CEPH_OSDMAP_NOTIERAGENT)
3699 s += ",notieragent";
11fdf7f2
TL
3700 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3701 s += ",nosnaptrim";
7c673cae
FG
3702 if (f & CEPH_OSDMAP_SORTBITWISE)
3703 s += ",sortbitwise";
3704 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3705 s += ",require_jewel_osds";
3706 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3707 s += ",require_kraken_osds";
3708 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3709 s += ",require_luminous_osds";
c07f9fc5
FG
3710 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3711 s += ",recovery_deletes";
181888fb
FG
3712 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3713 s += ",purged_snapdirs";
f64942e4
AA
3714 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3715 s += ",pglog_hardlimit";
7c673cae
FG
3716 if (s.length())
3717 s.erase(0, 1);
3718 return s;
3719}
3720
3721string OSDMap::get_flag_string() const
3722{
3723 return get_flag_string(flags);
3724}
3725
7c673cae
FG
3726void OSDMap::print_pools(ostream& out) const
3727{
3728 for (const auto &pool : pools) {
3729 std::string name("<unknown>");
3730 const auto &pni = pool_name.find(pool.first);
3731 if (pni != pool_name.end())
3732 name = pni->second;
3733 out << "pool " << pool.first
3734 << " '" << name
3735 << "' " << pool.second << "\n";
3736
3737 for (const auto &snap : pool.second.snaps)
3738 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3739
3740 if (!pool.second.removed_snaps.empty())
3741 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
11fdf7f2
TL
3742 auto p = removed_snaps_queue.find(pool.first);
3743 if (p != removed_snaps_queue.end()) {
3744 out << "\tremoved_snaps_queue " << p->second << "\n";
3745 }
7c673cae
FG
3746 }
3747 out << std::endl;
3748}
3749
9f95a23c
TL
3750void OSDMap::print_osds(ostream& out) const
3751{
3752 for (int i=0; i<get_max_osd(); i++) {
3753 if (exists(i)) {
3754 print_osd(i, out);
3755 }
3756 }
3757}
3758void OSDMap::print_osd(int id, ostream& out) const
3759{
3760 if (!exists(id)) {
3761 return;
3762 }
3763
3764 out << "osd." << id;
3765 out << (is_up(id) ? " up ":" down");
3766 out << (is_in(id) ? " in ":" out");
3767 out << " weight " << get_weightf(id);
3768 if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
3769 out << " primary_affinity " << get_primary_affinityf(id);
3770 }
3771 const osd_info_t& info(get_info(id));
3772 out << " " << info;
3773 out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
3774 set<string> st;
3775 get_state(id, st);
3776 out << " " << st;
3777 if (!get_uuid(id).is_zero()) {
3778 out << " " << get_uuid(id);
3779 }
3780 out << "\n";
3781}
3782
7c673cae
FG
3783void OSDMap::print(ostream& out) const
3784{
3785 out << "epoch " << get_epoch() << "\n"
3786 << "fsid " << get_fsid() << "\n"
3787 << "created " << get_created() << "\n"
3788 << "modified " << get_modified() << "\n";
3789
3790 out << "flags " << get_flag_string() << "\n";
31f18b77 3791 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
3792 out << "full_ratio " << full_ratio << "\n";
3793 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3794 out << "nearfull_ratio " << nearfull_ratio << "\n";
9f95a23c 3795 if (require_min_compat_client != ceph_release_t::unknown) {
31f18b77 3796 out << "require_min_compat_client "
9f95a23c 3797 << require_min_compat_client << "\n";
7c673cae 3798 }
9f95a23c 3799 out << "min_compat_client " << get_min_compat_client()
31f18b77 3800 << "\n";
9f95a23c
TL
3801 if (require_osd_release > ceph_release_t::unknown) {
3802 out << "require_osd_release " << require_osd_release
224ce89b
WB
3803 << "\n";
3804 }
7c673cae
FG
3805 if (get_cluster_snapshot().length())
3806 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3807 out << "\n";
3808
3809 print_pools(out);
3810
3811 out << "max_osd " << get_max_osd() << "\n";
9f95a23c 3812 print_osds(out);
7c673cae
FG
3813 out << std::endl;
3814
3815 for (auto& p : pg_upmap) {
3816 out << "pg_upmap " << p.first << " " << p.second << "\n";
3817 }
3818 for (auto& p : pg_upmap_items) {
3819 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3820 }
3821
3822 for (const auto pg : *pg_temp)
3823 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3824
3825 for (const auto pg : *primary_temp)
3826 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3827
3828 for (const auto &addr : blacklist)
3829 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
3830}
3831
3832class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3833public:
3834 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
3835
3836 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3837 unsigned f)
c07f9fc5 3838 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3839
3840 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3841 if (!filter) {
3842 return true; // normal case
3843 }
3844 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3845 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3846 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3847 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3848 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3849 return true;
31f18b77 3850 }
c07f9fc5 3851 return false;
31f18b77
FG
3852 }
3853
3854 bool should_dump_empty_bucket() const override {
3855 return !filter;
3856 }
7c673cae 3857
11fdf7f2 3858 void init_table(TextTable *tbl) {
7c673cae 3859 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3860 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3861 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3862 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 3863 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 3864 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3865 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
3866 }
3867 void dump(TextTable *tbl, string& bucket) {
3868 init_table(tbl);
7c673cae 3869
11fdf7f2
TL
3870 if (!bucket.empty()) {
3871 set_root(bucket);
3872 Parent::dump(tbl);
3873 } else {
3874 Parent::dump(tbl);
3875 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3876 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3877 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3878 }
31f18b77 3879 }
7c673cae
FG
3880 }
3881 }
3882
3883protected:
3884 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
3885 const char *c = crush->get_item_class(qi.id);
3886 if (!c)
3887 c = "";
7c673cae 3888 *tbl << qi.id
224ce89b 3889 << c
7c673cae
FG
3890 << weightf_t(qi.weight);
3891
3892 ostringstream name;
3893 for (int k = 0; k < qi.depth; k++)
3894 name << " ";
3895 if (qi.is_bucket()) {
3896 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3897 << crush->get_item_name(qi.id);
3898 } else {
3899 name << "osd." << qi.id;
3900 }
3901 *tbl << name.str();
3902
3903 if (!qi.is_bucket()) {
3904 if (!osdmap->exists(qi.id)) {
3905 *tbl << "DNE"
3906 << 0;
3907 } else {
c07f9fc5
FG
3908 string s;
3909 if (osdmap->is_up(qi.id)) {
3910 s = "up";
3911 } else if (osdmap->is_destroyed(qi.id)) {
3912 s = "destroyed";
3913 } else {
3914 s = "down";
3915 }
3916 *tbl << s
7c673cae
FG
3917 << weightf_t(osdmap->get_weightf(qi.id))
3918 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3919 }
3920 }
3921 *tbl << TextTable::endrow;
3922 }
3923
3924private:
3925 const OSDMap *osdmap;
31f18b77 3926 const unsigned filter;
7c673cae
FG
3927};
3928
3929class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3930public:
3931 typedef CrushTreeDumper::FormattingDumper Parent;
3932
31f18b77
FG
3933 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3934 unsigned f)
c07f9fc5 3935 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3936
3937 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3938 if (!filter) {
3939 return true; // normal case
3940 }
3941 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3942 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3943 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3944 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3945 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3946 return true;
31f18b77 3947 }
c07f9fc5 3948 return false;
31f18b77
FG
3949 }
3950
3951 bool should_dump_empty_bucket() const override {
3952 return !filter;
3953 }
7c673cae 3954
11fdf7f2
TL
3955 void dump(Formatter *f, string& bucket) {
3956 if (!bucket.empty()) {
3957 set_root(bucket);
3958 f->open_array_section("nodes");
3959 Parent::dump(f);
3960 f->close_section();
3961 } else {
3962 f->open_array_section("nodes");
3963 Parent::dump(f);
3964 f->close_section();
3965 f->open_array_section("stray");
3966 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3967 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
3968 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3969 }
3970 f->close_section();
7c673cae 3971 }
7c673cae
FG
3972 }
3973
3974protected:
3975 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3976 Parent::dump_item_fields(qi, f);
3977 if (!qi.is_bucket())
3978 {
c07f9fc5
FG
3979 string s;
3980 if (osdmap->is_up(qi.id)) {
3981 s = "up";
3982 } else if (osdmap->is_destroyed(qi.id)) {
3983 s = "destroyed";
3984 } else {
3985 s = "down";
3986 }
7c673cae 3987 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 3988 f->dump_string("status", s);
7c673cae
FG
3989 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3990 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3991 }
3992 }
3993
3994private:
3995 const OSDMap *osdmap;
31f18b77 3996 const unsigned filter;
7c673cae
FG
3997};
3998
11fdf7f2 3999void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 4000{
31f18b77 4001 if (f) {
11fdf7f2 4002 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 4003 } else {
11fdf7f2 4004 ceph_assert(out);
7c673cae 4005 TextTable tbl;
11fdf7f2 4006 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
4007 *out << tbl;
4008 }
4009}
4010
224ce89b 4011void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 4012 const string& prefix, bool extra) const
7c673cae
FG
4013{
4014 if (f) {
7c673cae
FG
4015 f->dump_int("epoch", get_epoch());
4016 f->dump_int("num_osds", get_num_osds());
4017 f->dump_int("num_up_osds", get_num_up_osds());
9f95a23c 4018 f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
7c673cae 4019 f->dump_int("num_in_osds", get_num_in_osds());
9f95a23c 4020 f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
7c673cae 4021 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
7c673cae 4022 } else {
11fdf7f2 4023 utime_t now = ceph_clock_now();
31f18b77 4024 out << get_num_osds() << " osds: "
11fdf7f2
TL
4025 << get_num_up_osds() << " up";
4026 if (last_up_change != utime_t()) {
4027 out << " (since " << utimespan_str(now - last_up_change) << ")";
4028 }
4029 out << ", " << get_num_in_osds() << " in";
4030 if (last_in_change != utime_t()) {
4031 out << " (since " << utimespan_str(now - last_in_change) << ")";
4032 }
4033 if (extra)
4034 out << "; epoch: e" << get_epoch();
7c673cae
FG
4035 if (get_num_pg_temp())
4036 out << "; " << get_num_pg_temp() << " remapped pgs";
4037 out << "\n";
4038 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
4039 if (important_flags)
224ce89b 4040 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
4041 }
4042}
4043
4044void OSDMap::print_oneline_summary(ostream& out) const
4045{
4046 out << "e" << get_epoch() << ": "
31f18b77 4047 << get_num_osds() << " total, "
7c673cae
FG
4048 << get_num_up_osds() << " up, "
4049 << get_num_in_osds() << " in";
7c673cae
FG
4050}
4051
3efd9988 4052bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
4053{
4054 for (const auto &pool : pools) {
3efd9988 4055 if (pool.second.crush_rule == rule_id)
7c673cae
FG
4056 return true;
4057 }
4058 return false;
4059}
4060
3efd9988
FG
4061int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
4062 ostream *ss) const
4063{
4064 for (auto& i : pools) {
4065 auto& pool = i.second;
4066 int ruleno = pool.get_crush_rule();
4067 if (!newcrush->rule_exists(ruleno)) {
4068 *ss << "pool " << i.first << " references crush_rule " << ruleno
4069 << " but it is not present";
4070 return -EINVAL;
4071 }
4072 if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
4073 *ss << "rule " << ruleno << " mask ruleset does not match rule id";
4074 return -EINVAL;
4075 }
4076 if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
4077 *ss << "pool " << i.first << " type does not match rule " << ruleno;
4078 return -EINVAL;
4079 }
11fdf7f2
TL
4080 int poolsize = pool.get_size();
4081 if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
4082 poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
4083 *ss << "pool " << i.first << " size " << poolsize << " does not"
3efd9988
FG
4084 << " fall within rule " << ruleno
4085 << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
4086 << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
4087 return -EINVAL;
4088 }
4089 }
4090 return 0;
4091}
4092
224ce89b
WB
4093int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
4094 int nosd, int pg_bits, int pgp_bits,
4095 bool default_pool)
7c673cae 4096{
224ce89b
WB
4097 ldout(cct, 10) << "build_simple on " << nosd
4098 << " osds" << dendl;
7c673cae
FG
4099 epoch = e;
4100 set_fsid(fsid);
4101 created = modified = ceph_clock_now();
4102
4103 if (nosd >= 0) {
4104 set_max_osd(nosd);
4105 } else {
4106 // count osds
4107 int maxosd = 0;
11fdf7f2 4108 const auto& conf = cct->_conf;
7c673cae 4109 vector<string> sections;
11fdf7f2 4110 conf.get_all_sections(sections);
7c673cae
FG
4111
4112 for (auto &section : sections) {
4113 if (section.find("osd.") != 0)
4114 continue;
4115
4116 const char *begin = section.c_str() + 4;
4117 char *end = (char*)begin;
4118 int o = strtol(begin, &end, 10);
4119 if (*end != '\0')
4120 continue;
4121
4122 if (o > cct->_conf->mon_max_osd) {
4123 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4124 return -ERANGE;
4125 }
4126
4127 if (o > maxosd)
4128 maxosd = o;
4129 }
4130
4131 set_max_osd(maxosd + 1);
4132 }
4133
7c673cae
FG
4134
4135 stringstream ss;
4136 int r;
4137 if (nosd >= 0)
4138 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4139 else
4140 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4141 ceph_assert(r == 0);
7c673cae
FG
4142
4143 int poolbase = get_max_osd() ? get_max_osd() : 1;
4144
d2e6a577 4145 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
11fdf7f2 4146 ceph_assert(default_replicated_rule >= 0);
7c673cae 4147
224ce89b
WB
4148 if (default_pool) {
4149 // pgp_num <= pg_num
4150 if (pgp_bits > pg_bits)
4151 pgp_bits = pg_bits;
4152
4153 vector<string> pool_names;
4154 pool_names.push_back("rbd");
4155 for (auto &plname : pool_names) {
4156 int64_t pool = ++pool_max;
4157 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4158 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4159 if (cct->_conf->osd_pool_default_flag_hashpspool)
4160 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4161 if (cct->_conf->osd_pool_default_flag_nodelete)
4162 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4163 if (cct->_conf->osd_pool_default_flag_nopgchange)
4164 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4165 if (cct->_conf->osd_pool_default_flag_nosizechange)
4166 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
11fdf7f2
TL
4167 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4168 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4169 pools[pool].size);
224ce89b
WB
4170 pools[pool].crush_rule = default_replicated_rule;
4171 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4172 pools[pool].set_pg_num(poolbase << pg_bits);
4173 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4174 pools[pool].set_pg_num_target(poolbase << pg_bits);
4175 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4176 pools[pool].last_change = epoch;
c07f9fc5
FG
4177 pools[pool].application_metadata.insert(
4178 {pg_pool_t::APPLICATION_NAME_RBD, {}});
9f95a23c
TL
4179 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4180 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4181 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
4182 pools[pool].pg_autoscale_mode = m;
4183 } else {
4184 pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
4185 }
224ce89b
WB
4186 pool_name[pool] = plname;
4187 name_pool[plname] = pool;
4188 }
7c673cae
FG
4189 }
4190
4191 for (int i=0; i<get_max_osd(); i++) {
4192 set_state(i, 0);
4193 set_weight(i, CEPH_OSD_OUT);
4194 }
4195
4196 map<string,string> profile_map;
4197 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4198 if (r < 0) {
4199 lderr(cct) << ss.str() << dendl;
4200 return r;
4201 }
4202 set_erasure_code_profile("default", profile_map);
4203 return 0;
4204}
4205
4206int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4207 map<string,string> &profile_map,
4208 ostream *ss)
4209{
11fdf7f2 4210 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4211 *ss,
4212 &profile_map);
4213 return r;
4214}
4215
4216int OSDMap::_build_crush_types(CrushWrapper& crush)
4217{
4218 crush.set_type_name(0, "osd");
4219 crush.set_type_name(1, "host");
4220 crush.set_type_name(2, "chassis");
4221 crush.set_type_name(3, "rack");
4222 crush.set_type_name(4, "row");
4223 crush.set_type_name(5, "pdu");
4224 crush.set_type_name(6, "pod");
4225 crush.set_type_name(7, "room");
4226 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4227 crush.set_type_name(9, "zone");
4228 crush.set_type_name(10, "region");
4229 crush.set_type_name(11, "root");
4230 return 11;
7c673cae
FG
4231}
4232
4233int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4234 int nosd, ostream *ss)
4235{
4236 crush.create();
4237
4238 // root
4239 int root_type = _build_crush_types(crush);
4240 int rootid;
4241 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4242 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4243 ceph_assert(r == 0);
7c673cae
FG
4244 crush.set_item_name(rootid, "default");
4245
4246 for (int o=0; o<nosd; o++) {
4247 map<string,string> loc;
4248 loc["host"] = "localhost";
4249 loc["rack"] = "localrack";
4250 loc["root"] = "default";
4251 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4252 char name[32];
4253 snprintf(name, sizeof(name), "osd.%d", o);
4254 crush.insert_item(cct, o, 1.0, name, loc);
4255 }
4256
31f18b77 4257 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4258
4259 crush.finalize();
4260
4261 return 0;
4262}
4263
4264int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4265 CrushWrapper& crush,
4266 ostream *ss)
4267{
11fdf7f2 4268 const auto& conf = cct->_conf;
7c673cae
FG
4269
4270 crush.create();
4271
4272 // root
4273 int root_type = _build_crush_types(crush);
4274 int rootid;
4275 int r = crush.add_bucket(0, 0,
4276 CRUSH_HASH_DEFAULT,
4277 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4278 ceph_assert(r == 0);
7c673cae
FG
4279 crush.set_item_name(rootid, "default");
4280
4281 // add osds
4282 vector<string> sections;
11fdf7f2 4283 conf.get_all_sections(sections);
7c673cae
FG
4284
4285 for (auto &section : sections) {
4286 if (section.find("osd.") != 0)
4287 continue;
4288
4289 const char *begin = section.c_str() + 4;
4290 char *end = (char*)begin;
4291 int o = strtol(begin, &end, 10);
4292 if (*end != '\0')
4293 continue;
4294
4295 string host, rack, row, room, dc, pool;
4296 vector<string> sectiontmp;
4297 sectiontmp.push_back("osd");
4298 sectiontmp.push_back(section);
11fdf7f2
TL
4299 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4300 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4301 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4302 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4303 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4304 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4305
4306 if (host.length() == 0)
4307 host = "unknownhost";
4308 if (rack.length() == 0)
4309 rack = "unknownrack";
4310
4311 map<string,string> loc;
4312 loc["host"] = host;
4313 loc["rack"] = rack;
4314 if (row.size())
4315 loc["row"] = row;
4316 if (room.size())
4317 loc["room"] = room;
4318 if (dc.size())
4319 loc["datacenter"] = dc;
4320 loc["root"] = "default";
4321
4322 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4323 crush.insert_item(cct, o, 1.0, section, loc);
4324 }
4325
31f18b77 4326 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4327
4328 crush.finalize();
4329
4330 return 0;
4331}
4332
4333
31f18b77
FG
4334int OSDMap::build_simple_crush_rules(
4335 CephContext *cct,
4336 CrushWrapper& crush,
4337 const string& root,
4338 ostream *ss)
7c673cae 4339{
31f18b77 4340 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
7c673cae
FG
4341 string failure_domain =
4342 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4343
7c673cae 4344 int r;
31f18b77 4345 r = crush.add_simple_rule_at(
224ce89b 4346 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4347 "firstn", pg_pool_t::TYPE_REPLICATED,
4348 crush_rule, ss);
7c673cae
FG
4349 if (r < 0)
4350 return r;
4351 // do not add an erasure rule by default or else we will implicitly
4352 // require the crush_v2 feature of clients
4353 return 0;
4354}
4355
4356int OSDMap::summarize_mapping_stats(
4357 OSDMap *newmap,
4358 const set<int64_t> *pools,
4359 std::string *out,
4360 Formatter *f) const
4361{
4362 set<int64_t> ls;
4363 if (pools) {
4364 ls = *pools;
4365 } else {
4366 for (auto &p : get_pools())
4367 ls.insert(p.first);
4368 }
4369
4370 unsigned total_pg = 0;
4371 unsigned moved_pg = 0;
4372 vector<unsigned> base_by_osd(get_max_osd(), 0);
4373 vector<unsigned> new_by_osd(get_max_osd(), 0);
4374 for (int64_t pool_id : ls) {
4375 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4376 vector<int> up, up2;
4377 int up_primary;
7c673cae 4378 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4379 pg_t pgid(ps, pool_id);
7c673cae 4380 total_pg += pi->get_size();
31f18b77 4381 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4382 for (int osd : up) {
4383 if (osd >= 0 && osd < get_max_osd())
4384 ++base_by_osd[osd];
4385 }
4386 if (newmap) {
31f18b77 4387 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4388 for (int osd : up2) {
4389 if (osd >= 0 && osd < get_max_osd())
4390 ++new_by_osd[osd];
4391 }
4392 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4393 for (unsigned i=0; i<up.size(); ++i) {
4394 if (up[i] != up2[i]) {
4395 ++moved_pg;
4396 }
4397 }
4398 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4399 for (int osd : up) {
4400 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4401 ++moved_pg;
4402 }
4403 }
4404 } else {
11fdf7f2 4405 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4406 }
4407 }
4408 }
4409 }
4410
4411 unsigned num_up_in = 0;
4412 for (int osd = 0; osd < get_max_osd(); ++osd) {
4413 if (is_up(osd) && is_in(osd))
4414 ++num_up_in;
4415 }
4416 if (!num_up_in) {
4417 return -EINVAL;
4418 }
4419
4420 float avg_pg = (float)total_pg / (float)num_up_in;
4421 float base_stddev = 0, new_stddev = 0;
4422 int min = -1, max = -1;
4423 unsigned min_base_pg = 0, max_base_pg = 0;
4424 unsigned min_new_pg = 0, max_new_pg = 0;
4425 for (int osd = 0; osd < get_max_osd(); ++osd) {
4426 if (is_up(osd) && is_in(osd)) {
4427 float base_diff = (float)base_by_osd[osd] - avg_pg;
4428 base_stddev += base_diff * base_diff;
4429 float new_diff = (float)new_by_osd[osd] - avg_pg;
4430 new_stddev += new_diff * new_diff;
4431 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4432 min = osd;
4433 min_base_pg = base_by_osd[osd];
4434 min_new_pg = new_by_osd[osd];
4435 }
4436 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4437 max = osd;
4438 max_base_pg = base_by_osd[osd];
4439 max_new_pg = new_by_osd[osd];
4440 }
4441 }
4442 }
4443 base_stddev = sqrt(base_stddev / num_up_in);
4444 new_stddev = sqrt(new_stddev / num_up_in);
4445
4446 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4447
4448 ostringstream ss;
4449 if (f)
4450 f->open_object_section("utilization");
4451 if (newmap) {
4452 if (f) {
4453 f->dump_unsigned("moved_pgs", moved_pg);
4454 f->dump_unsigned("total_pgs", total_pg);
4455 } else {
4456 float percent = 0;
4457 if (total_pg)
4458 percent = (float)moved_pg * 100.0 / (float)total_pg;
4459 ss << "moved " << moved_pg << " / " << total_pg
4460 << " (" << percent << "%)\n";
4461 }
4462 }
4463 if (f) {
4464 f->dump_float("avg_pgs", avg_pg);
4465 f->dump_float("std_dev", base_stddev);
4466 f->dump_float("expected_baseline_std_dev", edev);
4467 if (newmap)
4468 f->dump_float("new_std_dev", new_stddev);
4469 } else {
4470 ss << "avg " << avg_pg << "\n";
4471 ss << "stddev " << base_stddev;
4472 if (newmap)
4473 ss << " -> " << new_stddev;
4474 ss << " (expected baseline " << edev << ")\n";
4475 }
4476 if (min >= 0) {
4477 if (f) {
4478 f->dump_unsigned("min_osd", min);
4479 f->dump_unsigned("min_osd_pgs", min_base_pg);
4480 if (newmap)
4481 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4482 } else {
4483 ss << "min osd." << min << " with " << min_base_pg;
4484 if (newmap)
4485 ss << " -> " << min_new_pg;
4486 ss << " pgs (" << (float)min_base_pg / avg_pg;
4487 if (newmap)
4488 ss << " -> " << (float)min_new_pg / avg_pg;
4489 ss << " * mean)\n";
4490 }
4491 }
4492 if (max >= 0) {
4493 if (f) {
4494 f->dump_unsigned("max_osd", max);
4495 f->dump_unsigned("max_osd_pgs", max_base_pg);
4496 if (newmap)
4497 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4498 } else {
4499 ss << "max osd." << max << " with " << max_base_pg;
4500 if (newmap)
4501 ss << " -> " << max_new_pg;
4502 ss << " pgs (" << (float)max_base_pg / avg_pg;
4503 if (newmap)
4504 ss << " -> " << (float)max_new_pg / avg_pg;
4505 ss << " * mean)\n";
4506 }
4507 }
4508 if (f)
4509 f->close_section();
4510 if (out)
4511 *out = ss.str();
4512 return 0;
4513}
4514
7c673cae
FG
4515bool OSDMap::try_pg_upmap(
4516 CephContext *cct,
4517 pg_t pg, ///< pg to potentially remap
4518 const set<int>& overfull, ///< osds we'd want to evacuate
4519 const vector<int>& underfull, ///< osds to move to, in order of preference
92f5a8d4 4520 const vector<int>& more_underfull, ///< more osds only slightly underfull
7c673cae
FG
4521 vector<int> *orig,
4522 vector<int> *out) ///< resulting alternative mapping
4523{
4524 const pg_pool_t *pool = get_pg_pool(pg.pool());
4525 if (!pool)
4526 return false;
31f18b77 4527 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
7c673cae
FG
4528 pool->get_size());
4529 if (rule < 0)
4530 return false;
4531
7c673cae
FG
4532 // make sure there is something there to remap
4533 bool any = false;
4534 for (auto osd : *orig) {
4535 if (overfull.count(osd)) {
4536 any = true;
4537 break;
4538 }
4539 }
4540 if (!any) {
4541 return false;
4542 }
4543
4544 int r = crush->try_remap_rule(
4545 cct,
4546 rule,
4547 pool->get_size(),
4548 overfull, underfull,
92f5a8d4 4549 more_underfull,
7c673cae
FG
4550 *orig,
4551 out);
4552 if (r < 0)
4553 return false;
4554 if (*out == *orig)
4555 return false;
4556 return true;
4557}
4558
4559int OSDMap::calc_pg_upmaps(
4560 CephContext *cct,
92f5a8d4 4561 uint32_t max_deviation,
7c673cae 4562 int max,
a8e16298 4563 const set<int64_t>& only_pools,
7c673cae
FG
4564 OSDMap::Incremental *pending_inc)
4565{
a8e16298 4566 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
7c673cae 4567 OSDMap tmp;
92f5a8d4
TL
4568 // Can't be less than 1 pg
4569 if (max_deviation < 1)
4570 max_deviation = 1;
7c673cae
FG
4571 tmp.deepish_copy_from(*this);
4572 int num_changed = 0;
a8e16298
TL
4573 map<int,set<pg_t>> pgs_by_osd;
4574 int total_pgs = 0;
4575 float osd_weight_total = 0;
4576 map<int,float> osd_weight;
4577 for (auto& i : pools) {
4578 if (!only_pools.empty() && !only_pools.count(i.first))
4579 continue;
4580 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
4581 pg_t pg(ps, i.first);
4582 vector<int> up;
4583 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
4584 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
4585 for (auto osd : up) {
4586 if (osd != CRUSH_ITEM_NONE)
4587 pgs_by_osd[osd].insert(pg);
7c673cae 4588 }
a8e16298
TL
4589 }
4590 total_pgs += i.second.get_size() * i.second.get_pg_num();
4591
4592 map<int,float> pmap;
4593 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
4594 i.second.get_type(),
4595 i.second.get_size());
4596 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
4597 ldout(cct,20) << __func__ << " pool " << i.first
4598 << " ruleno " << ruleno
4599 << " weight-map " << pmap
4600 << dendl;
4601 for (auto p : pmap) {
4602 auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
4603 if (adjusted_weight == 0) {
4604 continue;
31f18b77 4605 }
a8e16298
TL
4606 osd_weight[p.first] += adjusted_weight;
4607 osd_weight_total += adjusted_weight;
7c673cae 4608 }
a8e16298
TL
4609 }
4610 for (auto& i : osd_weight) {
4611 int pgs = 0;
4612 auto p = pgs_by_osd.find(i.first);
4613 if (p != pgs_by_osd.end())
31f18b77 4614 pgs = p->second.size();
a8e16298 4615 else
31f18b77 4616 pgs_by_osd.emplace(i.first, set<pg_t>());
a8e16298 4617 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
31f18b77 4618 << " pgs " << pgs << dendl;
a8e16298
TL
4619 }
4620 if (osd_weight_total == 0) {
4621 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4622 return 0;
4623 }
4624 float pgs_per_weight = total_pgs / osd_weight_total;
4625 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4626 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4627
a8e16298
TL
4628 if (max <= 0) {
4629 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4630 return 0;
4631 }
a8e16298
TL
4632 float stddev = 0;
4633 map<int,float> osd_deviation; // osd, deviation(pgs)
4634 multimap<float,int> deviation_osd; // deviation(pgs), osd
92f5a8d4 4635 float cur_max_deviation = 0;
a8e16298
TL
4636 for (auto& i : pgs_by_osd) {
4637 // make sure osd is still there (belongs to this crush-tree)
4638 ceph_assert(osd_weight.count(i.first));
4639 float target = osd_weight[i.first] * pgs_per_weight;
4640 float deviation = (float)i.second.size() - target;
4641 ldout(cct, 20) << " osd." << i.first
4642 << "\tpgs " << i.second.size()
4643 << "\ttarget " << target
4644 << "\tdeviation " << deviation
4645 << dendl;
4646 osd_deviation[i.first] = deviation;
4647 deviation_osd.insert(make_pair(deviation, i.first));
4648 stddev += deviation * deviation;
92f5a8d4
TL
4649 if (fabsf(deviation) > cur_max_deviation)
4650 cur_max_deviation = fabsf(deviation);
a8e16298 4651 }
92f5a8d4
TL
4652 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4653 if (cur_max_deviation <= max_deviation) {
a8e16298
TL
4654 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4655 << dendl;
4656 return 0;
4657 }
4658 bool skip_overfull = false;
4659 auto aggressive =
11fdf7f2 4660 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
a8e16298 4661 auto local_fallback_retries =
11fdf7f2 4662 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
a8e16298 4663 while (max--) {
92f5a8d4 4664 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
a8e16298
TL
4665 // build overfull and underfull
4666 set<int> overfull;
92f5a8d4
TL
4667 set<int> more_overfull;
4668 bool using_more_overfull = false;
a8e16298 4669 vector<int> underfull;
92f5a8d4
TL
4670 vector<int> more_underfull;
4671 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
4672 ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
4673 if (i->first <= 0)
4674 break;
4675 if (i->first > max_deviation) {
4676 ldout(cct, 30) << " add overfull osd." << i->second << dendl;
a8e16298 4677 overfull.insert(i->second);
92f5a8d4
TL
4678 } else {
4679 more_overfull.insert(i->second);
4680 }
a8e16298 4681 }
7c673cae 4682
92f5a8d4
TL
4683 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
4684 ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
4685 if (i->first >= 0)
a8e16298 4686 break;
92f5a8d4
TL
4687 if (i->first < -(int)max_deviation) {
4688 ldout(cct, 30) << " add underfull osd." << i->second << dendl;
4689 underfull.push_back(i->second);
4690 } else {
4691 more_underfull.push_back(i->second);
4692 }
7c673cae 4693 }
92f5a8d4
TL
4694 if (underfull.empty() && overfull.empty()) {
4695 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
7c673cae 4696 break;
a8e16298 4697 }
92f5a8d4
TL
4698 if (overfull.empty() && !underfull.empty()) {
4699 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
4700 overfull = more_overfull;
4701 using_more_overfull = true;
4702 }
7c673cae 4703
a8e16298
TL
4704 ldout(cct, 10) << " overfull " << overfull
4705 << " underfull " << underfull
4706 << dendl;
4707 set<pg_t> to_skip;
4708 uint64_t local_fallback_retried = 0;
4709
4710 retry:
4711
4712 set<pg_t> to_unmap;
4713 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4714 auto temp_pgs_by_osd = pgs_by_osd;
4715 // always start with fullest, break if we find any changes to make
7c673cae 4716 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
92f5a8d4 4717 if (skip_overfull && !underfull.empty()) {
a8e16298
TL
4718 ldout(cct, 10) << " skipping overfull " << dendl;
4719 break; // fall through to check underfull
4720 }
7c673cae 4721 int osd = p->second;
31f18b77 4722 float deviation = p->first;
9f95a23c
TL
4723 if (deviation < 0) {
4724 ldout(cct, 10) << " hitting underfull osds now"
4725 << " when trying to remap overfull osds"
4726 << dendl;
4727 break;
4728 }
7c673cae 4729 float target = osd_weight[osd] * pgs_per_weight;
92f5a8d4
TL
4730 ldout(cct, 10) << " Overfull search osd." << osd
4731 << " target " << target
4732 << " deviation " << deviation
4733 << dendl;
a8e16298 4734 ceph_assert(target > 0);
92f5a8d4 4735 if (!using_more_overfull && deviation <= max_deviation) {
7c673cae 4736 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4737 << " target " << target
4738 << " deviation " << deviation
92f5a8d4 4739 << " < max deviation " << max_deviation
a8e16298 4740 << dendl;
7c673cae
FG
4741 break;
4742 }
7c673cae 4743
a8e16298
TL
4744 vector<pg_t> pgs;
4745 pgs.reserve(pgs_by_osd[osd].size());
4746 for (auto& pg : pgs_by_osd[osd]) {
4747 if (to_skip.count(pg))
4748 continue;
4749 pgs.push_back(pg);
4750 }
4751 if (aggressive) {
4752 // shuffle PG list so they all get equal (in)attention
4753 std::random_device rd;
4754 std::default_random_engine rng{rd()};
4755 std::shuffle(pgs.begin(), pgs.end(), rng);
4756 }
7c673cae
FG
4757 // look for remaps we can un-remap
4758 for (auto pg : pgs) {
4759 auto p = tmp.pg_upmap_items.find(pg);
a8e16298
TL
4760 if (p == tmp.pg_upmap_items.end())
4761 continue;
4762 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4763 for (auto q : p->second) {
4764 if (q.second == osd) {
4765 ldout(cct, 10) << " will try dropping existing"
4766 << " remapping pair "
4767 << q.first << " -> " << q.second
4768 << " which remapped " << pg
4769 << " into overfull osd." << osd
4770 << dendl;
4771 temp_pgs_by_osd[q.second].erase(pg);
4772 temp_pgs_by_osd[q.first].insert(pg);
4773 } else {
4774 new_upmap_items.push_back(q);
4775 }
4776 }
4777 if (new_upmap_items.empty()) {
4778 // drop whole item
4779 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4780 << " remapped " << pg << " into overfull osd." << osd
4781 << ", will try cancelling it entirely"
4782 << dendl;
4783 to_unmap.insert(pg);
4784 goto test_change;
4785 } else if (new_upmap_items.size() != p->second.size()) {
4786 // drop single remapping pair, updating
4787 ceph_assert(new_upmap_items.size() < p->second.size());
4788 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4789 << " remapped " << pg << " into overfull osd." << osd
4790 << ", new_pg_upmap_items now " << new_upmap_items
4791 << dendl;
4792 to_upmap[pg] = new_upmap_items;
4793 goto test_change;
4794 }
4795 }
7c673cae 4796
a8e16298 4797 // try upmap
7c673cae 4798 for (auto pg : pgs) {
a8e16298
TL
4799 auto temp_it = tmp.pg_upmap.find(pg);
4800 if (temp_it != tmp.pg_upmap.end()) {
4801 // leave pg_upmap alone
4802 // it must be specified by admin since balancer does not
4803 // support pg_upmap yet
4804 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4805 << temp_it->second << ", skipping"
4806 << dendl;
7c673cae
FG
4807 continue;
4808 }
a8e16298
TL
4809 auto pg_pool_size = tmp.get_pg_pool_size(pg);
4810 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4811 set<int> existing;
4812 auto it = tmp.pg_upmap_items.find(pg);
4813 if (it != tmp.pg_upmap_items.end() &&
4814 it->second.size() >= (size_t)pg_pool_size) {
4815 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4816 << it->second << ", skipping"
4817 << dendl;
4818 continue;
4819 } else if (it != tmp.pg_upmap_items.end()) {
4820 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4821 << it->second
4822 << dendl;
4823 new_upmap_items = it->second;
4824 // build existing too (for dedup)
4825 for (auto i : it->second) {
4826 existing.insert(i.first);
4827 existing.insert(i.second);
4828 }
4829 // fall through
4830 // to see if we can append more remapping pairs
4831 }
4832 ldout(cct, 10) << " trying " << pg << dendl;
494da23a
TL
4833 vector<int> raw, orig, out;
4834 tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
92f5a8d4 4835 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
7c673cae
FG
4836 continue;
4837 }
a8e16298 4838 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4839 if (orig.size() != out.size()) {
4840 continue;
4841 }
a8e16298 4842 ceph_assert(orig != out);
92f5a8d4
TL
4843 int pos = -1;
4844 float max_dev = 0;
7c673cae 4845 for (unsigned i = 0; i < out.size(); ++i) {
a8e16298
TL
4846 if (orig[i] == out[i])
4847 continue; // skip invalid remappings
4848 if (existing.count(orig[i]) || existing.count(out[i]))
4849 continue; // we want new remappings only!
92f5a8d4
TL
4850 if (osd_deviation[orig[i]] > max_dev) {
4851 max_dev = osd_deviation[orig[i]];
4852 pos = i;
4853 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
4854 }
4855 }
4856 if (pos != -1) {
4857 int i = pos;
a8e16298
TL
4858 ldout(cct, 10) << " will try adding new remapping pair "
4859 << orig[i] << " -> " << out[i] << " for " << pg
92f5a8d4 4860 << (orig[i] != osd ? " NOT selected osd" : "")
a8e16298
TL
4861 << dendl;
4862 existing.insert(orig[i]);
4863 existing.insert(out[i]);
4864 temp_pgs_by_osd[orig[i]].erase(pg);
4865 temp_pgs_by_osd[out[i]].insert(pg);
4866 ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
4867 new_upmap_items.push_back(make_pair(orig[i], out[i]));
4868 // append new remapping pairs slowly
4869 // This way we can make sure that each tiny change will
4870 // definitely make distribution of PGs converging to
4871 // the perfect status.
4872 to_upmap[pg] = new_upmap_items;
4873 goto test_change;
7c673cae 4874 }
a8e16298
TL
4875 }
4876 }
7c673cae 4877
a8e16298
TL
4878 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4879 ldout(cct, 10) << " failed to find any changes for overfull osds"
4880 << dendl;
4881 for (auto& p : deviation_osd) {
4882 if (std::find(underfull.begin(), underfull.end(), p.second) ==
4883 underfull.end())
4884 break;
4885 int osd = p.second;
4886 float deviation = p.first;
4887 float target = osd_weight[osd] * pgs_per_weight;
4888 ceph_assert(target > 0);
92f5a8d4
TL
4889 if (fabsf(deviation) < max_deviation) {
4890 // respect max_deviation too
a8e16298
TL
4891 ldout(cct, 10) << " osd." << osd
4892 << " target " << target
4893 << " deviation " << deviation
92f5a8d4
TL
4894 << " -> absolute " << fabsf(deviation)
4895 << " < max " << max_deviation
a8e16298
TL
4896 << dendl;
4897 break;
4898 }
4899 // look for remaps we can un-remap
4900 vector<pair<pg_t,
4901 mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
4902 candidates.reserve(tmp.pg_upmap_items.size());
4903 for (auto& i : tmp.pg_upmap_items) {
4904 if (to_skip.count(i.first))
4905 continue;
4906 if (!only_pools.empty() && !only_pools.count(i.first.pool()))
4907 continue;
4908 candidates.push_back(make_pair(i.first, i.second));
4909 }
4910 if (aggressive) {
4911 // shuffle candidates so they all get equal (in)attention
4912 std::random_device rd;
4913 std::default_random_engine rng{rd()};
4914 std::shuffle(candidates.begin(), candidates.end(), rng);
4915 }
4916 for (auto& i : candidates) {
4917 auto pg = i.first;
4918 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4919 for (auto& j : i.second) {
4920 if (j.first == osd) {
4921 ldout(cct, 10) << " will try dropping existing"
4922 << " remapping pair "
4923 << j.first << " -> " << j.second
4924 << " which remapped " << pg
4925 << " out from underfull osd." << osd
4926 << dendl;
4927 temp_pgs_by_osd[j.second].erase(pg);
4928 temp_pgs_by_osd[j.first].insert(pg);
4929 } else {
4930 new_upmap_items.push_back(j);
4931 }
4932 }
4933 if (new_upmap_items.empty()) {
4934 // drop whole item
4935 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4936 << " remapped " << pg
4937 << " out from underfull osd." << osd
4938 << ", will try cancelling it entirely"
4939 << dendl;
4940 to_unmap.insert(pg);
4941 goto test_change;
4942 } else if (new_upmap_items.size() != i.second.size()) {
4943 // drop single remapping pair, updating
4944 ceph_assert(new_upmap_items.size() < i.second.size());
4945 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4946 << " remapped " << pg
4947 << " out from underfull osd." << osd
4948 << ", new_pg_upmap_items now " << new_upmap_items
4949 << dendl;
4950 to_upmap[pg] = new_upmap_items;
4951 goto test_change;
4952 }
4953 }
7c673cae 4954 }
a8e16298
TL
4955
4956 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4957 ldout(cct, 10) << " failed to find any changes for underfull osds"
4958 << dendl;
4959 if (!aggressive) {
4960 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
4961 break;
4962 } else if (!skip_overfull) {
4963 // safe to quit because below here we know
4964 // we've done checking both overfull and underfull osds..
4965 ldout(cct, 10) << " break due to not being able to find any"
4966 << " further optimizations"
4967 << dendl;
7c673cae
FG
4968 break;
4969 }
a8e16298
TL
4970 // restart with fullest and do exhaustive searching
4971 skip_overfull = false;
4972 continue;
4973
4974 test_change:
4975
4976 // test change, apply if change is good
4977 ceph_assert(to_unmap.size() || to_upmap.size());
4978 float new_stddev = 0;
4979 map<int,float> temp_osd_deviation;
4980 multimap<float,int> temp_deviation_osd;
92f5a8d4 4981 float cur_max_deviation = 0;
a8e16298
TL
4982 for (auto& i : temp_pgs_by_osd) {
4983 // make sure osd is still there (belongs to this crush-tree)
4984 ceph_assert(osd_weight.count(i.first));
4985 float target = osd_weight[i.first] * pgs_per_weight;
4986 float deviation = (float)i.second.size() - target;
4987 ldout(cct, 20) << " osd." << i.first
4988 << "\tpgs " << i.second.size()
4989 << "\ttarget " << target
4990 << "\tdeviation " << deviation
4991 << dendl;
4992 temp_osd_deviation[i.first] = deviation;
4993 temp_deviation_osd.insert(make_pair(deviation, i.first));
92f5a8d4
TL
4994 new_stddev += deviation * deviation;
4995 if (fabsf(deviation) > cur_max_deviation)
4996 cur_max_deviation = fabsf(deviation);
a8e16298
TL
4997 }
4998 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
4999 if (new_stddev >= stddev) {
5000 if (!aggressive) {
5001 ldout(cct, 10) << " break because stddev is not decreasing"
5002 << " and aggressive mode is not enabled"
5003 << dendl;
5004 break;
5005 }
5006 local_fallback_retried++;
5007 if (local_fallback_retried >= local_fallback_retries) {
5008 // does not make progress
5009 // flip *skip_overfull* so both overfull and underfull
5010 // get equal (in)attention
5011 skip_overfull = !skip_overfull;
5012 ldout(cct, 10) << " hit local_fallback_retries "
5013 << local_fallback_retries
5014 << dendl;
5015 continue;
5016 }
5017 for (auto& i : to_unmap)
5018 to_skip.insert(i);
5019 for (auto& i : to_upmap)
5020 to_skip.insert(i.first);
5021 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
5022 << " to_skip " << to_skip
5023 << dendl;
5024 goto retry;
5025 }
5026
5027 // ready to go
5028 ceph_assert(new_stddev < stddev);
5029 stddev = new_stddev;
5030 pgs_by_osd = temp_pgs_by_osd;
5031 osd_deviation = temp_osd_deviation;
5032 deviation_osd = temp_deviation_osd;
5033 for (auto& i : to_unmap) {
5034 ldout(cct, 10) << " unmap pg " << i << dendl;
5035 ceph_assert(tmp.pg_upmap_items.count(i));
5036 tmp.pg_upmap_items.erase(i);
5037 pending_inc->old_pg_upmap_items.insert(i);
5038 ++num_changed;
5039 }
5040 for (auto& i : to_upmap) {
5041 ldout(cct, 10) << " upmap pg " << i.first
5042 << " new pg_upmap_items " << i.second
5043 << dendl;
5044 tmp.pg_upmap_items[i.first] = i.second;
5045 pending_inc->new_pg_upmap_items[i.first] = i.second;
5046 ++num_changed;
5047 }
92f5a8d4
TL
5048 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
5049 if (cur_max_deviation <= max_deviation) {
5050 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
5051 << dendl;
5052 break;
5053 }
7c673cae 5054 }
a8e16298 5055 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
5056 return num_changed;
5057}
31f18b77
FG
5058
5059int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
5060{
5061 return crush->get_leaves(name, osds);
5062}
5063
3efd9988
FG
5064// get pools whose crush rules might reference the given osd
5065void OSDMap::get_pool_ids_by_osd(CephContext *cct,
5066 int osd,
5067 set<int64_t> *pool_ids) const
5068{
11fdf7f2 5069 ceph_assert(pool_ids);
3efd9988
FG
5070 set<int> raw_rules;
5071 int r = crush->get_rules_by_osd(osd, &raw_rules);
5072 if (r < 0) {
5073 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
5074 << dendl;
11fdf7f2 5075 ceph_assert(r >= 0);
3efd9988
FG
5076 }
5077 set<int> rules;
5078 for (auto &i: raw_rules) {
5079 // exclude any dead rule
5080 if (crush_rule_in_use(i)) {
5081 rules.insert(i);
5082 }
5083 }
5084 for (auto &r: rules) {
5085 get_pool_ids_by_rule(r, pool_ids);
5086 }
5087}
5088
31f18b77
FG
5089template <typename F>
5090class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
5091public:
5092 typedef CrushTreeDumper::Dumper<F> Parent;
5093
5094 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2 5095 const PGMap& pgmap_, bool tree_,
9f95a23c 5096 const string& filter) :
c07f9fc5 5097 Parent(crush, osdmap_->get_pool_names()),
31f18b77 5098 osdmap(osdmap_),
11fdf7f2 5099 pgmap(pgmap_),
31f18b77 5100 tree(tree_),
31f18b77
FG
5101 min_var(-1),
5102 max_var(-1),
5103 stddev(0),
5104 sum(0) {
9f95a23c
TL
5105 if (osdmap->crush->name_exists(filter)) {
5106 // filter by crush node
5107 auto item_id = osdmap->crush->get_item_id(filter);
11fdf7f2
TL
5108 allowed.insert(item_id);
5109 osdmap->crush->get_all_children(item_id, &allowed);
9f95a23c
TL
5110 } else if (osdmap->crush->class_exists(filter)) {
5111 // filter by device class
5112 class_id = osdmap->crush->get_class_id(filter);
5113 } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
5114 pool_id >= 0) {
5115 // filter by pool
5116 auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
5117 set<int> roots;
5118 osdmap->crush->find_takes_by_rule(crush_rule, &roots);
5119 allowed = roots;
5120 for (auto r : roots)
5121 osdmap->crush->get_all_children(r, &allowed);
11fdf7f2
TL
5122 }
5123 average_util = average_utilization();
31f18b77
FG
5124 }
5125
5126protected:
11fdf7f2
TL
5127
5128 bool should_dump(int id) const {
5129 if (!allowed.empty() && !allowed.count(id)) // filter by name
5130 return false;
9f95a23c
TL
5131 if (id >= 0 && class_id >= 0) {
5132 auto item_class_id = osdmap->crush->get_item_class_id(id);
5133 if (item_class_id < 0 || // not bound to a class yet
5134 item_class_id != class_id) // or already bound to a different class
11fdf7f2
TL
5135 return false;
5136 }
5137 return true;
5138 }
5139
5140 set<int> get_dumped_osds() {
9f95a23c 5141 if (allowed.empty() && class_id < 0) {
11fdf7f2
TL
5142 // old way, all
5143 return {};
5144 }
5145 return dumped_osds;
5146 }
5147
31f18b77
FG
5148 void dump_stray(F *f) {
5149 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5150 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 5151 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
5152 }
5153 }
5154
5155 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
5156 if (!tree && qi.is_bucket())
5157 return;
11fdf7f2
TL
5158 if (!should_dump(qi.id))
5159 return;
31f18b77 5160
11fdf7f2
TL
5161 if (!qi.is_bucket())
5162 dumped_osds.insert(qi.id);
31f18b77 5163 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
5164 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5165 kb_used_meta = 0, kb_avail = 0;
31f18b77 5166 double util = 0;
11fdf7f2
TL
5167 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5168 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
5169 if (kb_used && kb)
5170 util = 100.0 * (double)kb_used / (double)kb;
5171
5172 double var = 1.0;
5173 if (average_util)
5174 var = util / average_util;
5175
11fdf7f2 5176 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 5177
11fdf7f2
TL
5178 dump_item(qi, reweight, kb, kb_used,
5179 kb_used_data, kb_used_omap, kb_used_meta,
5180 kb_avail, util, var, num_pgs, f);
31f18b77
FG
5181
5182 if (!qi.is_bucket() && reweight > 0) {
5183 if (min_var < 0 || var < min_var)
5184 min_var = var;
5185 if (max_var < 0 || var > max_var)
5186 max_var = var;
5187
5188 double dev = util - average_util;
5189 dev *= dev;
5190 stddev += reweight * dev;
5191 sum += reweight;
5192 }
5193 }
5194
5195 virtual void dump_item(const CrushTreeDumper::Item &qi,
5196 float &reweight,
5197 int64_t kb,
5198 int64_t kb_used,
11fdf7f2
TL
5199 int64_t kb_used_data,
5200 int64_t kb_used_omap,
5201 int64_t kb_used_meta,
31f18b77
FG
5202 int64_t kb_avail,
5203 double& util,
5204 double& var,
5205 const size_t num_pgs,
5206 F *f) = 0;
5207
5208 double dev() {
5209 return sum > 0 ? sqrt(stddev / sum) : 0;
5210 }
5211
5212 double average_utilization() {
5213 int64_t kb = 0, kb_used = 0;
5214 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
5215 if (!osdmap->exists(i) ||
5216 osdmap->get_weight(i) == 0 ||
5217 !should_dump(i))
31f18b77 5218 continue;
11fdf7f2
TL
5219 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5220 kb_avail_i;
5221 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5222 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
5223 kb += kb_i;
5224 kb_used += kb_used_i;
5225 }
5226 }
5227 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5228 }
5229
5230 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5231 int64_t* kb_used_data,
5232 int64_t* kb_used_omap,
5233 int64_t* kb_used_meta,
31f18b77 5234 int64_t* kb_avail) const {
11fdf7f2 5235 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 5236 if (!p) return false;
11fdf7f2
TL
5237 *kb = p->statfs.kb();
5238 *kb_used = p->statfs.kb_used_raw();
5239 *kb_used_data = p->statfs.kb_used_data();
5240 *kb_used_omap = p->statfs.kb_used_omap();
5241 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5242 *kb_avail = p->statfs.kb_avail();
5243
31f18b77
FG
5244 return *kb > 0;
5245 }
5246
5247 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5248 int64_t* kb_used_data,
5249 int64_t* kb_used_omap,
5250 int64_t* kb_used_meta,
31f18b77
FG
5251 int64_t* kb_avail) const {
5252 if (id >= 0) {
11fdf7f2 5253 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
5254 *kb = 0;
5255 *kb_used = 0;
11fdf7f2
TL
5256 *kb_used_data = 0;
5257 *kb_used_omap = 0;
5258 *kb_used_meta = 0;
31f18b77
FG
5259 *kb_avail = 0;
5260 return true;
5261 }
11fdf7f2
TL
5262 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5263 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
5264 }
5265
5266 *kb = 0;
5267 *kb_used = 0;
11fdf7f2
TL
5268 *kb_used_data = 0;
5269 *kb_used_omap = 0;
5270 *kb_used_meta = 0;
31f18b77
FG
5271 *kb_avail = 0;
5272
5273 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5274 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
5275 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5276 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5277 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5278 &kb_used_data_i, &kb_used_omap_i,
5279 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
5280 return false;
5281 *kb += kb_i;
5282 *kb_used += kb_used_i;
11fdf7f2
TL
5283 *kb_used_data += kb_used_data_i;
5284 *kb_used_omap += kb_used_omap_i;
5285 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
5286 *kb_avail += kb_avail_i;
5287 }
5288 return *kb > 0;
5289 }
5290
5291protected:
5292 const OSDMap *osdmap;
11fdf7f2 5293 const PGMap& pgmap;
31f18b77
FG
5294 bool tree;
5295 double average_util;
5296 double min_var;
5297 double max_var;
5298 double stddev;
5299 double sum;
9f95a23c 5300 int class_id = -1;
11fdf7f2
TL
5301 set<int> allowed;
5302 set<int> dumped_osds;
31f18b77
FG
5303};
5304
5305
5306class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5307public:
5308 typedef OSDUtilizationDumper<TextTable> Parent;
5309
5310 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5311 const PGMap& pgmap, bool tree,
9f95a23c
TL
5312 const string& filter) :
5313 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5314
5315 void dump(TextTable *tbl) {
5316 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 5317 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5318 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5319 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5320 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
5321 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5322 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5323 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5324 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5325 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5326 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5327 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5328 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 5329 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5330 if (tree)
5331 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5332
5333 Parent::dump(tbl);
5334
5335 dump_stray(tbl);
5336
11fdf7f2 5337 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
5338 *tbl << ""
5339 << ""
5340 << "" << "TOTAL"
11fdf7f2
TL
5341 << byte_u_t(sum.statfs.total)
5342 << byte_u_t(sum.statfs.get_used_raw())
5343 << byte_u_t(sum.statfs.allocated)
5344 << byte_u_t(sum.statfs.omap_allocated)
5345 << byte_u_t(sum.statfs.internal_metadata)
5346 << byte_u_t(sum.statfs.available)
31f18b77
FG
5347 << lowprecision_t(average_util)
5348 << ""
5349 << TextTable::endrow;
5350 }
5351
5352protected:
5353 struct lowprecision_t {
5354 float v;
5355 explicit lowprecision_t(float _v) : v(_v) {}
5356 };
5357 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5358
5359 using OSDUtilizationDumper<TextTable>::dump_item;
5360 void dump_item(const CrushTreeDumper::Item &qi,
5361 float &reweight,
5362 int64_t kb,
5363 int64_t kb_used,
11fdf7f2
TL
5364 int64_t kb_used_data,
5365 int64_t kb_used_omap,
5366 int64_t kb_used_meta,
31f18b77
FG
5367 int64_t kb_avail,
5368 double& util,
5369 double& var,
5370 const size_t num_pgs,
5371 TextTable *tbl) override {
224ce89b
WB
5372 const char *c = crush->get_item_class(qi.id);
5373 if (!c)
5374 c = "";
31f18b77 5375 *tbl << qi.id
224ce89b 5376 << c
31f18b77
FG
5377 << weightf_t(qi.weight)
5378 << weightf_t(reweight)
1adf2230
AA
5379 << byte_u_t(kb << 10)
5380 << byte_u_t(kb_used << 10)
11fdf7f2
TL
5381 << byte_u_t(kb_used_data << 10)
5382 << byte_u_t(kb_used_omap << 10)
5383 << byte_u_t(kb_used_meta << 10)
1adf2230 5384 << byte_u_t(kb_avail << 10)
31f18b77
FG
5385 << lowprecision_t(util)
5386 << lowprecision_t(var);
5387
5388 if (qi.is_bucket()) {
5389 *tbl << "-";
11fdf7f2 5390 *tbl << "";
31f18b77
FG
5391 } else {
5392 *tbl << num_pgs;
11fdf7f2
TL
5393 if (osdmap->is_up(qi.id)) {
5394 *tbl << "up";
5395 } else if (osdmap->is_destroyed(qi.id)) {
5396 *tbl << "destroyed";
5397 } else {
5398 *tbl << "down";
5399 }
31f18b77
FG
5400 }
5401
5402 if (tree) {
5403 ostringstream name;
5404 for (int k = 0; k < qi.depth; k++)
5405 name << " ";
5406 if (qi.is_bucket()) {
5407 int type = crush->get_bucket_type(qi.id);
5408 name << crush->get_type_name(type) << " "
5409 << crush->get_item_name(qi.id);
5410 } else {
5411 name << "osd." << qi.id;
5412 }
5413 *tbl << name.str();
5414 }
5415
5416 *tbl << TextTable::endrow;
5417 }
5418
5419public:
5420 string summary() {
5421 ostringstream out;
5422 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5423 << "/" << lowprecision_t(max_var) << " "
5424 << "STDDEV: " << lowprecision_t(dev());
5425 return out.str();
5426 }
5427};
5428
5429ostream& operator<<(ostream& out,
5430 const OSDUtilizationPlainDumper::lowprecision_t& v)
5431{
5432 if (v.v < -0.01) {
5433 return out << "-";
5434 } else if (v.v < 0.001) {
5435 return out << "0";
5436 } else {
5437 std::streamsize p = out.precision();
5438 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5439 }
5440}
5441
5442class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5443public:
5444 typedef OSDUtilizationDumper<Formatter> Parent;
5445
5446 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2 5447 const PGMap& pgmap, bool tree,
9f95a23c
TL
5448 const string& filter) :
5449 Parent(crush, osdmap, pgmap, tree, filter) {}
31f18b77
FG
5450
5451 void dump(Formatter *f) {
5452 f->open_array_section("nodes");
5453 Parent::dump(f);
5454 f->close_section();
5455
5456 f->open_array_section("stray");
5457 dump_stray(f);
5458 f->close_section();
5459 }
5460
5461protected:
5462 using OSDUtilizationDumper<Formatter>::dump_item;
5463 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
5464 float &reweight,
5465 int64_t kb,
5466 int64_t kb_used,
5467 int64_t kb_used_data,
5468 int64_t kb_used_omap,
5469 int64_t kb_used_meta,
5470 int64_t kb_avail,
5471 double& util,
5472 double& var,
5473 const size_t num_pgs,
5474 Formatter *f) override {
31f18b77 5475 f->open_object_section("item");
c07f9fc5 5476 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
5477 f->dump_float("reweight", reweight);
5478 f->dump_int("kb", kb);
5479 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
5480 f->dump_int("kb_used_data", kb_used_data);
5481 f->dump_int("kb_used_omap", kb_used_omap);
5482 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
5483 f->dump_int("kb_avail", kb_avail);
5484 f->dump_float("utilization", util);
5485 f->dump_float("var", var);
5486 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
5487 if (!qi.is_bucket()) {
5488 if (osdmap->is_up(qi.id)) {
5489 f->dump_string("status", "up");
5490 } else if (osdmap->is_destroyed(qi.id)) {
5491 f->dump_string("status", "destroyed");
5492 } else {
5493 f->dump_string("status", "down");
5494 }
5495 }
31f18b77
FG
5496 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5497 f->close_section();
5498 }
5499
5500public:
5501 void summary(Formatter *f) {
5502 f->open_object_section("summary");
11fdf7f2
TL
5503 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5504 auto& s = sum.statfs;
5505
5506 f->dump_int("total_kb", s.kb());
5507 f->dump_int("total_kb_used", s.kb_used_raw());
5508 f->dump_int("total_kb_used_data", s.kb_used_data());
5509 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5510 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5511 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
5512 f->dump_float("average_utilization", average_util);
5513 f->dump_float("min_var", min_var);
5514 f->dump_float("max_var", max_var);
5515 f->dump_float("dev", dev());
5516 f->close_section();
5517 }
5518};
5519
5520void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
5521 const PGMap& pgmap,
5522 ostream& out,
5523 Formatter *f,
5524 bool tree,
9f95a23c 5525 const string& filter)
31f18b77
FG
5526{
5527 const CrushWrapper *crush = osdmap.crush.get();
5528 if (f) {
5529 f->open_object_section("df");
9f95a23c 5530 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5531 d.dump(f);
5532 d.summary(f);
5533 f->close_section();
5534 f->flush(out);
5535 } else {
9f95a23c 5536 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
31f18b77
FG
5537 TextTable tbl;
5538 d.dump(&tbl);
5539 out << tbl << d.summary() << "\n";
5540 }
5541}
224ce89b 5542
92f5a8d4
TL
5543void OSDMap::check_health(CephContext *cct,
5544 health_check_map_t *checks) const
224ce89b
WB
5545{
5546 int num_osds = get_num_osds();
5547
5548 // OSD_DOWN
5549 // OSD_$subtree_DOWN
5550 // OSD_ORPHAN
5551 if (num_osds >= 0) {
5552 int num_in_osds = 0;
5553 int num_down_in_osds = 0;
5554 set<int> osds;
5555 set<int> down_in_osds;
5556 set<int> up_in_osds;
5557 set<int> subtree_up;
5558 unordered_map<int, set<int> > subtree_type_down;
5559 unordered_map<int, int> num_osds_subtree;
5560 int max_type = crush->get_max_type_id();
5561
5562 for (int i = 0; i < get_max_osd(); i++) {
5563 if (!exists(i)) {
5564 if (crush->item_exists(i)) {
5565 osds.insert(i);
5566 }
5567 continue;
5568 }
5569 if (is_out(i))
5570 continue;
5571 ++num_in_osds;
5572 if (down_in_osds.count(i) || up_in_osds.count(i))
5573 continue;
5574 if (!is_up(i)) {
5575 down_in_osds.insert(i);
5576 int parent_id = 0;
5577 int current = i;
5578 for (int type = 0; type <= max_type; type++) {
5579 if (!crush->get_type_name(type))
5580 continue;
5581 int r = crush->get_immediate_parent_id(current, &parent_id);
5582 if (r == -ENOENT)
5583 break;
5584 // break early if this parent is already marked as up
5585 if (subtree_up.count(parent_id))
5586 break;
5587 type = crush->get_bucket_type(parent_id);
5588 if (!subtree_type_is_down(
92f5a8d4 5589 cct, parent_id, type,
224ce89b
WB
5590 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
5591 break;
5592 current = parent_id;
5593 }
5594 }
5595 }
5596
5597 // calculate the number of down osds in each down subtree and
5598 // store it in num_osds_subtree
5599 for (int type = 1; type <= max_type; type++) {
5600 if (!crush->get_type_name(type))
5601 continue;
5602 for (auto j = subtree_type_down[type].begin();
5603 j != subtree_type_down[type].end();
5604 ++j) {
5605 list<int> children;
5606 int num = 0;
5607 int num_children = crush->get_children(*j, &children);
5608 if (num_children == 0)
5609 continue;
5610 for (auto l = children.begin(); l != children.end(); ++l) {
5611 if (*l >= 0) {
5612 ++num;
5613 } else if (num_osds_subtree[*l] > 0) {
5614 num = num + num_osds_subtree[*l];
5615 }
5616 }
5617 num_osds_subtree[*j] = num;
5618 }
5619 }
5620 num_down_in_osds = down_in_osds.size();
11fdf7f2 5621 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
5622 if (num_down_in_osds > 0) {
5623 // summary of down subtree types and osds
5624 for (int type = max_type; type > 0; type--) {
5625 if (!crush->get_type_name(type))
5626 continue;
5627 if (subtree_type_down[type].size() > 0) {
5628 ostringstream ss;
5629 ss << subtree_type_down[type].size() << " "
5630 << crush->get_type_name(type);
5631 if (subtree_type_down[type].size() > 1) {
5632 ss << "s";
5633 }
5634 int sum_down_osds = 0;
5635 for (auto j = subtree_type_down[type].begin();
5636 j != subtree_type_down[type].end();
5637 ++j) {
5638 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
5639 }
5640 ss << " (" << sum_down_osds << " osds) down";
5641 string err = string("OSD_") +
5642 string(crush->get_type_name(type)) + "_DOWN";
5643 boost::to_upper(err);
9f95a23c
TL
5644 auto& d = checks->add(err, HEALTH_WARN, ss.str(),
5645 subtree_type_down[type].size());
224ce89b
WB
5646 for (auto j = subtree_type_down[type].rbegin();
5647 j != subtree_type_down[type].rend();
5648 ++j) {
5649 ostringstream ss;
5650 ss << crush->get_type_name(type);
5651 ss << " ";
5652 ss << crush->get_item_name(*j);
5653 // at the top level, do not print location
5654 if (type != max_type) {
5655 ss << " (";
5656 ss << crush->get_full_location_ordered_string(*j);
5657 ss << ")";
5658 }
5659 int num = num_osds_subtree[*j];
5660 ss << " (" << num << " osds)";
5661 ss << " is down";
5662 d.detail.push_back(ss.str());
5663 }
5664 }
5665 }
5666 ostringstream ss;
5667 ss << down_in_osds.size() << " osds down";
9f95a23c
TL
5668 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
5669 down_in_osds.size());
224ce89b
WB
5670 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
5671 ostringstream ss;
5672 ss << "osd." << *it << " (";
5673 ss << crush->get_full_location_ordered_string(*it);
5674 ss << ") is down";
5675 d.detail.push_back(ss.str());
5676 }
5677 }
5678
5679 if (!osds.empty()) {
5680 ostringstream ss;
5681 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
9f95a23c
TL
5682 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
5683 osds.size());
224ce89b
WB
5684 for (auto osd : osds) {
5685 ostringstream ss;
5686 ss << "osd." << osd << " exists in crush map but not in osdmap";
5687 d.detail.push_back(ss.str());
5688 }
5689 }
5690 }
5691
eafe8130
TL
5692 std::list<std::string> scrub_messages;
5693 bool noscrub = false, nodeepscrub = false;
5694 for (const auto &p : pools) {
5695 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
5696 ostringstream ss;
5697 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
5698 scrub_messages.push_back(ss.str());
5699 noscrub = true;
5700 }
5701 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
5702 ostringstream ss;
5703 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
5704 scrub_messages.push_back(ss.str());
5705 nodeepscrub = true;
5706 }
5707 }
5708 if (noscrub || nodeepscrub) {
5709 string out = "";
5710 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
5711 out += nodeepscrub ? "nodeep-scrub" : "";
5712 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
9f95a23c 5713 "Some pool(s) have the " + out + " flag(s) set", 0);
eafe8130
TL
5714 d.detail.splice(d.detail.end(), scrub_messages);
5715 }
5716
224ce89b
WB
5717 // OSD_OUT_OF_ORDER_FULL
5718 {
5719 // An osd could configure failsafe ratio, to something different
5720 // but for now assume it is the same here.
92f5a8d4 5721 float fsr = cct->_conf->osd_failsafe_full_ratio;
224ce89b
WB
5722 if (fsr > 1.0) fsr /= 100;
5723 float fr = get_full_ratio();
5724 float br = get_backfillfull_ratio();
5725 float nr = get_nearfull_ratio();
5726
5727 list<string> detail;
5728 // These checks correspond to how OSDService::check_full_status() in an OSD
5729 // handles the improper setting of these values.
5730 if (br < nr) {
5731 ostringstream ss;
5732 ss << "backfillfull_ratio (" << br
5733 << ") < nearfull_ratio (" << nr << "), increased";
5734 detail.push_back(ss.str());
5735 br = nr;
5736 }
5737 if (fr < br) {
5738 ostringstream ss;
5739 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
5740 << "), increased";
5741 detail.push_back(ss.str());
5742 fr = br;
5743 }
5744 if (fsr < fr) {
5745 ostringstream ss;
5746 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
5747 << "), increased";
5748 detail.push_back(ss.str());
5749 }
5750 if (!detail.empty()) {
5751 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
9f95a23c 5752 "full ratio(s) out of order", 0);
224ce89b
WB
5753 d.detail.swap(detail);
5754 }
5755 }
5756
5757 // OSD_FULL
5758 // OSD_NEARFULL
5759 // OSD_BACKFILLFULL
5760 // OSD_FAILSAFE_FULL
5761 {
5762 set<int> full, backfillfull, nearfull;
5763 get_full_osd_counts(&full, &backfillfull, &nearfull);
5764 if (full.size()) {
5765 ostringstream ss;
5766 ss << full.size() << " full osd(s)";
9f95a23c 5767 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
224ce89b
WB
5768 for (auto& i: full) {
5769 ostringstream ss;
5770 ss << "osd." << i << " is full";
5771 d.detail.push_back(ss.str());
5772 }
5773 }
5774 if (backfillfull.size()) {
5775 ostringstream ss;
5776 ss << backfillfull.size() << " backfillfull osd(s)";
9f95a23c
TL
5777 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
5778 backfillfull.size());
224ce89b
WB
5779 for (auto& i: backfillfull) {
5780 ostringstream ss;
5781 ss << "osd." << i << " is backfill full";
5782 d.detail.push_back(ss.str());
5783 }
5784 }
5785 if (nearfull.size()) {
5786 ostringstream ss;
5787 ss << nearfull.size() << " nearfull osd(s)";
9f95a23c 5788 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
224ce89b
WB
5789 for (auto& i: nearfull) {
5790 ostringstream ss;
5791 ss << "osd." << i << " is near full";
5792 d.detail.push_back(ss.str());
5793 }
5794 }
5795 }
5796
5797 // OSDMAP_FLAGS
5798 {
5799 // warn about flags
5800 uint64_t warn_flags =
224ce89b
WB
5801 CEPH_OSDMAP_PAUSERD |
5802 CEPH_OSDMAP_PAUSEWR |
5803 CEPH_OSDMAP_PAUSEREC |
5804 CEPH_OSDMAP_NOUP |
5805 CEPH_OSDMAP_NODOWN |
5806 CEPH_OSDMAP_NOIN |
5807 CEPH_OSDMAP_NOOUT |
5808 CEPH_OSDMAP_NOBACKFILL |
5809 CEPH_OSDMAP_NORECOVER |
5810 CEPH_OSDMAP_NOSCRUB |
5811 CEPH_OSDMAP_NODEEP_SCRUB |
5812 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 5813 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
5814 CEPH_OSDMAP_NOREBALANCE;
5815 if (test_flag(warn_flags)) {
5816 ostringstream ss;
9f95a23c
TL
5817 string s = get_flag_string(get_flags() & warn_flags);
5818 ss << s << " flag(s) set";
5819 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
5820 s.size() /* kludgey but sufficient */);
224ce89b
WB
5821 }
5822 }
5823
5824 // OSD_FLAGS
5825 {
5826 list<string> detail;
5827 const unsigned flags =
5828 CEPH_OSD_NOUP |
5829 CEPH_OSD_NOIN |
5830 CEPH_OSD_NODOWN |
5831 CEPH_OSD_NOOUT;
5832 for (int i = 0; i < max_osd; ++i) {
5833 if (osd_state[i] & flags) {
5834 ostringstream ss;
5835 set<string> states;
5836 OSDMap::calc_state_set(osd_state[i] & flags, states);
5837 ss << "osd." << i << " has flags " << states;
5838 detail.push_back(ss.str());
5839 }
5840 }
81eedcae
TL
5841 for (auto& i : crush_node_flags) {
5842 if (i.second && crush->item_exists(i.first)) {
5843 ostringstream ss;
5844 set<string> states;
5845 OSDMap::calc_state_set(i.second, states);
5846 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
5847 const char *tn = crush->get_type_name(t);
5848 ss << (tn ? tn : "node") << " "
5849 << crush->get_item_name(i.first) << " has flags " << states;
5850 detail.push_back(ss.str());
5851 }
5852 }
5853 for (auto& i : device_class_flags) {
5854 const char* class_name = crush->get_class_name(i.first);
5855 if (i.second && class_name) {
5856 ostringstream ss;
5857 set<string> states;
5858 OSDMap::calc_state_set(i.second, states);
5859 ss << "device class '" << class_name << "' has flags " << states;
5860 detail.push_back(ss.str());
5861 }
5862 }
224ce89b
WB
5863 if (!detail.empty()) {
5864 ostringstream ss;
81eedcae 5865 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
9f95a23c 5866 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
224ce89b
WB
5867 d.detail.swap(detail);
5868 }
5869 }
5870
5871 // OLD_CRUSH_TUNABLES
92f5a8d4 5872 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
224ce89b 5873 string min = crush->get_min_required_version();
92f5a8d4 5874 if (min < cct->_conf->mon_crush_min_required_version) {
224ce89b
WB
5875 ostringstream ss;
5876 ss << "crush map has legacy tunables (require " << min
92f5a8d4 5877 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
9f95a23c 5878 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
5879 d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5880 }
5881 }
5882
5883 // OLD_CRUSH_STRAW_CALC_VERSION
92f5a8d4 5884 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
5885 if (crush->get_straw_calc_version() == 0) {
5886 ostringstream ss;
5887 ss << "crush map has straw_calc_version=0";
9f95a23c 5888 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
5889 d.detail.push_back(
5890 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5891 }
5892 }
5893
5894 // CACHE_POOL_NO_HIT_SET
92f5a8d4 5895 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b 5896 list<string> detail;
9f95a23c 5897 for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
224ce89b
WB
5898 const pg_pool_t& info = p->second;
5899 if (info.cache_mode_requires_hit_set() &&
5900 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
5901 ostringstream ss;
5902 ss << "pool '" << get_pool_name(p->first)
5903 << "' with cache_mode " << info.get_cache_mode_name()
5904 << " needs hit_set_type to be set but it is not";
5905 detail.push_back(ss.str());
5906 }
5907 }
5908 if (!detail.empty()) {
5909 ostringstream ss;
5910 ss << detail.size() << " cache pools are missing hit_sets";
9f95a23c
TL
5911 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
5912 detail.size());
224ce89b
WB
5913 d.detail.swap(detail);
5914 }
5915 }
5916
5917 // OSD_NO_SORTBITWISE
11fdf7f2 5918 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 5919 ostringstream ss;
11fdf7f2 5920 ss << "'sortbitwise' flag is not set";
9f95a23c 5921 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
224ce89b
WB
5922 }
5923
5924 // OSD_UPGRADE_FINISHED
5925 // none of these (yet) since we don't run until luminous upgrade is done.
5926
3efd9988 5927 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 5928 {
3efd9988 5929 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
5930 for (auto it : get_pools()) {
5931 const pg_pool_t &pool = it.second;
3efd9988 5932 const string& pool_name = get_pool_name(it.first);
224ce89b 5933 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 5934 stringstream ss;
11fdf7f2 5935 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
5936 // may run out of space too,
5937 // but we want EQUOTA taking precedence
11fdf7f2 5938 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
5939 } else {
5940 ss << "pool '" << pool_name << "' is full (no space)";
5941 }
5942 full_detail.push_back(ss.str());
5943 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
5944 stringstream ss;
5945 ss << "pool '" << pool_name << "' is backfillfull";
5946 backfillfull_detail.push_back(ss.str());
5947 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
5948 stringstream ss;
5949 ss << "pool '" << pool_name << "' is nearfull";
5950 nearfull_detail.push_back(ss.str());
224ce89b
WB
5951 }
5952 }
3efd9988 5953 if (!full_detail.empty()) {
224ce89b 5954 ostringstream ss;
3efd9988 5955 ss << full_detail.size() << " pool(s) full";
9f95a23c 5956 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
3efd9988
FG
5957 d.detail.swap(full_detail);
5958 }
5959 if (!backfillfull_detail.empty()) {
5960 ostringstream ss;
5961 ss << backfillfull_detail.size() << " pool(s) backfillfull";
9f95a23c
TL
5962 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
5963 backfillfull_detail.size());
3efd9988
FG
5964 d.detail.swap(backfillfull_detail);
5965 }
5966 if (!nearfull_detail.empty()) {
5967 ostringstream ss;
5968 ss << nearfull_detail.size() << " pool(s) nearfull";
9f95a23c
TL
5969 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
5970 nearfull_detail.size());
3efd9988 5971 d.detail.swap(nearfull_detail);
224ce89b
WB
5972 }
5973 }
92f5a8d4
TL
5974
5975 // POOL_PG_NUM_NOT_POWER_OF_TWO
5976 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
5977 list<string> detail;
5978 for (auto it : get_pools()) {
5979 if (!isp2(it.second.get_pg_num_target())) {
5980 ostringstream ss;
5981 ss << "pool '" << get_pool_name(it.first)
5982 << "' pg_num " << it.second.get_pg_num_target()
5983 << " is not a power of two";
5984 detail.push_back(ss.str());
5985 }
5986 }
5987 if (!detail.empty()) {
5988 ostringstream ss;
5989 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
5990 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
9f95a23c
TL
5991 ss.str(), detail.size());
5992 d.detail.swap(detail);
5993 }
5994 }
5995
5996 // POOL_NO_REDUNDANCY
5997 if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
5998 {
5999 list<string> detail;
6000 for (auto it : get_pools()) {
6001 if (it.second.get_size() == 1) {
6002 ostringstream ss;
6003 ss << "pool '" << get_pool_name(it.first)
6004 << "' has no replicas configured";
6005 detail.push_back(ss.str());
6006 }
6007 }
6008 if (!detail.empty()) {
6009 ostringstream ss;
6010 ss << detail.size() << " pool(s) have no replicas configured";
6011 auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
6012 ss.str(), detail.size());
92f5a8d4
TL
6013 d.detail.swap(detail);
6014 }
6015 }
224ce89b 6016}
35e4c445
FG
6017
6018int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
6019 ostream *ss) const
6020{
6021 out->clear();
6022 for (auto i = ls.begin(); i != ls.end(); ++i) {
6023 if (i == ls.begin() &&
6024 (*i == "any" || *i == "all" || *i == "*")) {
6025 get_all_osds(*out);
6026 break;
6027 }
9f95a23c 6028 long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
35e4c445
FG
6029 if (osd < 0) {
6030 *ss << "invalid osd id '" << *i << "'";
6031 return -EINVAL;
6032 }
6033 out->insert(osd);
6034 }
6035 return 0;
6036}
11fdf7f2
TL
6037
6038void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
6039 string &subtree,
6040 int limit, // how many
6041 set<int> skip,
6042 set<int> *want) const {
6043 if (limit <= 0)
6044 return;
6045 int subtree_type = crush->get_type_id(subtree);
6046 if (subtree_type < 1)
6047 return;
6048 vector<int> subtrees;
6049 crush->get_subtree_of_type(subtree_type, &subtrees);
6050 std::random_device rd;
6051 std::default_random_engine rng{rd()};
6052 std::shuffle(subtrees.begin(), subtrees.end(), rng);
6053 for (auto s : subtrees) {
6054 if (limit <= 0)
6055 break;
6056 if (crush->subtree_contains(s, n))
6057 continue;
6058 vector<int> osds;
6059 crush->get_children_of_type(s, 0, &osds);
6060 if (osds.empty())
6061 continue;
6062 vector<int> up_osds;
6063 for (auto o : osds) {
6064 if (is_up(o) && !skip.count(o))
6065 up_osds.push_back(o);
6066 }
6067 if (up_osds.empty())
6068 continue;
6069 auto it = up_osds.begin();
6070 std::advance(it, (n % up_osds.size()));
6071 want->insert(*it);
6072 --limit;
6073 }
6074}
6075
6076float OSDMap::pool_raw_used_rate(int64_t poolid) const
6077{
6078 const pg_pool_t *pool = get_pg_pool(poolid);
6079 assert(pool != nullptr);
6080
6081 switch (pool->get_type()) {
6082 case pg_pool_t::TYPE_REPLICATED:
6083 return pool->get_size();
6084 break;
6085 case pg_pool_t::TYPE_ERASURE:
6086 {
6087 auto& ecp =
6088 get_erasure_code_profile(pool->erasure_code_profile);
6089 auto pm = ecp.find("m");
6090 auto pk = ecp.find("k");
6091 if (pm != ecp.end() && pk != ecp.end()) {
6092 int k = atoi(pk->second.c_str());
6093 int m = atoi(pm->second.c_str());
6094 int mk = m + k;
6095 ceph_assert(mk != 0);
6096 ceph_assert(k != 0);
6097 return (float)mk / k;
6098 } else {
6099 return 0.0;
6100 }
6101 }
6102 break;
6103 default:
6104 ceph_abort_msg("unrecognized pool type");
6105 }
6106}
81eedcae
TL
6107
6108unsigned OSDMap::get_osd_crush_node_flags(int osd) const
6109{
6110 unsigned flags = 0;
6111 if (!crush_node_flags.empty()) {
6112 // the map will contain type -> name
6113 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
6114 for (auto& i : ploc) {
6115 int id = crush->get_item_id(i.second);
6116 auto p = crush_node_flags.find(id);
6117 if (p != crush_node_flags.end()) {
6118 flags |= p->second;
6119 }
6120 }
6121 }
6122 return flags;
6123}
6124
6125unsigned OSDMap::get_crush_node_flags(int id) const
6126{
6127 unsigned flags = 0;
6128 auto it = crush_node_flags.find(id);
6129 if (it != crush_node_flags.end())
6130 flags = it->second;
6131 return flags;
6132}
6133
6134unsigned OSDMap::get_device_class_flags(int id) const
6135{
6136 unsigned flags = 0;
6137 auto it = device_class_flags.find(id);
6138 if (it != device_class_flags.end())
6139 flags = it->second;
6140 return flags;
6141}