]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
import ceph 14.2.5
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2
TL
18#include <algorithm>
19#include <optional>
20#include <random>
21
224ce89b
WB
22#include <boost/algorithm/string.hpp>
23
7c673cae 24#include "OSDMap.h"
7c673cae 25#include "common/config.h"
3efd9988 26#include "common/errno.h"
7c673cae
FG
27#include "common/Formatter.h"
28#include "common/TextTable.h"
11fdf7f2 29#include "global/global_context.h"
7c673cae
FG
30#include "include/ceph_features.h"
31#include "include/str_map.h"
32
33#include "common/code_environment.h"
224ce89b 34#include "mon/health_check.h"
7c673cae
FG
35
36#include "crush/CrushTreeDumper.h"
37#include "common/Clock.h"
11fdf7f2
TL
38#include "mon/PGMap.h"
39
7c673cae
FG
40#define dout_subsys ceph_subsys_osd
41
42MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
43MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
44
45
46// ----------------------------------
47// osd_info_t
48
49void osd_info_t::dump(Formatter *f) const
50{
51 f->dump_int("last_clean_begin", last_clean_begin);
52 f->dump_int("last_clean_end", last_clean_end);
53 f->dump_int("up_from", up_from);
54 f->dump_int("up_thru", up_thru);
55 f->dump_int("down_at", down_at);
56 f->dump_int("lost_at", lost_at);
57}
58
59void osd_info_t::encode(bufferlist& bl) const
60{
11fdf7f2 61 using ceph::encode;
7c673cae 62 __u8 struct_v = 1;
11fdf7f2
TL
63 encode(struct_v, bl);
64 encode(last_clean_begin, bl);
65 encode(last_clean_end, bl);
66 encode(up_from, bl);
67 encode(up_thru, bl);
68 encode(down_at, bl);
69 encode(lost_at, bl);
7c673cae
FG
70}
71
11fdf7f2 72void osd_info_t::decode(bufferlist::const_iterator& bl)
7c673cae 73{
11fdf7f2 74 using ceph::decode;
7c673cae 75 __u8 struct_v;
11fdf7f2
TL
76 decode(struct_v, bl);
77 decode(last_clean_begin, bl);
78 decode(last_clean_end, bl);
79 decode(up_from, bl);
80 decode(up_thru, bl);
81 decode(down_at, bl);
82 decode(lost_at, bl);
7c673cae
FG
83}
84
85void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
86{
87 o.push_back(new osd_info_t);
88 o.push_back(new osd_info_t);
89 o.back()->last_clean_begin = 1;
90 o.back()->last_clean_end = 2;
91 o.back()->up_from = 30;
92 o.back()->up_thru = 40;
93 o.back()->down_at = 5;
94 o.back()->lost_at = 6;
95}
96
97ostream& operator<<(ostream& out, const osd_info_t& info)
98{
99 out << "up_from " << info.up_from
100 << " up_thru " << info.up_thru
101 << " down_at " << info.down_at
102 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
103 if (info.lost_at)
104 out << " lost_at " << info.lost_at;
105 return out;
106}
107
108// ----------------------------------
109// osd_xinfo_t
110
111void osd_xinfo_t::dump(Formatter *f) const
112{
113 f->dump_stream("down_stamp") << down_stamp;
114 f->dump_float("laggy_probability", laggy_probability);
115 f->dump_int("laggy_interval", laggy_interval);
116 f->dump_int("features", features);
117 f->dump_unsigned("old_weight", old_weight);
118}
119
120void osd_xinfo_t::encode(bufferlist& bl) const
121{
122 ENCODE_START(3, 1, bl);
11fdf7f2 123 encode(down_stamp, bl);
7c673cae 124 __u32 lp = laggy_probability * 0xfffffffful;
11fdf7f2
TL
125 encode(lp, bl);
126 encode(laggy_interval, bl);
127 encode(features, bl);
128 encode(old_weight, bl);
7c673cae
FG
129 ENCODE_FINISH(bl);
130}
131
11fdf7f2 132void osd_xinfo_t::decode(bufferlist::const_iterator& bl)
7c673cae
FG
133{
134 DECODE_START(3, bl);
11fdf7f2 135 decode(down_stamp, bl);
7c673cae 136 __u32 lp;
11fdf7f2 137 decode(lp, bl);
7c673cae 138 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 139 decode(laggy_interval, bl);
7c673cae 140 if (struct_v >= 2)
11fdf7f2 141 decode(features, bl);
7c673cae
FG
142 else
143 features = 0;
144 if (struct_v >= 3)
11fdf7f2 145 decode(old_weight, bl);
7c673cae
FG
146 else
147 old_weight = 0;
148 DECODE_FINISH(bl);
149}
150
151void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
152{
153 o.push_back(new osd_xinfo_t);
154 o.push_back(new osd_xinfo_t);
155 o.back()->down_stamp = utime_t(2, 3);
156 o.back()->laggy_probability = .123;
157 o.back()->laggy_interval = 123456;
158 o.back()->old_weight = 0x7fff;
159}
160
161ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
162{
163 return out << "down_stamp " << xi.down_stamp
164 << " laggy_probability " << xi.laggy_probability
165 << " laggy_interval " << xi.laggy_interval
166 << " old_weight " << xi.old_weight;
167}
168
169// ----------------------------------
170// OSDMap::Incremental
171
172int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
173{
174 int n = 0;
175 for (auto &weight : new_weight) {
176 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
177 n++; // marked out
178 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
179 n--; // marked in
180 }
181 return n;
182}
183
184int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
185{
186 int n = 0;
187 for (auto &state : new_state) { //
188 if (state.second & CEPH_OSD_UP) {
189 if (previous->is_up(state.first))
190 n++; // marked down
191 else
192 n--; // marked up
193 }
194 }
195 return n;
196}
197
198int OSDMap::Incremental::identify_osd(uuid_d u) const
199{
200 for (auto &uuid : new_uuid)
201 if (uuid.second == u)
202 return uuid.first;
203 return -1;
204}
205
206int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
207 const OSDMap& osdmap)
208{
11fdf7f2 209 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
210
211 for (auto &new_pool : new_pools) {
212 if (!new_pool.second.tiers.empty()) {
213 pg_pool_t& base = new_pool.second;
214
11fdf7f2
TL
215 auto new_rem_it = new_removed_snaps.find(new_pool.first);
216
7c673cae
FG
217 for (const auto &tier_pool : base.tiers) {
218 const auto &r = new_pools.find(tier_pool);
219 pg_pool_t *tier = 0;
220 if (r == new_pools.end()) {
221 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
222 if (!orig) {
223 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
224 return -EIO;
225 }
226 tier = get_new_pool(tier_pool, orig);
227 } else {
228 tier = &r->second;
229 }
230 if (tier->tier_of != new_pool.first) {
231 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
232 return -EIO;
233 }
234
235 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
236 << tier_pool << dendl;
237 tier->snap_seq = base.snap_seq;
238 tier->snap_epoch = base.snap_epoch;
239 tier->snaps = base.snaps;
240 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
241 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
242 pg_pool_t::FLAG_POOL_SNAPS);
243
244 if (new_rem_it != new_removed_snaps.end()) {
245 new_removed_snaps[tier_pool] = new_rem_it->second;
246 }
7c673cae
FG
247 }
248 }
249 }
250 return 0;
251}
252
28e407b8
AA
253// ----------------------------------
254// OSDMap
7c673cae
FG
255
256bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
257{
258 if (id >= 0)
259 return is_down(id);
260
261 if (down_cache &&
262 down_cache->count(id)) {
263 return true;
264 }
265
266 list<int> children;
267 crush->get_children(id, &children);
268 for (const auto &child : children) {
269 if (!subtree_is_down(child, down_cache)) {
270 return false;
271 }
272 }
273 if (down_cache) {
274 down_cache->insert(id);
275 }
276 return true;
277}
278
279bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
280{
281 // use a stack-local down_cache if we didn't get one from the
282 // caller. then at least this particular call will avoid duplicated
283 // work.
284 set<int> local_down_cache;
285 if (!down_cache) {
286 down_cache = &local_down_cache;
287 }
288
289 int current = id;
290 while (true) {
291 int type;
292 if (current >= 0) {
293 type = 0;
294 } else {
295 type = crush->get_bucket_type(current);
296 }
11fdf7f2 297 ceph_assert(type >= 0);
7c673cae
FG
298
299 if (!subtree_is_down(current, down_cache)) {
300 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
301 return false;
302 }
303
304 // is this a big enough subtree to be marked as down?
305 if (type >= subtree_type) {
306 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
307 return true;
308 }
309
310 int r = crush->get_immediate_parent_id(current, &current);
311 if (r < 0) {
312 return false;
313 }
314 }
315}
316
224ce89b
WB
317bool OSDMap::subtree_type_is_down(
318 CephContext *cct,
319 int id,
320 int subtree_type,
321 set<int> *down_in_osds,
322 set<int> *up_in_osds,
323 set<int> *subtree_up,
324 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
325{
326 if (id >= 0) {
327 bool is_down_ret = is_down(id);
328 if (!is_out(id)) {
329 if (is_down_ret) {
330 down_in_osds->insert(id);
331 } else {
332 up_in_osds->insert(id);
333 }
334 }
335 return is_down_ret;
336 }
337
338 if (subtree_type_down &&
339 (*subtree_type_down)[subtree_type].count(id)) {
340 return true;
341 }
342
343 list<int> children;
344 crush->get_children(id, &children);
345 for (const auto &child : children) {
224ce89b
WB
346 if (!subtree_type_is_down(
347 cct, child, crush->get_bucket_type(child),
348 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
349 subtree_up->insert(id);
350 return false;
351 }
352 }
353 if (subtree_type_down) {
354 (*subtree_type_down)[subtree_type].insert(id);
355 }
356 return true;
357}
358
7c673cae
FG
359void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
360{
11fdf7f2 361 using ceph::encode;
7c673cae 362 __u16 v = 5;
11fdf7f2
TL
363 encode(v, bl);
364 encode(fsid, bl);
365 encode(epoch, bl);
366 encode(modified, bl);
7c673cae 367 int32_t new_t = new_pool_max;
11fdf7f2
TL
368 encode(new_t, bl);
369 encode(new_flags, bl);
370 encode(fullmap, bl);
371 encode(crush, bl);
7c673cae 372
11fdf7f2
TL
373 encode(new_max_osd, bl);
374 // for encode(new_pools, bl);
7c673cae 375 __u32 n = new_pools.size();
11fdf7f2 376 encode(n, bl);
7c673cae
FG
377 for (const auto &new_pool : new_pools) {
378 n = new_pool.first;
11fdf7f2
TL
379 encode(n, bl);
380 encode(new_pool.second, bl, 0);
7c673cae 381 }
11fdf7f2 382 // for encode(new_pool_names, bl);
7c673cae 383 n = new_pool_names.size();
11fdf7f2 384 encode(n, bl);
7c673cae
FG
385
386 for (const auto &new_pool_name : new_pool_names) {
387 n = new_pool_name.first;
11fdf7f2
TL
388 encode(n, bl);
389 encode(new_pool_name.second, bl);
7c673cae 390 }
11fdf7f2 391 // for encode(old_pools, bl);
7c673cae 392 n = old_pools.size();
11fdf7f2 393 encode(n, bl);
7c673cae
FG
394 for (auto &old_pool : old_pools) {
395 n = old_pool;
11fdf7f2 396 encode(n, bl);
7c673cae 397 }
11fdf7f2 398 encode(new_up_client, bl, 0);
31f18b77
FG
399 {
400 // legacy is map<int32_t,uint8_t>
401 uint32_t n = new_state.size();
11fdf7f2 402 encode(n, bl);
31f18b77 403 for (auto p : new_state) {
11fdf7f2
TL
404 encode(p.first, bl);
405 encode((uint8_t)p.second, bl);
31f18b77
FG
406 }
407 }
11fdf7f2
TL
408 encode(new_weight, bl);
409 // for encode(new_pg_temp, bl);
7c673cae 410 n = new_pg_temp.size();
11fdf7f2 411 encode(n, bl);
7c673cae
FG
412
413 for (const auto &pg_temp : new_pg_temp) {
414 old_pg_t opg = pg_temp.first.get_old_pg();
11fdf7f2
TL
415 encode(opg, bl);
416 encode(pg_temp.second, bl);
7c673cae
FG
417 }
418}
419
420void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
421{
11fdf7f2 422 using ceph::encode;
7c673cae
FG
423 if ((features & CEPH_FEATURE_PGID64) == 0) {
424 encode_client_old(bl);
425 return;
426 }
427
428 // base
429 __u16 v = 6;
11fdf7f2
TL
430 encode(v, bl);
431 encode(fsid, bl);
432 encode(epoch, bl);
433 encode(modified, bl);
434 encode(new_pool_max, bl);
435 encode(new_flags, bl);
436 encode(fullmap, bl);
437 encode(crush, bl);
438
439 encode(new_max_osd, bl);
440 encode(new_pools, bl, features);
441 encode(new_pool_names, bl);
442 encode(old_pools, bl);
443 encode(new_up_client, bl, features);
31f18b77
FG
444 {
445 uint32_t n = new_state.size();
11fdf7f2 446 encode(n, bl);
31f18b77 447 for (auto p : new_state) {
11fdf7f2
TL
448 encode(p.first, bl);
449 encode((uint8_t)p.second, bl);
31f18b77
FG
450 }
451 }
11fdf7f2
TL
452 encode(new_weight, bl);
453 encode(new_pg_temp, bl);
7c673cae
FG
454
455 // extended
456 __u16 ev = 10;
11fdf7f2
TL
457 encode(ev, bl);
458 encode(new_hb_back_up, bl, features);
459 encode(new_up_thru, bl);
460 encode(new_last_clean_interval, bl);
461 encode(new_lost, bl);
462 encode(new_blacklist, bl, features);
463 encode(old_blacklist, bl, features);
464 encode(new_up_cluster, bl, features);
465 encode(cluster_snapshot, bl);
466 encode(new_uuid, bl);
467 encode(new_xinfo, bl);
468 encode(new_hb_front_up, bl, features);
469}
470
471template<class T>
472static void encode_addrvec_map_as_addr(const T& m, bufferlist& bl, uint64_t f)
473{
474 uint32_t n = m.size();
475 encode(n, bl);
476 for (auto& i : m) {
477 encode(i.first, bl);
478 encode(i.second.legacy_addr(), bl, f);
479 }
480}
481
482template<class T>
483static void encode_addrvec_pvec_as_addr(const T& m, bufferlist& bl, uint64_t f)
484{
485 uint32_t n = m.size();
486 encode(n, bl);
487 for (auto& i : m) {
488 if (i) {
489 encode(i->legacy_addr(), bl, f);
490 } else {
491 encode(entity_addr_t(), bl, f);
492 }
493 }
7c673cae
FG
494}
495
11fdf7f2
TL
496/* for a description of osdmap incremental versions, and when they were
497 * introduced, please refer to
498 * doc/dev/osd_internals/osdmap_versions.txt
499 */
7c673cae
FG
500void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
501{
11fdf7f2 502 using ceph::encode;
7c673cae
FG
503 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
504 encode_classic(bl, features);
505 return;
506 }
507
508 // only a select set of callers should *ever* be encoding new
509 // OSDMaps. others should be passing around the canonical encoded
510 // buffers from on high. select out those callers by passing in an
511 // "impossible" feature bit.
11fdf7f2 512 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
513 features &= ~CEPH_FEATURE_RESERVED;
514
515 size_t start_offset = bl.length();
516 size_t tail_offset;
11fdf7f2
TL
517 size_t crc_offset;
518 std::optional<buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
519
520 // meta-encoding: how we include client-used and osd-specific data
521 ENCODE_START(8, 7, bl);
522
523 {
11fdf7f2 524 uint8_t v = 8;
7c673cae
FG
525 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
526 v = 3;
11fdf7f2
TL
527 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
528 v = 5;
529 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
530 v = 6;
7c673cae
FG
531 }
532 ENCODE_START(v, 1, bl); // client-usable data
11fdf7f2
TL
533 encode(fsid, bl);
534 encode(epoch, bl);
535 encode(modified, bl);
536 encode(new_pool_max, bl);
537 encode(new_flags, bl);
538 encode(fullmap, bl);
539 encode(crush, bl);
540
541 encode(new_max_osd, bl);
542 encode(new_pools, bl, features);
543 encode(new_pool_names, bl);
544 encode(old_pools, bl);
545 if (v >= 7) {
546 encode(new_up_client, bl, features);
547 } else {
548 encode_addrvec_map_as_addr(new_up_client, bl, features);
549 }
31f18b77 550 if (v >= 5) {
11fdf7f2 551 encode(new_state, bl);
31f18b77
FG
552 } else {
553 uint32_t n = new_state.size();
11fdf7f2 554 encode(n, bl);
31f18b77 555 for (auto p : new_state) {
11fdf7f2
TL
556 encode(p.first, bl);
557 encode((uint8_t)p.second, bl);
31f18b77
FG
558 }
559 }
11fdf7f2
TL
560 encode(new_weight, bl);
561 encode(new_pg_temp, bl);
562 encode(new_primary_temp, bl);
563 encode(new_primary_affinity, bl);
564 encode(new_erasure_code_profiles, bl);
565 encode(old_erasure_code_profiles, bl);
7c673cae 566 if (v >= 4) {
11fdf7f2
TL
567 encode(new_pg_upmap, bl);
568 encode(old_pg_upmap, bl);
569 encode(new_pg_upmap_items, bl);
570 encode(old_pg_upmap_items, bl);
571 }
572 if (v >= 6) {
573 encode(new_removed_snaps, bl);
574 encode(new_purged_snaps, bl);
575 }
576 if (v >= 8) {
577 encode(new_last_up_change, bl);
578 encode(new_last_in_change, bl);
7c673cae
FG
579 }
580 ENCODE_FINISH(bl); // client-usable data
581 }
582
583 {
81eedcae 584 uint8_t target_v = 9;
7c673cae
FG
585 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
586 target_v = 2;
11fdf7f2
TL
587 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
588 target_v = 6;
7c673cae
FG
589 }
590 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
591 if (target_v < 7) {
592 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
593 } else {
594 encode(new_hb_back_up, bl, features);
595 }
596 encode(new_up_thru, bl);
597 encode(new_last_clean_interval, bl);
598 encode(new_lost, bl);
599 encode(new_blacklist, bl, features);
600 encode(old_blacklist, bl, features);
601 if (target_v < 7) {
602 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
603 } else {
604 encode(new_up_cluster, bl, features);
605 }
606 encode(cluster_snapshot, bl);
607 encode(new_uuid, bl);
608 encode(new_xinfo, bl);
609 if (target_v < 7) {
610 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
611 } else {
612 encode(new_hb_front_up, bl, features);
613 }
614 encode(features, bl); // NOTE: features arg, not the member
7c673cae 615 if (target_v >= 3) {
11fdf7f2
TL
616 encode(new_nearfull_ratio, bl);
617 encode(new_full_ratio, bl);
618 encode(new_backfillfull_ratio, bl);
31f18b77
FG
619 }
620 // 5 was string-based new_require_min_compat_client
621 if (target_v >= 6) {
11fdf7f2
TL
622 encode(new_require_min_compat_client, bl);
623 encode(new_require_osd_release, bl);
7c673cae 624 }
81eedcae
TL
625 if (target_v >= 8) {
626 encode(new_crush_node_flags, bl);
627 }
628 if (target_v >= 9) {
629 encode(new_device_class_flags, bl);
630 }
7c673cae
FG
631 ENCODE_FINISH(bl); // osd-only data
632 }
633
11fdf7f2
TL
634 crc_offset = bl.length();
635 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
636 tail_offset = bl.length();
637
11fdf7f2 638 encode(full_crc, bl);
7c673cae
FG
639
640 ENCODE_FINISH(bl); // meta-encoding wrapper
641
642 // fill in crc
643 bufferlist front;
11fdf7f2 644 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
645 inc_crc = front.crc32c(-1);
646 bufferlist tail;
647 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
648 inc_crc = tail.crc32c(inc_crc);
649 ceph_le32 crc_le;
650 crc_le = inc_crc;
11fdf7f2 651 crc_filler->copy_in(4u, (char*)&crc_le);
7c673cae
FG
652 have_crc = true;
653}
654
11fdf7f2 655void OSDMap::Incremental::decode_classic(bufferlist::const_iterator &p)
7c673cae 656{
11fdf7f2 657 using ceph::decode;
7c673cae
FG
658 __u32 n, t;
659 // base
660 __u16 v;
11fdf7f2
TL
661 decode(v, p);
662 decode(fsid, p);
663 decode(epoch, p);
664 decode(modified, p);
7c673cae 665 if (v == 4 || v == 5) {
11fdf7f2 666 decode(n, p);
7c673cae
FG
667 new_pool_max = n;
668 } else if (v >= 6)
11fdf7f2
TL
669 decode(new_pool_max, p);
670 decode(new_flags, p);
671 decode(fullmap, p);
672 decode(crush, p);
7c673cae 673
11fdf7f2 674 decode(new_max_osd, p);
7c673cae
FG
675 if (v < 6) {
676 new_pools.clear();
11fdf7f2 677 decode(n, p);
7c673cae 678 while (n--) {
11fdf7f2
TL
679 decode(t, p);
680 decode(new_pools[t], p);
7c673cae
FG
681 }
682 } else {
11fdf7f2 683 decode(new_pools, p);
7c673cae
FG
684 }
685 if (v == 5) {
686 new_pool_names.clear();
11fdf7f2 687 decode(n, p);
7c673cae 688 while (n--) {
11fdf7f2
TL
689 decode(t, p);
690 decode(new_pool_names[t], p);
7c673cae
FG
691 }
692 } else if (v >= 6) {
11fdf7f2 693 decode(new_pool_names, p);
7c673cae
FG
694 }
695 if (v < 6) {
696 old_pools.clear();
11fdf7f2 697 decode(n, p);
7c673cae 698 while (n--) {
11fdf7f2 699 decode(t, p);
7c673cae
FG
700 old_pools.insert(t);
701 }
702 } else {
11fdf7f2 703 decode(old_pools, p);
7c673cae 704 }
11fdf7f2 705 decode(new_up_client, p);
31f18b77
FG
706 {
707 map<int32_t,uint8_t> ns;
11fdf7f2 708 decode(ns, p);
31f18b77
FG
709 for (auto q : ns) {
710 new_state[q.first] = q.second;
711 }
712 }
11fdf7f2 713 decode(new_weight, p);
7c673cae
FG
714
715 if (v < 6) {
716 new_pg_temp.clear();
11fdf7f2 717 decode(n, p);
7c673cae
FG
718 while (n--) {
719 old_pg_t opg;
720 ::decode_raw(opg, p);
11fdf7f2 721 decode(new_pg_temp[pg_t(opg)], p);
7c673cae
FG
722 }
723 } else {
11fdf7f2 724 decode(new_pg_temp, p);
7c673cae
FG
725 }
726
727 // decode short map, too.
728 if (v == 5 && p.end())
729 return;
730
731 // extended
732 __u16 ev = 0;
733 if (v >= 5)
11fdf7f2
TL
734 decode(ev, p);
735 decode(new_hb_back_up, p);
7c673cae 736 if (v < 5)
11fdf7f2
TL
737 decode(new_pool_names, p);
738 decode(new_up_thru, p);
739 decode(new_last_clean_interval, p);
740 decode(new_lost, p);
741 decode(new_blacklist, p);
742 decode(old_blacklist, p);
7c673cae 743 if (ev >= 6)
11fdf7f2 744 decode(new_up_cluster, p);
7c673cae 745 if (ev >= 7)
11fdf7f2 746 decode(cluster_snapshot, p);
7c673cae 747 if (ev >= 8)
11fdf7f2 748 decode(new_uuid, p);
7c673cae 749 if (ev >= 9)
11fdf7f2 750 decode(new_xinfo, p);
7c673cae 751 if (ev >= 10)
11fdf7f2 752 decode(new_hb_front_up, p);
7c673cae
FG
753}
754
11fdf7f2
TL
755/* for a description of osdmap incremental versions, and when they were
756 * introduced, please refer to
757 * doc/dev/osd_internals/osdmap_versions.txt
758 */
759void OSDMap::Incremental::decode(bufferlist::const_iterator& bl)
7c673cae 760{
11fdf7f2 761 using ceph::decode;
7c673cae
FG
762 /**
763 * Older encodings of the Incremental had a single struct_v which
764 * covered the whole encoding, and was prior to our modern
765 * stuff which includes a compatv and a size. So if we see
766 * a struct_v < 7, we must rewind to the beginning and use our
767 * classic decoder.
768 */
769 size_t start_offset = bl.get_off();
770 size_t tail_offset = 0;
771 bufferlist crc_front, crc_tail;
772
773 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
774 if (struct_v < 7) {
11fdf7f2 775 bl.seek(start_offset);
7c673cae
FG
776 decode_classic(bl);
777 encode_features = 0;
778 if (struct_v >= 6)
779 encode_features = CEPH_FEATURE_PGID64;
780 else
781 encode_features = 0;
782 return;
783 }
784 {
11fdf7f2
TL
785 DECODE_START(8, bl); // client-usable data
786 decode(fsid, bl);
787 decode(epoch, bl);
788 decode(modified, bl);
789 decode(new_pool_max, bl);
790 decode(new_flags, bl);
791 decode(fullmap, bl);
792 decode(crush, bl);
793
794 decode(new_max_osd, bl);
795 decode(new_pools, bl);
796 decode(new_pool_names, bl);
797 decode(old_pools, bl);
798 decode(new_up_client, bl);
31f18b77 799 if (struct_v >= 5) {
11fdf7f2 800 decode(new_state, bl);
31f18b77
FG
801 } else {
802 map<int32_t,uint8_t> ns;
11fdf7f2 803 decode(ns, bl);
31f18b77
FG
804 for (auto q : ns) {
805 new_state[q.first] = q.second;
806 }
807 }
11fdf7f2
TL
808 decode(new_weight, bl);
809 decode(new_pg_temp, bl);
810 decode(new_primary_temp, bl);
7c673cae 811 if (struct_v >= 2)
11fdf7f2 812 decode(new_primary_affinity, bl);
7c673cae
FG
813 else
814 new_primary_affinity.clear();
815 if (struct_v >= 3) {
11fdf7f2
TL
816 decode(new_erasure_code_profiles, bl);
817 decode(old_erasure_code_profiles, bl);
7c673cae
FG
818 } else {
819 new_erasure_code_profiles.clear();
820 old_erasure_code_profiles.clear();
821 }
822 if (struct_v >= 4) {
11fdf7f2
TL
823 decode(new_pg_upmap, bl);
824 decode(old_pg_upmap, bl);
825 decode(new_pg_upmap_items, bl);
826 decode(old_pg_upmap_items, bl);
827 }
828 if (struct_v >= 6) {
829 decode(new_removed_snaps, bl);
830 decode(new_purged_snaps, bl);
831 }
832 if (struct_v >= 8) {
833 decode(new_last_up_change, bl);
834 decode(new_last_in_change, bl);
7c673cae
FG
835 }
836 DECODE_FINISH(bl); // client-usable data
837 }
838
839 {
81eedcae 840 DECODE_START(9, bl); // extended, osd-only data
11fdf7f2
TL
841 decode(new_hb_back_up, bl);
842 decode(new_up_thru, bl);
843 decode(new_last_clean_interval, bl);
844 decode(new_lost, bl);
845 decode(new_blacklist, bl);
846 decode(old_blacklist, bl);
847 decode(new_up_cluster, bl);
848 decode(cluster_snapshot, bl);
849 decode(new_uuid, bl);
850 decode(new_xinfo, bl);
851 decode(new_hb_front_up, bl);
7c673cae 852 if (struct_v >= 2)
11fdf7f2 853 decode(encode_features, bl);
7c673cae
FG
854 else
855 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
856 if (struct_v >= 3) {
11fdf7f2
TL
857 decode(new_nearfull_ratio, bl);
858 decode(new_full_ratio, bl);
7c673cae
FG
859 } else {
860 new_nearfull_ratio = -1;
861 new_full_ratio = -1;
862 }
863 if (struct_v >= 4) {
11fdf7f2 864 decode(new_backfillfull_ratio, bl);
7c673cae
FG
865 } else {
866 new_backfillfull_ratio = -1;
867 }
31f18b77
FG
868 if (struct_v == 5) {
869 string r;
11fdf7f2 870 decode(r, bl);
31f18b77
FG
871 if (r.length()) {
872 new_require_min_compat_client = ceph_release_from_name(r.c_str());
873 }
874 }
875 if (struct_v >= 6) {
11fdf7f2
TL
876 decode(new_require_min_compat_client, bl);
877 decode(new_require_osd_release, bl);
31f18b77
FG
878 } else {
879 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
880 // only for compat with post-kraken pre-luminous test clusters
881 new_require_osd_release = CEPH_RELEASE_LUMINOUS;
882 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
883 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
884 new_require_osd_release = CEPH_RELEASE_KRAKEN;
885 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
886 new_require_osd_release = CEPH_RELEASE_JEWEL;
887 } else {
888 new_require_osd_release = -1;
889 }
890 }
81eedcae
TL
891 if (struct_v >= 8) {
892 decode(new_crush_node_flags, bl);
893 }
894 if (struct_v >= 9) {
895 decode(new_device_class_flags, bl);
896 }
7c673cae
FG
897 DECODE_FINISH(bl); // osd-only data
898 }
899
900 if (struct_v >= 8) {
901 have_crc = true;
902 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 903 decode(inc_crc, bl);
7c673cae 904 tail_offset = bl.get_off();
11fdf7f2 905 decode(full_crc, bl);
7c673cae
FG
906 } else {
907 have_crc = false;
908 full_crc = 0;
909 inc_crc = 0;
910 }
911
912 DECODE_FINISH(bl); // wrapper
913
914 if (have_crc) {
915 // verify crc
916 uint32_t actual = crc_front.crc32c(-1);
917 if (tail_offset < bl.get_off()) {
918 bufferlist tail;
919 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
920 actual = tail.crc32c(actual);
921 }
922 if (inc_crc != actual) {
923 ostringstream ss;
924 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
925 string s = ss.str();
926 throw buffer::malformed_input(s.c_str());
927 }
928 }
929}
930
931void OSDMap::Incremental::dump(Formatter *f) const
932{
933 f->dump_int("epoch", epoch);
934 f->dump_stream("fsid") << fsid;
935 f->dump_stream("modified") << modified;
11fdf7f2
TL
936 f->dump_stream("new_last_up_change") << new_last_up_change;
937 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
938 f->dump_int("new_pool_max", new_pool_max);
939 f->dump_int("new_flags", new_flags);
940 f->dump_float("new_full_ratio", new_full_ratio);
941 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
942 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
31f18b77
FG
943 f->dump_int("new_require_min_compat_client", new_require_min_compat_client);
944 f->dump_int("new_require_osd_release", new_require_osd_release);
7c673cae
FG
945
946 if (fullmap.length()) {
947 f->open_object_section("full_map");
948 OSDMap full;
949 bufferlist fbl = fullmap; // kludge around constness.
11fdf7f2 950 auto p = fbl.cbegin();
7c673cae
FG
951 full.decode(p);
952 full.dump(f);
953 f->close_section();
954 }
955 if (crush.length()) {
956 f->open_object_section("crush");
957 CrushWrapper c;
958 bufferlist tbl = crush; // kludge around constness.
11fdf7f2 959 auto p = tbl.cbegin();
7c673cae
FG
960 c.decode(p);
961 c.dump(f);
962 f->close_section();
963 }
964
965 f->dump_int("new_max_osd", new_max_osd);
966
967 f->open_array_section("new_pools");
968
969 for (const auto &new_pool : new_pools) {
970 f->open_object_section("pool");
971 f->dump_int("pool", new_pool.first);
972 new_pool.second.dump(f);
973 f->close_section();
974 }
975 f->close_section();
976 f->open_array_section("new_pool_names");
977
978 for (const auto &new_pool_name : new_pool_names) {
979 f->open_object_section("pool_name");
980 f->dump_int("pool", new_pool_name.first);
981 f->dump_string("name", new_pool_name.second);
982 f->close_section();
983 }
984 f->close_section();
985 f->open_array_section("old_pools");
986
987 for (const auto &old_pool : old_pools)
988 f->dump_int("pool", old_pool);
989 f->close_section();
990
991 f->open_array_section("new_up_osds");
992
993 for (const auto &upclient : new_up_client) {
994 f->open_object_section("osd");
995 f->dump_int("osd", upclient.first);
11fdf7f2
TL
996 f->dump_stream("public_addr") << upclient.second.legacy_addr();
997 f->dump_object("public_addrs", upclient.second);
998 if (auto p = new_up_cluster.find(upclient.first);
999 p != new_up_cluster.end()) {
1000 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1001 f->dump_object("cluster_addrs", p->second);
1002 }
1003 if (auto p = new_hb_back_up.find(upclient.first);
1004 p != new_hb_back_up.end()) {
1005 f->dump_object("heartbeat_back_addrs", p->second);
1006 }
1007 if (auto p = new_hb_front_up.find(upclient.first);
1008 p != new_hb_front_up.end()) {
1009 f->dump_object("heartbeat_front_addrs", p->second);
1010 }
7c673cae
FG
1011 f->close_section();
1012 }
1013 f->close_section();
1014
1015 f->open_array_section("new_weight");
1016
1017 for (const auto &weight : new_weight) {
1018 f->open_object_section("osd");
1019 f->dump_int("osd", weight.first);
1020 f->dump_int("weight", weight.second);
1021 f->close_section();
1022 }
1023 f->close_section();
1024
1025 f->open_array_section("osd_state_xor");
1026 for (const auto &ns : new_state) {
1027 f->open_object_section("osd");
1028 f->dump_int("osd", ns.first);
1029 set<string> st;
1030 calc_state_set(new_state.find(ns.first)->second, st);
1031 f->open_array_section("state_xor");
1032 for (auto &state : st)
1033 f->dump_string("state", state);
1034 f->close_section();
c07f9fc5 1035 f->close_section();
7c673cae
FG
1036 }
1037 f->close_section();
1038
1039 f->open_array_section("new_pg_temp");
1040
1041 for (const auto &pg_temp : new_pg_temp) {
1042 f->open_object_section("pg");
1043 f->dump_stream("pgid") << pg_temp.first;
1044 f->open_array_section("osds");
1045
1046 for (const auto &osd : pg_temp.second)
1047 f->dump_int("osd", osd);
1048 f->close_section();
1049 f->close_section();
1050 }
1051 f->close_section();
1052
1053 f->open_array_section("primary_temp");
1054
1055 for (const auto &primary_temp : new_primary_temp) {
1056 f->dump_stream("pgid") << primary_temp.first;
1057 f->dump_int("osd", primary_temp.second);
1058 }
1059 f->close_section(); // primary_temp
1060
1061 f->open_array_section("new_pg_upmap");
1062 for (auto& i : new_pg_upmap) {
1063 f->open_object_section("mapping");
1064 f->dump_stream("pgid") << i.first;
1065 f->open_array_section("osds");
1066 for (auto osd : i.second) {
1067 f->dump_int("osd", osd);
1068 }
1069 f->close_section();
1070 f->close_section();
1071 }
1072 f->close_section();
1073 f->open_array_section("old_pg_upmap");
1074 for (auto& i : old_pg_upmap) {
1075 f->dump_stream("pgid") << i;
1076 }
1077 f->close_section();
1078
1079 f->open_array_section("new_pg_upmap_items");
1080 for (auto& i : new_pg_upmap_items) {
1081 f->open_object_section("mapping");
1082 f->dump_stream("pgid") << i.first;
1083 f->open_array_section("mappings");
1084 for (auto& p : i.second) {
1085 f->open_object_section("mapping");
1086 f->dump_int("from", p.first);
1087 f->dump_int("to", p.second);
1088 f->close_section();
1089 }
1090 f->close_section();
1091 f->close_section();
1092 }
1093 f->close_section();
1094 f->open_array_section("old_pg_upmap_items");
1095 for (auto& i : old_pg_upmap_items) {
1096 f->dump_stream("pgid") << i;
1097 }
1098 f->close_section();
1099
1100 f->open_array_section("new_up_thru");
1101
1102 for (const auto &up_thru : new_up_thru) {
1103 f->open_object_section("osd");
1104 f->dump_int("osd", up_thru.first);
1105 f->dump_int("up_thru", up_thru.second);
1106 f->close_section();
1107 }
1108 f->close_section();
1109
1110 f->open_array_section("new_lost");
1111
1112 for (const auto &lost : new_lost) {
1113 f->open_object_section("osd");
1114 f->dump_int("osd", lost.first);
1115 f->dump_int("epoch_lost", lost.second);
1116 f->close_section();
1117 }
1118 f->close_section();
1119
1120 f->open_array_section("new_last_clean_interval");
1121
1122 for (const auto &last_clean_interval : new_last_clean_interval) {
1123 f->open_object_section("osd");
1124 f->dump_int("osd", last_clean_interval.first);
1125 f->dump_int("first", last_clean_interval.second.first);
1126 f->dump_int("last", last_clean_interval.second.second);
1127 f->close_section();
1128 }
1129 f->close_section();
1130
1131 f->open_array_section("new_blacklist");
1132 for (const auto &blist : new_blacklist) {
1133 stringstream ss;
1134 ss << blist.first;
1135 f->dump_stream(ss.str().c_str()) << blist.second;
1136 }
1137 f->close_section();
1138 f->open_array_section("old_blacklist");
1139 for (const auto &blist : old_blacklist)
1140 f->dump_stream("addr") << blist;
1141 f->close_section();
1142
1143 f->open_array_section("new_xinfo");
1144 for (const auto &xinfo : new_xinfo) {
1145 f->open_object_section("xinfo");
1146 f->dump_int("osd", xinfo.first);
1147 xinfo.second.dump(f);
1148 f->close_section();
1149 }
1150 f->close_section();
1151
1152 if (cluster_snapshot.size())
1153 f->dump_string("cluster_snapshot", cluster_snapshot);
1154
1155 f->open_array_section("new_uuid");
1156 for (const auto &uuid : new_uuid) {
1157 f->open_object_section("osd");
1158 f->dump_int("osd", uuid.first);
1159 f->dump_stream("uuid") << uuid.second;
1160 f->close_section();
1161 }
1162 f->close_section();
1163
1164 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1165 f->open_array_section("old_erasure_code_profiles");
1166 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1167 f->dump_string("old", erasure_code_profile.c_str());
1168 }
1169 f->close_section();
11fdf7f2
TL
1170
1171 f->open_array_section("new_removed_snaps");
1172 for (auto& p : new_removed_snaps) {
1173 f->open_object_section("pool");
1174 f->dump_int("pool", p.first);
1175 f->open_array_section("snaps");
1176 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1177 f->open_object_section("interval");
1178 f->dump_unsigned("begin", q.get_start());
1179 f->dump_unsigned("length", q.get_len());
1180 f->close_section();
1181 }
1182 f->close_section();
1183 f->close_section();
1184 }
1185 f->close_section();
1186 f->open_array_section("new_purged_snaps");
1187 for (auto& p : new_purged_snaps) {
1188 f->open_object_section("pool");
1189 f->dump_int("pool", p.first);
1190 f->open_array_section("snaps");
1191 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1192 f->open_object_section("interval");
1193 f->dump_unsigned("begin", q.get_start());
1194 f->dump_unsigned("length", q.get_len());
1195 f->close_section();
1196 }
1197 f->close_section();
1198 f->close_section();
1199 }
81eedcae
TL
1200 f->open_array_section("new_crush_node_flags");
1201 for (auto& i : new_crush_node_flags) {
1202 f->open_object_section("node");
1203 f->dump_int("id", i.first);
1204 set<string> st;
1205 calc_state_set(i.second, st);
1206 for (auto& j : st) {
1207 f->dump_string("flag", j);
1208 }
1209 f->close_section();
1210 }
1211 f->close_section();
1212 f->open_array_section("new_device_class_flags");
1213 for (auto& i : new_device_class_flags) {
1214 f->open_object_section("device_class");
1215 f->dump_int("id", i.first);
1216 set<string> st;
1217 calc_state_set(i.second, st);
1218 for (auto& j : st) {
1219 f->dump_string("flag", j);
1220 }
1221 f->close_section();
1222 }
1223 f->close_section();
11fdf7f2 1224 f->close_section();
7c673cae
FG
1225}
1226
1227void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1228{
1229 o.push_back(new Incremental);
1230}
1231
1232// ----------------------------------
1233// OSDMap
1234
1235void OSDMap::set_epoch(epoch_t e)
1236{
1237 epoch = e;
1238 for (auto &pool : pools)
1239 pool.second.last_change = e;
1240}
1241
11fdf7f2 1242bool OSDMap::is_blacklisted(const entity_addr_t& orig) const
7c673cae 1243{
11fdf7f2 1244 if (blacklist.empty()) {
7c673cae 1245 return false;
11fdf7f2
TL
1246 }
1247
1248 // all blacklist entries are type ANY for nautilus+
1249 // FIXME: avoid this copy!
1250 entity_addr_t a = orig;
1251 if (require_osd_release < CEPH_RELEASE_NAUTILUS) {
1252 a.set_type(entity_addr_t::TYPE_LEGACY);
1253 } else {
1254 a.set_type(entity_addr_t::TYPE_ANY);
1255 }
7c673cae
FG
1256
1257 // this specific instance?
11fdf7f2 1258 if (blacklist.count(a)) {
7c673cae 1259 return true;
11fdf7f2 1260 }
7c673cae
FG
1261
1262 // is entire ip blacklisted?
1263 if (a.is_ip()) {
11fdf7f2
TL
1264 a.set_port(0);
1265 a.set_nonce(0);
1266 if (blacklist.count(a)) {
1267 return true;
1268 }
1269 }
1270
1271 return false;
1272}
1273
1274bool OSDMap::is_blacklisted(const entity_addrvec_t& av) const
1275{
1276 if (blacklist.empty())
1277 return false;
1278
1279 for (auto& a : av.v) {
1280 if (is_blacklisted(a)) {
7c673cae
FG
1281 return true;
1282 }
1283 }
1284
1285 return false;
1286}
1287
1288void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1289{
1290 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1291}
1292
31f18b77
FG
1293void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1294{
1295 for (const auto &i : blacklist) {
1296 bl->insert(i.first);
1297 }
1298}
1299
7c673cae
FG
1300void OSDMap::set_max_osd(int m)
1301{
1302 int o = max_osd;
1303 max_osd = m;
1304 osd_state.resize(m);
1305 osd_weight.resize(m);
1306 for (; o<max_osd; o++) {
1307 osd_state[o] = 0;
1308 osd_weight[o] = CEPH_OSD_OUT;
1309 }
1310 osd_info.resize(m);
1311 osd_xinfo.resize(m);
11fdf7f2
TL
1312 osd_addrs->client_addrs.resize(m);
1313 osd_addrs->cluster_addrs.resize(m);
1314 osd_addrs->hb_back_addrs.resize(m);
1315 osd_addrs->hb_front_addrs.resize(m);
7c673cae
FG
1316 osd_uuid->resize(m);
1317 if (osd_primary_affinity)
1318 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1319
1320 calc_num_osds();
1321}
1322
1323int OSDMap::calc_num_osds()
1324{
1325 num_osd = 0;
1326 num_up_osd = 0;
1327 num_in_osd = 0;
1328 for (int i=0; i<max_osd; i++) {
1329 if (osd_state[i] & CEPH_OSD_EXISTS) {
1330 ++num_osd;
1331 if (osd_state[i] & CEPH_OSD_UP) {
1332 ++num_up_osd;
1333 }
1334 if (get_weight(i) != CEPH_OSD_OUT) {
1335 ++num_in_osd;
1336 }
1337 }
1338 }
1339 return num_osd;
1340}
1341
3efd9988
FG
1342void OSDMap::get_full_pools(CephContext *cct,
1343 set<int64_t> *full,
1344 set<int64_t> *backfillfull,
1345 set<int64_t> *nearfull) const
7c673cae 1346{
11fdf7f2
TL
1347 ceph_assert(full);
1348 ceph_assert(backfillfull);
1349 ceph_assert(nearfull);
3efd9988
FG
1350 full->clear();
1351 backfillfull->clear();
1352 nearfull->clear();
1353
1354 vector<int> full_osds;
1355 vector<int> backfillfull_osds;
1356 vector<int> nearfull_osds;
7c673cae
FG
1357 for (int i = 0; i < max_osd; ++i) {
1358 if (exists(i) && is_up(i) && is_in(i)) {
1359 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1360 full_osds.push_back(i);
7c673cae 1361 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1362 backfillfull_osds.push_back(i);
7c673cae 1363 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1364 nearfull_osds.push_back(i);
7c673cae
FG
1365 }
1366 }
3efd9988
FG
1367
1368 for (auto i: full_osds) {
1369 get_pool_ids_by_osd(cct, i, full);
1370 }
1371 for (auto i: backfillfull_osds) {
1372 get_pool_ids_by_osd(cct, i, backfillfull);
1373 }
1374 for (auto i: nearfull_osds) {
1375 get_pool_ids_by_osd(cct, i, nearfull);
1376 }
7c673cae
FG
1377}
1378
31f18b77
FG
1379void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1380 set<int> *nearfull) const
1381{
1382 full->clear();
1383 backfill->clear();
1384 nearfull->clear();
1385 for (int i = 0; i < max_osd; ++i) {
1386 if (exists(i) && is_up(i) && is_in(i)) {
1387 if (osd_state[i] & CEPH_OSD_FULL)
1388 full->emplace(i);
1389 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1390 backfill->emplace(i);
1391 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1392 nearfull->emplace(i);
1393 }
1394 }
1395}
1396
7c673cae
FG
1397void OSDMap::get_all_osds(set<int32_t>& ls) const
1398{
1399 for (int i=0; i<max_osd; i++)
1400 if (exists(i))
1401 ls.insert(i);
1402}
1403
1404void OSDMap::get_up_osds(set<int32_t>& ls) const
1405{
1406 for (int i = 0; i < max_osd; i++) {
1407 if (is_up(i))
1408 ls.insert(i);
1409 }
1410}
1411
81eedcae 1412void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1413{
1414 for (int i = 0; i < max_osd; i++) {
81eedcae 1415 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1416 ls.insert(i);
1417 }
1418}
1419
11fdf7f2
TL
1420void OSDMap::get_flag_set(set<string> *flagset) const
1421{
1422 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1423 if (flags & (1<<i)) {
1424 flagset->insert(get_flag_string(flags & (1<<i)));
1425 }
1426 }
1427}
1428
7c673cae
FG
1429void OSDMap::calc_state_set(int state, set<string>& st)
1430{
1431 unsigned t = state;
1432 for (unsigned s = 1; t; s <<= 1) {
1433 if (t & s) {
1434 t &= ~s;
1435 st.insert(ceph_osd_state_name(s));
1436 }
1437 }
1438}
1439
1440void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1441{
1442 float max = 0;
1443 for (const auto &weight : weights) {
1444 if (weight.second > max)
1445 max = weight.second;
1446 }
1447
1448 for (const auto &weight : weights) {
1449 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1450 }
1451}
1452
1453int OSDMap::identify_osd(const entity_addr_t& addr) const
1454{
1455 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1456 if (exists(i) && (get_addrs(i).contains(addr) ||
1457 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1458 return i;
1459 return -1;
1460}
1461
1462int OSDMap::identify_osd(const uuid_d& u) const
1463{
1464 for (int i=0; i<max_osd; i++)
1465 if (exists(i) && get_uuid(i) == u)
1466 return i;
1467 return -1;
1468}
1469
1470int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1471{
1472 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1473 if (exists(i) && (get_addrs(i).contains(addr) ||
1474 get_cluster_addrs(i).contains(addr) ||
1475 get_hb_back_addrs(i).contains(addr) ||
1476 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1477 return i;
1478 return -1;
1479}
1480
1481int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1482{
1483 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1484 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1485 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1486 return i;
1487 return -1;
1488}
1489
1490
1491uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1492{
1493 uint64_t features = 0; // things we actually have
1494 uint64_t mask = 0; // things we could have
1495
1496 if (crush->has_nondefault_tunables())
1497 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1498 if (crush->has_nondefault_tunables2())
1499 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1500 if (crush->has_nondefault_tunables3())
1501 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1502 if (crush->has_v4_buckets())
1503 features |= CEPH_FEATURE_CRUSH_V4;
1504 if (crush->has_nondefault_tunables5())
1505 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1506 if (crush->has_incompat_choose_args()) {
1507 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1508 }
7c673cae
FG
1509 mask |= CEPH_FEATURES_CRUSH;
1510
1511 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1512 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1513 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1514
1515 for (auto &pool: pools) {
1516 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1517 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1518 }
7c673cae
FG
1519 if (!pool.second.tiers.empty() ||
1520 pool.second.is_tier()) {
1521 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1522 }
31f18b77 1523 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
7c673cae
FG
1524 pool.second.get_type(),
1525 pool.second.get_size());
1526 if (ruleid >= 0) {
1527 if (crush->is_v2_rule(ruleid))
1528 features |= CEPH_FEATURE_CRUSH_V2;
1529 if (crush->is_v3_rule(ruleid))
1530 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1531 if (crush->is_v5_rule(ruleid))
1532 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1533 }
1534 }
7c673cae 1535 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1536
1537 if (osd_primary_affinity) {
1538 for (int i = 0; i < max_osd; ++i) {
1539 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1540 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1541 break;
1542 }
1543 }
1544 }
1545 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1546
1547 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1548 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
31f18b77 1549 if (require_osd_release >= CEPH_RELEASE_JEWEL) {
7c673cae
FG
1550 features |= jewel_features;
1551 }
1552 mask |= jewel_features;
1553
1554 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1555 | CEPH_FEATURE_MSG_ADDR2;
31f18b77 1556 if (require_osd_release >= CEPH_RELEASE_KRAKEN) {
7c673cae
FG
1557 features |= kraken_features;
1558 }
1559 mask |= kraken_features;
1560 }
1561
11fdf7f2
TL
1562 if (require_min_compat_client >= CEPH_RELEASE_NAUTILUS) {
1563 // if min_compat_client is >= nautilus, require v2 cephx signatures
1564 // from everyone
1565 features |= CEPH_FEATUREMASK_CEPHX_V2;
1566 } else if (require_osd_release >= CEPH_RELEASE_NAUTILUS &&
1567 entity_type == CEPH_ENTITY_TYPE_OSD) {
1568 // if osds are >= nautilus, at least require the signatures from them
1569 features |= CEPH_FEATUREMASK_CEPHX_V2;
1570 }
1571 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1572
7c673cae
FG
1573 if (pmask)
1574 *pmask = mask;
1575 return features;
1576}
1577
31f18b77 1578uint8_t OSDMap::get_min_compat_client() const
7c673cae
FG
1579{
1580 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1581
1582 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77
FG
1583 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1584 return CEPH_RELEASE_LUMINOUS; // v12.2.0
7c673cae
FG
1585 }
1586 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
31f18b77 1587 return CEPH_RELEASE_JEWEL; // v10.2.0
7c673cae
FG
1588 }
1589 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
31f18b77 1590 return CEPH_RELEASE_HAMMER; // v0.94.0
7c673cae
FG
1591 }
1592 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1593 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1594 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
31f18b77 1595 return CEPH_RELEASE_FIREFLY; // v0.80.0
7c673cae
FG
1596 }
1597 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1598 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
31f18b77 1599 return CEPH_RELEASE_DUMPLING; // v0.67.0
7c673cae
FG
1600 }
1601 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
31f18b77 1602 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
7c673cae 1603 }
31f18b77 1604 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
7c673cae
FG
1605}
1606
11fdf7f2
TL
1607uint8_t OSDMap::get_require_min_compat_client() const
1608{
1609 return require_min_compat_client;
1610}
1611
7c673cae
FG
1612void OSDMap::_calc_up_osd_features()
1613{
1614 bool first = true;
1615 cached_up_osd_features = 0;
1616 for (int osd = 0; osd < max_osd; ++osd) {
1617 if (!is_up(osd))
1618 continue;
1619 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1620 if (xi.features == 0)
1621 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1622 if (first) {
1623 cached_up_osd_features = xi.features;
1624 first = false;
1625 } else {
1626 cached_up_osd_features &= xi.features;
1627 }
1628 }
1629}
1630
1631uint64_t OSDMap::get_up_osd_features() const
1632{
1633 return cached_up_osd_features;
1634}
1635
1636void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1637{
11fdf7f2 1638 using ceph::encode;
7c673cae
FG
1639 if (o->epoch == n->epoch)
1640 return;
1641
1642 int diff = 0;
1643
1644 // do addrs match?
1645 if (o->max_osd != n->max_osd)
1646 diff++;
1647 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1648 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1649 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1650 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1651 else
1652 diff++;
11fdf7f2
TL
1653 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1654 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1655 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1656 else
1657 diff++;
11fdf7f2
TL
1658 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1659 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1660 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1661 else
1662 diff++;
11fdf7f2
TL
1663 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1664 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1665 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1666 else
1667 diff++;
1668 }
1669 if (diff == 0) {
1670 // zoinks, no differences at all!
1671 n->osd_addrs = o->osd_addrs;
1672 }
1673
1674 // does crush match?
1675 bufferlist oc, nc;
11fdf7f2
TL
1676 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1677 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1678 if (oc.contents_equal(nc)) {
1679 n->crush = o->crush;
1680 }
1681
1682 // does pg_temp match?
31f18b77
FG
1683 if (*o->pg_temp == *n->pg_temp)
1684 n->pg_temp = o->pg_temp;
7c673cae
FG
1685
1686 // does primary_temp match?
1687 if (o->primary_temp->size() == n->primary_temp->size()) {
1688 if (*o->primary_temp == *n->primary_temp)
1689 n->primary_temp = o->primary_temp;
1690 }
1691
1692 // do uuids match?
1693 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1694 *o->osd_uuid == *n->osd_uuid)
1695 n->osd_uuid = o->osd_uuid;
1696}
1697
1698void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1699 const OSDMap& oldmap,
1700 const OSDMap& nextmap,
1701 Incremental *pending_inc)
7c673cae
FG
1702{
1703 ldout(cct, 10) << __func__ << dendl;
7c673cae 1704
11fdf7f2 1705 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1706 // if pool does not exist, remove any existing pg_temps associated with
1707 // it. we don't care about pg_temps on the pending_inc either; if there
1708 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1709 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1710 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1711 << " for nonexistent pool " << pg.first.pool() << dendl;
1712 pending_inc->new_pg_temp[pg.first].clear();
1713 continue;
1714 }
1715 // all osds down?
1716 unsigned num_up = 0;
1717 for (auto o : pg.second) {
11fdf7f2 1718 if (!nextmap.is_down(o)) {
7c673cae
FG
1719 ++num_up;
1720 break;
1721 }
1722 }
1723 if (num_up == 0) {
1724 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1725 << " with all down osds" << pg.second << dendl;
1726 pending_inc->new_pg_temp[pg.first].clear();
1727 continue;
1728 }
1729 // redundant pg_temp?
1730 vector<int> raw_up;
1731 int primary;
11fdf7f2 1732 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1733 bool remove = false;
11fdf7f2 1734 if (raw_up == pg.second) {
7c673cae
FG
1735 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1736 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1737 remove = true;
1738 }
1739 // oversized pg_temp?
11fdf7f2 1740 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1741 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1742 << pg.second << " exceeds pool size" << dendl;
1743 remove = true;
1744 }
1745 if (remove) {
11fdf7f2 1746 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1747 pending_inc->new_pg_temp[pg.first].clear();
1748 else
1749 pending_inc->new_pg_temp.erase(pg.first);
1750 }
1751 }
1752
11fdf7f2 1753 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1754 // primary down?
11fdf7f2 1755 if (nextmap.is_down(pg.second)) {
7c673cae
FG
1756 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1757 << " to down " << pg.second << dendl;
1758 pending_inc->new_primary_temp[pg.first] = -1;
1759 continue;
1760 }
1761 // redundant primary_temp?
1762 vector<int> real_up, templess_up;
1763 int real_primary, templess_primary;
1764 pg_t pgid = pg.first;
11fdf7f2
TL
1765 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1766 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
1767 if (real_primary == templess_primary){
1768 ldout(cct, 10) << __func__ << " removing primary_temp "
1769 << pgid << " -> " << real_primary
1770 << " (unnecessary/redundant)" << dendl;
11fdf7f2 1771 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
1772 pending_inc->new_primary_temp[pgid] = -1;
1773 else
1774 pending_inc->new_primary_temp.erase(pgid);
1775 }
1776 }
1777}
1778
494da23a 1779void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 1780{
494da23a
TL
1781 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1782 for (auto& p : pg_upmap)
1783 upmap_pgs->push_back(p.first);
1784 for (auto& p : pg_upmap_items)
1785 upmap_pgs->push_back(p.first);
1786}
94b18763 1787
494da23a
TL
1788bool OSDMap::check_pg_upmaps(
1789 CephContext *cct,
1790 const vector<pg_t>& to_check,
1791 vector<pg_t> *to_cancel,
1792 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
1793{
1794 bool any_change = false;
1795 map<int, map<int, float>> rule_weight_map;
28e407b8 1796 for (auto& pg : to_check) {
494da23a 1797 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
1798 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
1799 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
1800 << dendl;
494da23a 1801 to_cancel->push_back(pg);
11fdf7f2
TL
1802 continue;
1803 }
1804 if (pi->is_pending_merge(pg, nullptr)) {
1805 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
1806 << dendl;
494da23a 1807 to_cancel->push_back(pg);
94b18763
FG
1808 continue;
1809 }
494da23a
TL
1810 vector<int> raw, up;
1811 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
1812 auto crush_rule = get_pg_pool_crush_rule(pg);
1813 auto r = crush->verify_upmap(cct,
1814 crush_rule,
1815 get_pg_pool_size(pg),
1816 up);
a8e16298
TL
1817 if (r < 0) {
1818 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
1819 << " returning " << r
1820 << dendl;
494da23a 1821 to_cancel->push_back(pg);
a8e16298
TL
1822 continue;
1823 }
1824 // below we check against crush-topology changing..
28e407b8
AA
1825 map<int, float> weight_map;
1826 auto it = rule_weight_map.find(crush_rule);
1827 if (it == rule_weight_map.end()) {
494da23a 1828 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
1829 if (r < 0) {
1830 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
1831 << "crush_rule " << crush_rule
1832 << dendl;
28e407b8
AA
1833 continue;
1834 }
1835 rule_weight_map[crush_rule] = weight_map;
1836 } else {
1837 weight_map = it->second;
1838 }
28e407b8 1839 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 1840 << " weight_map " << weight_map
94b18763 1841 << dendl;
a8e16298 1842 for (auto osd : up) {
28e407b8
AA
1843 auto it = weight_map.find(osd);
1844 if (it == weight_map.end()) {
1845 // osd is gone or has been moved out of the specific crush-tree
494da23a 1846 to_cancel->push_back(pg);
94b18763
FG
1847 break;
1848 }
494da23a 1849 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8
AA
1850 if (adjusted_weight == 0) {
1851 // osd is out/crush-out
494da23a 1852 to_cancel->push_back(pg);
94b18763
FG
1853 break;
1854 }
1855 }
eafe8130
TL
1856 if (!to_cancel->empty() && to_cancel->back() == pg)
1857 continue;
1858 // okay, upmap is valid
1859 // continue to check if it is still necessary
1860 auto i = pg_upmap.find(pg);
1861 if (i != pg_upmap.end() && raw == i->second) {
1862 ldout(cct, 10) << " removing redundant pg_upmap "
1863 << i->first << " " << i->second
1864 << dendl;
1865 to_cancel->push_back(pg);
1866 continue;
1867 }
1868 auto j = pg_upmap_items.find(pg);
1869 if (j != pg_upmap_items.end()) {
1870 mempool::osdmap::vector<pair<int,int>> newmap;
1871 for (auto& p : j->second) {
1872 if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
1873 // cancel mapping if source osd does not exist anymore
1874 continue;
1875 }
1876 if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
1877 p.second >= 0 && osd_weight[p.second] == 0) {
1878 // cancel mapping if target osd is out
1879 continue;
1880 }
1881 newmap.push_back(p);
1882 }
1883 if (newmap.empty()) {
1884 ldout(cct, 10) << " removing no-op pg_upmap_items "
1885 << j->first << " " << j->second
1886 << dendl;
1887 to_cancel->push_back(pg);
1888 } else if (newmap != j->second) {
1889 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
1890 << j->first << " " << j->second
1891 << " -> " << newmap
1892 << dendl;
1893 to_remap->insert({pg, newmap});
1894 any_change = true;
1895 }
1896 }
28e407b8 1897 }
494da23a
TL
1898 any_change = any_change || !to_cancel->empty();
1899 return any_change;
1900}
1901
1902void OSDMap::clean_pg_upmaps(
1903 CephContext *cct,
1904 Incremental *pending_inc,
1905 const vector<pg_t>& to_cancel,
1906 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
1907{
28e407b8 1908 for (auto &pg: to_cancel) {
494da23a
TL
1909 auto i = pending_inc->new_pg_upmap.find(pg);
1910 if (i != pending_inc->new_pg_upmap.end()) {
1911 ldout(cct, 10) << __func__ << " cancel invalid pending "
1912 << "pg_upmap entry "
1913 << i->first << "->" << i->second
1914 << dendl;
1915 pending_inc->new_pg_upmap.erase(i);
94b18763 1916 }
494da23a
TL
1917 auto j = pg_upmap.find(pg);
1918 if (j != pg_upmap.end()) {
1919 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
1920 << j->first << "->" << j->second
1921 << dendl;
1922 pending_inc->old_pg_upmap.insert(pg);
1923 }
1924 auto p = pending_inc->new_pg_upmap_items.find(pg);
1925 if (p != pending_inc->new_pg_upmap_items.end()) {
1926 ldout(cct, 10) << __func__ << " cancel invalid pending "
1927 << "pg_upmap_items entry "
1928 << p->first << "->" << p->second
1929 << dendl;
1930 pending_inc->new_pg_upmap_items.erase(p);
1931 }
1932 auto q = pg_upmap_items.find(pg);
1933 if (q != pg_upmap_items.end()) {
1934 ldout(cct, 10) << __func__ << " cancel invalid "
1935 << "pg_upmap_items entry "
1936 << q->first << "->" << q->second
1937 << dendl;
1938 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
1939 }
1940 }
494da23a
TL
1941 for (auto& i : to_remap)
1942 pending_inc->new_pg_upmap_items[i.first] = i.second;
1943}
1944
1945bool OSDMap::clean_pg_upmaps(
1946 CephContext *cct,
1947 Incremental *pending_inc) const
1948{
1949 ldout(cct, 10) << __func__ << dendl;
1950 vector<pg_t> to_check;
1951 vector<pg_t> to_cancel;
1952 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
1953
1954 get_upmap_pgs(&to_check);
1955 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
1956 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
1957 return any_change;
94b18763
FG
1958}
1959
7c673cae
FG
1960int OSDMap::apply_incremental(const Incremental &inc)
1961{
1962 new_blacklist_entries = false;
1963 if (inc.epoch == 1)
1964 fsid = inc.fsid;
1965 else if (inc.fsid != fsid)
1966 return -EINVAL;
1967
11fdf7f2 1968 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
1969
1970 epoch++;
1971 modified = inc.modified;
1972
1973 // full map?
1974 if (inc.fullmap.length()) {
1975 bufferlist bl(inc.fullmap);
1976 decode(bl);
1977 return 0;
1978 }
1979
1980 // nope, incremental.
31f18b77 1981 if (inc.new_flags >= 0) {
7c673cae 1982 flags = inc.new_flags;
31f18b77
FG
1983 // the below is just to cover a newly-upgraded luminous mon
1984 // cluster that has to set require_jewel_osds or
1985 // require_kraken_osds before the osds can be upgraded to
1986 // luminous.
1987 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
1988 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
1989 require_osd_release = CEPH_RELEASE_KRAKEN;
1990 }
1991 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
1992 if (require_osd_release < CEPH_RELEASE_JEWEL) {
1993 require_osd_release = CEPH_RELEASE_JEWEL;
1994 }
1995 }
1996 }
7c673cae
FG
1997
1998 if (inc.new_max_osd >= 0)
1999 set_max_osd(inc.new_max_osd);
2000
2001 if (inc.new_pool_max != -1)
2002 pool_max = inc.new_pool_max;
2003
2004 for (const auto &pool : inc.new_pools) {
2005 pools[pool.first] = pool.second;
2006 pools[pool.first].last_change = epoch;
2007 }
2008
11fdf7f2
TL
2009 new_removed_snaps = inc.new_removed_snaps;
2010 new_purged_snaps = inc.new_purged_snaps;
2011 for (auto p = new_removed_snaps.begin();
2012 p != new_removed_snaps.end();
2013 ++p) {
2014 removed_snaps_queue[p->first].union_of(p->second);
2015 }
2016 for (auto p = new_purged_snaps.begin();
2017 p != new_purged_snaps.end();
2018 ++p) {
2019 auto q = removed_snaps_queue.find(p->first);
2020 ceph_assert(q != removed_snaps_queue.end());
2021 q->second.subtract(p->second);
2022 if (q->second.empty()) {
2023 removed_snaps_queue.erase(q);
2024 }
2025 }
2026
2027 if (inc.new_last_up_change != utime_t()) {
2028 last_up_change = inc.new_last_up_change;
2029 }
2030 if (inc.new_last_in_change != utime_t()) {
2031 last_in_change = inc.new_last_in_change;
2032 }
2033
7c673cae
FG
2034 for (const auto &pname : inc.new_pool_names) {
2035 auto pool_name_entry = pool_name.find(pname.first);
2036 if (pool_name_entry != pool_name.end()) {
2037 name_pool.erase(pool_name_entry->second);
2038 pool_name_entry->second = pname.second;
2039 } else {
2040 pool_name[pname.first] = pname.second;
2041 }
2042 name_pool[pname.second] = pname.first;
2043 }
2044
2045 for (const auto &pool : inc.old_pools) {
2046 pools.erase(pool);
2047 name_pool.erase(pool_name[pool]);
2048 pool_name.erase(pool);
2049 }
2050
2051 for (const auto &weight : inc.new_weight) {
2052 set_weight(weight.first, weight.second);
2053
2054 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2055 // xinfo old_weight.
2056 if (weight.second) {
2057 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2058 osd_xinfo[weight.first].old_weight = 0;
2059 }
2060 }
2061
2062 for (const auto &primary_affinity : inc.new_primary_affinity) {
2063 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2064 }
2065
2066 // erasure_code_profiles
2067 for (const auto &profile : inc.old_erasure_code_profiles)
2068 erasure_code_profiles.erase(profile);
2069
2070 for (const auto &profile : inc.new_erasure_code_profiles) {
2071 set_erasure_code_profile(profile.first, profile.second);
2072 }
2073
2074 // up/down
2075 for (const auto &state : inc.new_state) {
2076 const auto osd = state.first;
2077 int s = state.second ? state.second : CEPH_OSD_UP;
2078 if ((osd_state[osd] & CEPH_OSD_UP) &&
2079 (s & CEPH_OSD_UP)) {
2080 osd_info[osd].down_at = epoch;
2081 osd_xinfo[osd].down_stamp = modified;
2082 }
2083 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2084 (s & CEPH_OSD_EXISTS)) {
2085 // osd is destroyed; clear out anything interesting.
2086 (*osd_uuid)[osd] = uuid_d();
2087 osd_info[osd] = osd_info_t();
2088 osd_xinfo[osd] = osd_xinfo_t();
2089 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2090 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2091 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2092 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2093 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2094 osd_state[osd] = 0;
2095 } else {
2096 osd_state[osd] ^= s;
2097 }
2098 }
2099
2100 for (const auto &client : inc.new_up_client) {
2101 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
11fdf7f2
TL
2102 osd_addrs->client_addrs[client.first].reset(
2103 new entity_addrvec_t(client.second));
2104 osd_addrs->hb_back_addrs[client.first].reset(
2105 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2106 osd_addrs->hb_front_addrs[client.first].reset(
2107 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2108
2109 osd_info[client.first].up_from = epoch;
2110 }
2111
2112 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2113 osd_addrs->cluster_addrs[cluster.first].reset(
2114 new entity_addrvec_t(cluster.second));
7c673cae
FG
2115
2116 // info
2117 for (const auto &thru : inc.new_up_thru)
2118 osd_info[thru.first].up_thru = thru.second;
2119
2120 for (const auto &interval : inc.new_last_clean_interval) {
2121 osd_info[interval.first].last_clean_begin = interval.second.first;
2122 osd_info[interval.first].last_clean_end = interval.second.second;
2123 }
2124
2125 for (const auto &lost : inc.new_lost)
2126 osd_info[lost.first].lost_at = lost.second;
2127
2128 // xinfo
2129 for (const auto &xinfo : inc.new_xinfo)
2130 osd_xinfo[xinfo.first] = xinfo.second;
2131
2132 // uuid
2133 for (const auto &uuid : inc.new_uuid)
2134 (*osd_uuid)[uuid.first] = uuid.second;
2135
2136 // pg rebuild
2137 for (const auto &pg : inc.new_pg_temp) {
2138 if (pg.second.empty())
2139 pg_temp->erase(pg.first);
2140 else
31f18b77
FG
2141 pg_temp->set(pg.first, pg.second);
2142 }
2143 if (!inc.new_pg_temp.empty()) {
2144 // make sure pg_temp is efficiently stored
2145 pg_temp->rebuild();
7c673cae
FG
2146 }
2147
2148 for (const auto &pg : inc.new_primary_temp) {
2149 if (pg.second == -1)
2150 primary_temp->erase(pg.first);
2151 else
2152 (*primary_temp)[pg.first] = pg.second;
2153 }
2154
2155 for (auto& p : inc.new_pg_upmap) {
2156 pg_upmap[p.first] = p.second;
2157 }
2158 for (auto& pg : inc.old_pg_upmap) {
2159 pg_upmap.erase(pg);
2160 }
2161 for (auto& p : inc.new_pg_upmap_items) {
2162 pg_upmap_items[p.first] = p.second;
2163 }
2164 for (auto& pg : inc.old_pg_upmap_items) {
2165 pg_upmap_items.erase(pg);
2166 }
2167
2168 // blacklist
2169 if (!inc.new_blacklist.empty()) {
2170 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
2171 new_blacklist_entries = true;
2172 }
2173 for (const auto &addr : inc.old_blacklist)
2174 blacklist.erase(addr);
2175
81eedcae
TL
2176 for (auto& i : inc.new_crush_node_flags) {
2177 if (i.second) {
2178 crush_node_flags[i.first] = i.second;
2179 } else {
2180 crush_node_flags.erase(i.first);
2181 }
2182 }
2183
2184 for (auto& i : inc.new_device_class_flags) {
2185 if (i.second) {
2186 device_class_flags[i.first] = i.second;
2187 } else {
2188 device_class_flags.erase(i.first);
2189 }
2190 }
2191
7c673cae
FG
2192 // cluster snapshot?
2193 if (inc.cluster_snapshot.length()) {
2194 cluster_snapshot = inc.cluster_snapshot;
2195 cluster_snapshot_epoch = inc.epoch;
2196 } else {
2197 cluster_snapshot.clear();
2198 cluster_snapshot_epoch = 0;
2199 }
2200
2201 if (inc.new_nearfull_ratio >= 0) {
2202 nearfull_ratio = inc.new_nearfull_ratio;
2203 }
2204 if (inc.new_backfillfull_ratio >= 0) {
2205 backfillfull_ratio = inc.new_backfillfull_ratio;
2206 }
2207 if (inc.new_full_ratio >= 0) {
2208 full_ratio = inc.new_full_ratio;
2209 }
31f18b77 2210 if (inc.new_require_min_compat_client > 0) {
7c673cae
FG
2211 require_min_compat_client = inc.new_require_min_compat_client;
2212 }
31f18b77
FG
2213 if (inc.new_require_osd_release >= 0) {
2214 require_osd_release = inc.new_require_osd_release;
2215 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2216 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2217 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2218 }
2219 }
7c673cae 2220
11fdf7f2
TL
2221 if (inc.new_require_osd_release >= 0) {
2222 require_osd_release = inc.new_require_osd_release;
2223 if (require_osd_release >= CEPH_RELEASE_NAUTILUS) {
2224 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2225 }
2226 }
7c673cae
FG
2227 // do new crush map last (after up/down stuff)
2228 if (inc.crush.length()) {
2229 bufferlist bl(inc.crush);
11fdf7f2 2230 auto blp = bl.cbegin();
7c673cae
FG
2231 crush.reset(new CrushWrapper);
2232 crush->decode(blp);
31f18b77
FG
2233 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2234 // only increment if this is a luminous-encoded osdmap, lest
2235 // the mon's crush_version diverge from what the osds or others
2236 // are decoding and applying on their end. if we won't encode
2237 // it in the canonical version, don't change it.
2238 ++crush_version;
2239 }
81eedcae
TL
2240 for (auto it = device_class_flags.begin();
2241 it != device_class_flags.end();) {
2242 const char* class_name = crush->get_class_name(it->first);
2243 if (!class_name) // device class is gone
2244 it = device_class_flags.erase(it);
2245 else
2246 it++;
2247 }
7c673cae
FG
2248 }
2249
2250 calc_num_osds();
2251 _calc_up_osd_features();
2252 return 0;
2253}
2254
2255// mapping
2256int OSDMap::map_to_pg(
2257 int64_t poolid,
2258 const string& name,
2259 const string& key,
2260 const string& nspace,
2261 pg_t *pg) const
2262{
2263 // calculate ps (placement seed)
2264 const pg_pool_t *pool = get_pg_pool(poolid);
2265 if (!pool)
2266 return -ENOENT;
2267 ps_t ps;
2268 if (!key.empty())
2269 ps = pool->hash_key(key, nspace);
2270 else
2271 ps = pool->hash_key(name, nspace);
2272 *pg = pg_t(ps, poolid);
2273 return 0;
2274}
2275
2276int OSDMap::object_locator_to_pg(
2277 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2278{
2279 if (loc.hash >= 0) {
2280 if (!get_pg_pool(loc.get_pool())) {
2281 return -ENOENT;
2282 }
2283 pg = pg_t(loc.hash, loc.get_pool());
2284 return 0;
2285 }
2286 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2287}
2288
2289ceph_object_layout OSDMap::make_object_layout(
2290 object_t oid, int pg_pool, string nspace) const
2291{
2292 object_locator_t loc(pg_pool, nspace);
2293
2294 ceph_object_layout ol;
2295 pg_t pgid = object_locator_to_pg(oid, loc);
2296 ol.ol_pgid = pgid.get_old_pg().v;
2297 ol.ol_stripe_unit = 0;
2298 return ol;
2299}
2300
2301void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2302 vector<int>& osds) const
2303{
2304 if (pool.can_shift_osds()) {
2305 unsigned removed = 0;
2306 for (unsigned i = 0; i < osds.size(); i++) {
2307 if (!exists(osds[i])) {
2308 removed++;
2309 continue;
2310 }
2311 if (removed) {
2312 osds[i - removed] = osds[i];
2313 }
2314 }
2315 if (removed)
2316 osds.resize(osds.size() - removed);
2317 } else {
2318 for (auto& osd : osds) {
2319 if (!exists(osd))
2320 osd = CRUSH_ITEM_NONE;
2321 }
2322 }
2323}
2324
31f18b77 2325void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2326 const pg_pool_t& pool, pg_t pg,
2327 vector<int> *osds,
2328 ps_t *ppps) const
2329{
2330 // map to osds[]
2331 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2332 unsigned size = pool.get_size();
2333
2334 // what crush rule?
31f18b77 2335 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
7c673cae
FG
2336 if (ruleno >= 0)
2337 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2338
2339 _remove_nonexistent_osds(pool, *osds);
2340
2341 if (ppps)
2342 *ppps = pps;
7c673cae
FG
2343}
2344
2345int OSDMap::_pick_primary(const vector<int>& osds) const
2346{
2347 for (auto osd : osds) {
2348 if (osd != CRUSH_ITEM_NONE) {
2349 return osd;
2350 }
2351 }
2352 return -1;
2353}
2354
224ce89b 2355void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2356{
2357 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2358 auto p = pg_upmap.find(pg);
2359 if (p != pg_upmap.end()) {
2360 // make sure targets aren't marked out
2361 for (auto osd : p->second) {
91327a77
AA
2362 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2363 osd_weight[osd] == 0) {
7c673cae
FG
2364 // reject/ignore the explicit mapping
2365 return;
2366 }
2367 }
2368 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2369 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2370 }
2371
2372 auto q = pg_upmap_items.find(pg);
2373 if (q != pg_upmap_items.end()) {
181888fb
FG
2374 // NOTE: this approach does not allow a bidirectional swap,
2375 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2376 for (auto& r : q->second) {
2377 // make sure the replacement value doesn't already appear
2378 bool exists = false;
2379 ssize_t pos = -1;
2380 for (unsigned i = 0; i < raw->size(); ++i) {
2381 int osd = (*raw)[i];
2382 if (osd == r.second) {
2383 exists = true;
2384 break;
2385 }
2386 // ignore mapping if target is marked out (or invalid osd id)
2387 if (osd == r.first &&
2388 pos < 0 &&
2389 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2390 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2391 pos = i;
2392 }
2393 }
2394 if (!exists && pos >= 0) {
2395 (*raw)[pos] = r.second;
7c673cae
FG
2396 }
2397 }
2398 }
2399}
2400
2401// pg -> (up osd list)
2402void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2403 vector<int> *up) const
2404{
2405 if (pool.can_shift_osds()) {
2406 // shift left
2407 up->clear();
2408 up->reserve(raw.size());
2409 for (unsigned i=0; i<raw.size(); i++) {
2410 if (!exists(raw[i]) || is_down(raw[i]))
2411 continue;
2412 up->push_back(raw[i]);
2413 }
2414 } else {
2415 // set down/dne devices to NONE
2416 up->resize(raw.size());
2417 for (int i = raw.size() - 1; i >= 0; --i) {
2418 if (!exists(raw[i]) || is_down(raw[i])) {
2419 (*up)[i] = CRUSH_ITEM_NONE;
2420 } else {
2421 (*up)[i] = raw[i];
2422 }
2423 }
2424 }
2425}
2426
2427void OSDMap::_apply_primary_affinity(ps_t seed,
2428 const pg_pool_t& pool,
2429 vector<int> *osds,
2430 int *primary) const
2431{
2432 // do we have any non-default primary_affinity values for these osds?
2433 if (!osd_primary_affinity)
2434 return;
2435
2436 bool any = false;
2437 for (const auto osd : *osds) {
2438 if (osd != CRUSH_ITEM_NONE &&
2439 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2440 any = true;
2441 break;
2442 }
2443 }
2444 if (!any)
2445 return;
2446
2447 // pick the primary. feed both the seed (for the pg) and the osd
2448 // into the hash/rng so that a proportional fraction of an osd's pgs
2449 // get rejected as primary.
2450 int pos = -1;
2451 for (unsigned i = 0; i < osds->size(); ++i) {
2452 int o = (*osds)[i];
2453 if (o == CRUSH_ITEM_NONE)
2454 continue;
2455 unsigned a = (*osd_primary_affinity)[o];
2456 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2457 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2458 seed, o) >> 16) >= a) {
2459 // we chose not to use this primary. note it anyway as a
2460 // fallback in case we don't pick anyone else, but keep looking.
2461 if (pos < 0)
2462 pos = i;
2463 } else {
2464 pos = i;
2465 break;
2466 }
2467 }
2468 if (pos < 0)
2469 return;
2470
2471 *primary = (*osds)[pos];
2472
2473 if (pool.can_shift_osds() && pos > 0) {
2474 // move the new primary to the front.
2475 for (int i = pos; i > 0; --i) {
2476 (*osds)[i] = (*osds)[i-1];
2477 }
2478 (*osds)[0] = *primary;
2479 }
2480}
2481
2482void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2483 vector<int> *temp_pg, int *temp_primary) const
2484{
2485 pg = pool.raw_pg_to_pg(pg);
2486 const auto p = pg_temp->find(pg);
2487 temp_pg->clear();
2488 if (p != pg_temp->end()) {
2489 for (unsigned i=0; i<p->second.size(); i++) {
2490 if (!exists(p->second[i]) || is_down(p->second[i])) {
2491 if (pool.can_shift_osds()) {
2492 continue;
2493 } else {
2494 temp_pg->push_back(CRUSH_ITEM_NONE);
2495 }
2496 } else {
2497 temp_pg->push_back(p->second[i]);
2498 }
2499 }
2500 }
2501 const auto &pp = primary_temp->find(pg);
2502 *temp_primary = -1;
2503 if (pp != primary_temp->end()) {
2504 *temp_primary = pp->second;
2505 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2506 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2507 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2508 *temp_primary = (*temp_pg)[i];
2509 break;
2510 }
2511 }
2512 }
2513}
2514
31f18b77 2515void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2516{
7c673cae 2517 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2518 if (!pool) {
2519 *primary = -1;
2520 raw->clear();
31f18b77 2521 return;
11fdf7f2 2522 }
31f18b77 2523 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2524 *primary = _pick_primary(*raw);
7c673cae
FG
2525}
2526
494da23a
TL
2527void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2528 vector<int> *raw_upmap) const
a8e16298
TL
2529{
2530 auto pool = get_pg_pool(pg.pool());
2531 if (!pool) {
2532 raw_upmap->clear();
2533 return;
2534 }
494da23a
TL
2535 _pg_to_raw_osds(*pool, pg, raw, NULL);
2536 *raw_upmap = *raw;
a8e16298
TL
2537 _apply_upmap(*pool, pg, raw_upmap);
2538}
2539
7c673cae
FG
2540void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2541{
2542 const pg_pool_t *pool = get_pg_pool(pg.pool());
2543 if (!pool) {
11fdf7f2
TL
2544 *primary = -1;
2545 up->clear();
7c673cae
FG
2546 return;
2547 }
2548 vector<int> raw;
2549 ps_t pps;
2550 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2551 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2552 _raw_to_up_osds(*pool, raw, up);
2553 *primary = _pick_primary(raw);
2554 _apply_primary_affinity(pps, *pool, up, primary);
2555}
31f18b77 2556
7c673cae
FG
2557void OSDMap::_pg_to_up_acting_osds(
2558 const pg_t& pg, vector<int> *up, int *up_primary,
2559 vector<int> *acting, int *acting_primary,
2560 bool raw_pg_to_pg) const
2561{
2562 const pg_pool_t *pool = get_pg_pool(pg.pool());
2563 if (!pool ||
2564 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2565 if (up)
2566 up->clear();
2567 if (up_primary)
2568 *up_primary = -1;
2569 if (acting)
2570 acting->clear();
2571 if (acting_primary)
2572 *acting_primary = -1;
2573 return;
2574 }
2575 vector<int> raw;
2576 vector<int> _up;
2577 vector<int> _acting;
2578 int _up_primary;
2579 int _acting_primary;
2580 ps_t pps;
2581 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2582 if (_acting.empty() || up || up_primary) {
2583 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2584 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2585 _raw_to_up_osds(*pool, raw, &_up);
2586 _up_primary = _pick_primary(_up);
2587 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2588 if (_acting.empty()) {
2589 _acting = _up;
2590 if (_acting_primary == -1) {
2591 _acting_primary = _up_primary;
2592 }
2593 }
2594
2595 if (up)
2596 up->swap(_up);
2597 if (up_primary)
2598 *up_primary = _up_primary;
2599 }
2600
2601 if (acting)
2602 acting->swap(_acting);
2603 if (acting_primary)
2604 *acting_primary = _acting_primary;
2605}
2606
2607int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
2608{
2609 if (!nrep)
2610 nrep = acting.size();
2611 for (int i=0; i<nrep; i++)
2612 if (acting[i] == osd)
2613 return i;
2614 return -1;
2615}
2616
2617int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2618{
2619 return calc_pg_rank(osd, acting, nrep);
2620}
2621
2622bool OSDMap::primary_changed(
2623 int oldprimary,
2624 const vector<int> &oldacting,
2625 int newprimary,
2626 const vector<int> &newacting)
2627{
2628 if (oldacting.empty() && newacting.empty())
2629 return false; // both still empty
2630 if (oldacting.empty() ^ newacting.empty())
2631 return true; // was empty, now not, or vice versa
2632 if (oldprimary != newprimary)
2633 return true; // primary changed
2634 if (calc_pg_rank(oldprimary, oldacting) !=
2635 calc_pg_rank(newprimary, newacting))
2636 return true;
2637 return false; // same primary (tho replicas may have changed)
2638}
2639
28e407b8
AA
2640uint64_t OSDMap::get_encoding_features() const
2641{
2642 uint64_t f = SIGNIFICANT_FEATURES;
11fdf7f2
TL
2643 if (require_osd_release < CEPH_RELEASE_NAUTILUS) {
2644 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2645 }
2646 if (require_osd_release < CEPH_RELEASE_MIMIC) {
2647 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2648 }
28e407b8
AA
2649 if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
2650 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2651 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2652 }
2653 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
2654 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2655 CEPH_FEATURE_MSG_ADDR2);
28e407b8
AA
2656 }
2657 if (require_osd_release < CEPH_RELEASE_JEWEL) {
2658 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2659 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2660 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2661 }
2662 return f;
2663}
7c673cae
FG
2664
2665// serialize, unserialize
2666void OSDMap::encode_client_old(bufferlist& bl) const
2667{
11fdf7f2 2668 using ceph::encode;
7c673cae 2669 __u16 v = 5;
11fdf7f2 2670 encode(v, bl);
7c673cae
FG
2671
2672 // base
11fdf7f2
TL
2673 encode(fsid, bl);
2674 encode(epoch, bl);
2675 encode(created, bl);
2676 encode(modified, bl);
7c673cae 2677
11fdf7f2 2678 // for encode(pools, bl);
7c673cae 2679 __u32 n = pools.size();
11fdf7f2 2680 encode(n, bl);
7c673cae
FG
2681
2682 for (const auto &pool : pools) {
2683 n = pool.first;
11fdf7f2
TL
2684 encode(n, bl);
2685 encode(pool.second, bl, 0);
7c673cae 2686 }
11fdf7f2 2687 // for encode(pool_name, bl);
7c673cae 2688 n = pool_name.size();
11fdf7f2 2689 encode(n, bl);
7c673cae
FG
2690 for (const auto &pname : pool_name) {
2691 n = pname.first;
11fdf7f2
TL
2692 encode(n, bl);
2693 encode(pname.second, bl);
7c673cae 2694 }
11fdf7f2 2695 // for encode(pool_max, bl);
7c673cae 2696 n = pool_max;
11fdf7f2 2697 encode(n, bl);
7c673cae 2698
11fdf7f2 2699 encode(flags, bl);
7c673cae 2700
11fdf7f2 2701 encode(max_osd, bl);
31f18b77
FG
2702 {
2703 uint32_t n = osd_state.size();
11fdf7f2 2704 encode(n, bl);
31f18b77 2705 for (auto s : osd_state) {
11fdf7f2 2706 encode((uint8_t)s, bl);
31f18b77
FG
2707 }
2708 }
11fdf7f2
TL
2709 encode(osd_weight, bl);
2710 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 2711
11fdf7f2 2712 // for encode(pg_temp, bl);
7c673cae 2713 n = pg_temp->size();
11fdf7f2 2714 encode(n, bl);
7c673cae
FG
2715 for (const auto pg : *pg_temp) {
2716 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
2717 encode(opg, bl);
2718 encode(pg.second, bl);
7c673cae
FG
2719 }
2720
2721 // crush
2722 bufferlist cbl;
2723 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2724 encode(cbl, bl);
7c673cae
FG
2725}
2726
2727void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2728{
11fdf7f2 2729 using ceph::encode;
7c673cae
FG
2730 if ((features & CEPH_FEATURE_PGID64) == 0) {
2731 encode_client_old(bl);
2732 return;
2733 }
2734
2735 __u16 v = 6;
11fdf7f2 2736 encode(v, bl);
7c673cae
FG
2737
2738 // base
11fdf7f2
TL
2739 encode(fsid, bl);
2740 encode(epoch, bl);
2741 encode(created, bl);
2742 encode(modified, bl);
7c673cae 2743
11fdf7f2
TL
2744 encode(pools, bl, features);
2745 encode(pool_name, bl);
2746 encode(pool_max, bl);
7c673cae 2747
11fdf7f2 2748 encode(flags, bl);
7c673cae 2749
11fdf7f2 2750 encode(max_osd, bl);
31f18b77
FG
2751 {
2752 uint32_t n = osd_state.size();
11fdf7f2 2753 encode(n, bl);
31f18b77 2754 for (auto s : osd_state) {
11fdf7f2 2755 encode((uint8_t)s, bl);
31f18b77
FG
2756 }
2757 }
11fdf7f2
TL
2758 encode(osd_weight, bl);
2759 encode(osd_addrs->client_addrs, bl, features);
7c673cae 2760
11fdf7f2 2761 encode(*pg_temp, bl);
7c673cae
FG
2762
2763 // crush
2764 bufferlist cbl;
2765 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2766 encode(cbl, bl);
7c673cae
FG
2767
2768 // extended
2769 __u16 ev = 10;
11fdf7f2
TL
2770 encode(ev, bl);
2771 encode(osd_addrs->hb_back_addrs, bl, features);
2772 encode(osd_info, bl);
2773 encode(blacklist, bl, features);
2774 encode(osd_addrs->cluster_addrs, bl, features);
2775 encode(cluster_snapshot_epoch, bl);
2776 encode(cluster_snapshot, bl);
2777 encode(*osd_uuid, bl);
2778 encode(osd_xinfo, bl);
2779 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
2780}
2781
11fdf7f2
TL
2782/* for a description of osdmap versions, and when they were introduced, please
2783 * refer to
2784 * doc/dev/osd_internals/osdmap_versions.txt
2785 */
7c673cae
FG
2786void OSDMap::encode(bufferlist& bl, uint64_t features) const
2787{
11fdf7f2 2788 using ceph::encode;
7c673cae
FG
2789 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2790 encode_classic(bl, features);
2791 return;
2792 }
2793
2794 // only a select set of callers should *ever* be encoding new
2795 // OSDMaps. others should be passing around the canonical encoded
2796 // buffers from on high. select out those callers by passing in an
2797 // "impossible" feature bit.
11fdf7f2 2798 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
2799 features &= ~CEPH_FEATURE_RESERVED;
2800
2801 size_t start_offset = bl.length();
2802 size_t tail_offset;
11fdf7f2
TL
2803 size_t crc_offset;
2804 std::optional<buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
2805
2806 // meta-encoding: how we include client-used and osd-specific data
2807 ENCODE_START(8, 7, bl);
2808
2809 {
28e407b8
AA
2810 // NOTE: any new encoding dependencies must be reflected by
2811 // SIGNIFICANT_FEATURES
11fdf7f2 2812 uint8_t v = 9;
31f18b77 2813 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 2814 v = 3;
11fdf7f2
TL
2815 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2816 v = 6;
2817 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2818 v = 7;
7c673cae
FG
2819 }
2820 ENCODE_START(v, 1, bl); // client-usable data
2821 // base
11fdf7f2
TL
2822 encode(fsid, bl);
2823 encode(epoch, bl);
2824 encode(created, bl);
2825 encode(modified, bl);
7c673cae 2826
11fdf7f2
TL
2827 encode(pools, bl, features);
2828 encode(pool_name, bl);
2829 encode(pool_max, bl);
7c673cae 2830
31f18b77
FG
2831 if (v < 4) {
2832 decltype(flags) f = flags;
2833 if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
c07f9fc5 2834 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2835 else if (require_osd_release == CEPH_RELEASE_KRAKEN)
2836 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
2837 else if (require_osd_release == CEPH_RELEASE_JEWEL)
2838 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 2839 encode(f, bl);
31f18b77 2840 } else {
11fdf7f2 2841 encode(flags, bl);
31f18b77 2842 }
7c673cae 2843
11fdf7f2 2844 encode(max_osd, bl);
31f18b77 2845 if (v >= 5) {
11fdf7f2 2846 encode(osd_state, bl);
31f18b77
FG
2847 } else {
2848 uint32_t n = osd_state.size();
11fdf7f2 2849 encode(n, bl);
31f18b77 2850 for (auto s : osd_state) {
11fdf7f2 2851 encode((uint8_t)s, bl);
31f18b77
FG
2852 }
2853 }
11fdf7f2
TL
2854 encode(osd_weight, bl);
2855 if (v >= 8) {
2856 encode(osd_addrs->client_addrs, bl, features);
2857 } else {
2858 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
2859 }
7c673cae 2860
11fdf7f2
TL
2861 encode(*pg_temp, bl);
2862 encode(*primary_temp, bl);
7c673cae 2863 if (osd_primary_affinity) {
11fdf7f2 2864 encode(*osd_primary_affinity, bl);
7c673cae
FG
2865 } else {
2866 vector<__u32> v;
11fdf7f2 2867 encode(v, bl);
7c673cae
FG
2868 }
2869
2870 // crush
2871 bufferlist cbl;
2872 crush->encode(cbl, features);
11fdf7f2
TL
2873 encode(cbl, bl);
2874 encode(erasure_code_profiles, bl);
7c673cae
FG
2875
2876 if (v >= 4) {
11fdf7f2
TL
2877 encode(pg_upmap, bl);
2878 encode(pg_upmap_items, bl);
7c673cae 2879 } else {
11fdf7f2
TL
2880 ceph_assert(pg_upmap.empty());
2881 ceph_assert(pg_upmap_items.empty());
7c673cae 2882 }
31f18b77 2883 if (v >= 6) {
11fdf7f2
TL
2884 encode(crush_version, bl);
2885 }
2886 if (v >= 7) {
2887 encode(new_removed_snaps, bl);
2888 encode(new_purged_snaps, bl);
2889 }
2890 if (v >= 9) {
2891 encode(last_up_change, bl);
2892 encode(last_in_change, bl);
31f18b77 2893 }
7c673cae
FG
2894 ENCODE_FINISH(bl); // client-usable data
2895 }
2896
2897 {
28e407b8
AA
2898 // NOTE: any new encoding dependencies must be reflected by
2899 // SIGNIFICANT_FEATURES
81eedcae 2900 uint8_t target_v = 9;
7c673cae
FG
2901 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2902 target_v = 1;
11fdf7f2
TL
2903 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2904 target_v = 5;
2905 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2906 target_v = 6;
7c673cae
FG
2907 }
2908 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
2909 if (target_v < 7) {
2910 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
2911 } else {
2912 encode(osd_addrs->hb_back_addrs, bl, features);
2913 }
2914 encode(osd_info, bl);
7c673cae
FG
2915 {
2916 // put this in a sorted, ordered map<> so that we encode in a
2917 // deterministic order.
2918 map<entity_addr_t,utime_t> blacklist_map;
2919 for (const auto &addr : blacklist)
2920 blacklist_map.insert(make_pair(addr.first, addr.second));
11fdf7f2
TL
2921 encode(blacklist_map, bl, features);
2922 }
2923 if (target_v < 7) {
2924 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
2925 } else {
2926 encode(osd_addrs->cluster_addrs, bl, features);
2927 }
2928 encode(cluster_snapshot_epoch, bl);
2929 encode(cluster_snapshot, bl);
2930 encode(*osd_uuid, bl);
2931 encode(osd_xinfo, bl);
2932 if (target_v < 7) {
2933 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
2934 } else {
2935 encode(osd_addrs->hb_front_addrs, bl, features);
2936 }
7c673cae 2937 if (target_v >= 2) {
11fdf7f2
TL
2938 encode(nearfull_ratio, bl);
2939 encode(full_ratio, bl);
2940 encode(backfillfull_ratio, bl);
31f18b77
FG
2941 }
2942 // 4 was string-based new_require_min_compat_client
2943 if (target_v >= 5) {
11fdf7f2
TL
2944 encode(require_min_compat_client, bl);
2945 encode(require_osd_release, bl);
2946 }
2947 if (target_v >= 6) {
2948 encode(removed_snaps_queue, bl);
7c673cae 2949 }
81eedcae
TL
2950 if (target_v >= 8) {
2951 encode(crush_node_flags, bl);
2952 }
2953 if (target_v >= 9) {
2954 encode(device_class_flags, bl);
2955 }
7c673cae
FG
2956 ENCODE_FINISH(bl); // osd-only data
2957 }
2958
11fdf7f2
TL
2959 crc_offset = bl.length();
2960 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
2961 tail_offset = bl.length();
2962
2963 ENCODE_FINISH(bl); // meta-encoding wrapper
2964
2965 // fill in crc
2966 bufferlist front;
11fdf7f2 2967 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
2968 crc = front.crc32c(-1);
2969 if (tail_offset < bl.length()) {
2970 bufferlist tail;
2971 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2972 crc = tail.crc32c(crc);
2973 }
2974 ceph_le32 crc_le;
2975 crc_le = crc;
11fdf7f2 2976 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
2977 crc_defined = true;
2978}
2979
11fdf7f2
TL
2980/* for a description of osdmap versions, and when they were introduced, please
2981 * refer to
2982 * doc/dev/osd_internals/osdmap_versions.txt
2983 */
7c673cae
FG
2984void OSDMap::decode(bufferlist& bl)
2985{
11fdf7f2 2986 auto p = bl.cbegin();
7c673cae
FG
2987 decode(p);
2988}
2989
11fdf7f2 2990void OSDMap::decode_classic(bufferlist::const_iterator& p)
7c673cae 2991{
11fdf7f2 2992 using ceph::decode;
7c673cae
FG
2993 __u32 n, t;
2994 __u16 v;
11fdf7f2 2995 decode(v, p);
7c673cae
FG
2996
2997 // base
11fdf7f2
TL
2998 decode(fsid, p);
2999 decode(epoch, p);
3000 decode(created, p);
3001 decode(modified, p);
7c673cae
FG
3002
3003 if (v < 6) {
3004 if (v < 4) {
3005 int32_t max_pools = 0;
11fdf7f2 3006 decode(max_pools, p);
7c673cae
FG
3007 pool_max = max_pools;
3008 }
3009 pools.clear();
11fdf7f2 3010 decode(n, p);
7c673cae 3011 while (n--) {
11fdf7f2
TL
3012 decode(t, p);
3013 decode(pools[t], p);
7c673cae
FG
3014 }
3015 if (v == 4) {
11fdf7f2 3016 decode(n, p);
7c673cae
FG
3017 pool_max = n;
3018 } else if (v == 5) {
3019 pool_name.clear();
11fdf7f2 3020 decode(n, p);
7c673cae 3021 while (n--) {
11fdf7f2
TL
3022 decode(t, p);
3023 decode(pool_name[t], p);
7c673cae 3024 }
11fdf7f2 3025 decode(n, p);
7c673cae
FG
3026 pool_max = n;
3027 }
3028 } else {
11fdf7f2
TL
3029 decode(pools, p);
3030 decode(pool_name, p);
3031 decode(pool_max, p);
7c673cae
FG
3032 }
3033 // kludge around some old bug that zeroed out pool_max (#2307)
3034 if (pools.size() && pool_max < pools.rbegin()->first) {
3035 pool_max = pools.rbegin()->first;
3036 }
3037
11fdf7f2 3038 decode(flags, p);
7c673cae 3039
11fdf7f2 3040 decode(max_osd, p);
31f18b77
FG
3041 {
3042 vector<uint8_t> os;
11fdf7f2 3043 decode(os, p);
31f18b77
FG
3044 osd_state.resize(os.size());
3045 for (unsigned i = 0; i < os.size(); ++i) {
3046 osd_state[i] = os[i];
3047 }
3048 }
11fdf7f2
TL
3049 decode(osd_weight, p);
3050 decode(osd_addrs->client_addrs, p);
7c673cae
FG
3051 if (v <= 5) {
3052 pg_temp->clear();
11fdf7f2 3053 decode(n, p);
7c673cae
FG
3054 while (n--) {
3055 old_pg_t opg;
3056 ::decode_raw(opg, p);
31f18b77 3057 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3058 decode(v, p);
31f18b77 3059 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3060 }
3061 } else {
11fdf7f2 3062 decode(*pg_temp, p);
7c673cae
FG
3063 }
3064
3065 // crush
3066 bufferlist cbl;
11fdf7f2
TL
3067 decode(cbl, p);
3068 auto cblp = cbl.cbegin();
7c673cae
FG
3069 crush->decode(cblp);
3070
3071 // extended
3072 __u16 ev = 0;
3073 if (v >= 5)
11fdf7f2
TL
3074 decode(ev, p);
3075 decode(osd_addrs->hb_back_addrs, p);
3076 decode(osd_info, p);
7c673cae 3077 if (v < 5)
11fdf7f2 3078 decode(pool_name, p);
7c673cae 3079
11fdf7f2 3080 decode(blacklist, p);
7c673cae 3081 if (ev >= 6)
11fdf7f2 3082 decode(osd_addrs->cluster_addrs, p);
7c673cae 3083 else
11fdf7f2 3084 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3085
3086 if (ev >= 7) {
11fdf7f2
TL
3087 decode(cluster_snapshot_epoch, p);
3088 decode(cluster_snapshot, p);
7c673cae
FG
3089 }
3090
3091 if (ev >= 8) {
11fdf7f2 3092 decode(*osd_uuid, p);
7c673cae
FG
3093 } else {
3094 osd_uuid->resize(max_osd);
3095 }
3096 if (ev >= 9)
11fdf7f2 3097 decode(osd_xinfo, p);
7c673cae
FG
3098 else
3099 osd_xinfo.resize(max_osd);
3100
3101 if (ev >= 10)
11fdf7f2 3102 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3103 else
11fdf7f2 3104 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3105
3106 osd_primary_affinity.reset();
3107
3108 post_decode();
3109}
3110
/**
 * Decode a complete OSDMap from the encoded bytes at @p bl.
 *
 * The encoding is a wrapper that contains two versioned sections
 * ("client-usable data" and "extended, osd-only data"), optionally
 * followed by a crc.  Wrapper versions below 7 are handed to
 * decode_classic() after rewinding.  When a crc is present it is
 * verified and buffer::malformed_input is thrown on mismatch.
 * post_decode() is called last on the non-classic path.
 *
 * NOTE(review): each braced section tests `struct_v` against its own
 * version history, so DECODE_START presumably declares a section-local
 * struct_v that shadows the wrapper's -- confirm against the encoding
 * macros.
 */
void OSDMap::decode(bufferlist::const_iterator& bl)
{
  using ceph::decode;
  /**
   * Older encodings of the OSDMap had a single struct_v which
   * covered the whole encoding, and was prior to our modern
   * stuff which includes a compatv and a size. So if we see
   * a struct_v < 7, we must rewind to the beginning and use our
   * classic decoder.
   */
  // start_offset: where this map begins in the buffer (rewind target and
  // start of the crc'd range).  tail_offset: set only when a crc is decoded.
  size_t start_offset = bl.get_off();
  size_t tail_offset = 0;
  // crc_tail is declared but not used anywhere below.
  bufferlist crc_front, crc_tail;

  DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
  if (struct_v < 7) {
    bl.seek(start_offset);
    decode_classic(bl);
    return;
  }
  /**
   * Since we made it past that hurdle, we can use our normal paths.
   */
  {
    DECODE_START(9, bl); // client-usable data
    // base
    decode(fsid, bl);
    decode(epoch, bl);
    decode(created, bl);
    decode(modified, bl);

    decode(pools, bl);
    decode(pool_name, bl);
    decode(pool_max, bl);

    decode(flags, bl);

    decode(max_osd, bl);
    if (struct_v >= 5) {
      decode(osd_state, bl);
    } else {
      // before v5 the states were encoded as a vector<uint8_t>; decode
      // that and copy it element by element into osd_state
      vector<uint8_t> os;
      decode(os, bl);
      osd_state.resize(os.size());
      for (unsigned i = 0; i < os.size(); ++i) {
        osd_state[i] = os[i];
      }
    }
    decode(osd_weight, bl);
    decode(osd_addrs->client_addrs, bl);

    decode(*pg_temp, bl);
    decode(*primary_temp, bl);
    // dates back to firefly. version increased from 2 to 3 still in firefly.
    // do we really still need to keep this around? even for old clients?
    if (struct_v >= 2) {
      osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
      decode(*osd_primary_affinity, bl);
      // an empty decoded vector is treated the same as "not present"
      if (osd_primary_affinity->empty())
        osd_primary_affinity.reset();
    } else {
      osd_primary_affinity.reset();
    }

    // crush
    // the crush map is carried as a nested bufferlist and decoded from it
    bufferlist cbl;
    decode(cbl, bl);
    auto cblp = cbl.cbegin();
    crush->decode(cblp);
    // added in firefly; version increased in luminous, so it affects
    // giant, hammer, infernalis, jewel, and kraken. probably should be left
    // alone until we require clients to be all luminous?
    if (struct_v >= 3) {
      decode(erasure_code_profiles, bl);
    } else {
      erasure_code_profiles.clear();
    }
    // version increased from 3 to 4 still in luminous, so same as above
    // applies.
    if (struct_v >= 4) {
      decode(pg_upmap, bl);
      decode(pg_upmap_items, bl);
    } else {
      pg_upmap.clear();
      pg_upmap_items.clear();
    }
    // again, version increased from 5 to 6 still in luminous, so above
    // applies.
    if (struct_v >= 6) {
      decode(crush_version, bl);
    }
    // version increase from 6 to 7 in mimic
    if (struct_v >= 7) {
      decode(new_removed_snaps, bl);
      decode(new_purged_snaps, bl);
    }
    // version increase from 7 to 8, 8 to 9, in nautilus.
    if (struct_v >= 9) {
      decode(last_up_change, bl);
      decode(last_in_change, bl);
    }
    DECODE_FINISH(bl); // client-usable data
  }

  {
    DECODE_START(9, bl); // extended, osd-only data
    decode(osd_addrs->hb_back_addrs, bl);
    decode(osd_info, bl);
    decode(blacklist, bl);
    decode(osd_addrs->cluster_addrs, bl);
    decode(cluster_snapshot_epoch, bl);
    decode(cluster_snapshot, bl);
    decode(*osd_uuid, bl);
    decode(osd_xinfo, bl);
    decode(osd_addrs->hb_front_addrs, bl);
    //
    // fields below are versioned; absent ones are reset to a default
    if (struct_v >= 2) {
      decode(nearfull_ratio, bl);
      decode(full_ratio, bl);
    } else {
      nearfull_ratio = 0;
      full_ratio = 0;
    }
    if (struct_v >= 3) {
      decode(backfillfull_ratio, bl);
    } else {
      backfillfull_ratio = 0;
    }
    // exactly v4 carried require_min_compat_client as a release name string
    if (struct_v == 4) {
      string r;
      decode(r, bl);
      if (r.length())
        require_min_compat_client = ceph_release_from_name(r.c_str());
    }
    if (struct_v >= 5) {
      decode(require_min_compat_client, bl);
      decode(require_osd_release, bl);
      // derive flag bits from the decoded release requirement
      if (require_osd_release >= CEPH_RELEASE_NAUTILUS) {
        flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
      }
      if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
        flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
        flags |= CEPH_OSDMAP_RECOVERY_DELETES;
      }
    } else {
      // no explicit require_osd_release in the encoding: infer it from the
      // legacy REQUIRE_* flag bits, newest release first
      if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
        // only for compat with post-kraken pre-luminous test clusters
        require_osd_release = CEPH_RELEASE_LUMINOUS;
        flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
        flags |= CEPH_OSDMAP_RECOVERY_DELETES;
      } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
        require_osd_release = CEPH_RELEASE_KRAKEN;
      } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
        require_osd_release = CEPH_RELEASE_JEWEL;
      } else {
        require_osd_release = 0;
      }
    }
    if (struct_v >= 6) {
      decode(removed_snaps_queue, bl);
    }
    if (struct_v >= 8) {
      decode(crush_node_flags, bl);
    } else {
      crush_node_flags.clear();
    }
    if (struct_v >= 9) {
      decode(device_class_flags, bl);
    } else {
      device_class_flags.clear();
    }
    DECODE_FINISH(bl); // osd-only data
  }

  // back at wrapper scope: wrapper v8+ stores a crc after the two sections
  if (struct_v >= 8) {
    // crc_front = everything from the start of the map up to the crc field
    crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
    decode(crc, bl);
    tail_offset = bl.get_off();
    crc_defined = true;
  } else {
    crc_defined = false;
    crc = 0;
  }

  DECODE_FINISH(bl); // wrapper

  if (tail_offset) {
    // verify crc
    // computed over the bytes before the crc field (seed -1), then chained
    // over any bytes that follow the crc field up to the current offset
    uint32_t actual = crc_front.crc32c(-1);
    if (tail_offset < bl.get_off()) {
      bufferlist tail;
      tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
      actual = tail.crc32c(actual);
    }
    if (crc != actual) {
      ostringstream ss;
      ss << "bad crc, actual " << actual << " != expected " << crc;
      string s = ss.str();
      throw buffer::malformed_input(s.c_str());
    }
  }

  post_decode();
}
3315
3316void OSDMap::post_decode()
3317{
3318 // index pool names
3319 name_pool.clear();
3320 for (const auto &pname : pool_name) {
3321 name_pool[pname.second] = pname.first;
3322 }
3323
3324 calc_num_osds();
3325 _calc_up_osd_features();
3326}
3327
3328void OSDMap::dump_erasure_code_profiles(
3329 const mempool::osdmap::map<string,map<string,string>>& profiles,
3330 Formatter *f)
3331{
3332 f->open_object_section("erasure_code_profiles");
3333 for (const auto &profile : profiles) {
3334 f->open_object_section(profile.first.c_str());
3335 for (const auto &profm : profile.second) {
3336 f->dump_string(profm.first.c_str(), profm.second.c_str());
3337 }
3338 f->close_section();
3339 }
3340 f->close_section();
3341}
3342
3343void OSDMap::dump(Formatter *f) const
3344{
3345 f->dump_int("epoch", get_epoch());
3346 f->dump_stream("fsid") << get_fsid();
3347 f->dump_stream("created") << get_created();
3348 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3349 f->dump_stream("last_up_change") << last_up_change;
3350 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3351 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3352 f->dump_unsigned("flags_num", flags);
3353 f->open_array_section("flags_set");
3354 set<string> flagset;
3355 get_flag_set(&flagset);
3356 for (auto p : flagset) {
3357 f->dump_string("flag", p);
3358 }
3359 f->close_section();
31f18b77 3360 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3361 f->dump_float("full_ratio", full_ratio);
3362 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3363 f->dump_float("nearfull_ratio", nearfull_ratio);
3364 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3365 f->dump_int("pool_max", get_pool_max());
3366 f->dump_int("max_osd", get_max_osd());
31f18b77
FG
3367 f->dump_string("require_min_compat_client",
3368 ceph_release_name(require_min_compat_client));
3369 f->dump_string("min_compat_client",
3370 ceph_release_name(get_min_compat_client()));
3371 f->dump_string("require_osd_release",
3372 ceph_release_name(require_osd_release));
7c673cae
FG
3373
3374 f->open_array_section("pools");
3375 for (const auto &pool : pools) {
3376 std::string name("<unknown>");
3377 const auto &pni = pool_name.find(pool.first);
3378 if (pni != pool_name.end())
3379 name = pni->second;
3380 f->open_object_section("pool");
3381 f->dump_int("pool", pool.first);
3382 f->dump_string("pool_name", name);
3383 pool.second.dump(f);
3384 f->close_section();
3385 }
3386 f->close_section();
3387
3388 f->open_array_section("osds");
3389 for (int i=0; i<get_max_osd(); i++)
3390 if (exists(i)) {
3391 f->open_object_section("osd_info");
3392 f->dump_int("osd", i);
3393 f->dump_stream("uuid") << get_uuid(i);
3394 f->dump_int("up", is_up(i));
3395 f->dump_int("in", is_in(i));
3396 f->dump_float("weight", get_weightf(i));
3397 f->dump_float("primary_affinity", get_primary_affinityf(i));
3398 get_info(i).dump(f);
11fdf7f2
TL
3399 f->dump_object("public_addrs", get_addrs(i));
3400 f->dump_object("cluster_addrs", get_cluster_addrs(i));
3401 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(i));
3402 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(i));
3403 // compat
3404 f->dump_stream("public_addr") << get_addrs(i).get_legacy_str();
3405 f->dump_stream("cluster_addr") << get_cluster_addrs(i).get_legacy_str();
3406 f->dump_stream("heartbeat_back_addr")
3407 << get_hb_back_addrs(i).get_legacy_str();
3408 f->dump_stream("heartbeat_front_addr")
3409 << get_hb_front_addrs(i).get_legacy_str();
7c673cae
FG
3410
3411 set<string> st;
3412 get_state(i, st);
3413 f->open_array_section("state");
3414 for (const auto &state : st)
3415 f->dump_string("state", state);
3416 f->close_section();
3417
3418 f->close_section();
3419 }
3420 f->close_section();
3421
3422 f->open_array_section("osd_xinfo");
3423 for (int i=0; i<get_max_osd(); i++) {
3424 if (exists(i)) {
3425 f->open_object_section("xinfo");
3426 f->dump_int("osd", i);
3427 osd_xinfo[i].dump(f);
3428 f->close_section();
3429 }
3430 }
3431 f->close_section();
3432
3433 f->open_array_section("pg_upmap");
3434 for (auto& p : pg_upmap) {
3435 f->open_object_section("mapping");
3436 f->dump_stream("pgid") << p.first;
3437 f->open_array_section("osds");
3438 for (auto q : p.second) {
3439 f->dump_int("osd", q);
3440 }
3441 f->close_section();
3442 f->close_section();
3443 }
3444 f->close_section();
3445 f->open_array_section("pg_upmap_items");
3446 for (auto& p : pg_upmap_items) {
3447 f->open_object_section("mapping");
3448 f->dump_stream("pgid") << p.first;
3449 f->open_array_section("mappings");
3450 for (auto& q : p.second) {
3451 f->open_object_section("mapping");
3452 f->dump_int("from", q.first);
3453 f->dump_int("to", q.second);
3454 f->close_section();
3455 }
3456 f->close_section();
3457 f->close_section();
3458 }
3459 f->close_section();
3460 f->open_array_section("pg_temp");
31f18b77 3461 pg_temp->dump(f);
7c673cae
FG
3462 f->close_section();
3463
3464 f->open_array_section("primary_temp");
3465 for (const auto &pg : *primary_temp) {
3466 f->dump_stream("pgid") << pg.first;
3467 f->dump_int("osd", pg.second);
3468 }
3469 f->close_section(); // primary_temp
3470
3471 f->open_object_section("blacklist");
3472 for (const auto &addr : blacklist) {
3473 stringstream ss;
3474 ss << addr.first;
3475 f->dump_stream(ss.str().c_str()) << addr.second;
3476 }
3477 f->close_section();
3478
3479 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3480
3481 f->open_array_section("removed_snaps_queue");
3482 for (auto& p : removed_snaps_queue) {
3483 f->open_object_section("pool");
3484 f->dump_int("pool", p.first);
3485 f->open_array_section("snaps");
3486 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3487 f->open_object_section("interval");
3488 f->dump_unsigned("begin", q.get_start());
3489 f->dump_unsigned("length", q.get_len());
3490 f->close_section();
3491 }
3492 f->close_section();
3493 f->close_section();
3494 }
3495 f->close_section();
3496 f->open_array_section("new_removed_snaps");
3497 for (auto& p : new_removed_snaps) {
3498 f->open_object_section("pool");
3499 f->dump_int("pool", p.first);
3500 f->open_array_section("snaps");
3501 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3502 f->open_object_section("interval");
3503 f->dump_unsigned("begin", q.get_start());
3504 f->dump_unsigned("length", q.get_len());
3505 f->close_section();
3506 }
3507 f->close_section();
3508 f->close_section();
3509 }
3510 f->close_section();
3511 f->open_array_section("new_purged_snaps");
3512 for (auto& p : new_purged_snaps) {
3513 f->open_object_section("pool");
3514 f->dump_int("pool", p.first);
3515 f->open_array_section("snaps");
3516 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3517 f->open_object_section("interval");
3518 f->dump_unsigned("begin", q.get_start());
3519 f->dump_unsigned("length", q.get_len());
3520 f->close_section();
3521 }
3522 f->close_section();
3523 f->close_section();
3524 }
3525 f->close_section();
81eedcae
TL
3526 f->open_object_section("crush_node_flags");
3527 for (auto& i : crush_node_flags) {
3528 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3529 : stringify(i.first);
3530 f->open_array_section(s.c_str());
3531 set<string> st;
3532 calc_state_set(i.second, st);
3533 for (auto& j : st) {
3534 f->dump_string("flag", j);
3535 }
3536 f->close_section();
3537 }
3538 f->close_section();
3539 f->open_object_section("device_class_flags");
3540 for (auto& i : device_class_flags) {
3541 const char* class_name = crush->get_class_name(i.first);
3542 string s = class_name ? class_name : stringify(i.first);
3543 f->open_array_section(s.c_str());
3544 set<string> st;
3545 calc_state_set(i.second, st);
3546 for (auto& j : st) {
3547 f->dump_string("flag", j);
3548 }
3549 f->close_section();
3550 }
3551 f->close_section();
7c673cae
FG
3552}
3553
3554void OSDMap::generate_test_instances(list<OSDMap*>& o)
3555{
3556 o.push_back(new OSDMap);
3557
3558 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3559 o.push_back(new OSDMap);
3560 uuid_d fsid;
224ce89b 3561 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae
FG
3562 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
3563 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
3564 cct->put();
3565}
3566
3567string OSDMap::get_flag_string(unsigned f)
3568{
3569 string s;
3570 if ( f& CEPH_OSDMAP_NEARFULL)
3571 s += ",nearfull";
3572 if (f & CEPH_OSDMAP_FULL)
3573 s += ",full";
3574 if (f & CEPH_OSDMAP_PAUSERD)
3575 s += ",pauserd";
3576 if (f & CEPH_OSDMAP_PAUSEWR)
3577 s += ",pausewr";
3578 if (f & CEPH_OSDMAP_PAUSEREC)
3579 s += ",pauserec";
3580 if (f & CEPH_OSDMAP_NOUP)
3581 s += ",noup";
3582 if (f & CEPH_OSDMAP_NODOWN)
3583 s += ",nodown";
3584 if (f & CEPH_OSDMAP_NOOUT)
3585 s += ",noout";
3586 if (f & CEPH_OSDMAP_NOIN)
3587 s += ",noin";
3588 if (f & CEPH_OSDMAP_NOBACKFILL)
3589 s += ",nobackfill";
3590 if (f & CEPH_OSDMAP_NOREBALANCE)
3591 s += ",norebalance";
3592 if (f & CEPH_OSDMAP_NORECOVER)
3593 s += ",norecover";
3594 if (f & CEPH_OSDMAP_NOSCRUB)
3595 s += ",noscrub";
3596 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3597 s += ",nodeep-scrub";
3598 if (f & CEPH_OSDMAP_NOTIERAGENT)
3599 s += ",notieragent";
11fdf7f2
TL
3600 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3601 s += ",nosnaptrim";
7c673cae
FG
3602 if (f & CEPH_OSDMAP_SORTBITWISE)
3603 s += ",sortbitwise";
3604 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3605 s += ",require_jewel_osds";
3606 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3607 s += ",require_kraken_osds";
3608 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3609 s += ",require_luminous_osds";
c07f9fc5
FG
3610 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3611 s += ",recovery_deletes";
181888fb
FG
3612 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3613 s += ",purged_snapdirs";
f64942e4
AA
3614 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3615 s += ",pglog_hardlimit";
7c673cae
FG
3616 if (s.length())
3617 s.erase(0, 1);
3618 return s;
3619}
3620
3621string OSDMap::get_flag_string() const
3622{
3623 return get_flag_string(flags);
3624}
3625
7c673cae
FG
3626void OSDMap::print_pools(ostream& out) const
3627{
3628 for (const auto &pool : pools) {
3629 std::string name("<unknown>");
3630 const auto &pni = pool_name.find(pool.first);
3631 if (pni != pool_name.end())
3632 name = pni->second;
3633 out << "pool " << pool.first
3634 << " '" << name
3635 << "' " << pool.second << "\n";
3636
3637 for (const auto &snap : pool.second.snaps)
3638 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3639
3640 if (!pool.second.removed_snaps.empty())
3641 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
11fdf7f2
TL
3642 auto p = removed_snaps_queue.find(pool.first);
3643 if (p != removed_snaps_queue.end()) {
3644 out << "\tremoved_snaps_queue " << p->second << "\n";
3645 }
7c673cae
FG
3646 }
3647 out << std::endl;
3648}
3649
3650void OSDMap::print(ostream& out) const
3651{
3652 out << "epoch " << get_epoch() << "\n"
3653 << "fsid " << get_fsid() << "\n"
3654 << "created " << get_created() << "\n"
3655 << "modified " << get_modified() << "\n";
3656
3657 out << "flags " << get_flag_string() << "\n";
31f18b77 3658 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
3659 out << "full_ratio " << full_ratio << "\n";
3660 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3661 out << "nearfull_ratio " << nearfull_ratio << "\n";
31f18b77
FG
3662 if (require_min_compat_client > 0) {
3663 out << "require_min_compat_client "
3664 << ceph_release_name(require_min_compat_client) << "\n";
7c673cae 3665 }
31f18b77
FG
3666 out << "min_compat_client " << ceph_release_name(get_min_compat_client())
3667 << "\n";
224ce89b
WB
3668 if (require_osd_release > 0) {
3669 out << "require_osd_release " << ceph_release_name(require_osd_release)
3670 << "\n";
3671 }
7c673cae
FG
3672 if (get_cluster_snapshot().length())
3673 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3674 out << "\n";
3675
3676 print_pools(out);
3677
3678 out << "max_osd " << get_max_osd() << "\n";
3679 for (int i=0; i<get_max_osd(); i++) {
3680 if (exists(i)) {
3681 out << "osd." << i;
3682 out << (is_up(i) ? " up ":" down");
3683 out << (is_in(i) ? " in ":" out");
3684 out << " weight " << get_weightf(i);
3685 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
3686 out << " primary_affinity " << get_primary_affinityf(i);
3687 const osd_info_t& info(get_info(i));
3688 out << " " << info;
11fdf7f2 3689 out << " " << get_addrs(i) << " " << get_cluster_addrs(i);
7c673cae
FG
3690 set<string> st;
3691 get_state(i, st);
3692 out << " " << st;
3693 if (!get_uuid(i).is_zero())
3694 out << " " << get_uuid(i);
3695 out << "\n";
3696 }
3697 }
3698 out << std::endl;
3699
3700 for (auto& p : pg_upmap) {
3701 out << "pg_upmap " << p.first << " " << p.second << "\n";
3702 }
3703 for (auto& p : pg_upmap_items) {
3704 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3705 }
3706
3707 for (const auto pg : *pg_temp)
3708 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3709
3710 for (const auto pg : *primary_temp)
3711 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3712
3713 for (const auto &addr : blacklist)
3714 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
3715}
3716
3717class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3718public:
3719 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
3720
3721 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3722 unsigned f)
c07f9fc5 3723 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3724
3725 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3726 if (!filter) {
3727 return true; // normal case
3728 }
3729 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3730 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3731 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3732 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3733 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3734 return true;
31f18b77 3735 }
c07f9fc5 3736 return false;
31f18b77
FG
3737 }
3738
3739 bool should_dump_empty_bucket() const override {
3740 return !filter;
3741 }
7c673cae 3742
11fdf7f2 3743 void init_table(TextTable *tbl) {
7c673cae 3744 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3745 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3746 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3747 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 3748 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 3749 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3750 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
3751 }
3752 void dump(TextTable *tbl, string& bucket) {
3753 init_table(tbl);
7c673cae 3754
11fdf7f2
TL
3755 if (!bucket.empty()) {
3756 set_root(bucket);
3757 Parent::dump(tbl);
3758 } else {
3759 Parent::dump(tbl);
3760 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3761 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3762 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3763 }
31f18b77 3764 }
7c673cae
FG
3765 }
3766 }
3767
3768protected:
3769 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
3770 const char *c = crush->get_item_class(qi.id);
3771 if (!c)
3772 c = "";
7c673cae 3773 *tbl << qi.id
224ce89b 3774 << c
7c673cae
FG
3775 << weightf_t(qi.weight);
3776
3777 ostringstream name;
3778 for (int k = 0; k < qi.depth; k++)
3779 name << " ";
3780 if (qi.is_bucket()) {
3781 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3782 << crush->get_item_name(qi.id);
3783 } else {
3784 name << "osd." << qi.id;
3785 }
3786 *tbl << name.str();
3787
3788 if (!qi.is_bucket()) {
3789 if (!osdmap->exists(qi.id)) {
3790 *tbl << "DNE"
3791 << 0;
3792 } else {
c07f9fc5
FG
3793 string s;
3794 if (osdmap->is_up(qi.id)) {
3795 s = "up";
3796 } else if (osdmap->is_destroyed(qi.id)) {
3797 s = "destroyed";
3798 } else {
3799 s = "down";
3800 }
3801 *tbl << s
7c673cae
FG
3802 << weightf_t(osdmap->get_weightf(qi.id))
3803 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3804 }
3805 }
3806 *tbl << TextTable::endrow;
3807 }
3808
3809private:
3810 const OSDMap *osdmap;
31f18b77 3811 const unsigned filter;
7c673cae
FG
3812};
3813
3814class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3815public:
3816 typedef CrushTreeDumper::FormattingDumper Parent;
3817
31f18b77
FG
3818 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3819 unsigned f)
c07f9fc5 3820 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3821
3822 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3823 if (!filter) {
3824 return true; // normal case
3825 }
3826 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3827 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3828 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3829 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3830 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3831 return true;
31f18b77 3832 }
c07f9fc5 3833 return false;
31f18b77
FG
3834 }
3835
3836 bool should_dump_empty_bucket() const override {
3837 return !filter;
3838 }
7c673cae 3839
11fdf7f2
TL
3840 void dump(Formatter *f, string& bucket) {
3841 if (!bucket.empty()) {
3842 set_root(bucket);
3843 f->open_array_section("nodes");
3844 Parent::dump(f);
3845 f->close_section();
3846 } else {
3847 f->open_array_section("nodes");
3848 Parent::dump(f);
3849 f->close_section();
3850 f->open_array_section("stray");
3851 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3852 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
3853 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3854 }
3855 f->close_section();
7c673cae 3856 }
7c673cae
FG
3857 }
3858
3859protected:
3860 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3861 Parent::dump_item_fields(qi, f);
3862 if (!qi.is_bucket())
3863 {
c07f9fc5
FG
3864 string s;
3865 if (osdmap->is_up(qi.id)) {
3866 s = "up";
3867 } else if (osdmap->is_destroyed(qi.id)) {
3868 s = "destroyed";
3869 } else {
3870 s = "down";
3871 }
7c673cae 3872 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 3873 f->dump_string("status", s);
7c673cae
FG
3874 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3875 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3876 }
3877 }
3878
3879private:
3880 const OSDMap *osdmap;
31f18b77 3881 const unsigned filter;
7c673cae
FG
3882};
3883
11fdf7f2 3884void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 3885{
31f18b77 3886 if (f) {
11fdf7f2 3887 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 3888 } else {
11fdf7f2 3889 ceph_assert(out);
7c673cae 3890 TextTable tbl;
11fdf7f2 3891 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
3892 *out << tbl;
3893 }
3894}
3895
224ce89b 3896void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 3897 const string& prefix, bool extra) const
7c673cae
FG
3898{
3899 if (f) {
3900 f->open_object_section("osdmap");
3901 f->dump_int("epoch", get_epoch());
3902 f->dump_int("num_osds", get_num_osds());
3903 f->dump_int("num_up_osds", get_num_up_osds());
3904 f->dump_int("num_in_osds", get_num_in_osds());
7c673cae
FG
3905 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3906 f->close_section();
3907 } else {
11fdf7f2 3908 utime_t now = ceph_clock_now();
31f18b77 3909 out << get_num_osds() << " osds: "
11fdf7f2
TL
3910 << get_num_up_osds() << " up";
3911 if (last_up_change != utime_t()) {
3912 out << " (since " << utimespan_str(now - last_up_change) << ")";
3913 }
3914 out << ", " << get_num_in_osds() << " in";
3915 if (last_in_change != utime_t()) {
3916 out << " (since " << utimespan_str(now - last_in_change) << ")";
3917 }
3918 if (extra)
3919 out << "; epoch: e" << get_epoch();
7c673cae
FG
3920 if (get_num_pg_temp())
3921 out << "; " << get_num_pg_temp() << " remapped pgs";
3922 out << "\n";
3923 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
3924 if (important_flags)
224ce89b 3925 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
3926 }
3927}
3928
3929void OSDMap::print_oneline_summary(ostream& out) const
3930{
3931 out << "e" << get_epoch() << ": "
31f18b77 3932 << get_num_osds() << " total, "
7c673cae
FG
3933 << get_num_up_osds() << " up, "
3934 << get_num_in_osds() << " in";
7c673cae
FG
3935}
3936
3efd9988 3937bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
3938{
3939 for (const auto &pool : pools) {
3efd9988 3940 if (pool.second.crush_rule == rule_id)
7c673cae
FG
3941 return true;
3942 }
3943 return false;
3944}
3945
3efd9988
FG
3946int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
3947 ostream *ss) const
3948{
3949 for (auto& i : pools) {
3950 auto& pool = i.second;
3951 int ruleno = pool.get_crush_rule();
3952 if (!newcrush->rule_exists(ruleno)) {
3953 *ss << "pool " << i.first << " references crush_rule " << ruleno
3954 << " but it is not present";
3955 return -EINVAL;
3956 }
3957 if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
3958 *ss << "rule " << ruleno << " mask ruleset does not match rule id";
3959 return -EINVAL;
3960 }
3961 if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
3962 *ss << "pool " << i.first << " type does not match rule " << ruleno;
3963 return -EINVAL;
3964 }
11fdf7f2
TL
3965 int poolsize = pool.get_size();
3966 if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
3967 poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
3968 *ss << "pool " << i.first << " size " << poolsize << " does not"
3efd9988
FG
3969 << " fall within rule " << ruleno
3970 << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
3971 << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
3972 return -EINVAL;
3973 }
3974 }
3975 return 0;
3976}
3977
224ce89b
WB
3978int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
3979 int nosd, int pg_bits, int pgp_bits,
3980 bool default_pool)
7c673cae 3981{
224ce89b
WB
3982 ldout(cct, 10) << "build_simple on " << nosd
3983 << " osds" << dendl;
7c673cae
FG
3984 epoch = e;
3985 set_fsid(fsid);
3986 created = modified = ceph_clock_now();
3987
3988 if (nosd >= 0) {
3989 set_max_osd(nosd);
3990 } else {
3991 // count osds
3992 int maxosd = 0;
11fdf7f2 3993 const auto& conf = cct->_conf;
7c673cae 3994 vector<string> sections;
11fdf7f2 3995 conf.get_all_sections(sections);
7c673cae
FG
3996
3997 for (auto &section : sections) {
3998 if (section.find("osd.") != 0)
3999 continue;
4000
4001 const char *begin = section.c_str() + 4;
4002 char *end = (char*)begin;
4003 int o = strtol(begin, &end, 10);
4004 if (*end != '\0')
4005 continue;
4006
4007 if (o > cct->_conf->mon_max_osd) {
4008 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4009 return -ERANGE;
4010 }
4011
4012 if (o > maxosd)
4013 maxosd = o;
4014 }
4015
4016 set_max_osd(maxosd + 1);
4017 }
4018
7c673cae
FG
4019
4020 stringstream ss;
4021 int r;
4022 if (nosd >= 0)
4023 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4024 else
4025 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4026 ceph_assert(r == 0);
7c673cae
FG
4027
4028 int poolbase = get_max_osd() ? get_max_osd() : 1;
4029
d2e6a577 4030 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
11fdf7f2 4031 ceph_assert(default_replicated_rule >= 0);
7c673cae 4032
224ce89b
WB
4033 if (default_pool) {
4034 // pgp_num <= pg_num
4035 if (pgp_bits > pg_bits)
4036 pgp_bits = pg_bits;
4037
4038 vector<string> pool_names;
4039 pool_names.push_back("rbd");
4040 for (auto &plname : pool_names) {
4041 int64_t pool = ++pool_max;
4042 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4043 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4044 if (cct->_conf->osd_pool_default_flag_hashpspool)
4045 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4046 if (cct->_conf->osd_pool_default_flag_nodelete)
4047 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4048 if (cct->_conf->osd_pool_default_flag_nopgchange)
4049 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4050 if (cct->_conf->osd_pool_default_flag_nosizechange)
4051 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
11fdf7f2
TL
4052 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4053 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4054 pools[pool].size);
224ce89b
WB
4055 pools[pool].crush_rule = default_replicated_rule;
4056 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4057 pools[pool].set_pg_num(poolbase << pg_bits);
4058 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4059 pools[pool].set_pg_num_target(poolbase << pg_bits);
4060 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4061 pools[pool].last_change = epoch;
c07f9fc5
FG
4062 pools[pool].application_metadata.insert(
4063 {pg_pool_t::APPLICATION_NAME_RBD, {}});
11fdf7f2
TL
4064 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4065 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4066 pools[pool].pg_autoscale_mode = m >= 0 ? m : 0;
224ce89b
WB
4067 pool_name[pool] = plname;
4068 name_pool[plname] = pool;
4069 }
7c673cae
FG
4070 }
4071
4072 for (int i=0; i<get_max_osd(); i++) {
4073 set_state(i, 0);
4074 set_weight(i, CEPH_OSD_OUT);
4075 }
4076
4077 map<string,string> profile_map;
4078 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4079 if (r < 0) {
4080 lderr(cct) << ss.str() << dendl;
4081 return r;
4082 }
4083 set_erasure_code_profile("default", profile_map);
4084 return 0;
4085}
4086
4087int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4088 map<string,string> &profile_map,
4089 ostream *ss)
4090{
11fdf7f2 4091 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4092 *ss,
4093 &profile_map);
4094 return r;
4095}
4096
4097int OSDMap::_build_crush_types(CrushWrapper& crush)
4098{
4099 crush.set_type_name(0, "osd");
4100 crush.set_type_name(1, "host");
4101 crush.set_type_name(2, "chassis");
4102 crush.set_type_name(3, "rack");
4103 crush.set_type_name(4, "row");
4104 crush.set_type_name(5, "pdu");
4105 crush.set_type_name(6, "pod");
4106 crush.set_type_name(7, "room");
4107 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4108 crush.set_type_name(9, "zone");
4109 crush.set_type_name(10, "region");
4110 crush.set_type_name(11, "root");
4111 return 11;
7c673cae
FG
4112}
4113
4114int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4115 int nosd, ostream *ss)
4116{
4117 crush.create();
4118
4119 // root
4120 int root_type = _build_crush_types(crush);
4121 int rootid;
4122 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4123 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4124 ceph_assert(r == 0);
7c673cae
FG
4125 crush.set_item_name(rootid, "default");
4126
4127 for (int o=0; o<nosd; o++) {
4128 map<string,string> loc;
4129 loc["host"] = "localhost";
4130 loc["rack"] = "localrack";
4131 loc["root"] = "default";
4132 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4133 char name[32];
4134 snprintf(name, sizeof(name), "osd.%d", o);
4135 crush.insert_item(cct, o, 1.0, name, loc);
4136 }
4137
31f18b77 4138 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4139
4140 crush.finalize();
4141
4142 return 0;
4143}
4144
4145int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4146 CrushWrapper& crush,
4147 ostream *ss)
4148{
11fdf7f2 4149 const auto& conf = cct->_conf;
7c673cae
FG
4150
4151 crush.create();
4152
4153 // root
4154 int root_type = _build_crush_types(crush);
4155 int rootid;
4156 int r = crush.add_bucket(0, 0,
4157 CRUSH_HASH_DEFAULT,
4158 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4159 ceph_assert(r == 0);
7c673cae
FG
4160 crush.set_item_name(rootid, "default");
4161
4162 // add osds
4163 vector<string> sections;
11fdf7f2 4164 conf.get_all_sections(sections);
7c673cae
FG
4165
4166 for (auto &section : sections) {
4167 if (section.find("osd.") != 0)
4168 continue;
4169
4170 const char *begin = section.c_str() + 4;
4171 char *end = (char*)begin;
4172 int o = strtol(begin, &end, 10);
4173 if (*end != '\0')
4174 continue;
4175
4176 string host, rack, row, room, dc, pool;
4177 vector<string> sectiontmp;
4178 sectiontmp.push_back("osd");
4179 sectiontmp.push_back(section);
11fdf7f2
TL
4180 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4181 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4182 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4183 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4184 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4185 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4186
4187 if (host.length() == 0)
4188 host = "unknownhost";
4189 if (rack.length() == 0)
4190 rack = "unknownrack";
4191
4192 map<string,string> loc;
4193 loc["host"] = host;
4194 loc["rack"] = rack;
4195 if (row.size())
4196 loc["row"] = row;
4197 if (room.size())
4198 loc["room"] = room;
4199 if (dc.size())
4200 loc["datacenter"] = dc;
4201 loc["root"] = "default";
4202
4203 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4204 crush.insert_item(cct, o, 1.0, section, loc);
4205 }
4206
31f18b77 4207 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4208
4209 crush.finalize();
4210
4211 return 0;
4212}
4213
4214
31f18b77
FG
4215int OSDMap::build_simple_crush_rules(
4216 CephContext *cct,
4217 CrushWrapper& crush,
4218 const string& root,
4219 ostream *ss)
7c673cae 4220{
31f18b77 4221 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
7c673cae
FG
4222 string failure_domain =
4223 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4224
7c673cae 4225 int r;
31f18b77 4226 r = crush.add_simple_rule_at(
224ce89b 4227 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4228 "firstn", pg_pool_t::TYPE_REPLICATED,
4229 crush_rule, ss);
7c673cae
FG
4230 if (r < 0)
4231 return r;
4232 // do not add an erasure rule by default or else we will implicitly
4233 // require the crush_v2 feature of clients
4234 return 0;
4235}
4236
4237int OSDMap::summarize_mapping_stats(
4238 OSDMap *newmap,
4239 const set<int64_t> *pools,
4240 std::string *out,
4241 Formatter *f) const
4242{
4243 set<int64_t> ls;
4244 if (pools) {
4245 ls = *pools;
4246 } else {
4247 for (auto &p : get_pools())
4248 ls.insert(p.first);
4249 }
4250
4251 unsigned total_pg = 0;
4252 unsigned moved_pg = 0;
4253 vector<unsigned> base_by_osd(get_max_osd(), 0);
4254 vector<unsigned> new_by_osd(get_max_osd(), 0);
4255 for (int64_t pool_id : ls) {
4256 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4257 vector<int> up, up2;
4258 int up_primary;
7c673cae 4259 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4260 pg_t pgid(ps, pool_id);
7c673cae 4261 total_pg += pi->get_size();
31f18b77 4262 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4263 for (int osd : up) {
4264 if (osd >= 0 && osd < get_max_osd())
4265 ++base_by_osd[osd];
4266 }
4267 if (newmap) {
31f18b77 4268 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4269 for (int osd : up2) {
4270 if (osd >= 0 && osd < get_max_osd())
4271 ++new_by_osd[osd];
4272 }
4273 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4274 for (unsigned i=0; i<up.size(); ++i) {
4275 if (up[i] != up2[i]) {
4276 ++moved_pg;
4277 }
4278 }
4279 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4280 for (int osd : up) {
4281 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4282 ++moved_pg;
4283 }
4284 }
4285 } else {
11fdf7f2 4286 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4287 }
4288 }
4289 }
4290 }
4291
4292 unsigned num_up_in = 0;
4293 for (int osd = 0; osd < get_max_osd(); ++osd) {
4294 if (is_up(osd) && is_in(osd))
4295 ++num_up_in;
4296 }
4297 if (!num_up_in) {
4298 return -EINVAL;
4299 }
4300
4301 float avg_pg = (float)total_pg / (float)num_up_in;
4302 float base_stddev = 0, new_stddev = 0;
4303 int min = -1, max = -1;
4304 unsigned min_base_pg = 0, max_base_pg = 0;
4305 unsigned min_new_pg = 0, max_new_pg = 0;
4306 for (int osd = 0; osd < get_max_osd(); ++osd) {
4307 if (is_up(osd) && is_in(osd)) {
4308 float base_diff = (float)base_by_osd[osd] - avg_pg;
4309 base_stddev += base_diff * base_diff;
4310 float new_diff = (float)new_by_osd[osd] - avg_pg;
4311 new_stddev += new_diff * new_diff;
4312 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4313 min = osd;
4314 min_base_pg = base_by_osd[osd];
4315 min_new_pg = new_by_osd[osd];
4316 }
4317 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4318 max = osd;
4319 max_base_pg = base_by_osd[osd];
4320 max_new_pg = new_by_osd[osd];
4321 }
4322 }
4323 }
4324 base_stddev = sqrt(base_stddev / num_up_in);
4325 new_stddev = sqrt(new_stddev / num_up_in);
4326
4327 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4328
4329 ostringstream ss;
4330 if (f)
4331 f->open_object_section("utilization");
4332 if (newmap) {
4333 if (f) {
4334 f->dump_unsigned("moved_pgs", moved_pg);
4335 f->dump_unsigned("total_pgs", total_pg);
4336 } else {
4337 float percent = 0;
4338 if (total_pg)
4339 percent = (float)moved_pg * 100.0 / (float)total_pg;
4340 ss << "moved " << moved_pg << " / " << total_pg
4341 << " (" << percent << "%)\n";
4342 }
4343 }
4344 if (f) {
4345 f->dump_float("avg_pgs", avg_pg);
4346 f->dump_float("std_dev", base_stddev);
4347 f->dump_float("expected_baseline_std_dev", edev);
4348 if (newmap)
4349 f->dump_float("new_std_dev", new_stddev);
4350 } else {
4351 ss << "avg " << avg_pg << "\n";
4352 ss << "stddev " << base_stddev;
4353 if (newmap)
4354 ss << " -> " << new_stddev;
4355 ss << " (expected baseline " << edev << ")\n";
4356 }
4357 if (min >= 0) {
4358 if (f) {
4359 f->dump_unsigned("min_osd", min);
4360 f->dump_unsigned("min_osd_pgs", min_base_pg);
4361 if (newmap)
4362 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4363 } else {
4364 ss << "min osd." << min << " with " << min_base_pg;
4365 if (newmap)
4366 ss << " -> " << min_new_pg;
4367 ss << " pgs (" << (float)min_base_pg / avg_pg;
4368 if (newmap)
4369 ss << " -> " << (float)min_new_pg / avg_pg;
4370 ss << " * mean)\n";
4371 }
4372 }
4373 if (max >= 0) {
4374 if (f) {
4375 f->dump_unsigned("max_osd", max);
4376 f->dump_unsigned("max_osd_pgs", max_base_pg);
4377 if (newmap)
4378 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4379 } else {
4380 ss << "max osd." << max << " with " << max_base_pg;
4381 if (newmap)
4382 ss << " -> " << max_new_pg;
4383 ss << " pgs (" << (float)max_base_pg / avg_pg;
4384 if (newmap)
4385 ss << " -> " << (float)max_new_pg / avg_pg;
4386 ss << " * mean)\n";
4387 }
4388 }
4389 if (f)
4390 f->close_section();
4391 if (out)
4392 *out = ss.str();
4393 return 0;
4394}
4395
7c673cae
FG
4396bool OSDMap::try_pg_upmap(
4397 CephContext *cct,
4398 pg_t pg, ///< pg to potentially remap
4399 const set<int>& overfull, ///< osds we'd want to evacuate
4400 const vector<int>& underfull, ///< osds to move to, in order of preference
4401 vector<int> *orig,
4402 vector<int> *out) ///< resulting alternative mapping
4403{
4404 const pg_pool_t *pool = get_pg_pool(pg.pool());
4405 if (!pool)
4406 return false;
31f18b77 4407 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
7c673cae
FG
4408 pool->get_size());
4409 if (rule < 0)
4410 return false;
4411
7c673cae
FG
4412 // make sure there is something there to remap
4413 bool any = false;
4414 for (auto osd : *orig) {
4415 if (overfull.count(osd)) {
4416 any = true;
4417 break;
4418 }
4419 }
4420 if (!any) {
4421 return false;
4422 }
4423
4424 int r = crush->try_remap_rule(
4425 cct,
4426 rule,
4427 pool->get_size(),
4428 overfull, underfull,
4429 *orig,
4430 out);
4431 if (r < 0)
4432 return false;
4433 if (*out == *orig)
4434 return false;
4435 return true;
4436}
4437
// Search for pg_upmap_items changes that even out the number of PGs per OSD
// and record them in pending_inc.
//
// Works on a private copy of this map (tmp). Each pass of the main loop
// proposes a single change (cancel an existing remapping, shrink one, or add
// one new remapping pair), evaluates it on temporary per-OSD PG sets, and
// keeps it only if the sum of squared deviations strictly decreases.
//
// cct                  context used for logging and config lookups
// max_deviation_ratio  an OSD is only worked on while its deviation/target
//                      ratio is at least this value
// max                  upper bound on main-loop passes; also sets the decay
//                      step (1/max) used when selecting over/underfull OSDs
// only_pools           if non-empty, restrict the work to these pool ids
// pending_inc          receives old_pg_upmap_items / new_pg_upmap_items
//
// Returns the number of pg_upmap_items entries removed or (re)written, or 0
// when the total weight is zero, max <= 0, or the initial deviation is already
// within osd_calc_pg_upmaps_max_stddev.
4438int OSDMap::calc_pg_upmaps(
4439 CephContext *cct,
31f18b77 4440 float max_deviation_ratio,
7c673cae 4441 int max,
a8e16298 4442 const set<int64_t>& only_pools,
7c673cae
FG
4443 OSDMap::Incremental *pending_inc)
4444{
a8e16298 4445 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
7c673cae
FG
// all trial changes are applied to this copy, never to *this
4446 OSDMap tmp;
4447 tmp.deepish_copy_from(*this);
4448 int num_changed = 0;
a8e16298
TL
4449 map<int,set<pg_t>> pgs_by_osd;
4450 int total_pgs = 0;
4451 float osd_weight_total = 0;
4452 map<int,float> osd_weight;
// Pass 1: for every selected pool, record which PGs each OSD currently
// holds (up set) and accumulate each OSD's effective weight.
4453 for (auto& i : pools) {
4454 if (!only_pools.empty() && !only_pools.count(i.first))
4455 continue;
4456 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
4457 pg_t pg(ps, i.first);
4458 vector<int> up;
4459 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
4460 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
4461 for (auto osd : up) {
4462 if (osd != CRUSH_ITEM_NONE)
4463 pgs_by_osd[osd].insert(pg);
7c673cae 4464 }
a8e16298
TL
4465 }
// counted in replicas: pool size times pg_num
4466 total_pgs += i.second.get_size() * i.second.get_pg_num();
4467
4468 map<int,float> pmap;
4469 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
4470 i.second.get_type(),
4471 i.second.get_size());
4472 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
4473 ldout(cct,20) << __func__ << " pool " << i.first
4474 << " ruleno " << ruleno
4475 << " weight-map " << pmap
4476 << dendl;
4477 for (auto p : pmap) {
// rule weight scaled by the osd's weight from get_weightf();
// zero-weight osds are left out of osd_weight entirely
4478 auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
4479 if (adjusted_weight == 0) {
4480 continue;
31f18b77 4481 }
a8e16298
TL
4482 osd_weight[p.first] += adjusted_weight;
4483 osd_weight_total += adjusted_weight;
7c673cae 4484 }
a8e16298
TL
4485 }
// make sure every weighted osd has a (possibly empty) entry in pgs_by_osd
4486 for (auto& i : osd_weight) {
4487 int pgs = 0;
4488 auto p = pgs_by_osd.find(i.first);
4489 if (p != pgs_by_osd.end())
31f18b77 4490 pgs = p->second.size();
a8e16298 4491 else
31f18b77 4492 pgs_by_osd.emplace(i.first, set<pg_t>());
a8e16298 4493 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
31f18b77 4494 << " pgs " << pgs << dendl;
a8e16298
TL
4495 }
4496 if (osd_weight_total == 0) {
4497 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4498 return 0;
4499 }
4500 float pgs_per_weight = total_pgs / osd_weight_total;
4501 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4502 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4503
a8e16298
TL
4504 if (max <= 0) {
4505 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4506 return 0;
4507 }
4508 float decay_factor = 1.0 / float(max);
// NOTE: despite the name, stddev accumulates the sum of squared deviations;
// no mean or square root is taken anywhere in this function
4509 float stddev = 0;
4510 map<int,float> osd_deviation; // osd, deviation(pgs)
4511 multimap<float,int> deviation_osd; // deviation(pgs), osd
4512 for (auto& i : pgs_by_osd) {
4513 // make sure osd is still there (belongs to this crush-tree)
4514 ceph_assert(osd_weight.count(i.first));
4515 float target = osd_weight[i.first] * pgs_per_weight;
4516 float deviation = (float)i.second.size() - target;
4517 ldout(cct, 20) << " osd." << i.first
4518 << "\tpgs " << i.second.size()
4519 << "\ttarget " << target
4520 << "\tdeviation " << deviation
4521 << dendl;
4522 osd_deviation[i.first] = deviation;
4523 deviation_osd.insert(make_pair(deviation, i.first));
4524 stddev += deviation * deviation;
4525 }
11fdf7f2 4526 if (stddev <= cct->_conf.get_val<double>("osd_calc_pg_upmaps_max_stddev")) {
a8e16298
TL
4527 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4528 << dendl;
4529 return 0;
4530 }
// skip_overfull persists across passes; it is toggled below when a pass
// exhausts its local retries without improving the distribution
4531 bool skip_overfull = false;
4532 auto aggressive =
11fdf7f2 4533 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
a8e16298 4534 auto local_fallback_retries =
11fdf7f2 4535 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
a8e16298
TL
// Main loop: at most `max` passes, each accepting at most one change.
4536 while (max--) {
4537 // build overfull and underfull
4538 set<int> overfull;
4539 vector<int> underfull;
4540 float decay = 0;
4541 int decay_count = 0;
// overfull: osds with deviation >= 1.0 - decay; the threshold is lowered
// in decay_factor steps until at least one osd qualifies or decay hits 1.0
4542 while (overfull.empty()) {
4543 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
4544 if (i->first >= (1.0 - decay))
4545 overfull.insert(i->second);
4546 }
4547 if (!overfull.empty())
4548 break;
4549 decay_count++;
4550 decay = decay_factor * decay_count;
4551 if (decay >= 1.0)
4552 break;
4553 ldout(cct, 30) << " decay_factor = " << decay_factor
4554 << " decay_count = " << decay_count
4555 << " decay (overfull) = " << decay
4556 << dendl;
4557 }
4558 if (overfull.empty()) {
4559 lderr(cct) << __func__ << " failed to build overfull" << dendl;
224ce89b
WB
4560 break;
4561 }
7c673cae 4562
a8e16298
TL
4563 decay = 0;
4564 decay_count = 0;
// underfull: osds with deviation < -.999 + decay, emptiest first (ascending
// walk of deviation_osd); threshold relaxed the same way as above
4565 while (underfull.empty()) {
4566 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
4567 if (i->first >= (-.999 + decay))
4568 break;
4569 underfull.push_back(i->second);
4570 }
4571 if (!underfull.empty())
4572 break;
4573 decay_count++;
4574 decay = decay_factor * decay_count;
4575 if (decay >= .999)
4576 break;
4577 ldout(cct, 30) << " decay_factor = " << decay_factor
4578 << " decay_count = " << decay_count
4579 << " decay (underfull) = " << decay
4580 << dendl;
7c673cae 4581 }
a8e16298
TL
4582 if (underfull.empty()) {
4583 lderr(cct) << __func__ << " failed to build underfull" << dendl;
7c673cae 4584 break;
a8e16298 4585 }
7c673cae 4586
a8e16298
TL
4587 ldout(cct, 10) << " overfull " << overfull
4588 << " underfull " << underfull
4589 << dendl;
// to_skip collects PGs whose proposed change was rejected in this pass
4590 set<pg_t> to_skip;
4591 uint64_t local_fallback_retried = 0;
4592
// retry: re-entered (via goto below) after a rejected proposal, with the
// rejected PGs added to to_skip; the candidate state below starts afresh
4593 retry:
4594
4595 set<pg_t> to_unmap;
4596 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4597 auto temp_pgs_by_osd = pgs_by_osd;
4598 // always start with fullest, break if we find any changes to make
7c673cae 4599 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
a8e16298
TL
4600 if (skip_overfull) {
4601 ldout(cct, 10) << " skipping overfull " << dendl;
4602 break; // fall through to check underfull
4603 }
7c673cae 4604 int osd = p->second;
31f18b77 4605 float deviation = p->first;
7c673cae 4606 float target = osd_weight[osd] * pgs_per_weight;
a8e16298
TL
4607 ceph_assert(target > 0);
4608 float deviation_ratio = deviation / target;
4609 if (deviation_ratio < max_deviation_ratio) {
7c673cae 4610 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4611 << " target " << target
4612 << " deviation " << deviation
4613 << " -> ratio " << deviation_ratio
4614 << " < max ratio " << max_deviation_ratio
4615 << dendl;
7c673cae
FG
4616 break;
4617 }
7c673cae 4618
a8e16298
TL
4619 vector<pg_t> pgs;
4620 pgs.reserve(pgs_by_osd[osd].size());
4621 for (auto& pg : pgs_by_osd[osd]) {
4622 if (to_skip.count(pg))
4623 continue;
4624 pgs.push_back(pg);
4625 }
4626 if (aggressive) {
4627 // shuffle PG list so they all get equal (in)attention
4628 std::random_device rd;
4629 std::default_random_engine rng{rd()};
4630 std::shuffle(pgs.begin(), pgs.end(), rng);
4631 }
7c673cae
FG
4632 // look for remaps we can un-remap
4633 for (auto pg : pgs) {
4634 auto p = tmp.pg_upmap_items.find(pg);
a8e16298
TL
4635 if (p == tmp.pg_upmap_items.end())
4636 continue;
4637 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4638 for (auto q : p->second) {
4639 if (q.second == osd) {
4640 ldout(cct, 10) << " will try dropping existing"
4641 << " remapping pair "
4642 << q.first << " -> " << q.second
4643 << " which remapped " << pg
4644 << " into overfull osd." << osd
4645 << dendl;
4646 temp_pgs_by_osd[q.second].erase(pg);
4647 temp_pgs_by_osd[q.first].insert(pg);
4648 } else {
4649 new_upmap_items.push_back(q);
4650 }
4651 }
4652 if (new_upmap_items.empty()) {
4653 // drop whole item
4654 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4655 << " remapped " << pg << " into overfull osd." << osd
4656 << ", will try cancelling it entirely"
4657 << dendl;
4658 to_unmap.insert(pg);
4659 goto test_change;
4660 } else if (new_upmap_items.size() != p->second.size()) {
4661 // drop single remapping pair, updating
4662 ceph_assert(new_upmap_items.size() < p->second.size());
4663 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4664 << " remapped " << pg << " into overfull osd." << osd
4665 << ", new_pg_upmap_items now " << new_upmap_items
4666 << dendl;
4667 to_upmap[pg] = new_upmap_items;
4668 goto test_change;
4669 }
4670 }
7c673cae 4671
a8e16298 4672 // try upmap
7c673cae 4673 for (auto pg : pgs) {
a8e16298
TL
4674 auto temp_it = tmp.pg_upmap.find(pg);
4675 if (temp_it != tmp.pg_upmap.end()) {
4676 // leave pg_upmap alone
4677 // it must be specified by admin since balancer does not
4678 // support pg_upmap yet
4679 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4680 << temp_it->second << ", skipping"
4681 << dendl;
7c673cae
FG
4682 continue;
4683 }
a8e16298
TL
4684 auto pg_pool_size = tmp.get_pg_pool_size(pg);
4685 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4686 set<int> existing;
4687 auto it = tmp.pg_upmap_items.find(pg);
4688 if (it != tmp.pg_upmap_items.end() &&
4689 it->second.size() >= (size_t)pg_pool_size) {
4690 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4691 << it->second << ", skipping"
4692 << dendl;
4693 continue;
4694 } else if (it != tmp.pg_upmap_items.end()) {
4695 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4696 << it->second
4697 << dendl;
4698 new_upmap_items = it->second;
4699 // build existing too (for dedup)
4700 for (auto i : it->second) {
4701 existing.insert(i.first);
4702 existing.insert(i.second);
4703 }
4704 // fall through
4705 // to see if we can append more remapping pairs
4706 }
4707 ldout(cct, 10) << " trying " << pg << dendl;
494da23a
TL
4708 vector<int> raw, orig, out;
4709 tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
7c673cae
FG
4710 if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
4711 continue;
4712 }
a8e16298 4713 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4714 if (orig.size() != out.size()) {
4715 continue;
4716 }
a8e16298 4717 ceph_assert(orig != out);
7c673cae 4718 for (unsigned i = 0; i < out.size(); ++i) {
a8e16298
TL
4719 if (orig[i] == out[i])
4720 continue; // skip invalid remappings
4721 if (existing.count(orig[i]) || existing.count(out[i]))
4722 continue; // we want new remappings only!
4723 ldout(cct, 10) << " will try adding new remapping pair "
4724 << orig[i] << " -> " << out[i] << " for " << pg
4725 << dendl;
4726 existing.insert(orig[i]);
4727 existing.insert(out[i]);
4728 temp_pgs_by_osd[orig[i]].erase(pg);
4729 temp_pgs_by_osd[out[i]].insert(pg);
4730 ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
4731 new_upmap_items.push_back(make_pair(orig[i], out[i]));
4732 // append new remapping pairs slowly
4733 // This way we can make sure that each tiny change will
4734 // definitely make distribution of PGs converging to
4735 // the perfect status.
4736 to_upmap[pg] = new_upmap_items;
4737 goto test_change;
7c673cae 4738 }
a8e16298
TL
4739 }
4740 }
7c673cae 4741
a8e16298
TL
4742 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4743 ldout(cct, 10) << " failed to find any changes for overfull osds"
4744 << dendl;
// Second chance: walk osds from emptiest upward (stopping at the first one
// not in underfull) and try cancelling remappings that moved PGs away
4745 for (auto& p : deviation_osd) {
4746 if (std::find(underfull.begin(), underfull.end(), p.second) ==
4747 underfull.end())
4748 break;
4749 int osd = p.second;
4750 float deviation = p.first;
4751 float target = osd_weight[osd] * pgs_per_weight;
4752 ceph_assert(target > 0);
4753 float deviation_ratio = abs(deviation / target);
4754 if (deviation_ratio < max_deviation_ratio) {
4755 // respect max_deviation_ratio too
4756 ldout(cct, 10) << " osd." << osd
4757 << " target " << target
4758 << " deviation " << deviation
4759 << " -> absolute ratio " << deviation_ratio
4760 << " < max ratio " << max_deviation_ratio
4761 << dendl;
4762 break;
4763 }
4764 // look for remaps we can un-remap
4765 vector<pair<pg_t,
4766 mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
4767 candidates.reserve(tmp.pg_upmap_items.size());
4768 for (auto& i : tmp.pg_upmap_items) {
4769 if (to_skip.count(i.first))
4770 continue;
4771 if (!only_pools.empty() && !only_pools.count(i.first.pool()))
4772 continue;
4773 candidates.push_back(make_pair(i.first, i.second));
4774 }
4775 if (aggressive) {
4776 // shuffle candidates so they all get equal (in)attention
4777 std::random_device rd;
4778 std::default_random_engine rng{rd()};
4779 std::shuffle(candidates.begin(), candidates.end(), rng);
4780 }
4781 for (auto& i : candidates) {
4782 auto pg = i.first;
4783 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4784 for (auto& j : i.second) {
4785 if (j.first == osd) {
4786 ldout(cct, 10) << " will try dropping existing"
4787 << " remapping pair "
4788 << j.first << " -> " << j.second
4789 << " which remapped " << pg
4790 << " out from underfull osd." << osd
4791 << dendl;
4792 temp_pgs_by_osd[j.second].erase(pg);
4793 temp_pgs_by_osd[j.first].insert(pg);
4794 } else {
4795 new_upmap_items.push_back(j);
4796 }
4797 }
4798 if (new_upmap_items.empty()) {
4799 // drop whole item
4800 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4801 << " remapped " << pg
4802 << " out from underfull osd." << osd
4803 << ", will try cancelling it entirely"
4804 << dendl;
4805 to_unmap.insert(pg);
4806 goto test_change;
4807 } else if (new_upmap_items.size() != i.second.size()) {
4808 // drop single remapping pair, updating
4809 ceph_assert(new_upmap_items.size() < i.second.size());
4810 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4811 << " remapped " << pg
4812 << " out from underfull osd." << osd
4813 << ", new_pg_upmap_items now " << new_upmap_items
4814 << dendl;
4815 to_upmap[pg] = new_upmap_items;
4816 goto test_change;
4817 }
4818 }
7c673cae 4819 }
a8e16298
TL
4820
4821 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4822 ldout(cct, 10) << " failed to find any changes for underfull osds"
4823 << dendl;
4824 if (!aggressive) {
4825 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
4826 break;
4827 } else if (!skip_overfull) {
4828 // safe to quit because below here we know
4829 // we've done checking both overfull and underfull osds..
4830 ldout(cct, 10) << " break due to not being able to find any"
4831 << " further optimizations"
4832 << dendl;
7c673cae
FG
4833 break;
4834 }
a8e16298
TL
4835 // restart with fullest and do exhaustive searching
4836 skip_overfull = false;
4837 continue;
4838
// test_change: reached only via goto with exactly one proposal recorded in
// to_unmap / to_upmap and its effect already applied to temp_pgs_by_osd
4839 test_change:
4840
4841 // test change, apply if change is good
4842 ceph_assert(to_unmap.size() || to_upmap.size());
4843 float new_stddev = 0;
4844 map<int,float> temp_osd_deviation;
4845 multimap<float,int> temp_deviation_osd;
4846 for (auto& i : temp_pgs_by_osd) {
4847 // make sure osd is still there (belongs to this crush-tree)
4848 ceph_assert(osd_weight.count(i.first));
4849 float target = osd_weight[i.first] * pgs_per_weight;
4850 float deviation = (float)i.second.size() - target;
4851 ldout(cct, 20) << " osd." << i.first
4852 << "\tpgs " << i.second.size()
4853 << "\ttarget " << target
4854 << "\tdeviation " << deviation
4855 << dendl;
4856 temp_osd_deviation[i.first] = deviation;
4857 temp_deviation_osd.insert(make_pair(deviation, i.first));
4858 new_stddev += deviation * deviation;
4859 }
4860 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
// reject any proposal that does not strictly reduce the squared deviation
4861 if (new_stddev >= stddev) {
4862 if (!aggressive) {
4863 ldout(cct, 10) << " break because stddev is not decreasing"
4864 << " and aggressive mode is not enabled"
4865 << dendl;
4866 break;
4867 }
4868 local_fallback_retried++;
4869 if (local_fallback_retried >= local_fallback_retries) {
4870 // does not make progress
4871 // flip *skip_overfull* so both overfull and underfull
4872 // get equal (in)attention
4873 skip_overfull = !skip_overfull;
4874 ldout(cct, 10) << " hit local_fallback_retries "
4875 << local_fallback_retries
4876 << dendl;
4877 continue;
4878 }
4879 for (auto& i : to_unmap)
4880 to_skip.insert(i);
4881 for (auto& i : to_upmap)
4882 to_skip.insert(i.first);
4883 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
4884 << " to_skip " << to_skip
4885 << dendl;
4886 goto retry;
4887 }
4888
4889 // ready to go
// commit: adopt the trial state, then mirror the change into both the
// working copy (tmp) and the caller's pending incremental
4890 ceph_assert(new_stddev < stddev);
4891 stddev = new_stddev;
4892 pgs_by_osd = temp_pgs_by_osd;
4893 osd_deviation = temp_osd_deviation;
4894 deviation_osd = temp_deviation_osd;
4895 for (auto& i : to_unmap) {
4896 ldout(cct, 10) << " unmap pg " << i << dendl;
4897 ceph_assert(tmp.pg_upmap_items.count(i));
4898 tmp.pg_upmap_items.erase(i);
4899 pending_inc->old_pg_upmap_items.insert(i);
4900 ++num_changed;
4901 }
4902 for (auto& i : to_upmap) {
4903 ldout(cct, 10) << " upmap pg " << i.first
4904 << " new pg_upmap_items " << i.second
4905 << dendl;
4906 tmp.pg_upmap_items[i.first] = i.second;
4907 pending_inc->new_pg_upmap_items[i.first] = i.second;
4908 ++num_changed;
4909 }
7c673cae 4910 }
a8e16298 4911 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
4912 return num_changed;
4913}
31f18b77
FG
4914
4915int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
4916{
4917 return crush->get_leaves(name, osds);
4918}
4919
3efd9988
FG
4920// get pools whose crush rules might reference the given osd
4921void OSDMap::get_pool_ids_by_osd(CephContext *cct,
4922 int osd,
4923 set<int64_t> *pool_ids) const
4924{
11fdf7f2 4925 ceph_assert(pool_ids);
3efd9988
FG
4926 set<int> raw_rules;
4927 int r = crush->get_rules_by_osd(osd, &raw_rules);
4928 if (r < 0) {
4929 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
4930 << dendl;
11fdf7f2 4931 ceph_assert(r >= 0);
3efd9988
FG
4932 }
4933 set<int> rules;
4934 for (auto &i: raw_rules) {
4935 // exclude any dead rule
4936 if (crush_rule_in_use(i)) {
4937 rules.insert(i);
4938 }
4939 }
4940 for (auto &r: rules) {
4941 get_pool_ids_by_rule(r, pool_ids);
4942 }
4943}
4944
31f18b77
FG
4945template <typename F>
4946class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
4947public:
4948 typedef CrushTreeDumper::Dumper<F> Parent;
4949
4950 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2
TL
4951 const PGMap& pgmap_, bool tree_,
4952 const string& class_name_,
4953 const string& item_name_) :
c07f9fc5 4954 Parent(crush, osdmap_->get_pool_names()),
31f18b77 4955 osdmap(osdmap_),
11fdf7f2 4956 pgmap(pgmap_),
31f18b77 4957 tree(tree_),
11fdf7f2
TL
4958 class_name(class_name_),
4959 item_name(item_name_),
31f18b77
FG
4960 min_var(-1),
4961 max_var(-1),
4962 stddev(0),
4963 sum(0) {
11fdf7f2
TL
4964 if (osdmap->crush->name_exists(item_name)) {
4965 // filter out items we are allowed to dump
4966 auto item_id = osdmap->crush->get_item_id(item_name);
4967 allowed.insert(item_id);
4968 osdmap->crush->get_all_children(item_id, &allowed);
4969 }
4970 average_util = average_utilization();
31f18b77
FG
4971 }
4972
4973protected:
11fdf7f2
TL
4974
4975 bool should_dump(int id) const {
4976 if (!allowed.empty() && !allowed.count(id)) // filter by name
4977 return false;
4978 if (id >= 0 && !class_name.empty()) {
4979 const char* item_class_name = osdmap->crush->get_item_class(id);
4980 if (!item_class_name || // not bound to a class yet
4981 item_class_name != class_name) // or already bound to
4982 // a different class
4983 return false;
4984 }
4985 return true;
4986 }
4987
4988 set<int> get_dumped_osds() {
4989 if (class_name.empty() && item_name.empty()) {
4990 // old way, all
4991 return {};
4992 }
4993 return dumped_osds;
4994 }
4995
31f18b77
FG
4996 void dump_stray(F *f) {
4997 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4998 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 4999 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
5000 }
5001 }
5002
5003 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
5004 if (!tree && qi.is_bucket())
5005 return;
11fdf7f2
TL
5006 if (!should_dump(qi.id))
5007 return;
31f18b77 5008
11fdf7f2
TL
5009 if (!qi.is_bucket())
5010 dumped_osds.insert(qi.id);
31f18b77 5011 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
5012 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5013 kb_used_meta = 0, kb_avail = 0;
31f18b77 5014 double util = 0;
11fdf7f2
TL
5015 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5016 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
5017 if (kb_used && kb)
5018 util = 100.0 * (double)kb_used / (double)kb;
5019
5020 double var = 1.0;
5021 if (average_util)
5022 var = util / average_util;
5023
11fdf7f2 5024 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 5025
11fdf7f2
TL
5026 dump_item(qi, reweight, kb, kb_used,
5027 kb_used_data, kb_used_omap, kb_used_meta,
5028 kb_avail, util, var, num_pgs, f);
31f18b77
FG
5029
5030 if (!qi.is_bucket() && reweight > 0) {
5031 if (min_var < 0 || var < min_var)
5032 min_var = var;
5033 if (max_var < 0 || var > max_var)
5034 max_var = var;
5035
5036 double dev = util - average_util;
5037 dev *= dev;
5038 stddev += reweight * dev;
5039 sum += reweight;
5040 }
5041 }
5042
5043 virtual void dump_item(const CrushTreeDumper::Item &qi,
5044 float &reweight,
5045 int64_t kb,
5046 int64_t kb_used,
11fdf7f2
TL
5047 int64_t kb_used_data,
5048 int64_t kb_used_omap,
5049 int64_t kb_used_meta,
31f18b77
FG
5050 int64_t kb_avail,
5051 double& util,
5052 double& var,
5053 const size_t num_pgs,
5054 F *f) = 0;
5055
5056 double dev() {
5057 return sum > 0 ? sqrt(stddev / sum) : 0;
5058 }
5059
5060 double average_utilization() {
5061 int64_t kb = 0, kb_used = 0;
5062 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
5063 if (!osdmap->exists(i) ||
5064 osdmap->get_weight(i) == 0 ||
5065 !should_dump(i))
31f18b77 5066 continue;
11fdf7f2
TL
5067 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5068 kb_avail_i;
5069 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5070 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
5071 kb += kb_i;
5072 kb_used += kb_used_i;
5073 }
5074 }
5075 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5076 }
5077
5078 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5079 int64_t* kb_used_data,
5080 int64_t* kb_used_omap,
5081 int64_t* kb_used_meta,
31f18b77 5082 int64_t* kb_avail) const {
11fdf7f2 5083 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 5084 if (!p) return false;
11fdf7f2
TL
5085 *kb = p->statfs.kb();
5086 *kb_used = p->statfs.kb_used_raw();
5087 *kb_used_data = p->statfs.kb_used_data();
5088 *kb_used_omap = p->statfs.kb_used_omap();
5089 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5090 *kb_avail = p->statfs.kb_avail();
5091
31f18b77
FG
5092 return *kb > 0;
5093 }
5094
5095 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5096 int64_t* kb_used_data,
5097 int64_t* kb_used_omap,
5098 int64_t* kb_used_meta,
31f18b77
FG
5099 int64_t* kb_avail) const {
5100 if (id >= 0) {
11fdf7f2 5101 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
5102 *kb = 0;
5103 *kb_used = 0;
11fdf7f2
TL
5104 *kb_used_data = 0;
5105 *kb_used_omap = 0;
5106 *kb_used_meta = 0;
31f18b77
FG
5107 *kb_avail = 0;
5108 return true;
5109 }
11fdf7f2
TL
5110 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5111 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
5112 }
5113
5114 *kb = 0;
5115 *kb_used = 0;
11fdf7f2
TL
5116 *kb_used_data = 0;
5117 *kb_used_omap = 0;
5118 *kb_used_meta = 0;
31f18b77
FG
5119 *kb_avail = 0;
5120
5121 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5122 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
5123 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5124 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5125 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5126 &kb_used_data_i, &kb_used_omap_i,
5127 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
5128 return false;
5129 *kb += kb_i;
5130 *kb_used += kb_used_i;
11fdf7f2
TL
5131 *kb_used_data += kb_used_data_i;
5132 *kb_used_omap += kb_used_omap_i;
5133 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
5134 *kb_avail += kb_avail_i;
5135 }
5136 return *kb > 0;
5137 }
5138
5139protected:
// Cluster map queried for OSD existence, weight and in/out/up state.
5140 const OSDMap *osdmap;
// Source of per-OSD statfs figures and per-OSD PG counts.
11fdf7f2 5141 const PGMap& pgmap;
// When true the plain-text dumper adds a "TYPE NAME" column.
31f18b77 5142 bool tree;
11fdf7f2
TL
// Filter arguments handed in through the constructor.
// NOTE(review): how they are applied is not visible in this part of the
// file - presumably they restrict which items are dumped; confirm.
5143 const string class_name;
5144 const string item_name;
31f18b77
FG
// Divisor used to compute each item's "var" (util / average_util).
5145 double average_util;
// Smallest / largest "var" seen among OSDs with reweight > 0; a negative
// value is treated as "not set yet" by the update code.
5146 double min_var;
5147 double max_var;
// Running sum of reweight * (util - average_util)^2; dev() divides it by
// `sum` and takes the square root.
5148 double stddev;
// Running sum of the reweight values contributing to `stddev`.
5149 double sum;
11fdf7f2
TL
// NOTE(review): neither set is written in the visible part of the file;
// presumably `allowed` holds the ids admitted by the filters and
// `dumped_osds` the OSDs actually emitted - confirm.
5150 set<int> allowed;
5151 set<int> dumped_osds;
31f18b77
FG
5152};
5153
5154
5155class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5156public:
5157 typedef OSDUtilizationDumper<TextTable> Parent;
5158
5159 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2
TL
5160 const PGMap& pgmap, bool tree,
5161 const string& class_name,
5162 const string& item_name) :
5163 Parent(crush, osdmap, pgmap, tree, class_name, item_name) {}
31f18b77
FG
5164
5165 void dump(TextTable *tbl) {
5166 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 5167 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5168 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5169 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5170 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
5171 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5172 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5173 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5174 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5175 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5176 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5177 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5178 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 5179 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5180 if (tree)
5181 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5182
5183 Parent::dump(tbl);
5184
5185 dump_stray(tbl);
5186
11fdf7f2 5187 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
5188 *tbl << ""
5189 << ""
5190 << "" << "TOTAL"
11fdf7f2
TL
5191 << byte_u_t(sum.statfs.total)
5192 << byte_u_t(sum.statfs.get_used_raw())
5193 << byte_u_t(sum.statfs.allocated)
5194 << byte_u_t(sum.statfs.omap_allocated)
5195 << byte_u_t(sum.statfs.internal_metadata)
5196 << byte_u_t(sum.statfs.available)
31f18b77
FG
5197 << lowprecision_t(average_util)
5198 << ""
5199 << TextTable::endrow;
5200 }
5201
5202protected:
5203 struct lowprecision_t {
5204 float v;
5205 explicit lowprecision_t(float _v) : v(_v) {}
5206 };
5207 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5208
5209 using OSDUtilizationDumper<TextTable>::dump_item;
5210 void dump_item(const CrushTreeDumper::Item &qi,
5211 float &reweight,
5212 int64_t kb,
5213 int64_t kb_used,
11fdf7f2
TL
5214 int64_t kb_used_data,
5215 int64_t kb_used_omap,
5216 int64_t kb_used_meta,
31f18b77
FG
5217 int64_t kb_avail,
5218 double& util,
5219 double& var,
5220 const size_t num_pgs,
5221 TextTable *tbl) override {
224ce89b
WB
5222 const char *c = crush->get_item_class(qi.id);
5223 if (!c)
5224 c = "";
31f18b77 5225 *tbl << qi.id
224ce89b 5226 << c
31f18b77
FG
5227 << weightf_t(qi.weight)
5228 << weightf_t(reweight)
1adf2230
AA
5229 << byte_u_t(kb << 10)
5230 << byte_u_t(kb_used << 10)
11fdf7f2
TL
5231 << byte_u_t(kb_used_data << 10)
5232 << byte_u_t(kb_used_omap << 10)
5233 << byte_u_t(kb_used_meta << 10)
1adf2230 5234 << byte_u_t(kb_avail << 10)
31f18b77
FG
5235 << lowprecision_t(util)
5236 << lowprecision_t(var);
5237
5238 if (qi.is_bucket()) {
5239 *tbl << "-";
11fdf7f2 5240 *tbl << "";
31f18b77
FG
5241 } else {
5242 *tbl << num_pgs;
11fdf7f2
TL
5243 if (osdmap->is_up(qi.id)) {
5244 *tbl << "up";
5245 } else if (osdmap->is_destroyed(qi.id)) {
5246 *tbl << "destroyed";
5247 } else {
5248 *tbl << "down";
5249 }
31f18b77
FG
5250 }
5251
5252 if (tree) {
5253 ostringstream name;
5254 for (int k = 0; k < qi.depth; k++)
5255 name << " ";
5256 if (qi.is_bucket()) {
5257 int type = crush->get_bucket_type(qi.id);
5258 name << crush->get_type_name(type) << " "
5259 << crush->get_item_name(qi.id);
5260 } else {
5261 name << "osd." << qi.id;
5262 }
5263 *tbl << name.str();
5264 }
5265
5266 *tbl << TextTable::endrow;
5267 }
5268
5269public:
5270 string summary() {
5271 ostringstream out;
5272 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5273 << "/" << lowprecision_t(max_var) << " "
5274 << "STDDEV: " << lowprecision_t(dev());
5275 return out.str();
5276 }
5277};
5278
5279ostream& operator<<(ostream& out,
5280 const OSDUtilizationPlainDumper::lowprecision_t& v)
5281{
5282 if (v.v < -0.01) {
5283 return out << "-";
5284 } else if (v.v < 0.001) {
5285 return out << "0";
5286 } else {
5287 std::streamsize p = out.precision();
5288 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5289 }
5290}
5291
5292class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5293public:
5294 typedef OSDUtilizationDumper<Formatter> Parent;
5295
5296 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2
TL
5297 const PGMap& pgmap, bool tree,
5298 const string& class_name,
5299 const string& item_name) :
5300 Parent(crush, osdmap, pgmap, tree, class_name, item_name) {}
31f18b77
FG
5301
5302 void dump(Formatter *f) {
5303 f->open_array_section("nodes");
5304 Parent::dump(f);
5305 f->close_section();
5306
5307 f->open_array_section("stray");
5308 dump_stray(f);
5309 f->close_section();
5310 }
5311
5312protected:
5313 using OSDUtilizationDumper<Formatter>::dump_item;
5314 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
5315 float &reweight,
5316 int64_t kb,
5317 int64_t kb_used,
5318 int64_t kb_used_data,
5319 int64_t kb_used_omap,
5320 int64_t kb_used_meta,
5321 int64_t kb_avail,
5322 double& util,
5323 double& var,
5324 const size_t num_pgs,
5325 Formatter *f) override {
31f18b77 5326 f->open_object_section("item");
c07f9fc5 5327 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
5328 f->dump_float("reweight", reweight);
5329 f->dump_int("kb", kb);
5330 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
5331 f->dump_int("kb_used_data", kb_used_data);
5332 f->dump_int("kb_used_omap", kb_used_omap);
5333 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
5334 f->dump_int("kb_avail", kb_avail);
5335 f->dump_float("utilization", util);
5336 f->dump_float("var", var);
5337 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
5338 if (!qi.is_bucket()) {
5339 if (osdmap->is_up(qi.id)) {
5340 f->dump_string("status", "up");
5341 } else if (osdmap->is_destroyed(qi.id)) {
5342 f->dump_string("status", "destroyed");
5343 } else {
5344 f->dump_string("status", "down");
5345 }
5346 }
31f18b77
FG
5347 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5348 f->close_section();
5349 }
5350
5351public:
5352 void summary(Formatter *f) {
5353 f->open_object_section("summary");
11fdf7f2
TL
5354 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5355 auto& s = sum.statfs;
5356
5357 f->dump_int("total_kb", s.kb());
5358 f->dump_int("total_kb_used", s.kb_used_raw());
5359 f->dump_int("total_kb_used_data", s.kb_used_data());
5360 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5361 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5362 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
5363 f->dump_float("average_utilization", average_util);
5364 f->dump_float("min_var", min_var);
5365 f->dump_float("max_var", max_var);
5366 f->dump_float("dev", dev());
5367 f->close_section();
5368 }
5369};
5370
5371void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
5372 const PGMap& pgmap,
5373 ostream& out,
5374 Formatter *f,
5375 bool tree,
5376 const string& class_name,
5377 const string& item_name)
31f18b77
FG
5378{
5379 const CrushWrapper *crush = osdmap.crush.get();
5380 if (f) {
5381 f->open_object_section("df");
11fdf7f2
TL
5382 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree,
5383 class_name, item_name);
31f18b77
FG
5384 d.dump(f);
5385 d.summary(f);
5386 f->close_section();
5387 f->flush(out);
5388 } else {
11fdf7f2
TL
5389 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree,
5390 class_name, item_name);
31f18b77
FG
5391 TextTable tbl;
5392 d.dump(&tbl);
5393 out << tbl << d.summary() << "\n";
5394 }
5395}
224ce89b
WB
5396
5397void OSDMap::check_health(health_check_map_t *checks) const
5398{
5399 int num_osds = get_num_osds();
5400
5401 // OSD_DOWN
5402 // OSD_$subtree_DOWN
5403 // OSD_ORPHAN
5404 if (num_osds >= 0) {
5405 int num_in_osds = 0;
5406 int num_down_in_osds = 0;
5407 set<int> osds;
5408 set<int> down_in_osds;
5409 set<int> up_in_osds;
5410 set<int> subtree_up;
5411 unordered_map<int, set<int> > subtree_type_down;
5412 unordered_map<int, int> num_osds_subtree;
5413 int max_type = crush->get_max_type_id();
5414
5415 for (int i = 0; i < get_max_osd(); i++) {
5416 if (!exists(i)) {
5417 if (crush->item_exists(i)) {
5418 osds.insert(i);
5419 }
5420 continue;
5421 }
5422 if (is_out(i))
5423 continue;
5424 ++num_in_osds;
5425 if (down_in_osds.count(i) || up_in_osds.count(i))
5426 continue;
5427 if (!is_up(i)) {
5428 down_in_osds.insert(i);
5429 int parent_id = 0;
5430 int current = i;
5431 for (int type = 0; type <= max_type; type++) {
5432 if (!crush->get_type_name(type))
5433 continue;
5434 int r = crush->get_immediate_parent_id(current, &parent_id);
5435 if (r == -ENOENT)
5436 break;
5437 // break early if this parent is already marked as up
5438 if (subtree_up.count(parent_id))
5439 break;
5440 type = crush->get_bucket_type(parent_id);
5441 if (!subtree_type_is_down(
5442 g_ceph_context, parent_id, type,
5443 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
5444 break;
5445 current = parent_id;
5446 }
5447 }
5448 }
5449
5450 // calculate the number of down osds in each down subtree and
5451 // store it in num_osds_subtree
5452 for (int type = 1; type <= max_type; type++) {
5453 if (!crush->get_type_name(type))
5454 continue;
5455 for (auto j = subtree_type_down[type].begin();
5456 j != subtree_type_down[type].end();
5457 ++j) {
5458 list<int> children;
5459 int num = 0;
5460 int num_children = crush->get_children(*j, &children);
5461 if (num_children == 0)
5462 continue;
5463 for (auto l = children.begin(); l != children.end(); ++l) {
5464 if (*l >= 0) {
5465 ++num;
5466 } else if (num_osds_subtree[*l] > 0) {
5467 num = num + num_osds_subtree[*l];
5468 }
5469 }
5470 num_osds_subtree[*j] = num;
5471 }
5472 }
5473 num_down_in_osds = down_in_osds.size();
11fdf7f2 5474 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
5475 if (num_down_in_osds > 0) {
5476 // summary of down subtree types and osds
5477 for (int type = max_type; type > 0; type--) {
5478 if (!crush->get_type_name(type))
5479 continue;
5480 if (subtree_type_down[type].size() > 0) {
5481 ostringstream ss;
5482 ss << subtree_type_down[type].size() << " "
5483 << crush->get_type_name(type);
5484 if (subtree_type_down[type].size() > 1) {
5485 ss << "s";
5486 }
5487 int sum_down_osds = 0;
5488 for (auto j = subtree_type_down[type].begin();
5489 j != subtree_type_down[type].end();
5490 ++j) {
5491 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
5492 }
5493 ss << " (" << sum_down_osds << " osds) down";
5494 string err = string("OSD_") +
5495 string(crush->get_type_name(type)) + "_DOWN";
5496 boost::to_upper(err);
5497 auto& d = checks->add(err, HEALTH_WARN, ss.str());
5498 for (auto j = subtree_type_down[type].rbegin();
5499 j != subtree_type_down[type].rend();
5500 ++j) {
5501 ostringstream ss;
5502 ss << crush->get_type_name(type);
5503 ss << " ";
5504 ss << crush->get_item_name(*j);
5505 // at the top level, do not print location
5506 if (type != max_type) {
5507 ss << " (";
5508 ss << crush->get_full_location_ordered_string(*j);
5509 ss << ")";
5510 }
5511 int num = num_osds_subtree[*j];
5512 ss << " (" << num << " osds)";
5513 ss << " is down";
5514 d.detail.push_back(ss.str());
5515 }
5516 }
5517 }
5518 ostringstream ss;
5519 ss << down_in_osds.size() << " osds down";
5520 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
5521 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
5522 ostringstream ss;
5523 ss << "osd." << *it << " (";
5524 ss << crush->get_full_location_ordered_string(*it);
5525 ss << ") is down";
5526 d.detail.push_back(ss.str());
5527 }
5528 }
5529
5530 if (!osds.empty()) {
5531 ostringstream ss;
5532 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
5533 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
5534 for (auto osd : osds) {
5535 ostringstream ss;
5536 ss << "osd." << osd << " exists in crush map but not in osdmap";
5537 d.detail.push_back(ss.str());
5538 }
5539 }
5540 }
5541
eafe8130
TL
5542 std::list<std::string> scrub_messages;
5543 bool noscrub = false, nodeepscrub = false;
5544 for (const auto &p : pools) {
5545 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
5546 ostringstream ss;
5547 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
5548 scrub_messages.push_back(ss.str());
5549 noscrub = true;
5550 }
5551 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
5552 ostringstream ss;
5553 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
5554 scrub_messages.push_back(ss.str());
5555 nodeepscrub = true;
5556 }
5557 }
5558 if (noscrub || nodeepscrub) {
5559 string out = "";
5560 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
5561 out += nodeepscrub ? "nodeep-scrub" : "";
5562 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
5563 "Some pool(s) have the " + out + " flag(s) set");
5564 d.detail.splice(d.detail.end(), scrub_messages);
5565 }
5566
224ce89b
WB
5567 // OSD_OUT_OF_ORDER_FULL
5568 {
5569 // An osd could configure failsafe ratio, to something different
5570 // but for now assume it is the same here.
11fdf7f2 5571 float fsr = g_conf()->osd_failsafe_full_ratio;
224ce89b
WB
5572 if (fsr > 1.0) fsr /= 100;
5573 float fr = get_full_ratio();
5574 float br = get_backfillfull_ratio();
5575 float nr = get_nearfull_ratio();
5576
5577 list<string> detail;
5578 // These checks correspond to how OSDService::check_full_status() in an OSD
5579 // handles the improper setting of these values.
5580 if (br < nr) {
5581 ostringstream ss;
5582 ss << "backfillfull_ratio (" << br
5583 << ") < nearfull_ratio (" << nr << "), increased";
5584 detail.push_back(ss.str());
5585 br = nr;
5586 }
5587 if (fr < br) {
5588 ostringstream ss;
5589 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
5590 << "), increased";
5591 detail.push_back(ss.str());
5592 fr = br;
5593 }
5594 if (fsr < fr) {
5595 ostringstream ss;
5596 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
5597 << "), increased";
5598 detail.push_back(ss.str());
5599 }
5600 if (!detail.empty()) {
5601 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
5602 "full ratio(s) out of order");
5603 d.detail.swap(detail);
5604 }
5605 }
5606
5607 // OSD_FULL
5608 // OSD_NEARFULL
5609 // OSD_BACKFILLFULL
5610 // OSD_FAILSAFE_FULL
5611 {
5612 set<int> full, backfillfull, nearfull;
5613 get_full_osd_counts(&full, &backfillfull, &nearfull);
5614 if (full.size()) {
5615 ostringstream ss;
5616 ss << full.size() << " full osd(s)";
5617 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
5618 for (auto& i: full) {
5619 ostringstream ss;
5620 ss << "osd." << i << " is full";
5621 d.detail.push_back(ss.str());
5622 }
5623 }
5624 if (backfillfull.size()) {
5625 ostringstream ss;
5626 ss << backfillfull.size() << " backfillfull osd(s)";
5627 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
5628 for (auto& i: backfillfull) {
5629 ostringstream ss;
5630 ss << "osd." << i << " is backfill full";
5631 d.detail.push_back(ss.str());
5632 }
5633 }
5634 if (nearfull.size()) {
5635 ostringstream ss;
5636 ss << nearfull.size() << " nearfull osd(s)";
5637 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
5638 for (auto& i: nearfull) {
5639 ostringstream ss;
5640 ss << "osd." << i << " is near full";
5641 d.detail.push_back(ss.str());
5642 }
5643 }
5644 }
5645
5646 // OSDMAP_FLAGS
5647 {
5648 // warn about flags
5649 uint64_t warn_flags =
3efd9988 5650 CEPH_OSDMAP_NEARFULL |
224ce89b
WB
5651 CEPH_OSDMAP_FULL |
5652 CEPH_OSDMAP_PAUSERD |
5653 CEPH_OSDMAP_PAUSEWR |
5654 CEPH_OSDMAP_PAUSEREC |
5655 CEPH_OSDMAP_NOUP |
5656 CEPH_OSDMAP_NODOWN |
5657 CEPH_OSDMAP_NOIN |
5658 CEPH_OSDMAP_NOOUT |
5659 CEPH_OSDMAP_NOBACKFILL |
5660 CEPH_OSDMAP_NORECOVER |
5661 CEPH_OSDMAP_NOSCRUB |
5662 CEPH_OSDMAP_NODEEP_SCRUB |
5663 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 5664 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
5665 CEPH_OSDMAP_NOREBALANCE;
5666 if (test_flag(warn_flags)) {
5667 ostringstream ss;
5668 ss << get_flag_string(get_flags() & warn_flags)
5669 << " flag(s) set";
5670 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
5671 }
5672 }
5673
5674 // OSD_FLAGS
5675 {
5676 list<string> detail;
5677 const unsigned flags =
5678 CEPH_OSD_NOUP |
5679 CEPH_OSD_NOIN |
5680 CEPH_OSD_NODOWN |
5681 CEPH_OSD_NOOUT;
5682 for (int i = 0; i < max_osd; ++i) {
5683 if (osd_state[i] & flags) {
5684 ostringstream ss;
5685 set<string> states;
5686 OSDMap::calc_state_set(osd_state[i] & flags, states);
5687 ss << "osd." << i << " has flags " << states;
5688 detail.push_back(ss.str());
5689 }
5690 }
81eedcae
TL
5691 for (auto& i : crush_node_flags) {
5692 if (i.second && crush->item_exists(i.first)) {
5693 ostringstream ss;
5694 set<string> states;
5695 OSDMap::calc_state_set(i.second, states);
5696 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
5697 const char *tn = crush->get_type_name(t);
5698 ss << (tn ? tn : "node") << " "
5699 << crush->get_item_name(i.first) << " has flags " << states;
5700 detail.push_back(ss.str());
5701 }
5702 }
5703 for (auto& i : device_class_flags) {
5704 const char* class_name = crush->get_class_name(i.first);
5705 if (i.second && class_name) {
5706 ostringstream ss;
5707 set<string> states;
5708 OSDMap::calc_state_set(i.second, states);
5709 ss << "device class '" << class_name << "' has flags " << states;
5710 detail.push_back(ss.str());
5711 }
5712 }
224ce89b
WB
5713 if (!detail.empty()) {
5714 ostringstream ss;
81eedcae 5715 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
224ce89b
WB
5716 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
5717 d.detail.swap(detail);
5718 }
5719 }
5720
5721 // OLD_CRUSH_TUNABLES
11fdf7f2 5722 if (g_conf()->mon_warn_on_legacy_crush_tunables) {
224ce89b 5723 string min = crush->get_min_required_version();
11fdf7f2 5724 if (min < g_conf()->mon_crush_min_required_version) {
224ce89b
WB
5725 ostringstream ss;
5726 ss << "crush map has legacy tunables (require " << min
11fdf7f2 5727 << ", min is " << g_conf()->mon_crush_min_required_version << ")";
224ce89b
WB
5728 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
5729 d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5730 }
5731 }
5732
5733 // OLD_CRUSH_STRAW_CALC_VERSION
11fdf7f2 5734 if (g_conf()->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
5735 if (crush->get_straw_calc_version() == 0) {
5736 ostringstream ss;
5737 ss << "crush map has straw_calc_version=0";
5738 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
5739 d.detail.push_back(
5740 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5741 }
5742 }
5743
5744 // CACHE_POOL_NO_HIT_SET
11fdf7f2 5745 if (g_conf()->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b
WB
5746 list<string> detail;
5747 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5748 p != pools.end();
5749 ++p) {
5750 const pg_pool_t& info = p->second;
5751 if (info.cache_mode_requires_hit_set() &&
5752 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
5753 ostringstream ss;
5754 ss << "pool '" << get_pool_name(p->first)
5755 << "' with cache_mode " << info.get_cache_mode_name()
5756 << " needs hit_set_type to be set but it is not";
5757 detail.push_back(ss.str());
5758 }
5759 }
5760 if (!detail.empty()) {
5761 ostringstream ss;
5762 ss << detail.size() << " cache pools are missing hit_sets";
5763 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
5764 d.detail.swap(detail);
5765 }
5766 }
5767
5768 // OSD_NO_SORTBITWISE
11fdf7f2 5769 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 5770 ostringstream ss;
11fdf7f2 5771 ss << "'sortbitwise' flag is not set";
224ce89b
WB
5772 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
5773 }
5774
5775 // OSD_UPGRADE_FINISHED
5776 // none of these (yet) since we don't run until luminous upgrade is done.
5777
3efd9988 5778 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 5779 {
3efd9988 5780 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
5781 for (auto it : get_pools()) {
5782 const pg_pool_t &pool = it.second;
3efd9988 5783 const string& pool_name = get_pool_name(it.first);
224ce89b 5784 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 5785 stringstream ss;
11fdf7f2 5786 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
5787 // may run out of space too,
5788 // but we want EQUOTA taking precedence
11fdf7f2 5789 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
5790 } else {
5791 ss << "pool '" << pool_name << "' is full (no space)";
5792 }
5793 full_detail.push_back(ss.str());
5794 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
5795 stringstream ss;
5796 ss << "pool '" << pool_name << "' is backfillfull";
5797 backfillfull_detail.push_back(ss.str());
5798 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
5799 stringstream ss;
5800 ss << "pool '" << pool_name << "' is nearfull";
5801 nearfull_detail.push_back(ss.str());
224ce89b
WB
5802 }
5803 }
3efd9988 5804 if (!full_detail.empty()) {
224ce89b 5805 ostringstream ss;
3efd9988 5806 ss << full_detail.size() << " pool(s) full";
224ce89b 5807 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
3efd9988
FG
5808 d.detail.swap(full_detail);
5809 }
5810 if (!backfillfull_detail.empty()) {
5811 ostringstream ss;
5812 ss << backfillfull_detail.size() << " pool(s) backfillfull";
5813 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
5814 d.detail.swap(backfillfull_detail);
5815 }
5816 if (!nearfull_detail.empty()) {
5817 ostringstream ss;
5818 ss << nearfull_detail.size() << " pool(s) nearfull";
5819 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
5820 d.detail.swap(nearfull_detail);
224ce89b
WB
5821 }
5822 }
5823}
35e4c445
FG
5824
5825int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
5826 ostream *ss) const
5827{
5828 out->clear();
5829 for (auto i = ls.begin(); i != ls.end(); ++i) {
5830 if (i == ls.begin() &&
5831 (*i == "any" || *i == "all" || *i == "*")) {
5832 get_all_osds(*out);
5833 break;
5834 }
5835 long osd = parse_osd_id(i->c_str(), ss);
5836 if (osd < 0) {
5837 *ss << "invalid osd id '" << *i << "'";
5838 return -EINVAL;
5839 }
5840 out->insert(osd);
5841 }
5842 return 0;
5843}
11fdf7f2
TL
5844
5845void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
5846 string &subtree,
5847 int limit, // how many
5848 set<int> skip,
5849 set<int> *want) const {
5850 if (limit <= 0)
5851 return;
5852 int subtree_type = crush->get_type_id(subtree);
5853 if (subtree_type < 1)
5854 return;
5855 vector<int> subtrees;
5856 crush->get_subtree_of_type(subtree_type, &subtrees);
5857 std::random_device rd;
5858 std::default_random_engine rng{rd()};
5859 std::shuffle(subtrees.begin(), subtrees.end(), rng);
5860 for (auto s : subtrees) {
5861 if (limit <= 0)
5862 break;
5863 if (crush->subtree_contains(s, n))
5864 continue;
5865 vector<int> osds;
5866 crush->get_children_of_type(s, 0, &osds);
5867 if (osds.empty())
5868 continue;
5869 vector<int> up_osds;
5870 for (auto o : osds) {
5871 if (is_up(o) && !skip.count(o))
5872 up_osds.push_back(o);
5873 }
5874 if (up_osds.empty())
5875 continue;
5876 auto it = up_osds.begin();
5877 std::advance(it, (n % up_osds.size()));
5878 want->insert(*it);
5879 --limit;
5880 }
5881}
5882
5883float OSDMap::pool_raw_used_rate(int64_t poolid) const
5884{
5885 const pg_pool_t *pool = get_pg_pool(poolid);
5886 assert(pool != nullptr);
5887
5888 switch (pool->get_type()) {
5889 case pg_pool_t::TYPE_REPLICATED:
5890 return pool->get_size();
5891 break;
5892 case pg_pool_t::TYPE_ERASURE:
5893 {
5894 auto& ecp =
5895 get_erasure_code_profile(pool->erasure_code_profile);
5896 auto pm = ecp.find("m");
5897 auto pk = ecp.find("k");
5898 if (pm != ecp.end() && pk != ecp.end()) {
5899 int k = atoi(pk->second.c_str());
5900 int m = atoi(pm->second.c_str());
5901 int mk = m + k;
5902 ceph_assert(mk != 0);
5903 ceph_assert(k != 0);
5904 return (float)mk / k;
5905 } else {
5906 return 0.0;
5907 }
5908 }
5909 break;
5910 default:
5911 ceph_abort_msg("unrecognized pool type");
5912 }
5913}
81eedcae
TL
5914
5915unsigned OSDMap::get_osd_crush_node_flags(int osd) const
5916{
5917 unsigned flags = 0;
5918 if (!crush_node_flags.empty()) {
5919 // the map will contain type -> name
5920 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
5921 for (auto& i : ploc) {
5922 int id = crush->get_item_id(i.second);
5923 auto p = crush_node_flags.find(id);
5924 if (p != crush_node_flags.end()) {
5925 flags |= p->second;
5926 }
5927 }
5928 }
5929 return flags;
5930}
5931
5932unsigned OSDMap::get_crush_node_flags(int id) const
5933{
5934 unsigned flags = 0;
5935 auto it = crush_node_flags.find(id);
5936 if (it != crush_node_flags.end())
5937 flags = it->second;
5938 return flags;
5939}
5940
5941unsigned OSDMap::get_device_class_flags(int id) const
5942{
5943 unsigned flags = 0;
5944 auto it = device_class_flags.find(id);
5945 if (it != device_class_flags.end())
5946 flags = it->second;
5947 return flags;
5948}