]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
11fdf7f2
TL
18#include <algorithm>
19#include <optional>
20#include <random>
21
224ce89b
WB
22#include <boost/algorithm/string.hpp>
23
7c673cae 24#include "OSDMap.h"
7c673cae 25#include "common/config.h"
3efd9988 26#include "common/errno.h"
7c673cae
FG
27#include "common/Formatter.h"
28#include "common/TextTable.h"
29#include "include/ceph_features.h"
30#include "include/str_map.h"
31
32#include "common/code_environment.h"
224ce89b 33#include "mon/health_check.h"
7c673cae
FG
34
35#include "crush/CrushTreeDumper.h"
36#include "common/Clock.h"
11fdf7f2
TL
37#include "mon/PGMap.h"
38
7c673cae
FG
39#define dout_subsys ceph_subsys_osd
40
41MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
42MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
43
44
45// ----------------------------------
46// osd_info_t
47
48void osd_info_t::dump(Formatter *f) const
49{
50 f->dump_int("last_clean_begin", last_clean_begin);
51 f->dump_int("last_clean_end", last_clean_end);
52 f->dump_int("up_from", up_from);
53 f->dump_int("up_thru", up_thru);
54 f->dump_int("down_at", down_at);
55 f->dump_int("lost_at", lost_at);
56}
57
58void osd_info_t::encode(bufferlist& bl) const
59{
11fdf7f2 60 using ceph::encode;
7c673cae 61 __u8 struct_v = 1;
11fdf7f2
TL
62 encode(struct_v, bl);
63 encode(last_clean_begin, bl);
64 encode(last_clean_end, bl);
65 encode(up_from, bl);
66 encode(up_thru, bl);
67 encode(down_at, bl);
68 encode(lost_at, bl);
7c673cae
FG
69}
70
11fdf7f2 71void osd_info_t::decode(bufferlist::const_iterator& bl)
7c673cae 72{
11fdf7f2 73 using ceph::decode;
7c673cae 74 __u8 struct_v;
11fdf7f2
TL
75 decode(struct_v, bl);
76 decode(last_clean_begin, bl);
77 decode(last_clean_end, bl);
78 decode(up_from, bl);
79 decode(up_thru, bl);
80 decode(down_at, bl);
81 decode(lost_at, bl);
7c673cae
FG
82}
83
84void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
85{
86 o.push_back(new osd_info_t);
87 o.push_back(new osd_info_t);
88 o.back()->last_clean_begin = 1;
89 o.back()->last_clean_end = 2;
90 o.back()->up_from = 30;
91 o.back()->up_thru = 40;
92 o.back()->down_at = 5;
93 o.back()->lost_at = 6;
94}
95
96ostream& operator<<(ostream& out, const osd_info_t& info)
97{
98 out << "up_from " << info.up_from
99 << " up_thru " << info.up_thru
100 << " down_at " << info.down_at
101 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
102 if (info.lost_at)
103 out << " lost_at " << info.lost_at;
104 return out;
105}
106
107// ----------------------------------
108// osd_xinfo_t
109
110void osd_xinfo_t::dump(Formatter *f) const
111{
112 f->dump_stream("down_stamp") << down_stamp;
113 f->dump_float("laggy_probability", laggy_probability);
114 f->dump_int("laggy_interval", laggy_interval);
115 f->dump_int("features", features);
116 f->dump_unsigned("old_weight", old_weight);
117}
118
119void osd_xinfo_t::encode(bufferlist& bl) const
120{
121 ENCODE_START(3, 1, bl);
11fdf7f2 122 encode(down_stamp, bl);
7c673cae 123 __u32 lp = laggy_probability * 0xfffffffful;
11fdf7f2
TL
124 encode(lp, bl);
125 encode(laggy_interval, bl);
126 encode(features, bl);
127 encode(old_weight, bl);
7c673cae
FG
128 ENCODE_FINISH(bl);
129}
130
11fdf7f2 131void osd_xinfo_t::decode(bufferlist::const_iterator& bl)
7c673cae
FG
132{
133 DECODE_START(3, bl);
11fdf7f2 134 decode(down_stamp, bl);
7c673cae 135 __u32 lp;
11fdf7f2 136 decode(lp, bl);
7c673cae 137 laggy_probability = (float)lp / (float)0xffffffff;
11fdf7f2 138 decode(laggy_interval, bl);
7c673cae 139 if (struct_v >= 2)
11fdf7f2 140 decode(features, bl);
7c673cae
FG
141 else
142 features = 0;
143 if (struct_v >= 3)
11fdf7f2 144 decode(old_weight, bl);
7c673cae
FG
145 else
146 old_weight = 0;
147 DECODE_FINISH(bl);
148}
149
150void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
151{
152 o.push_back(new osd_xinfo_t);
153 o.push_back(new osd_xinfo_t);
154 o.back()->down_stamp = utime_t(2, 3);
155 o.back()->laggy_probability = .123;
156 o.back()->laggy_interval = 123456;
157 o.back()->old_weight = 0x7fff;
158}
159
160ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
161{
162 return out << "down_stamp " << xi.down_stamp
163 << " laggy_probability " << xi.laggy_probability
164 << " laggy_interval " << xi.laggy_interval
165 << " old_weight " << xi.old_weight;
166}
167
168// ----------------------------------
169// OSDMap::Incremental
170
171int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
172{
173 int n = 0;
174 for (auto &weight : new_weight) {
175 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
176 n++; // marked out
177 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
178 n--; // marked in
179 }
180 return n;
181}
182
183int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
184{
185 int n = 0;
186 for (auto &state : new_state) { //
187 if (state.second & CEPH_OSD_UP) {
188 if (previous->is_up(state.first))
189 n++; // marked down
190 else
191 n--; // marked up
192 }
193 }
194 return n;
195}
196
197int OSDMap::Incremental::identify_osd(uuid_d u) const
198{
199 for (auto &uuid : new_uuid)
200 if (uuid.second == u)
201 return uuid.first;
202 return -1;
203}
204
205int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
206 const OSDMap& osdmap)
207{
11fdf7f2 208 ceph_assert(epoch == osdmap.get_epoch() + 1);
7c673cae
FG
209
210 for (auto &new_pool : new_pools) {
211 if (!new_pool.second.tiers.empty()) {
212 pg_pool_t& base = new_pool.second;
213
11fdf7f2
TL
214 auto new_rem_it = new_removed_snaps.find(new_pool.first);
215
7c673cae
FG
216 for (const auto &tier_pool : base.tiers) {
217 const auto &r = new_pools.find(tier_pool);
218 pg_pool_t *tier = 0;
219 if (r == new_pools.end()) {
220 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
221 if (!orig) {
222 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
223 return -EIO;
224 }
225 tier = get_new_pool(tier_pool, orig);
226 } else {
227 tier = &r->second;
228 }
229 if (tier->tier_of != new_pool.first) {
230 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
231 return -EIO;
232 }
233
234 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
235 << tier_pool << dendl;
236 tier->snap_seq = base.snap_seq;
237 tier->snap_epoch = base.snap_epoch;
238 tier->snaps = base.snaps;
239 tier->removed_snaps = base.removed_snaps;
11fdf7f2
TL
240 tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
241 pg_pool_t::FLAG_POOL_SNAPS);
242
243 if (new_rem_it != new_removed_snaps.end()) {
244 new_removed_snaps[tier_pool] = new_rem_it->second;
245 }
7c673cae
FG
246 }
247 }
248 }
249 return 0;
250}
251
28e407b8
AA
252// ----------------------------------
253// OSDMap
7c673cae
FG
254
255bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
256{
257 if (id >= 0)
258 return is_down(id);
259
260 if (down_cache &&
261 down_cache->count(id)) {
262 return true;
263 }
264
265 list<int> children;
266 crush->get_children(id, &children);
267 for (const auto &child : children) {
268 if (!subtree_is_down(child, down_cache)) {
269 return false;
270 }
271 }
272 if (down_cache) {
273 down_cache->insert(id);
274 }
275 return true;
276}
277
278bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
279{
280 // use a stack-local down_cache if we didn't get one from the
281 // caller. then at least this particular call will avoid duplicated
282 // work.
283 set<int> local_down_cache;
284 if (!down_cache) {
285 down_cache = &local_down_cache;
286 }
287
288 int current = id;
289 while (true) {
290 int type;
291 if (current >= 0) {
292 type = 0;
293 } else {
294 type = crush->get_bucket_type(current);
295 }
11fdf7f2 296 ceph_assert(type >= 0);
7c673cae
FG
297
298 if (!subtree_is_down(current, down_cache)) {
299 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
300 return false;
301 }
302
303 // is this a big enough subtree to be marked as down?
304 if (type >= subtree_type) {
305 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
306 return true;
307 }
308
309 int r = crush->get_immediate_parent_id(current, &current);
310 if (r < 0) {
311 return false;
312 }
313 }
314}
315
224ce89b
WB
316bool OSDMap::subtree_type_is_down(
317 CephContext *cct,
318 int id,
319 int subtree_type,
320 set<int> *down_in_osds,
321 set<int> *up_in_osds,
322 set<int> *subtree_up,
323 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
324{
325 if (id >= 0) {
326 bool is_down_ret = is_down(id);
327 if (!is_out(id)) {
328 if (is_down_ret) {
329 down_in_osds->insert(id);
330 } else {
331 up_in_osds->insert(id);
332 }
333 }
334 return is_down_ret;
335 }
336
337 if (subtree_type_down &&
338 (*subtree_type_down)[subtree_type].count(id)) {
339 return true;
340 }
341
342 list<int> children;
343 crush->get_children(id, &children);
344 for (const auto &child : children) {
224ce89b
WB
345 if (!subtree_type_is_down(
346 cct, child, crush->get_bucket_type(child),
347 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
348 subtree_up->insert(id);
349 return false;
350 }
351 }
352 if (subtree_type_down) {
353 (*subtree_type_down)[subtree_type].insert(id);
354 }
355 return true;
356}
357
7c673cae
FG
358void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
359{
11fdf7f2 360 using ceph::encode;
7c673cae 361 __u16 v = 5;
11fdf7f2
TL
362 encode(v, bl);
363 encode(fsid, bl);
364 encode(epoch, bl);
365 encode(modified, bl);
7c673cae 366 int32_t new_t = new_pool_max;
11fdf7f2
TL
367 encode(new_t, bl);
368 encode(new_flags, bl);
369 encode(fullmap, bl);
370 encode(crush, bl);
7c673cae 371
11fdf7f2
TL
372 encode(new_max_osd, bl);
373 // for encode(new_pools, bl);
7c673cae 374 __u32 n = new_pools.size();
11fdf7f2 375 encode(n, bl);
7c673cae
FG
376 for (const auto &new_pool : new_pools) {
377 n = new_pool.first;
11fdf7f2
TL
378 encode(n, bl);
379 encode(new_pool.second, bl, 0);
7c673cae 380 }
11fdf7f2 381 // for encode(new_pool_names, bl);
7c673cae 382 n = new_pool_names.size();
11fdf7f2 383 encode(n, bl);
7c673cae
FG
384
385 for (const auto &new_pool_name : new_pool_names) {
386 n = new_pool_name.first;
11fdf7f2
TL
387 encode(n, bl);
388 encode(new_pool_name.second, bl);
7c673cae 389 }
11fdf7f2 390 // for encode(old_pools, bl);
7c673cae 391 n = old_pools.size();
11fdf7f2 392 encode(n, bl);
7c673cae
FG
393 for (auto &old_pool : old_pools) {
394 n = old_pool;
11fdf7f2 395 encode(n, bl);
7c673cae 396 }
11fdf7f2 397 encode(new_up_client, bl, 0);
31f18b77
FG
398 {
399 // legacy is map<int32_t,uint8_t>
400 uint32_t n = new_state.size();
11fdf7f2 401 encode(n, bl);
31f18b77 402 for (auto p : new_state) {
11fdf7f2
TL
403 encode(p.first, bl);
404 encode((uint8_t)p.second, bl);
31f18b77
FG
405 }
406 }
11fdf7f2
TL
407 encode(new_weight, bl);
408 // for encode(new_pg_temp, bl);
7c673cae 409 n = new_pg_temp.size();
11fdf7f2 410 encode(n, bl);
7c673cae
FG
411
412 for (const auto &pg_temp : new_pg_temp) {
413 old_pg_t opg = pg_temp.first.get_old_pg();
11fdf7f2
TL
414 encode(opg, bl);
415 encode(pg_temp.second, bl);
7c673cae
FG
416 }
417}
418
419void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
420{
11fdf7f2 421 using ceph::encode;
7c673cae
FG
422 if ((features & CEPH_FEATURE_PGID64) == 0) {
423 encode_client_old(bl);
424 return;
425 }
426
427 // base
428 __u16 v = 6;
11fdf7f2
TL
429 encode(v, bl);
430 encode(fsid, bl);
431 encode(epoch, bl);
432 encode(modified, bl);
433 encode(new_pool_max, bl);
434 encode(new_flags, bl);
435 encode(fullmap, bl);
436 encode(crush, bl);
437
438 encode(new_max_osd, bl);
439 encode(new_pools, bl, features);
440 encode(new_pool_names, bl);
441 encode(old_pools, bl);
442 encode(new_up_client, bl, features);
31f18b77
FG
443 {
444 uint32_t n = new_state.size();
11fdf7f2 445 encode(n, bl);
31f18b77 446 for (auto p : new_state) {
11fdf7f2
TL
447 encode(p.first, bl);
448 encode((uint8_t)p.second, bl);
31f18b77
FG
449 }
450 }
11fdf7f2
TL
451 encode(new_weight, bl);
452 encode(new_pg_temp, bl);
7c673cae
FG
453
454 // extended
455 __u16 ev = 10;
11fdf7f2
TL
456 encode(ev, bl);
457 encode(new_hb_back_up, bl, features);
458 encode(new_up_thru, bl);
459 encode(new_last_clean_interval, bl);
460 encode(new_lost, bl);
461 encode(new_blacklist, bl, features);
462 encode(old_blacklist, bl, features);
463 encode(new_up_cluster, bl, features);
464 encode(cluster_snapshot, bl);
465 encode(new_uuid, bl);
466 encode(new_xinfo, bl);
467 encode(new_hb_front_up, bl, features);
468}
469
470template<class T>
471static void encode_addrvec_map_as_addr(const T& m, bufferlist& bl, uint64_t f)
472{
473 uint32_t n = m.size();
474 encode(n, bl);
475 for (auto& i : m) {
476 encode(i.first, bl);
477 encode(i.second.legacy_addr(), bl, f);
478 }
479}
480
481template<class T>
482static void encode_addrvec_pvec_as_addr(const T& m, bufferlist& bl, uint64_t f)
483{
484 uint32_t n = m.size();
485 encode(n, bl);
486 for (auto& i : m) {
487 if (i) {
488 encode(i->legacy_addr(), bl, f);
489 } else {
490 encode(entity_addr_t(), bl, f);
491 }
492 }
7c673cae
FG
493}
494
11fdf7f2
TL
495/* for a description of osdmap incremental versions, and when they were
496 * introduced, please refer to
497 * doc/dev/osd_internals/osdmap_versions.txt
498 */
7c673cae
FG
499void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
500{
11fdf7f2 501 using ceph::encode;
7c673cae
FG
502 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
503 encode_classic(bl, features);
504 return;
505 }
506
507 // only a select set of callers should *ever* be encoding new
508 // OSDMaps. others should be passing around the canonical encoded
509 // buffers from on high. select out those callers by passing in an
510 // "impossible" feature bit.
11fdf7f2 511 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
512 features &= ~CEPH_FEATURE_RESERVED;
513
514 size_t start_offset = bl.length();
515 size_t tail_offset;
11fdf7f2
TL
516 size_t crc_offset;
517 std::optional<buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
518
519 // meta-encoding: how we include client-used and osd-specific data
520 ENCODE_START(8, 7, bl);
521
522 {
11fdf7f2 523 uint8_t v = 8;
7c673cae
FG
524 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
525 v = 3;
11fdf7f2
TL
526 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
527 v = 5;
528 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
529 v = 6;
7c673cae
FG
530 }
531 ENCODE_START(v, 1, bl); // client-usable data
11fdf7f2
TL
532 encode(fsid, bl);
533 encode(epoch, bl);
534 encode(modified, bl);
535 encode(new_pool_max, bl);
536 encode(new_flags, bl);
537 encode(fullmap, bl);
538 encode(crush, bl);
539
540 encode(new_max_osd, bl);
541 encode(new_pools, bl, features);
542 encode(new_pool_names, bl);
543 encode(old_pools, bl);
544 if (v >= 7) {
545 encode(new_up_client, bl, features);
546 } else {
547 encode_addrvec_map_as_addr(new_up_client, bl, features);
548 }
31f18b77 549 if (v >= 5) {
11fdf7f2 550 encode(new_state, bl);
31f18b77
FG
551 } else {
552 uint32_t n = new_state.size();
11fdf7f2 553 encode(n, bl);
31f18b77 554 for (auto p : new_state) {
11fdf7f2
TL
555 encode(p.first, bl);
556 encode((uint8_t)p.second, bl);
31f18b77
FG
557 }
558 }
11fdf7f2
TL
559 encode(new_weight, bl);
560 encode(new_pg_temp, bl);
561 encode(new_primary_temp, bl);
562 encode(new_primary_affinity, bl);
563 encode(new_erasure_code_profiles, bl);
564 encode(old_erasure_code_profiles, bl);
7c673cae 565 if (v >= 4) {
11fdf7f2
TL
566 encode(new_pg_upmap, bl);
567 encode(old_pg_upmap, bl);
568 encode(new_pg_upmap_items, bl);
569 encode(old_pg_upmap_items, bl);
570 }
571 if (v >= 6) {
572 encode(new_removed_snaps, bl);
573 encode(new_purged_snaps, bl);
574 }
575 if (v >= 8) {
576 encode(new_last_up_change, bl);
577 encode(new_last_in_change, bl);
7c673cae
FG
578 }
579 ENCODE_FINISH(bl); // client-usable data
580 }
581
582 {
81eedcae 583 uint8_t target_v = 9;
7c673cae
FG
584 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
585 target_v = 2;
11fdf7f2
TL
586 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
587 target_v = 6;
7c673cae
FG
588 }
589 ENCODE_START(target_v, 1, bl); // extended, osd-only data
11fdf7f2
TL
590 if (target_v < 7) {
591 encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
592 } else {
593 encode(new_hb_back_up, bl, features);
594 }
595 encode(new_up_thru, bl);
596 encode(new_last_clean_interval, bl);
597 encode(new_lost, bl);
598 encode(new_blacklist, bl, features);
599 encode(old_blacklist, bl, features);
600 if (target_v < 7) {
601 encode_addrvec_map_as_addr(new_up_cluster, bl, features);
602 } else {
603 encode(new_up_cluster, bl, features);
604 }
605 encode(cluster_snapshot, bl);
606 encode(new_uuid, bl);
607 encode(new_xinfo, bl);
608 if (target_v < 7) {
609 encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
610 } else {
611 encode(new_hb_front_up, bl, features);
612 }
613 encode(features, bl); // NOTE: features arg, not the member
7c673cae 614 if (target_v >= 3) {
11fdf7f2
TL
615 encode(new_nearfull_ratio, bl);
616 encode(new_full_ratio, bl);
617 encode(new_backfillfull_ratio, bl);
31f18b77
FG
618 }
619 // 5 was string-based new_require_min_compat_client
620 if (target_v >= 6) {
11fdf7f2
TL
621 encode(new_require_min_compat_client, bl);
622 encode(new_require_osd_release, bl);
7c673cae 623 }
81eedcae
TL
624 if (target_v >= 8) {
625 encode(new_crush_node_flags, bl);
626 }
627 if (target_v >= 9) {
628 encode(new_device_class_flags, bl);
629 }
7c673cae
FG
630 ENCODE_FINISH(bl); // osd-only data
631 }
632
11fdf7f2
TL
633 crc_offset = bl.length();
634 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
635 tail_offset = bl.length();
636
11fdf7f2 637 encode(full_crc, bl);
7c673cae
FG
638
639 ENCODE_FINISH(bl); // meta-encoding wrapper
640
641 // fill in crc
642 bufferlist front;
11fdf7f2 643 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
644 inc_crc = front.crc32c(-1);
645 bufferlist tail;
646 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
647 inc_crc = tail.crc32c(inc_crc);
648 ceph_le32 crc_le;
649 crc_le = inc_crc;
11fdf7f2 650 crc_filler->copy_in(4u, (char*)&crc_le);
7c673cae
FG
651 have_crc = true;
652}
653
11fdf7f2 654void OSDMap::Incremental::decode_classic(bufferlist::const_iterator &p)
7c673cae 655{
11fdf7f2 656 using ceph::decode;
7c673cae
FG
657 __u32 n, t;
658 // base
659 __u16 v;
11fdf7f2
TL
660 decode(v, p);
661 decode(fsid, p);
662 decode(epoch, p);
663 decode(modified, p);
7c673cae 664 if (v == 4 || v == 5) {
11fdf7f2 665 decode(n, p);
7c673cae
FG
666 new_pool_max = n;
667 } else if (v >= 6)
11fdf7f2
TL
668 decode(new_pool_max, p);
669 decode(new_flags, p);
670 decode(fullmap, p);
671 decode(crush, p);
7c673cae 672
11fdf7f2 673 decode(new_max_osd, p);
7c673cae
FG
674 if (v < 6) {
675 new_pools.clear();
11fdf7f2 676 decode(n, p);
7c673cae 677 while (n--) {
11fdf7f2
TL
678 decode(t, p);
679 decode(new_pools[t], p);
7c673cae
FG
680 }
681 } else {
11fdf7f2 682 decode(new_pools, p);
7c673cae
FG
683 }
684 if (v == 5) {
685 new_pool_names.clear();
11fdf7f2 686 decode(n, p);
7c673cae 687 while (n--) {
11fdf7f2
TL
688 decode(t, p);
689 decode(new_pool_names[t], p);
7c673cae
FG
690 }
691 } else if (v >= 6) {
11fdf7f2 692 decode(new_pool_names, p);
7c673cae
FG
693 }
694 if (v < 6) {
695 old_pools.clear();
11fdf7f2 696 decode(n, p);
7c673cae 697 while (n--) {
11fdf7f2 698 decode(t, p);
7c673cae
FG
699 old_pools.insert(t);
700 }
701 } else {
11fdf7f2 702 decode(old_pools, p);
7c673cae 703 }
11fdf7f2 704 decode(new_up_client, p);
31f18b77
FG
705 {
706 map<int32_t,uint8_t> ns;
11fdf7f2 707 decode(ns, p);
31f18b77
FG
708 for (auto q : ns) {
709 new_state[q.first] = q.second;
710 }
711 }
11fdf7f2 712 decode(new_weight, p);
7c673cae
FG
713
714 if (v < 6) {
715 new_pg_temp.clear();
11fdf7f2 716 decode(n, p);
7c673cae
FG
717 while (n--) {
718 old_pg_t opg;
719 ::decode_raw(opg, p);
11fdf7f2 720 decode(new_pg_temp[pg_t(opg)], p);
7c673cae
FG
721 }
722 } else {
11fdf7f2 723 decode(new_pg_temp, p);
7c673cae
FG
724 }
725
726 // decode short map, too.
727 if (v == 5 && p.end())
728 return;
729
730 // extended
731 __u16 ev = 0;
732 if (v >= 5)
11fdf7f2
TL
733 decode(ev, p);
734 decode(new_hb_back_up, p);
7c673cae 735 if (v < 5)
11fdf7f2
TL
736 decode(new_pool_names, p);
737 decode(new_up_thru, p);
738 decode(new_last_clean_interval, p);
739 decode(new_lost, p);
740 decode(new_blacklist, p);
741 decode(old_blacklist, p);
7c673cae 742 if (ev >= 6)
11fdf7f2 743 decode(new_up_cluster, p);
7c673cae 744 if (ev >= 7)
11fdf7f2 745 decode(cluster_snapshot, p);
7c673cae 746 if (ev >= 8)
11fdf7f2 747 decode(new_uuid, p);
7c673cae 748 if (ev >= 9)
11fdf7f2 749 decode(new_xinfo, p);
7c673cae 750 if (ev >= 10)
11fdf7f2 751 decode(new_hb_front_up, p);
7c673cae
FG
752}
753
11fdf7f2
TL
754/* for a description of osdmap incremental versions, and when they were
755 * introduced, please refer to
756 * doc/dev/osd_internals/osdmap_versions.txt
757 */
758void OSDMap::Incremental::decode(bufferlist::const_iterator& bl)
7c673cae 759{
11fdf7f2 760 using ceph::decode;
7c673cae
FG
761 /**
762 * Older encodings of the Incremental had a single struct_v which
763 * covered the whole encoding, and was prior to our modern
764 * stuff which includes a compatv and a size. So if we see
765 * a struct_v < 7, we must rewind to the beginning and use our
766 * classic decoder.
767 */
768 size_t start_offset = bl.get_off();
769 size_t tail_offset = 0;
770 bufferlist crc_front, crc_tail;
771
772 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
773 if (struct_v < 7) {
11fdf7f2 774 bl.seek(start_offset);
7c673cae
FG
775 decode_classic(bl);
776 encode_features = 0;
777 if (struct_v >= 6)
778 encode_features = CEPH_FEATURE_PGID64;
779 else
780 encode_features = 0;
781 return;
782 }
783 {
11fdf7f2
TL
784 DECODE_START(8, bl); // client-usable data
785 decode(fsid, bl);
786 decode(epoch, bl);
787 decode(modified, bl);
788 decode(new_pool_max, bl);
789 decode(new_flags, bl);
790 decode(fullmap, bl);
791 decode(crush, bl);
792
793 decode(new_max_osd, bl);
794 decode(new_pools, bl);
795 decode(new_pool_names, bl);
796 decode(old_pools, bl);
797 decode(new_up_client, bl);
31f18b77 798 if (struct_v >= 5) {
11fdf7f2 799 decode(new_state, bl);
31f18b77
FG
800 } else {
801 map<int32_t,uint8_t> ns;
11fdf7f2 802 decode(ns, bl);
31f18b77
FG
803 for (auto q : ns) {
804 new_state[q.first] = q.second;
805 }
806 }
11fdf7f2
TL
807 decode(new_weight, bl);
808 decode(new_pg_temp, bl);
809 decode(new_primary_temp, bl);
7c673cae 810 if (struct_v >= 2)
11fdf7f2 811 decode(new_primary_affinity, bl);
7c673cae
FG
812 else
813 new_primary_affinity.clear();
814 if (struct_v >= 3) {
11fdf7f2
TL
815 decode(new_erasure_code_profiles, bl);
816 decode(old_erasure_code_profiles, bl);
7c673cae
FG
817 } else {
818 new_erasure_code_profiles.clear();
819 old_erasure_code_profiles.clear();
820 }
821 if (struct_v >= 4) {
11fdf7f2
TL
822 decode(new_pg_upmap, bl);
823 decode(old_pg_upmap, bl);
824 decode(new_pg_upmap_items, bl);
825 decode(old_pg_upmap_items, bl);
826 }
827 if (struct_v >= 6) {
828 decode(new_removed_snaps, bl);
829 decode(new_purged_snaps, bl);
830 }
831 if (struct_v >= 8) {
832 decode(new_last_up_change, bl);
833 decode(new_last_in_change, bl);
7c673cae
FG
834 }
835 DECODE_FINISH(bl); // client-usable data
836 }
837
838 {
81eedcae 839 DECODE_START(9, bl); // extended, osd-only data
11fdf7f2
TL
840 decode(new_hb_back_up, bl);
841 decode(new_up_thru, bl);
842 decode(new_last_clean_interval, bl);
843 decode(new_lost, bl);
844 decode(new_blacklist, bl);
845 decode(old_blacklist, bl);
846 decode(new_up_cluster, bl);
847 decode(cluster_snapshot, bl);
848 decode(new_uuid, bl);
849 decode(new_xinfo, bl);
850 decode(new_hb_front_up, bl);
7c673cae 851 if (struct_v >= 2)
11fdf7f2 852 decode(encode_features, bl);
7c673cae
FG
853 else
854 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
855 if (struct_v >= 3) {
11fdf7f2
TL
856 decode(new_nearfull_ratio, bl);
857 decode(new_full_ratio, bl);
7c673cae
FG
858 } else {
859 new_nearfull_ratio = -1;
860 new_full_ratio = -1;
861 }
862 if (struct_v >= 4) {
11fdf7f2 863 decode(new_backfillfull_ratio, bl);
7c673cae
FG
864 } else {
865 new_backfillfull_ratio = -1;
866 }
31f18b77
FG
867 if (struct_v == 5) {
868 string r;
11fdf7f2 869 decode(r, bl);
31f18b77
FG
870 if (r.length()) {
871 new_require_min_compat_client = ceph_release_from_name(r.c_str());
872 }
873 }
874 if (struct_v >= 6) {
11fdf7f2
TL
875 decode(new_require_min_compat_client, bl);
876 decode(new_require_osd_release, bl);
31f18b77
FG
877 } else {
878 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
879 // only for compat with post-kraken pre-luminous test clusters
880 new_require_osd_release = CEPH_RELEASE_LUMINOUS;
881 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
882 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
883 new_require_osd_release = CEPH_RELEASE_KRAKEN;
884 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
885 new_require_osd_release = CEPH_RELEASE_JEWEL;
886 } else {
887 new_require_osd_release = -1;
888 }
889 }
81eedcae
TL
890 if (struct_v >= 8) {
891 decode(new_crush_node_flags, bl);
892 }
893 if (struct_v >= 9) {
894 decode(new_device_class_flags, bl);
895 }
7c673cae
FG
896 DECODE_FINISH(bl); // osd-only data
897 }
898
899 if (struct_v >= 8) {
900 have_crc = true;
901 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
11fdf7f2 902 decode(inc_crc, bl);
7c673cae 903 tail_offset = bl.get_off();
11fdf7f2 904 decode(full_crc, bl);
7c673cae
FG
905 } else {
906 have_crc = false;
907 full_crc = 0;
908 inc_crc = 0;
909 }
910
911 DECODE_FINISH(bl); // wrapper
912
913 if (have_crc) {
914 // verify crc
915 uint32_t actual = crc_front.crc32c(-1);
916 if (tail_offset < bl.get_off()) {
917 bufferlist tail;
918 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
919 actual = tail.crc32c(actual);
920 }
921 if (inc_crc != actual) {
922 ostringstream ss;
923 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
924 string s = ss.str();
925 throw buffer::malformed_input(s.c_str());
926 }
927 }
928}
929
930void OSDMap::Incremental::dump(Formatter *f) const
931{
932 f->dump_int("epoch", epoch);
933 f->dump_stream("fsid") << fsid;
934 f->dump_stream("modified") << modified;
11fdf7f2
TL
935 f->dump_stream("new_last_up_change") << new_last_up_change;
936 f->dump_stream("new_last_in_change") << new_last_in_change;
7c673cae
FG
937 f->dump_int("new_pool_max", new_pool_max);
938 f->dump_int("new_flags", new_flags);
939 f->dump_float("new_full_ratio", new_full_ratio);
940 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
941 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
31f18b77
FG
942 f->dump_int("new_require_min_compat_client", new_require_min_compat_client);
943 f->dump_int("new_require_osd_release", new_require_osd_release);
7c673cae
FG
944
945 if (fullmap.length()) {
946 f->open_object_section("full_map");
947 OSDMap full;
948 bufferlist fbl = fullmap; // kludge around constness.
11fdf7f2 949 auto p = fbl.cbegin();
7c673cae
FG
950 full.decode(p);
951 full.dump(f);
952 f->close_section();
953 }
954 if (crush.length()) {
955 f->open_object_section("crush");
956 CrushWrapper c;
957 bufferlist tbl = crush; // kludge around constness.
11fdf7f2 958 auto p = tbl.cbegin();
7c673cae
FG
959 c.decode(p);
960 c.dump(f);
961 f->close_section();
962 }
963
964 f->dump_int("new_max_osd", new_max_osd);
965
966 f->open_array_section("new_pools");
967
968 for (const auto &new_pool : new_pools) {
969 f->open_object_section("pool");
970 f->dump_int("pool", new_pool.first);
971 new_pool.second.dump(f);
972 f->close_section();
973 }
974 f->close_section();
975 f->open_array_section("new_pool_names");
976
977 for (const auto &new_pool_name : new_pool_names) {
978 f->open_object_section("pool_name");
979 f->dump_int("pool", new_pool_name.first);
980 f->dump_string("name", new_pool_name.second);
981 f->close_section();
982 }
983 f->close_section();
984 f->open_array_section("old_pools");
985
986 for (const auto &old_pool : old_pools)
987 f->dump_int("pool", old_pool);
988 f->close_section();
989
990 f->open_array_section("new_up_osds");
991
992 for (const auto &upclient : new_up_client) {
993 f->open_object_section("osd");
994 f->dump_int("osd", upclient.first);
11fdf7f2
TL
995 f->dump_stream("public_addr") << upclient.second.legacy_addr();
996 f->dump_object("public_addrs", upclient.second);
997 if (auto p = new_up_cluster.find(upclient.first);
998 p != new_up_cluster.end()) {
999 f->dump_stream("cluster_addr") << p->second.legacy_addr();
1000 f->dump_object("cluster_addrs", p->second);
1001 }
1002 if (auto p = new_hb_back_up.find(upclient.first);
1003 p != new_hb_back_up.end()) {
1004 f->dump_object("heartbeat_back_addrs", p->second);
1005 }
1006 if (auto p = new_hb_front_up.find(upclient.first);
1007 p != new_hb_front_up.end()) {
1008 f->dump_object("heartbeat_front_addrs", p->second);
1009 }
7c673cae
FG
1010 f->close_section();
1011 }
1012 f->close_section();
1013
1014 f->open_array_section("new_weight");
1015
1016 for (const auto &weight : new_weight) {
1017 f->open_object_section("osd");
1018 f->dump_int("osd", weight.first);
1019 f->dump_int("weight", weight.second);
1020 f->close_section();
1021 }
1022 f->close_section();
1023
1024 f->open_array_section("osd_state_xor");
1025 for (const auto &ns : new_state) {
1026 f->open_object_section("osd");
1027 f->dump_int("osd", ns.first);
1028 set<string> st;
1029 calc_state_set(new_state.find(ns.first)->second, st);
1030 f->open_array_section("state_xor");
1031 for (auto &state : st)
1032 f->dump_string("state", state);
1033 f->close_section();
c07f9fc5 1034 f->close_section();
7c673cae
FG
1035 }
1036 f->close_section();
1037
1038 f->open_array_section("new_pg_temp");
1039
1040 for (const auto &pg_temp : new_pg_temp) {
1041 f->open_object_section("pg");
1042 f->dump_stream("pgid") << pg_temp.first;
1043 f->open_array_section("osds");
1044
1045 for (const auto &osd : pg_temp.second)
1046 f->dump_int("osd", osd);
1047 f->close_section();
1048 f->close_section();
1049 }
1050 f->close_section();
1051
1052 f->open_array_section("primary_temp");
1053
1054 for (const auto &primary_temp : new_primary_temp) {
1055 f->dump_stream("pgid") << primary_temp.first;
1056 f->dump_int("osd", primary_temp.second);
1057 }
1058 f->close_section(); // primary_temp
1059
1060 f->open_array_section("new_pg_upmap");
1061 for (auto& i : new_pg_upmap) {
1062 f->open_object_section("mapping");
1063 f->dump_stream("pgid") << i.first;
1064 f->open_array_section("osds");
1065 for (auto osd : i.second) {
1066 f->dump_int("osd", osd);
1067 }
1068 f->close_section();
1069 f->close_section();
1070 }
1071 f->close_section();
1072 f->open_array_section("old_pg_upmap");
1073 for (auto& i : old_pg_upmap) {
1074 f->dump_stream("pgid") << i;
1075 }
1076 f->close_section();
1077
1078 f->open_array_section("new_pg_upmap_items");
1079 for (auto& i : new_pg_upmap_items) {
1080 f->open_object_section("mapping");
1081 f->dump_stream("pgid") << i.first;
1082 f->open_array_section("mappings");
1083 for (auto& p : i.second) {
1084 f->open_object_section("mapping");
1085 f->dump_int("from", p.first);
1086 f->dump_int("to", p.second);
1087 f->close_section();
1088 }
1089 f->close_section();
1090 f->close_section();
1091 }
1092 f->close_section();
1093 f->open_array_section("old_pg_upmap_items");
1094 for (auto& i : old_pg_upmap_items) {
1095 f->dump_stream("pgid") << i;
1096 }
1097 f->close_section();
1098
1099 f->open_array_section("new_up_thru");
1100
1101 for (const auto &up_thru : new_up_thru) {
1102 f->open_object_section("osd");
1103 f->dump_int("osd", up_thru.first);
1104 f->dump_int("up_thru", up_thru.second);
1105 f->close_section();
1106 }
1107 f->close_section();
1108
1109 f->open_array_section("new_lost");
1110
1111 for (const auto &lost : new_lost) {
1112 f->open_object_section("osd");
1113 f->dump_int("osd", lost.first);
1114 f->dump_int("epoch_lost", lost.second);
1115 f->close_section();
1116 }
1117 f->close_section();
1118
1119 f->open_array_section("new_last_clean_interval");
1120
1121 for (const auto &last_clean_interval : new_last_clean_interval) {
1122 f->open_object_section("osd");
1123 f->dump_int("osd", last_clean_interval.first);
1124 f->dump_int("first", last_clean_interval.second.first);
1125 f->dump_int("last", last_clean_interval.second.second);
1126 f->close_section();
1127 }
1128 f->close_section();
1129
1130 f->open_array_section("new_blacklist");
1131 for (const auto &blist : new_blacklist) {
1132 stringstream ss;
1133 ss << blist.first;
1134 f->dump_stream(ss.str().c_str()) << blist.second;
1135 }
1136 f->close_section();
1137 f->open_array_section("old_blacklist");
1138 for (const auto &blist : old_blacklist)
1139 f->dump_stream("addr") << blist;
1140 f->close_section();
1141
1142 f->open_array_section("new_xinfo");
1143 for (const auto &xinfo : new_xinfo) {
1144 f->open_object_section("xinfo");
1145 f->dump_int("osd", xinfo.first);
1146 xinfo.second.dump(f);
1147 f->close_section();
1148 }
1149 f->close_section();
1150
1151 if (cluster_snapshot.size())
1152 f->dump_string("cluster_snapshot", cluster_snapshot);
1153
1154 f->open_array_section("new_uuid");
1155 for (const auto &uuid : new_uuid) {
1156 f->open_object_section("osd");
1157 f->dump_int("osd", uuid.first);
1158 f->dump_stream("uuid") << uuid.second;
1159 f->close_section();
1160 }
1161 f->close_section();
1162
1163 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1164 f->open_array_section("old_erasure_code_profiles");
1165 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1166 f->dump_string("old", erasure_code_profile.c_str());
1167 }
1168 f->close_section();
11fdf7f2
TL
1169
1170 f->open_array_section("new_removed_snaps");
1171 for (auto& p : new_removed_snaps) {
1172 f->open_object_section("pool");
1173 f->dump_int("pool", p.first);
1174 f->open_array_section("snaps");
1175 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1176 f->open_object_section("interval");
1177 f->dump_unsigned("begin", q.get_start());
1178 f->dump_unsigned("length", q.get_len());
1179 f->close_section();
1180 }
1181 f->close_section();
1182 f->close_section();
1183 }
1184 f->close_section();
1185 f->open_array_section("new_purged_snaps");
1186 for (auto& p : new_purged_snaps) {
1187 f->open_object_section("pool");
1188 f->dump_int("pool", p.first);
1189 f->open_array_section("snaps");
1190 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
1191 f->open_object_section("interval");
1192 f->dump_unsigned("begin", q.get_start());
1193 f->dump_unsigned("length", q.get_len());
1194 f->close_section();
1195 }
1196 f->close_section();
1197 f->close_section();
1198 }
81eedcae
TL
1199 f->open_array_section("new_crush_node_flags");
1200 for (auto& i : new_crush_node_flags) {
1201 f->open_object_section("node");
1202 f->dump_int("id", i.first);
1203 set<string> st;
1204 calc_state_set(i.second, st);
1205 for (auto& j : st) {
1206 f->dump_string("flag", j);
1207 }
1208 f->close_section();
1209 }
1210 f->close_section();
1211 f->open_array_section("new_device_class_flags");
1212 for (auto& i : new_device_class_flags) {
1213 f->open_object_section("device_class");
1214 f->dump_int("id", i.first);
1215 set<string> st;
1216 calc_state_set(i.second, st);
1217 for (auto& j : st) {
1218 f->dump_string("flag", j);
1219 }
1220 f->close_section();
1221 }
1222 f->close_section();
11fdf7f2 1223 f->close_section();
7c673cae
FG
1224}
1225
1226void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1227{
1228 o.push_back(new Incremental);
1229}
1230
1231// ----------------------------------
1232// OSDMap
1233
1234void OSDMap::set_epoch(epoch_t e)
1235{
1236 epoch = e;
1237 for (auto &pool : pools)
1238 pool.second.last_change = e;
1239}
1240
11fdf7f2 1241bool OSDMap::is_blacklisted(const entity_addr_t& orig) const
7c673cae 1242{
11fdf7f2 1243 if (blacklist.empty()) {
7c673cae 1244 return false;
11fdf7f2
TL
1245 }
1246
1247 // all blacklist entries are type ANY for nautilus+
1248 // FIXME: avoid this copy!
1249 entity_addr_t a = orig;
1250 if (require_osd_release < CEPH_RELEASE_NAUTILUS) {
1251 a.set_type(entity_addr_t::TYPE_LEGACY);
1252 } else {
1253 a.set_type(entity_addr_t::TYPE_ANY);
1254 }
7c673cae
FG
1255
1256 // this specific instance?
11fdf7f2 1257 if (blacklist.count(a)) {
7c673cae 1258 return true;
11fdf7f2 1259 }
7c673cae
FG
1260
1261 // is entire ip blacklisted?
1262 if (a.is_ip()) {
11fdf7f2
TL
1263 a.set_port(0);
1264 a.set_nonce(0);
1265 if (blacklist.count(a)) {
1266 return true;
1267 }
1268 }
1269
1270 return false;
1271}
1272
1273bool OSDMap::is_blacklisted(const entity_addrvec_t& av) const
1274{
1275 if (blacklist.empty())
1276 return false;
1277
1278 for (auto& a : av.v) {
1279 if (is_blacklisted(a)) {
7c673cae
FG
1280 return true;
1281 }
1282 }
1283
1284 return false;
1285}
1286
1287void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1288{
1289 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1290}
1291
31f18b77
FG
1292void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1293{
1294 for (const auto &i : blacklist) {
1295 bl->insert(i.first);
1296 }
1297}
1298
7c673cae
FG
1299void OSDMap::set_max_osd(int m)
1300{
1301 int o = max_osd;
1302 max_osd = m;
1303 osd_state.resize(m);
1304 osd_weight.resize(m);
1305 for (; o<max_osd; o++) {
1306 osd_state[o] = 0;
1307 osd_weight[o] = CEPH_OSD_OUT;
1308 }
1309 osd_info.resize(m);
1310 osd_xinfo.resize(m);
11fdf7f2
TL
1311 osd_addrs->client_addrs.resize(m);
1312 osd_addrs->cluster_addrs.resize(m);
1313 osd_addrs->hb_back_addrs.resize(m);
1314 osd_addrs->hb_front_addrs.resize(m);
7c673cae
FG
1315 osd_uuid->resize(m);
1316 if (osd_primary_affinity)
1317 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1318
1319 calc_num_osds();
1320}
1321
1322int OSDMap::calc_num_osds()
1323{
1324 num_osd = 0;
1325 num_up_osd = 0;
1326 num_in_osd = 0;
1327 for (int i=0; i<max_osd; i++) {
1328 if (osd_state[i] & CEPH_OSD_EXISTS) {
1329 ++num_osd;
1330 if (osd_state[i] & CEPH_OSD_UP) {
1331 ++num_up_osd;
1332 }
1333 if (get_weight(i) != CEPH_OSD_OUT) {
1334 ++num_in_osd;
1335 }
1336 }
1337 }
1338 return num_osd;
1339}
1340
3efd9988
FG
1341void OSDMap::get_full_pools(CephContext *cct,
1342 set<int64_t> *full,
1343 set<int64_t> *backfillfull,
1344 set<int64_t> *nearfull) const
7c673cae 1345{
11fdf7f2
TL
1346 ceph_assert(full);
1347 ceph_assert(backfillfull);
1348 ceph_assert(nearfull);
3efd9988
FG
1349 full->clear();
1350 backfillfull->clear();
1351 nearfull->clear();
1352
1353 vector<int> full_osds;
1354 vector<int> backfillfull_osds;
1355 vector<int> nearfull_osds;
7c673cae
FG
1356 for (int i = 0; i < max_osd; ++i) {
1357 if (exists(i) && is_up(i) && is_in(i)) {
1358 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1359 full_osds.push_back(i);
7c673cae 1360 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1361 backfillfull_osds.push_back(i);
7c673cae 1362 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1363 nearfull_osds.push_back(i);
7c673cae
FG
1364 }
1365 }
3efd9988
FG
1366
1367 for (auto i: full_osds) {
1368 get_pool_ids_by_osd(cct, i, full);
1369 }
1370 for (auto i: backfillfull_osds) {
1371 get_pool_ids_by_osd(cct, i, backfillfull);
1372 }
1373 for (auto i: nearfull_osds) {
1374 get_pool_ids_by_osd(cct, i, nearfull);
1375 }
7c673cae
FG
1376}
1377
31f18b77
FG
1378void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1379 set<int> *nearfull) const
1380{
1381 full->clear();
1382 backfill->clear();
1383 nearfull->clear();
1384 for (int i = 0; i < max_osd; ++i) {
1385 if (exists(i) && is_up(i) && is_in(i)) {
1386 if (osd_state[i] & CEPH_OSD_FULL)
1387 full->emplace(i);
1388 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1389 backfill->emplace(i);
1390 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1391 nearfull->emplace(i);
1392 }
1393 }
1394}
1395
7c673cae
FG
1396void OSDMap::get_all_osds(set<int32_t>& ls) const
1397{
1398 for (int i=0; i<max_osd; i++)
1399 if (exists(i))
1400 ls.insert(i);
1401}
1402
1403void OSDMap::get_up_osds(set<int32_t>& ls) const
1404{
1405 for (int i = 0; i < max_osd; i++) {
1406 if (is_up(i))
1407 ls.insert(i);
1408 }
1409}
1410
81eedcae 1411void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
31f18b77
FG
1412{
1413 for (int i = 0; i < max_osd; i++) {
81eedcae 1414 if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
31f18b77
FG
1415 ls.insert(i);
1416 }
1417}
1418
11fdf7f2
TL
1419void OSDMap::get_flag_set(set<string> *flagset) const
1420{
1421 for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
1422 if (flags & (1<<i)) {
1423 flagset->insert(get_flag_string(flags & (1<<i)));
1424 }
1425 }
1426}
1427
7c673cae
FG
1428void OSDMap::calc_state_set(int state, set<string>& st)
1429{
1430 unsigned t = state;
1431 for (unsigned s = 1; t; s <<= 1) {
1432 if (t & s) {
1433 t &= ~s;
1434 st.insert(ceph_osd_state_name(s));
1435 }
1436 }
1437}
1438
1439void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1440{
1441 float max = 0;
1442 for (const auto &weight : weights) {
1443 if (weight.second > max)
1444 max = weight.second;
1445 }
1446
1447 for (const auto &weight : weights) {
1448 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1449 }
1450}
1451
1452int OSDMap::identify_osd(const entity_addr_t& addr) const
1453{
1454 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1455 if (exists(i) && (get_addrs(i).contains(addr) ||
1456 get_cluster_addrs(i).contains(addr)))
7c673cae
FG
1457 return i;
1458 return -1;
1459}
1460
1461int OSDMap::identify_osd(const uuid_d& u) const
1462{
1463 for (int i=0; i<max_osd; i++)
1464 if (exists(i) && get_uuid(i) == u)
1465 return i;
1466 return -1;
1467}
1468
1469int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1470{
1471 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1472 if (exists(i) && (get_addrs(i).contains(addr) ||
1473 get_cluster_addrs(i).contains(addr) ||
1474 get_hb_back_addrs(i).contains(addr) ||
1475 get_hb_front_addrs(i).contains(addr)))
7c673cae
FG
1476 return i;
1477 return -1;
1478}
1479
1480int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1481{
1482 for (int i=0; i<max_osd; i++)
11fdf7f2
TL
1483 if (exists(i) && (get_addrs(i).is_same_host(ip) ||
1484 get_cluster_addrs(i).is_same_host(ip)))
7c673cae
FG
1485 return i;
1486 return -1;
1487}
1488
1489
1490uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1491{
1492 uint64_t features = 0; // things we actually have
1493 uint64_t mask = 0; // things we could have
1494
1495 if (crush->has_nondefault_tunables())
1496 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1497 if (crush->has_nondefault_tunables2())
1498 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1499 if (crush->has_nondefault_tunables3())
1500 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1501 if (crush->has_v4_buckets())
1502 features |= CEPH_FEATURE_CRUSH_V4;
1503 if (crush->has_nondefault_tunables5())
1504 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1505 if (crush->has_incompat_choose_args()) {
1506 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1507 }
7c673cae
FG
1508 mask |= CEPH_FEATURES_CRUSH;
1509
1510 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1511 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1512 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1513
1514 for (auto &pool: pools) {
1515 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1516 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1517 }
7c673cae
FG
1518 if (!pool.second.tiers.empty() ||
1519 pool.second.is_tier()) {
1520 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1521 }
31f18b77 1522 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
7c673cae
FG
1523 pool.second.get_type(),
1524 pool.second.get_size());
1525 if (ruleid >= 0) {
1526 if (crush->is_v2_rule(ruleid))
1527 features |= CEPH_FEATURE_CRUSH_V2;
1528 if (crush->is_v3_rule(ruleid))
1529 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1530 if (crush->is_v5_rule(ruleid))
1531 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1532 }
1533 }
7c673cae 1534 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
7c673cae
FG
1535
1536 if (osd_primary_affinity) {
1537 for (int i = 0; i < max_osd; ++i) {
1538 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1539 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1540 break;
1541 }
1542 }
1543 }
1544 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1545
1546 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1547 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
31f18b77 1548 if (require_osd_release >= CEPH_RELEASE_JEWEL) {
7c673cae
FG
1549 features |= jewel_features;
1550 }
1551 mask |= jewel_features;
1552
1553 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1554 | CEPH_FEATURE_MSG_ADDR2;
31f18b77 1555 if (require_osd_release >= CEPH_RELEASE_KRAKEN) {
7c673cae
FG
1556 features |= kraken_features;
1557 }
1558 mask |= kraken_features;
1559 }
1560
11fdf7f2
TL
1561 if (require_min_compat_client >= CEPH_RELEASE_NAUTILUS) {
1562 // if min_compat_client is >= nautilus, require v2 cephx signatures
1563 // from everyone
1564 features |= CEPH_FEATUREMASK_CEPHX_V2;
1565 } else if (require_osd_release >= CEPH_RELEASE_NAUTILUS &&
1566 entity_type == CEPH_ENTITY_TYPE_OSD) {
1567 // if osds are >= nautilus, at least require the signatures from them
1568 features |= CEPH_FEATUREMASK_CEPHX_V2;
1569 }
1570 mask |= CEPH_FEATUREMASK_CEPHX_V2;
1571
7c673cae
FG
1572 if (pmask)
1573 *pmask = mask;
1574 return features;
1575}
1576
31f18b77 1577uint8_t OSDMap::get_min_compat_client() const
7c673cae
FG
1578{
1579 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1580
1581 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77
FG
1582 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1583 return CEPH_RELEASE_LUMINOUS; // v12.2.0
7c673cae
FG
1584 }
1585 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
31f18b77 1586 return CEPH_RELEASE_JEWEL; // v10.2.0
7c673cae
FG
1587 }
1588 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
31f18b77 1589 return CEPH_RELEASE_HAMMER; // v0.94.0
7c673cae
FG
1590 }
1591 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1592 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
7c673cae 1593 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
31f18b77 1594 return CEPH_RELEASE_FIREFLY; // v0.80.0
7c673cae
FG
1595 }
1596 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1597 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
31f18b77 1598 return CEPH_RELEASE_DUMPLING; // v0.67.0
7c673cae
FG
1599 }
1600 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
31f18b77 1601 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
7c673cae 1602 }
31f18b77 1603 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
7c673cae
FG
1604}
1605
11fdf7f2
TL
1606uint8_t OSDMap::get_require_min_compat_client() const
1607{
1608 return require_min_compat_client;
1609}
1610
7c673cae
FG
1611void OSDMap::_calc_up_osd_features()
1612{
1613 bool first = true;
1614 cached_up_osd_features = 0;
1615 for (int osd = 0; osd < max_osd; ++osd) {
1616 if (!is_up(osd))
1617 continue;
1618 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1619 if (xi.features == 0)
1620 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1621 if (first) {
1622 cached_up_osd_features = xi.features;
1623 first = false;
1624 } else {
1625 cached_up_osd_features &= xi.features;
1626 }
1627 }
1628}
1629
1630uint64_t OSDMap::get_up_osd_features() const
1631{
1632 return cached_up_osd_features;
1633}
1634
1635void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1636{
11fdf7f2 1637 using ceph::encode;
7c673cae
FG
1638 if (o->epoch == n->epoch)
1639 return;
1640
1641 int diff = 0;
1642
1643 // do addrs match?
1644 if (o->max_osd != n->max_osd)
1645 diff++;
1646 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
11fdf7f2
TL
1647 if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
1648 *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
1649 n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
7c673cae
FG
1650 else
1651 diff++;
11fdf7f2
TL
1652 if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
1653 *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
1654 n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
7c673cae
FG
1655 else
1656 diff++;
11fdf7f2
TL
1657 if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
1658 *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
1659 n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
7c673cae
FG
1660 else
1661 diff++;
11fdf7f2
TL
1662 if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
1663 *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
1664 n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
7c673cae
FG
1665 else
1666 diff++;
1667 }
1668 if (diff == 0) {
1669 // zoinks, no differences at all!
1670 n->osd_addrs = o->osd_addrs;
1671 }
1672
1673 // does crush match?
1674 bufferlist oc, nc;
11fdf7f2
TL
1675 encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1676 encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
1677 if (oc.contents_equal(nc)) {
1678 n->crush = o->crush;
1679 }
1680
1681 // does pg_temp match?
31f18b77
FG
1682 if (*o->pg_temp == *n->pg_temp)
1683 n->pg_temp = o->pg_temp;
7c673cae
FG
1684
1685 // does primary_temp match?
1686 if (o->primary_temp->size() == n->primary_temp->size()) {
1687 if (*o->primary_temp == *n->primary_temp)
1688 n->primary_temp = o->primary_temp;
1689 }
1690
1691 // do uuids match?
1692 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1693 *o->osd_uuid == *n->osd_uuid)
1694 n->osd_uuid = o->osd_uuid;
1695}
1696
1697void OSDMap::clean_temps(CephContext *cct,
11fdf7f2
TL
1698 const OSDMap& oldmap,
1699 const OSDMap& nextmap,
1700 Incremental *pending_inc)
7c673cae
FG
1701{
1702 ldout(cct, 10) << __func__ << dendl;
7c673cae 1703
11fdf7f2 1704 for (auto pg : *nextmap.pg_temp) {
7c673cae
FG
1705 // if pool does not exist, remove any existing pg_temps associated with
1706 // it. we don't care about pg_temps on the pending_inc either; if there
1707 // are new_pg_temp entries on the pending, clear them out just as well.
11fdf7f2 1708 if (!nextmap.have_pg_pool(pg.first.pool())) {
7c673cae
FG
1709 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1710 << " for nonexistent pool " << pg.first.pool() << dendl;
1711 pending_inc->new_pg_temp[pg.first].clear();
1712 continue;
1713 }
1714 // all osds down?
1715 unsigned num_up = 0;
1716 for (auto o : pg.second) {
11fdf7f2 1717 if (!nextmap.is_down(o)) {
7c673cae
FG
1718 ++num_up;
1719 break;
1720 }
1721 }
1722 if (num_up == 0) {
1723 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1724 << " with all down osds" << pg.second << dendl;
1725 pending_inc->new_pg_temp[pg.first].clear();
1726 continue;
1727 }
1728 // redundant pg_temp?
1729 vector<int> raw_up;
1730 int primary;
11fdf7f2 1731 nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1732 bool remove = false;
11fdf7f2 1733 if (raw_up == pg.second) {
7c673cae
FG
1734 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1735 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1736 remove = true;
1737 }
1738 // oversized pg_temp?
11fdf7f2 1739 if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
91327a77
AA
1740 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1741 << pg.second << " exceeds pool size" << dendl;
1742 remove = true;
1743 }
1744 if (remove) {
11fdf7f2 1745 if (oldmap.pg_temp->count(pg.first))
7c673cae
FG
1746 pending_inc->new_pg_temp[pg.first].clear();
1747 else
1748 pending_inc->new_pg_temp.erase(pg.first);
1749 }
1750 }
1751
11fdf7f2 1752 for (auto &pg : *nextmap.primary_temp) {
7c673cae 1753 // primary down?
11fdf7f2 1754 if (nextmap.is_down(pg.second)) {
7c673cae
FG
1755 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1756 << " to down " << pg.second << dendl;
1757 pending_inc->new_primary_temp[pg.first] = -1;
1758 continue;
1759 }
1760 // redundant primary_temp?
1761 vector<int> real_up, templess_up;
1762 int real_primary, templess_primary;
1763 pg_t pgid = pg.first;
11fdf7f2
TL
1764 nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1765 nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
7c673cae
FG
1766 if (real_primary == templess_primary){
1767 ldout(cct, 10) << __func__ << " removing primary_temp "
1768 << pgid << " -> " << real_primary
1769 << " (unnecessary/redundant)" << dendl;
11fdf7f2 1770 if (oldmap.primary_temp->count(pgid))
7c673cae
FG
1771 pending_inc->new_primary_temp[pgid] = -1;
1772 else
1773 pending_inc->new_primary_temp.erase(pgid);
1774 }
1775 }
1776}
1777
494da23a 1778void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
94b18763 1779{
494da23a
TL
1780 upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
1781 for (auto& p : pg_upmap)
1782 upmap_pgs->push_back(p.first);
1783 for (auto& p : pg_upmap_items)
1784 upmap_pgs->push_back(p.first);
1785}
94b18763 1786
494da23a
TL
1787bool OSDMap::check_pg_upmaps(
1788 CephContext *cct,
1789 const vector<pg_t>& to_check,
1790 vector<pg_t> *to_cancel,
1791 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
1792{
1793 bool any_change = false;
1794 map<int, map<int, float>> rule_weight_map;
28e407b8 1795 for (auto& pg : to_check) {
494da23a 1796 const pg_pool_t *pi = get_pg_pool(pg.pool());
11fdf7f2
TL
1797 if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
1798 ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
1799 << dendl;
494da23a 1800 to_cancel->push_back(pg);
11fdf7f2
TL
1801 continue;
1802 }
1803 if (pi->is_pending_merge(pg, nullptr)) {
1804 ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
1805 << dendl;
494da23a 1806 to_cancel->push_back(pg);
94b18763
FG
1807 continue;
1808 }
494da23a
TL
1809 vector<int> raw, up;
1810 pg_to_raw_upmap(pg, &raw, &up);
494da23a
TL
1811 auto crush_rule = get_pg_pool_crush_rule(pg);
1812 auto r = crush->verify_upmap(cct,
1813 crush_rule,
1814 get_pg_pool_size(pg),
1815 up);
a8e16298
TL
1816 if (r < 0) {
1817 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
1818 << " returning " << r
1819 << dendl;
494da23a 1820 to_cancel->push_back(pg);
a8e16298
TL
1821 continue;
1822 }
1823 // below we check against crush-topology changing..
28e407b8
AA
1824 map<int, float> weight_map;
1825 auto it = rule_weight_map.find(crush_rule);
1826 if (it == rule_weight_map.end()) {
494da23a 1827 auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
28e407b8
AA
1828 if (r < 0) {
1829 lderr(cct) << __func__ << " unable to get crush weight_map for "
494da23a
TL
1830 << "crush_rule " << crush_rule
1831 << dendl;
28e407b8
AA
1832 continue;
1833 }
1834 rule_weight_map[crush_rule] = weight_map;
1835 } else {
1836 weight_map = it->second;
1837 }
28e407b8 1838 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 1839 << " weight_map " << weight_map
94b18763 1840 << dendl;
a8e16298 1841 for (auto osd : up) {
28e407b8
AA
1842 auto it = weight_map.find(osd);
1843 if (it == weight_map.end()) {
92f5a8d4
TL
1844 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
1845 << "been moved out of the specific crush-tree"
1846 << dendl;
494da23a 1847 to_cancel->push_back(pg);
94b18763
FG
1848 break;
1849 }
494da23a 1850 auto adjusted_weight = get_weightf(it->first) * it->second;
28e407b8 1851 if (adjusted_weight == 0) {
92f5a8d4
TL
1852 ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
1853 << " is out/crush-out"
1854 << dendl;
494da23a 1855 to_cancel->push_back(pg);
94b18763
FG
1856 break;
1857 }
1858 }
eafe8130
TL
1859 if (!to_cancel->empty() && to_cancel->back() == pg)
1860 continue;
1861 // okay, upmap is valid
1862 // continue to check if it is still necessary
1863 auto i = pg_upmap.find(pg);
1864 if (i != pg_upmap.end() && raw == i->second) {
1865 ldout(cct, 10) << " removing redundant pg_upmap "
1866 << i->first << " " << i->second
1867 << dendl;
1868 to_cancel->push_back(pg);
1869 continue;
1870 }
1871 auto j = pg_upmap_items.find(pg);
1872 if (j != pg_upmap_items.end()) {
1873 mempool::osdmap::vector<pair<int,int>> newmap;
1874 for (auto& p : j->second) {
1875 if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
1876 // cancel mapping if source osd does not exist anymore
1877 continue;
1878 }
1879 if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
1880 p.second >= 0 && osd_weight[p.second] == 0) {
1881 // cancel mapping if target osd is out
1882 continue;
1883 }
1884 newmap.push_back(p);
1885 }
1886 if (newmap.empty()) {
1887 ldout(cct, 10) << " removing no-op pg_upmap_items "
1888 << j->first << " " << j->second
1889 << dendl;
1890 to_cancel->push_back(pg);
1891 } else if (newmap != j->second) {
1892 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
1893 << j->first << " " << j->second
1894 << " -> " << newmap
1895 << dendl;
1896 to_remap->insert({pg, newmap});
1897 any_change = true;
1898 }
1899 }
28e407b8 1900 }
494da23a
TL
1901 any_change = any_change || !to_cancel->empty();
1902 return any_change;
1903}
1904
1905void OSDMap::clean_pg_upmaps(
1906 CephContext *cct,
1907 Incremental *pending_inc,
1908 const vector<pg_t>& to_cancel,
1909 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
1910{
28e407b8 1911 for (auto &pg: to_cancel) {
494da23a
TL
1912 auto i = pending_inc->new_pg_upmap.find(pg);
1913 if (i != pending_inc->new_pg_upmap.end()) {
1914 ldout(cct, 10) << __func__ << " cancel invalid pending "
1915 << "pg_upmap entry "
1916 << i->first << "->" << i->second
1917 << dendl;
1918 pending_inc->new_pg_upmap.erase(i);
94b18763 1919 }
494da23a
TL
1920 auto j = pg_upmap.find(pg);
1921 if (j != pg_upmap.end()) {
1922 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
1923 << j->first << "->" << j->second
1924 << dendl;
1925 pending_inc->old_pg_upmap.insert(pg);
1926 }
1927 auto p = pending_inc->new_pg_upmap_items.find(pg);
1928 if (p != pending_inc->new_pg_upmap_items.end()) {
1929 ldout(cct, 10) << __func__ << " cancel invalid pending "
1930 << "pg_upmap_items entry "
1931 << p->first << "->" << p->second
1932 << dendl;
1933 pending_inc->new_pg_upmap_items.erase(p);
1934 }
1935 auto q = pg_upmap_items.find(pg);
1936 if (q != pg_upmap_items.end()) {
1937 ldout(cct, 10) << __func__ << " cancel invalid "
1938 << "pg_upmap_items entry "
1939 << q->first << "->" << q->second
1940 << dendl;
1941 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
1942 }
1943 }
494da23a
TL
1944 for (auto& i : to_remap)
1945 pending_inc->new_pg_upmap_items[i.first] = i.second;
1946}
1947
1948bool OSDMap::clean_pg_upmaps(
1949 CephContext *cct,
1950 Incremental *pending_inc) const
1951{
1952 ldout(cct, 10) << __func__ << dendl;
1953 vector<pg_t> to_check;
1954 vector<pg_t> to_cancel;
1955 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
1956
1957 get_upmap_pgs(&to_check);
1958 auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
1959 clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
1960 return any_change;
94b18763
FG
1961}
1962
7c673cae
FG
1963int OSDMap::apply_incremental(const Incremental &inc)
1964{
1965 new_blacklist_entries = false;
1966 if (inc.epoch == 1)
1967 fsid = inc.fsid;
1968 else if (inc.fsid != fsid)
1969 return -EINVAL;
1970
11fdf7f2 1971 ceph_assert(inc.epoch == epoch+1);
7c673cae
FG
1972
1973 epoch++;
1974 modified = inc.modified;
1975
1976 // full map?
1977 if (inc.fullmap.length()) {
1978 bufferlist bl(inc.fullmap);
1979 decode(bl);
1980 return 0;
1981 }
1982
1983 // nope, incremental.
31f18b77 1984 if (inc.new_flags >= 0) {
7c673cae 1985 flags = inc.new_flags;
31f18b77
FG
1986 // the below is just to cover a newly-upgraded luminous mon
1987 // cluster that has to set require_jewel_osds or
1988 // require_kraken_osds before the osds can be upgraded to
1989 // luminous.
1990 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
1991 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
1992 require_osd_release = CEPH_RELEASE_KRAKEN;
1993 }
1994 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
1995 if (require_osd_release < CEPH_RELEASE_JEWEL) {
1996 require_osd_release = CEPH_RELEASE_JEWEL;
1997 }
1998 }
1999 }
7c673cae
FG
2000
2001 if (inc.new_max_osd >= 0)
2002 set_max_osd(inc.new_max_osd);
2003
2004 if (inc.new_pool_max != -1)
2005 pool_max = inc.new_pool_max;
2006
2007 for (const auto &pool : inc.new_pools) {
2008 pools[pool.first] = pool.second;
2009 pools[pool.first].last_change = epoch;
2010 }
2011
11fdf7f2
TL
2012 new_removed_snaps = inc.new_removed_snaps;
2013 new_purged_snaps = inc.new_purged_snaps;
2014 for (auto p = new_removed_snaps.begin();
2015 p != new_removed_snaps.end();
2016 ++p) {
2017 removed_snaps_queue[p->first].union_of(p->second);
2018 }
2019 for (auto p = new_purged_snaps.begin();
2020 p != new_purged_snaps.end();
2021 ++p) {
2022 auto q = removed_snaps_queue.find(p->first);
2023 ceph_assert(q != removed_snaps_queue.end());
2024 q->second.subtract(p->second);
2025 if (q->second.empty()) {
2026 removed_snaps_queue.erase(q);
2027 }
2028 }
2029
2030 if (inc.new_last_up_change != utime_t()) {
2031 last_up_change = inc.new_last_up_change;
2032 }
2033 if (inc.new_last_in_change != utime_t()) {
2034 last_in_change = inc.new_last_in_change;
2035 }
2036
7c673cae
FG
2037 for (const auto &pname : inc.new_pool_names) {
2038 auto pool_name_entry = pool_name.find(pname.first);
2039 if (pool_name_entry != pool_name.end()) {
2040 name_pool.erase(pool_name_entry->second);
2041 pool_name_entry->second = pname.second;
2042 } else {
2043 pool_name[pname.first] = pname.second;
2044 }
2045 name_pool[pname.second] = pname.first;
2046 }
2047
2048 for (const auto &pool : inc.old_pools) {
2049 pools.erase(pool);
2050 name_pool.erase(pool_name[pool]);
2051 pool_name.erase(pool);
2052 }
2053
2054 for (const auto &weight : inc.new_weight) {
2055 set_weight(weight.first, weight.second);
2056
2057 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2058 // xinfo old_weight.
2059 if (weight.second) {
2060 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
2061 osd_xinfo[weight.first].old_weight = 0;
2062 }
2063 }
2064
2065 for (const auto &primary_affinity : inc.new_primary_affinity) {
2066 set_primary_affinity(primary_affinity.first, primary_affinity.second);
2067 }
2068
2069 // erasure_code_profiles
2070 for (const auto &profile : inc.old_erasure_code_profiles)
2071 erasure_code_profiles.erase(profile);
2072
2073 for (const auto &profile : inc.new_erasure_code_profiles) {
2074 set_erasure_code_profile(profile.first, profile.second);
2075 }
2076
2077 // up/down
2078 for (const auto &state : inc.new_state) {
2079 const auto osd = state.first;
2080 int s = state.second ? state.second : CEPH_OSD_UP;
2081 if ((osd_state[osd] & CEPH_OSD_UP) &&
2082 (s & CEPH_OSD_UP)) {
2083 osd_info[osd].down_at = epoch;
2084 osd_xinfo[osd].down_stamp = modified;
2085 }
2086 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
2087 (s & CEPH_OSD_EXISTS)) {
2088 // osd is destroyed; clear out anything interesting.
2089 (*osd_uuid)[osd] = uuid_d();
2090 osd_info[osd] = osd_info_t();
2091 osd_xinfo[osd] = osd_xinfo_t();
2092 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
11fdf7f2
TL
2093 osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
2094 osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
2095 osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
2096 osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
7c673cae
FG
2097 osd_state[osd] = 0;
2098 } else {
2099 osd_state[osd] ^= s;
2100 }
2101 }
2102
2103 for (const auto &client : inc.new_up_client) {
2104 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
11fdf7f2
TL
2105 osd_addrs->client_addrs[client.first].reset(
2106 new entity_addrvec_t(client.second));
2107 osd_addrs->hb_back_addrs[client.first].reset(
2108 new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
2109 osd_addrs->hb_front_addrs[client.first].reset(
2110 new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
7c673cae
FG
2111
2112 osd_info[client.first].up_from = epoch;
2113 }
2114
2115 for (const auto &cluster : inc.new_up_cluster)
11fdf7f2
TL
2116 osd_addrs->cluster_addrs[cluster.first].reset(
2117 new entity_addrvec_t(cluster.second));
7c673cae
FG
2118
2119 // info
2120 for (const auto &thru : inc.new_up_thru)
2121 osd_info[thru.first].up_thru = thru.second;
2122
2123 for (const auto &interval : inc.new_last_clean_interval) {
2124 osd_info[interval.first].last_clean_begin = interval.second.first;
2125 osd_info[interval.first].last_clean_end = interval.second.second;
2126 }
2127
2128 for (const auto &lost : inc.new_lost)
2129 osd_info[lost.first].lost_at = lost.second;
2130
2131 // xinfo
2132 for (const auto &xinfo : inc.new_xinfo)
2133 osd_xinfo[xinfo.first] = xinfo.second;
2134
2135 // uuid
2136 for (const auto &uuid : inc.new_uuid)
2137 (*osd_uuid)[uuid.first] = uuid.second;
2138
2139 // pg rebuild
2140 for (const auto &pg : inc.new_pg_temp) {
2141 if (pg.second.empty())
2142 pg_temp->erase(pg.first);
2143 else
31f18b77
FG
2144 pg_temp->set(pg.first, pg.second);
2145 }
2146 if (!inc.new_pg_temp.empty()) {
2147 // make sure pg_temp is efficiently stored
2148 pg_temp->rebuild();
7c673cae
FG
2149 }
2150
2151 for (const auto &pg : inc.new_primary_temp) {
2152 if (pg.second == -1)
2153 primary_temp->erase(pg.first);
2154 else
2155 (*primary_temp)[pg.first] = pg.second;
2156 }
2157
2158 for (auto& p : inc.new_pg_upmap) {
2159 pg_upmap[p.first] = p.second;
2160 }
2161 for (auto& pg : inc.old_pg_upmap) {
2162 pg_upmap.erase(pg);
2163 }
2164 for (auto& p : inc.new_pg_upmap_items) {
2165 pg_upmap_items[p.first] = p.second;
2166 }
2167 for (auto& pg : inc.old_pg_upmap_items) {
2168 pg_upmap_items.erase(pg);
2169 }
2170
2171 // blacklist
2172 if (!inc.new_blacklist.empty()) {
2173 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
2174 new_blacklist_entries = true;
2175 }
2176 for (const auto &addr : inc.old_blacklist)
2177 blacklist.erase(addr);
2178
81eedcae
TL
2179 for (auto& i : inc.new_crush_node_flags) {
2180 if (i.second) {
2181 crush_node_flags[i.first] = i.second;
2182 } else {
2183 crush_node_flags.erase(i.first);
2184 }
2185 }
2186
2187 for (auto& i : inc.new_device_class_flags) {
2188 if (i.second) {
2189 device_class_flags[i.first] = i.second;
2190 } else {
2191 device_class_flags.erase(i.first);
2192 }
2193 }
2194
7c673cae
FG
2195 // cluster snapshot?
2196 if (inc.cluster_snapshot.length()) {
2197 cluster_snapshot = inc.cluster_snapshot;
2198 cluster_snapshot_epoch = inc.epoch;
2199 } else {
2200 cluster_snapshot.clear();
2201 cluster_snapshot_epoch = 0;
2202 }
2203
2204 if (inc.new_nearfull_ratio >= 0) {
2205 nearfull_ratio = inc.new_nearfull_ratio;
2206 }
2207 if (inc.new_backfillfull_ratio >= 0) {
2208 backfillfull_ratio = inc.new_backfillfull_ratio;
2209 }
2210 if (inc.new_full_ratio >= 0) {
2211 full_ratio = inc.new_full_ratio;
2212 }
31f18b77 2213 if (inc.new_require_min_compat_client > 0) {
7c673cae
FG
2214 require_min_compat_client = inc.new_require_min_compat_client;
2215 }
31f18b77
FG
2216 if (inc.new_require_osd_release >= 0) {
2217 require_osd_release = inc.new_require_osd_release;
2218 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2219 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2220 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2221 }
2222 }
7c673cae 2223
11fdf7f2
TL
2224 if (inc.new_require_osd_release >= 0) {
2225 require_osd_release = inc.new_require_osd_release;
2226 if (require_osd_release >= CEPH_RELEASE_NAUTILUS) {
2227 flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
2228 }
2229 }
7c673cae
FG
2230 // do new crush map last (after up/down stuff)
2231 if (inc.crush.length()) {
2232 bufferlist bl(inc.crush);
11fdf7f2 2233 auto blp = bl.cbegin();
7c673cae
FG
2234 crush.reset(new CrushWrapper);
2235 crush->decode(blp);
31f18b77
FG
2236 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2237 // only increment if this is a luminous-encoded osdmap, lest
2238 // the mon's crush_version diverge from what the osds or others
2239 // are decoding and applying on their end. if we won't encode
2240 // it in the canonical version, don't change it.
2241 ++crush_version;
2242 }
81eedcae
TL
2243 for (auto it = device_class_flags.begin();
2244 it != device_class_flags.end();) {
2245 const char* class_name = crush->get_class_name(it->first);
2246 if (!class_name) // device class is gone
2247 it = device_class_flags.erase(it);
2248 else
2249 it++;
2250 }
7c673cae
FG
2251 }
2252
2253 calc_num_osds();
2254 _calc_up_osd_features();
2255 return 0;
2256}
2257
2258// mapping
2259int OSDMap::map_to_pg(
2260 int64_t poolid,
2261 const string& name,
2262 const string& key,
2263 const string& nspace,
2264 pg_t *pg) const
2265{
2266 // calculate ps (placement seed)
2267 const pg_pool_t *pool = get_pg_pool(poolid);
2268 if (!pool)
2269 return -ENOENT;
2270 ps_t ps;
2271 if (!key.empty())
2272 ps = pool->hash_key(key, nspace);
2273 else
2274 ps = pool->hash_key(name, nspace);
2275 *pg = pg_t(ps, poolid);
2276 return 0;
2277}
2278
2279int OSDMap::object_locator_to_pg(
2280 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2281{
2282 if (loc.hash >= 0) {
2283 if (!get_pg_pool(loc.get_pool())) {
2284 return -ENOENT;
2285 }
2286 pg = pg_t(loc.hash, loc.get_pool());
2287 return 0;
2288 }
2289 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2290}
2291
2292ceph_object_layout OSDMap::make_object_layout(
2293 object_t oid, int pg_pool, string nspace) const
2294{
2295 object_locator_t loc(pg_pool, nspace);
2296
2297 ceph_object_layout ol;
2298 pg_t pgid = object_locator_to_pg(oid, loc);
2299 ol.ol_pgid = pgid.get_old_pg().v;
2300 ol.ol_stripe_unit = 0;
2301 return ol;
2302}
2303
2304void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2305 vector<int>& osds) const
2306{
2307 if (pool.can_shift_osds()) {
2308 unsigned removed = 0;
2309 for (unsigned i = 0; i < osds.size(); i++) {
2310 if (!exists(osds[i])) {
2311 removed++;
2312 continue;
2313 }
2314 if (removed) {
2315 osds[i - removed] = osds[i];
2316 }
2317 }
2318 if (removed)
2319 osds.resize(osds.size() - removed);
2320 } else {
2321 for (auto& osd : osds) {
2322 if (!exists(osd))
2323 osd = CRUSH_ITEM_NONE;
2324 }
2325 }
2326}
2327
31f18b77 2328void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2329 const pg_pool_t& pool, pg_t pg,
2330 vector<int> *osds,
2331 ps_t *ppps) const
2332{
2333 // map to osds[]
2334 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2335 unsigned size = pool.get_size();
2336
2337 // what crush rule?
31f18b77 2338 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
7c673cae
FG
2339 if (ruleno >= 0)
2340 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2341
2342 _remove_nonexistent_osds(pool, *osds);
2343
2344 if (ppps)
2345 *ppps = pps;
7c673cae
FG
2346}
2347
2348int OSDMap::_pick_primary(const vector<int>& osds) const
2349{
2350 for (auto osd : osds) {
2351 if (osd != CRUSH_ITEM_NONE) {
2352 return osd;
2353 }
2354 }
2355 return -1;
2356}
2357
224ce89b 2358void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2359{
2360 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2361 auto p = pg_upmap.find(pg);
2362 if (p != pg_upmap.end()) {
2363 // make sure targets aren't marked out
2364 for (auto osd : p->second) {
91327a77
AA
2365 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2366 osd_weight[osd] == 0) {
7c673cae
FG
2367 // reject/ignore the explicit mapping
2368 return;
2369 }
2370 }
2371 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2372 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2373 }
2374
2375 auto q = pg_upmap_items.find(pg);
2376 if (q != pg_upmap_items.end()) {
181888fb
FG
2377 // NOTE: this approach does not allow a bidirectional swap,
2378 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2379 for (auto& r : q->second) {
2380 // make sure the replacement value doesn't already appear
2381 bool exists = false;
2382 ssize_t pos = -1;
2383 for (unsigned i = 0; i < raw->size(); ++i) {
2384 int osd = (*raw)[i];
2385 if (osd == r.second) {
2386 exists = true;
2387 break;
2388 }
2389 // ignore mapping if target is marked out (or invalid osd id)
2390 if (osd == r.first &&
2391 pos < 0 &&
2392 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2393 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2394 pos = i;
2395 }
2396 }
2397 if (!exists && pos >= 0) {
2398 (*raw)[pos] = r.second;
7c673cae
FG
2399 }
2400 }
2401 }
2402}
2403
2404// pg -> (up osd list)
2405void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2406 vector<int> *up) const
2407{
2408 if (pool.can_shift_osds()) {
2409 // shift left
2410 up->clear();
2411 up->reserve(raw.size());
2412 for (unsigned i=0; i<raw.size(); i++) {
2413 if (!exists(raw[i]) || is_down(raw[i]))
2414 continue;
2415 up->push_back(raw[i]);
2416 }
2417 } else {
2418 // set down/dne devices to NONE
2419 up->resize(raw.size());
2420 for (int i = raw.size() - 1; i >= 0; --i) {
2421 if (!exists(raw[i]) || is_down(raw[i])) {
2422 (*up)[i] = CRUSH_ITEM_NONE;
2423 } else {
2424 (*up)[i] = raw[i];
2425 }
2426 }
2427 }
2428}
2429
2430void OSDMap::_apply_primary_affinity(ps_t seed,
2431 const pg_pool_t& pool,
2432 vector<int> *osds,
2433 int *primary) const
2434{
2435 // do we have any non-default primary_affinity values for these osds?
2436 if (!osd_primary_affinity)
2437 return;
2438
2439 bool any = false;
2440 for (const auto osd : *osds) {
2441 if (osd != CRUSH_ITEM_NONE &&
2442 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2443 any = true;
2444 break;
2445 }
2446 }
2447 if (!any)
2448 return;
2449
2450 // pick the primary. feed both the seed (for the pg) and the osd
2451 // into the hash/rng so that a proportional fraction of an osd's pgs
2452 // get rejected as primary.
2453 int pos = -1;
2454 for (unsigned i = 0; i < osds->size(); ++i) {
2455 int o = (*osds)[i];
2456 if (o == CRUSH_ITEM_NONE)
2457 continue;
2458 unsigned a = (*osd_primary_affinity)[o];
2459 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2460 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2461 seed, o) >> 16) >= a) {
2462 // we chose not to use this primary. note it anyway as a
2463 // fallback in case we don't pick anyone else, but keep looking.
2464 if (pos < 0)
2465 pos = i;
2466 } else {
2467 pos = i;
2468 break;
2469 }
2470 }
2471 if (pos < 0)
2472 return;
2473
2474 *primary = (*osds)[pos];
2475
2476 if (pool.can_shift_osds() && pos > 0) {
2477 // move the new primary to the front.
2478 for (int i = pos; i > 0; --i) {
2479 (*osds)[i] = (*osds)[i-1];
2480 }
2481 (*osds)[0] = *primary;
2482 }
2483}
2484
2485void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2486 vector<int> *temp_pg, int *temp_primary) const
2487{
2488 pg = pool.raw_pg_to_pg(pg);
2489 const auto p = pg_temp->find(pg);
2490 temp_pg->clear();
2491 if (p != pg_temp->end()) {
2492 for (unsigned i=0; i<p->second.size(); i++) {
2493 if (!exists(p->second[i]) || is_down(p->second[i])) {
2494 if (pool.can_shift_osds()) {
2495 continue;
2496 } else {
2497 temp_pg->push_back(CRUSH_ITEM_NONE);
2498 }
2499 } else {
2500 temp_pg->push_back(p->second[i]);
2501 }
2502 }
2503 }
2504 const auto &pp = primary_temp->find(pg);
2505 *temp_primary = -1;
2506 if (pp != primary_temp->end()) {
2507 *temp_primary = pp->second;
2508 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2509 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2510 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2511 *temp_primary = (*temp_pg)[i];
2512 break;
2513 }
2514 }
2515 }
2516}
2517
31f18b77 2518void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae 2519{
7c673cae 2520 const pg_pool_t *pool = get_pg_pool(pg.pool());
11fdf7f2
TL
2521 if (!pool) {
2522 *primary = -1;
2523 raw->clear();
31f18b77 2524 return;
11fdf7f2 2525 }
31f18b77 2526 _pg_to_raw_osds(*pool, pg, raw, NULL);
11fdf7f2 2527 *primary = _pick_primary(*raw);
7c673cae
FG
2528}
2529
494da23a
TL
2530void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
2531 vector<int> *raw_upmap) const
a8e16298
TL
2532{
2533 auto pool = get_pg_pool(pg.pool());
2534 if (!pool) {
2535 raw_upmap->clear();
2536 return;
2537 }
494da23a
TL
2538 _pg_to_raw_osds(*pool, pg, raw, NULL);
2539 *raw_upmap = *raw;
a8e16298
TL
2540 _apply_upmap(*pool, pg, raw_upmap);
2541}
2542
7c673cae
FG
2543void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2544{
2545 const pg_pool_t *pool = get_pg_pool(pg.pool());
2546 if (!pool) {
11fdf7f2
TL
2547 *primary = -1;
2548 up->clear();
7c673cae
FG
2549 return;
2550 }
2551 vector<int> raw;
2552 ps_t pps;
2553 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2554 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2555 _raw_to_up_osds(*pool, raw, up);
2556 *primary = _pick_primary(raw);
2557 _apply_primary_affinity(pps, *pool, up, primary);
2558}
31f18b77 2559
7c673cae
FG
2560void OSDMap::_pg_to_up_acting_osds(
2561 const pg_t& pg, vector<int> *up, int *up_primary,
2562 vector<int> *acting, int *acting_primary,
2563 bool raw_pg_to_pg) const
2564{
2565 const pg_pool_t *pool = get_pg_pool(pg.pool());
2566 if (!pool ||
2567 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2568 if (up)
2569 up->clear();
2570 if (up_primary)
2571 *up_primary = -1;
2572 if (acting)
2573 acting->clear();
2574 if (acting_primary)
2575 *acting_primary = -1;
2576 return;
2577 }
2578 vector<int> raw;
2579 vector<int> _up;
2580 vector<int> _acting;
2581 int _up_primary;
2582 int _acting_primary;
2583 ps_t pps;
2584 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2585 if (_acting.empty() || up || up_primary) {
2586 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2587 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2588 _raw_to_up_osds(*pool, raw, &_up);
2589 _up_primary = _pick_primary(_up);
2590 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2591 if (_acting.empty()) {
2592 _acting = _up;
2593 if (_acting_primary == -1) {
2594 _acting_primary = _up_primary;
2595 }
2596 }
2597
2598 if (up)
2599 up->swap(_up);
2600 if (up_primary)
2601 *up_primary = _up_primary;
2602 }
2603
2604 if (acting)
2605 acting->swap(_acting);
2606 if (acting_primary)
2607 *acting_primary = _acting_primary;
2608}
2609
2610int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
2611{
2612 if (!nrep)
2613 nrep = acting.size();
2614 for (int i=0; i<nrep; i++)
2615 if (acting[i] == osd)
2616 return i;
2617 return -1;
2618}
2619
// Role of osd within acting: forwards to calc_pg_rank() with the same
// arguments and returns its result unchanged (index in acting, or -1).
2620int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2621{
2622 return calc_pg_rank(osd, acting, nrep);
2623}
2624
2625bool OSDMap::primary_changed(
2626 int oldprimary,
2627 const vector<int> &oldacting,
2628 int newprimary,
2629 const vector<int> &newacting)
2630{
2631 if (oldacting.empty() && newacting.empty())
2632 return false; // both still empty
2633 if (oldacting.empty() ^ newacting.empty())
2634 return true; // was empty, now not, or vice versa
2635 if (oldprimary != newprimary)
2636 return true; // primary changed
2637 if (calc_pg_rank(oldprimary, oldacting) !=
2638 calc_pg_rank(newprimary, newacting))
2639 return true;
2640 return false; // same primary (tho replicas may have changed)
2641}
2642
28e407b8
AA
2643uint64_t OSDMap::get_encoding_features() const
2644{
2645 uint64_t f = SIGNIFICANT_FEATURES;
11fdf7f2
TL
2646 if (require_osd_release < CEPH_RELEASE_NAUTILUS) {
2647 f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
2648 }
2649 if (require_osd_release < CEPH_RELEASE_MIMIC) {
2650 f &= ~CEPH_FEATURE_SERVER_MIMIC;
2651 }
28e407b8
AA
2652 if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
2653 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2654 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2655 }
2656 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
2657 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2658 CEPH_FEATURE_MSG_ADDR2);
28e407b8
AA
2659 }
2660 if (require_osd_release < CEPH_RELEASE_JEWEL) {
2661 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2662 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2663 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2664 }
2665 return f;
2666}
7c673cae
FG
2667
2668// serialize, unserialize
2669void OSDMap::encode_client_old(bufferlist& bl) const
2670{
11fdf7f2 2671 using ceph::encode;
7c673cae 2672 __u16 v = 5;
11fdf7f2 2673 encode(v, bl);
7c673cae
FG
2674
2675 // base
11fdf7f2
TL
2676 encode(fsid, bl);
2677 encode(epoch, bl);
2678 encode(created, bl);
2679 encode(modified, bl);
7c673cae 2680
11fdf7f2 2681 // for encode(pools, bl);
7c673cae 2682 __u32 n = pools.size();
11fdf7f2 2683 encode(n, bl);
7c673cae
FG
2684
2685 for (const auto &pool : pools) {
2686 n = pool.first;
11fdf7f2
TL
2687 encode(n, bl);
2688 encode(pool.second, bl, 0);
7c673cae 2689 }
11fdf7f2 2690 // for encode(pool_name, bl);
7c673cae 2691 n = pool_name.size();
11fdf7f2 2692 encode(n, bl);
7c673cae
FG
2693 for (const auto &pname : pool_name) {
2694 n = pname.first;
11fdf7f2
TL
2695 encode(n, bl);
2696 encode(pname.second, bl);
7c673cae 2697 }
11fdf7f2 2698 // for encode(pool_max, bl);
7c673cae 2699 n = pool_max;
11fdf7f2 2700 encode(n, bl);
7c673cae 2701
11fdf7f2 2702 encode(flags, bl);
7c673cae 2703
11fdf7f2 2704 encode(max_osd, bl);
31f18b77
FG
2705 {
2706 uint32_t n = osd_state.size();
11fdf7f2 2707 encode(n, bl);
31f18b77 2708 for (auto s : osd_state) {
11fdf7f2 2709 encode((uint8_t)s, bl);
31f18b77
FG
2710 }
2711 }
11fdf7f2
TL
2712 encode(osd_weight, bl);
2713 encode(osd_addrs->client_addrs, bl, 0);
7c673cae 2714
11fdf7f2 2715 // for encode(pg_temp, bl);
7c673cae 2716 n = pg_temp->size();
11fdf7f2 2717 encode(n, bl);
7c673cae
FG
2718 for (const auto pg : *pg_temp) {
2719 old_pg_t opg = pg.first.get_old_pg();
11fdf7f2
TL
2720 encode(opg, bl);
2721 encode(pg.second, bl);
7c673cae
FG
2722 }
2723
2724 // crush
2725 bufferlist cbl;
2726 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2727 encode(cbl, bl);
7c673cae
FG
2728}
2729
2730void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2731{
11fdf7f2 2732 using ceph::encode;
7c673cae
FG
2733 if ((features & CEPH_FEATURE_PGID64) == 0) {
2734 encode_client_old(bl);
2735 return;
2736 }
2737
2738 __u16 v = 6;
11fdf7f2 2739 encode(v, bl);
7c673cae
FG
2740
2741 // base
11fdf7f2
TL
2742 encode(fsid, bl);
2743 encode(epoch, bl);
2744 encode(created, bl);
2745 encode(modified, bl);
7c673cae 2746
11fdf7f2
TL
2747 encode(pools, bl, features);
2748 encode(pool_name, bl);
2749 encode(pool_max, bl);
7c673cae 2750
11fdf7f2 2751 encode(flags, bl);
7c673cae 2752
11fdf7f2 2753 encode(max_osd, bl);
31f18b77
FG
2754 {
2755 uint32_t n = osd_state.size();
11fdf7f2 2756 encode(n, bl);
31f18b77 2757 for (auto s : osd_state) {
11fdf7f2 2758 encode((uint8_t)s, bl);
31f18b77
FG
2759 }
2760 }
11fdf7f2
TL
2761 encode(osd_weight, bl);
2762 encode(osd_addrs->client_addrs, bl, features);
7c673cae 2763
11fdf7f2 2764 encode(*pg_temp, bl);
7c673cae
FG
2765
2766 // crush
2767 bufferlist cbl;
2768 crush->encode(cbl, 0 /* legacy (no) features */);
11fdf7f2 2769 encode(cbl, bl);
7c673cae
FG
2770
2771 // extended
2772 __u16 ev = 10;
11fdf7f2
TL
2773 encode(ev, bl);
2774 encode(osd_addrs->hb_back_addrs, bl, features);
2775 encode(osd_info, bl);
2776 encode(blacklist, bl, features);
2777 encode(osd_addrs->cluster_addrs, bl, features);
2778 encode(cluster_snapshot_epoch, bl);
2779 encode(cluster_snapshot, bl);
2780 encode(*osd_uuid, bl);
2781 encode(osd_xinfo, bl);
2782 encode(osd_addrs->hb_front_addrs, bl, features);
7c673cae
FG
2783}
2784
11fdf7f2
TL
2785/* for a description of osdmap versions, and when they were introduced, please
2786 * refer to
2787 * doc/dev/osd_internals/osdmap_versions.txt
2788 */
7c673cae
FG
// Serialize this map into bl.
//
// Without CEPH_FEATURE_OSDMAP_ENC in features the work is handed to
// encode_classic().  Otherwise the output is an outer wrapper (version 8,
// compat 7) containing, in order:
//   1. a "client-usable" section,
//   2. an "osd-only" extended section,
//   3. a 4-byte crc.
// The version of each inner section is chosen from the SERVER_* bits in
// features, and fields newer than the chosen version are left out.
//
// Callers must set CEPH_FEATURE_RESERVED in features (asserted below); the
// bit is stripped before features is used for anything else.
//
// On return the crc and crc_defined members have been updated.
// NOTE(review): both are assigned from this const method, so they are
// presumably declared mutable -- confirm in OSDMap.h.
2789void OSDMap::encode(bufferlist& bl, uint64_t features) const
2790{
11fdf7f2 2791 using ceph::encode;
7c673cae
FG
2792 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2793 encode_classic(bl, features);
2794 return;
2795 }
2796
2797 // only a select set of callers should *ever* be encoding new
2798 // OSDMaps. others should be passing around the canonical encoded
2799 // buffers from on high. select out those callers by passing in an
2800 // "impossible" feature bit.
11fdf7f2 2801 ceph_assert(features & CEPH_FEATURE_RESERVED);
7c673cae
FG
2802 features &= ~CEPH_FEATURE_RESERVED;
2803
// Offsets into bl that delimit the bytes covered by the crc: everything
// from start_offset up to the crc hole, plus anything after the hole.
2804 size_t start_offset = bl.length();
2805 size_t tail_offset;
11fdf7f2
TL
2806 size_t crc_offset;
2807 std::optional<buffer::list::contiguous_filler> crc_filler;
7c673cae
FG
2808
2809 // meta-encoding: how we include client-used and osd-specific data
2810 ENCODE_START(8, 7, bl);
2811
2812 {
28e407b8
AA
2813 // NOTE: any new encoding dependencies must be reflected by
2814 // SIGNIFICANT_FEATURES
// Client section version: 9 by default, lowered to 3 / 6 / 7 when the
// peer lacks SERVER_LUMINOUS / SERVER_MIMIC / SERVER_NAUTILUS respectively.
11fdf7f2 2815 uint8_t v = 9;
31f18b77 2816 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae 2817 v = 3;
11fdf7f2
TL
2818 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2819 v = 6;
2820 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2821 v = 7;
7c673cae
FG
2822 }
2823 ENCODE_START(v, 1, bl); // client-usable data
2824 // base
11fdf7f2
TL
2825 encode(fsid, bl);
2826 encode(epoch, bl);
2827 encode(created, bl);
2828 encode(modified, bl);
7c673cae 2829
11fdf7f2
TL
2830 encode(pools, bl, features);
2831 encode(pool_name, bl);
2832 encode(pool_max, bl);
7c673cae 2833
// Versions below 4 do not encode require_osd_release (it is only written
// in the osd-only section when target_v >= 5), so the requirement is
// expressed through the legacy REQUIRE_* flag bits on a local copy of flags.
31f18b77
FG
2834 if (v < 4) {
2835 decltype(flags) f = flags;
2836 if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
c07f9fc5 2837 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2838 else if (require_osd_release == CEPH_RELEASE_KRAKEN)
2839 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
2840 else if (require_osd_release == CEPH_RELEASE_JEWEL)
2841 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
11fdf7f2 2842 encode(f, bl);
31f18b77 2843 } else {
11fdf7f2 2844 encode(flags, bl);
31f18b77 2845 }
7c673cae 2846
11fdf7f2 2847 encode(max_osd, bl);
// From v5 on osd_state is encoded as-is; older versions get a count
// followed by each state narrowed to a single byte.
31f18b77 2848 if (v >= 5) {
11fdf7f2 2849 encode(osd_state, bl);
31f18b77
FG
2850 } else {
2851 uint32_t n = osd_state.size();
11fdf7f2 2852 encode(n, bl);
31f18b77 2853 for (auto s : osd_state) {
11fdf7f2 2854 encode((uint8_t)s, bl);
31f18b77
FG
2855 }
2856 }
11fdf7f2
TL
2857 encode(osd_weight, bl);
// Before v8 the client address vectors go through
// encode_addrvec_pvec_as_addr() instead of the plain encoder.
2858 if (v >= 8) {
2859 encode(osd_addrs->client_addrs, bl, features);
2860 } else {
2861 encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
2862 }
7c673cae 2863
11fdf7f2
TL
2864 encode(*pg_temp, bl);
2865 encode(*primary_temp, bl);
// With no primary-affinity table an empty vector is written in its place.
// The local `v` declared in the else branch shadows the section version
// only inside that branch.
7c673cae 2866 if (osd_primary_affinity) {
11fdf7f2 2867 encode(*osd_primary_affinity, bl);
7c673cae
FG
2868 } else {
2869 vector<__u32> v;
11fdf7f2 2870 encode(v, bl);
7c673cae
FG
2871 }
2872
2873 // crush
2874 bufferlist cbl;
2875 crush->encode(cbl, features);
11fdf7f2
TL
2876 encode(cbl, bl);
2877 encode(erasure_code_profiles, bl);
7c673cae
FG
2878
// Versions below 4 have no slot for upmaps, so encoding for such a peer
// asserts that none are present rather than silently dropping them.
2879 if (v >= 4) {
11fdf7f2
TL
2880 encode(pg_upmap, bl);
2881 encode(pg_upmap_items, bl);
7c673cae 2882 } else {
11fdf7f2
TL
2883 ceph_assert(pg_upmap.empty());
2884 ceph_assert(pg_upmap_items.empty());
7c673cae 2885 }
31f18b77 2886 if (v >= 6) {
11fdf7f2
TL
2887 encode(crush_version, bl);
2888 }
2889 if (v >= 7) {
2890 encode(new_removed_snaps, bl);
2891 encode(new_purged_snaps, bl);
2892 }
2893 if (v >= 9) {
2894 encode(last_up_change, bl);
2895 encode(last_in_change, bl);
31f18b77 2896 }
7c673cae
FG
2897 ENCODE_FINISH(bl); // client-usable data
2898 }
2899
2900 {
28e407b8
AA
2901 // NOTE: any new encoding dependencies must be reflected by
2902 // SIGNIFICANT_FEATURES
// OSD-only section version: 9 by default, lowered to 1 / 5 / 6 when the
// peer lacks SERVER_LUMINOUS / SERVER_MIMIC / SERVER_NAUTILUS respectively.
81eedcae 2903 uint8_t target_v = 9;
7c673cae
FG
2904 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2905 target_v = 1;
11fdf7f2
TL
2906 } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
2907 target_v = 5;
2908 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
2909 target_v = 6;
7c673cae
FG
2910 }
2911 ENCODE_START(target_v, 1, bl); // extended, osd-only data
// Each of the three address tables in this section (hb_back, cluster,
// hb_front) uses encode_addrvec_pvec_as_addr() below version 7.
11fdf7f2
TL
2912 if (target_v < 7) {
2913 encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
2914 } else {
2915 encode(osd_addrs->hb_back_addrs, bl, features);
2916 }
2917 encode(osd_info, bl);
7c673cae
FG
2918 {
2919 // put this in a sorted, ordered map<> so that we encode in a
2920 // deterministic order.
2921 map<entity_addr_t,utime_t> blacklist_map;
2922 for (const auto &addr : blacklist)
2923 blacklist_map.insert(make_pair(addr.first, addr.second));
11fdf7f2
TL
2924 encode(blacklist_map, bl, features);
2925 }
2926 if (target_v < 7) {
2927 encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
2928 } else {
2929 encode(osd_addrs->cluster_addrs, bl, features);
2930 }
2931 encode(cluster_snapshot_epoch, bl);
2932 encode(cluster_snapshot, bl);
2933 encode(*osd_uuid, bl);
2934 encode(osd_xinfo, bl);
2935 if (target_v < 7) {
2936 encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
2937 } else {
2938 encode(osd_addrs->hb_front_addrs, bl, features);
2939 }
// Note the on-wire order of the ratios: nearfull, full, backfillfull.
7c673cae 2940 if (target_v >= 2) {
11fdf7f2
TL
2941 encode(nearfull_ratio, bl);
2942 encode(full_ratio, bl);
2943 encode(backfillfull_ratio, bl);
31f18b77
FG
2944 }
2945 // 4 was string-based new_require_min_compat_client
2946 if (target_v >= 5) {
11fdf7f2
TL
2947 encode(require_min_compat_client, bl);
2948 encode(require_osd_release, bl);
2949 }
2950 if (target_v >= 6) {
2951 encode(removed_snaps_queue, bl);
7c673cae 2952 }
81eedcae
TL
2953 if (target_v >= 8) {
2954 encode(crush_node_flags, bl);
2955 }
2956 if (target_v >= 9) {
2957 encode(device_class_flags, bl);
2958 }
7c673cae
FG
2959 ENCODE_FINISH(bl); // osd-only data
2960 }
2961
// Reserve sizeof(uint32_t) bytes for the crc now; the value is filled in
// after the wrapper has been finished.
11fdf7f2
TL
2962 crc_offset = bl.length();
2963 crc_filler = bl.append_hole(sizeof(uint32_t));
7c673cae
FG
2964 tail_offset = bl.length();
2965
2966 ENCODE_FINISH(bl); // meta-encoding wrapper
2967
2968 // fill in crc
// The crc is seeded with -1 and covers [start_offset, crc_offset), then,
// if anything follows the hole, [tail_offset, end).  The hole itself is
// excluded.
2969 bufferlist front;
11fdf7f2 2970 front.substr_of(bl, start_offset, crc_offset - start_offset);
7c673cae
FG
2971 crc = front.crc32c(-1);
2972 if (tail_offset < bl.length()) {
2973 bufferlist tail;
2974 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2975 crc = tail.crc32c(crc);
2976 }
// Written into the hole as a ceph_le32.
2977 ceph_le32 crc_le;
2978 crc_le = crc;
11fdf7f2 2979 crc_filler->copy_in(4, (char*)&crc_le);
7c673cae
FG
2980 crc_defined = true;
2981}
2982
11fdf7f2
TL
2983/* for a description of osdmap versions, and when they were introduced, please
2984 * refer to
2985 * doc/dev/osd_internals/osdmap_versions.txt
2986 */
7c673cae
FG
2987void OSDMap::decode(bufferlist& bl)
2988{
11fdf7f2 2989 auto p = bl.cbegin();
7c673cae
FG
2990 decode(p);
2991}
2992
// Decode the classic layout: a single leading struct version `v` for the
// client fields, followed by an "extended" section with its own version
// `ev`.
//
// Fields absent from the version being read are given fallback values
// (containers resized to match their peers, or left untouched), and
// osd_primary_affinity is always reset.  post_decode() is called at the end.
11fdf7f2 2993void OSDMap::decode_classic(bufferlist::const_iterator& p)
7c673cae 2994{
11fdf7f2 2995 using ceph::decode;
7c673cae
FG
2996 __u32 n, t;
2997 __u16 v;
11fdf7f2 2998 decode(v, p);
7c673cae
FG
2999
3000 // base
11fdf7f2
TL
3001 decode(fsid, p);
3002 decode(epoch, p);
3003 decode(created, p);
3004 decode(modified, p);
7c673cae
FG
3005
// Before v6 the pool tables are read entry by entry with 32-bit ids; where
// pool_max sits in the stream depends on the exact version:
//   v < 4  : pool_max first (as a signed 32-bit value), then pools
//   v == 4 : pools, then pool_max
//   v == 5 : pools, pool_name, then pool_max
// From v6 on the generic decoders are used for all three.
3006 if (v < 6) {
3007 if (v < 4) {
3008 int32_t max_pools = 0;
11fdf7f2 3009 decode(max_pools, p);
7c673cae
FG
3010 pool_max = max_pools;
3011 }
3012 pools.clear();
11fdf7f2 3013 decode(n, p);
7c673cae 3014 while (n--) {
11fdf7f2
TL
3015 decode(t, p);
3016 decode(pools[t], p);
7c673cae
FG
3017 }
3018 if (v == 4) {
11fdf7f2 3019 decode(n, p);
7c673cae
FG
3020 pool_max = n;
3021 } else if (v == 5) {
3022 pool_name.clear();
11fdf7f2 3023 decode(n, p);
7c673cae 3024 while (n--) {
11fdf7f2
TL
3025 decode(t, p);
3026 decode(pool_name[t], p);
7c673cae 3027 }
11fdf7f2 3028 decode(n, p);
7c673cae
FG
3029 pool_max = n;
3030 }
3031 } else {
11fdf7f2
TL
3032 decode(pools, p);
3033 decode(pool_name, p);
3034 decode(pool_max, p);
7c673cae
FG
3035 }
3036 // kludge around some old bug that zeroed out pool_max (#2307)
3037 if (pools.size() && pool_max < pools.rbegin()->first) {
3038 pool_max = pools.rbegin()->first;
3039 }
3040
11fdf7f2 3041 decode(flags, p);
7c673cae 3042
11fdf7f2 3043 decode(max_osd, p);
// The classic layout stores one byte per osd state; decode into a byte
// vector and copy element-wise into osd_state.
31f18b77
FG
3044 {
3045 vector<uint8_t> os;
11fdf7f2 3046 decode(os, p);
31f18b77
FG
3047 osd_state.resize(os.size());
3048 for (unsigned i = 0; i < os.size(); ++i) {
3049 osd_state[i] = os[i];
3050 }
3051 }
11fdf7f2
TL
3052 decode(osd_weight, p);
3053 decode(osd_addrs->client_addrs, p);
// Up to v5, pg_temp keys are old_pg_t values read raw and converted to
// pg_t one entry at a time.  The inner `v` (the osd vector) shadows the
// struct version only inside the loop body.
7c673cae
FG
3054 if (v <= 5) {
3055 pg_temp->clear();
11fdf7f2 3056 decode(n, p);
7c673cae
FG
3057 while (n--) {
3058 old_pg_t opg;
3059 ::decode_raw(opg, p);
31f18b77 3060 mempool::osdmap::vector<int32_t> v;
11fdf7f2 3061 decode(v, p);
31f18b77 3062 pg_temp->set(pg_t(opg), v);
7c673cae
FG
3063 }
3064 } else {
11fdf7f2 3065 decode(*pg_temp, p);
7c673cae
FG
3066 }
3067
3068 // crush
3069 bufferlist cbl;
11fdf7f2
TL
3070 decode(cbl, p);
3071 auto cblp = cbl.cbegin();
7c673cae
FG
3072 crush->decode(cblp);
3073
3074 // extended
// ev is only present on the wire from v5 on; otherwise it stays 0, which
// disables every "ev >= N" branch below.
3075 __u16 ev = 0;
3076 if (v >= 5)
11fdf7f2
TL
3077 decode(ev, p);
3078 decode(osd_addrs->hb_back_addrs, p);
3079 decode(osd_info, p);
// Before v5 pool_name was not part of the client section above; it is
// read here instead.
7c673cae 3080 if (v < 5)
11fdf7f2 3081 decode(pool_name, p);
7c673cae 3082
11fdf7f2 3083 decode(blacklist, p);
// Missing cluster addresses: size the table to match client_addrs.
7c673cae 3084 if (ev >= 6)
11fdf7f2 3085 decode(osd_addrs->cluster_addrs, p);
7c673cae 3086 else
11fdf7f2 3087 osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
7c673cae
FG
3088
// NOTE(review): when ev < 7 the cluster_snapshot fields are neither read
// nor reset here; they keep their prior values.
3089 if (ev >= 7) {
11fdf7f2
TL
3090 decode(cluster_snapshot_epoch, p);
3091 decode(cluster_snapshot, p);
7c673cae
FG
3092 }
3093
// Missing uuid / xinfo tables are sized to max_osd.
3094 if (ev >= 8) {
11fdf7f2 3095 decode(*osd_uuid, p);
7c673cae
FG
3096 } else {
3097 osd_uuid->resize(max_osd);
3098 }
3099 if (ev >= 9)
11fdf7f2 3100 decode(osd_xinfo, p);
7c673cae
FG
3101 else
3102 osd_xinfo.resize(max_osd);
3103
// Missing front heartbeat addresses: size the table to match hb_back_addrs.
3104 if (ev >= 10)
11fdf7f2 3105 decode(osd_addrs->hb_front_addrs, p);
7c673cae 3106 else
11fdf7f2 3107 osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
7c673cae
FG
3108
// The classic layout has no primary-affinity field.
3109 osd_primary_affinity.reset();
3110
3111 post_decode();
3112}
3113
/**
 * Decode a full OSDMap from the buffer iterator.
 *
 * The encoding consists of a wrapper section that contains a
 * "client-usable" section, an "osd-only" section and, for wrapper
 * versions >= 8, a trailing crc.  Wrapper versions older than 7 are
 * handed to decode_classic() instead.  When a crc is present it is
 * verified against the bytes that were consumed and
 * buffer::malformed_input is thrown on a mismatch.  post_decode() is
 * called last to rebuild derived state.
 *
 * @param bl iterator positioned at the start of an encoded OSDMap
 */
void OSDMap::decode(bufferlist::const_iterator& bl)
{
  using ceph::decode;
  /**
   * Older encodings of the OSDMap had a single struct_v which
   * covered the whole encoding, and was prior to our modern
   * stuff which includes a compatv and a size. So if we see
   * a struct_v < 7, we must rewind to the beginning and use our
   * classic decoder.
   */
  // remember where we started: needed both for the rewind and for the crc
  size_t start_offset = bl.get_off();
  // stays 0 unless a crc field is decoded below; doubles as "crc present"
  size_t tail_offset = 0;
  // NOTE(review): crc_tail is declared here but never used in this function
  bufferlist crc_front, crc_tail;

  // struct_v is not declared in this function; presumably the DECODE_START*
  // macros introduce it for the section they open -- confirm in encoding.h
  DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
  if (struct_v < 7) {
    bl.seek(start_offset);
    decode_classic(bl);
    return;
  }
  /**
   * Since we made it past that hurdle, we can use our normal paths.
   */
  {
    DECODE_START(9, bl); // client-usable data
    // base
    decode(fsid, bl);
    decode(epoch, bl);
    decode(created, bl);
    decode(modified, bl);

    decode(pools, bl);
    decode(pool_name, bl);
    decode(pool_max, bl);

    decode(flags, bl);

    decode(max_osd, bl);
    if (struct_v >= 5) {
      decode(osd_state, bl);
    } else {
      // older sections carry one byte of state per osd; widen each entry
      vector<uint8_t> os;
      decode(os, bl);
      osd_state.resize(os.size());
      for (unsigned i = 0; i < os.size(); ++i) {
        osd_state[i] = os[i];
      }
    }
    decode(osd_weight, bl);
    decode(osd_addrs->client_addrs, bl);

    decode(*pg_temp, bl);
    decode(*primary_temp, bl);
    // dates back to firefly. version increased from 2 to 3 still in firefly.
    // do we really still need to keep this around? even for old clients?
    if (struct_v >= 2) {
      osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
      decode(*osd_primary_affinity, bl);
      // an empty vector is represented as "no affinity table at all"
      if (osd_primary_affinity->empty())
        osd_primary_affinity.reset();
    } else {
      osd_primary_affinity.reset();
    }

    // crush
    // the crush map is embedded as its own length-prefixed buffer
    bufferlist cbl;
    decode(cbl, bl);
    auto cblp = cbl.cbegin();
    crush->decode(cblp);
    // added in firefly; version increased in luminous, so it affects
    // giant, hammer, infernallis, jewel, and kraken. probably should be left
    // alone until we require clients to be all luminous?
    if (struct_v >= 3) {
      decode(erasure_code_profiles, bl);
    } else {
      erasure_code_profiles.clear();
    }
    // version increased from 3 to 4 still in luminous, so same as above
    // applies.
    if (struct_v >= 4) {
      decode(pg_upmap, bl);
      decode(pg_upmap_items, bl);
    } else {
      pg_upmap.clear();
      pg_upmap_items.clear();
    }
    // again, version increased from 5 to 6 still in luminous, so above
    // applies.
    // NOTE(review): unlike the fields above, crush_version, the snap maps
    // and the last_*_change stamps are left untouched (not reset) when the
    // section is too old to carry them.
    if (struct_v >= 6) {
      decode(crush_version, bl);
    }
    // version increase from 6 to 7 in mimic
    if (struct_v >= 7) {
      decode(new_removed_snaps, bl);
      decode(new_purged_snaps, bl);
    }
    // version increase from 7 to 8, 8 to 9, in nautilus.
    if (struct_v >= 9) {
      decode(last_up_change, bl);
      decode(last_in_change, bl);
    }
    DECODE_FINISH(bl); // client-usable data
  }

  {
    DECODE_START(9, bl); // extended, osd-only data
    decode(osd_addrs->hb_back_addrs, bl);
    decode(osd_info, bl);
    decode(blacklist, bl);
    decode(osd_addrs->cluster_addrs, bl);
    decode(cluster_snapshot_epoch, bl);
    decode(cluster_snapshot, bl);
    decode(*osd_uuid, bl);
    decode(osd_xinfo, bl);
    decode(osd_addrs->hb_front_addrs, bl);
    //
    if (struct_v >= 2) {
      decode(nearfull_ratio, bl);
      decode(full_ratio, bl);
    } else {
      nearfull_ratio = 0;
      full_ratio = 0;
    }
    if (struct_v >= 3) {
      decode(backfillfull_ratio, bl);
    } else {
      backfillfull_ratio = 0;
    }
    // version 4 (only) stored the minimum client release as a name string
    if (struct_v == 4) {
      string r;
      decode(r, bl);
      if (r.length())
        require_min_compat_client = ceph_release_from_name(r.c_str());
    }
    if (struct_v >= 5) {
      decode(require_min_compat_client, bl);
      decode(require_osd_release, bl);
      // the decoded release implies certain map flags; set them here
      if (require_osd_release >= CEPH_RELEASE_NAUTILUS) {
        flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
      }
      if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
        flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
        flags |= CEPH_OSDMAP_RECOVERY_DELETES;
      }
    } else {
      // no explicit require_osd_release field: derive it from the
      // legacy REQUIRE_* flag bits, newest release first
      if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
        // only for compat with post-kraken pre-luminous test clusters
        require_osd_release = CEPH_RELEASE_LUMINOUS;
        flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
        flags |= CEPH_OSDMAP_RECOVERY_DELETES;
      } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
        require_osd_release = CEPH_RELEASE_KRAKEN;
      } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
        require_osd_release = CEPH_RELEASE_JEWEL;
      } else {
        require_osd_release = 0;
      }
    }
    if (struct_v >= 6) {
      decode(removed_snaps_queue, bl);
    }
    if (struct_v >= 8) {
      decode(crush_node_flags, bl);
    } else {
      crush_node_flags.clear();
    }
    if (struct_v >= 9) {
      decode(device_class_flags, bl);
    } else {
      device_class_flags.clear();
    }
    DECODE_FINISH(bl); // osd-only data
  }

  // NOTE(review): both inner sections were opened in their own braces, so
  // struct_v here is presumably the wrapper's version again -- confirm
  // against the DECODE_START macros.
  if (struct_v >= 8) {
    // capture everything consumed so far (before the crc field itself)
    crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
    decode(crc, bl);
    tail_offset = bl.get_off();
    crc_defined = true;
  } else {
    crc_defined = false;
    crc = 0;
  }

  DECODE_FINISH(bl); // wrapper

  if (tail_offset) {
    // verify crc
    // the checksum is seeded with -1, run over the bytes preceding the crc
    // field, and then continued over any bytes between the crc field and
    // the current position (after the wrapper has been finished)
    uint32_t actual = crc_front.crc32c(-1);
    if (tail_offset < bl.get_off()) {
      bufferlist tail;
      tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
      actual = tail.crc32c(actual);
    }
    if (crc != actual) {
      ostringstream ss;
      ss << "bad crc, actual " << actual << " != expected " << crc;
      string s = ss.str();
      throw buffer::malformed_input(s.c_str());
    }
  }

  post_decode();
}
3318
3319void OSDMap::post_decode()
3320{
3321 // index pool names
3322 name_pool.clear();
3323 for (const auto &pname : pool_name) {
3324 name_pool[pname.second] = pname.first;
3325 }
3326
3327 calc_num_osds();
3328 _calc_up_osd_features();
3329}
3330
3331void OSDMap::dump_erasure_code_profiles(
3332 const mempool::osdmap::map<string,map<string,string>>& profiles,
3333 Formatter *f)
3334{
3335 f->open_object_section("erasure_code_profiles");
3336 for (const auto &profile : profiles) {
3337 f->open_object_section(profile.first.c_str());
3338 for (const auto &profm : profile.second) {
3339 f->dump_string(profm.first.c_str(), profm.second.c_str());
3340 }
3341 f->close_section();
3342 }
3343 f->close_section();
3344}
3345
3346void OSDMap::dump(Formatter *f) const
3347{
3348 f->dump_int("epoch", get_epoch());
3349 f->dump_stream("fsid") << get_fsid();
3350 f->dump_stream("created") << get_created();
3351 f->dump_stream("modified") << get_modified();
11fdf7f2
TL
3352 f->dump_stream("last_up_change") << last_up_change;
3353 f->dump_stream("last_in_change") << last_in_change;
7c673cae 3354 f->dump_string("flags", get_flag_string());
11fdf7f2
TL
3355 f->dump_unsigned("flags_num", flags);
3356 f->open_array_section("flags_set");
3357 set<string> flagset;
3358 get_flag_set(&flagset);
3359 for (auto p : flagset) {
3360 f->dump_string("flag", p);
3361 }
3362 f->close_section();
31f18b77 3363 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
3364 f->dump_float("full_ratio", full_ratio);
3365 f->dump_float("backfillfull_ratio", backfillfull_ratio);
3366 f->dump_float("nearfull_ratio", nearfull_ratio);
3367 f->dump_string("cluster_snapshot", get_cluster_snapshot());
3368 f->dump_int("pool_max", get_pool_max());
3369 f->dump_int("max_osd", get_max_osd());
31f18b77
FG
3370 f->dump_string("require_min_compat_client",
3371 ceph_release_name(require_min_compat_client));
3372 f->dump_string("min_compat_client",
3373 ceph_release_name(get_min_compat_client()));
3374 f->dump_string("require_osd_release",
3375 ceph_release_name(require_osd_release));
7c673cae
FG
3376
3377 f->open_array_section("pools");
3378 for (const auto &pool : pools) {
3379 std::string name("<unknown>");
3380 const auto &pni = pool_name.find(pool.first);
3381 if (pni != pool_name.end())
3382 name = pni->second;
3383 f->open_object_section("pool");
3384 f->dump_int("pool", pool.first);
3385 f->dump_string("pool_name", name);
3386 pool.second.dump(f);
3387 f->close_section();
3388 }
3389 f->close_section();
3390
3391 f->open_array_section("osds");
3392 for (int i=0; i<get_max_osd(); i++)
3393 if (exists(i)) {
3394 f->open_object_section("osd_info");
3395 f->dump_int("osd", i);
3396 f->dump_stream("uuid") << get_uuid(i);
3397 f->dump_int("up", is_up(i));
3398 f->dump_int("in", is_in(i));
3399 f->dump_float("weight", get_weightf(i));
3400 f->dump_float("primary_affinity", get_primary_affinityf(i));
3401 get_info(i).dump(f);
11fdf7f2
TL
3402 f->dump_object("public_addrs", get_addrs(i));
3403 f->dump_object("cluster_addrs", get_cluster_addrs(i));
3404 f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(i));
3405 f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(i));
3406 // compat
3407 f->dump_stream("public_addr") << get_addrs(i).get_legacy_str();
3408 f->dump_stream("cluster_addr") << get_cluster_addrs(i).get_legacy_str();
3409 f->dump_stream("heartbeat_back_addr")
3410 << get_hb_back_addrs(i).get_legacy_str();
3411 f->dump_stream("heartbeat_front_addr")
3412 << get_hb_front_addrs(i).get_legacy_str();
7c673cae
FG
3413
3414 set<string> st;
3415 get_state(i, st);
3416 f->open_array_section("state");
3417 for (const auto &state : st)
3418 f->dump_string("state", state);
3419 f->close_section();
3420
3421 f->close_section();
3422 }
3423 f->close_section();
3424
3425 f->open_array_section("osd_xinfo");
3426 for (int i=0; i<get_max_osd(); i++) {
3427 if (exists(i)) {
3428 f->open_object_section("xinfo");
3429 f->dump_int("osd", i);
3430 osd_xinfo[i].dump(f);
3431 f->close_section();
3432 }
3433 }
3434 f->close_section();
3435
3436 f->open_array_section("pg_upmap");
3437 for (auto& p : pg_upmap) {
3438 f->open_object_section("mapping");
3439 f->dump_stream("pgid") << p.first;
3440 f->open_array_section("osds");
3441 for (auto q : p.second) {
3442 f->dump_int("osd", q);
3443 }
3444 f->close_section();
3445 f->close_section();
3446 }
3447 f->close_section();
3448 f->open_array_section("pg_upmap_items");
3449 for (auto& p : pg_upmap_items) {
3450 f->open_object_section("mapping");
3451 f->dump_stream("pgid") << p.first;
3452 f->open_array_section("mappings");
3453 for (auto& q : p.second) {
3454 f->open_object_section("mapping");
3455 f->dump_int("from", q.first);
3456 f->dump_int("to", q.second);
3457 f->close_section();
3458 }
3459 f->close_section();
3460 f->close_section();
3461 }
3462 f->close_section();
3463 f->open_array_section("pg_temp");
31f18b77 3464 pg_temp->dump(f);
7c673cae
FG
3465 f->close_section();
3466
3467 f->open_array_section("primary_temp");
3468 for (const auto &pg : *primary_temp) {
3469 f->dump_stream("pgid") << pg.first;
3470 f->dump_int("osd", pg.second);
3471 }
3472 f->close_section(); // primary_temp
3473
3474 f->open_object_section("blacklist");
3475 for (const auto &addr : blacklist) {
3476 stringstream ss;
3477 ss << addr.first;
3478 f->dump_stream(ss.str().c_str()) << addr.second;
3479 }
3480 f->close_section();
3481
3482 dump_erasure_code_profiles(erasure_code_profiles, f);
11fdf7f2
TL
3483
3484 f->open_array_section("removed_snaps_queue");
3485 for (auto& p : removed_snaps_queue) {
3486 f->open_object_section("pool");
3487 f->dump_int("pool", p.first);
3488 f->open_array_section("snaps");
3489 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3490 f->open_object_section("interval");
3491 f->dump_unsigned("begin", q.get_start());
3492 f->dump_unsigned("length", q.get_len());
3493 f->close_section();
3494 }
3495 f->close_section();
3496 f->close_section();
3497 }
3498 f->close_section();
3499 f->open_array_section("new_removed_snaps");
3500 for (auto& p : new_removed_snaps) {
3501 f->open_object_section("pool");
3502 f->dump_int("pool", p.first);
3503 f->open_array_section("snaps");
3504 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3505 f->open_object_section("interval");
3506 f->dump_unsigned("begin", q.get_start());
3507 f->dump_unsigned("length", q.get_len());
3508 f->close_section();
3509 }
3510 f->close_section();
3511 f->close_section();
3512 }
3513 f->close_section();
3514 f->open_array_section("new_purged_snaps");
3515 for (auto& p : new_purged_snaps) {
3516 f->open_object_section("pool");
3517 f->dump_int("pool", p.first);
3518 f->open_array_section("snaps");
3519 for (auto q = p.second.begin(); q != p.second.end(); ++q) {
3520 f->open_object_section("interval");
3521 f->dump_unsigned("begin", q.get_start());
3522 f->dump_unsigned("length", q.get_len());
3523 f->close_section();
3524 }
3525 f->close_section();
3526 f->close_section();
3527 }
3528 f->close_section();
81eedcae
TL
3529 f->open_object_section("crush_node_flags");
3530 for (auto& i : crush_node_flags) {
3531 string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
3532 : stringify(i.first);
3533 f->open_array_section(s.c_str());
3534 set<string> st;
3535 calc_state_set(i.second, st);
3536 for (auto& j : st) {
3537 f->dump_string("flag", j);
3538 }
3539 f->close_section();
3540 }
3541 f->close_section();
3542 f->open_object_section("device_class_flags");
3543 for (auto& i : device_class_flags) {
3544 const char* class_name = crush->get_class_name(i.first);
3545 string s = class_name ? class_name : stringify(i.first);
3546 f->open_array_section(s.c_str());
3547 set<string> st;
3548 calc_state_set(i.second, st);
3549 for (auto& j : st) {
3550 f->dump_string("flag", j);
3551 }
3552 f->close_section();
3553 }
3554 f->close_section();
7c673cae
FG
3555}
3556
3557void OSDMap::generate_test_instances(list<OSDMap*>& o)
3558{
3559 o.push_back(new OSDMap);
3560
3561 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3562 o.push_back(new OSDMap);
3563 uuid_d fsid;
224ce89b 3564 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae
FG
3565 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
3566 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
3567 cct->put();
3568}
3569
3570string OSDMap::get_flag_string(unsigned f)
3571{
3572 string s;
3573 if ( f& CEPH_OSDMAP_NEARFULL)
3574 s += ",nearfull";
3575 if (f & CEPH_OSDMAP_FULL)
3576 s += ",full";
3577 if (f & CEPH_OSDMAP_PAUSERD)
3578 s += ",pauserd";
3579 if (f & CEPH_OSDMAP_PAUSEWR)
3580 s += ",pausewr";
3581 if (f & CEPH_OSDMAP_PAUSEREC)
3582 s += ",pauserec";
3583 if (f & CEPH_OSDMAP_NOUP)
3584 s += ",noup";
3585 if (f & CEPH_OSDMAP_NODOWN)
3586 s += ",nodown";
3587 if (f & CEPH_OSDMAP_NOOUT)
3588 s += ",noout";
3589 if (f & CEPH_OSDMAP_NOIN)
3590 s += ",noin";
3591 if (f & CEPH_OSDMAP_NOBACKFILL)
3592 s += ",nobackfill";
3593 if (f & CEPH_OSDMAP_NOREBALANCE)
3594 s += ",norebalance";
3595 if (f & CEPH_OSDMAP_NORECOVER)
3596 s += ",norecover";
3597 if (f & CEPH_OSDMAP_NOSCRUB)
3598 s += ",noscrub";
3599 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3600 s += ",nodeep-scrub";
3601 if (f & CEPH_OSDMAP_NOTIERAGENT)
3602 s += ",notieragent";
11fdf7f2
TL
3603 if (f & CEPH_OSDMAP_NOSNAPTRIM)
3604 s += ",nosnaptrim";
7c673cae
FG
3605 if (f & CEPH_OSDMAP_SORTBITWISE)
3606 s += ",sortbitwise";
3607 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3608 s += ",require_jewel_osds";
3609 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3610 s += ",require_kraken_osds";
3611 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3612 s += ",require_luminous_osds";
c07f9fc5
FG
3613 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3614 s += ",recovery_deletes";
181888fb
FG
3615 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3616 s += ",purged_snapdirs";
f64942e4
AA
3617 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3618 s += ",pglog_hardlimit";
7c673cae
FG
3619 if (s.length())
3620 s.erase(0, 1);
3621 return s;
3622}
3623
3624string OSDMap::get_flag_string() const
3625{
3626 return get_flag_string(flags);
3627}
3628
7c673cae
FG
3629void OSDMap::print_pools(ostream& out) const
3630{
3631 for (const auto &pool : pools) {
3632 std::string name("<unknown>");
3633 const auto &pni = pool_name.find(pool.first);
3634 if (pni != pool_name.end())
3635 name = pni->second;
3636 out << "pool " << pool.first
3637 << " '" << name
3638 << "' " << pool.second << "\n";
3639
3640 for (const auto &snap : pool.second.snaps)
3641 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3642
3643 if (!pool.second.removed_snaps.empty())
3644 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
11fdf7f2
TL
3645 auto p = removed_snaps_queue.find(pool.first);
3646 if (p != removed_snaps_queue.end()) {
3647 out << "\tremoved_snaps_queue " << p->second << "\n";
3648 }
7c673cae
FG
3649 }
3650 out << std::endl;
3651}
3652
3653void OSDMap::print(ostream& out) const
3654{
3655 out << "epoch " << get_epoch() << "\n"
3656 << "fsid " << get_fsid() << "\n"
3657 << "created " << get_created() << "\n"
3658 << "modified " << get_modified() << "\n";
3659
3660 out << "flags " << get_flag_string() << "\n";
31f18b77 3661 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
3662 out << "full_ratio " << full_ratio << "\n";
3663 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3664 out << "nearfull_ratio " << nearfull_ratio << "\n";
31f18b77
FG
3665 if (require_min_compat_client > 0) {
3666 out << "require_min_compat_client "
3667 << ceph_release_name(require_min_compat_client) << "\n";
7c673cae 3668 }
31f18b77
FG
3669 out << "min_compat_client " << ceph_release_name(get_min_compat_client())
3670 << "\n";
224ce89b
WB
3671 if (require_osd_release > 0) {
3672 out << "require_osd_release " << ceph_release_name(require_osd_release)
3673 << "\n";
3674 }
7c673cae
FG
3675 if (get_cluster_snapshot().length())
3676 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3677 out << "\n";
3678
3679 print_pools(out);
3680
3681 out << "max_osd " << get_max_osd() << "\n";
3682 for (int i=0; i<get_max_osd(); i++) {
3683 if (exists(i)) {
3684 out << "osd." << i;
3685 out << (is_up(i) ? " up ":" down");
3686 out << (is_in(i) ? " in ":" out");
3687 out << " weight " << get_weightf(i);
3688 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
3689 out << " primary_affinity " << get_primary_affinityf(i);
3690 const osd_info_t& info(get_info(i));
3691 out << " " << info;
11fdf7f2 3692 out << " " << get_addrs(i) << " " << get_cluster_addrs(i);
7c673cae
FG
3693 set<string> st;
3694 get_state(i, st);
3695 out << " " << st;
3696 if (!get_uuid(i).is_zero())
3697 out << " " << get_uuid(i);
3698 out << "\n";
3699 }
3700 }
3701 out << std::endl;
3702
3703 for (auto& p : pg_upmap) {
3704 out << "pg_upmap " << p.first << " " << p.second << "\n";
3705 }
3706 for (auto& p : pg_upmap_items) {
3707 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3708 }
3709
3710 for (const auto pg : *pg_temp)
3711 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3712
3713 for (const auto pg : *primary_temp)
3714 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3715
3716 for (const auto &addr : blacklist)
3717 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
7c673cae
FG
3718}
3719
3720class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3721public:
3722 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
3723
3724 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3725 unsigned f)
c07f9fc5 3726 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3727
3728 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3729 if (!filter) {
3730 return true; // normal case
3731 }
3732 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3733 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3734 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3735 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3736 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3737 return true;
31f18b77 3738 }
c07f9fc5 3739 return false;
31f18b77
FG
3740 }
3741
3742 bool should_dump_empty_bucket() const override {
3743 return !filter;
3744 }
7c673cae 3745
11fdf7f2 3746 void init_table(TextTable *tbl) {
7c673cae 3747 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3748 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3749 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3750 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 3751 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 3752 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3753 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
3754 }
3755 void dump(TextTable *tbl, string& bucket) {
3756 init_table(tbl);
7c673cae 3757
11fdf7f2
TL
3758 if (!bucket.empty()) {
3759 set_root(bucket);
3760 Parent::dump(tbl);
3761 } else {
3762 Parent::dump(tbl);
3763 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3764 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3765 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
3766 }
31f18b77 3767 }
7c673cae
FG
3768 }
3769 }
3770
3771protected:
3772 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
3773 const char *c = crush->get_item_class(qi.id);
3774 if (!c)
3775 c = "";
7c673cae 3776 *tbl << qi.id
224ce89b 3777 << c
7c673cae
FG
3778 << weightf_t(qi.weight);
3779
3780 ostringstream name;
3781 for (int k = 0; k < qi.depth; k++)
3782 name << " ";
3783 if (qi.is_bucket()) {
3784 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3785 << crush->get_item_name(qi.id);
3786 } else {
3787 name << "osd." << qi.id;
3788 }
3789 *tbl << name.str();
3790
3791 if (!qi.is_bucket()) {
3792 if (!osdmap->exists(qi.id)) {
3793 *tbl << "DNE"
3794 << 0;
3795 } else {
c07f9fc5
FG
3796 string s;
3797 if (osdmap->is_up(qi.id)) {
3798 s = "up";
3799 } else if (osdmap->is_destroyed(qi.id)) {
3800 s = "destroyed";
3801 } else {
3802 s = "down";
3803 }
3804 *tbl << s
7c673cae
FG
3805 << weightf_t(osdmap->get_weightf(qi.id))
3806 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3807 }
3808 }
3809 *tbl << TextTable::endrow;
3810 }
3811
3812private:
3813 const OSDMap *osdmap;
31f18b77 3814 const unsigned filter;
7c673cae
FG
3815};
3816
3817class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3818public:
3819 typedef CrushTreeDumper::FormattingDumper Parent;
3820
31f18b77
FG
3821 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3822 unsigned f)
c07f9fc5 3823 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3824
3825 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3826 if (!filter) {
3827 return true; // normal case
3828 }
3829 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3830 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3831 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3832 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3833 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3834 return true;
31f18b77 3835 }
c07f9fc5 3836 return false;
31f18b77
FG
3837 }
3838
3839 bool should_dump_empty_bucket() const override {
3840 return !filter;
3841 }
7c673cae 3842
11fdf7f2
TL
3843 void dump(Formatter *f, string& bucket) {
3844 if (!bucket.empty()) {
3845 set_root(bucket);
3846 f->open_array_section("nodes");
3847 Parent::dump(f);
3848 f->close_section();
3849 } else {
3850 f->open_array_section("nodes");
3851 Parent::dump(f);
3852 f->close_section();
3853 f->open_array_section("stray");
3854 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3855 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
3856 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
3857 }
3858 f->close_section();
7c673cae 3859 }
7c673cae
FG
3860 }
3861
3862protected:
3863 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3864 Parent::dump_item_fields(qi, f);
3865 if (!qi.is_bucket())
3866 {
c07f9fc5
FG
3867 string s;
3868 if (osdmap->is_up(qi.id)) {
3869 s = "up";
3870 } else if (osdmap->is_destroyed(qi.id)) {
3871 s = "destroyed";
3872 } else {
3873 s = "down";
3874 }
7c673cae 3875 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 3876 f->dump_string("status", s);
7c673cae
FG
3877 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3878 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3879 }
3880 }
3881
3882private:
3883 const OSDMap *osdmap;
31f18b77 3884 const unsigned filter;
7c673cae
FG
3885};
3886
11fdf7f2 3887void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
7c673cae 3888{
31f18b77 3889 if (f) {
11fdf7f2 3890 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
31f18b77 3891 } else {
11fdf7f2 3892 ceph_assert(out);
7c673cae 3893 TextTable tbl;
11fdf7f2 3894 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
7c673cae
FG
3895 *out << tbl;
3896 }
3897}
3898
224ce89b 3899void OSDMap::print_summary(Formatter *f, ostream& out,
11fdf7f2 3900 const string& prefix, bool extra) const
7c673cae
FG
3901{
3902 if (f) {
3903 f->open_object_section("osdmap");
3904 f->dump_int("epoch", get_epoch());
3905 f->dump_int("num_osds", get_num_osds());
3906 f->dump_int("num_up_osds", get_num_up_osds());
3907 f->dump_int("num_in_osds", get_num_in_osds());
7c673cae
FG
3908 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3909 f->close_section();
3910 } else {
11fdf7f2 3911 utime_t now = ceph_clock_now();
31f18b77 3912 out << get_num_osds() << " osds: "
11fdf7f2
TL
3913 << get_num_up_osds() << " up";
3914 if (last_up_change != utime_t()) {
3915 out << " (since " << utimespan_str(now - last_up_change) << ")";
3916 }
3917 out << ", " << get_num_in_osds() << " in";
3918 if (last_in_change != utime_t()) {
3919 out << " (since " << utimespan_str(now - last_in_change) << ")";
3920 }
3921 if (extra)
3922 out << "; epoch: e" << get_epoch();
7c673cae
FG
3923 if (get_num_pg_temp())
3924 out << "; " << get_num_pg_temp() << " remapped pgs";
3925 out << "\n";
3926 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
3927 if (important_flags)
224ce89b 3928 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
3929 }
3930}
3931
3932void OSDMap::print_oneline_summary(ostream& out) const
3933{
3934 out << "e" << get_epoch() << ": "
31f18b77 3935 << get_num_osds() << " total, "
7c673cae
FG
3936 << get_num_up_osds() << " up, "
3937 << get_num_in_osds() << " in";
7c673cae
FG
3938}
3939
3efd9988 3940bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
3941{
3942 for (const auto &pool : pools) {
3efd9988 3943 if (pool.second.crush_rule == rule_id)
7c673cae
FG
3944 return true;
3945 }
3946 return false;
3947}
3948
3efd9988
FG
3949int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
3950 ostream *ss) const
3951{
3952 for (auto& i : pools) {
3953 auto& pool = i.second;
3954 int ruleno = pool.get_crush_rule();
3955 if (!newcrush->rule_exists(ruleno)) {
3956 *ss << "pool " << i.first << " references crush_rule " << ruleno
3957 << " but it is not present";
3958 return -EINVAL;
3959 }
3960 if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
3961 *ss << "rule " << ruleno << " mask ruleset does not match rule id";
3962 return -EINVAL;
3963 }
3964 if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
3965 *ss << "pool " << i.first << " type does not match rule " << ruleno;
3966 return -EINVAL;
3967 }
11fdf7f2
TL
3968 int poolsize = pool.get_size();
3969 if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
3970 poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
3971 *ss << "pool " << i.first << " size " << poolsize << " does not"
3efd9988
FG
3972 << " fall within rule " << ruleno
3973 << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
3974 << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
3975 return -EINVAL;
3976 }
3977 }
3978 return 0;
3979}
3980
224ce89b
WB
3981int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
3982 int nosd, int pg_bits, int pgp_bits,
3983 bool default_pool)
7c673cae 3984{
224ce89b
WB
3985 ldout(cct, 10) << "build_simple on " << nosd
3986 << " osds" << dendl;
7c673cae
FG
3987 epoch = e;
3988 set_fsid(fsid);
3989 created = modified = ceph_clock_now();
3990
3991 if (nosd >= 0) {
3992 set_max_osd(nosd);
3993 } else {
3994 // count osds
3995 int maxosd = 0;
11fdf7f2 3996 const auto& conf = cct->_conf;
7c673cae 3997 vector<string> sections;
11fdf7f2 3998 conf.get_all_sections(sections);
7c673cae
FG
3999
4000 for (auto &section : sections) {
4001 if (section.find("osd.") != 0)
4002 continue;
4003
4004 const char *begin = section.c_str() + 4;
4005 char *end = (char*)begin;
4006 int o = strtol(begin, &end, 10);
4007 if (*end != '\0')
4008 continue;
4009
4010 if (o > cct->_conf->mon_max_osd) {
4011 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
4012 return -ERANGE;
4013 }
4014
4015 if (o > maxosd)
4016 maxosd = o;
4017 }
4018
4019 set_max_osd(maxosd + 1);
4020 }
4021
7c673cae
FG
4022
4023 stringstream ss;
4024 int r;
4025 if (nosd >= 0)
4026 r = build_simple_crush_map(cct, *crush, nosd, &ss);
4027 else
4028 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
11fdf7f2 4029 ceph_assert(r == 0);
7c673cae
FG
4030
4031 int poolbase = get_max_osd() ? get_max_osd() : 1;
4032
d2e6a577 4033 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
11fdf7f2 4034 ceph_assert(default_replicated_rule >= 0);
7c673cae 4035
224ce89b
WB
4036 if (default_pool) {
4037 // pgp_num <= pg_num
4038 if (pgp_bits > pg_bits)
4039 pgp_bits = pg_bits;
4040
4041 vector<string> pool_names;
4042 pool_names.push_back("rbd");
4043 for (auto &plname : pool_names) {
4044 int64_t pool = ++pool_max;
4045 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
4046 pools[pool].flags = cct->_conf->osd_pool_default_flags;
4047 if (cct->_conf->osd_pool_default_flag_hashpspool)
4048 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
4049 if (cct->_conf->osd_pool_default_flag_nodelete)
4050 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
4051 if (cct->_conf->osd_pool_default_flag_nopgchange)
4052 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
4053 if (cct->_conf->osd_pool_default_flag_nosizechange)
4054 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
11fdf7f2
TL
4055 pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
4056 pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
4057 pools[pool].size);
224ce89b
WB
4058 pools[pool].crush_rule = default_replicated_rule;
4059 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
4060 pools[pool].set_pg_num(poolbase << pg_bits);
4061 pools[pool].set_pgp_num(poolbase << pgp_bits);
11fdf7f2
TL
4062 pools[pool].set_pg_num_target(poolbase << pg_bits);
4063 pools[pool].set_pgp_num_target(poolbase << pgp_bits);
224ce89b 4064 pools[pool].last_change = epoch;
c07f9fc5
FG
4065 pools[pool].application_metadata.insert(
4066 {pg_pool_t::APPLICATION_NAME_RBD, {}});
11fdf7f2
TL
4067 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
4068 cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
4069 pools[pool].pg_autoscale_mode = m >= 0 ? m : 0;
224ce89b
WB
4070 pool_name[pool] = plname;
4071 name_pool[plname] = pool;
4072 }
7c673cae
FG
4073 }
4074
4075 for (int i=0; i<get_max_osd(); i++) {
4076 set_state(i, 0);
4077 set_weight(i, CEPH_OSD_OUT);
4078 }
4079
4080 map<string,string> profile_map;
4081 r = get_erasure_code_profile_default(cct, profile_map, &ss);
4082 if (r < 0) {
4083 lderr(cct) << ss.str() << dendl;
4084 return r;
4085 }
4086 set_erasure_code_profile("default", profile_map);
4087 return 0;
4088}
4089
4090int OSDMap::get_erasure_code_profile_default(CephContext *cct,
4091 map<string,string> &profile_map,
4092 ostream *ss)
4093{
11fdf7f2 4094 int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
7c673cae
FG
4095 *ss,
4096 &profile_map);
4097 return r;
4098}
4099
4100int OSDMap::_build_crush_types(CrushWrapper& crush)
4101{
4102 crush.set_type_name(0, "osd");
4103 crush.set_type_name(1, "host");
4104 crush.set_type_name(2, "chassis");
4105 crush.set_type_name(3, "rack");
4106 crush.set_type_name(4, "row");
4107 crush.set_type_name(5, "pdu");
4108 crush.set_type_name(6, "pod");
4109 crush.set_type_name(7, "room");
4110 crush.set_type_name(8, "datacenter");
11fdf7f2
TL
4111 crush.set_type_name(9, "zone");
4112 crush.set_type_name(10, "region");
4113 crush.set_type_name(11, "root");
4114 return 11;
7c673cae
FG
4115}
4116
4117int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
4118 int nosd, ostream *ss)
4119{
4120 crush.create();
4121
4122 // root
4123 int root_type = _build_crush_types(crush);
4124 int rootid;
4125 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
4126 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4127 ceph_assert(r == 0);
7c673cae
FG
4128 crush.set_item_name(rootid, "default");
4129
4130 for (int o=0; o<nosd; o++) {
4131 map<string,string> loc;
4132 loc["host"] = "localhost";
4133 loc["rack"] = "localrack";
4134 loc["root"] = "default";
4135 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
4136 char name[32];
4137 snprintf(name, sizeof(name), "osd.%d", o);
4138 crush.insert_item(cct, o, 1.0, name, loc);
4139 }
4140
31f18b77 4141 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4142
4143 crush.finalize();
4144
4145 return 0;
4146}
4147
4148int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
4149 CrushWrapper& crush,
4150 ostream *ss)
4151{
11fdf7f2 4152 const auto& conf = cct->_conf;
7c673cae
FG
4153
4154 crush.create();
4155
4156 // root
4157 int root_type = _build_crush_types(crush);
4158 int rootid;
4159 int r = crush.add_bucket(0, 0,
4160 CRUSH_HASH_DEFAULT,
4161 root_type, 0, NULL, NULL, &rootid);
11fdf7f2 4162 ceph_assert(r == 0);
7c673cae
FG
4163 crush.set_item_name(rootid, "default");
4164
4165 // add osds
4166 vector<string> sections;
11fdf7f2 4167 conf.get_all_sections(sections);
7c673cae
FG
4168
4169 for (auto &section : sections) {
4170 if (section.find("osd.") != 0)
4171 continue;
4172
4173 const char *begin = section.c_str() + 4;
4174 char *end = (char*)begin;
4175 int o = strtol(begin, &end, 10);
4176 if (*end != '\0')
4177 continue;
4178
4179 string host, rack, row, room, dc, pool;
4180 vector<string> sectiontmp;
4181 sectiontmp.push_back("osd");
4182 sectiontmp.push_back(section);
11fdf7f2
TL
4183 conf.get_val_from_conf_file(sectiontmp, "host", host, false);
4184 conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
4185 conf.get_val_from_conf_file(sectiontmp, "row", row, false);
4186 conf.get_val_from_conf_file(sectiontmp, "room", room, false);
4187 conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
4188 conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
7c673cae
FG
4189
4190 if (host.length() == 0)
4191 host = "unknownhost";
4192 if (rack.length() == 0)
4193 rack = "unknownrack";
4194
4195 map<string,string> loc;
4196 loc["host"] = host;
4197 loc["rack"] = rack;
4198 if (row.size())
4199 loc["row"] = row;
4200 if (room.size())
4201 loc["room"] = room;
4202 if (dc.size())
4203 loc["datacenter"] = dc;
4204 loc["root"] = "default";
4205
4206 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
4207 crush.insert_item(cct, o, 1.0, section, loc);
4208 }
4209
31f18b77 4210 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
4211
4212 crush.finalize();
4213
4214 return 0;
4215}
4216
4217
31f18b77
FG
4218int OSDMap::build_simple_crush_rules(
4219 CephContext *cct,
4220 CrushWrapper& crush,
4221 const string& root,
4222 ostream *ss)
7c673cae 4223{
31f18b77 4224 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
7c673cae
FG
4225 string failure_domain =
4226 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
4227
7c673cae 4228 int r;
31f18b77 4229 r = crush.add_simple_rule_at(
224ce89b 4230 "replicated_rule", root, failure_domain, "",
31f18b77
FG
4231 "firstn", pg_pool_t::TYPE_REPLICATED,
4232 crush_rule, ss);
7c673cae
FG
4233 if (r < 0)
4234 return r;
4235 // do not add an erasure rule by default or else we will implicitly
4236 // require the crush_v2 feature of clients
4237 return 0;
4238}
4239
4240int OSDMap::summarize_mapping_stats(
4241 OSDMap *newmap,
4242 const set<int64_t> *pools,
4243 std::string *out,
4244 Formatter *f) const
4245{
4246 set<int64_t> ls;
4247 if (pools) {
4248 ls = *pools;
4249 } else {
4250 for (auto &p : get_pools())
4251 ls.insert(p.first);
4252 }
4253
4254 unsigned total_pg = 0;
4255 unsigned moved_pg = 0;
4256 vector<unsigned> base_by_osd(get_max_osd(), 0);
4257 vector<unsigned> new_by_osd(get_max_osd(), 0);
4258 for (int64_t pool_id : ls) {
4259 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
4260 vector<int> up, up2;
4261 int up_primary;
7c673cae 4262 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
11fdf7f2 4263 pg_t pgid(ps, pool_id);
7c673cae 4264 total_pg += pi->get_size();
31f18b77 4265 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
4266 for (int osd : up) {
4267 if (osd >= 0 && osd < get_max_osd())
4268 ++base_by_osd[osd];
4269 }
4270 if (newmap) {
31f18b77 4271 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
4272 for (int osd : up2) {
4273 if (osd >= 0 && osd < get_max_osd())
4274 ++new_by_osd[osd];
4275 }
4276 if (pi->type == pg_pool_t::TYPE_ERASURE) {
4277 for (unsigned i=0; i<up.size(); ++i) {
4278 if (up[i] != up2[i]) {
4279 ++moved_pg;
4280 }
4281 }
4282 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
4283 for (int osd : up) {
4284 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
4285 ++moved_pg;
4286 }
4287 }
4288 } else {
11fdf7f2 4289 ceph_abort_msg("unhandled pool type");
7c673cae
FG
4290 }
4291 }
4292 }
4293 }
4294
4295 unsigned num_up_in = 0;
4296 for (int osd = 0; osd < get_max_osd(); ++osd) {
4297 if (is_up(osd) && is_in(osd))
4298 ++num_up_in;
4299 }
4300 if (!num_up_in) {
4301 return -EINVAL;
4302 }
4303
4304 float avg_pg = (float)total_pg / (float)num_up_in;
4305 float base_stddev = 0, new_stddev = 0;
4306 int min = -1, max = -1;
4307 unsigned min_base_pg = 0, max_base_pg = 0;
4308 unsigned min_new_pg = 0, max_new_pg = 0;
4309 for (int osd = 0; osd < get_max_osd(); ++osd) {
4310 if (is_up(osd) && is_in(osd)) {
4311 float base_diff = (float)base_by_osd[osd] - avg_pg;
4312 base_stddev += base_diff * base_diff;
4313 float new_diff = (float)new_by_osd[osd] - avg_pg;
4314 new_stddev += new_diff * new_diff;
4315 if (min < 0 || base_by_osd[osd] < min_base_pg) {
4316 min = osd;
4317 min_base_pg = base_by_osd[osd];
4318 min_new_pg = new_by_osd[osd];
4319 }
4320 if (max < 0 || base_by_osd[osd] > max_base_pg) {
4321 max = osd;
4322 max_base_pg = base_by_osd[osd];
4323 max_new_pg = new_by_osd[osd];
4324 }
4325 }
4326 }
4327 base_stddev = sqrt(base_stddev / num_up_in);
4328 new_stddev = sqrt(new_stddev / num_up_in);
4329
4330 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
4331
4332 ostringstream ss;
4333 if (f)
4334 f->open_object_section("utilization");
4335 if (newmap) {
4336 if (f) {
4337 f->dump_unsigned("moved_pgs", moved_pg);
4338 f->dump_unsigned("total_pgs", total_pg);
4339 } else {
4340 float percent = 0;
4341 if (total_pg)
4342 percent = (float)moved_pg * 100.0 / (float)total_pg;
4343 ss << "moved " << moved_pg << " / " << total_pg
4344 << " (" << percent << "%)\n";
4345 }
4346 }
4347 if (f) {
4348 f->dump_float("avg_pgs", avg_pg);
4349 f->dump_float("std_dev", base_stddev);
4350 f->dump_float("expected_baseline_std_dev", edev);
4351 if (newmap)
4352 f->dump_float("new_std_dev", new_stddev);
4353 } else {
4354 ss << "avg " << avg_pg << "\n";
4355 ss << "stddev " << base_stddev;
4356 if (newmap)
4357 ss << " -> " << new_stddev;
4358 ss << " (expected baseline " << edev << ")\n";
4359 }
4360 if (min >= 0) {
4361 if (f) {
4362 f->dump_unsigned("min_osd", min);
4363 f->dump_unsigned("min_osd_pgs", min_base_pg);
4364 if (newmap)
4365 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
4366 } else {
4367 ss << "min osd." << min << " with " << min_base_pg;
4368 if (newmap)
4369 ss << " -> " << min_new_pg;
4370 ss << " pgs (" << (float)min_base_pg / avg_pg;
4371 if (newmap)
4372 ss << " -> " << (float)min_new_pg / avg_pg;
4373 ss << " * mean)\n";
4374 }
4375 }
4376 if (max >= 0) {
4377 if (f) {
4378 f->dump_unsigned("max_osd", max);
4379 f->dump_unsigned("max_osd_pgs", max_base_pg);
4380 if (newmap)
4381 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
4382 } else {
4383 ss << "max osd." << max << " with " << max_base_pg;
4384 if (newmap)
4385 ss << " -> " << max_new_pg;
4386 ss << " pgs (" << (float)max_base_pg / avg_pg;
4387 if (newmap)
4388 ss << " -> " << (float)max_new_pg / avg_pg;
4389 ss << " * mean)\n";
4390 }
4391 }
4392 if (f)
4393 f->close_section();
4394 if (out)
4395 *out = ss.str();
4396 return 0;
4397}
4398
7c673cae
FG
4399bool OSDMap::try_pg_upmap(
4400 CephContext *cct,
4401 pg_t pg, ///< pg to potentially remap
4402 const set<int>& overfull, ///< osds we'd want to evacuate
4403 const vector<int>& underfull, ///< osds to move to, in order of preference
92f5a8d4 4404 const vector<int>& more_underfull, ///< more osds only slightly underfull
7c673cae
FG
4405 vector<int> *orig,
4406 vector<int> *out) ///< resulting alternative mapping
4407{
4408 const pg_pool_t *pool = get_pg_pool(pg.pool());
4409 if (!pool)
4410 return false;
31f18b77 4411 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
7c673cae
FG
4412 pool->get_size());
4413 if (rule < 0)
4414 return false;
4415
7c673cae
FG
4416 // make sure there is something there to remap
4417 bool any = false;
4418 for (auto osd : *orig) {
4419 if (overfull.count(osd)) {
4420 any = true;
4421 break;
4422 }
4423 }
4424 if (!any) {
4425 return false;
4426 }
4427
4428 int r = crush->try_remap_rule(
4429 cct,
4430 rule,
4431 pool->get_size(),
4432 overfull, underfull,
92f5a8d4 4433 more_underfull,
7c673cae
FG
4434 *orig,
4435 out);
4436 if (r < 0)
4437 return false;
4438 if (*out == *orig)
4439 return false;
4440 return true;
4441}
4442
// Compute pg_upmap_items changes that even out the number of PGs per osd.
//
// Works on a private copy of this map (tmp).  Each round of the outer
// loop looks for one change -- dropping an existing remapping pair or
// adding a new one -- evaluates the resulting per-osd deviations, and
// keeps the change only if the summed squared deviation decreases.
// Accepted changes are applied to tmp and recorded in pending_inc
// (old_pg_upmap_items / new_pg_upmap_items).
//
//  cct           - context for logging and configuration
//  max_deviation - per-osd deviation (in PGs) considered good enough;
//                  values below 1 are raised to 1
//  max           - upper bound on rounds of the outer loop (while (max--))
//  only_pools    - if non-empty, only these pools are considered
//  pending_inc   - incremental that receives the changes
//
// Returns the number of PGs whose upmap items were changed; 0 when
// nothing was done (zero total weight, max <= 0, or the distribution is
// already within max_deviation).
4443int OSDMap::calc_pg_upmaps(
4444 CephContext *cct,
92f5a8d4 4445 uint32_t max_deviation,
7c673cae 4446 int max,
a8e16298 4447 const set<int64_t>& only_pools,
7c673cae
FG
4448 OSDMap::Incremental *pending_inc)
4449{
a8e16298 4450 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
7c673cae 4451 OSDMap tmp;
92f5a8d4
TL
4452 // Can't be less than 1 pg
4453 if (max_deviation < 1)
4454 max_deviation = 1;
7c673cae
FG
4455 tmp.deepish_copy_from(*this);
4456 int num_changed = 0;
a8e16298
TL
4457 map<int,set<pg_t>> pgs_by_osd;
4458 int total_pgs = 0;
4459 float osd_weight_total = 0;
4460 map<int,float> osd_weight;
 // For each selected pool: record which PGs each osd holds (by up set),
 // add size * pg_num to total_pgs, and accumulate each osd's rule weight
 // scaled by tmp.get_weightf().
4461 for (auto& i : pools) {
4462 if (!only_pools.empty() && !only_pools.count(i.first))
4463 continue;
4464 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
4465 pg_t pg(ps, i.first);
4466 vector<int> up;
4467 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
4468 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
4469 for (auto osd : up) {
4470 if (osd != CRUSH_ITEM_NONE)
4471 pgs_by_osd[osd].insert(pg);
7c673cae 4472 }
a8e16298
TL
4473 }
4474 total_pgs += i.second.get_size() * i.second.get_pg_num();
4475
4476 map<int,float> pmap;
4477 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
4478 i.second.get_type(),
4479 i.second.get_size());
4480 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
4481 ldout(cct,20) << __func__ << " pool " << i.first
4482 << " ruleno " << ruleno
4483 << " weight-map " << pmap
4484 << dendl;
4485 for (auto p : pmap) {
4486 auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
4487 if (adjusted_weight == 0) {
4488 continue;
31f18b77 4489 }
a8e16298
TL
4490 osd_weight[p.first] += adjusted_weight;
4491 osd_weight_total += adjusted_weight;
7c673cae 4492 }
a8e16298
TL
4493 }
 // Make sure every weighted osd has an entry in pgs_by_osd, even if it
 // currently holds no PG.
4494 for (auto& i : osd_weight) {
4495 int pgs = 0;
4496 auto p = pgs_by_osd.find(i.first);
4497 if (p != pgs_by_osd.end())
31f18b77 4498 pgs = p->second.size();
a8e16298 4499 else
31f18b77 4500 pgs_by_osd.emplace(i.first, set<pg_t>());
a8e16298 4501 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
31f18b77 4502 << " pgs " << pgs << dendl;
a8e16298
TL
4503 }
4504 if (osd_weight_total == 0) {
4505 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4506 return 0;
4507 }
4508 float pgs_per_weight = total_pgs / osd_weight_total;
4509 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4510 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4511
a8e16298
TL
4512 if (max <= 0) {
4513 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4514 return 0;
4515 }
a8e16298
TL
 // NOTE: despite its name, stddev holds the plain sum of squared
 // deviations (no division, no square root); it is only ever compared
 // against another sum computed the same way.
4516 float stddev = 0;
4517 map<int,float> osd_deviation; // osd, deviation(pgs)
4518 multimap<float,int> deviation_osd; // deviation(pgs), osd
92f5a8d4 4519 float cur_max_deviation = 0;
a8e16298
TL
4520 for (auto& i : pgs_by_osd) {
4521 // make sure osd is still there (belongs to this crush-tree)
4522 ceph_assert(osd_weight.count(i.first));
4523 float target = osd_weight[i.first] * pgs_per_weight;
4524 float deviation = (float)i.second.size() - target;
4525 ldout(cct, 20) << " osd." << i.first
4526 << "\tpgs " << i.second.size()
4527 << "\ttarget " << target
4528 << "\tdeviation " << deviation
4529 << dendl;
4530 osd_deviation[i.first] = deviation;
4531 deviation_osd.insert(make_pair(deviation, i.first));
4532 stddev += deviation * deviation;
92f5a8d4
TL
4533 if (fabsf(deviation) > cur_max_deviation)
4534 cur_max_deviation = fabsf(deviation);
a8e16298 4535 }
92f5a8d4
TL
4536 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4537 if (cur_max_deviation <= max_deviation) {
a8e16298
TL
4538 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4539 << dendl;
4540 return 0;
4541 }
4542 bool skip_overfull = false;
4543 auto aggressive =
11fdf7f2 4544 cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
a8e16298 4545 auto local_fallback_retries =
11fdf7f2 4546 cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
 // One round per iteration: classify osds, search for a single change,
 // test it, and apply it if it lowers the summed squared deviation.
a8e16298 4547 while (max--) {
92f5a8d4 4548 ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
a8e16298
TL
4549 // build overfull and underfull
4550 set<int> overfull;
92f5a8d4
TL
4551 set<int> more_overfull;
4552 bool using_more_overfull = false;
a8e16298 4553 vector<int> underfull;
92f5a8d4
TL
4554 vector<int> more_underfull;
 // walk from the largest positive deviation downwards
4555 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
4556 ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
4557 if (i->first <= 0)
4558 break;
4559 if (i->first > max_deviation) {
4560 ldout(cct, 30) << " add overfull osd." << i->second << dendl;
a8e16298 4561 overfull.insert(i->second);
92f5a8d4
TL
4562 } else {
4563 more_overfull.insert(i->second);
4564 }
a8e16298 4565 }
7c673cae 4566
92f5a8d4
TL
 // walk from the most negative deviation upwards, so underfull ends up
 // ordered most-underfull first
4567 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
4568 ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
4569 if (i->first >= 0)
a8e16298 4570 break;
92f5a8d4
TL
4571 if (i->first < -(int)max_deviation) {
4572 ldout(cct, 30) << " add underfull osd." << i->second << dendl;
4573 underfull.push_back(i->second);
4574 } else {
4575 more_underfull.push_back(i->second);
4576 }
7c673cae 4577 }
92f5a8d4
TL
4578 if (underfull.empty() && overfull.empty()) {
4579 ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
7c673cae 4580 break;
a8e16298 4581 }
92f5a8d4
TL
4582 if (overfull.empty() && !underfull.empty()) {
4583 ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
4584 overfull = more_overfull;
4585 using_more_overfull = true;
4586 }
7c673cae 4587
a8e16298
TL
4588 ldout(cct, 10) << " overfull " << overfull
4589 << " underfull " << underfull
4590 << dendl;
4591 set<pg_t> to_skip;
4592 uint64_t local_fallback_retried = 0;
4593
 // Re-entered via goto after a rejected change; the PGs of that change
 // have been added to to_skip, and the candidate state below starts fresh.
4594 retry:
4595
4596 set<pg_t> to_unmap;
4597 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4598 auto temp_pgs_by_osd = pgs_by_osd;
4599 // always start with fullest, break if we find any changes to make
7c673cae 4600 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
92f5a8d4 4601 if (skip_overfull && !underfull.empty()) {
a8e16298
TL
4602 ldout(cct, 10) << " skipping overfull " << dendl;
4603 break; // fall through to check underfull
4604 }
7c673cae 4605 int osd = p->second;
31f18b77 4606 float deviation = p->first;
7c673cae 4607 float target = osd_weight[osd] * pgs_per_weight;
92f5a8d4
TL
4608 ldout(cct, 10) << " Overfull search osd." << osd
4609 << " target " << target
4610 << " deviation " << deviation
4611 << dendl;
a8e16298 4612 ceph_assert(target > 0);
92f5a8d4 4613 if (!using_more_overfull && deviation <= max_deviation) {
7c673cae 4614 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4615 << " target " << target
4616 << " deviation " << deviation
92f5a8d4 4617 << " < max deviation " << max_deviation
a8e16298 4618 << dendl;
7c673cae
FG
4619 break;
4620 }
7c673cae 4621
a8e16298
TL
4622 vector<pg_t> pgs;
4623 pgs.reserve(pgs_by_osd[osd].size());
4624 for (auto& pg : pgs_by_osd[osd]) {
4625 if (to_skip.count(pg))
4626 continue;
4627 pgs.push_back(pg);
4628 }
4629 if (aggressive) {
4630 // shuffle PG list so they all get equal (in)attention
4631 std::random_device rd;
4632 std::default_random_engine rng{rd()};
4633 std::shuffle(pgs.begin(), pgs.end(), rng);
4634 }
7c673cae
FG
4635 // look for remaps we can un-remap
4636 for (auto pg : pgs) {
4637 auto p = tmp.pg_upmap_items.find(pg);
a8e16298
TL
4638 if (p == tmp.pg_upmap_items.end())
4639 continue;
4640 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4641 for (auto q : p->second) {
4642 if (q.second == osd) {
4643 ldout(cct, 10) << " will try dropping existing"
4644 << " remapping pair "
4645 << q.first << " -> " << q.second
4646 << " which remapped " << pg
4647 << " into overfull osd." << osd
4648 << dendl;
4649 temp_pgs_by_osd[q.second].erase(pg);
4650 temp_pgs_by_osd[q.first].insert(pg);
4651 } else {
4652 new_upmap_items.push_back(q);
4653 }
4654 }
4655 if (new_upmap_items.empty()) {
4656 // drop whole item
4657 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4658 << " remapped " << pg << " into overfull osd." << osd
4659 << ", will try cancelling it entirely"
4660 << dendl;
4661 to_unmap.insert(pg);
4662 goto test_change;
4663 } else if (new_upmap_items.size() != p->second.size()) {
4664 // drop single remapping pair, updating
4665 ceph_assert(new_upmap_items.size() < p->second.size());
4666 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4667 << " remapped " << pg << " into overfull osd." << osd
4668 << ", new_pg_upmap_items now " << new_upmap_items
4669 << dendl;
4670 to_upmap[pg] = new_upmap_items;
4671 goto test_change;
4672 }
4673 }
7c673cae 4674
a8e16298 4675 // try upmap
7c673cae 4676 for (auto pg : pgs) {
a8e16298
TL
4677 auto temp_it = tmp.pg_upmap.find(pg);
4678 if (temp_it != tmp.pg_upmap.end()) {
4679 // leave pg_upmap alone
4680 // it must be specified by admin since balancer does not
4681 // support pg_upmap yet
4682 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4683 << temp_it->second << ", skipping"
4684 << dendl;
7c673cae
FG
4685 continue;
4686 }
a8e16298
TL
4687 auto pg_pool_size = tmp.get_pg_pool_size(pg);
4688 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4689 set<int> existing;
4690 auto it = tmp.pg_upmap_items.find(pg);
4691 if (it != tmp.pg_upmap_items.end() &&
4692 it->second.size() >= (size_t)pg_pool_size) {
4693 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4694 << it->second << ", skipping"
4695 << dendl;
4696 continue;
4697 } else if (it != tmp.pg_upmap_items.end()) {
4698 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4699 << it->second
4700 << dendl;
4701 new_upmap_items = it->second;
4702 // build existing too (for dedup)
4703 for (auto i : it->second) {
4704 existing.insert(i.first);
4705 existing.insert(i.second);
4706 }
4707 // fall through
4708 // to see if we can append more remapping pairs
4709 }
4710 ldout(cct, 10) << " trying " << pg << dendl;
494da23a
TL
4711 vector<int> raw, orig, out;
4712 tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
92f5a8d4 4713 if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
7c673cae
FG
4714 continue;
4715 }
a8e16298 4716 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4717 if (orig.size() != out.size()) {
4718 continue;
4719 }
a8e16298 4720 ceph_assert(orig != out);
92f5a8d4
TL
 // among the positions that changed, pick the one whose original osd
 // has the largest positive deviation
4721 int pos = -1;
4722 float max_dev = 0;
7c673cae 4723 for (unsigned i = 0; i < out.size(); ++i) {
a8e16298
TL
4724 if (orig[i] == out[i])
4725 continue; // skip invalid remappings
4726 if (existing.count(orig[i]) || existing.count(out[i]))
4727 continue; // we want new remappings only!
92f5a8d4
TL
4728 if (osd_deviation[orig[i]] > max_dev) {
4729 max_dev = osd_deviation[orig[i]];
4730 pos = i;
4731 ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
4732 }
4733 }
4734 if (pos != -1) {
4735 int i = pos;
a8e16298
TL
4736 ldout(cct, 10) << " will try adding new remapping pair "
4737 << orig[i] << " -> " << out[i] << " for " << pg
92f5a8d4 4738 << (orig[i] != osd ? " NOT selected osd" : "")
a8e16298
TL
4739 << dendl;
4740 existing.insert(orig[i]);
4741 existing.insert(out[i]);
4742 temp_pgs_by_osd[orig[i]].erase(pg);
4743 temp_pgs_by_osd[out[i]].insert(pg);
4744 ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
4745 new_upmap_items.push_back(make_pair(orig[i], out[i]));
4746 // append new remapping pairs slowly
4747 // This way we can make sure that each tiny change will
4748 // definitely make distribution of PGs converging to
4749 // the perfect status.
4750 to_upmap[pg] = new_upmap_items;
4751 goto test_change;
7c673cae 4752 }
a8e16298
TL
4753 }
4754 }
7c673cae 4755
a8e16298
TL
4756 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4757 ldout(cct, 10) << " failed to find any changes for overfull osds"
4758 << dendl;
 // deviation_osd iterates in ascending deviation order, i.e. most
 // underfull osd first; stop at the first osd that is not in underfull
4759 for (auto& p : deviation_osd) {
4760 if (std::find(underfull.begin(), underfull.end(), p.second) ==
4761 underfull.end())
4762 break;
4763 int osd = p.second;
4764 float deviation = p.first;
4765 float target = osd_weight[osd] * pgs_per_weight;
4766 ceph_assert(target > 0);
92f5a8d4
TL
4767 if (fabsf(deviation) < max_deviation) {
4768 // respect max_deviation too
a8e16298
TL
4769 ldout(cct, 10) << " osd." << osd
4770 << " target " << target
4771 << " deviation " << deviation
92f5a8d4
TL
4772 << " -> absolute " << fabsf(deviation)
4773 << " < max " << max_deviation
a8e16298
TL
4774 << dendl;
4775 break;
4776 }
4777 // look for remaps we can un-remap
4778 vector<pair<pg_t,
4779 mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
4780 candidates.reserve(tmp.pg_upmap_items.size());
4781 for (auto& i : tmp.pg_upmap_items) {
4782 if (to_skip.count(i.first))
4783 continue;
4784 if (!only_pools.empty() && !only_pools.count(i.first.pool()))
4785 continue;
4786 candidates.push_back(make_pair(i.first, i.second));
4787 }
4788 if (aggressive) {
4789 // shuffle candidates so they all get equal (in)attention
4790 std::random_device rd;
4791 std::default_random_engine rng{rd()};
4792 std::shuffle(candidates.begin(), candidates.end(), rng);
4793 }
4794 for (auto& i : candidates) {
4795 auto pg = i.first;
4796 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4797 for (auto& j : i.second) {
4798 if (j.first == osd) {
4799 ldout(cct, 10) << " will try dropping existing"
4800 << " remapping pair "
4801 << j.first << " -> " << j.second
4802 << " which remapped " << pg
4803 << " out from underfull osd." << osd
4804 << dendl;
4805 temp_pgs_by_osd[j.second].erase(pg);
4806 temp_pgs_by_osd[j.first].insert(pg);
4807 } else {
4808 new_upmap_items.push_back(j);
4809 }
4810 }
4811 if (new_upmap_items.empty()) {
4812 // drop whole item
4813 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4814 << " remapped " << pg
4815 << " out from underfull osd." << osd
4816 << ", will try cancelling it entirely"
4817 << dendl;
4818 to_unmap.insert(pg);
4819 goto test_change;
4820 } else if (new_upmap_items.size() != i.second.size()) {
4821 // drop single remapping pair, updating
4822 ceph_assert(new_upmap_items.size() < i.second.size());
4823 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4824 << " remapped " << pg
4825 << " out from underfull osd." << osd
4826 << ", new_pg_upmap_items now " << new_upmap_items
4827 << dendl;
4828 to_upmap[pg] = new_upmap_items;
4829 goto test_change;
4830 }
4831 }
7c673cae 4832 }
a8e16298
TL
4833
4834 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4835 ldout(cct, 10) << " failed to find any changes for underfull osds"
4836 << dendl;
4837 if (!aggressive) {
4838 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
4839 break;
4840 } else if (!skip_overfull) {
4841 // safe to quit because below here we know
4842 // we've done checking both overfull and underfull osds..
4843 ldout(cct, 10) << " break due to not being able to find any"
4844 << " further optimizations"
4845 << dendl;
7c673cae
FG
4846 break;
4847 }
a8e16298
TL
4848 // restart with fullest and do exhaustive searching
4849 skip_overfull = false;
4850 continue;
4851
 // Reached via goto with exactly the tentative change described by
 // to_unmap / to_upmap already reflected in temp_pgs_by_osd.
4852 test_change:
4853
4854 // test change, apply if change is good
4855 ceph_assert(to_unmap.size() || to_upmap.size());
4856 float new_stddev = 0;
4857 map<int,float> temp_osd_deviation;
4858 multimap<float,int> temp_deviation_osd;
92f5a8d4 4859 float cur_max_deviation = 0;
a8e16298
TL
4860 for (auto& i : temp_pgs_by_osd) {
4861 // make sure osd is still there (belongs to this crush-tree)
4862 ceph_assert(osd_weight.count(i.first));
4863 float target = osd_weight[i.first] * pgs_per_weight;
4864 float deviation = (float)i.second.size() - target;
4865 ldout(cct, 20) << " osd." << i.first
4866 << "\tpgs " << i.second.size()
4867 << "\ttarget " << target
4868 << "\tdeviation " << deviation
4869 << dendl;
4870 temp_osd_deviation[i.first] = deviation;
4871 temp_deviation_osd.insert(make_pair(deviation, i.first));
92f5a8d4
TL
4872 new_stddev += deviation * deviation;
4873 if (fabsf(deviation) > cur_max_deviation)
4874 cur_max_deviation = fabsf(deviation);
a8e16298
TL
4875 }
4876 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
 // Reject the change unless it strictly improves.  In aggressive mode
 // the rejected PGs are remembered in to_skip and the search is retried,
 // up to local_fallback_retries times per round.
4877 if (new_stddev >= stddev) {
4878 if (!aggressive) {
4879 ldout(cct, 10) << " break because stddev is not decreasing"
4880 << " and aggressive mode is not enabled"
4881 << dendl;
4882 break;
4883 }
4884 local_fallback_retried++;
4885 if (local_fallback_retried >= local_fallback_retries) {
4886 // does not make progress
4887 // flip *skip_overfull* so both overfull and underfull
4888 // get equal (in)attention
4889 skip_overfull = !skip_overfull;
4890 ldout(cct, 10) << " hit local_fallback_retries "
4891 << local_fallback_retries
4892 << dendl;
4893 continue;
4894 }
4895 for (auto& i : to_unmap)
4896 to_skip.insert(i);
4897 for (auto& i : to_upmap)
4898 to_skip.insert(i.first);
4899 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
4900 << " to_skip " << to_skip
4901 << dendl;
4902 goto retry;
4903 }
4904
4905 // ready to go
4906 ceph_assert(new_stddev < stddev);
4907 stddev = new_stddev;
4908 pgs_by_osd = temp_pgs_by_osd;
4909 osd_deviation = temp_osd_deviation;
4910 deviation_osd = temp_deviation_osd;
4911 for (auto& i : to_unmap) {
4912 ldout(cct, 10) << " unmap pg " << i << dendl;
4913 ceph_assert(tmp.pg_upmap_items.count(i));
4914 tmp.pg_upmap_items.erase(i);
4915 pending_inc->old_pg_upmap_items.insert(i);
4916 ++num_changed;
4917 }
4918 for (auto& i : to_upmap) {
4919 ldout(cct, 10) << " upmap pg " << i.first
4920 << " new pg_upmap_items " << i.second
4921 << dendl;
4922 tmp.pg_upmap_items[i.first] = i.second;
4923 pending_inc->new_pg_upmap_items[i.first] = i.second;
4924 ++num_changed;
4925 }
92f5a8d4
TL
4926 ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
4927 if (cur_max_deviation <= max_deviation) {
4928 ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
4929 << dendl;
4930 break;
4931 }
7c673cae 4932 }
a8e16298 4933 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
4934 return num_changed;
4935}
31f18b77
FG
4936
4937int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
4938{
4939 return crush->get_leaves(name, osds);
4940}
4941
3efd9988
FG
4942// get pools whose crush rules might reference the given osd
4943void OSDMap::get_pool_ids_by_osd(CephContext *cct,
4944 int osd,
4945 set<int64_t> *pool_ids) const
4946{
11fdf7f2 4947 ceph_assert(pool_ids);
3efd9988
FG
4948 set<int> raw_rules;
4949 int r = crush->get_rules_by_osd(osd, &raw_rules);
4950 if (r < 0) {
4951 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
4952 << dendl;
11fdf7f2 4953 ceph_assert(r >= 0);
3efd9988
FG
4954 }
4955 set<int> rules;
4956 for (auto &i: raw_rules) {
4957 // exclude any dead rule
4958 if (crush_rule_in_use(i)) {
4959 rules.insert(i);
4960 }
4961 }
4962 for (auto &r: rules) {
4963 get_pool_ids_by_rule(r, pool_ids);
4964 }
4965}
4966
31f18b77
FG
4967template <typename F>
4968class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
4969public:
4970 typedef CrushTreeDumper::Dumper<F> Parent;
4971
4972 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
11fdf7f2
TL
4973 const PGMap& pgmap_, bool tree_,
4974 const string& class_name_,
4975 const string& item_name_) :
c07f9fc5 4976 Parent(crush, osdmap_->get_pool_names()),
31f18b77 4977 osdmap(osdmap_),
11fdf7f2 4978 pgmap(pgmap_),
31f18b77 4979 tree(tree_),
11fdf7f2
TL
4980 class_name(class_name_),
4981 item_name(item_name_),
31f18b77
FG
4982 min_var(-1),
4983 max_var(-1),
4984 stddev(0),
4985 sum(0) {
11fdf7f2
TL
4986 if (osdmap->crush->name_exists(item_name)) {
4987 // filter out items we are allowed to dump
4988 auto item_id = osdmap->crush->get_item_id(item_name);
4989 allowed.insert(item_id);
4990 osdmap->crush->get_all_children(item_id, &allowed);
4991 }
4992 average_util = average_utilization();
31f18b77
FG
4993 }
4994
4995protected:
11fdf7f2
TL
4996
4997 bool should_dump(int id) const {
4998 if (!allowed.empty() && !allowed.count(id)) // filter by name
4999 return false;
5000 if (id >= 0 && !class_name.empty()) {
5001 const char* item_class_name = osdmap->crush->get_item_class(id);
5002 if (!item_class_name || // not bound to a class yet
5003 item_class_name != class_name) // or already bound to
5004 // a different class
5005 return false;
5006 }
5007 return true;
5008 }
5009
5010 set<int> get_dumped_osds() {
5011 if (class_name.empty() && item_name.empty()) {
5012 // old way, all
5013 return {};
5014 }
5015 return dumped_osds;
5016 }
5017
31f18b77
FG
5018 void dump_stray(F *f) {
5019 for (int i = 0; i < osdmap->get_max_osd(); i++) {
5020 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 5021 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
5022 }
5023 }
5024
5025 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
5026 if (!tree && qi.is_bucket())
5027 return;
11fdf7f2
TL
5028 if (!should_dump(qi.id))
5029 return;
31f18b77 5030
11fdf7f2
TL
5031 if (!qi.is_bucket())
5032 dumped_osds.insert(qi.id);
31f18b77 5033 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
11fdf7f2
TL
5034 int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
5035 kb_used_meta = 0, kb_avail = 0;
31f18b77 5036 double util = 0;
11fdf7f2
TL
5037 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
5038 &kb_used_omap, &kb_used_meta, &kb_avail))
31f18b77
FG
5039 if (kb_used && kb)
5040 util = 100.0 * (double)kb_used / (double)kb;
5041
5042 double var = 1.0;
5043 if (average_util)
5044 var = util / average_util;
5045
11fdf7f2 5046 size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
31f18b77 5047
11fdf7f2
TL
5048 dump_item(qi, reweight, kb, kb_used,
5049 kb_used_data, kb_used_omap, kb_used_meta,
5050 kb_avail, util, var, num_pgs, f);
31f18b77
FG
5051
5052 if (!qi.is_bucket() && reweight > 0) {
5053 if (min_var < 0 || var < min_var)
5054 min_var = var;
5055 if (max_var < 0 || var > max_var)
5056 max_var = var;
5057
5058 double dev = util - average_util;
5059 dev *= dev;
5060 stddev += reweight * dev;
5061 sum += reweight;
5062 }
5063 }
5064
5065 virtual void dump_item(const CrushTreeDumper::Item &qi,
5066 float &reweight,
5067 int64_t kb,
5068 int64_t kb_used,
11fdf7f2
TL
5069 int64_t kb_used_data,
5070 int64_t kb_used_omap,
5071 int64_t kb_used_meta,
31f18b77
FG
5072 int64_t kb_avail,
5073 double& util,
5074 double& var,
5075 const size_t num_pgs,
5076 F *f) = 0;
5077
5078 double dev() {
5079 return sum > 0 ? sqrt(stddev / sum) : 0;
5080 }
5081
5082 double average_utilization() {
5083 int64_t kb = 0, kb_used = 0;
5084 for (int i = 0; i < osdmap->get_max_osd(); i++) {
11fdf7f2
TL
5085 if (!osdmap->exists(i) ||
5086 osdmap->get_weight(i) == 0 ||
5087 !should_dump(i))
31f18b77 5088 continue;
11fdf7f2
TL
5089 int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
5090 kb_avail_i;
5091 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
5092 &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
31f18b77
FG
5093 kb += kb_i;
5094 kb_used += kb_used_i;
5095 }
5096 }
5097 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
5098 }
5099
5100 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5101 int64_t* kb_used_data,
5102 int64_t* kb_used_omap,
5103 int64_t* kb_used_meta,
31f18b77 5104 int64_t* kb_avail) const {
11fdf7f2 5105 const osd_stat_t *p = pgmap.get_osd_stat(id);
31f18b77 5106 if (!p) return false;
11fdf7f2
TL
5107 *kb = p->statfs.kb();
5108 *kb_used = p->statfs.kb_used_raw();
5109 *kb_used_data = p->statfs.kb_used_data();
5110 *kb_used_omap = p->statfs.kb_used_omap();
5111 *kb_used_meta = p->statfs.kb_used_internal_metadata();
5112 *kb_avail = p->statfs.kb_avail();
5113
31f18b77
FG
5114 return *kb > 0;
5115 }
5116
5117 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
11fdf7f2
TL
5118 int64_t* kb_used_data,
5119 int64_t* kb_used_omap,
5120 int64_t* kb_used_meta,
31f18b77
FG
5121 int64_t* kb_avail) const {
5122 if (id >= 0) {
11fdf7f2 5123 if (osdmap->is_out(id) || !should_dump(id)) {
31f18b77
FG
5124 *kb = 0;
5125 *kb_used = 0;
11fdf7f2
TL
5126 *kb_used_data = 0;
5127 *kb_used_omap = 0;
5128 *kb_used_meta = 0;
31f18b77
FG
5129 *kb_avail = 0;
5130 return true;
5131 }
11fdf7f2
TL
5132 return get_osd_utilization(id, kb, kb_used, kb_used_data,
5133 kb_used_omap, kb_used_meta, kb_avail);
31f18b77
FG
5134 }
5135
5136 *kb = 0;
5137 *kb_used = 0;
11fdf7f2
TL
5138 *kb_used_data = 0;
5139 *kb_used_omap = 0;
5140 *kb_used_meta = 0;
31f18b77
FG
5141 *kb_avail = 0;
5142
5143 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
5144 int item = osdmap->crush->get_bucket_item(id, k);
11fdf7f2
TL
5145 int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
5146 kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
5147 if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
5148 &kb_used_data_i, &kb_used_omap_i,
5149 &kb_used_meta_i, &kb_avail_i))
31f18b77
FG
5150 return false;
5151 *kb += kb_i;
5152 *kb_used += kb_used_i;
11fdf7f2
TL
5153 *kb_used_data += kb_used_data_i;
5154 *kb_used_omap += kb_used_omap_i;
5155 *kb_used_meta += kb_used_meta_i;
31f18b77
FG
5156 *kb_avail += kb_avail_i;
5157 }
5158 return *kb > 0;
5159 }
5160
5161protected:
5162 const OSDMap *osdmap;
11fdf7f2 5163 const PGMap& pgmap;
31f18b77 5164 bool tree;
11fdf7f2
TL
5165 const string class_name;
5166 const string item_name;
31f18b77
FG
5167 double average_util;
5168 double min_var;
5169 double max_var;
5170 double stddev;
5171 double sum;
11fdf7f2
TL
5172 set<int> allowed;
5173 set<int> dumped_osds;
31f18b77
FG
5174};
5175
5176
5177class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
5178public:
5179 typedef OSDUtilizationDumper<TextTable> Parent;
5180
5181 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2
TL
5182 const PGMap& pgmap, bool tree,
5183 const string& class_name,
5184 const string& item_name) :
5185 Parent(crush, osdmap, pgmap, tree, class_name, item_name) {}
31f18b77
FG
5186
5187 void dump(TextTable *tbl) {
5188 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 5189 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5190 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
5191 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
5192 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
5193 tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
5194 tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
5195 tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
5196 tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5197 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
5198 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
5199 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
5200 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 5201 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
5202 if (tree)
5203 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
5204
5205 Parent::dump(tbl);
5206
5207 dump_stray(tbl);
5208
11fdf7f2 5209 auto sum = pgmap.get_osd_sum(get_dumped_osds());
224ce89b
WB
5210 *tbl << ""
5211 << ""
5212 << "" << "TOTAL"
11fdf7f2
TL
5213 << byte_u_t(sum.statfs.total)
5214 << byte_u_t(sum.statfs.get_used_raw())
5215 << byte_u_t(sum.statfs.allocated)
5216 << byte_u_t(sum.statfs.omap_allocated)
5217 << byte_u_t(sum.statfs.internal_metadata)
5218 << byte_u_t(sum.statfs.available)
31f18b77
FG
5219 << lowprecision_t(average_util)
5220 << ""
5221 << TextTable::endrow;
5222 }
5223
5224protected:
5225 struct lowprecision_t {
5226 float v;
5227 explicit lowprecision_t(float _v) : v(_v) {}
5228 };
5229 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
5230
5231 using OSDUtilizationDumper<TextTable>::dump_item;
5232 void dump_item(const CrushTreeDumper::Item &qi,
5233 float &reweight,
5234 int64_t kb,
5235 int64_t kb_used,
11fdf7f2
TL
5236 int64_t kb_used_data,
5237 int64_t kb_used_omap,
5238 int64_t kb_used_meta,
31f18b77
FG
5239 int64_t kb_avail,
5240 double& util,
5241 double& var,
5242 const size_t num_pgs,
5243 TextTable *tbl) override {
224ce89b
WB
5244 const char *c = crush->get_item_class(qi.id);
5245 if (!c)
5246 c = "";
31f18b77 5247 *tbl << qi.id
224ce89b 5248 << c
31f18b77
FG
5249 << weightf_t(qi.weight)
5250 << weightf_t(reweight)
1adf2230
AA
5251 << byte_u_t(kb << 10)
5252 << byte_u_t(kb_used << 10)
11fdf7f2
TL
5253 << byte_u_t(kb_used_data << 10)
5254 << byte_u_t(kb_used_omap << 10)
5255 << byte_u_t(kb_used_meta << 10)
1adf2230 5256 << byte_u_t(kb_avail << 10)
31f18b77
FG
5257 << lowprecision_t(util)
5258 << lowprecision_t(var);
5259
5260 if (qi.is_bucket()) {
5261 *tbl << "-";
11fdf7f2 5262 *tbl << "";
31f18b77
FG
5263 } else {
5264 *tbl << num_pgs;
11fdf7f2
TL
5265 if (osdmap->is_up(qi.id)) {
5266 *tbl << "up";
5267 } else if (osdmap->is_destroyed(qi.id)) {
5268 *tbl << "destroyed";
5269 } else {
5270 *tbl << "down";
5271 }
31f18b77
FG
5272 }
5273
5274 if (tree) {
5275 ostringstream name;
5276 for (int k = 0; k < qi.depth; k++)
5277 name << " ";
5278 if (qi.is_bucket()) {
5279 int type = crush->get_bucket_type(qi.id);
5280 name << crush->get_type_name(type) << " "
5281 << crush->get_item_name(qi.id);
5282 } else {
5283 name << "osd." << qi.id;
5284 }
5285 *tbl << name.str();
5286 }
5287
5288 *tbl << TextTable::endrow;
5289 }
5290
5291public:
5292 string summary() {
5293 ostringstream out;
5294 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
5295 << "/" << lowprecision_t(max_var) << " "
5296 << "STDDEV: " << lowprecision_t(dev());
5297 return out.str();
5298 }
5299};
5300
5301ostream& operator<<(ostream& out,
5302 const OSDUtilizationPlainDumper::lowprecision_t& v)
5303{
5304 if (v.v < -0.01) {
5305 return out << "-";
5306 } else if (v.v < 0.001) {
5307 return out << "0";
5308 } else {
5309 std::streamsize p = out.precision();
5310 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
5311 }
5312}
5313
5314class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
5315public:
5316 typedef OSDUtilizationDumper<Formatter> Parent;
5317
5318 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
11fdf7f2
TL
5319 const PGMap& pgmap, bool tree,
5320 const string& class_name,
5321 const string& item_name) :
5322 Parent(crush, osdmap, pgmap, tree, class_name, item_name) {}
31f18b77
FG
5323
5324 void dump(Formatter *f) {
5325 f->open_array_section("nodes");
5326 Parent::dump(f);
5327 f->close_section();
5328
5329 f->open_array_section("stray");
5330 dump_stray(f);
5331 f->close_section();
5332 }
5333
5334protected:
5335 using OSDUtilizationDumper<Formatter>::dump_item;
5336 void dump_item(const CrushTreeDumper::Item &qi,
11fdf7f2
TL
5337 float &reweight,
5338 int64_t kb,
5339 int64_t kb_used,
5340 int64_t kb_used_data,
5341 int64_t kb_used_omap,
5342 int64_t kb_used_meta,
5343 int64_t kb_avail,
5344 double& util,
5345 double& var,
5346 const size_t num_pgs,
5347 Formatter *f) override {
31f18b77 5348 f->open_object_section("item");
c07f9fc5 5349 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
5350 f->dump_float("reweight", reweight);
5351 f->dump_int("kb", kb);
5352 f->dump_int("kb_used", kb_used);
11fdf7f2
TL
5353 f->dump_int("kb_used_data", kb_used_data);
5354 f->dump_int("kb_used_omap", kb_used_omap);
5355 f->dump_int("kb_used_meta", kb_used_meta);
31f18b77
FG
5356 f->dump_int("kb_avail", kb_avail);
5357 f->dump_float("utilization", util);
5358 f->dump_float("var", var);
5359 f->dump_unsigned("pgs", num_pgs);
11fdf7f2
TL
5360 if (!qi.is_bucket()) {
5361 if (osdmap->is_up(qi.id)) {
5362 f->dump_string("status", "up");
5363 } else if (osdmap->is_destroyed(qi.id)) {
5364 f->dump_string("status", "destroyed");
5365 } else {
5366 f->dump_string("status", "down");
5367 }
5368 }
31f18b77
FG
5369 CrushTreeDumper::dump_bucket_children(crush, qi, f);
5370 f->close_section();
5371 }
5372
5373public:
5374 void summary(Formatter *f) {
5375 f->open_object_section("summary");
11fdf7f2
TL
5376 auto sum = pgmap.get_osd_sum(get_dumped_osds());
5377 auto& s = sum.statfs;
5378
5379 f->dump_int("total_kb", s.kb());
5380 f->dump_int("total_kb_used", s.kb_used_raw());
5381 f->dump_int("total_kb_used_data", s.kb_used_data());
5382 f->dump_int("total_kb_used_omap", s.kb_used_omap());
5383 f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
5384 f->dump_int("total_kb_avail", s.kb_avail());
31f18b77
FG
5385 f->dump_float("average_utilization", average_util);
5386 f->dump_float("min_var", min_var);
5387 f->dump_float("max_var", max_var);
5388 f->dump_float("dev", dev());
5389 f->close_section();
5390 }
5391};
5392
5393void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2
TL
5394 const PGMap& pgmap,
5395 ostream& out,
5396 Formatter *f,
5397 bool tree,
5398 const string& class_name,
5399 const string& item_name)
31f18b77
FG
5400{
5401 const CrushWrapper *crush = osdmap.crush.get();
5402 if (f) {
5403 f->open_object_section("df");
11fdf7f2
TL
5404 OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree,
5405 class_name, item_name);
31f18b77
FG
5406 d.dump(f);
5407 d.summary(f);
5408 f->close_section();
5409 f->flush(out);
5410 } else {
11fdf7f2
TL
5411 OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree,
5412 class_name, item_name);
31f18b77
FG
5413 TextTable tbl;
5414 d.dump(&tbl);
5415 out << tbl << d.summary() << "\n";
5416 }
5417}
224ce89b 5418
92f5a8d4
TL
5419void OSDMap::check_health(CephContext *cct,
5420 health_check_map_t *checks) const
224ce89b
WB
5421{
5422 int num_osds = get_num_osds();
5423
5424 // OSD_DOWN
5425 // OSD_$subtree_DOWN
5426 // OSD_ORPHAN
5427 if (num_osds >= 0) {
5428 int num_in_osds = 0;
5429 int num_down_in_osds = 0;
5430 set<int> osds;
5431 set<int> down_in_osds;
5432 set<int> up_in_osds;
5433 set<int> subtree_up;
5434 unordered_map<int, set<int> > subtree_type_down;
5435 unordered_map<int, int> num_osds_subtree;
5436 int max_type = crush->get_max_type_id();
5437
5438 for (int i = 0; i < get_max_osd(); i++) {
5439 if (!exists(i)) {
5440 if (crush->item_exists(i)) {
5441 osds.insert(i);
5442 }
5443 continue;
5444 }
5445 if (is_out(i))
5446 continue;
5447 ++num_in_osds;
5448 if (down_in_osds.count(i) || up_in_osds.count(i))
5449 continue;
5450 if (!is_up(i)) {
5451 down_in_osds.insert(i);
5452 int parent_id = 0;
5453 int current = i;
5454 for (int type = 0; type <= max_type; type++) {
5455 if (!crush->get_type_name(type))
5456 continue;
5457 int r = crush->get_immediate_parent_id(current, &parent_id);
5458 if (r == -ENOENT)
5459 break;
5460 // break early if this parent is already marked as up
5461 if (subtree_up.count(parent_id))
5462 break;
5463 type = crush->get_bucket_type(parent_id);
5464 if (!subtree_type_is_down(
92f5a8d4 5465 cct, parent_id, type,
224ce89b
WB
5466 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
5467 break;
5468 current = parent_id;
5469 }
5470 }
5471 }
5472
5473 // calculate the number of down osds in each down subtree and
5474 // store it in num_osds_subtree
5475 for (int type = 1; type <= max_type; type++) {
5476 if (!crush->get_type_name(type))
5477 continue;
5478 for (auto j = subtree_type_down[type].begin();
5479 j != subtree_type_down[type].end();
5480 ++j) {
5481 list<int> children;
5482 int num = 0;
5483 int num_children = crush->get_children(*j, &children);
5484 if (num_children == 0)
5485 continue;
5486 for (auto l = children.begin(); l != children.end(); ++l) {
5487 if (*l >= 0) {
5488 ++num;
5489 } else if (num_osds_subtree[*l] > 0) {
5490 num = num + num_osds_subtree[*l];
5491 }
5492 }
5493 num_osds_subtree[*j] = num;
5494 }
5495 }
5496 num_down_in_osds = down_in_osds.size();
11fdf7f2 5497 ceph_assert(num_down_in_osds <= num_in_osds);
224ce89b
WB
5498 if (num_down_in_osds > 0) {
5499 // summary of down subtree types and osds
5500 for (int type = max_type; type > 0; type--) {
5501 if (!crush->get_type_name(type))
5502 continue;
5503 if (subtree_type_down[type].size() > 0) {
5504 ostringstream ss;
5505 ss << subtree_type_down[type].size() << " "
5506 << crush->get_type_name(type);
5507 if (subtree_type_down[type].size() > 1) {
5508 ss << "s";
5509 }
5510 int sum_down_osds = 0;
5511 for (auto j = subtree_type_down[type].begin();
5512 j != subtree_type_down[type].end();
5513 ++j) {
5514 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
5515 }
5516 ss << " (" << sum_down_osds << " osds) down";
5517 string err = string("OSD_") +
5518 string(crush->get_type_name(type)) + "_DOWN";
5519 boost::to_upper(err);
5520 auto& d = checks->add(err, HEALTH_WARN, ss.str());
5521 for (auto j = subtree_type_down[type].rbegin();
5522 j != subtree_type_down[type].rend();
5523 ++j) {
5524 ostringstream ss;
5525 ss << crush->get_type_name(type);
5526 ss << " ";
5527 ss << crush->get_item_name(*j);
5528 // at the top level, do not print location
5529 if (type != max_type) {
5530 ss << " (";
5531 ss << crush->get_full_location_ordered_string(*j);
5532 ss << ")";
5533 }
5534 int num = num_osds_subtree[*j];
5535 ss << " (" << num << " osds)";
5536 ss << " is down";
5537 d.detail.push_back(ss.str());
5538 }
5539 }
5540 }
5541 ostringstream ss;
5542 ss << down_in_osds.size() << " osds down";
5543 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
5544 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
5545 ostringstream ss;
5546 ss << "osd." << *it << " (";
5547 ss << crush->get_full_location_ordered_string(*it);
5548 ss << ") is down";
5549 d.detail.push_back(ss.str());
5550 }
5551 }
5552
5553 if (!osds.empty()) {
5554 ostringstream ss;
5555 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
5556 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
5557 for (auto osd : osds) {
5558 ostringstream ss;
5559 ss << "osd." << osd << " exists in crush map but not in osdmap";
5560 d.detail.push_back(ss.str());
5561 }
5562 }
5563 }
5564
eafe8130
TL
5565 std::list<std::string> scrub_messages;
5566 bool noscrub = false, nodeepscrub = false;
5567 for (const auto &p : pools) {
5568 if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
5569 ostringstream ss;
5570 ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
5571 scrub_messages.push_back(ss.str());
5572 noscrub = true;
5573 }
5574 if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
5575 ostringstream ss;
5576 ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
5577 scrub_messages.push_back(ss.str());
5578 nodeepscrub = true;
5579 }
5580 }
5581 if (noscrub || nodeepscrub) {
5582 string out = "";
5583 out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
5584 out += nodeepscrub ? "nodeep-scrub" : "";
5585 auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
5586 "Some pool(s) have the " + out + " flag(s) set");
5587 d.detail.splice(d.detail.end(), scrub_messages);
5588 }
5589
224ce89b
WB
5590 // OSD_OUT_OF_ORDER_FULL
5591 {
5592 // An osd could configure failsafe ratio, to something different
5593 // but for now assume it is the same here.
92f5a8d4 5594 float fsr = cct->_conf->osd_failsafe_full_ratio;
224ce89b
WB
5595 if (fsr > 1.0) fsr /= 100;
5596 float fr = get_full_ratio();
5597 float br = get_backfillfull_ratio();
5598 float nr = get_nearfull_ratio();
5599
5600 list<string> detail;
5601 // These checks correspond to how OSDService::check_full_status() in an OSD
5602 // handles the improper setting of these values.
5603 if (br < nr) {
5604 ostringstream ss;
5605 ss << "backfillfull_ratio (" << br
5606 << ") < nearfull_ratio (" << nr << "), increased";
5607 detail.push_back(ss.str());
5608 br = nr;
5609 }
5610 if (fr < br) {
5611 ostringstream ss;
5612 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
5613 << "), increased";
5614 detail.push_back(ss.str());
5615 fr = br;
5616 }
5617 if (fsr < fr) {
5618 ostringstream ss;
5619 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
5620 << "), increased";
5621 detail.push_back(ss.str());
5622 }
5623 if (!detail.empty()) {
5624 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
5625 "full ratio(s) out of order");
5626 d.detail.swap(detail);
5627 }
5628 }
5629
5630 // OSD_FULL
5631 // OSD_NEARFULL
5632 // OSD_BACKFILLFULL
5633 // OSD_FAILSAFE_FULL
5634 {
5635 set<int> full, backfillfull, nearfull;
5636 get_full_osd_counts(&full, &backfillfull, &nearfull);
5637 if (full.size()) {
5638 ostringstream ss;
5639 ss << full.size() << " full osd(s)";
5640 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
5641 for (auto& i: full) {
5642 ostringstream ss;
5643 ss << "osd." << i << " is full";
5644 d.detail.push_back(ss.str());
5645 }
5646 }
5647 if (backfillfull.size()) {
5648 ostringstream ss;
5649 ss << backfillfull.size() << " backfillfull osd(s)";
5650 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
5651 for (auto& i: backfillfull) {
5652 ostringstream ss;
5653 ss << "osd." << i << " is backfill full";
5654 d.detail.push_back(ss.str());
5655 }
5656 }
5657 if (nearfull.size()) {
5658 ostringstream ss;
5659 ss << nearfull.size() << " nearfull osd(s)";
5660 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
5661 for (auto& i: nearfull) {
5662 ostringstream ss;
5663 ss << "osd." << i << " is near full";
5664 d.detail.push_back(ss.str());
5665 }
5666 }
5667 }
5668
5669 // OSDMAP_FLAGS
5670 {
5671 // warn about flags
5672 uint64_t warn_flags =
3efd9988 5673 CEPH_OSDMAP_NEARFULL |
224ce89b
WB
5674 CEPH_OSDMAP_FULL |
5675 CEPH_OSDMAP_PAUSERD |
5676 CEPH_OSDMAP_PAUSEWR |
5677 CEPH_OSDMAP_PAUSEREC |
5678 CEPH_OSDMAP_NOUP |
5679 CEPH_OSDMAP_NODOWN |
5680 CEPH_OSDMAP_NOIN |
5681 CEPH_OSDMAP_NOOUT |
5682 CEPH_OSDMAP_NOBACKFILL |
5683 CEPH_OSDMAP_NORECOVER |
5684 CEPH_OSDMAP_NOSCRUB |
5685 CEPH_OSDMAP_NODEEP_SCRUB |
5686 CEPH_OSDMAP_NOTIERAGENT |
11fdf7f2 5687 CEPH_OSDMAP_NOSNAPTRIM |
224ce89b
WB
5688 CEPH_OSDMAP_NOREBALANCE;
5689 if (test_flag(warn_flags)) {
5690 ostringstream ss;
5691 ss << get_flag_string(get_flags() & warn_flags)
5692 << " flag(s) set";
5693 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
5694 }
5695 }
5696
5697 // OSD_FLAGS
5698 {
5699 list<string> detail;
5700 const unsigned flags =
5701 CEPH_OSD_NOUP |
5702 CEPH_OSD_NOIN |
5703 CEPH_OSD_NODOWN |
5704 CEPH_OSD_NOOUT;
5705 for (int i = 0; i < max_osd; ++i) {
5706 if (osd_state[i] & flags) {
5707 ostringstream ss;
5708 set<string> states;
5709 OSDMap::calc_state_set(osd_state[i] & flags, states);
5710 ss << "osd." << i << " has flags " << states;
5711 detail.push_back(ss.str());
5712 }
5713 }
81eedcae
TL
5714 for (auto& i : crush_node_flags) {
5715 if (i.second && crush->item_exists(i.first)) {
5716 ostringstream ss;
5717 set<string> states;
5718 OSDMap::calc_state_set(i.second, states);
5719 int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
5720 const char *tn = crush->get_type_name(t);
5721 ss << (tn ? tn : "node") << " "
5722 << crush->get_item_name(i.first) << " has flags " << states;
5723 detail.push_back(ss.str());
5724 }
5725 }
5726 for (auto& i : device_class_flags) {
5727 const char* class_name = crush->get_class_name(i.first);
5728 if (i.second && class_name) {
5729 ostringstream ss;
5730 set<string> states;
5731 OSDMap::calc_state_set(i.second, states);
5732 ss << "device class '" << class_name << "' has flags " << states;
5733 detail.push_back(ss.str());
5734 }
5735 }
224ce89b
WB
5736 if (!detail.empty()) {
5737 ostringstream ss;
81eedcae 5738 ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
224ce89b
WB
5739 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
5740 d.detail.swap(detail);
5741 }
5742 }
5743
5744 // OLD_CRUSH_TUNABLES
92f5a8d4 5745 if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
224ce89b 5746 string min = crush->get_min_required_version();
92f5a8d4 5747 if (min < cct->_conf->mon_crush_min_required_version) {
224ce89b
WB
5748 ostringstream ss;
5749 ss << "crush map has legacy tunables (require " << min
92f5a8d4 5750 << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
224ce89b
WB
5751 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
5752 d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5753 }
5754 }
5755
5756 // OLD_CRUSH_STRAW_CALC_VERSION
92f5a8d4 5757 if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
224ce89b
WB
5758 if (crush->get_straw_calc_version() == 0) {
5759 ostringstream ss;
5760 ss << "crush map has straw_calc_version=0";
5761 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
5762 d.detail.push_back(
5763 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5764 }
5765 }
5766
5767 // CACHE_POOL_NO_HIT_SET
92f5a8d4 5768 if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
224ce89b
WB
5769 list<string> detail;
5770 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5771 p != pools.end();
5772 ++p) {
5773 const pg_pool_t& info = p->second;
5774 if (info.cache_mode_requires_hit_set() &&
5775 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
5776 ostringstream ss;
5777 ss << "pool '" << get_pool_name(p->first)
5778 << "' with cache_mode " << info.get_cache_mode_name()
5779 << " needs hit_set_type to be set but it is not";
5780 detail.push_back(ss.str());
5781 }
5782 }
5783 if (!detail.empty()) {
5784 ostringstream ss;
5785 ss << detail.size() << " cache pools are missing hit_sets";
5786 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
5787 d.detail.swap(detail);
5788 }
5789 }
5790
5791 // OSD_NO_SORTBITWISE
11fdf7f2 5792 if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
224ce89b 5793 ostringstream ss;
11fdf7f2 5794 ss << "'sortbitwise' flag is not set";
224ce89b
WB
5795 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
5796 }
5797
5798 // OSD_UPGRADE_FINISHED
5799 // none of these (yet) since we don't run until luminous upgrade is done.
5800
3efd9988 5801 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 5802 {
3efd9988 5803 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
5804 for (auto it : get_pools()) {
5805 const pg_pool_t &pool = it.second;
3efd9988 5806 const string& pool_name = get_pool_name(it.first);
224ce89b 5807 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 5808 stringstream ss;
11fdf7f2 5809 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
3efd9988
FG
5810 // may run out of space too,
5811 // but we want EQUOTA taking precedence
11fdf7f2 5812 ss << "pool '" << pool_name << "' is full (running out of quota)";
3efd9988
FG
5813 } else {
5814 ss << "pool '" << pool_name << "' is full (no space)";
5815 }
5816 full_detail.push_back(ss.str());
5817 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
5818 stringstream ss;
5819 ss << "pool '" << pool_name << "' is backfillfull";
5820 backfillfull_detail.push_back(ss.str());
5821 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
5822 stringstream ss;
5823 ss << "pool '" << pool_name << "' is nearfull";
5824 nearfull_detail.push_back(ss.str());
224ce89b
WB
5825 }
5826 }
3efd9988 5827 if (!full_detail.empty()) {
224ce89b 5828 ostringstream ss;
3efd9988 5829 ss << full_detail.size() << " pool(s) full";
224ce89b 5830 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
3efd9988
FG
5831 d.detail.swap(full_detail);
5832 }
5833 if (!backfillfull_detail.empty()) {
5834 ostringstream ss;
5835 ss << backfillfull_detail.size() << " pool(s) backfillfull";
5836 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
5837 d.detail.swap(backfillfull_detail);
5838 }
5839 if (!nearfull_detail.empty()) {
5840 ostringstream ss;
5841 ss << nearfull_detail.size() << " pool(s) nearfull";
5842 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
5843 d.detail.swap(nearfull_detail);
224ce89b
WB
5844 }
5845 }
92f5a8d4
TL
5846
5847 // POOL_PG_NUM_NOT_POWER_OF_TWO
5848 if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
5849 list<string> detail;
5850 for (auto it : get_pools()) {
5851 if (!isp2(it.second.get_pg_num_target())) {
5852 ostringstream ss;
5853 ss << "pool '" << get_pool_name(it.first)
5854 << "' pg_num " << it.second.get_pg_num_target()
5855 << " is not a power of two";
5856 detail.push_back(ss.str());
5857 }
5858 }
5859 if (!detail.empty()) {
5860 ostringstream ss;
5861 ss << detail.size() << " pool(s) have non-power-of-two pg_num";
5862 auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
5863 ss.str());
5864 d.detail.swap(detail);
5865 }
5866 }
224ce89b 5867}
35e4c445
FG
5868
5869int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
5870 ostream *ss) const
5871{
5872 out->clear();
5873 for (auto i = ls.begin(); i != ls.end(); ++i) {
5874 if (i == ls.begin() &&
5875 (*i == "any" || *i == "all" || *i == "*")) {
5876 get_all_osds(*out);
5877 break;
5878 }
5879 long osd = parse_osd_id(i->c_str(), ss);
5880 if (osd < 0) {
5881 *ss << "invalid osd id '" << *i << "'";
5882 return -EINVAL;
5883 }
5884 out->insert(osd);
5885 }
5886 return 0;
5887}
11fdf7f2
TL
5888
5889void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
5890 string &subtree,
5891 int limit, // how many
5892 set<int> skip,
5893 set<int> *want) const {
5894 if (limit <= 0)
5895 return;
5896 int subtree_type = crush->get_type_id(subtree);
5897 if (subtree_type < 1)
5898 return;
5899 vector<int> subtrees;
5900 crush->get_subtree_of_type(subtree_type, &subtrees);
5901 std::random_device rd;
5902 std::default_random_engine rng{rd()};
5903 std::shuffle(subtrees.begin(), subtrees.end(), rng);
5904 for (auto s : subtrees) {
5905 if (limit <= 0)
5906 break;
5907 if (crush->subtree_contains(s, n))
5908 continue;
5909 vector<int> osds;
5910 crush->get_children_of_type(s, 0, &osds);
5911 if (osds.empty())
5912 continue;
5913 vector<int> up_osds;
5914 for (auto o : osds) {
5915 if (is_up(o) && !skip.count(o))
5916 up_osds.push_back(o);
5917 }
5918 if (up_osds.empty())
5919 continue;
5920 auto it = up_osds.begin();
5921 std::advance(it, (n % up_osds.size()));
5922 want->insert(*it);
5923 --limit;
5924 }
5925}
5926
5927float OSDMap::pool_raw_used_rate(int64_t poolid) const
5928{
5929 const pg_pool_t *pool = get_pg_pool(poolid);
5930 assert(pool != nullptr);
5931
5932 switch (pool->get_type()) {
5933 case pg_pool_t::TYPE_REPLICATED:
5934 return pool->get_size();
5935 break;
5936 case pg_pool_t::TYPE_ERASURE:
5937 {
5938 auto& ecp =
5939 get_erasure_code_profile(pool->erasure_code_profile);
5940 auto pm = ecp.find("m");
5941 auto pk = ecp.find("k");
5942 if (pm != ecp.end() && pk != ecp.end()) {
5943 int k = atoi(pk->second.c_str());
5944 int m = atoi(pm->second.c_str());
5945 int mk = m + k;
5946 ceph_assert(mk != 0);
5947 ceph_assert(k != 0);
5948 return (float)mk / k;
5949 } else {
5950 return 0.0;
5951 }
5952 }
5953 break;
5954 default:
5955 ceph_abort_msg("unrecognized pool type");
5956 }
5957}
81eedcae
TL
5958
5959unsigned OSDMap::get_osd_crush_node_flags(int osd) const
5960{
5961 unsigned flags = 0;
5962 if (!crush_node_flags.empty()) {
5963 // the map will contain type -> name
5964 std::map<std::string,std::string> ploc = crush->get_full_location(osd);
5965 for (auto& i : ploc) {
5966 int id = crush->get_item_id(i.second);
5967 auto p = crush_node_flags.find(id);
5968 if (p != crush_node_flags.end()) {
5969 flags |= p->second;
5970 }
5971 }
5972 }
5973 return flags;
5974}
5975
5976unsigned OSDMap::get_crush_node_flags(int id) const
5977{
5978 unsigned flags = 0;
5979 auto it = crush_node_flags.find(id);
5980 if (it != crush_node_flags.end())
5981 flags = it->second;
5982 return flags;
5983}
5984
5985unsigned OSDMap::get_device_class_flags(int id) const
5986{
5987 unsigned flags = 0;
5988 auto it = device_class_flags.find(id);
5989 if (it != device_class_flags.end())
5990 flags = it->second;
5991 return flags;
5992}