]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.cc
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / osd / OSDMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
224ce89b
WB
18#include <boost/algorithm/string.hpp>
19
7c673cae
FG
20#include "OSDMap.h"
21#include <algorithm>
22#include "common/config.h"
3efd9988 23#include "common/errno.h"
7c673cae
FG
24#include "common/Formatter.h"
25#include "common/TextTable.h"
26#include "include/ceph_features.h"
27#include "include/str_map.h"
28
29#include "common/code_environment.h"
224ce89b 30#include "mon/health_check.h"
7c673cae
FG
31
32#include "crush/CrushTreeDumper.h"
33#include "common/Clock.h"
31f18b77 34#include "mon/PGStatService.h"
7c673cae
FG
35
// Selects ceph_subsys_osd as this file's dout_subsys.
// NOTE(review): presumably consumed by the ldout/lderr logging macros
// used further down -- confirm against the debug headers.
36#define dout_subsys ceph_subsys_osd
37
// NOTE(review): these look like they register mempool-backed object
// factories for OSDMap and OSDMap::Incremental; the macro definition is
// not visible here, so verify the meaning of the second/third arguments.
38MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
39MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
40
41
42// ----------------------------------
43// osd_info_t
44
45void osd_info_t::dump(Formatter *f) const
46{
47 f->dump_int("last_clean_begin", last_clean_begin);
48 f->dump_int("last_clean_end", last_clean_end);
49 f->dump_int("up_from", up_from);
50 f->dump_int("up_thru", up_thru);
51 f->dump_int("down_at", down_at);
52 f->dump_int("lost_at", lost_at);
53}
54
55void osd_info_t::encode(bufferlist& bl) const
56{
57 __u8 struct_v = 1;
58 ::encode(struct_v, bl);
59 ::encode(last_clean_begin, bl);
60 ::encode(last_clean_end, bl);
61 ::encode(up_from, bl);
62 ::encode(up_thru, bl);
63 ::encode(down_at, bl);
64 ::encode(lost_at, bl);
65}
66
67void osd_info_t::decode(bufferlist::iterator& bl)
68{
69 __u8 struct_v;
70 ::decode(struct_v, bl);
71 ::decode(last_clean_begin, bl);
72 ::decode(last_clean_end, bl);
73 ::decode(up_from, bl);
74 ::decode(up_thru, bl);
75 ::decode(down_at, bl);
76 ::decode(lost_at, bl);
77}
78
79void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
80{
81 o.push_back(new osd_info_t);
82 o.push_back(new osd_info_t);
83 o.back()->last_clean_begin = 1;
84 o.back()->last_clean_end = 2;
85 o.back()->up_from = 30;
86 o.back()->up_thru = 40;
87 o.back()->down_at = 5;
88 o.back()->lost_at = 6;
89}
90
91ostream& operator<<(ostream& out, const osd_info_t& info)
92{
93 out << "up_from " << info.up_from
94 << " up_thru " << info.up_thru
95 << " down_at " << info.down_at
96 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
97 if (info.lost_at)
98 out << " lost_at " << info.lost_at;
99 return out;
100}
101
102// ----------------------------------
103// osd_xinfo_t
104
105void osd_xinfo_t::dump(Formatter *f) const
106{
107 f->dump_stream("down_stamp") << down_stamp;
108 f->dump_float("laggy_probability", laggy_probability);
109 f->dump_int("laggy_interval", laggy_interval);
110 f->dump_int("features", features);
111 f->dump_unsigned("old_weight", old_weight);
112}
113
114void osd_xinfo_t::encode(bufferlist& bl) const
115{
116 ENCODE_START(3, 1, bl);
117 ::encode(down_stamp, bl);
118 __u32 lp = laggy_probability * 0xfffffffful;
119 ::encode(lp, bl);
120 ::encode(laggy_interval, bl);
121 ::encode(features, bl);
122 ::encode(old_weight, bl);
123 ENCODE_FINISH(bl);
124}
125
126void osd_xinfo_t::decode(bufferlist::iterator& bl)
127{
128 DECODE_START(3, bl);
129 ::decode(down_stamp, bl);
130 __u32 lp;
131 ::decode(lp, bl);
132 laggy_probability = (float)lp / (float)0xffffffff;
133 ::decode(laggy_interval, bl);
134 if (struct_v >= 2)
135 ::decode(features, bl);
136 else
137 features = 0;
138 if (struct_v >= 3)
139 ::decode(old_weight, bl);
140 else
141 old_weight = 0;
142 DECODE_FINISH(bl);
143}
144
145void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
146{
147 o.push_back(new osd_xinfo_t);
148 o.push_back(new osd_xinfo_t);
149 o.back()->down_stamp = utime_t(2, 3);
150 o.back()->laggy_probability = .123;
151 o.back()->laggy_interval = 123456;
152 o.back()->old_weight = 0x7fff;
153}
154
155ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
156{
157 return out << "down_stamp " << xi.down_stamp
158 << " laggy_probability " << xi.laggy_probability
159 << " laggy_interval " << xi.laggy_interval
160 << " old_weight " << xi.old_weight;
161}
162
163// ----------------------------------
164// OSDMap::Incremental
165
166int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
167{
168 int n = 0;
169 for (auto &weight : new_weight) {
170 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
171 n++; // marked out
172 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
173 n--; // marked in
174 }
175 return n;
176}
177
178int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
179{
180 int n = 0;
181 for (auto &state : new_state) { //
182 if (state.second & CEPH_OSD_UP) {
183 if (previous->is_up(state.first))
184 n++; // marked down
185 else
186 n--; // marked up
187 }
188 }
189 return n;
190}
191
192int OSDMap::Incremental::identify_osd(uuid_d u) const
193{
194 for (auto &uuid : new_uuid)
195 if (uuid.second == u)
196 return uuid.first;
197 return -1;
198}
199
200int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
201 const OSDMap& osdmap)
202{
203 assert(epoch == osdmap.get_epoch() + 1);
204
205 for (auto &new_pool : new_pools) {
206 if (!new_pool.second.tiers.empty()) {
207 pg_pool_t& base = new_pool.second;
208
209 for (const auto &tier_pool : base.tiers) {
210 const auto &r = new_pools.find(tier_pool);
211 pg_pool_t *tier = 0;
212 if (r == new_pools.end()) {
213 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
214 if (!orig) {
215 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
216 return -EIO;
217 }
218 tier = get_new_pool(tier_pool, orig);
219 } else {
220 tier = &r->second;
221 }
222 if (tier->tier_of != new_pool.first) {
223 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
224 return -EIO;
225 }
226
227 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
228 << tier_pool << dendl;
229 tier->snap_seq = base.snap_seq;
230 tier->snap_epoch = base.snap_epoch;
231 tier->snaps = base.snaps;
232 tier->removed_snaps = base.removed_snaps;
233 }
234 }
235 }
236 return 0;
237}
238
28e407b8
AA
239// ----------------------------------
240// OSDMap
7c673cae
FG
241
242bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
243{
244 if (id >= 0)
245 return is_down(id);
246
247 if (down_cache &&
248 down_cache->count(id)) {
249 return true;
250 }
251
252 list<int> children;
253 crush->get_children(id, &children);
254 for (const auto &child : children) {
255 if (!subtree_is_down(child, down_cache)) {
256 return false;
257 }
258 }
259 if (down_cache) {
260 down_cache->insert(id);
261 }
262 return true;
263}
264
265bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
266{
267 // use a stack-local down_cache if we didn't get one from the
268 // caller. then at least this particular call will avoid duplicated
269 // work.
270 set<int> local_down_cache;
271 if (!down_cache) {
272 down_cache = &local_down_cache;
273 }
274
275 int current = id;
276 while (true) {
277 int type;
278 if (current >= 0) {
279 type = 0;
280 } else {
281 type = crush->get_bucket_type(current);
282 }
283 assert(type >= 0);
284
285 if (!subtree_is_down(current, down_cache)) {
286 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
287 return false;
288 }
289
290 // is this a big enough subtree to be marked as down?
291 if (type >= subtree_type) {
292 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
293 return true;
294 }
295
296 int r = crush->get_immediate_parent_id(current, &current);
297 if (r < 0) {
298 return false;
299 }
300 }
301}
302
224ce89b
WB
303bool OSDMap::subtree_type_is_down(
304 CephContext *cct,
305 int id,
306 int subtree_type,
307 set<int> *down_in_osds,
308 set<int> *up_in_osds,
309 set<int> *subtree_up,
310 unordered_map<int, set<int> > *subtree_type_down) const
31f18b77
FG
311{
312 if (id >= 0) {
313 bool is_down_ret = is_down(id);
314 if (!is_out(id)) {
315 if (is_down_ret) {
316 down_in_osds->insert(id);
317 } else {
318 up_in_osds->insert(id);
319 }
320 }
321 return is_down_ret;
322 }
323
324 if (subtree_type_down &&
325 (*subtree_type_down)[subtree_type].count(id)) {
326 return true;
327 }
328
329 list<int> children;
330 crush->get_children(id, &children);
331 for (const auto &child : children) {
224ce89b
WB
332 if (!subtree_type_is_down(
333 cct, child, crush->get_bucket_type(child),
334 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
31f18b77
FG
335 subtree_up->insert(id);
336 return false;
337 }
338 }
339 if (subtree_type_down) {
340 (*subtree_type_down)[subtree_type].insert(id);
341 }
342 return true;
343}
344
7c673cae
FG
345void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
346{
347 __u16 v = 5;
348 ::encode(v, bl);
349 ::encode(fsid, bl);
350 ::encode(epoch, bl);
351 ::encode(modified, bl);
352 int32_t new_t = new_pool_max;
353 ::encode(new_t, bl);
354 ::encode(new_flags, bl);
355 ::encode(fullmap, bl);
356 ::encode(crush, bl);
357
358 ::encode(new_max_osd, bl);
359 // for ::encode(new_pools, bl);
360 __u32 n = new_pools.size();
361 ::encode(n, bl);
362 for (const auto &new_pool : new_pools) {
363 n = new_pool.first;
364 ::encode(n, bl);
365 ::encode(new_pool.second, bl, 0);
366 }
367 // for ::encode(new_pool_names, bl);
368 n = new_pool_names.size();
369 ::encode(n, bl);
370
371 for (const auto &new_pool_name : new_pool_names) {
372 n = new_pool_name.first;
373 ::encode(n, bl);
374 ::encode(new_pool_name.second, bl);
375 }
376 // for ::encode(old_pools, bl);
377 n = old_pools.size();
378 ::encode(n, bl);
379 for (auto &old_pool : old_pools) {
380 n = old_pool;
381 ::encode(n, bl);
382 }
383 ::encode(new_up_client, bl, 0);
31f18b77
FG
384 {
385 // legacy is map<int32_t,uint8_t>
386 uint32_t n = new_state.size();
387 ::encode(n, bl);
388 for (auto p : new_state) {
389 ::encode(p.first, bl);
390 ::encode((uint8_t)p.second, bl);
391 }
392 }
7c673cae
FG
393 ::encode(new_weight, bl);
394 // for ::encode(new_pg_temp, bl);
395 n = new_pg_temp.size();
396 ::encode(n, bl);
397
398 for (const auto &pg_temp : new_pg_temp) {
399 old_pg_t opg = pg_temp.first.get_old_pg();
400 ::encode(opg, bl);
401 ::encode(pg_temp.second, bl);
402 }
403}
404
405void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
406{
407 if ((features & CEPH_FEATURE_PGID64) == 0) {
408 encode_client_old(bl);
409 return;
410 }
411
412 // base
413 __u16 v = 6;
414 ::encode(v, bl);
415 ::encode(fsid, bl);
416 ::encode(epoch, bl);
417 ::encode(modified, bl);
418 ::encode(new_pool_max, bl);
419 ::encode(new_flags, bl);
420 ::encode(fullmap, bl);
421 ::encode(crush, bl);
422
423 ::encode(new_max_osd, bl);
424 ::encode(new_pools, bl, features);
425 ::encode(new_pool_names, bl);
426 ::encode(old_pools, bl);
427 ::encode(new_up_client, bl, features);
31f18b77
FG
428 {
429 uint32_t n = new_state.size();
430 ::encode(n, bl);
431 for (auto p : new_state) {
432 ::encode(p.first, bl);
433 ::encode((uint8_t)p.second, bl);
434 }
435 }
7c673cae
FG
436 ::encode(new_weight, bl);
437 ::encode(new_pg_temp, bl);
438
439 // extended
440 __u16 ev = 10;
441 ::encode(ev, bl);
442 ::encode(new_hb_back_up, bl, features);
443 ::encode(new_up_thru, bl);
444 ::encode(new_last_clean_interval, bl);
445 ::encode(new_lost, bl);
446 ::encode(new_blacklist, bl, features);
447 ::encode(old_blacklist, bl, features);
448 ::encode(new_up_cluster, bl, features);
449 ::encode(cluster_snapshot, bl);
450 ::encode(new_uuid, bl);
451 ::encode(new_xinfo, bl);
452 ::encode(new_hb_front_up, bl, features);
453}
454
455void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
456{
457 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
458 encode_classic(bl, features);
459 return;
460 }
461
462 // only a select set of callers should *ever* be encoding new
463 // OSDMaps. others should be passing around the canonical encoded
464 // buffers from on high. select out those callers by passing in an
465 // "impossible" feature bit.
466 assert(features & CEPH_FEATURE_RESERVED);
467 features &= ~CEPH_FEATURE_RESERVED;
468
469 size_t start_offset = bl.length();
470 size_t tail_offset;
471 buffer::list::iterator crc_it;
472
473 // meta-encoding: how we include client-used and osd-specific data
474 ENCODE_START(8, 7, bl);
475
476 {
31f18b77 477 uint8_t v = 5;
7c673cae
FG
478 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
479 v = 3;
480 }
481 ENCODE_START(v, 1, bl); // client-usable data
482 ::encode(fsid, bl);
483 ::encode(epoch, bl);
484 ::encode(modified, bl);
485 ::encode(new_pool_max, bl);
486 ::encode(new_flags, bl);
487 ::encode(fullmap, bl);
488 ::encode(crush, bl);
489
490 ::encode(new_max_osd, bl);
491 ::encode(new_pools, bl, features);
492 ::encode(new_pool_names, bl);
493 ::encode(old_pools, bl);
494 ::encode(new_up_client, bl, features);
31f18b77
FG
495 if (v >= 5) {
496 ::encode(new_state, bl);
497 } else {
498 uint32_t n = new_state.size();
499 ::encode(n, bl);
500 for (auto p : new_state) {
501 ::encode(p.first, bl);
502 ::encode((uint8_t)p.second, bl);
503 }
504 }
7c673cae
FG
505 ::encode(new_weight, bl);
506 ::encode(new_pg_temp, bl);
507 ::encode(new_primary_temp, bl);
508 ::encode(new_primary_affinity, bl);
509 ::encode(new_erasure_code_profiles, bl);
510 ::encode(old_erasure_code_profiles, bl);
511 if (v >= 4) {
512 ::encode(new_pg_upmap, bl);
513 ::encode(old_pg_upmap, bl);
514 ::encode(new_pg_upmap_items, bl);
515 ::encode(old_pg_upmap_items, bl);
516 }
517 ENCODE_FINISH(bl); // client-usable data
518 }
519
520 {
31f18b77 521 uint8_t target_v = 6;
7c673cae
FG
522 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
523 target_v = 2;
524 }
525 ENCODE_START(target_v, 1, bl); // extended, osd-only data
526 ::encode(new_hb_back_up, bl, features);
527 ::encode(new_up_thru, bl);
528 ::encode(new_last_clean_interval, bl);
529 ::encode(new_lost, bl);
530 ::encode(new_blacklist, bl, features);
531 ::encode(old_blacklist, bl, features);
532 ::encode(new_up_cluster, bl, features);
533 ::encode(cluster_snapshot, bl);
534 ::encode(new_uuid, bl);
535 ::encode(new_xinfo, bl);
536 ::encode(new_hb_front_up, bl, features);
537 ::encode(features, bl); // NOTE: features arg, not the member
538 if (target_v >= 3) {
539 ::encode(new_nearfull_ratio, bl);
540 ::encode(new_full_ratio, bl);
541 ::encode(new_backfillfull_ratio, bl);
31f18b77
FG
542 }
543 // 5 was string-based new_require_min_compat_client
544 if (target_v >= 6) {
7c673cae 545 ::encode(new_require_min_compat_client, bl);
31f18b77 546 ::encode(new_require_osd_release, bl);
7c673cae
FG
547 }
548 ENCODE_FINISH(bl); // osd-only data
549 }
550
551 ::encode((uint32_t)0, bl); // dummy inc_crc
552 crc_it = bl.end();
553 crc_it.advance(-4);
554 tail_offset = bl.length();
555
556 ::encode(full_crc, bl);
557
558 ENCODE_FINISH(bl); // meta-encoding wrapper
559
560 // fill in crc
561 bufferlist front;
562 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
563 inc_crc = front.crc32c(-1);
564 bufferlist tail;
565 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
566 inc_crc = tail.crc32c(inc_crc);
567 ceph_le32 crc_le;
568 crc_le = inc_crc;
569 crc_it.copy_in(4, (char*)&crc_le);
570 have_crc = true;
571}
572
573void OSDMap::Incremental::decode_classic(bufferlist::iterator &p)
574{
575 __u32 n, t;
576 // base
577 __u16 v;
578 ::decode(v, p);
579 ::decode(fsid, p);
580 ::decode(epoch, p);
581 ::decode(modified, p);
582 if (v == 4 || v == 5) {
583 ::decode(n, p);
584 new_pool_max = n;
585 } else if (v >= 6)
586 ::decode(new_pool_max, p);
587 ::decode(new_flags, p);
588 ::decode(fullmap, p);
589 ::decode(crush, p);
590
591 ::decode(new_max_osd, p);
592 if (v < 6) {
593 new_pools.clear();
594 ::decode(n, p);
595 while (n--) {
596 ::decode(t, p);
597 ::decode(new_pools[t], p);
598 }
599 } else {
600 ::decode(new_pools, p);
601 }
602 if (v == 5) {
603 new_pool_names.clear();
604 ::decode(n, p);
605 while (n--) {
606 ::decode(t, p);
607 ::decode(new_pool_names[t], p);
608 }
609 } else if (v >= 6) {
610 ::decode(new_pool_names, p);
611 }
612 if (v < 6) {
613 old_pools.clear();
614 ::decode(n, p);
615 while (n--) {
616 ::decode(t, p);
617 old_pools.insert(t);
618 }
619 } else {
620 ::decode(old_pools, p);
621 }
622 ::decode(new_up_client, p);
31f18b77
FG
623 {
624 map<int32_t,uint8_t> ns;
625 ::decode(ns, p);
626 for (auto q : ns) {
627 new_state[q.first] = q.second;
628 }
629 }
7c673cae
FG
630 ::decode(new_weight, p);
631
632 if (v < 6) {
633 new_pg_temp.clear();
634 ::decode(n, p);
635 while (n--) {
636 old_pg_t opg;
637 ::decode_raw(opg, p);
638 ::decode(new_pg_temp[pg_t(opg)], p);
639 }
640 } else {
641 ::decode(new_pg_temp, p);
642 }
643
644 // decode short map, too.
645 if (v == 5 && p.end())
646 return;
647
648 // extended
649 __u16 ev = 0;
650 if (v >= 5)
651 ::decode(ev, p);
652 ::decode(new_hb_back_up, p);
653 if (v < 5)
654 ::decode(new_pool_names, p);
655 ::decode(new_up_thru, p);
656 ::decode(new_last_clean_interval, p);
657 ::decode(new_lost, p);
658 ::decode(new_blacklist, p);
659 ::decode(old_blacklist, p);
660 if (ev >= 6)
661 ::decode(new_up_cluster, p);
662 if (ev >= 7)
663 ::decode(cluster_snapshot, p);
664 if (ev >= 8)
665 ::decode(new_uuid, p);
666 if (ev >= 9)
667 ::decode(new_xinfo, p);
668 if (ev >= 10)
669 ::decode(new_hb_front_up, p);
670}
671
672void OSDMap::Incremental::decode(bufferlist::iterator& bl)
673{
674 /**
675 * Older encodings of the Incremental had a single struct_v which
676 * covered the whole encoding, and was prior to our modern
677 * stuff which includes a compatv and a size. So if we see
678 * a struct_v < 7, we must rewind to the beginning and use our
679 * classic decoder.
680 */
681 size_t start_offset = bl.get_off();
682 size_t tail_offset = 0;
683 bufferlist crc_front, crc_tail;
684
685 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
686 if (struct_v < 7) {
687 int struct_v_size = sizeof(struct_v);
688 bl.advance(-struct_v_size);
689 decode_classic(bl);
690 encode_features = 0;
691 if (struct_v >= 6)
692 encode_features = CEPH_FEATURE_PGID64;
693 else
694 encode_features = 0;
695 return;
696 }
697 {
31f18b77 698 DECODE_START(5, bl); // client-usable data
7c673cae
FG
699 ::decode(fsid, bl);
700 ::decode(epoch, bl);
701 ::decode(modified, bl);
702 ::decode(new_pool_max, bl);
703 ::decode(new_flags, bl);
704 ::decode(fullmap, bl);
705 ::decode(crush, bl);
706
707 ::decode(new_max_osd, bl);
708 ::decode(new_pools, bl);
709 ::decode(new_pool_names, bl);
710 ::decode(old_pools, bl);
711 ::decode(new_up_client, bl);
31f18b77
FG
712 if (struct_v >= 5) {
713 ::decode(new_state, bl);
714 } else {
715 map<int32_t,uint8_t> ns;
716 ::decode(ns, bl);
717 for (auto q : ns) {
718 new_state[q.first] = q.second;
719 }
720 }
7c673cae
FG
721 ::decode(new_weight, bl);
722 ::decode(new_pg_temp, bl);
723 ::decode(new_primary_temp, bl);
724 if (struct_v >= 2)
725 ::decode(new_primary_affinity, bl);
726 else
727 new_primary_affinity.clear();
728 if (struct_v >= 3) {
729 ::decode(new_erasure_code_profiles, bl);
730 ::decode(old_erasure_code_profiles, bl);
731 } else {
732 new_erasure_code_profiles.clear();
733 old_erasure_code_profiles.clear();
734 }
735 if (struct_v >= 4) {
736 ::decode(new_pg_upmap, bl);
737 ::decode(old_pg_upmap, bl);
738 ::decode(new_pg_upmap_items, bl);
739 ::decode(old_pg_upmap_items, bl);
740 }
741 DECODE_FINISH(bl); // client-usable data
742 }
743
744 {
31f18b77 745 DECODE_START(6, bl); // extended, osd-only data
7c673cae
FG
746 ::decode(new_hb_back_up, bl);
747 ::decode(new_up_thru, bl);
748 ::decode(new_last_clean_interval, bl);
749 ::decode(new_lost, bl);
750 ::decode(new_blacklist, bl);
751 ::decode(old_blacklist, bl);
752 ::decode(new_up_cluster, bl);
753 ::decode(cluster_snapshot, bl);
754 ::decode(new_uuid, bl);
755 ::decode(new_xinfo, bl);
756 ::decode(new_hb_front_up, bl);
757 if (struct_v >= 2)
758 ::decode(encode_features, bl);
759 else
760 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
761 if (struct_v >= 3) {
762 ::decode(new_nearfull_ratio, bl);
763 ::decode(new_full_ratio, bl);
764 } else {
765 new_nearfull_ratio = -1;
766 new_full_ratio = -1;
767 }
768 if (struct_v >= 4) {
769 ::decode(new_backfillfull_ratio, bl);
770 } else {
771 new_backfillfull_ratio = -1;
772 }
31f18b77
FG
773 if (struct_v == 5) {
774 string r;
775 ::decode(r, bl);
776 if (r.length()) {
777 new_require_min_compat_client = ceph_release_from_name(r.c_str());
778 }
779 }
780 if (struct_v >= 6) {
7c673cae 781 ::decode(new_require_min_compat_client, bl);
31f18b77
FG
782 ::decode(new_require_osd_release, bl);
783 } else {
784 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
785 // only for compat with post-kraken pre-luminous test clusters
786 new_require_osd_release = CEPH_RELEASE_LUMINOUS;
787 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
788 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
789 new_require_osd_release = CEPH_RELEASE_KRAKEN;
790 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
791 new_require_osd_release = CEPH_RELEASE_JEWEL;
792 } else {
793 new_require_osd_release = -1;
794 }
795 }
7c673cae
FG
796 DECODE_FINISH(bl); // osd-only data
797 }
798
799 if (struct_v >= 8) {
800 have_crc = true;
801 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
802 ::decode(inc_crc, bl);
803 tail_offset = bl.get_off();
804 ::decode(full_crc, bl);
805 } else {
806 have_crc = false;
807 full_crc = 0;
808 inc_crc = 0;
809 }
810
811 DECODE_FINISH(bl); // wrapper
812
813 if (have_crc) {
814 // verify crc
815 uint32_t actual = crc_front.crc32c(-1);
816 if (tail_offset < bl.get_off()) {
817 bufferlist tail;
818 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
819 actual = tail.crc32c(actual);
820 }
821 if (inc_crc != actual) {
822 ostringstream ss;
823 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
824 string s = ss.str();
825 throw buffer::malformed_input(s.c_str());
826 }
827 }
828}
829
830void OSDMap::Incremental::dump(Formatter *f) const
831{
832 f->dump_int("epoch", epoch);
833 f->dump_stream("fsid") << fsid;
834 f->dump_stream("modified") << modified;
835 f->dump_int("new_pool_max", new_pool_max);
836 f->dump_int("new_flags", new_flags);
837 f->dump_float("new_full_ratio", new_full_ratio);
838 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
839 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
31f18b77
FG
840 f->dump_int("new_require_min_compat_client", new_require_min_compat_client);
841 f->dump_int("new_require_osd_release", new_require_osd_release);
7c673cae
FG
842
843 if (fullmap.length()) {
844 f->open_object_section("full_map");
845 OSDMap full;
846 bufferlist fbl = fullmap; // kludge around constness.
847 auto p = fbl.begin();
848 full.decode(p);
849 full.dump(f);
850 f->close_section();
851 }
852 if (crush.length()) {
853 f->open_object_section("crush");
854 CrushWrapper c;
855 bufferlist tbl = crush; // kludge around constness.
856 auto p = tbl.begin();
857 c.decode(p);
858 c.dump(f);
859 f->close_section();
860 }
861
862 f->dump_int("new_max_osd", new_max_osd);
863
864 f->open_array_section("new_pools");
865
866 for (const auto &new_pool : new_pools) {
867 f->open_object_section("pool");
868 f->dump_int("pool", new_pool.first);
869 new_pool.second.dump(f);
870 f->close_section();
871 }
872 f->close_section();
873 f->open_array_section("new_pool_names");
874
875 for (const auto &new_pool_name : new_pool_names) {
876 f->open_object_section("pool_name");
877 f->dump_int("pool", new_pool_name.first);
878 f->dump_string("name", new_pool_name.second);
879 f->close_section();
880 }
881 f->close_section();
882 f->open_array_section("old_pools");
883
884 for (const auto &old_pool : old_pools)
885 f->dump_int("pool", old_pool);
886 f->close_section();
887
888 f->open_array_section("new_up_osds");
889
890 for (const auto &upclient : new_up_client) {
891 f->open_object_section("osd");
892 f->dump_int("osd", upclient.first);
893 f->dump_stream("public_addr") << upclient.second;
894 f->dump_stream("cluster_addr") << new_up_cluster.find(upclient.first)->second;
895 f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(upclient.first)->second;
896 map<int32_t, entity_addr_t>::const_iterator q;
897 if ((q = new_hb_front_up.find(upclient.first)) != new_hb_front_up.end())
898 f->dump_stream("heartbeat_front_addr") << q->second;
899 f->close_section();
900 }
901 f->close_section();
902
903 f->open_array_section("new_weight");
904
905 for (const auto &weight : new_weight) {
906 f->open_object_section("osd");
907 f->dump_int("osd", weight.first);
908 f->dump_int("weight", weight.second);
909 f->close_section();
910 }
911 f->close_section();
912
913 f->open_array_section("osd_state_xor");
914 for (const auto &ns : new_state) {
915 f->open_object_section("osd");
916 f->dump_int("osd", ns.first);
917 set<string> st;
918 calc_state_set(new_state.find(ns.first)->second, st);
919 f->open_array_section("state_xor");
920 for (auto &state : st)
921 f->dump_string("state", state);
922 f->close_section();
c07f9fc5 923 f->close_section();
7c673cae
FG
924 }
925 f->close_section();
926
927 f->open_array_section("new_pg_temp");
928
929 for (const auto &pg_temp : new_pg_temp) {
930 f->open_object_section("pg");
931 f->dump_stream("pgid") << pg_temp.first;
932 f->open_array_section("osds");
933
934 for (const auto &osd : pg_temp.second)
935 f->dump_int("osd", osd);
936 f->close_section();
937 f->close_section();
938 }
939 f->close_section();
940
941 f->open_array_section("primary_temp");
942
943 for (const auto &primary_temp : new_primary_temp) {
944 f->dump_stream("pgid") << primary_temp.first;
945 f->dump_int("osd", primary_temp.second);
946 }
947 f->close_section(); // primary_temp
948
949 f->open_array_section("new_pg_upmap");
950 for (auto& i : new_pg_upmap) {
951 f->open_object_section("mapping");
952 f->dump_stream("pgid") << i.first;
953 f->open_array_section("osds");
954 for (auto osd : i.second) {
955 f->dump_int("osd", osd);
956 }
957 f->close_section();
958 f->close_section();
959 }
960 f->close_section();
961 f->open_array_section("old_pg_upmap");
962 for (auto& i : old_pg_upmap) {
963 f->dump_stream("pgid") << i;
964 }
965 f->close_section();
966
967 f->open_array_section("new_pg_upmap_items");
968 for (auto& i : new_pg_upmap_items) {
969 f->open_object_section("mapping");
970 f->dump_stream("pgid") << i.first;
971 f->open_array_section("mappings");
972 for (auto& p : i.second) {
973 f->open_object_section("mapping");
974 f->dump_int("from", p.first);
975 f->dump_int("to", p.second);
976 f->close_section();
977 }
978 f->close_section();
979 f->close_section();
980 }
981 f->close_section();
982 f->open_array_section("old_pg_upmap_items");
983 for (auto& i : old_pg_upmap_items) {
984 f->dump_stream("pgid") << i;
985 }
986 f->close_section();
987
988 f->open_array_section("new_up_thru");
989
990 for (const auto &up_thru : new_up_thru) {
991 f->open_object_section("osd");
992 f->dump_int("osd", up_thru.first);
993 f->dump_int("up_thru", up_thru.second);
994 f->close_section();
995 }
996 f->close_section();
997
998 f->open_array_section("new_lost");
999
1000 for (const auto &lost : new_lost) {
1001 f->open_object_section("osd");
1002 f->dump_int("osd", lost.first);
1003 f->dump_int("epoch_lost", lost.second);
1004 f->close_section();
1005 }
1006 f->close_section();
1007
1008 f->open_array_section("new_last_clean_interval");
1009
1010 for (const auto &last_clean_interval : new_last_clean_interval) {
1011 f->open_object_section("osd");
1012 f->dump_int("osd", last_clean_interval.first);
1013 f->dump_int("first", last_clean_interval.second.first);
1014 f->dump_int("last", last_clean_interval.second.second);
1015 f->close_section();
1016 }
1017 f->close_section();
1018
1019 f->open_array_section("new_blacklist");
1020 for (const auto &blist : new_blacklist) {
1021 stringstream ss;
1022 ss << blist.first;
1023 f->dump_stream(ss.str().c_str()) << blist.second;
1024 }
1025 f->close_section();
1026 f->open_array_section("old_blacklist");
1027 for (const auto &blist : old_blacklist)
1028 f->dump_stream("addr") << blist;
1029 f->close_section();
1030
1031 f->open_array_section("new_xinfo");
1032 for (const auto &xinfo : new_xinfo) {
1033 f->open_object_section("xinfo");
1034 f->dump_int("osd", xinfo.first);
1035 xinfo.second.dump(f);
1036 f->close_section();
1037 }
1038 f->close_section();
1039
1040 if (cluster_snapshot.size())
1041 f->dump_string("cluster_snapshot", cluster_snapshot);
1042
1043 f->open_array_section("new_uuid");
1044 for (const auto &uuid : new_uuid) {
1045 f->open_object_section("osd");
1046 f->dump_int("osd", uuid.first);
1047 f->dump_stream("uuid") << uuid.second;
1048 f->close_section();
1049 }
1050 f->close_section();
1051
1052 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1053 f->open_array_section("old_erasure_code_profiles");
1054 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1055 f->dump_string("old", erasure_code_profile.c_str());
1056 }
1057 f->close_section();
1058}
1059
1060void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1061{
1062 o.push_back(new Incremental);
1063}
1064
1065// ----------------------------------
1066// OSDMap
1067
1068void OSDMap::set_epoch(epoch_t e)
1069{
1070 epoch = e;
1071 for (auto &pool : pools)
1072 pool.second.last_change = e;
1073}
1074
1075bool OSDMap::is_blacklisted(const entity_addr_t& a) const
1076{
1077 if (blacklist.empty())
1078 return false;
1079
1080 // this specific instance?
1081 if (blacklist.count(a))
1082 return true;
1083
1084 // is entire ip blacklisted?
1085 if (a.is_ip()) {
1086 entity_addr_t b = a;
1087 b.set_port(0);
1088 b.set_nonce(0);
1089 if (blacklist.count(b)) {
1090 return true;
1091 }
1092 }
1093
1094 return false;
1095}
1096
1097void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1098{
1099 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1100}
1101
31f18b77
FG
1102void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1103{
1104 for (const auto &i : blacklist) {
1105 bl->insert(i.first);
1106 }
1107}
1108
7c673cae
FG
1109void OSDMap::set_max_osd(int m)
1110{
1111 int o = max_osd;
1112 max_osd = m;
1113 osd_state.resize(m);
1114 osd_weight.resize(m);
1115 for (; o<max_osd; o++) {
1116 osd_state[o] = 0;
1117 osd_weight[o] = CEPH_OSD_OUT;
1118 }
1119 osd_info.resize(m);
1120 osd_xinfo.resize(m);
1121 osd_addrs->client_addr.resize(m);
1122 osd_addrs->cluster_addr.resize(m);
1123 osd_addrs->hb_back_addr.resize(m);
1124 osd_addrs->hb_front_addr.resize(m);
1125 osd_uuid->resize(m);
1126 if (osd_primary_affinity)
1127 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1128
1129 calc_num_osds();
1130}
1131
1132int OSDMap::calc_num_osds()
1133{
1134 num_osd = 0;
1135 num_up_osd = 0;
1136 num_in_osd = 0;
1137 for (int i=0; i<max_osd; i++) {
1138 if (osd_state[i] & CEPH_OSD_EXISTS) {
1139 ++num_osd;
1140 if (osd_state[i] & CEPH_OSD_UP) {
1141 ++num_up_osd;
1142 }
1143 if (get_weight(i) != CEPH_OSD_OUT) {
1144 ++num_in_osd;
1145 }
1146 }
1147 }
1148 return num_osd;
1149}
1150
3efd9988
FG
1151void OSDMap::get_full_pools(CephContext *cct,
1152 set<int64_t> *full,
1153 set<int64_t> *backfillfull,
1154 set<int64_t> *nearfull) const
7c673cae 1155{
3efd9988
FG
1156 assert(full);
1157 assert(backfillfull);
1158 assert(nearfull);
1159 full->clear();
1160 backfillfull->clear();
1161 nearfull->clear();
1162
1163 vector<int> full_osds;
1164 vector<int> backfillfull_osds;
1165 vector<int> nearfull_osds;
7c673cae
FG
1166 for (int i = 0; i < max_osd; ++i) {
1167 if (exists(i) && is_up(i) && is_in(i)) {
1168 if (osd_state[i] & CEPH_OSD_FULL)
3efd9988 1169 full_osds.push_back(i);
7c673cae 1170 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
3efd9988 1171 backfillfull_osds.push_back(i);
7c673cae 1172 else if (osd_state[i] & CEPH_OSD_NEARFULL)
3efd9988 1173 nearfull_osds.push_back(i);
7c673cae
FG
1174 }
1175 }
3efd9988
FG
1176
1177 for (auto i: full_osds) {
1178 get_pool_ids_by_osd(cct, i, full);
1179 }
1180 for (auto i: backfillfull_osds) {
1181 get_pool_ids_by_osd(cct, i, backfillfull);
1182 }
1183 for (auto i: nearfull_osds) {
1184 get_pool_ids_by_osd(cct, i, nearfull);
1185 }
7c673cae
FG
1186}
1187
31f18b77
FG
1188static bool get_osd_utilization(
1189 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1190 int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail)
1191{
1192 auto p = osd_stat.find(id);
1193 if (p == osd_stat.end())
1194 return false;
1195 *kb = p->second.kb;
1196 *kb_used = p->second.kb_used;
1197 *kb_avail = p->second.kb_avail;
1198 return *kb > 0;
7c673cae
FG
1199}
1200
31f18b77
FG
1201void OSDMap::get_full_osd_util(
1202 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1203 map<int, float> *full, map<int, float> *backfill, map<int, float> *nearfull) const
7c673cae
FG
1204{
1205 full->clear();
1206 backfill->clear();
1207 nearfull->clear();
1208 for (int i = 0; i < max_osd; ++i) {
1209 if (exists(i) && is_up(i) && is_in(i)) {
1210 int64_t kb, kb_used, kb_avail;
1211 if (osd_state[i] & CEPH_OSD_FULL) {
1212 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1213 full->emplace(i, (float)kb_used / (float)kb);
1214 } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) {
1215 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1216 backfill->emplace(i, (float)kb_used / (float)kb);
1217 } else if (osd_state[i] & CEPH_OSD_NEARFULL) {
1218 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1219 nearfull->emplace(i, (float)kb_used / (float)kb);
1220 }
1221 }
1222 }
1223}
1224
31f18b77
FG
1225void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1226 set<int> *nearfull) const
1227{
1228 full->clear();
1229 backfill->clear();
1230 nearfull->clear();
1231 for (int i = 0; i < max_osd; ++i) {
1232 if (exists(i) && is_up(i) && is_in(i)) {
1233 if (osd_state[i] & CEPH_OSD_FULL)
1234 full->emplace(i);
1235 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1236 backfill->emplace(i);
1237 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1238 nearfull->emplace(i);
1239 }
1240 }
1241}
1242
7c673cae
FG
1243void OSDMap::get_all_osds(set<int32_t>& ls) const
1244{
1245 for (int i=0; i<max_osd; i++)
1246 if (exists(i))
1247 ls.insert(i);
1248}
1249
1250void OSDMap::get_up_osds(set<int32_t>& ls) const
1251{
1252 for (int i = 0; i < max_osd; i++) {
1253 if (is_up(i))
1254 ls.insert(i);
1255 }
1256}
1257
31f18b77
FG
1258void OSDMap::get_out_osds(set<int32_t>& ls) const
1259{
1260 for (int i = 0; i < max_osd; i++) {
1261 if (is_out(i))
1262 ls.insert(i);
1263 }
1264}
1265
7c673cae
FG
1266void OSDMap::calc_state_set(int state, set<string>& st)
1267{
1268 unsigned t = state;
1269 for (unsigned s = 1; t; s <<= 1) {
1270 if (t & s) {
1271 t &= ~s;
1272 st.insert(ceph_osd_state_name(s));
1273 }
1274 }
1275}
1276
1277void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1278{
1279 float max = 0;
1280 for (const auto &weight : weights) {
1281 if (weight.second > max)
1282 max = weight.second;
1283 }
1284
1285 for (const auto &weight : weights) {
1286 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1287 }
1288}
1289
1290int OSDMap::identify_osd(const entity_addr_t& addr) const
1291{
1292 for (int i=0; i<max_osd; i++)
1293 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr))
1294 return i;
1295 return -1;
1296}
1297
1298int OSDMap::identify_osd(const uuid_d& u) const
1299{
1300 for (int i=0; i<max_osd; i++)
1301 if (exists(i) && get_uuid(i) == u)
1302 return i;
1303 return -1;
1304}
1305
1306int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1307{
1308 for (int i=0; i<max_osd; i++)
1309 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr ||
1310 get_hb_back_addr(i) == addr || get_hb_front_addr(i) == addr))
1311 return i;
1312 return -1;
1313}
1314
1315int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1316{
1317 for (int i=0; i<max_osd; i++)
1318 if (exists(i) && (get_addr(i).is_same_host(ip) || get_cluster_addr(i).is_same_host(ip)))
1319 return i;
1320 return -1;
1321}
1322
1323
// Compute the feature bits implied by the current contents of this map
// (crush map, upmaps, pools, erasure code profiles, primary affinity and
// required osd release) for a peer of the given entity type.
//
// @param entity_type  CEPH_ENTITY_TYPE_* of the peer.  The erasure-code pool
//                     bit is left out for clients; the erasure-code plugin
//                     bits and the release bits are only considered for osds.
// @param pmask        if non-null, receives the accumulated mask ("things we
//                     could have") alongside the returned value.
// @return the feature bits that are actually in effect.
1324uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1325{
1326 uint64_t features = 0; // things we actually have
1327 uint64_t mask = 0; // things we could have
1328
// crush map features
1329 if (crush->has_nondefault_tunables())
1330 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1331 if (crush->has_nondefault_tunables2())
1332 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1333 if (crush->has_nondefault_tunables3())
1334 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1335 if (crush->has_v4_buckets())
1336 features |= CEPH_FEATURE_CRUSH_V4;
1337 if (crush->has_nondefault_tunables5())
1338 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
c07f9fc5
FG
1339 if (crush->has_incompat_choose_args()) {
1340 features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
1341 }
7c673cae
FG
1342 mask |= CEPH_FEATURES_CRUSH;
1343
// any pg_upmap / pg_upmap_items entry requires the upmap feature
1344 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1345 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1346 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1347
// per-pool features
1348 for (auto &pool: pools) {
1349 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1350 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1351 }
1352 if (pool.second.is_erasure() &&
1353 entity_type != CEPH_ENTITY_TYPE_CLIENT) { // not for clients
1354 features |= CEPH_FEATURE_OSD_ERASURE_CODES;
1355 }
1356 if (!pool.second.tiers.empty() ||
1357 pool.second.is_tier()) {
1358 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1359 }
// features required by the crush rule this pool uses, if it resolves
31f18b77 1360 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
7c673cae
FG
1361 pool.second.get_type(),
1362 pool.second.get_size());
1363 if (ruleid >= 0) {
1364 if (crush->is_v2_rule(ruleid))
1365 features |= CEPH_FEATURE_CRUSH_V2;
1366 if (crush->is_v3_rule(ruleid))
1367 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1368 if (crush->is_v5_rule(ruleid))
1369 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1370 }
1371 }
// erasure code plugin features are derived from the "plugin" key of each
// profile, and only for osds
1372 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1373 for (auto &erasure_code_profile : erasure_code_profiles) {
1374 auto& profile = erasure_code_profile.second;
1375 const auto& plugin = profile.find("plugin");
1376 if (plugin != profile.end()) {
1377 if (plugin->second == "isa" || plugin->second == "lrc")
1378 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
1379 if (plugin->second == "shec")
1380 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
1381 }
1382 }
1383 }
1384 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1385 if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
1386 mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
1387
// a single non-default primary affinity value is enough to need the feature
1388 if (osd_primary_affinity) {
1389 for (int i = 0; i < max_osd; ++i) {
1390 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1391 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1392 break;
1393 }
1394 }
1395 }
1396 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1397
// release features, driven by require_osd_release, apply to osds only
1398 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1399 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
31f18b77 1400 if (require_osd_release >= CEPH_RELEASE_JEWEL) {
7c673cae
FG
1401 features |= jewel_features;
1402 }
1403 mask |= jewel_features;
1404
1405 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1406 | CEPH_FEATURE_MSG_ADDR2;
31f18b77 1407 if (require_osd_release >= CEPH_RELEASE_KRAKEN) {
7c673cae
FG
1408 features |= kraken_features;
1409 }
1410 mask |= kraken_features;
1411 }
1412
1413 if (pmask)
1414 *pmask = mask;
1415 return features;
1416}
1417
31f18b77 1418uint8_t OSDMap::get_min_compat_client() const
7c673cae
FG
1419{
1420 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1421
1422 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
31f18b77
FG
1423 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1424 return CEPH_RELEASE_LUMINOUS; // v12.2.0
7c673cae
FG
1425 }
1426 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
31f18b77 1427 return CEPH_RELEASE_JEWEL; // v10.2.0
7c673cae
FG
1428 }
1429 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
31f18b77 1430 return CEPH_RELEASE_HAMMER; // v0.94.0
7c673cae
FG
1431 }
1432 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1433 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1434 HAVE_FEATURE(f, OSD_ERASURE_CODES) || // v0.73-498-gbfc86a8
1435 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
31f18b77 1436 return CEPH_RELEASE_FIREFLY; // v0.80.0
7c673cae
FG
1437 }
1438 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1439 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
31f18b77 1440 return CEPH_RELEASE_DUMPLING; // v0.67.0
7c673cae
FG
1441 }
1442 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
31f18b77 1443 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
7c673cae 1444 }
31f18b77 1445 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
7c673cae
FG
1446}
1447
1448void OSDMap::_calc_up_osd_features()
1449{
1450 bool first = true;
1451 cached_up_osd_features = 0;
1452 for (int osd = 0; osd < max_osd; ++osd) {
1453 if (!is_up(osd))
1454 continue;
1455 const osd_xinfo_t &xi = get_xinfo(osd);
3efd9988
FG
1456 if (xi.features == 0)
1457 continue; // bogus xinfo, maybe #20751 or similar, skipping
7c673cae
FG
1458 if (first) {
1459 cached_up_osd_features = xi.features;
1460 first = false;
1461 } else {
1462 cached_up_osd_features &= xi.features;
1463 }
1464 }
1465}
1466
1467uint64_t OSDMap::get_up_osd_features() const
1468{
1469 return cached_up_osd_features;
1470}
1471
// Let the newer map `n` share sub-structures with the older map `o` where
// their contents compare equal: individual osd addresses (or the whole
// address table), the crush map, pg_temp, primary_temp and the uuid table.
// Sharing is done by assigning o's pointer to n's.  Nothing is done when
// both maps have the same epoch.
//
// @param o  older map; only read
// @param n  newer map; its pointers are redirected to o's objects
1472void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1473{
1474 if (o->epoch == n->epoch)
1475 return;
1476
// counts every address slot that could not be shared, plus a size mismatch
1477 int diff = 0;
1478
1479 // do addrs match?
1480 if (o->max_osd != n->max_osd)
1481 diff++;
// only the slots both maps have are compared
1482 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1483 if ( n->osd_addrs->client_addr[i] && o->osd_addrs->client_addr[i] &&
1484 *n->osd_addrs->client_addr[i] == *o->osd_addrs->client_addr[i])
1485 n->osd_addrs->client_addr[i] = o->osd_addrs->client_addr[i];
1486 else
1487 diff++;
1488 if ( n->osd_addrs->cluster_addr[i] && o->osd_addrs->cluster_addr[i] &&
1489 *n->osd_addrs->cluster_addr[i] == *o->osd_addrs->cluster_addr[i])
1490 n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
1491 else
1492 diff++;
1493 if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
1494 *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
1495 n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
1496 else
1497 diff++;
1498 if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
1499 *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
1500 n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
1501 else
1502 diff++;
1503 }
1504 if (diff == 0) {
1505 // zoinks, no differences at all!
1506 n->osd_addrs = o->osd_addrs;
1507 }
1508
1509 // does crush match?
// both crush maps are encoded with the same feature set and compared as bytes
1510 bufferlist oc, nc;
1511 ::encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1512 ::encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1513 if (oc.contents_equal(nc)) {
1514 n->crush = o->crush;
1515 }
1516
1517 // does pg_temp match?
31f18b77
FG
1518 if (*o->pg_temp == *n->pg_temp)
1519 n->pg_temp = o->pg_temp;
7c673cae
FG
1520
1521 // does primary_temp match?
1522 if (o->primary_temp->size() == n->primary_temp->size()) {
1523 if (*o->primary_temp == *n->primary_temp)
1524 n->primary_temp = o->primary_temp;
1525 }
1526
1527 // do uuids match?
1528 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1529 *o->osd_uuid == *n->osd_uuid)
1530 n->osd_uuid = o->osd_uuid;
1531}
1532
// Queue the removal of pg_temp and primary_temp entries that are no longer
// useful.  The decision is made on a scratch copy of `osdmap` with
// `pending_inc` applied; the removals are recorded in `pending_inc`.
//
// A pg_temp entry is dropped when its pool is not in `osdmap`, when all of
// its osds are down, when it equals the raw up mapping, or when it has more
// osds than the pool size.  A primary_temp entry is dropped when its osd is
// down or when it does not change the primary.
//
// @param cct          context used for logging
// @param osdmap       the current map
// @param pending_inc  pending incremental; updated in place
1533void OSDMap::clean_temps(CephContext *cct,
1534 const OSDMap& osdmap, Incremental *pending_inc)
1535{
1536 ldout(cct, 10) << __func__ << dendl;
// tmpmap = osdmap as it will look once pending_inc is applied
1537 OSDMap tmpmap;
1538 tmpmap.deepish_copy_from(osdmap);
1539 tmpmap.apply_incremental(*pending_inc);
1540
1541 for (auto pg : *tmpmap.pg_temp) {
1542 // if pool does not exist, remove any existing pg_temps associated with
1543 // it. we don't care about pg_temps on the pending_inc either; if there
1544 // are new_pg_temp entries on the pending, clear them out just as well.
1545 if (!osdmap.have_pg_pool(pg.first.pool())) {
1546 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1547 << " for nonexistent pool " << pg.first.pool() << dendl;
// an empty new_pg_temp vector is what apply_incremental treats as "erase"
1548 pending_inc->new_pg_temp[pg.first].clear();
1549 continue;
1550 }
1551 // all osds down?
1552 unsigned num_up = 0;
1553 for (auto o : pg.second) {
1554 if (!tmpmap.is_down(o)) {
1555 ++num_up;
1556 break;
1557 }
1558 }
1559 if (num_up == 0) {
1560 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1561 << " with all down osds" << pg.second << dendl;
1562 pending_inc->new_pg_temp[pg.first].clear();
1563 continue;
1564 }
1565 // redundant pg_temp?
1566 vector<int> raw_up;
1567 int primary;
1568 tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
91327a77 1569 bool remove = false;
7c673cae
FG
1570 if (vectors_equal(raw_up, pg.second)) {
1571 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1572 << pg.second << " that matches raw_up mapping" << dendl;
91327a77
AA
1573 remove = true;
1574 }
1575 // oversized pg_temp?
// NOTE(review): the pool was only checked against osdmap above, while this
// dereferences the lookup in tmpmap - presumably a pool deleted by
// pending_inc never still has pg_temp entries here; confirm.
1576 if (pg.second.size() > tmpmap.get_pg_pool(pg.first.pool())->get_size()) {
1577 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1578 << pg.second << " exceeds pool size" << dendl;
1579 remove = true;
1580 }
1581 if (remove) {
7c673cae
FG
// entries present in osdmap need an explicit (empty) removal record;
// entries that only exist in pending_inc are simply dropped from it
1582 if (osdmap.pg_temp->count(pg.first))
1583 pending_inc->new_pg_temp[pg.first].clear();
1584 else
1585 pending_inc->new_pg_temp.erase(pg.first);
1586 }
1587 }
1588
1589 for (auto &pg : *tmpmap.primary_temp) {
1590 // primary down?
1591 if (tmpmap.is_down(pg.second)) {
1592 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1593 << " to down " << pg.second << dendl;
// -1 is what apply_incremental treats as "erase" for primary_temp
1594 pending_inc->new_primary_temp[pg.first] = -1;
1595 continue;
1596 }
1597 // redundant primary_temp?
1598 vector<int> real_up, templess_up;
1599 int real_primary, templess_primary;
1600 pg_t pgid = pg.first;
1601 tmpmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1602 tmpmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
1603 if (real_primary == templess_primary){
1604 ldout(cct, 10) << __func__ << " removing primary_temp "
1605 << pgid << " -> " << real_primary
1606 << " (unnecessary/redundant)" << dendl;
1607 if (osdmap.primary_temp->count(pgid))
1608 pending_inc->new_primary_temp[pgid] = -1;
1609 else
1610 pending_inc->new_primary_temp.erase(pgid);
1611 }
1612 }
1613}
1614
94b18763
FG
// Cancel pg_upmap / pg_upmap_items entries that are no longer valid.  The
// check runs on a scratch copy of `osdmap` with `pending_inc` applied and
// covers the entries of that copy plus the new entries in `pending_inc`.
//
// An entry is cancelled when its pg no longer exists, when verify_upmap()
// rejects the resulting up set, or when one of the up osds is missing from
// the crush rule's weight map or has an adjusted weight of 0.  Cancelling
// drops the entry from pending_inc's new_* maps and, if it exists in
// `osdmap`, adds it to the matching old_* set.  clean_pg_upmaps() is run on
// the scratch map at the end.
//
// @param cct          context used for logging
// @param osdmap       the current map
// @param pending_inc  pending incremental; updated in place
1615void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
1616 const OSDMap& osdmap,
1617 Incremental *pending_inc)
1618{
1619 ldout(cct, 10) << __func__ << dendl;
1620 OSDMap tmpmap;
1621 tmpmap.deepish_copy_from(osdmap);
1622 tmpmap.apply_incremental(*pending_inc);
28e407b8
AA
1623 set<pg_t> to_check;
1624 set<pg_t> to_cancel;
// cache of crush rule -> (osd -> weight) so each rule is resolved once
1625 map<int, map<int, float>> rule_weight_map;
94b18763
FG
1626
1627 for (auto& p : tmpmap.pg_upmap) {
28e407b8
AA
1628 to_check.insert(p.first);
1629 }
1630 for (auto& p : tmpmap.pg_upmap_items) {
1631 to_check.insert(p.first);
1632 }
1633 for (auto& p : pending_inc->new_pg_upmap) {
1634 to_check.insert(p.first);
1635 }
1636 for (auto& p : pending_inc->new_pg_upmap_items) {
1637 to_check.insert(p.first);
1638 }
1639 for (auto& pg : to_check) {
f64942e4
AA
1640 if (!tmpmap.pg_exists(pg)) {
1641 ldout(cct, 0) << __func__ << " pg " << pg << " is gone" << dendl;
1642 to_cancel.insert(pg);
94b18763
FG
1643 continue;
1644 }
a8e16298
TL
1645 vector<int> raw_up;
1646 int primary;
1647 tmpmap.pg_to_raw_up(pg, &raw_up, &primary);
// `up` is raw_up with the CRUSH_ITEM_NONE placeholders removed
1648 vector<int> up;
1649 up.reserve(raw_up.size());
1650 for (auto osd : raw_up) {
1651 // skip non-existent/down osd for erasure-coded PGs
1652 if (osd == CRUSH_ITEM_NONE)
1653 continue;
1654 up.push_back(osd);
1655 }
f64942e4 1656 auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
a8e16298
TL
1657 auto r = tmpmap.crush->verify_upmap(cct,
1658 crush_rule,
1659 tmpmap.get_pg_pool_size(pg),
1660 up);
1661 if (r < 0) {
1662 ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
1663 << " returning " << r
1664 << dendl;
1665 to_cancel.insert(pg);
1666 continue;
1667 }
1668 // below we check against crush-topology changing..
28e407b8
AA
1669 map<int, float> weight_map;
1670 auto it = rule_weight_map.find(crush_rule);
1671 if (it == rule_weight_map.end()) {
1672 auto r = tmpmap.crush->get_rule_weight_osd_map(crush_rule, &weight_map);
1673 if (r < 0) {
1674 lderr(cct) << __func__ << " unable to get crush weight_map for "
1675 << "crush_rule " << crush_rule << dendl;
// the pg is neither cancelled nor checked further in this case
1676 continue;
1677 }
1678 rule_weight_map[crush_rule] = weight_map;
1679 } else {
1680 weight_map = it->second;
1681 }
28e407b8 1682 ldout(cct, 10) << __func__ << " pg " << pg
28e407b8 1683 << " weight_map " << weight_map
94b18763 1684 << dendl;
a8e16298 1685 for (auto osd : up) {
28e407b8
AA
1686 auto it = weight_map.find(osd);
1687 if (it == weight_map.end()) {
1688 // osd is gone or has been moved out of the specific crush-tree
1689 to_cancel.insert(pg);
94b18763
FG
1690 break;
1691 }
28e407b8
AA
// osd weight from the map multiplied by its weight under the crush rule
1692 auto adjusted_weight = tmpmap.get_weightf(it->first) * it->second;
1693 if (adjusted_weight == 0) {
1694 // osd is out/crush-out
1695 to_cancel.insert(pg);
94b18763
FG
1696 break;
1697 }
1698 }
28e407b8
AA
1699 }
// apply the cancellations to pending_inc
1700 for (auto &pg: to_cancel) {
1701 { // pg_upmap
1702 auto it = pending_inc->new_pg_upmap.find(pg);
94b18763 1703 if (it != pending_inc->new_pg_upmap.end()) {
28e407b8
AA
1704 ldout(cct, 10) << __func__ << " cancel invalid pending "
1705 << "pg_upmap entry "
1706 << it->first << "->" << it->second
1707 << dendl;
94b18763
FG
1708 pending_inc->new_pg_upmap.erase(it);
1709 }
28e407b8
AA
1710 if (osdmap.pg_upmap.count(pg)) {
1711 ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
1712 << osdmap.pg_upmap.find(pg)->first << "->"
1713 << osdmap.pg_upmap.find(pg)->second
1714 << dendl;
1715 pending_inc->old_pg_upmap.insert(pg);
94b18763
FG
1716 }
1717 }
28e407b8
AA
1718 { // pg_upmap_items
1719 auto it = pending_inc->new_pg_upmap_items.find(pg);
94b18763 1720 if (it != pending_inc->new_pg_upmap_items.end()) {
28e407b8
AA
1721 ldout(cct, 10) << __func__ << " cancel invalid pending "
1722 << "pg_upmap_items entry "
1723 << it->first << "->" << it->second
1724 << dendl;
94b18763
FG
1725 pending_inc->new_pg_upmap_items.erase(it);
1726 }
28e407b8
AA
1727 if (osdmap.pg_upmap_items.count(pg)) {
1728 ldout(cct, 10) << __func__ << " cancel invalid "
1729 << "pg_upmap_items entry "
1730 << osdmap.pg_upmap_items.find(pg)->first << "->"
1731 << osdmap.pg_upmap_items.find(pg)->second
1732 << dendl;
1733 pending_inc->old_pg_upmap_items.insert(pg);
94b18763
FG
1734 }
1735 }
1736 }
f64942e4 1737 tmpmap.clean_pg_upmaps(cct, pending_inc);
94b18763
FG
1738}
1739
7c673cae
FG
// Apply one incremental update to this map, advancing it by exactly one
// epoch (asserted: inc.epoch == epoch + 1).
//
// For epoch 1 the fsid is taken from `inc`; for any other epoch a differing
// fsid is rejected.  If `inc` carries a full map, this map is decoded from
// it and the remaining fields of `inc` are not applied.
//
// @param inc  the incremental to apply
// @return 0 on success, -EINVAL on fsid mismatch
1740int OSDMap::apply_incremental(const Incremental &inc)
1741{
1742 new_blacklist_entries = false;
1743 if (inc.epoch == 1)
1744 fsid = inc.fsid;
1745 else if (inc.fsid != fsid)
1746 return -EINVAL;
1747
1748 assert(inc.epoch == epoch+1);
1749
1750 epoch++;
1751 modified = inc.modified;
1752
1753 // full map?
1754 if (inc.fullmap.length()) {
1755 bufferlist bl(inc.fullmap);
1756 decode(bl);
1757 return 0;
1758 }
1759
1760 // nope, incremental.
31f18b77 1761 if (inc.new_flags >= 0) {
7c673cae 1762 flags = inc.new_flags;
31f18b77
FG
1763 // the below is just to cover a newly-upgraded luminous mon
1764 // cluster that has to set require_jewel_osds or
1765 // require_kraken_osds before the osds can be upgraded to
1766 // luminous.
1767 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
1768 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
1769 require_osd_release = CEPH_RELEASE_KRAKEN;
1770 }
1771 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
1772 if (require_osd_release < CEPH_RELEASE_JEWEL) {
1773 require_osd_release = CEPH_RELEASE_JEWEL;
1774 }
1775 }
1776 }
7c673cae
FG
1777
1778 if (inc.new_max_osd >= 0)
1779 set_max_osd(inc.new_max_osd);
1780
1781 if (inc.new_pool_max != -1)
1782 pool_max = inc.new_pool_max;
1783
// new or updated pools are stamped with the new epoch
1784 for (const auto &pool : inc.new_pools) {
1785 pools[pool.first] = pool.second;
1786 pools[pool.first].last_change = epoch;
1787 }
1788
// keep pool_name (id -> name) and name_pool (name -> id) in step; on a
// rename the old name is dropped from name_pool first
1789 for (const auto &pname : inc.new_pool_names) {
1790 auto pool_name_entry = pool_name.find(pname.first);
1791 if (pool_name_entry != pool_name.end()) {
1792 name_pool.erase(pool_name_entry->second);
1793 pool_name_entry->second = pname.second;
1794 } else {
1795 pool_name[pname.first] = pname.second;
1796 }
1797 name_pool[pname.second] = pname.first;
1798 }
1799
1800 for (const auto &pool : inc.old_pools) {
1801 pools.erase(pool);
1802 name_pool.erase(pool_name[pool]);
1803 pool_name.erase(pool);
1804 }
1805
1806 for (const auto &weight : inc.new_weight) {
1807 set_weight(weight.first, weight.second);
1808
1809 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1810 // xinfo old_weight.
1811 if (weight.second) {
1812 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
1813 osd_xinfo[weight.first].old_weight = 0;
1814 }
1815 }
1816
1817 for (const auto &primary_affinity : inc.new_primary_affinity) {
1818 set_primary_affinity(primary_affinity.first, primary_affinity.second);
1819 }
1820
1821 // erasure_code_profiles
1822 for (const auto &profile : inc.old_erasure_code_profiles)
1823 erasure_code_profiles.erase(profile);
1824
1825 for (const auto &profile : inc.new_erasure_code_profiles) {
1826 set_erasure_code_profile(profile.first, profile.second);
1827 }
1828
1829 // up/down
// new_state values are bit masks that get XORed into osd_state; a value of
// 0 is treated as CEPH_OSD_UP
1830 for (const auto &state : inc.new_state) {
1831 const auto osd = state.first;
1832 int s = state.second ? state.second : CEPH_OSD_UP;
// UP set now and toggled by the mask: the osd is going down
1833 if ((osd_state[osd] & CEPH_OSD_UP) &&
1834 (s & CEPH_OSD_UP)) {
1835 osd_info[osd].down_at = epoch;
1836 osd_xinfo[osd].down_stamp = modified;
1837 }
1838 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
1839 (s & CEPH_OSD_EXISTS)) {
1840 // osd is destroyed; clear out anything interesting.
1841 (*osd_uuid)[osd] = uuid_d();
1842 osd_info[osd] = osd_info_t();
1843 osd_xinfo[osd] = osd_xinfo_t();
1844 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1845 osd_addrs->client_addr[osd].reset(new entity_addr_t());
1846 osd_addrs->cluster_addr[osd].reset(new entity_addr_t());
1847 osd_addrs->hb_front_addr[osd].reset(new entity_addr_t());
1848 osd_addrs->hb_back_addr[osd].reset(new entity_addr_t());
1849 osd_state[osd] = 0;
1850 } else {
1851 osd_state[osd] ^= s;
1852 }
1853 }
1854
// osds coming up: mark them existing and up, record their addresses and
// the epoch they came up in
1855 for (const auto &client : inc.new_up_client) {
1856 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1857 osd_addrs->client_addr[client.first].reset(new entity_addr_t(client.second));
1858 if (inc.new_hb_back_up.empty())
1859 osd_addrs->hb_back_addr[client.first].reset(new entity_addr_t(client.second)); //this is a backward-compatibility hack
1860 else
// NOTE(review): the find() result is dereferenced without an end() check -
// presumably new_hb_back_up has an entry for every osd in new_up_client
// whenever it is non-empty; confirm.
1861 osd_addrs->hb_back_addr[client.first].reset(
1862 new entity_addr_t(inc.new_hb_back_up.find(client.first)->second));
1863 const auto j = inc.new_hb_front_up.find(client.first);
1864 if (j != inc.new_hb_front_up.end())
1865 osd_addrs->hb_front_addr[client.first].reset(new entity_addr_t(j->second));
1866 else
1867 osd_addrs->hb_front_addr[client.first].reset();
1868
1869 osd_info[client.first].up_from = epoch;
1870 }
1871
1872 for (const auto &cluster : inc.new_up_cluster)
1873 osd_addrs->cluster_addr[cluster.first].reset(new entity_addr_t(cluster.second));
1874
1875 // info
1876 for (const auto &thru : inc.new_up_thru)
1877 osd_info[thru.first].up_thru = thru.second;
1878
1879 for (const auto &interval : inc.new_last_clean_interval) {
1880 osd_info[interval.first].last_clean_begin = interval.second.first;
1881 osd_info[interval.first].last_clean_end = interval.second.second;
1882 }
1883
1884 for (const auto &lost : inc.new_lost)
1885 osd_info[lost.first].lost_at = lost.second;
1886
1887 // xinfo
1888 for (const auto &xinfo : inc.new_xinfo)
1889 osd_xinfo[xinfo.first] = xinfo.second;
1890
1891 // uuid
1892 for (const auto &uuid : inc.new_uuid)
1893 (*osd_uuid)[uuid.first] = uuid.second;
1894
1895 // pg rebuild
// an empty vector in new_pg_temp means "remove the mapping"
1896 for (const auto &pg : inc.new_pg_temp) {
1897 if (pg.second.empty())
1898 pg_temp->erase(pg.first);
1899 else
31f18b77
FG
1900 pg_temp->set(pg.first, pg.second);
1901 }
1902 if (!inc.new_pg_temp.empty()) {
1903 // make sure pg_temp is efficiently stored
1904 pg_temp->rebuild();
7c673cae
FG
1905 }
1906
// -1 in new_primary_temp means "remove the mapping"
1907 for (const auto &pg : inc.new_primary_temp) {
1908 if (pg.second == -1)
1909 primary_temp->erase(pg.first);
1910 else
1911 (*primary_temp)[pg.first] = pg.second;
1912 }
1913
// additions are applied before removals for both upmap tables
1914 for (auto& p : inc.new_pg_upmap) {
1915 pg_upmap[p.first] = p.second;
1916 }
1917 for (auto& pg : inc.old_pg_upmap) {
1918 pg_upmap.erase(pg);
1919 }
1920 for (auto& p : inc.new_pg_upmap_items) {
1921 pg_upmap_items[p.first] = p.second;
1922 }
1923 for (auto& pg : inc.old_pg_upmap_items) {
1924 pg_upmap_items.erase(pg);
1925 }
1926
1927 // blacklist
1928 if (!inc.new_blacklist.empty()) {
1929 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
1930 new_blacklist_entries = true;
1931 }
1932 for (const auto &addr : inc.old_blacklist)
1933 blacklist.erase(addr);
1934
1935 // cluster snapshot?
// the snapshot name is reset whenever the incremental does not carry one
1936 if (inc.cluster_snapshot.length()) {
1937 cluster_snapshot = inc.cluster_snapshot;
1938 cluster_snapshot_epoch = inc.epoch;
1939 } else {
1940 cluster_snapshot.clear();
1941 cluster_snapshot_epoch = 0;
1942 }
1943
// negative ratios mean "unchanged"
1944 if (inc.new_nearfull_ratio >= 0) {
1945 nearfull_ratio = inc.new_nearfull_ratio;
1946 }
1947 if (inc.new_backfillfull_ratio >= 0) {
1948 backfillfull_ratio = inc.new_backfillfull_ratio;
1949 }
1950 if (inc.new_full_ratio >= 0) {
1951 full_ratio = inc.new_full_ratio;
1952 }
31f18b77 1953 if (inc.new_require_min_compat_client > 0) {
7c673cae
FG
1954 require_min_compat_client = inc.new_require_min_compat_client;
1955 }
31f18b77
FG
1956 if (inc.new_require_osd_release >= 0) {
1957 require_osd_release = inc.new_require_osd_release;
1958 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1959 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 1960 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
1961 }
1962 }
7c673cae
FG
1963
1964 // do new crush map last (after up/down stuff)
1965 if (inc.crush.length()) {
1966 bufferlist bl(inc.crush);
1967 auto blp = bl.begin();
1968 crush.reset(new CrushWrapper);
1969 crush->decode(blp);
31f18b77
FG
1970 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1971 // only increment if this is a luminous-encoded osdmap, lest
1972 // the mon's crush_version diverge from what the osds or others
1973 // are decoding and applying on their end. if we won't encode
1974 // it in the canonical version, don't change it.
1975 ++crush_version;
1976 }
7c673cae
FG
1977 }
1978
// refresh the derived counters and the cached up-osd feature set
1979 calc_num_osds();
1980 _calc_up_osd_features();
1981 return 0;
1982}
1983
1984// mapping
1985int OSDMap::map_to_pg(
1986 int64_t poolid,
1987 const string& name,
1988 const string& key,
1989 const string& nspace,
1990 pg_t *pg) const
1991{
1992 // calculate ps (placement seed)
1993 const pg_pool_t *pool = get_pg_pool(poolid);
1994 if (!pool)
1995 return -ENOENT;
1996 ps_t ps;
1997 if (!key.empty())
1998 ps = pool->hash_key(key, nspace);
1999 else
2000 ps = pool->hash_key(name, nspace);
2001 *pg = pg_t(ps, poolid);
2002 return 0;
2003}
2004
2005int OSDMap::object_locator_to_pg(
2006 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
2007{
2008 if (loc.hash >= 0) {
2009 if (!get_pg_pool(loc.get_pool())) {
2010 return -ENOENT;
2011 }
2012 pg = pg_t(loc.hash, loc.get_pool());
2013 return 0;
2014 }
2015 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
2016}
2017
2018ceph_object_layout OSDMap::make_object_layout(
2019 object_t oid, int pg_pool, string nspace) const
2020{
2021 object_locator_t loc(pg_pool, nspace);
2022
2023 ceph_object_layout ol;
2024 pg_t pgid = object_locator_to_pg(oid, loc);
2025 ol.ol_pgid = pgid.get_old_pg().v;
2026 ol.ol_stripe_unit = 0;
2027 return ol;
2028}
2029
2030void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
2031 vector<int>& osds) const
2032{
2033 if (pool.can_shift_osds()) {
2034 unsigned removed = 0;
2035 for (unsigned i = 0; i < osds.size(); i++) {
2036 if (!exists(osds[i])) {
2037 removed++;
2038 continue;
2039 }
2040 if (removed) {
2041 osds[i - removed] = osds[i];
2042 }
2043 }
2044 if (removed)
2045 osds.resize(osds.size() - removed);
2046 } else {
2047 for (auto& osd : osds) {
2048 if (!exists(osd))
2049 osd = CRUSH_ITEM_NONE;
2050 }
2051 }
2052}
2053
31f18b77 2054void OSDMap::_pg_to_raw_osds(
7c673cae
FG
2055 const pg_pool_t& pool, pg_t pg,
2056 vector<int> *osds,
2057 ps_t *ppps) const
2058{
2059 // map to osds[]
2060 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
2061 unsigned size = pool.get_size();
2062
2063 // what crush rule?
31f18b77 2064 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
7c673cae
FG
2065 if (ruleno >= 0)
2066 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
2067
2068 _remove_nonexistent_osds(pool, *osds);
2069
2070 if (ppps)
2071 *ppps = pps;
7c673cae
FG
2072}
2073
2074int OSDMap::_pick_primary(const vector<int>& osds) const
2075{
2076 for (auto osd : osds) {
2077 if (osd != CRUSH_ITEM_NONE) {
2078 return osd;
2079 }
2080 }
2081 return -1;
2082}
2083
// Apply the explicit mappings stored for a pg to its raw osd list.
//
// A pg_upmap entry replaces *raw wholesale; if any valid target osd in it
// has weight 0 the function returns without changing *raw at all.
// pg_upmap_items are then applied pair by pair on the result.
//
// @param pi      pool the pg belongs to
// @param raw_pg  raw pg id; converted with pi.raw_pg_to_pg() for the lookups
// @param raw     osd list, updated in place
224ce89b 2084void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
7c673cae
FG
2085{
2086 pg_t pg = pi.raw_pg_to_pg(raw_pg);
2087 auto p = pg_upmap.find(pg);
2088 if (p != pg_upmap.end()) {
2089 // make sure targets aren't marked out
2090 for (auto osd : p->second) {
91327a77
AA
2091 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
2092 osd_weight[osd] == 0) {
7c673cae
FG
2093 // reject/ignore the explicit mapping
// note: this return also skips the pg_upmap_items handling below
2094 return;
2095 }
2096 }
2097 *raw = vector<int>(p->second.begin(), p->second.end());
224ce89b 2098 // continue to check and apply pg_upmap_items if any
7c673cae
FG
2099 }
2100
2101 auto q = pg_upmap_items.find(pg);
2102 if (q != pg_upmap_items.end()) {
181888fb
FG
2103 // NOTE: this approach does not allow a bidirectional swap,
2104 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
// each item r is a (from, to) pair: r.first is replaced by r.second
2105 for (auto& r : q->second) {
2106 // make sure the replacement value doesn't already appear
2107 bool exists = false;
2108 ssize_t pos = -1;
2109 for (unsigned i = 0; i < raw->size(); ++i) {
2110 int osd = (*raw)[i];
2111 if (osd == r.second) {
2112 exists = true;
2113 break;
2114 }
2115 // ignore mapping if target is marked out (or invalid osd id)
// only the first occurrence of r.first is remembered (pos < 0)
2116 if (osd == r.first &&
2117 pos < 0 &&
2118 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
91327a77 2119 r.second >= 0 && osd_weight[r.second] == 0)) {
181888fb
FG
2120 pos = i;
2121 }
2122 }
2123 if (!exists && pos >= 0) {
2124 (*raw)[pos] = r.second;
7c673cae
FG
2125 }
2126 }
2127 }
2128}
2129
2130// pg -> (up osd list)
2131void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
2132 vector<int> *up) const
2133{
2134 if (pool.can_shift_osds()) {
2135 // shift left
2136 up->clear();
2137 up->reserve(raw.size());
2138 for (unsigned i=0; i<raw.size(); i++) {
2139 if (!exists(raw[i]) || is_down(raw[i]))
2140 continue;
2141 up->push_back(raw[i]);
2142 }
2143 } else {
2144 // set down/dne devices to NONE
2145 up->resize(raw.size());
2146 for (int i = raw.size() - 1; i >= 0; --i) {
2147 if (!exists(raw[i]) || is_down(raw[i])) {
2148 (*up)[i] = CRUSH_ITEM_NONE;
2149 } else {
2150 (*up)[i] = raw[i];
2151 }
2152 }
2153 }
2154}
2155
2156void OSDMap::_apply_primary_affinity(ps_t seed,
2157 const pg_pool_t& pool,
2158 vector<int> *osds,
2159 int *primary) const
2160{
2161 // do we have any non-default primary_affinity values for these osds?
2162 if (!osd_primary_affinity)
2163 return;
2164
2165 bool any = false;
2166 for (const auto osd : *osds) {
2167 if (osd != CRUSH_ITEM_NONE &&
2168 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2169 any = true;
2170 break;
2171 }
2172 }
2173 if (!any)
2174 return;
2175
2176 // pick the primary. feed both the seed (for the pg) and the osd
2177 // into the hash/rng so that a proportional fraction of an osd's pgs
2178 // get rejected as primary.
2179 int pos = -1;
2180 for (unsigned i = 0; i < osds->size(); ++i) {
2181 int o = (*osds)[i];
2182 if (o == CRUSH_ITEM_NONE)
2183 continue;
2184 unsigned a = (*osd_primary_affinity)[o];
2185 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2186 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2187 seed, o) >> 16) >= a) {
2188 // we chose not to use this primary. note it anyway as a
2189 // fallback in case we don't pick anyone else, but keep looking.
2190 if (pos < 0)
2191 pos = i;
2192 } else {
2193 pos = i;
2194 break;
2195 }
2196 }
2197 if (pos < 0)
2198 return;
2199
2200 *primary = (*osds)[pos];
2201
2202 if (pool.can_shift_osds() && pos > 0) {
2203 // move the new primary to the front.
2204 for (int i = pos; i > 0; --i) {
2205 (*osds)[i] = (*osds)[i-1];
2206 }
2207 (*osds)[0] = *primary;
2208 }
2209}
2210
2211void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2212 vector<int> *temp_pg, int *temp_primary) const
2213{
2214 pg = pool.raw_pg_to_pg(pg);
2215 const auto p = pg_temp->find(pg);
2216 temp_pg->clear();
2217 if (p != pg_temp->end()) {
2218 for (unsigned i=0; i<p->second.size(); i++) {
2219 if (!exists(p->second[i]) || is_down(p->second[i])) {
2220 if (pool.can_shift_osds()) {
2221 continue;
2222 } else {
2223 temp_pg->push_back(CRUSH_ITEM_NONE);
2224 }
2225 } else {
2226 temp_pg->push_back(p->second[i]);
2227 }
2228 }
2229 }
2230 const auto &pp = primary_temp->find(pg);
2231 *temp_primary = -1;
2232 if (pp != primary_temp->end()) {
2233 *temp_primary = pp->second;
2234 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2235 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2236 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2237 *temp_primary = (*temp_pg)[i];
2238 break;
2239 }
2240 }
2241 }
2242}
2243
31f18b77 2244void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
7c673cae
FG
2245{
2246 *primary = -1;
2247 raw->clear();
2248 const pg_pool_t *pool = get_pg_pool(pg.pool());
2249 if (!pool)
31f18b77
FG
2250 return;
2251 _pg_to_raw_osds(*pool, pg, raw, NULL);
7c673cae
FG
2252 if (primary)
2253 *primary = _pick_primary(*raw);
7c673cae
FG
2254}
2255
a8e16298
TL
2256void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int> *raw_upmap) const
2257{
2258 auto pool = get_pg_pool(pg.pool());
2259 if (!pool) {
2260 raw_upmap->clear();
2261 return;
2262 }
2263 _pg_to_raw_osds(*pool, pg, raw_upmap, NULL);
2264 _apply_upmap(*pool, pg, raw_upmap);
2265}
2266
7c673cae
FG
2267void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2268{
2269 const pg_pool_t *pool = get_pg_pool(pg.pool());
2270 if (!pool) {
2271 if (primary)
2272 *primary = -1;
2273 if (up)
2274 up->clear();
2275 return;
2276 }
2277 vector<int> raw;
2278 ps_t pps;
2279 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2280 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2281 _raw_to_up_osds(*pool, raw, up);
2282 *primary = _pick_primary(raw);
2283 _apply_primary_affinity(pps, *pool, up, primary);
2284}
31f18b77 2285
7c673cae
FG
2286void OSDMap::_pg_to_up_acting_osds(
2287 const pg_t& pg, vector<int> *up, int *up_primary,
2288 vector<int> *acting, int *acting_primary,
2289 bool raw_pg_to_pg) const
2290{
2291 const pg_pool_t *pool = get_pg_pool(pg.pool());
2292 if (!pool ||
2293 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2294 if (up)
2295 up->clear();
2296 if (up_primary)
2297 *up_primary = -1;
2298 if (acting)
2299 acting->clear();
2300 if (acting_primary)
2301 *acting_primary = -1;
2302 return;
2303 }
2304 vector<int> raw;
2305 vector<int> _up;
2306 vector<int> _acting;
2307 int _up_primary;
2308 int _acting_primary;
2309 ps_t pps;
2310 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
2311 if (_acting.empty() || up || up_primary) {
2312 _pg_to_raw_osds(*pool, pg, &raw, &pps);
224ce89b 2313 _apply_upmap(*pool, pg, &raw);
7c673cae
FG
2314 _raw_to_up_osds(*pool, raw, &_up);
2315 _up_primary = _pick_primary(_up);
2316 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2317 if (_acting.empty()) {
2318 _acting = _up;
2319 if (_acting_primary == -1) {
2320 _acting_primary = _up_primary;
2321 }
2322 }
2323
2324 if (up)
2325 up->swap(_up);
2326 if (up_primary)
2327 *up_primary = _up_primary;
2328 }
2329
2330 if (acting)
2331 acting->swap(_acting);
2332 if (acting_primary)
2333 *acting_primary = _acting_primary;
2334}
2335
2336int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
2337{
2338 if (!nrep)
2339 nrep = acting.size();
2340 for (int i=0; i<nrep; i++)
2341 if (acting[i] == osd)
2342 return i;
2343 return -1;
2344}
2345
2346int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2347{
2348 return calc_pg_rank(osd, acting, nrep);
2349}
2350
2351bool OSDMap::primary_changed(
2352 int oldprimary,
2353 const vector<int> &oldacting,
2354 int newprimary,
2355 const vector<int> &newacting)
2356{
2357 if (oldacting.empty() && newacting.empty())
2358 return false; // both still empty
2359 if (oldacting.empty() ^ newacting.empty())
2360 return true; // was empty, now not, or vice versa
2361 if (oldprimary != newprimary)
2362 return true; // primary changed
2363 if (calc_pg_rank(oldprimary, oldacting) !=
2364 calc_pg_rank(newprimary, newacting))
2365 return true;
2366 return false; // same primary (tho replicas may have changed)
2367}
2368
28e407b8
AA
2369uint64_t OSDMap::get_encoding_features() const
2370{
2371 uint64_t f = SIGNIFICANT_FEATURES;
2372 if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
2373 f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
2374 CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
2375 }
2376 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
2377 f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1adf2230 2378 CEPH_FEATURE_MSG_ADDR2);
28e407b8
AA
2379 }
2380 if (require_osd_release < CEPH_RELEASE_JEWEL) {
2381 f &= ~(CEPH_FEATURE_SERVER_JEWEL |
1adf2230
AA
2382 CEPH_FEATURE_NEW_OSDOP_ENCODING |
2383 CEPH_FEATURE_CRUSH_TUNABLES5);
28e407b8
AA
2384 }
2385 return f;
2386}
7c673cae
FG
2387
2388// serialize, unserialize
2389void OSDMap::encode_client_old(bufferlist& bl) const
2390{
2391 __u16 v = 5;
2392 ::encode(v, bl);
2393
2394 // base
2395 ::encode(fsid, bl);
2396 ::encode(epoch, bl);
2397 ::encode(created, bl);
2398 ::encode(modified, bl);
2399
2400 // for ::encode(pools, bl);
2401 __u32 n = pools.size();
2402 ::encode(n, bl);
2403
2404 for (const auto &pool : pools) {
2405 n = pool.first;
2406 ::encode(n, bl);
2407 ::encode(pool.second, bl, 0);
2408 }
2409 // for ::encode(pool_name, bl);
2410 n = pool_name.size();
2411 ::encode(n, bl);
2412 for (const auto &pname : pool_name) {
2413 n = pname.first;
2414 ::encode(n, bl);
2415 ::encode(pname.second, bl);
2416 }
2417 // for ::encode(pool_max, bl);
2418 n = pool_max;
2419 ::encode(n, bl);
2420
2421 ::encode(flags, bl);
2422
2423 ::encode(max_osd, bl);
31f18b77
FG
2424 {
2425 uint32_t n = osd_state.size();
2426 ::encode(n, bl);
2427 for (auto s : osd_state) {
2428 ::encode((uint8_t)s, bl);
2429 }
2430 }
7c673cae
FG
2431 ::encode(osd_weight, bl);
2432 ::encode(osd_addrs->client_addr, bl, 0);
2433
2434 // for ::encode(pg_temp, bl);
2435 n = pg_temp->size();
2436 ::encode(n, bl);
2437 for (const auto pg : *pg_temp) {
2438 old_pg_t opg = pg.first.get_old_pg();
2439 ::encode(opg, bl);
2440 ::encode(pg.second, bl);
2441 }
2442
2443 // crush
2444 bufferlist cbl;
2445 crush->encode(cbl, 0 /* legacy (no) features */);
2446 ::encode(cbl, bl);
2447}
2448
2449void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2450{
2451 if ((features & CEPH_FEATURE_PGID64) == 0) {
2452 encode_client_old(bl);
2453 return;
2454 }
2455
2456 __u16 v = 6;
2457 ::encode(v, bl);
2458
2459 // base
2460 ::encode(fsid, bl);
2461 ::encode(epoch, bl);
2462 ::encode(created, bl);
2463 ::encode(modified, bl);
2464
2465 ::encode(pools, bl, features);
2466 ::encode(pool_name, bl);
2467 ::encode(pool_max, bl);
2468
2469 ::encode(flags, bl);
2470
2471 ::encode(max_osd, bl);
31f18b77
FG
2472 {
2473 uint32_t n = osd_state.size();
2474 ::encode(n, bl);
2475 for (auto s : osd_state) {
2476 ::encode((uint8_t)s, bl);
2477 }
2478 }
7c673cae
FG
2479 ::encode(osd_weight, bl);
2480 ::encode(osd_addrs->client_addr, bl, features);
2481
2482 ::encode(*pg_temp, bl);
2483
2484 // crush
2485 bufferlist cbl;
2486 crush->encode(cbl, 0 /* legacy (no) features */);
2487 ::encode(cbl, bl);
2488
2489 // extended
2490 __u16 ev = 10;
2491 ::encode(ev, bl);
2492 ::encode(osd_addrs->hb_back_addr, bl, features);
2493 ::encode(osd_info, bl);
2494 ::encode(blacklist, bl, features);
2495 ::encode(osd_addrs->cluster_addr, bl, features);
2496 ::encode(cluster_snapshot_epoch, bl);
2497 ::encode(cluster_snapshot, bl);
2498 ::encode(*osd_uuid, bl);
2499 ::encode(osd_xinfo, bl);
2500 ::encode(osd_addrs->hb_front_addr, bl, features);
2501}
2502
2503void OSDMap::encode(bufferlist& bl, uint64_t features) const
2504{
2505 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2506 encode_classic(bl, features);
2507 return;
2508 }
2509
2510 // only a select set of callers should *ever* be encoding new
2511 // OSDMaps. others should be passing around the canonical encoded
2512 // buffers from on high. select out those callers by passing in an
2513 // "impossible" feature bit.
2514 assert(features & CEPH_FEATURE_RESERVED);
2515 features &= ~CEPH_FEATURE_RESERVED;
2516
2517 size_t start_offset = bl.length();
2518 size_t tail_offset;
2519 buffer::list::iterator crc_it;
2520
2521 // meta-encoding: how we include client-used and osd-specific data
2522 ENCODE_START(8, 7, bl);
2523
2524 {
28e407b8
AA
2525 // NOTE: any new encoding dependencies must be reflected by
2526 // SIGNIFICANT_FEATURES
31f18b77
FG
2527 uint8_t v = 6;
2528 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
7c673cae
FG
2529 v = 3;
2530 }
2531 ENCODE_START(v, 1, bl); // client-usable data
2532 // base
2533 ::encode(fsid, bl);
2534 ::encode(epoch, bl);
2535 ::encode(created, bl);
2536 ::encode(modified, bl);
2537
2538 ::encode(pools, bl, features);
2539 ::encode(pool_name, bl);
2540 ::encode(pool_max, bl);
2541
31f18b77
FG
2542 if (v < 4) {
2543 decltype(flags) f = flags;
2544 if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
c07f9fc5 2545 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2546 else if (require_osd_release == CEPH_RELEASE_KRAKEN)
2547 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
2548 else if (require_osd_release == CEPH_RELEASE_JEWEL)
2549 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
2550 ::encode(f, bl);
2551 } else {
2552 ::encode(flags, bl);
2553 }
7c673cae
FG
2554
2555 ::encode(max_osd, bl);
31f18b77
FG
2556 if (v >= 5) {
2557 ::encode(osd_state, bl);
2558 } else {
2559 uint32_t n = osd_state.size();
2560 ::encode(n, bl);
2561 for (auto s : osd_state) {
2562 ::encode((uint8_t)s, bl);
2563 }
2564 }
7c673cae
FG
2565 ::encode(osd_weight, bl);
2566 ::encode(osd_addrs->client_addr, bl, features);
2567
2568 ::encode(*pg_temp, bl);
2569 ::encode(*primary_temp, bl);
2570 if (osd_primary_affinity) {
2571 ::encode(*osd_primary_affinity, bl);
2572 } else {
2573 vector<__u32> v;
2574 ::encode(v, bl);
2575 }
2576
2577 // crush
2578 bufferlist cbl;
2579 crush->encode(cbl, features);
2580 ::encode(cbl, bl);
2581 ::encode(erasure_code_profiles, bl);
2582
2583 if (v >= 4) {
2584 ::encode(pg_upmap, bl);
2585 ::encode(pg_upmap_items, bl);
2586 } else {
2587 assert(pg_upmap.empty());
2588 assert(pg_upmap_items.empty());
2589 }
31f18b77
FG
2590 if (v >= 6) {
2591 ::encode(crush_version, bl);
2592 }
7c673cae
FG
2593 ENCODE_FINISH(bl); // client-usable data
2594 }
2595
2596 {
28e407b8
AA
2597 // NOTE: any new encoding dependencies must be reflected by
2598 // SIGNIFICANT_FEATURES
31f18b77 2599 uint8_t target_v = 5;
7c673cae
FG
2600 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2601 target_v = 1;
2602 }
2603 ENCODE_START(target_v, 1, bl); // extended, osd-only data
2604 ::encode(osd_addrs->hb_back_addr, bl, features);
2605 ::encode(osd_info, bl);
2606 {
2607 // put this in a sorted, ordered map<> so that we encode in a
2608 // deterministic order.
2609 map<entity_addr_t,utime_t> blacklist_map;
2610 for (const auto &addr : blacklist)
2611 blacklist_map.insert(make_pair(addr.first, addr.second));
2612 ::encode(blacklist_map, bl, features);
2613 }
2614 ::encode(osd_addrs->cluster_addr, bl, features);
2615 ::encode(cluster_snapshot_epoch, bl);
2616 ::encode(cluster_snapshot, bl);
2617 ::encode(*osd_uuid, bl);
2618 ::encode(osd_xinfo, bl);
2619 ::encode(osd_addrs->hb_front_addr, bl, features);
2620 if (target_v >= 2) {
2621 ::encode(nearfull_ratio, bl);
2622 ::encode(full_ratio, bl);
2623 ::encode(backfillfull_ratio, bl);
31f18b77
FG
2624 }
2625 // 4 was string-based new_require_min_compat_client
2626 if (target_v >= 5) {
7c673cae 2627 ::encode(require_min_compat_client, bl);
31f18b77 2628 ::encode(require_osd_release, bl);
7c673cae
FG
2629 }
2630 ENCODE_FINISH(bl); // osd-only data
2631 }
2632
2633 ::encode((uint32_t)0, bl); // dummy crc
2634 crc_it = bl.end();
2635 crc_it.advance(-4);
2636 tail_offset = bl.length();
2637
2638 ENCODE_FINISH(bl); // meta-encoding wrapper
2639
2640 // fill in crc
2641 bufferlist front;
2642 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
2643 crc = front.crc32c(-1);
2644 if (tail_offset < bl.length()) {
2645 bufferlist tail;
2646 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2647 crc = tail.crc32c(crc);
2648 }
2649 ceph_le32 crc_le;
2650 crc_le = crc;
2651 crc_it.copy_in(4, (char*)&crc_le);
2652 crc_defined = true;
2653}
2654
2655void OSDMap::decode(bufferlist& bl)
2656{
2657 auto p = bl.begin();
2658 decode(p);
2659}
2660
2661void OSDMap::decode_classic(bufferlist::iterator& p)
2662{
2663 __u32 n, t;
2664 __u16 v;
2665 ::decode(v, p);
2666
2667 // base
2668 ::decode(fsid, p);
2669 ::decode(epoch, p);
2670 ::decode(created, p);
2671 ::decode(modified, p);
2672
2673 if (v < 6) {
2674 if (v < 4) {
2675 int32_t max_pools = 0;
2676 ::decode(max_pools, p);
2677 pool_max = max_pools;
2678 }
2679 pools.clear();
2680 ::decode(n, p);
2681 while (n--) {
2682 ::decode(t, p);
2683 ::decode(pools[t], p);
2684 }
2685 if (v == 4) {
2686 ::decode(n, p);
2687 pool_max = n;
2688 } else if (v == 5) {
2689 pool_name.clear();
2690 ::decode(n, p);
2691 while (n--) {
2692 ::decode(t, p);
2693 ::decode(pool_name[t], p);
2694 }
2695 ::decode(n, p);
2696 pool_max = n;
2697 }
2698 } else {
2699 ::decode(pools, p);
2700 ::decode(pool_name, p);
2701 ::decode(pool_max, p);
2702 }
2703 // kludge around some old bug that zeroed out pool_max (#2307)
2704 if (pools.size() && pool_max < pools.rbegin()->first) {
2705 pool_max = pools.rbegin()->first;
2706 }
2707
2708 ::decode(flags, p);
2709
2710 ::decode(max_osd, p);
31f18b77
FG
2711 {
2712 vector<uint8_t> os;
2713 ::decode(os, p);
2714 osd_state.resize(os.size());
2715 for (unsigned i = 0; i < os.size(); ++i) {
2716 osd_state[i] = os[i];
2717 }
2718 }
7c673cae
FG
2719 ::decode(osd_weight, p);
2720 ::decode(osd_addrs->client_addr, p);
2721 if (v <= 5) {
2722 pg_temp->clear();
2723 ::decode(n, p);
2724 while (n--) {
2725 old_pg_t opg;
2726 ::decode_raw(opg, p);
31f18b77
FG
2727 mempool::osdmap::vector<int32_t> v;
2728 ::decode(v, p);
2729 pg_temp->set(pg_t(opg), v);
7c673cae
FG
2730 }
2731 } else {
2732 ::decode(*pg_temp, p);
2733 }
2734
2735 // crush
2736 bufferlist cbl;
2737 ::decode(cbl, p);
2738 auto cblp = cbl.begin();
2739 crush->decode(cblp);
2740
2741 // extended
2742 __u16 ev = 0;
2743 if (v >= 5)
2744 ::decode(ev, p);
2745 ::decode(osd_addrs->hb_back_addr, p);
2746 ::decode(osd_info, p);
2747 if (v < 5)
2748 ::decode(pool_name, p);
2749
2750 ::decode(blacklist, p);
2751 if (ev >= 6)
2752 ::decode(osd_addrs->cluster_addr, p);
2753 else
2754 osd_addrs->cluster_addr.resize(osd_addrs->client_addr.size());
2755
2756 if (ev >= 7) {
2757 ::decode(cluster_snapshot_epoch, p);
2758 ::decode(cluster_snapshot, p);
2759 }
2760
2761 if (ev >= 8) {
2762 ::decode(*osd_uuid, p);
2763 } else {
2764 osd_uuid->resize(max_osd);
2765 }
2766 if (ev >= 9)
2767 ::decode(osd_xinfo, p);
2768 else
2769 osd_xinfo.resize(max_osd);
2770
2771 if (ev >= 10)
2772 ::decode(osd_addrs->hb_front_addr, p);
2773 else
2774 osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
2775
2776 osd_primary_affinity.reset();
2777
2778 post_decode();
2779}
2780
2781void OSDMap::decode(bufferlist::iterator& bl)
2782{
2783 /**
2784 * Older encodings of the OSDMap had a single struct_v which
2785 * covered the whole encoding, and was prior to our modern
2786 * stuff which includes a compatv and a size. So if we see
2787 * a struct_v < 7, we must rewind to the beginning and use our
2788 * classic decoder.
2789 */
2790 size_t start_offset = bl.get_off();
2791 size_t tail_offset = 0;
2792 bufferlist crc_front, crc_tail;
2793
2794 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
2795 if (struct_v < 7) {
2796 int struct_v_size = sizeof(struct_v);
2797 bl.advance(-struct_v_size);
2798 decode_classic(bl);
2799 return;
2800 }
2801 /**
2802 * Since we made it past that hurdle, we can use our normal paths.
2803 */
2804 {
31f18b77 2805 DECODE_START(6, bl); // client-usable data
7c673cae
FG
2806 // base
2807 ::decode(fsid, bl);
2808 ::decode(epoch, bl);
2809 ::decode(created, bl);
2810 ::decode(modified, bl);
2811
2812 ::decode(pools, bl);
2813 ::decode(pool_name, bl);
2814 ::decode(pool_max, bl);
2815
2816 ::decode(flags, bl);
2817
2818 ::decode(max_osd, bl);
31f18b77
FG
2819 if (struct_v >= 5) {
2820 ::decode(osd_state, bl);
2821 } else {
2822 vector<uint8_t> os;
2823 ::decode(os, bl);
2824 osd_state.resize(os.size());
2825 for (unsigned i = 0; i < os.size(); ++i) {
2826 osd_state[i] = os[i];
2827 }
2828 }
7c673cae
FG
2829 ::decode(osd_weight, bl);
2830 ::decode(osd_addrs->client_addr, bl);
2831
2832 ::decode(*pg_temp, bl);
2833 ::decode(*primary_temp, bl);
2834 if (struct_v >= 2) {
2835 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
2836 ::decode(*osd_primary_affinity, bl);
2837 if (osd_primary_affinity->empty())
2838 osd_primary_affinity.reset();
2839 } else {
2840 osd_primary_affinity.reset();
2841 }
2842
2843 // crush
2844 bufferlist cbl;
2845 ::decode(cbl, bl);
2846 auto cblp = cbl.begin();
2847 crush->decode(cblp);
2848 if (struct_v >= 3) {
2849 ::decode(erasure_code_profiles, bl);
2850 } else {
2851 erasure_code_profiles.clear();
2852 }
2853 if (struct_v >= 4) {
2854 ::decode(pg_upmap, bl);
2855 ::decode(pg_upmap_items, bl);
2856 } else {
2857 pg_upmap.clear();
2858 pg_upmap_items.clear();
2859 }
31f18b77
FG
2860 if (struct_v >= 6) {
2861 ::decode(crush_version, bl);
2862 }
7c673cae
FG
2863 DECODE_FINISH(bl); // client-usable data
2864 }
2865
2866 {
31f18b77 2867 DECODE_START(5, bl); // extended, osd-only data
7c673cae
FG
2868 ::decode(osd_addrs->hb_back_addr, bl);
2869 ::decode(osd_info, bl);
2870 ::decode(blacklist, bl);
2871 ::decode(osd_addrs->cluster_addr, bl);
2872 ::decode(cluster_snapshot_epoch, bl);
2873 ::decode(cluster_snapshot, bl);
2874 ::decode(*osd_uuid, bl);
2875 ::decode(osd_xinfo, bl);
2876 ::decode(osd_addrs->hb_front_addr, bl);
2877 if (struct_v >= 2) {
2878 ::decode(nearfull_ratio, bl);
2879 ::decode(full_ratio, bl);
2880 } else {
2881 nearfull_ratio = 0;
2882 full_ratio = 0;
2883 }
2884 if (struct_v >= 3) {
2885 ::decode(backfillfull_ratio, bl);
2886 } else {
2887 backfillfull_ratio = 0;
2888 }
31f18b77
FG
2889 if (struct_v == 4) {
2890 string r;
2891 ::decode(r, bl);
2892 if (r.length())
2893 require_min_compat_client = ceph_release_from_name(r.c_str());
2894 }
2895 if (struct_v >= 5) {
7c673cae 2896 ::decode(require_min_compat_client, bl);
31f18b77
FG
2897 ::decode(require_osd_release, bl);
2898 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2899 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2900 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2901 }
2902 } else {
2903 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
2904 // only for compat with post-kraken pre-luminous test clusters
2905 require_osd_release = CEPH_RELEASE_LUMINOUS;
2906 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
c07f9fc5 2907 flags |= CEPH_OSDMAP_RECOVERY_DELETES;
31f18b77
FG
2908 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2909 require_osd_release = CEPH_RELEASE_KRAKEN;
2910 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2911 require_osd_release = CEPH_RELEASE_JEWEL;
2912 } else {
2913 require_osd_release = 0;
2914 }
2915 }
7c673cae
FG
2916 DECODE_FINISH(bl); // osd-only data
2917 }
2918
2919 if (struct_v >= 8) {
2920 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
2921 ::decode(crc, bl);
2922 tail_offset = bl.get_off();
2923 crc_defined = true;
2924 } else {
2925 crc_defined = false;
2926 crc = 0;
2927 }
2928
2929 DECODE_FINISH(bl); // wrapper
2930
2931 if (tail_offset) {
2932 // verify crc
2933 uint32_t actual = crc_front.crc32c(-1);
2934 if (tail_offset < bl.get_off()) {
2935 bufferlist tail;
2936 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
2937 actual = tail.crc32c(actual);
2938 }
2939 if (crc != actual) {
2940 ostringstream ss;
2941 ss << "bad crc, actual " << actual << " != expected " << crc;
2942 string s = ss.str();
2943 throw buffer::malformed_input(s.c_str());
2944 }
2945 }
2946
2947 post_decode();
2948}
2949
2950void OSDMap::post_decode()
2951{
2952 // index pool names
2953 name_pool.clear();
2954 for (const auto &pname : pool_name) {
2955 name_pool[pname.second] = pname.first;
2956 }
2957
2958 calc_num_osds();
2959 _calc_up_osd_features();
2960}
2961
2962void OSDMap::dump_erasure_code_profiles(
2963 const mempool::osdmap::map<string,map<string,string>>& profiles,
2964 Formatter *f)
2965{
2966 f->open_object_section("erasure_code_profiles");
2967 for (const auto &profile : profiles) {
2968 f->open_object_section(profile.first.c_str());
2969 for (const auto &profm : profile.second) {
2970 f->dump_string(profm.first.c_str(), profm.second.c_str());
2971 }
2972 f->close_section();
2973 }
2974 f->close_section();
2975}
2976
2977void OSDMap::dump(Formatter *f) const
2978{
2979 f->dump_int("epoch", get_epoch());
2980 f->dump_stream("fsid") << get_fsid();
2981 f->dump_stream("created") << get_created();
2982 f->dump_stream("modified") << get_modified();
2983 f->dump_string("flags", get_flag_string());
31f18b77 2984 f->dump_unsigned("crush_version", get_crush_version());
7c673cae
FG
2985 f->dump_float("full_ratio", full_ratio);
2986 f->dump_float("backfillfull_ratio", backfillfull_ratio);
2987 f->dump_float("nearfull_ratio", nearfull_ratio);
2988 f->dump_string("cluster_snapshot", get_cluster_snapshot());
2989 f->dump_int("pool_max", get_pool_max());
2990 f->dump_int("max_osd", get_max_osd());
31f18b77
FG
2991 f->dump_string("require_min_compat_client",
2992 ceph_release_name(require_min_compat_client));
2993 f->dump_string("min_compat_client",
2994 ceph_release_name(get_min_compat_client()));
2995 f->dump_string("require_osd_release",
2996 ceph_release_name(require_osd_release));
7c673cae
FG
2997
2998 f->open_array_section("pools");
2999 for (const auto &pool : pools) {
3000 std::string name("<unknown>");
3001 const auto &pni = pool_name.find(pool.first);
3002 if (pni != pool_name.end())
3003 name = pni->second;
3004 f->open_object_section("pool");
3005 f->dump_int("pool", pool.first);
3006 f->dump_string("pool_name", name);
3007 pool.second.dump(f);
3008 f->close_section();
3009 }
3010 f->close_section();
3011
3012 f->open_array_section("osds");
3013 for (int i=0; i<get_max_osd(); i++)
3014 if (exists(i)) {
3015 f->open_object_section("osd_info");
3016 f->dump_int("osd", i);
3017 f->dump_stream("uuid") << get_uuid(i);
3018 f->dump_int("up", is_up(i));
3019 f->dump_int("in", is_in(i));
3020 f->dump_float("weight", get_weightf(i));
3021 f->dump_float("primary_affinity", get_primary_affinityf(i));
3022 get_info(i).dump(f);
3023 f->dump_stream("public_addr") << get_addr(i);
3024 f->dump_stream("cluster_addr") << get_cluster_addr(i);
3025 f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
3026 f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
3027
3028 set<string> st;
3029 get_state(i, st);
3030 f->open_array_section("state");
3031 for (const auto &state : st)
3032 f->dump_string("state", state);
3033 f->close_section();
3034
3035 f->close_section();
3036 }
3037 f->close_section();
3038
3039 f->open_array_section("osd_xinfo");
3040 for (int i=0; i<get_max_osd(); i++) {
3041 if (exists(i)) {
3042 f->open_object_section("xinfo");
3043 f->dump_int("osd", i);
3044 osd_xinfo[i].dump(f);
3045 f->close_section();
3046 }
3047 }
3048 f->close_section();
3049
3050 f->open_array_section("pg_upmap");
3051 for (auto& p : pg_upmap) {
3052 f->open_object_section("mapping");
3053 f->dump_stream("pgid") << p.first;
3054 f->open_array_section("osds");
3055 for (auto q : p.second) {
3056 f->dump_int("osd", q);
3057 }
3058 f->close_section();
3059 f->close_section();
3060 }
3061 f->close_section();
3062 f->open_array_section("pg_upmap_items");
3063 for (auto& p : pg_upmap_items) {
3064 f->open_object_section("mapping");
3065 f->dump_stream("pgid") << p.first;
3066 f->open_array_section("mappings");
3067 for (auto& q : p.second) {
3068 f->open_object_section("mapping");
3069 f->dump_int("from", q.first);
3070 f->dump_int("to", q.second);
3071 f->close_section();
3072 }
3073 f->close_section();
3074 f->close_section();
3075 }
3076 f->close_section();
3077 f->open_array_section("pg_temp");
31f18b77 3078 pg_temp->dump(f);
7c673cae
FG
3079 f->close_section();
3080
3081 f->open_array_section("primary_temp");
3082 for (const auto &pg : *primary_temp) {
3083 f->dump_stream("pgid") << pg.first;
3084 f->dump_int("osd", pg.second);
3085 }
3086 f->close_section(); // primary_temp
3087
3088 f->open_object_section("blacklist");
3089 for (const auto &addr : blacklist) {
3090 stringstream ss;
3091 ss << addr.first;
3092 f->dump_stream(ss.str().c_str()) << addr.second;
3093 }
3094 f->close_section();
3095
3096 dump_erasure_code_profiles(erasure_code_profiles, f);
3097}
3098
3099void OSDMap::generate_test_instances(list<OSDMap*>& o)
3100{
3101 o.push_back(new OSDMap);
3102
3103 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
3104 o.push_back(new OSDMap);
3105 uuid_d fsid;
224ce89b 3106 o.back()->build_simple(cct, 1, fsid, 16);
7c673cae
FG
3107 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
3108 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
3109 cct->put();
3110}
3111
3112string OSDMap::get_flag_string(unsigned f)
3113{
3114 string s;
3115 if ( f& CEPH_OSDMAP_NEARFULL)
3116 s += ",nearfull";
3117 if (f & CEPH_OSDMAP_FULL)
3118 s += ",full";
3119 if (f & CEPH_OSDMAP_PAUSERD)
3120 s += ",pauserd";
3121 if (f & CEPH_OSDMAP_PAUSEWR)
3122 s += ",pausewr";
3123 if (f & CEPH_OSDMAP_PAUSEREC)
3124 s += ",pauserec";
3125 if (f & CEPH_OSDMAP_NOUP)
3126 s += ",noup";
3127 if (f & CEPH_OSDMAP_NODOWN)
3128 s += ",nodown";
3129 if (f & CEPH_OSDMAP_NOOUT)
3130 s += ",noout";
3131 if (f & CEPH_OSDMAP_NOIN)
3132 s += ",noin";
3133 if (f & CEPH_OSDMAP_NOBACKFILL)
3134 s += ",nobackfill";
3135 if (f & CEPH_OSDMAP_NOREBALANCE)
3136 s += ",norebalance";
3137 if (f & CEPH_OSDMAP_NORECOVER)
3138 s += ",norecover";
3139 if (f & CEPH_OSDMAP_NOSCRUB)
3140 s += ",noscrub";
3141 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
3142 s += ",nodeep-scrub";
3143 if (f & CEPH_OSDMAP_NOTIERAGENT)
3144 s += ",notieragent";
3145 if (f & CEPH_OSDMAP_SORTBITWISE)
3146 s += ",sortbitwise";
3147 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
3148 s += ",require_jewel_osds";
3149 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
3150 s += ",require_kraken_osds";
3151 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
3152 s += ",require_luminous_osds";
c07f9fc5
FG
3153 if (f & CEPH_OSDMAP_RECOVERY_DELETES)
3154 s += ",recovery_deletes";
181888fb
FG
3155 if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
3156 s += ",purged_snapdirs";
f64942e4
AA
3157 if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
3158 s += ",pglog_hardlimit";
7c673cae
FG
3159 if (s.length())
3160 s.erase(0, 1);
3161 return s;
3162}
3163
3164string OSDMap::get_flag_string() const
3165{
3166 return get_flag_string(flags);
3167}
3168
7c673cae
FG
3169void OSDMap::print_pools(ostream& out) const
3170{
3171 for (const auto &pool : pools) {
3172 std::string name("<unknown>");
3173 const auto &pni = pool_name.find(pool.first);
3174 if (pni != pool_name.end())
3175 name = pni->second;
3176 out << "pool " << pool.first
3177 << " '" << name
3178 << "' " << pool.second << "\n";
3179
3180 for (const auto &snap : pool.second.snaps)
3181 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
3182
3183 if (!pool.second.removed_snaps.empty())
3184 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
3185 }
3186 out << std::endl;
3187}
3188
3189void OSDMap::print(ostream& out) const
3190{
3191 out << "epoch " << get_epoch() << "\n"
3192 << "fsid " << get_fsid() << "\n"
3193 << "created " << get_created() << "\n"
3194 << "modified " << get_modified() << "\n";
3195
3196 out << "flags " << get_flag_string() << "\n";
31f18b77 3197 out << "crush_version " << get_crush_version() << "\n";
7c673cae
FG
3198 out << "full_ratio " << full_ratio << "\n";
3199 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3200 out << "nearfull_ratio " << nearfull_ratio << "\n";
31f18b77
FG
3201 if (require_min_compat_client > 0) {
3202 out << "require_min_compat_client "
3203 << ceph_release_name(require_min_compat_client) << "\n";
7c673cae 3204 }
31f18b77
FG
3205 out << "min_compat_client " << ceph_release_name(get_min_compat_client())
3206 << "\n";
224ce89b
WB
3207 if (require_osd_release > 0) {
3208 out << "require_osd_release " << ceph_release_name(require_osd_release)
3209 << "\n";
3210 }
7c673cae
FG
3211 if (get_cluster_snapshot().length())
3212 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3213 out << "\n";
3214
3215 print_pools(out);
3216
3217 out << "max_osd " << get_max_osd() << "\n";
3218 for (int i=0; i<get_max_osd(); i++) {
3219 if (exists(i)) {
3220 out << "osd." << i;
3221 out << (is_up(i) ? " up ":" down");
3222 out << (is_in(i) ? " in ":" out");
3223 out << " weight " << get_weightf(i);
3224 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
3225 out << " primary_affinity " << get_primary_affinityf(i);
3226 const osd_info_t& info(get_info(i));
3227 out << " " << info;
3228 out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
3229 << " " << get_hb_front_addr(i);
3230 set<string> st;
3231 get_state(i, st);
3232 out << " " << st;
3233 if (!get_uuid(i).is_zero())
3234 out << " " << get_uuid(i);
3235 out << "\n";
3236 }
3237 }
3238 out << std::endl;
3239
3240 for (auto& p : pg_upmap) {
3241 out << "pg_upmap " << p.first << " " << p.second << "\n";
3242 }
3243 for (auto& p : pg_upmap_items) {
3244 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3245 }
3246
3247 for (const auto pg : *pg_temp)
3248 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3249
3250 for (const auto pg : *primary_temp)
3251 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3252
3253 for (const auto &addr : blacklist)
3254 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
3255
3256 // ignore pg_swap_primary
3257}
3258
3259class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3260public:
3261 typedef CrushTreeDumper::Dumper<TextTable> Parent;
31f18b77
FG
3262
3263 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3264 unsigned f)
c07f9fc5 3265 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3266
3267 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3268 if (!filter) {
3269 return true; // normal case
3270 }
3271 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3272 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3273 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3274 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3275 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3276 return true;
31f18b77 3277 }
c07f9fc5 3278 return false;
31f18b77
FG
3279 }
3280
3281 bool should_dump_empty_bucket() const override {
3282 return !filter;
3283 }
7c673cae
FG
3284
3285 void dump(TextTable *tbl) {
3286 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3287 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3288 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3289 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
c07f9fc5 3290 tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 3291 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
224ce89b 3292 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
3293
3294 Parent::dump(tbl);
3295
3296 for (int i = 0; i < osdmap->get_max_osd(); i++) {
31f18b77 3297 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
c07f9fc5 3298 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
31f18b77 3299 }
7c673cae
FG
3300 }
3301 }
3302
3303protected:
3304 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
224ce89b
WB
3305 const char *c = crush->get_item_class(qi.id);
3306 if (!c)
3307 c = "";
7c673cae 3308 *tbl << qi.id
224ce89b 3309 << c
7c673cae
FG
3310 << weightf_t(qi.weight);
3311
3312 ostringstream name;
3313 for (int k = 0; k < qi.depth; k++)
3314 name << " ";
3315 if (qi.is_bucket()) {
3316 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3317 << crush->get_item_name(qi.id);
3318 } else {
3319 name << "osd." << qi.id;
3320 }
3321 *tbl << name.str();
3322
3323 if (!qi.is_bucket()) {
3324 if (!osdmap->exists(qi.id)) {
3325 *tbl << "DNE"
3326 << 0;
3327 } else {
c07f9fc5
FG
3328 string s;
3329 if (osdmap->is_up(qi.id)) {
3330 s = "up";
3331 } else if (osdmap->is_destroyed(qi.id)) {
3332 s = "destroyed";
3333 } else {
3334 s = "down";
3335 }
3336 *tbl << s
7c673cae
FG
3337 << weightf_t(osdmap->get_weightf(qi.id))
3338 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3339 }
3340 }
3341 *tbl << TextTable::endrow;
3342 }
3343
3344private:
3345 const OSDMap *osdmap;
31f18b77 3346 const unsigned filter;
7c673cae
FG
3347};
3348
3349class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3350public:
3351 typedef CrushTreeDumper::FormattingDumper Parent;
3352
31f18b77
FG
3353 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3354 unsigned f)
c07f9fc5 3355 : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
31f18b77
FG
3356
3357 bool should_dump_leaf(int i) const override {
c07f9fc5
FG
3358 if (!filter) {
3359 return true; // normal case
3360 }
3361 if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
3362 ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
3363 ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
3364 ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
3365 ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
3366 return true;
31f18b77 3367 }
c07f9fc5 3368 return false;
31f18b77
FG
3369 }
3370
3371 bool should_dump_empty_bucket() const override {
3372 return !filter;
3373 }
7c673cae
FG
3374
3375 void dump(Formatter *f) {
3376 f->open_array_section("nodes");
3377 Parent::dump(f);
3378 f->close_section();
3379 f->open_array_section("stray");
3380 for (int i = 0; i < osdmap->get_max_osd(); i++) {
31f18b77 3381 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
c07f9fc5 3382 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
7c673cae
FG
3383 }
3384 f->close_section();
3385 }
3386
3387protected:
3388 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3389 Parent::dump_item_fields(qi, f);
3390 if (!qi.is_bucket())
3391 {
c07f9fc5
FG
3392 string s;
3393 if (osdmap->is_up(qi.id)) {
3394 s = "up";
3395 } else if (osdmap->is_destroyed(qi.id)) {
3396 s = "destroyed";
3397 } else {
3398 s = "down";
3399 }
7c673cae 3400 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
c07f9fc5 3401 f->dump_string("status", s);
7c673cae
FG
3402 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3403 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3404 }
3405 }
3406
3407private:
3408 const OSDMap *osdmap;
31f18b77 3409 const unsigned filter;
7c673cae
FG
3410};
3411
31f18b77 3412void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter) const
7c673cae 3413{
31f18b77
FG
3414 if (f) {
3415 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f);
3416 } else {
7c673cae
FG
3417 assert(out);
3418 TextTable tbl;
31f18b77 3419 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl);
7c673cae
FG
3420 *out << tbl;
3421 }
3422}
3423
224ce89b
WB
3424void OSDMap::print_summary(Formatter *f, ostream& out,
3425 const string& prefix) const
7c673cae
FG
3426{
3427 if (f) {
3428 f->open_object_section("osdmap");
3429 f->dump_int("epoch", get_epoch());
3430 f->dump_int("num_osds", get_num_osds());
3431 f->dump_int("num_up_osds", get_num_up_osds());
3432 f->dump_int("num_in_osds", get_num_in_osds());
3433 f->dump_bool("full", test_flag(CEPH_OSDMAP_FULL) ? true : false);
3434 f->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ? true : false);
3435 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3436 f->close_section();
3437 } else {
31f18b77 3438 out << get_num_osds() << " osds: "
7c673cae
FG
3439 << get_num_up_osds() << " up, "
3440 << get_num_in_osds() << " in";
3441 if (get_num_pg_temp())
3442 out << "; " << get_num_pg_temp() << " remapped pgs";
3443 out << "\n";
3444 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
3445 if (important_flags)
224ce89b 3446 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
7c673cae
FG
3447 }
3448}
3449
3450void OSDMap::print_oneline_summary(ostream& out) const
3451{
3452 out << "e" << get_epoch() << ": "
31f18b77 3453 << get_num_osds() << " total, "
7c673cae
FG
3454 << get_num_up_osds() << " up, "
3455 << get_num_in_osds() << " in";
3456 if (test_flag(CEPH_OSDMAP_FULL))
3457 out << " full";
3458 else if (test_flag(CEPH_OSDMAP_NEARFULL))
3459 out << " nearfull";
3460}
3461
3efd9988 3462bool OSDMap::crush_rule_in_use(int rule_id) const
7c673cae
FG
3463{
3464 for (const auto &pool : pools) {
3efd9988 3465 if (pool.second.crush_rule == rule_id)
7c673cae
FG
3466 return true;
3467 }
3468 return false;
3469}
3470
3efd9988
FG
3471int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
3472 ostream *ss) const
3473{
3474 for (auto& i : pools) {
3475 auto& pool = i.second;
3476 int ruleno = pool.get_crush_rule();
3477 if (!newcrush->rule_exists(ruleno)) {
3478 *ss << "pool " << i.first << " references crush_rule " << ruleno
3479 << " but it is not present";
3480 return -EINVAL;
3481 }
3482 if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
3483 *ss << "rule " << ruleno << " mask ruleset does not match rule id";
3484 return -EINVAL;
3485 }
3486 if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
3487 *ss << "pool " << i.first << " type does not match rule " << ruleno;
3488 return -EINVAL;
3489 }
3490 if (pool.get_size() < (int)newcrush->get_rule_mask_min_size(ruleno) ||
3491 pool.get_size() > (int)newcrush->get_rule_mask_max_size(ruleno)) {
3492 *ss << "pool " << i.first << " size " << pool.get_size() << " does not"
3493 << " fall within rule " << ruleno
3494 << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
3495 << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
3496 return -EINVAL;
3497 }
3498 }
3499 return 0;
3500}
3501
224ce89b
WB
3502int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
3503 int nosd, int pg_bits, int pgp_bits,
3504 bool default_pool)
7c673cae 3505{
224ce89b
WB
3506 ldout(cct, 10) << "build_simple on " << nosd
3507 << " osds" << dendl;
7c673cae
FG
3508 epoch = e;
3509 set_fsid(fsid);
3510 created = modified = ceph_clock_now();
3511
3512 if (nosd >= 0) {
3513 set_max_osd(nosd);
3514 } else {
3515 // count osds
3516 int maxosd = 0;
3517 const md_config_t *conf = cct->_conf;
3518 vector<string> sections;
3519 conf->get_all_sections(sections);
3520
3521 for (auto &section : sections) {
3522 if (section.find("osd.") != 0)
3523 continue;
3524
3525 const char *begin = section.c_str() + 4;
3526 char *end = (char*)begin;
3527 int o = strtol(begin, &end, 10);
3528 if (*end != '\0')
3529 continue;
3530
3531 if (o > cct->_conf->mon_max_osd) {
3532 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
3533 return -ERANGE;
3534 }
3535
3536 if (o > maxosd)
3537 maxosd = o;
3538 }
3539
3540 set_max_osd(maxosd + 1);
3541 }
3542
7c673cae
FG
3543
3544 stringstream ss;
3545 int r;
3546 if (nosd >= 0)
3547 r = build_simple_crush_map(cct, *crush, nosd, &ss);
3548 else
3549 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
3550 assert(r == 0);
3551
3552 int poolbase = get_max_osd() ? get_max_osd() : 1;
3553
d2e6a577 3554 const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
31f18b77 3555 assert(default_replicated_rule >= 0);
7c673cae 3556
224ce89b
WB
3557 if (default_pool) {
3558 // pgp_num <= pg_num
3559 if (pgp_bits > pg_bits)
3560 pgp_bits = pg_bits;
3561
3562 vector<string> pool_names;
3563 pool_names.push_back("rbd");
3564 for (auto &plname : pool_names) {
3565 int64_t pool = ++pool_max;
3566 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
3567 pools[pool].flags = cct->_conf->osd_pool_default_flags;
3568 if (cct->_conf->osd_pool_default_flag_hashpspool)
3569 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
3570 if (cct->_conf->osd_pool_default_flag_nodelete)
3571 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
3572 if (cct->_conf->osd_pool_default_flag_nopgchange)
3573 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
3574 if (cct->_conf->osd_pool_default_flag_nosizechange)
3575 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
3576 pools[pool].size = cct->_conf->osd_pool_default_size;
3577 pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
3578 pools[pool].crush_rule = default_replicated_rule;
3579 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
3580 pools[pool].set_pg_num(poolbase << pg_bits);
3581 pools[pool].set_pgp_num(poolbase << pgp_bits);
3582 pools[pool].last_change = epoch;
c07f9fc5
FG
3583 pools[pool].application_metadata.insert(
3584 {pg_pool_t::APPLICATION_NAME_RBD, {}});
224ce89b
WB
3585 pool_name[pool] = plname;
3586 name_pool[plname] = pool;
3587 }
7c673cae
FG
3588 }
3589
3590 for (int i=0; i<get_max_osd(); i++) {
3591 set_state(i, 0);
3592 set_weight(i, CEPH_OSD_OUT);
3593 }
3594
3595 map<string,string> profile_map;
3596 r = get_erasure_code_profile_default(cct, profile_map, &ss);
3597 if (r < 0) {
3598 lderr(cct) << ss.str() << dendl;
3599 return r;
3600 }
3601 set_erasure_code_profile("default", profile_map);
3602 return 0;
3603}
3604
3605int OSDMap::get_erasure_code_profile_default(CephContext *cct,
3606 map<string,string> &profile_map,
3607 ostream *ss)
3608{
3609 int r = get_json_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
3610 *ss,
3611 &profile_map);
3612 return r;
3613}
3614
3615int OSDMap::_build_crush_types(CrushWrapper& crush)
3616{
3617 crush.set_type_name(0, "osd");
3618 crush.set_type_name(1, "host");
3619 crush.set_type_name(2, "chassis");
3620 crush.set_type_name(3, "rack");
3621 crush.set_type_name(4, "row");
3622 crush.set_type_name(5, "pdu");
3623 crush.set_type_name(6, "pod");
3624 crush.set_type_name(7, "room");
3625 crush.set_type_name(8, "datacenter");
3626 crush.set_type_name(9, "region");
3627 crush.set_type_name(10, "root");
3628 return 10;
3629}
3630
3631int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
3632 int nosd, ostream *ss)
3633{
3634 crush.create();
3635
3636 // root
3637 int root_type = _build_crush_types(crush);
3638 int rootid;
3639 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
3640 root_type, 0, NULL, NULL, &rootid);
3641 assert(r == 0);
3642 crush.set_item_name(rootid, "default");
3643
3644 for (int o=0; o<nosd; o++) {
3645 map<string,string> loc;
3646 loc["host"] = "localhost";
3647 loc["rack"] = "localrack";
3648 loc["root"] = "default";
3649 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
3650 char name[32];
3651 snprintf(name, sizeof(name), "osd.%d", o);
3652 crush.insert_item(cct, o, 1.0, name, loc);
3653 }
3654
31f18b77 3655 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
3656
3657 crush.finalize();
3658
3659 return 0;
3660}
3661
3662int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
3663 CrushWrapper& crush,
3664 ostream *ss)
3665{
3666 const md_config_t *conf = cct->_conf;
3667
3668 crush.create();
3669
3670 // root
3671 int root_type = _build_crush_types(crush);
3672 int rootid;
3673 int r = crush.add_bucket(0, 0,
3674 CRUSH_HASH_DEFAULT,
3675 root_type, 0, NULL, NULL, &rootid);
3676 assert(r == 0);
3677 crush.set_item_name(rootid, "default");
3678
3679 // add osds
3680 vector<string> sections;
3681 conf->get_all_sections(sections);
3682
3683 for (auto &section : sections) {
3684 if (section.find("osd.") != 0)
3685 continue;
3686
3687 const char *begin = section.c_str() + 4;
3688 char *end = (char*)begin;
3689 int o = strtol(begin, &end, 10);
3690 if (*end != '\0')
3691 continue;
3692
3693 string host, rack, row, room, dc, pool;
3694 vector<string> sectiontmp;
3695 sectiontmp.push_back("osd");
3696 sectiontmp.push_back(section);
3697 conf->get_val_from_conf_file(sectiontmp, "host", host, false);
3698 conf->get_val_from_conf_file(sectiontmp, "rack", rack, false);
3699 conf->get_val_from_conf_file(sectiontmp, "row", row, false);
3700 conf->get_val_from_conf_file(sectiontmp, "room", room, false);
3701 conf->get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
3702 conf->get_val_from_conf_file(sectiontmp, "root", pool, false);
3703
3704 if (host.length() == 0)
3705 host = "unknownhost";
3706 if (rack.length() == 0)
3707 rack = "unknownrack";
3708
3709 map<string,string> loc;
3710 loc["host"] = host;
3711 loc["rack"] = rack;
3712 if (row.size())
3713 loc["row"] = row;
3714 if (room.size())
3715 loc["room"] = room;
3716 if (dc.size())
3717 loc["datacenter"] = dc;
3718 loc["root"] = "default";
3719
3720 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
3721 crush.insert_item(cct, o, 1.0, section, loc);
3722 }
3723
31f18b77 3724 build_simple_crush_rules(cct, crush, "default", ss);
7c673cae
FG
3725
3726 crush.finalize();
3727
3728 return 0;
3729}
3730
3731
31f18b77
FG
3732int OSDMap::build_simple_crush_rules(
3733 CephContext *cct,
3734 CrushWrapper& crush,
3735 const string& root,
3736 ostream *ss)
7c673cae 3737{
31f18b77 3738 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
7c673cae
FG
3739 string failure_domain =
3740 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
3741
7c673cae 3742 int r;
31f18b77 3743 r = crush.add_simple_rule_at(
224ce89b 3744 "replicated_rule", root, failure_domain, "",
31f18b77
FG
3745 "firstn", pg_pool_t::TYPE_REPLICATED,
3746 crush_rule, ss);
7c673cae
FG
3747 if (r < 0)
3748 return r;
3749 // do not add an erasure rule by default or else we will implicitly
3750 // require the crush_v2 feature of clients
3751 return 0;
3752}
3753
3754int OSDMap::summarize_mapping_stats(
3755 OSDMap *newmap,
3756 const set<int64_t> *pools,
3757 std::string *out,
3758 Formatter *f) const
3759{
3760 set<int64_t> ls;
3761 if (pools) {
3762 ls = *pools;
3763 } else {
3764 for (auto &p : get_pools())
3765 ls.insert(p.first);
3766 }
3767
3768 unsigned total_pg = 0;
3769 unsigned moved_pg = 0;
3770 vector<unsigned> base_by_osd(get_max_osd(), 0);
3771 vector<unsigned> new_by_osd(get_max_osd(), 0);
3772 for (int64_t pool_id : ls) {
3773 const pg_pool_t *pi = get_pg_pool(pool_id);
31f18b77
FG
3774 vector<int> up, up2;
3775 int up_primary;
7c673cae
FG
3776 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
3777 pg_t pgid(ps, pool_id, -1);
3778 total_pg += pi->get_size();
31f18b77 3779 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
7c673cae
FG
3780 for (int osd : up) {
3781 if (osd >= 0 && osd < get_max_osd())
3782 ++base_by_osd[osd];
3783 }
3784 if (newmap) {
31f18b77 3785 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
7c673cae
FG
3786 for (int osd : up2) {
3787 if (osd >= 0 && osd < get_max_osd())
3788 ++new_by_osd[osd];
3789 }
3790 if (pi->type == pg_pool_t::TYPE_ERASURE) {
3791 for (unsigned i=0; i<up.size(); ++i) {
3792 if (up[i] != up2[i]) {
3793 ++moved_pg;
3794 }
3795 }
3796 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
3797 for (int osd : up) {
3798 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
3799 ++moved_pg;
3800 }
3801 }
3802 } else {
3803 assert(0 == "unhandled pool type");
3804 }
3805 }
3806 }
3807 }
3808
3809 unsigned num_up_in = 0;
3810 for (int osd = 0; osd < get_max_osd(); ++osd) {
3811 if (is_up(osd) && is_in(osd))
3812 ++num_up_in;
3813 }
3814 if (!num_up_in) {
3815 return -EINVAL;
3816 }
3817
3818 float avg_pg = (float)total_pg / (float)num_up_in;
3819 float base_stddev = 0, new_stddev = 0;
3820 int min = -1, max = -1;
3821 unsigned min_base_pg = 0, max_base_pg = 0;
3822 unsigned min_new_pg = 0, max_new_pg = 0;
3823 for (int osd = 0; osd < get_max_osd(); ++osd) {
3824 if (is_up(osd) && is_in(osd)) {
3825 float base_diff = (float)base_by_osd[osd] - avg_pg;
3826 base_stddev += base_diff * base_diff;
3827 float new_diff = (float)new_by_osd[osd] - avg_pg;
3828 new_stddev += new_diff * new_diff;
3829 if (min < 0 || base_by_osd[osd] < min_base_pg) {
3830 min = osd;
3831 min_base_pg = base_by_osd[osd];
3832 min_new_pg = new_by_osd[osd];
3833 }
3834 if (max < 0 || base_by_osd[osd] > max_base_pg) {
3835 max = osd;
3836 max_base_pg = base_by_osd[osd];
3837 max_new_pg = new_by_osd[osd];
3838 }
3839 }
3840 }
3841 base_stddev = sqrt(base_stddev / num_up_in);
3842 new_stddev = sqrt(new_stddev / num_up_in);
3843
3844 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
3845
3846 ostringstream ss;
3847 if (f)
3848 f->open_object_section("utilization");
3849 if (newmap) {
3850 if (f) {
3851 f->dump_unsigned("moved_pgs", moved_pg);
3852 f->dump_unsigned("total_pgs", total_pg);
3853 } else {
3854 float percent = 0;
3855 if (total_pg)
3856 percent = (float)moved_pg * 100.0 / (float)total_pg;
3857 ss << "moved " << moved_pg << " / " << total_pg
3858 << " (" << percent << "%)\n";
3859 }
3860 }
3861 if (f) {
3862 f->dump_float("avg_pgs", avg_pg);
3863 f->dump_float("std_dev", base_stddev);
3864 f->dump_float("expected_baseline_std_dev", edev);
3865 if (newmap)
3866 f->dump_float("new_std_dev", new_stddev);
3867 } else {
3868 ss << "avg " << avg_pg << "\n";
3869 ss << "stddev " << base_stddev;
3870 if (newmap)
3871 ss << " -> " << new_stddev;
3872 ss << " (expected baseline " << edev << ")\n";
3873 }
3874 if (min >= 0) {
3875 if (f) {
3876 f->dump_unsigned("min_osd", min);
3877 f->dump_unsigned("min_osd_pgs", min_base_pg);
3878 if (newmap)
3879 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
3880 } else {
3881 ss << "min osd." << min << " with " << min_base_pg;
3882 if (newmap)
3883 ss << " -> " << min_new_pg;
3884 ss << " pgs (" << (float)min_base_pg / avg_pg;
3885 if (newmap)
3886 ss << " -> " << (float)min_new_pg / avg_pg;
3887 ss << " * mean)\n";
3888 }
3889 }
3890 if (max >= 0) {
3891 if (f) {
3892 f->dump_unsigned("max_osd", max);
3893 f->dump_unsigned("max_osd_pgs", max_base_pg);
3894 if (newmap)
3895 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
3896 } else {
3897 ss << "max osd." << max << " with " << max_base_pg;
3898 if (newmap)
3899 ss << " -> " << max_new_pg;
3900 ss << " pgs (" << (float)max_base_pg / avg_pg;
3901 if (newmap)
3902 ss << " -> " << (float)max_new_pg / avg_pg;
3903 ss << " * mean)\n";
3904 }
3905 }
3906 if (f)
3907 f->close_section();
3908 if (out)
3909 *out = ss.str();
3910 return 0;
3911}
3912
3913
3914int OSDMap::clean_pg_upmaps(
3915 CephContext *cct,
f64942e4 3916 Incremental *pending_inc) const
7c673cae
FG
3917{
3918 ldout(cct, 10) << __func__ << dendl;
3919 int changed = 0;
3920 for (auto& p : pg_upmap) {
3921 vector<int> raw;
3922 int primary;
3923 pg_to_raw_osds(p.first, &raw, &primary);
3924 if (vectors_equal(raw, p.second)) {
3925 ldout(cct, 10) << " removing redundant pg_upmap " << p.first << " "
3926 << p.second << dendl;
3927 pending_inc->old_pg_upmap.insert(p.first);
3928 ++changed;
3929 }
3930 }
3931 for (auto& p : pg_upmap_items) {
3932 vector<int> raw;
3933 int primary;
3934 pg_to_raw_osds(p.first, &raw, &primary);
3935 mempool::osdmap::vector<pair<int,int>> newmap;
3936 for (auto& q : p.second) {
f64942e4
AA
3937 if (std::find(raw.begin(), raw.end(), q.first) == raw.end()) {
3938 // cancel mapping if source osd does not exist anymore
3939 continue;
3940 }
3941 if (q.second != CRUSH_ITEM_NONE && q.second < max_osd &&
3942 q.second >= 0 && osd_weight[q.second] == 0) {
3943 // cancel mapping if target osd is out
3944 continue;
7c673cae 3945 }
f64942e4 3946 newmap.push_back(q);
7c673cae
FG
3947 }
3948 if (newmap.empty()) {
3949 ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
3950 << p.second << dendl;
3951 pending_inc->old_pg_upmap_items.insert(p.first);
3952 ++changed;
3953 } else if (newmap != p.second) {
3954 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
3955 << p.first << " " << p.second << " -> " << newmap << dendl;
3956 pending_inc->new_pg_upmap_items[p.first] = newmap;
3957 ++changed;
3958 }
3959 }
3960 return changed;
3961}
3962
3963bool OSDMap::try_pg_upmap(
3964 CephContext *cct,
3965 pg_t pg, ///< pg to potentially remap
3966 const set<int>& overfull, ///< osds we'd want to evacuate
3967 const vector<int>& underfull, ///< osds to move to, in order of preference
3968 vector<int> *orig,
3969 vector<int> *out) ///< resulting alternative mapping
3970{
3971 const pg_pool_t *pool = get_pg_pool(pg.pool());
3972 if (!pool)
3973 return false;
31f18b77 3974 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
7c673cae
FG
3975 pool->get_size());
3976 if (rule < 0)
3977 return false;
3978
7c673cae
FG
3979 // make sure there is something there to remap
3980 bool any = false;
3981 for (auto osd : *orig) {
3982 if (overfull.count(osd)) {
3983 any = true;
3984 break;
3985 }
3986 }
3987 if (!any) {
3988 return false;
3989 }
3990
3991 int r = crush->try_remap_rule(
3992 cct,
3993 rule,
3994 pool->get_size(),
3995 overfull, underfull,
3996 *orig,
3997 out);
3998 if (r < 0)
3999 return false;
4000 if (*out == *orig)
4001 return false;
4002 return true;
4003}
4004
4005int OSDMap::calc_pg_upmaps(
4006 CephContext *cct,
31f18b77 4007 float max_deviation_ratio,
7c673cae 4008 int max,
a8e16298 4009 const set<int64_t>& only_pools,
7c673cae
FG
4010 OSDMap::Incremental *pending_inc)
4011{
a8e16298 4012 ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
7c673cae
FG
4013 OSDMap tmp;
4014 tmp.deepish_copy_from(*this);
4015 int num_changed = 0;
a8e16298
TL
4016 map<int,set<pg_t>> pgs_by_osd;
4017 int total_pgs = 0;
4018 float osd_weight_total = 0;
4019 map<int,float> osd_weight;
4020 for (auto& i : pools) {
4021 if (!only_pools.empty() && !only_pools.count(i.first))
4022 continue;
4023 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
4024 pg_t pg(ps, i.first);
4025 vector<int> up;
4026 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
4027 ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
4028 for (auto osd : up) {
4029 if (osd != CRUSH_ITEM_NONE)
4030 pgs_by_osd[osd].insert(pg);
7c673cae 4031 }
a8e16298
TL
4032 }
4033 total_pgs += i.second.get_size() * i.second.get_pg_num();
4034
4035 map<int,float> pmap;
4036 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
4037 i.second.get_type(),
4038 i.second.get_size());
4039 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
4040 ldout(cct,20) << __func__ << " pool " << i.first
4041 << " ruleno " << ruleno
4042 << " weight-map " << pmap
4043 << dendl;
4044 for (auto p : pmap) {
4045 auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
4046 if (adjusted_weight == 0) {
4047 continue;
31f18b77 4048 }
a8e16298
TL
4049 osd_weight[p.first] += adjusted_weight;
4050 osd_weight_total += adjusted_weight;
7c673cae 4051 }
a8e16298
TL
4052 }
4053 for (auto& i : osd_weight) {
4054 int pgs = 0;
4055 auto p = pgs_by_osd.find(i.first);
4056 if (p != pgs_by_osd.end())
31f18b77 4057 pgs = p->second.size();
a8e16298 4058 else
31f18b77 4059 pgs_by_osd.emplace(i.first, set<pg_t>());
a8e16298 4060 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
31f18b77 4061 << " pgs " << pgs << dendl;
a8e16298
TL
4062 }
4063 if (osd_weight_total == 0) {
4064 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
4065 return 0;
4066 }
4067 float pgs_per_weight = total_pgs / osd_weight_total;
4068 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
4069 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
7c673cae 4070
a8e16298
TL
4071 if (max <= 0) {
4072 lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
4073 return 0;
4074 }
4075 float decay_factor = 1.0 / float(max);
4076 float stddev = 0;
4077 map<int,float> osd_deviation; // osd, deviation(pgs)
4078 multimap<float,int> deviation_osd; // deviation(pgs), osd
4079 for (auto& i : pgs_by_osd) {
4080 // make sure osd is still there (belongs to this crush-tree)
4081 ceph_assert(osd_weight.count(i.first));
4082 float target = osd_weight[i.first] * pgs_per_weight;
4083 float deviation = (float)i.second.size() - target;
4084 ldout(cct, 20) << " osd." << i.first
4085 << "\tpgs " << i.second.size()
4086 << "\ttarget " << target
4087 << "\tdeviation " << deviation
4088 << dendl;
4089 osd_deviation[i.first] = deviation;
4090 deviation_osd.insert(make_pair(deviation, i.first));
4091 stddev += deviation * deviation;
4092 }
4093 if (stddev <= cct->_conf->get_val<double>("osd_calc_pg_upmaps_max_stddev")) {
4094 ldout(cct, 10) << __func__ << " distribution is almost perfect"
4095 << dendl;
4096 return 0;
4097 }
4098 bool skip_overfull = false;
4099 auto aggressive =
4100 cct->_conf->get_val<bool>("osd_calc_pg_upmaps_aggressively");
4101 auto local_fallback_retries =
4102 cct->_conf->get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
4103 while (max--) {
4104 // build overfull and underfull
4105 set<int> overfull;
4106 vector<int> underfull;
4107 float decay = 0;
4108 int decay_count = 0;
4109 while (overfull.empty()) {
4110 for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
4111 if (i->first >= (1.0 - decay))
4112 overfull.insert(i->second);
4113 }
4114 if (!overfull.empty())
4115 break;
4116 decay_count++;
4117 decay = decay_factor * decay_count;
4118 if (decay >= 1.0)
4119 break;
4120 ldout(cct, 30) << " decay_factor = " << decay_factor
4121 << " decay_count = " << decay_count
4122 << " decay (overfull) = " << decay
4123 << dendl;
4124 }
4125 if (overfull.empty()) {
4126 lderr(cct) << __func__ << " failed to build overfull" << dendl;
224ce89b
WB
4127 break;
4128 }
7c673cae 4129
a8e16298
TL
4130 decay = 0;
4131 decay_count = 0;
4132 while (underfull.empty()) {
4133 for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
4134 if (i->first >= (-.999 + decay))
4135 break;
4136 underfull.push_back(i->second);
4137 }
4138 if (!underfull.empty())
4139 break;
4140 decay_count++;
4141 decay = decay_factor * decay_count;
4142 if (decay >= .999)
4143 break;
4144 ldout(cct, 30) << " decay_factor = " << decay_factor
4145 << " decay_count = " << decay_count
4146 << " decay (underfull) = " << decay
4147 << dendl;
7c673cae 4148 }
a8e16298
TL
4149 if (underfull.empty()) {
4150 lderr(cct) << __func__ << " failed to build underfull" << dendl;
7c673cae 4151 break;
a8e16298 4152 }
7c673cae 4153
a8e16298
TL
4154 ldout(cct, 10) << " overfull " << overfull
4155 << " underfull " << underfull
4156 << dendl;
4157 set<pg_t> to_skip;
4158 uint64_t local_fallback_retried = 0;
4159
4160 retry:
4161
4162 set<pg_t> to_unmap;
4163 map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
4164 auto temp_pgs_by_osd = pgs_by_osd;
4165 // always start with fullest, break if we find any changes to make
7c673cae 4166 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
a8e16298
TL
4167 if (skip_overfull) {
4168 ldout(cct, 10) << " skipping overfull " << dendl;
4169 break; // fall through to check underfull
4170 }
7c673cae 4171 int osd = p->second;
31f18b77 4172 float deviation = p->first;
7c673cae 4173 float target = osd_weight[osd] * pgs_per_weight;
a8e16298
TL
4174 ceph_assert(target > 0);
4175 float deviation_ratio = deviation / target;
4176 if (deviation_ratio < max_deviation_ratio) {
7c673cae 4177 ldout(cct, 10) << " osd." << osd
a8e16298
TL
4178 << " target " << target
4179 << " deviation " << deviation
4180 << " -> ratio " << deviation_ratio
4181 << " < max ratio " << max_deviation_ratio
4182 << dendl;
7c673cae
FG
4183 break;
4184 }
7c673cae 4185
a8e16298
TL
4186 vector<pg_t> pgs;
4187 pgs.reserve(pgs_by_osd[osd].size());
4188 for (auto& pg : pgs_by_osd[osd]) {
4189 if (to_skip.count(pg))
4190 continue;
4191 pgs.push_back(pg);
4192 }
4193 if (aggressive) {
4194 // shuffle PG list so they all get equal (in)attention
4195 std::random_device rd;
4196 std::default_random_engine rng{rd()};
4197 std::shuffle(pgs.begin(), pgs.end(), rng);
4198 }
7c673cae
FG
4199 // look for remaps we can un-remap
4200 for (auto pg : pgs) {
4201 auto p = tmp.pg_upmap_items.find(pg);
a8e16298
TL
4202 if (p == tmp.pg_upmap_items.end())
4203 continue;
4204 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4205 for (auto q : p->second) {
4206 if (q.second == osd) {
4207 ldout(cct, 10) << " will try dropping existing"
4208 << " remapping pair "
4209 << q.first << " -> " << q.second
4210 << " which remapped " << pg
4211 << " into overfull osd." << osd
4212 << dendl;
4213 temp_pgs_by_osd[q.second].erase(pg);
4214 temp_pgs_by_osd[q.first].insert(pg);
4215 } else {
4216 new_upmap_items.push_back(q);
4217 }
4218 }
4219 if (new_upmap_items.empty()) {
4220 // drop whole item
4221 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4222 << " remapped " << pg << " into overfull osd." << osd
4223 << ", will try cancelling it entirely"
4224 << dendl;
4225 to_unmap.insert(pg);
4226 goto test_change;
4227 } else if (new_upmap_items.size() != p->second.size()) {
4228 // drop single remapping pair, updating
4229 ceph_assert(new_upmap_items.size() < p->second.size());
4230 ldout(cct, 10) << " existing pg_upmap_items " << p->second
4231 << " remapped " << pg << " into overfull osd." << osd
4232 << ", new_pg_upmap_items now " << new_upmap_items
4233 << dendl;
4234 to_upmap[pg] = new_upmap_items;
4235 goto test_change;
4236 }
4237 }
7c673cae 4238
a8e16298 4239 // try upmap
7c673cae 4240 for (auto pg : pgs) {
a8e16298
TL
4241 auto temp_it = tmp.pg_upmap.find(pg);
4242 if (temp_it != tmp.pg_upmap.end()) {
4243 // leave pg_upmap alone
4244 // it must be specified by admin since balancer does not
4245 // support pg_upmap yet
4246 ldout(cct, 10) << " " << pg << " already has pg_upmap "
4247 << temp_it->second << ", skipping"
4248 << dendl;
7c673cae
FG
4249 continue;
4250 }
a8e16298
TL
4251 auto pg_pool_size = tmp.get_pg_pool_size(pg);
4252 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4253 set<int> existing;
4254 auto it = tmp.pg_upmap_items.find(pg);
4255 if (it != tmp.pg_upmap_items.end() &&
4256 it->second.size() >= (size_t)pg_pool_size) {
4257 ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
4258 << it->second << ", skipping"
4259 << dendl;
4260 continue;
4261 } else if (it != tmp.pg_upmap_items.end()) {
4262 ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
4263 << it->second
4264 << dendl;
4265 new_upmap_items = it->second;
4266 // build existing too (for dedup)
4267 for (auto i : it->second) {
4268 existing.insert(i.first);
4269 existing.insert(i.second);
4270 }
4271 // fall through
4272 // to see if we can append more remapping pairs
4273 }
4274 ldout(cct, 10) << " trying " << pg << dendl;
7c673cae 4275 vector<int> orig, out;
a8e16298 4276 tmp.pg_to_raw_upmap(pg, &orig); // including existing upmaps too
7c673cae
FG
4277 if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
4278 continue;
4279 }
a8e16298 4280 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
7c673cae
FG
4281 if (orig.size() != out.size()) {
4282 continue;
4283 }
a8e16298 4284 ceph_assert(orig != out);
7c673cae 4285 for (unsigned i = 0; i < out.size(); ++i) {
a8e16298
TL
4286 if (orig[i] == out[i])
4287 continue; // skip invalid remappings
4288 if (existing.count(orig[i]) || existing.count(out[i]))
4289 continue; // we want new remappings only!
4290 ldout(cct, 10) << " will try adding new remapping pair "
4291 << orig[i] << " -> " << out[i] << " for " << pg
4292 << dendl;
4293 existing.insert(orig[i]);
4294 existing.insert(out[i]);
4295 temp_pgs_by_osd[orig[i]].erase(pg);
4296 temp_pgs_by_osd[out[i]].insert(pg);
4297 ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
4298 new_upmap_items.push_back(make_pair(orig[i], out[i]));
4299 // append new remapping pairs slowly
4300 // This way we can make sure that each tiny change will
4301 // definitely make distribution of PGs converging to
4302 // the perfect status.
4303 to_upmap[pg] = new_upmap_items;
4304 goto test_change;
7c673cae 4305 }
a8e16298
TL
4306 }
4307 }
7c673cae 4308
a8e16298
TL
4309 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4310 ldout(cct, 10) << " failed to find any changes for overfull osds"
4311 << dendl;
4312 for (auto& p : deviation_osd) {
4313 if (std::find(underfull.begin(), underfull.end(), p.second) ==
4314 underfull.end())
4315 break;
4316 int osd = p.second;
4317 float deviation = p.first;
4318 float target = osd_weight[osd] * pgs_per_weight;
4319 ceph_assert(target > 0);
4320 float deviation_ratio = abs(deviation / target);
4321 if (deviation_ratio < max_deviation_ratio) {
4322 // respect max_deviation_ratio too
4323 ldout(cct, 10) << " osd." << osd
4324 << " target " << target
4325 << " deviation " << deviation
4326 << " -> absolute ratio " << deviation_ratio
4327 << " < max ratio " << max_deviation_ratio
4328 << dendl;
4329 break;
4330 }
4331 // look for remaps we can un-remap
4332 vector<pair<pg_t,
4333 mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
4334 candidates.reserve(tmp.pg_upmap_items.size());
4335 for (auto& i : tmp.pg_upmap_items) {
4336 if (to_skip.count(i.first))
4337 continue;
4338 if (!only_pools.empty() && !only_pools.count(i.first.pool()))
4339 continue;
4340 candidates.push_back(make_pair(i.first, i.second));
4341 }
4342 if (aggressive) {
4343 // shuffle candidates so they all get equal (in)attention
4344 std::random_device rd;
4345 std::default_random_engine rng{rd()};
4346 std::shuffle(candidates.begin(), candidates.end(), rng);
4347 }
4348 for (auto& i : candidates) {
4349 auto pg = i.first;
4350 mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
4351 for (auto& j : i.second) {
4352 if (j.first == osd) {
4353 ldout(cct, 10) << " will try dropping existing"
4354 << " remapping pair "
4355 << j.first << " -> " << j.second
4356 << " which remapped " << pg
4357 << " out from underfull osd." << osd
4358 << dendl;
4359 temp_pgs_by_osd[j.second].erase(pg);
4360 temp_pgs_by_osd[j.first].insert(pg);
4361 } else {
4362 new_upmap_items.push_back(j);
4363 }
4364 }
4365 if (new_upmap_items.empty()) {
4366 // drop whole item
4367 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4368 << " remapped " << pg
4369 << " out from underfull osd." << osd
4370 << ", will try cancelling it entirely"
4371 << dendl;
4372 to_unmap.insert(pg);
4373 goto test_change;
4374 } else if (new_upmap_items.size() != i.second.size()) {
4375 // drop single remapping pair, updating
4376 ceph_assert(new_upmap_items.size() < i.second.size());
4377 ldout(cct, 10) << " existing pg_upmap_items " << i.second
4378 << " remapped " << pg
4379 << " out from underfull osd." << osd
4380 << ", new_pg_upmap_items now " << new_upmap_items
4381 << dendl;
4382 to_upmap[pg] = new_upmap_items;
4383 goto test_change;
4384 }
4385 }
7c673cae 4386 }
a8e16298
TL
4387
4388 ceph_assert(!(to_unmap.size() || to_upmap.size()));
4389 ldout(cct, 10) << " failed to find any changes for underfull osds"
4390 << dendl;
4391 if (!aggressive) {
4392 ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
4393 break;
4394 } else if (!skip_overfull) {
4395 // safe to quit because below here we know
4396 // we've done checking both overfull and underfull osds..
4397 ldout(cct, 10) << " break due to not being able to find any"
4398 << " further optimizations"
4399 << dendl;
7c673cae
FG
4400 break;
4401 }
a8e16298
TL
4402 // restart with fullest and do exhaustive searching
4403 skip_overfull = false;
4404 continue;
4405
4406 test_change:
4407
4408 // test change, apply if change is good
4409 ceph_assert(to_unmap.size() || to_upmap.size());
4410 float new_stddev = 0;
4411 map<int,float> temp_osd_deviation;
4412 multimap<float,int> temp_deviation_osd;
4413 for (auto& i : temp_pgs_by_osd) {
4414 // make sure osd is still there (belongs to this crush-tree)
4415 ceph_assert(osd_weight.count(i.first));
4416 float target = osd_weight[i.first] * pgs_per_weight;
4417 float deviation = (float)i.second.size() - target;
4418 ldout(cct, 20) << " osd." << i.first
4419 << "\tpgs " << i.second.size()
4420 << "\ttarget " << target
4421 << "\tdeviation " << deviation
4422 << dendl;
4423 temp_osd_deviation[i.first] = deviation;
4424 temp_deviation_osd.insert(make_pair(deviation, i.first));
4425 new_stddev += deviation * deviation;
4426 }
4427 ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
4428 if (new_stddev >= stddev) {
4429 if (!aggressive) {
4430 ldout(cct, 10) << " break because stddev is not decreasing"
4431 << " and aggressive mode is not enabled"
4432 << dendl;
4433 break;
4434 }
4435 local_fallback_retried++;
4436 if (local_fallback_retried >= local_fallback_retries) {
4437 // does not make progress
4438 // flip *skip_overfull* so both overfull and underfull
4439 // get equal (in)attention
4440 skip_overfull = !skip_overfull;
4441 ldout(cct, 10) << " hit local_fallback_retries "
4442 << local_fallback_retries
4443 << dendl;
4444 continue;
4445 }
4446 for (auto& i : to_unmap)
4447 to_skip.insert(i);
4448 for (auto& i : to_upmap)
4449 to_skip.insert(i.first);
4450 ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
4451 << " to_skip " << to_skip
4452 << dendl;
4453 goto retry;
4454 }
4455
4456 // ready to go
4457 ceph_assert(new_stddev < stddev);
4458 stddev = new_stddev;
4459 pgs_by_osd = temp_pgs_by_osd;
4460 osd_deviation = temp_osd_deviation;
4461 deviation_osd = temp_deviation_osd;
4462 for (auto& i : to_unmap) {
4463 ldout(cct, 10) << " unmap pg " << i << dendl;
4464 ceph_assert(tmp.pg_upmap_items.count(i));
4465 tmp.pg_upmap_items.erase(i);
4466 pending_inc->old_pg_upmap_items.insert(i);
4467 ++num_changed;
4468 }
4469 for (auto& i : to_upmap) {
4470 ldout(cct, 10) << " upmap pg " << i.first
4471 << " new pg_upmap_items " << i.second
4472 << dendl;
4473 tmp.pg_upmap_items[i.first] = i.second;
4474 pending_inc->new_pg_upmap_items[i.first] = i.second;
4475 ++num_changed;
4476 }
7c673cae 4477 }
a8e16298 4478 ldout(cct, 10) << " num_changed = " << num_changed << dendl;
7c673cae
FG
4479 return num_changed;
4480}
31f18b77
FG
4481
4482int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
4483{
4484 return crush->get_leaves(name, osds);
4485}
4486
3efd9988
FG
4487// get pools whose crush rules might reference the given osd
4488void OSDMap::get_pool_ids_by_osd(CephContext *cct,
4489 int osd,
4490 set<int64_t> *pool_ids) const
4491{
4492 assert(pool_ids);
4493 set<int> raw_rules;
4494 int r = crush->get_rules_by_osd(osd, &raw_rules);
4495 if (r < 0) {
4496 lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
4497 << dendl;
4498 assert(r >= 0);
4499 }
4500 set<int> rules;
4501 for (auto &i: raw_rules) {
4502 // exclude any dead rule
4503 if (crush_rule_in_use(i)) {
4504 rules.insert(i);
4505 }
4506 }
4507 for (auto &r: rules) {
4508 get_pool_ids_by_rule(r, pool_ids);
4509 }
4510}
4511
31f18b77
FG
4512template <typename F>
4513class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
4514public:
4515 typedef CrushTreeDumper::Dumper<F> Parent;
4516
4517 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
4518 const PGStatService *pgs_, bool tree_) :
c07f9fc5 4519 Parent(crush, osdmap_->get_pool_names()),
31f18b77
FG
4520 osdmap(osdmap_),
4521 pgs(pgs_),
4522 tree(tree_),
4523 average_util(average_utilization()),
4524 min_var(-1),
4525 max_var(-1),
4526 stddev(0),
4527 sum(0) {
4528 }
4529
4530protected:
4531 void dump_stray(F *f) {
4532 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4533 if (osdmap->exists(i) && !this->is_touched(i))
c07f9fc5 4534 dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
31f18b77
FG
4535 }
4536 }
4537
4538 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
4539 if (!tree && qi.is_bucket())
4540 return;
4541
4542 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
4543 int64_t kb = 0, kb_used = 0, kb_avail = 0;
4544 double util = 0;
4545 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_avail))
4546 if (kb_used && kb)
4547 util = 100.0 * (double)kb_used / (double)kb;
4548
4549 double var = 1.0;
4550 if (average_util)
4551 var = util / average_util;
4552
4553 size_t num_pgs = qi.is_bucket() ? 0 : pgs->get_num_pg_by_osd(qi.id);
4554
4555 dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, num_pgs, f);
4556
4557 if (!qi.is_bucket() && reweight > 0) {
4558 if (min_var < 0 || var < min_var)
4559 min_var = var;
4560 if (max_var < 0 || var > max_var)
4561 max_var = var;
4562
4563 double dev = util - average_util;
4564 dev *= dev;
4565 stddev += reweight * dev;
4566 sum += reweight;
4567 }
4568 }
4569
4570 virtual void dump_item(const CrushTreeDumper::Item &qi,
4571 float &reweight,
4572 int64_t kb,
4573 int64_t kb_used,
4574 int64_t kb_avail,
4575 double& util,
4576 double& var,
4577 const size_t num_pgs,
4578 F *f) = 0;
4579
4580 double dev() {
4581 return sum > 0 ? sqrt(stddev / sum) : 0;
4582 }
4583
4584 double average_utilization() {
4585 int64_t kb = 0, kb_used = 0;
4586 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4587 if (!osdmap->exists(i) || osdmap->get_weight(i) == 0)
4588 continue;
4589 int64_t kb_i, kb_used_i, kb_avail_i;
4590 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_avail_i)) {
4591 kb += kb_i;
4592 kb_used += kb_used_i;
4593 }
4594 }
4595 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
4596 }
4597
4598 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
4599 int64_t* kb_avail) const {
4600 const osd_stat_t *p = pgs->get_osd_stat(id);
4601 if (!p) return false;
4602 *kb = p->kb;
4603 *kb_used = p->kb_used;
4604 *kb_avail = p->kb_avail;
4605 return *kb > 0;
4606 }
4607
4608 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
4609 int64_t* kb_avail) const {
4610 if (id >= 0) {
4611 if (osdmap->is_out(id)) {
4612 *kb = 0;
4613 *kb_used = 0;
4614 *kb_avail = 0;
4615 return true;
4616 }
4617 return get_osd_utilization(id, kb, kb_used, kb_avail);
4618 }
4619
4620 *kb = 0;
4621 *kb_used = 0;
4622 *kb_avail = 0;
4623
4624 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
4625 int item = osdmap->crush->get_bucket_item(id, k);
4626 int64_t kb_i = 0, kb_used_i = 0, kb_avail_i = 0;
4627 if (!get_bucket_utilization(item, &kb_i, &kb_used_i, &kb_avail_i))
4628 return false;
4629 *kb += kb_i;
4630 *kb_used += kb_used_i;
4631 *kb_avail += kb_avail_i;
4632 }
4633 return *kb > 0;
4634 }
4635
4636protected:
4637 const OSDMap *osdmap;
4638 const PGStatService *pgs;
4639 bool tree;
4640 double average_util;
4641 double min_var;
4642 double max_var;
4643 double stddev;
4644 double sum;
4645};
4646
4647
4648class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
4649public:
4650 typedef OSDUtilizationDumper<TextTable> Parent;
4651
4652 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4653 const PGStatService *pgs, bool tree) :
4654 Parent(crush, osdmap, pgs, tree) {}
4655
4656 void dump(TextTable *tbl) {
4657 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
224ce89b 4658 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
4659 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4660 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
4661 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
4662 tbl->define_column("USE", TextTable::LEFT, TextTable::RIGHT);
4663 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
4664 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
4665 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
4666 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
4667 if (tree)
4668 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
4669
4670 Parent::dump(tbl);
4671
4672 dump_stray(tbl);
4673
224ce89b
WB
4674 *tbl << ""
4675 << ""
4676 << "" << "TOTAL"
1adf2230
AA
4677 << byte_u_t(pgs->get_osd_sum().kb << 10)
4678 << byte_u_t(pgs->get_osd_sum().kb_used << 10)
4679 << byte_u_t(pgs->get_osd_sum().kb_avail << 10)
31f18b77
FG
4680 << lowprecision_t(average_util)
4681 << ""
4682 << TextTable::endrow;
4683 }
4684
4685protected:
4686 struct lowprecision_t {
4687 float v;
4688 explicit lowprecision_t(float _v) : v(_v) {}
4689 };
4690 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
4691
4692 using OSDUtilizationDumper<TextTable>::dump_item;
4693 void dump_item(const CrushTreeDumper::Item &qi,
4694 float &reweight,
4695 int64_t kb,
4696 int64_t kb_used,
4697 int64_t kb_avail,
4698 double& util,
4699 double& var,
4700 const size_t num_pgs,
4701 TextTable *tbl) override {
224ce89b
WB
4702 const char *c = crush->get_item_class(qi.id);
4703 if (!c)
4704 c = "";
31f18b77 4705 *tbl << qi.id
224ce89b 4706 << c
31f18b77
FG
4707 << weightf_t(qi.weight)
4708 << weightf_t(reweight)
1adf2230
AA
4709 << byte_u_t(kb << 10)
4710 << byte_u_t(kb_used << 10)
4711 << byte_u_t(kb_avail << 10)
31f18b77
FG
4712 << lowprecision_t(util)
4713 << lowprecision_t(var);
4714
4715 if (qi.is_bucket()) {
4716 *tbl << "-";
4717 } else {
4718 *tbl << num_pgs;
4719 }
4720
4721 if (tree) {
4722 ostringstream name;
4723 for (int k = 0; k < qi.depth; k++)
4724 name << " ";
4725 if (qi.is_bucket()) {
4726 int type = crush->get_bucket_type(qi.id);
4727 name << crush->get_type_name(type) << " "
4728 << crush->get_item_name(qi.id);
4729 } else {
4730 name << "osd." << qi.id;
4731 }
4732 *tbl << name.str();
4733 }
4734
4735 *tbl << TextTable::endrow;
4736 }
4737
4738public:
4739 string summary() {
4740 ostringstream out;
4741 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
4742 << "/" << lowprecision_t(max_var) << " "
4743 << "STDDEV: " << lowprecision_t(dev());
4744 return out.str();
4745 }
4746};
4747
4748ostream& operator<<(ostream& out,
4749 const OSDUtilizationPlainDumper::lowprecision_t& v)
4750{
4751 if (v.v < -0.01) {
4752 return out << "-";
4753 } else if (v.v < 0.001) {
4754 return out << "0";
4755 } else {
4756 std::streamsize p = out.precision();
4757 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
4758 }
4759}
4760
4761class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
4762public:
4763 typedef OSDUtilizationDumper<Formatter> Parent;
4764
4765 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4766 const PGStatService *pgs, bool tree) :
4767 Parent(crush, osdmap, pgs, tree) {}
4768
4769 void dump(Formatter *f) {
4770 f->open_array_section("nodes");
4771 Parent::dump(f);
4772 f->close_section();
4773
4774 f->open_array_section("stray");
4775 dump_stray(f);
4776 f->close_section();
4777 }
4778
4779protected:
4780 using OSDUtilizationDumper<Formatter>::dump_item;
4781 void dump_item(const CrushTreeDumper::Item &qi,
4782 float &reweight,
4783 int64_t kb,
4784 int64_t kb_used,
4785 int64_t kb_avail,
4786 double& util,
4787 double& var,
4788 const size_t num_pgs,
4789 Formatter *f) override {
4790 f->open_object_section("item");
c07f9fc5 4791 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
31f18b77
FG
4792 f->dump_float("reweight", reweight);
4793 f->dump_int("kb", kb);
4794 f->dump_int("kb_used", kb_used);
4795 f->dump_int("kb_avail", kb_avail);
4796 f->dump_float("utilization", util);
4797 f->dump_float("var", var);
4798 f->dump_unsigned("pgs", num_pgs);
4799 CrushTreeDumper::dump_bucket_children(crush, qi, f);
4800 f->close_section();
4801 }
4802
4803public:
4804 void summary(Formatter *f) {
4805 f->open_object_section("summary");
4806 f->dump_int("total_kb", pgs->get_osd_sum().kb);
4807 f->dump_int("total_kb_used", pgs->get_osd_sum().kb_used);
4808 f->dump_int("total_kb_avail", pgs->get_osd_sum().kb_avail);
4809 f->dump_float("average_utilization", average_util);
4810 f->dump_float("min_var", min_var);
4811 f->dump_float("max_var", max_var);
4812 f->dump_float("dev", dev());
4813 f->close_section();
4814 }
4815};
4816
4817void print_osd_utilization(const OSDMap& osdmap,
4818 const PGStatService *pgstat,
4819 ostream& out,
4820 Formatter *f,
4821 bool tree)
4822{
4823 const CrushWrapper *crush = osdmap.crush.get();
4824 if (f) {
4825 f->open_object_section("df");
4826 OSDUtilizationFormatDumper d(crush, &osdmap, pgstat, tree);
4827 d.dump(f);
4828 d.summary(f);
4829 f->close_section();
4830 f->flush(out);
4831 } else {
4832 OSDUtilizationPlainDumper d(crush, &osdmap, pgstat, tree);
4833 TextTable tbl;
4834 d.dump(&tbl);
4835 out << tbl << d.summary() << "\n";
4836 }
4837}
224ce89b
WB
4838
4839void OSDMap::check_health(health_check_map_t *checks) const
4840{
4841 int num_osds = get_num_osds();
4842
4843 // OSD_DOWN
4844 // OSD_$subtree_DOWN
4845 // OSD_ORPHAN
4846 if (num_osds >= 0) {
4847 int num_in_osds = 0;
4848 int num_down_in_osds = 0;
4849 set<int> osds;
4850 set<int> down_in_osds;
4851 set<int> up_in_osds;
4852 set<int> subtree_up;
4853 unordered_map<int, set<int> > subtree_type_down;
4854 unordered_map<int, int> num_osds_subtree;
4855 int max_type = crush->get_max_type_id();
4856
4857 for (int i = 0; i < get_max_osd(); i++) {
4858 if (!exists(i)) {
4859 if (crush->item_exists(i)) {
4860 osds.insert(i);
4861 }
4862 continue;
4863 }
4864 if (is_out(i))
4865 continue;
4866 ++num_in_osds;
4867 if (down_in_osds.count(i) || up_in_osds.count(i))
4868 continue;
4869 if (!is_up(i)) {
4870 down_in_osds.insert(i);
4871 int parent_id = 0;
4872 int current = i;
4873 for (int type = 0; type <= max_type; type++) {
4874 if (!crush->get_type_name(type))
4875 continue;
4876 int r = crush->get_immediate_parent_id(current, &parent_id);
4877 if (r == -ENOENT)
4878 break;
4879 // break early if this parent is already marked as up
4880 if (subtree_up.count(parent_id))
4881 break;
4882 type = crush->get_bucket_type(parent_id);
4883 if (!subtree_type_is_down(
4884 g_ceph_context, parent_id, type,
4885 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
4886 break;
4887 current = parent_id;
4888 }
4889 }
4890 }
4891
4892 // calculate the number of down osds in each down subtree and
4893 // store it in num_osds_subtree
4894 for (int type = 1; type <= max_type; type++) {
4895 if (!crush->get_type_name(type))
4896 continue;
4897 for (auto j = subtree_type_down[type].begin();
4898 j != subtree_type_down[type].end();
4899 ++j) {
4900 list<int> children;
4901 int num = 0;
4902 int num_children = crush->get_children(*j, &children);
4903 if (num_children == 0)
4904 continue;
4905 for (auto l = children.begin(); l != children.end(); ++l) {
4906 if (*l >= 0) {
4907 ++num;
4908 } else if (num_osds_subtree[*l] > 0) {
4909 num = num + num_osds_subtree[*l];
4910 }
4911 }
4912 num_osds_subtree[*j] = num;
4913 }
4914 }
4915 num_down_in_osds = down_in_osds.size();
4916 assert(num_down_in_osds <= num_in_osds);
4917 if (num_down_in_osds > 0) {
4918 // summary of down subtree types and osds
4919 for (int type = max_type; type > 0; type--) {
4920 if (!crush->get_type_name(type))
4921 continue;
4922 if (subtree_type_down[type].size() > 0) {
4923 ostringstream ss;
4924 ss << subtree_type_down[type].size() << " "
4925 << crush->get_type_name(type);
4926 if (subtree_type_down[type].size() > 1) {
4927 ss << "s";
4928 }
4929 int sum_down_osds = 0;
4930 for (auto j = subtree_type_down[type].begin();
4931 j != subtree_type_down[type].end();
4932 ++j) {
4933 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
4934 }
4935 ss << " (" << sum_down_osds << " osds) down";
4936 string err = string("OSD_") +
4937 string(crush->get_type_name(type)) + "_DOWN";
4938 boost::to_upper(err);
4939 auto& d = checks->add(err, HEALTH_WARN, ss.str());
4940 for (auto j = subtree_type_down[type].rbegin();
4941 j != subtree_type_down[type].rend();
4942 ++j) {
4943 ostringstream ss;
4944 ss << crush->get_type_name(type);
4945 ss << " ";
4946 ss << crush->get_item_name(*j);
4947 // at the top level, do not print location
4948 if (type != max_type) {
4949 ss << " (";
4950 ss << crush->get_full_location_ordered_string(*j);
4951 ss << ")";
4952 }
4953 int num = num_osds_subtree[*j];
4954 ss << " (" << num << " osds)";
4955 ss << " is down";
4956 d.detail.push_back(ss.str());
4957 }
4958 }
4959 }
4960 ostringstream ss;
4961 ss << down_in_osds.size() << " osds down";
4962 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
4963 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
4964 ostringstream ss;
4965 ss << "osd." << *it << " (";
4966 ss << crush->get_full_location_ordered_string(*it);
4967 ss << ") is down";
4968 d.detail.push_back(ss.str());
4969 }
4970 }
4971
4972 if (!osds.empty()) {
4973 ostringstream ss;
4974 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
4975 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
4976 for (auto osd : osds) {
4977 ostringstream ss;
4978 ss << "osd." << osd << " exists in crush map but not in osdmap";
4979 d.detail.push_back(ss.str());
4980 }
4981 }
4982 }
4983
4984 // OSD_OUT_OF_ORDER_FULL
4985 {
4986 // An osd could configure failsafe ratio, to something different
4987 // but for now assume it is the same here.
4988 float fsr = g_conf->osd_failsafe_full_ratio;
4989 if (fsr > 1.0) fsr /= 100;
4990 float fr = get_full_ratio();
4991 float br = get_backfillfull_ratio();
4992 float nr = get_nearfull_ratio();
4993
4994 list<string> detail;
4995 // These checks correspond to how OSDService::check_full_status() in an OSD
4996 // handles the improper setting of these values.
4997 if (br < nr) {
4998 ostringstream ss;
4999 ss << "backfillfull_ratio (" << br
5000 << ") < nearfull_ratio (" << nr << "), increased";
5001 detail.push_back(ss.str());
5002 br = nr;
5003 }
5004 if (fr < br) {
5005 ostringstream ss;
5006 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
5007 << "), increased";
5008 detail.push_back(ss.str());
5009 fr = br;
5010 }
5011 if (fsr < fr) {
5012 ostringstream ss;
5013 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
5014 << "), increased";
5015 detail.push_back(ss.str());
5016 }
5017 if (!detail.empty()) {
5018 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
5019 "full ratio(s) out of order");
5020 d.detail.swap(detail);
5021 }
5022 }
5023
5024 // OSD_FULL
5025 // OSD_NEARFULL
5026 // OSD_BACKFILLFULL
5027 // OSD_FAILSAFE_FULL
5028 {
5029 set<int> full, backfillfull, nearfull;
5030 get_full_osd_counts(&full, &backfillfull, &nearfull);
5031 if (full.size()) {
5032 ostringstream ss;
5033 ss << full.size() << " full osd(s)";
5034 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
5035 for (auto& i: full) {
5036 ostringstream ss;
5037 ss << "osd." << i << " is full";
5038 d.detail.push_back(ss.str());
5039 }
5040 }
5041 if (backfillfull.size()) {
5042 ostringstream ss;
5043 ss << backfillfull.size() << " backfillfull osd(s)";
5044 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
5045 for (auto& i: backfillfull) {
5046 ostringstream ss;
5047 ss << "osd." << i << " is backfill full";
5048 d.detail.push_back(ss.str());
5049 }
5050 }
5051 if (nearfull.size()) {
5052 ostringstream ss;
5053 ss << nearfull.size() << " nearfull osd(s)";
5054 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
5055 for (auto& i: nearfull) {
5056 ostringstream ss;
5057 ss << "osd." << i << " is near full";
5058 d.detail.push_back(ss.str());
5059 }
5060 }
5061 }
5062
5063 // OSDMAP_FLAGS
5064 {
5065 // warn about flags
5066 uint64_t warn_flags =
3efd9988 5067 CEPH_OSDMAP_NEARFULL |
224ce89b
WB
5068 CEPH_OSDMAP_FULL |
5069 CEPH_OSDMAP_PAUSERD |
5070 CEPH_OSDMAP_PAUSEWR |
5071 CEPH_OSDMAP_PAUSEREC |
5072 CEPH_OSDMAP_NOUP |
5073 CEPH_OSDMAP_NODOWN |
5074 CEPH_OSDMAP_NOIN |
5075 CEPH_OSDMAP_NOOUT |
5076 CEPH_OSDMAP_NOBACKFILL |
5077 CEPH_OSDMAP_NORECOVER |
5078 CEPH_OSDMAP_NOSCRUB |
5079 CEPH_OSDMAP_NODEEP_SCRUB |
5080 CEPH_OSDMAP_NOTIERAGENT |
5081 CEPH_OSDMAP_NOREBALANCE;
5082 if (test_flag(warn_flags)) {
5083 ostringstream ss;
5084 ss << get_flag_string(get_flags() & warn_flags)
5085 << " flag(s) set";
5086 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
5087 }
5088 }
5089
5090 // OSD_FLAGS
5091 {
5092 list<string> detail;
5093 const unsigned flags =
5094 CEPH_OSD_NOUP |
5095 CEPH_OSD_NOIN |
5096 CEPH_OSD_NODOWN |
5097 CEPH_OSD_NOOUT;
5098 for (int i = 0; i < max_osd; ++i) {
5099 if (osd_state[i] & flags) {
5100 ostringstream ss;
5101 set<string> states;
5102 OSDMap::calc_state_set(osd_state[i] & flags, states);
5103 ss << "osd." << i << " has flags " << states;
5104 detail.push_back(ss.str());
5105 }
5106 }
5107 if (!detail.empty()) {
5108 ostringstream ss;
5109 ss << detail.size() << " osd(s) have {NOUP,NODOWN,NOIN,NOOUT} flags set";
5110 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
5111 d.detail.swap(detail);
5112 }
5113 }
5114
5115 // OLD_CRUSH_TUNABLES
5116 if (g_conf->mon_warn_on_legacy_crush_tunables) {
5117 string min = crush->get_min_required_version();
5118 if (min < g_conf->mon_crush_min_required_version) {
5119 ostringstream ss;
5120 ss << "crush map has legacy tunables (require " << min
5121 << ", min is " << g_conf->mon_crush_min_required_version << ")";
5122 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
5123 d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5124 }
5125 }
5126
5127 // OLD_CRUSH_STRAW_CALC_VERSION
5128 if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
5129 if (crush->get_straw_calc_version() == 0) {
5130 ostringstream ss;
5131 ss << "crush map has straw_calc_version=0";
5132 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
5133 d.detail.push_back(
5134 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5135 }
5136 }
5137
5138 // CACHE_POOL_NO_HIT_SET
5139 if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
5140 list<string> detail;
5141 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5142 p != pools.end();
5143 ++p) {
5144 const pg_pool_t& info = p->second;
5145 if (info.cache_mode_requires_hit_set() &&
5146 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
5147 ostringstream ss;
5148 ss << "pool '" << get_pool_name(p->first)
5149 << "' with cache_mode " << info.get_cache_mode_name()
5150 << " needs hit_set_type to be set but it is not";
5151 detail.push_back(ss.str());
5152 }
5153 }
5154 if (!detail.empty()) {
5155 ostringstream ss;
5156 ss << detail.size() << " cache pools are missing hit_sets";
5157 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
5158 d.detail.swap(detail);
5159 }
5160 }
5161
5162 // OSD_NO_SORTBITWISE
5163 if (!test_flag(CEPH_OSDMAP_SORTBITWISE) &&
5164 (get_up_osd_features() &
5165 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
5166 ostringstream ss;
5167 ss << "no legacy OSD present but 'sortbitwise' flag is not set";
5168 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
5169 }
5170
5171 // OSD_UPGRADE_FINISHED
5172 // none of these (yet) since we don't run until luminous upgrade is done.
5173
3efd9988 5174 // POOL_NEARFULL/BACKFILLFULL/FULL
224ce89b 5175 {
3efd9988 5176 list<string> full_detail, backfillfull_detail, nearfull_detail;
224ce89b
WB
5177 for (auto it : get_pools()) {
5178 const pg_pool_t &pool = it.second;
3efd9988 5179 const string& pool_name = get_pool_name(it.first);
224ce89b 5180 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
224ce89b 5181 stringstream ss;
3efd9988
FG
5182 if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
5183 // may run out of space too,
5184 // but we want EQUOTA taking precedence
5185 ss << "pool '" << pool_name << "' is full (no quota)";
5186 } else {
5187 ss << "pool '" << pool_name << "' is full (no space)";
5188 }
5189 full_detail.push_back(ss.str());
5190 } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
5191 stringstream ss;
5192 ss << "pool '" << pool_name << "' is backfillfull";
5193 backfillfull_detail.push_back(ss.str());
5194 } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
5195 stringstream ss;
5196 ss << "pool '" << pool_name << "' is nearfull";
5197 nearfull_detail.push_back(ss.str());
224ce89b
WB
5198 }
5199 }
3efd9988 5200 if (!full_detail.empty()) {
224ce89b 5201 ostringstream ss;
3efd9988 5202 ss << full_detail.size() << " pool(s) full";
224ce89b 5203 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
3efd9988
FG
5204 d.detail.swap(full_detail);
5205 }
5206 if (!backfillfull_detail.empty()) {
5207 ostringstream ss;
5208 ss << backfillfull_detail.size() << " pool(s) backfillfull";
5209 auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
5210 d.detail.swap(backfillfull_detail);
5211 }
5212 if (!nearfull_detail.empty()) {
5213 ostringstream ss;
5214 ss << nearfull_detail.size() << " pool(s) nearfull";
5215 auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
5216 d.detail.swap(nearfull_detail);
224ce89b
WB
5217 }
5218 }
5219}
35e4c445
FG
5220
5221int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
5222 ostream *ss) const
5223{
5224 out->clear();
5225 for (auto i = ls.begin(); i != ls.end(); ++i) {
5226 if (i == ls.begin() &&
5227 (*i == "any" || *i == "all" || *i == "*")) {
5228 get_all_osds(*out);
5229 break;
5230 }
5231 long osd = parse_osd_id(i->c_str(), ss);
5232 if (osd < 0) {
5233 *ss << "invalid osd id '" << *i << "'";
5234 return -EINVAL;
5235 }
5236 out->insert(osd);
5237 }
5238 return 0;
5239}