]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSDMap.cc
update sources to v12.1.1
[ceph.git] / ceph / src / osd / OSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <boost/algorithm/string.hpp>
19
20 #include "OSDMap.h"
21 #include <algorithm>
22 #include "common/config.h"
23 #include "common/Formatter.h"
24 #include "common/TextTable.h"
25 #include "include/ceph_features.h"
26 #include "include/str_map.h"
27
28 #include "common/code_environment.h"
29 #include "mon/health_check.h"
30
31 #include "crush/CrushTreeDumper.h"
32 #include "common/Clock.h"
33 #include "mon/PGStatService.h"
34
35 #define dout_subsys ceph_subsys_osd
36
37 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
38 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
39
40
41 // ----------------------------------
42 // osd_info_t
43
44 void osd_info_t::dump(Formatter *f) const
45 {
46 f->dump_int("last_clean_begin", last_clean_begin);
47 f->dump_int("last_clean_end", last_clean_end);
48 f->dump_int("up_from", up_from);
49 f->dump_int("up_thru", up_thru);
50 f->dump_int("down_at", down_at);
51 f->dump_int("lost_at", lost_at);
52 }
53
54 void osd_info_t::encode(bufferlist& bl) const
55 {
56 __u8 struct_v = 1;
57 ::encode(struct_v, bl);
58 ::encode(last_clean_begin, bl);
59 ::encode(last_clean_end, bl);
60 ::encode(up_from, bl);
61 ::encode(up_thru, bl);
62 ::encode(down_at, bl);
63 ::encode(lost_at, bl);
64 }
65
66 void osd_info_t::decode(bufferlist::iterator& bl)
67 {
68 __u8 struct_v;
69 ::decode(struct_v, bl);
70 ::decode(last_clean_begin, bl);
71 ::decode(last_clean_end, bl);
72 ::decode(up_from, bl);
73 ::decode(up_thru, bl);
74 ::decode(down_at, bl);
75 ::decode(lost_at, bl);
76 }
77
78 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
79 {
80 o.push_back(new osd_info_t);
81 o.push_back(new osd_info_t);
82 o.back()->last_clean_begin = 1;
83 o.back()->last_clean_end = 2;
84 o.back()->up_from = 30;
85 o.back()->up_thru = 40;
86 o.back()->down_at = 5;
87 o.back()->lost_at = 6;
88 }
89
90 ostream& operator<<(ostream& out, const osd_info_t& info)
91 {
92 out << "up_from " << info.up_from
93 << " up_thru " << info.up_thru
94 << " down_at " << info.down_at
95 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
96 if (info.lost_at)
97 out << " lost_at " << info.lost_at;
98 return out;
99 }
100
101 // ----------------------------------
102 // osd_xinfo_t
103
104 void osd_xinfo_t::dump(Formatter *f) const
105 {
106 f->dump_stream("down_stamp") << down_stamp;
107 f->dump_float("laggy_probability", laggy_probability);
108 f->dump_int("laggy_interval", laggy_interval);
109 f->dump_int("features", features);
110 f->dump_unsigned("old_weight", old_weight);
111 }
112
113 void osd_xinfo_t::encode(bufferlist& bl) const
114 {
115 ENCODE_START(3, 1, bl);
116 ::encode(down_stamp, bl);
117 __u32 lp = laggy_probability * 0xfffffffful;
118 ::encode(lp, bl);
119 ::encode(laggy_interval, bl);
120 ::encode(features, bl);
121 ::encode(old_weight, bl);
122 ENCODE_FINISH(bl);
123 }
124
125 void osd_xinfo_t::decode(bufferlist::iterator& bl)
126 {
127 DECODE_START(3, bl);
128 ::decode(down_stamp, bl);
129 __u32 lp;
130 ::decode(lp, bl);
131 laggy_probability = (float)lp / (float)0xffffffff;
132 ::decode(laggy_interval, bl);
133 if (struct_v >= 2)
134 ::decode(features, bl);
135 else
136 features = 0;
137 if (struct_v >= 3)
138 ::decode(old_weight, bl);
139 else
140 old_weight = 0;
141 DECODE_FINISH(bl);
142 }
143
144 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
145 {
146 o.push_back(new osd_xinfo_t);
147 o.push_back(new osd_xinfo_t);
148 o.back()->down_stamp = utime_t(2, 3);
149 o.back()->laggy_probability = .123;
150 o.back()->laggy_interval = 123456;
151 o.back()->old_weight = 0x7fff;
152 }
153
154 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
155 {
156 return out << "down_stamp " << xi.down_stamp
157 << " laggy_probability " << xi.laggy_probability
158 << " laggy_interval " << xi.laggy_interval
159 << " old_weight " << xi.old_weight;
160 }
161
162 // ----------------------------------
163 // OSDMap::Incremental
164
165 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
166 {
167 int n = 0;
168 for (auto &weight : new_weight) {
169 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
170 n++; // marked out
171 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
172 n--; // marked in
173 }
174 return n;
175 }
176
177 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
178 {
179 int n = 0;
180 for (auto &state : new_state) { //
181 if (state.second & CEPH_OSD_UP) {
182 if (previous->is_up(state.first))
183 n++; // marked down
184 else
185 n--; // marked up
186 }
187 }
188 return n;
189 }
190
191 int OSDMap::Incremental::identify_osd(uuid_d u) const
192 {
193 for (auto &uuid : new_uuid)
194 if (uuid.second == u)
195 return uuid.first;
196 return -1;
197 }
198
199 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
200 const OSDMap& osdmap)
201 {
202 assert(epoch == osdmap.get_epoch() + 1);
203
204 for (auto &new_pool : new_pools) {
205 if (!new_pool.second.tiers.empty()) {
206 pg_pool_t& base = new_pool.second;
207
208 for (const auto &tier_pool : base.tiers) {
209 const auto &r = new_pools.find(tier_pool);
210 pg_pool_t *tier = 0;
211 if (r == new_pools.end()) {
212 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
213 if (!orig) {
214 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
215 return -EIO;
216 }
217 tier = get_new_pool(tier_pool, orig);
218 } else {
219 tier = &r->second;
220 }
221 if (tier->tier_of != new_pool.first) {
222 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
223 return -EIO;
224 }
225
226 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
227 << tier_pool << dendl;
228 tier->snap_seq = base.snap_seq;
229 tier->snap_epoch = base.snap_epoch;
230 tier->snaps = base.snaps;
231 tier->removed_snaps = base.removed_snaps;
232 }
233 }
234 }
235 return 0;
236 }
237
238
239 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
240 {
241 if (id >= 0)
242 return is_down(id);
243
244 if (down_cache &&
245 down_cache->count(id)) {
246 return true;
247 }
248
249 list<int> children;
250 crush->get_children(id, &children);
251 for (const auto &child : children) {
252 if (!subtree_is_down(child, down_cache)) {
253 return false;
254 }
255 }
256 if (down_cache) {
257 down_cache->insert(id);
258 }
259 return true;
260 }
261
262 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
263 {
264 // use a stack-local down_cache if we didn't get one from the
265 // caller. then at least this particular call will avoid duplicated
266 // work.
267 set<int> local_down_cache;
268 if (!down_cache) {
269 down_cache = &local_down_cache;
270 }
271
272 int current = id;
273 while (true) {
274 int type;
275 if (current >= 0) {
276 type = 0;
277 } else {
278 type = crush->get_bucket_type(current);
279 }
280 assert(type >= 0);
281
282 if (!subtree_is_down(current, down_cache)) {
283 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
284 return false;
285 }
286
287 // is this a big enough subtree to be marked as down?
288 if (type >= subtree_type) {
289 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
290 return true;
291 }
292
293 int r = crush->get_immediate_parent_id(current, &current);
294 if (r < 0) {
295 return false;
296 }
297 }
298 }
299
300 bool OSDMap::subtree_type_is_down(
301 CephContext *cct,
302 int id,
303 int subtree_type,
304 set<int> *down_in_osds,
305 set<int> *up_in_osds,
306 set<int> *subtree_up,
307 unordered_map<int, set<int> > *subtree_type_down) const
308 {
309 if (id >= 0) {
310 bool is_down_ret = is_down(id);
311 if (!is_out(id)) {
312 if (is_down_ret) {
313 down_in_osds->insert(id);
314 } else {
315 up_in_osds->insert(id);
316 }
317 }
318 return is_down_ret;
319 }
320
321 if (subtree_type_down &&
322 (*subtree_type_down)[subtree_type].count(id)) {
323 return true;
324 }
325
326 list<int> children;
327 crush->get_children(id, &children);
328 for (const auto &child : children) {
329 if (!subtree_type_is_down(
330 cct, child, crush->get_bucket_type(child),
331 down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
332 subtree_up->insert(id);
333 return false;
334 }
335 }
336 if (subtree_type_down) {
337 (*subtree_type_down)[subtree_type].insert(id);
338 }
339 return true;
340 }
341
342 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
343 {
344 __u16 v = 5;
345 ::encode(v, bl);
346 ::encode(fsid, bl);
347 ::encode(epoch, bl);
348 ::encode(modified, bl);
349 int32_t new_t = new_pool_max;
350 ::encode(new_t, bl);
351 ::encode(new_flags, bl);
352 ::encode(fullmap, bl);
353 ::encode(crush, bl);
354
355 ::encode(new_max_osd, bl);
356 // for ::encode(new_pools, bl);
357 __u32 n = new_pools.size();
358 ::encode(n, bl);
359 for (const auto &new_pool : new_pools) {
360 n = new_pool.first;
361 ::encode(n, bl);
362 ::encode(new_pool.second, bl, 0);
363 }
364 // for ::encode(new_pool_names, bl);
365 n = new_pool_names.size();
366 ::encode(n, bl);
367
368 for (const auto &new_pool_name : new_pool_names) {
369 n = new_pool_name.first;
370 ::encode(n, bl);
371 ::encode(new_pool_name.second, bl);
372 }
373 // for ::encode(old_pools, bl);
374 n = old_pools.size();
375 ::encode(n, bl);
376 for (auto &old_pool : old_pools) {
377 n = old_pool;
378 ::encode(n, bl);
379 }
380 ::encode(new_up_client, bl, 0);
381 {
382 // legacy is map<int32_t,uint8_t>
383 uint32_t n = new_state.size();
384 ::encode(n, bl);
385 for (auto p : new_state) {
386 ::encode(p.first, bl);
387 ::encode((uint8_t)p.second, bl);
388 }
389 }
390 ::encode(new_weight, bl);
391 // for ::encode(new_pg_temp, bl);
392 n = new_pg_temp.size();
393 ::encode(n, bl);
394
395 for (const auto &pg_temp : new_pg_temp) {
396 old_pg_t opg = pg_temp.first.get_old_pg();
397 ::encode(opg, bl);
398 ::encode(pg_temp.second, bl);
399 }
400 }
401
402 void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
403 {
404 if ((features & CEPH_FEATURE_PGID64) == 0) {
405 encode_client_old(bl);
406 return;
407 }
408
409 // base
410 __u16 v = 6;
411 ::encode(v, bl);
412 ::encode(fsid, bl);
413 ::encode(epoch, bl);
414 ::encode(modified, bl);
415 ::encode(new_pool_max, bl);
416 ::encode(new_flags, bl);
417 ::encode(fullmap, bl);
418 ::encode(crush, bl);
419
420 ::encode(new_max_osd, bl);
421 ::encode(new_pools, bl, features);
422 ::encode(new_pool_names, bl);
423 ::encode(old_pools, bl);
424 ::encode(new_up_client, bl, features);
425 {
426 uint32_t n = new_state.size();
427 ::encode(n, bl);
428 for (auto p : new_state) {
429 ::encode(p.first, bl);
430 ::encode((uint8_t)p.second, bl);
431 }
432 }
433 ::encode(new_weight, bl);
434 ::encode(new_pg_temp, bl);
435
436 // extended
437 __u16 ev = 10;
438 ::encode(ev, bl);
439 ::encode(new_hb_back_up, bl, features);
440 ::encode(new_up_thru, bl);
441 ::encode(new_last_clean_interval, bl);
442 ::encode(new_lost, bl);
443 ::encode(new_blacklist, bl, features);
444 ::encode(old_blacklist, bl, features);
445 ::encode(new_up_cluster, bl, features);
446 ::encode(cluster_snapshot, bl);
447 ::encode(new_uuid, bl);
448 ::encode(new_xinfo, bl);
449 ::encode(new_hb_front_up, bl, features);
450 }
451
452 void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
453 {
454 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
455 encode_classic(bl, features);
456 return;
457 }
458
459 // only a select set of callers should *ever* be encoding new
460 // OSDMaps. others should be passing around the canonical encoded
461 // buffers from on high. select out those callers by passing in an
462 // "impossible" feature bit.
463 assert(features & CEPH_FEATURE_RESERVED);
464 features &= ~CEPH_FEATURE_RESERVED;
465
466 size_t start_offset = bl.length();
467 size_t tail_offset;
468 buffer::list::iterator crc_it;
469
470 // meta-encoding: how we include client-used and osd-specific data
471 ENCODE_START(8, 7, bl);
472
473 {
474 uint8_t v = 5;
475 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
476 v = 3;
477 }
478 ENCODE_START(v, 1, bl); // client-usable data
479 ::encode(fsid, bl);
480 ::encode(epoch, bl);
481 ::encode(modified, bl);
482 ::encode(new_pool_max, bl);
483 ::encode(new_flags, bl);
484 ::encode(fullmap, bl);
485 ::encode(crush, bl);
486
487 ::encode(new_max_osd, bl);
488 ::encode(new_pools, bl, features);
489 ::encode(new_pool_names, bl);
490 ::encode(old_pools, bl);
491 ::encode(new_up_client, bl, features);
492 if (v >= 5) {
493 ::encode(new_state, bl);
494 } else {
495 uint32_t n = new_state.size();
496 ::encode(n, bl);
497 for (auto p : new_state) {
498 ::encode(p.first, bl);
499 ::encode((uint8_t)p.second, bl);
500 }
501 }
502 ::encode(new_weight, bl);
503 ::encode(new_pg_temp, bl);
504 ::encode(new_primary_temp, bl);
505 ::encode(new_primary_affinity, bl);
506 ::encode(new_erasure_code_profiles, bl);
507 ::encode(old_erasure_code_profiles, bl);
508 if (v >= 4) {
509 ::encode(new_pg_upmap, bl);
510 ::encode(old_pg_upmap, bl);
511 ::encode(new_pg_upmap_items, bl);
512 ::encode(old_pg_upmap_items, bl);
513 }
514 ENCODE_FINISH(bl); // client-usable data
515 }
516
517 {
518 uint8_t target_v = 6;
519 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
520 target_v = 2;
521 }
522 ENCODE_START(target_v, 1, bl); // extended, osd-only data
523 ::encode(new_hb_back_up, bl, features);
524 ::encode(new_up_thru, bl);
525 ::encode(new_last_clean_interval, bl);
526 ::encode(new_lost, bl);
527 ::encode(new_blacklist, bl, features);
528 ::encode(old_blacklist, bl, features);
529 ::encode(new_up_cluster, bl, features);
530 ::encode(cluster_snapshot, bl);
531 ::encode(new_uuid, bl);
532 ::encode(new_xinfo, bl);
533 ::encode(new_hb_front_up, bl, features);
534 ::encode(features, bl); // NOTE: features arg, not the member
535 if (target_v >= 3) {
536 ::encode(new_nearfull_ratio, bl);
537 ::encode(new_full_ratio, bl);
538 ::encode(new_backfillfull_ratio, bl);
539 }
540 // 5 was string-based new_require_min_compat_client
541 if (target_v >= 6) {
542 ::encode(new_require_min_compat_client, bl);
543 ::encode(new_require_osd_release, bl);
544 }
545 ENCODE_FINISH(bl); // osd-only data
546 }
547
548 ::encode((uint32_t)0, bl); // dummy inc_crc
549 crc_it = bl.end();
550 crc_it.advance(-4);
551 tail_offset = bl.length();
552
553 ::encode(full_crc, bl);
554
555 ENCODE_FINISH(bl); // meta-encoding wrapper
556
557 // fill in crc
558 bufferlist front;
559 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
560 inc_crc = front.crc32c(-1);
561 bufferlist tail;
562 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
563 inc_crc = tail.crc32c(inc_crc);
564 ceph_le32 crc_le;
565 crc_le = inc_crc;
566 crc_it.copy_in(4, (char*)&crc_le);
567 have_crc = true;
568 }
569
570 void OSDMap::Incremental::decode_classic(bufferlist::iterator &p)
571 {
572 __u32 n, t;
573 // base
574 __u16 v;
575 ::decode(v, p);
576 ::decode(fsid, p);
577 ::decode(epoch, p);
578 ::decode(modified, p);
579 if (v == 4 || v == 5) {
580 ::decode(n, p);
581 new_pool_max = n;
582 } else if (v >= 6)
583 ::decode(new_pool_max, p);
584 ::decode(new_flags, p);
585 ::decode(fullmap, p);
586 ::decode(crush, p);
587
588 ::decode(new_max_osd, p);
589 if (v < 6) {
590 new_pools.clear();
591 ::decode(n, p);
592 while (n--) {
593 ::decode(t, p);
594 ::decode(new_pools[t], p);
595 }
596 } else {
597 ::decode(new_pools, p);
598 }
599 if (v == 5) {
600 new_pool_names.clear();
601 ::decode(n, p);
602 while (n--) {
603 ::decode(t, p);
604 ::decode(new_pool_names[t], p);
605 }
606 } else if (v >= 6) {
607 ::decode(new_pool_names, p);
608 }
609 if (v < 6) {
610 old_pools.clear();
611 ::decode(n, p);
612 while (n--) {
613 ::decode(t, p);
614 old_pools.insert(t);
615 }
616 } else {
617 ::decode(old_pools, p);
618 }
619 ::decode(new_up_client, p);
620 {
621 map<int32_t,uint8_t> ns;
622 ::decode(ns, p);
623 for (auto q : ns) {
624 new_state[q.first] = q.second;
625 }
626 }
627 ::decode(new_weight, p);
628
629 if (v < 6) {
630 new_pg_temp.clear();
631 ::decode(n, p);
632 while (n--) {
633 old_pg_t opg;
634 ::decode_raw(opg, p);
635 ::decode(new_pg_temp[pg_t(opg)], p);
636 }
637 } else {
638 ::decode(new_pg_temp, p);
639 }
640
641 // decode short map, too.
642 if (v == 5 && p.end())
643 return;
644
645 // extended
646 __u16 ev = 0;
647 if (v >= 5)
648 ::decode(ev, p);
649 ::decode(new_hb_back_up, p);
650 if (v < 5)
651 ::decode(new_pool_names, p);
652 ::decode(new_up_thru, p);
653 ::decode(new_last_clean_interval, p);
654 ::decode(new_lost, p);
655 ::decode(new_blacklist, p);
656 ::decode(old_blacklist, p);
657 if (ev >= 6)
658 ::decode(new_up_cluster, p);
659 if (ev >= 7)
660 ::decode(cluster_snapshot, p);
661 if (ev >= 8)
662 ::decode(new_uuid, p);
663 if (ev >= 9)
664 ::decode(new_xinfo, p);
665 if (ev >= 10)
666 ::decode(new_hb_front_up, p);
667 }
668
669 void OSDMap::Incremental::decode(bufferlist::iterator& bl)
670 {
671 /**
672 * Older encodings of the Incremental had a single struct_v which
673 * covered the whole encoding, and was prior to our modern
674 * stuff which includes a compatv and a size. So if we see
675 * a struct_v < 7, we must rewind to the beginning and use our
676 * classic decoder.
677 */
678 size_t start_offset = bl.get_off();
679 size_t tail_offset = 0;
680 bufferlist crc_front, crc_tail;
681
682 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
683 if (struct_v < 7) {
684 int struct_v_size = sizeof(struct_v);
685 bl.advance(-struct_v_size);
686 decode_classic(bl);
687 encode_features = 0;
688 if (struct_v >= 6)
689 encode_features = CEPH_FEATURE_PGID64;
690 else
691 encode_features = 0;
692 return;
693 }
694 {
695 DECODE_START(5, bl); // client-usable data
696 ::decode(fsid, bl);
697 ::decode(epoch, bl);
698 ::decode(modified, bl);
699 ::decode(new_pool_max, bl);
700 ::decode(new_flags, bl);
701 ::decode(fullmap, bl);
702 ::decode(crush, bl);
703
704 ::decode(new_max_osd, bl);
705 ::decode(new_pools, bl);
706 ::decode(new_pool_names, bl);
707 ::decode(old_pools, bl);
708 ::decode(new_up_client, bl);
709 if (struct_v >= 5) {
710 ::decode(new_state, bl);
711 } else {
712 map<int32_t,uint8_t> ns;
713 ::decode(ns, bl);
714 for (auto q : ns) {
715 new_state[q.first] = q.second;
716 }
717 }
718 ::decode(new_weight, bl);
719 ::decode(new_pg_temp, bl);
720 ::decode(new_primary_temp, bl);
721 if (struct_v >= 2)
722 ::decode(new_primary_affinity, bl);
723 else
724 new_primary_affinity.clear();
725 if (struct_v >= 3) {
726 ::decode(new_erasure_code_profiles, bl);
727 ::decode(old_erasure_code_profiles, bl);
728 } else {
729 new_erasure_code_profiles.clear();
730 old_erasure_code_profiles.clear();
731 }
732 if (struct_v >= 4) {
733 ::decode(new_pg_upmap, bl);
734 ::decode(old_pg_upmap, bl);
735 ::decode(new_pg_upmap_items, bl);
736 ::decode(old_pg_upmap_items, bl);
737 }
738 DECODE_FINISH(bl); // client-usable data
739 }
740
741 {
742 DECODE_START(6, bl); // extended, osd-only data
743 ::decode(new_hb_back_up, bl);
744 ::decode(new_up_thru, bl);
745 ::decode(new_last_clean_interval, bl);
746 ::decode(new_lost, bl);
747 ::decode(new_blacklist, bl);
748 ::decode(old_blacklist, bl);
749 ::decode(new_up_cluster, bl);
750 ::decode(cluster_snapshot, bl);
751 ::decode(new_uuid, bl);
752 ::decode(new_xinfo, bl);
753 ::decode(new_hb_front_up, bl);
754 if (struct_v >= 2)
755 ::decode(encode_features, bl);
756 else
757 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
758 if (struct_v >= 3) {
759 ::decode(new_nearfull_ratio, bl);
760 ::decode(new_full_ratio, bl);
761 } else {
762 new_nearfull_ratio = -1;
763 new_full_ratio = -1;
764 }
765 if (struct_v >= 4) {
766 ::decode(new_backfillfull_ratio, bl);
767 } else {
768 new_backfillfull_ratio = -1;
769 }
770 if (struct_v == 5) {
771 string r;
772 ::decode(r, bl);
773 if (r.length()) {
774 new_require_min_compat_client = ceph_release_from_name(r.c_str());
775 }
776 }
777 if (struct_v >= 6) {
778 ::decode(new_require_min_compat_client, bl);
779 ::decode(new_require_osd_release, bl);
780 } else {
781 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
782 // only for compat with post-kraken pre-luminous test clusters
783 new_require_osd_release = CEPH_RELEASE_LUMINOUS;
784 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
785 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
786 new_require_osd_release = CEPH_RELEASE_KRAKEN;
787 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
788 new_require_osd_release = CEPH_RELEASE_JEWEL;
789 } else {
790 new_require_osd_release = -1;
791 }
792 }
793 DECODE_FINISH(bl); // osd-only data
794 }
795
796 if (struct_v >= 8) {
797 have_crc = true;
798 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
799 ::decode(inc_crc, bl);
800 tail_offset = bl.get_off();
801 ::decode(full_crc, bl);
802 } else {
803 have_crc = false;
804 full_crc = 0;
805 inc_crc = 0;
806 }
807
808 DECODE_FINISH(bl); // wrapper
809
810 if (have_crc) {
811 // verify crc
812 uint32_t actual = crc_front.crc32c(-1);
813 if (tail_offset < bl.get_off()) {
814 bufferlist tail;
815 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
816 actual = tail.crc32c(actual);
817 }
818 if (inc_crc != actual) {
819 ostringstream ss;
820 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
821 string s = ss.str();
822 throw buffer::malformed_input(s.c_str());
823 }
824 }
825 }
826
827 void OSDMap::Incremental::dump(Formatter *f) const
828 {
829 f->dump_int("epoch", epoch);
830 f->dump_stream("fsid") << fsid;
831 f->dump_stream("modified") << modified;
832 f->dump_int("new_pool_max", new_pool_max);
833 f->dump_int("new_flags", new_flags);
834 f->dump_float("new_full_ratio", new_full_ratio);
835 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
836 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
837 f->dump_int("new_require_min_compat_client", new_require_min_compat_client);
838 f->dump_int("new_require_osd_release", new_require_osd_release);
839
840 if (fullmap.length()) {
841 f->open_object_section("full_map");
842 OSDMap full;
843 bufferlist fbl = fullmap; // kludge around constness.
844 auto p = fbl.begin();
845 full.decode(p);
846 full.dump(f);
847 f->close_section();
848 }
849 if (crush.length()) {
850 f->open_object_section("crush");
851 CrushWrapper c;
852 bufferlist tbl = crush; // kludge around constness.
853 auto p = tbl.begin();
854 c.decode(p);
855 c.dump(f);
856 f->close_section();
857 }
858
859 f->dump_int("new_max_osd", new_max_osd);
860
861 f->open_array_section("new_pools");
862
863 for (const auto &new_pool : new_pools) {
864 f->open_object_section("pool");
865 f->dump_int("pool", new_pool.first);
866 new_pool.second.dump(f);
867 f->close_section();
868 }
869 f->close_section();
870 f->open_array_section("new_pool_names");
871
872 for (const auto &new_pool_name : new_pool_names) {
873 f->open_object_section("pool_name");
874 f->dump_int("pool", new_pool_name.first);
875 f->dump_string("name", new_pool_name.second);
876 f->close_section();
877 }
878 f->close_section();
879 f->open_array_section("old_pools");
880
881 for (const auto &old_pool : old_pools)
882 f->dump_int("pool", old_pool);
883 f->close_section();
884
885 f->open_array_section("new_up_osds");
886
887 for (const auto &upclient : new_up_client) {
888 f->open_object_section("osd");
889 f->dump_int("osd", upclient.first);
890 f->dump_stream("public_addr") << upclient.second;
891 f->dump_stream("cluster_addr") << new_up_cluster.find(upclient.first)->second;
892 f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(upclient.first)->second;
893 map<int32_t, entity_addr_t>::const_iterator q;
894 if ((q = new_hb_front_up.find(upclient.first)) != new_hb_front_up.end())
895 f->dump_stream("heartbeat_front_addr") << q->second;
896 f->close_section();
897 }
898 f->close_section();
899
900 f->open_array_section("new_weight");
901
902 for (const auto &weight : new_weight) {
903 f->open_object_section("osd");
904 f->dump_int("osd", weight.first);
905 f->dump_int("weight", weight.second);
906 f->close_section();
907 }
908 f->close_section();
909
910 f->open_array_section("osd_state_xor");
911 for (const auto &ns : new_state) {
912 f->open_object_section("osd");
913 f->dump_int("osd", ns.first);
914 set<string> st;
915 calc_state_set(new_state.find(ns.first)->second, st);
916 f->open_array_section("state_xor");
917 for (auto &state : st)
918 f->dump_string("state", state);
919 f->close_section();
920 }
921 f->close_section();
922
923 f->open_array_section("new_pg_temp");
924
925 for (const auto &pg_temp : new_pg_temp) {
926 f->open_object_section("pg");
927 f->dump_stream("pgid") << pg_temp.first;
928 f->open_array_section("osds");
929
930 for (const auto &osd : pg_temp.second)
931 f->dump_int("osd", osd);
932 f->close_section();
933 f->close_section();
934 }
935 f->close_section();
936
937 f->open_array_section("primary_temp");
938
939 for (const auto &primary_temp : new_primary_temp) {
940 f->dump_stream("pgid") << primary_temp.first;
941 f->dump_int("osd", primary_temp.second);
942 }
943 f->close_section(); // primary_temp
944
945 f->open_array_section("new_pg_upmap");
946 for (auto& i : new_pg_upmap) {
947 f->open_object_section("mapping");
948 f->dump_stream("pgid") << i.first;
949 f->open_array_section("osds");
950 for (auto osd : i.second) {
951 f->dump_int("osd", osd);
952 }
953 f->close_section();
954 f->close_section();
955 }
956 f->close_section();
957 f->open_array_section("old_pg_upmap");
958 for (auto& i : old_pg_upmap) {
959 f->dump_stream("pgid") << i;
960 }
961 f->close_section();
962
963 f->open_array_section("new_pg_upmap_items");
964 for (auto& i : new_pg_upmap_items) {
965 f->open_object_section("mapping");
966 f->dump_stream("pgid") << i.first;
967 f->open_array_section("mappings");
968 for (auto& p : i.second) {
969 f->open_object_section("mapping");
970 f->dump_int("from", p.first);
971 f->dump_int("to", p.second);
972 f->close_section();
973 }
974 f->close_section();
975 f->close_section();
976 }
977 f->close_section();
978 f->open_array_section("old_pg_upmap_items");
979 for (auto& i : old_pg_upmap_items) {
980 f->dump_stream("pgid") << i;
981 }
982 f->close_section();
983
984 f->open_array_section("new_up_thru");
985
986 for (const auto &up_thru : new_up_thru) {
987 f->open_object_section("osd");
988 f->dump_int("osd", up_thru.first);
989 f->dump_int("up_thru", up_thru.second);
990 f->close_section();
991 }
992 f->close_section();
993
994 f->open_array_section("new_lost");
995
996 for (const auto &lost : new_lost) {
997 f->open_object_section("osd");
998 f->dump_int("osd", lost.first);
999 f->dump_int("epoch_lost", lost.second);
1000 f->close_section();
1001 }
1002 f->close_section();
1003
1004 f->open_array_section("new_last_clean_interval");
1005
1006 for (const auto &last_clean_interval : new_last_clean_interval) {
1007 f->open_object_section("osd");
1008 f->dump_int("osd", last_clean_interval.first);
1009 f->dump_int("first", last_clean_interval.second.first);
1010 f->dump_int("last", last_clean_interval.second.second);
1011 f->close_section();
1012 }
1013 f->close_section();
1014
1015 f->open_array_section("new_blacklist");
1016 for (const auto &blist : new_blacklist) {
1017 stringstream ss;
1018 ss << blist.first;
1019 f->dump_stream(ss.str().c_str()) << blist.second;
1020 }
1021 f->close_section();
1022 f->open_array_section("old_blacklist");
1023 for (const auto &blist : old_blacklist)
1024 f->dump_stream("addr") << blist;
1025 f->close_section();
1026
1027 f->open_array_section("new_xinfo");
1028 for (const auto &xinfo : new_xinfo) {
1029 f->open_object_section("xinfo");
1030 f->dump_int("osd", xinfo.first);
1031 xinfo.second.dump(f);
1032 f->close_section();
1033 }
1034 f->close_section();
1035
1036 if (cluster_snapshot.size())
1037 f->dump_string("cluster_snapshot", cluster_snapshot);
1038
1039 f->open_array_section("new_uuid");
1040 for (const auto &uuid : new_uuid) {
1041 f->open_object_section("osd");
1042 f->dump_int("osd", uuid.first);
1043 f->dump_stream("uuid") << uuid.second;
1044 f->close_section();
1045 }
1046 f->close_section();
1047
1048 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1049 f->open_array_section("old_erasure_code_profiles");
1050 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1051 f->dump_string("old", erasure_code_profile.c_str());
1052 }
1053 f->close_section();
1054 }
1055
1056 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1057 {
1058 o.push_back(new Incremental);
1059 }
1060
1061 // ----------------------------------
1062 // OSDMap
1063
1064 void OSDMap::set_epoch(epoch_t e)
1065 {
1066 epoch = e;
1067 for (auto &pool : pools)
1068 pool.second.last_change = e;
1069 }
1070
1071 bool OSDMap::is_blacklisted(const entity_addr_t& a) const
1072 {
1073 if (blacklist.empty())
1074 return false;
1075
1076 // this specific instance?
1077 if (blacklist.count(a))
1078 return true;
1079
1080 // is entire ip blacklisted?
1081 if (a.is_ip()) {
1082 entity_addr_t b = a;
1083 b.set_port(0);
1084 b.set_nonce(0);
1085 if (blacklist.count(b)) {
1086 return true;
1087 }
1088 }
1089
1090 return false;
1091 }
1092
1093 void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1094 {
1095 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1096 }
1097
1098 void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1099 {
1100 for (const auto &i : blacklist) {
1101 bl->insert(i.first);
1102 }
1103 }
1104
1105 void OSDMap::set_max_osd(int m)
1106 {
1107 int o = max_osd;
1108 max_osd = m;
1109 osd_state.resize(m);
1110 osd_weight.resize(m);
1111 for (; o<max_osd; o++) {
1112 osd_state[o] = 0;
1113 osd_weight[o] = CEPH_OSD_OUT;
1114 }
1115 osd_info.resize(m);
1116 osd_xinfo.resize(m);
1117 osd_addrs->client_addr.resize(m);
1118 osd_addrs->cluster_addr.resize(m);
1119 osd_addrs->hb_back_addr.resize(m);
1120 osd_addrs->hb_front_addr.resize(m);
1121 osd_uuid->resize(m);
1122 if (osd_primary_affinity)
1123 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1124
1125 calc_num_osds();
1126 }
1127
1128 int OSDMap::calc_num_osds()
1129 {
1130 num_osd = 0;
1131 num_up_osd = 0;
1132 num_in_osd = 0;
1133 for (int i=0; i<max_osd; i++) {
1134 if (osd_state[i] & CEPH_OSD_EXISTS) {
1135 ++num_osd;
1136 if (osd_state[i] & CEPH_OSD_UP) {
1137 ++num_up_osd;
1138 }
1139 if (get_weight(i) != CEPH_OSD_OUT) {
1140 ++num_in_osd;
1141 }
1142 }
1143 }
1144 return num_osd;
1145 }
1146
1147 void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
1148 {
1149 *full = 0;
1150 *backfill = 0;
1151 *nearfull = 0;
1152 for (int i = 0; i < max_osd; ++i) {
1153 if (exists(i) && is_up(i) && is_in(i)) {
1154 if (osd_state[i] & CEPH_OSD_FULL)
1155 ++(*full);
1156 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1157 ++(*backfill);
1158 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1159 ++(*nearfull);
1160 }
1161 }
1162 }
1163
1164 static bool get_osd_utilization(
1165 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1166 int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail)
1167 {
1168 auto p = osd_stat.find(id);
1169 if (p == osd_stat.end())
1170 return false;
1171 *kb = p->second.kb;
1172 *kb_used = p->second.kb_used;
1173 *kb_avail = p->second.kb_avail;
1174 return *kb > 0;
1175 }
1176
1177 void OSDMap::get_full_osd_util(
1178 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1179 map<int, float> *full, map<int, float> *backfill, map<int, float> *nearfull) const
1180 {
1181 full->clear();
1182 backfill->clear();
1183 nearfull->clear();
1184 for (int i = 0; i < max_osd; ++i) {
1185 if (exists(i) && is_up(i) && is_in(i)) {
1186 int64_t kb, kb_used, kb_avail;
1187 if (osd_state[i] & CEPH_OSD_FULL) {
1188 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1189 full->emplace(i, (float)kb_used / (float)kb);
1190 } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) {
1191 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1192 backfill->emplace(i, (float)kb_used / (float)kb);
1193 } else if (osd_state[i] & CEPH_OSD_NEARFULL) {
1194 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1195 nearfull->emplace(i, (float)kb_used / (float)kb);
1196 }
1197 }
1198 }
1199 }
1200
1201 void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1202 set<int> *nearfull) const
1203 {
1204 full->clear();
1205 backfill->clear();
1206 nearfull->clear();
1207 for (int i = 0; i < max_osd; ++i) {
1208 if (exists(i) && is_up(i) && is_in(i)) {
1209 if (osd_state[i] & CEPH_OSD_FULL)
1210 full->emplace(i);
1211 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1212 backfill->emplace(i);
1213 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1214 nearfull->emplace(i);
1215 }
1216 }
1217 }
1218
1219 void OSDMap::get_all_osds(set<int32_t>& ls) const
1220 {
1221 for (int i=0; i<max_osd; i++)
1222 if (exists(i))
1223 ls.insert(i);
1224 }
1225
1226 void OSDMap::get_up_osds(set<int32_t>& ls) const
1227 {
1228 for (int i = 0; i < max_osd; i++) {
1229 if (is_up(i))
1230 ls.insert(i);
1231 }
1232 }
1233
1234 void OSDMap::get_out_osds(set<int32_t>& ls) const
1235 {
1236 for (int i = 0; i < max_osd; i++) {
1237 if (is_out(i))
1238 ls.insert(i);
1239 }
1240 }
1241
1242 void OSDMap::calc_state_set(int state, set<string>& st)
1243 {
1244 unsigned t = state;
1245 for (unsigned s = 1; t; s <<= 1) {
1246 if (t & s) {
1247 t &= ~s;
1248 st.insert(ceph_osd_state_name(s));
1249 }
1250 }
1251 }
1252
1253 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1254 {
1255 float max = 0;
1256 for (const auto &weight : weights) {
1257 if (weight.second > max)
1258 max = weight.second;
1259 }
1260
1261 for (const auto &weight : weights) {
1262 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1263 }
1264 }
1265
1266 int OSDMap::identify_osd(const entity_addr_t& addr) const
1267 {
1268 for (int i=0; i<max_osd; i++)
1269 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr))
1270 return i;
1271 return -1;
1272 }
1273
1274 int OSDMap::identify_osd(const uuid_d& u) const
1275 {
1276 for (int i=0; i<max_osd; i++)
1277 if (exists(i) && get_uuid(i) == u)
1278 return i;
1279 return -1;
1280 }
1281
1282 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1283 {
1284 for (int i=0; i<max_osd; i++)
1285 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr ||
1286 get_hb_back_addr(i) == addr || get_hb_front_addr(i) == addr))
1287 return i;
1288 return -1;
1289 }
1290
1291 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1292 {
1293 for (int i=0; i<max_osd; i++)
1294 if (exists(i) && (get_addr(i).is_same_host(ip) || get_cluster_addr(i).is_same_host(ip)))
1295 return i;
1296 return -1;
1297 }
1298
1299
// Work out which feature bits the current contents of this map call for.
//
// The return value has a bit set for each feature the map actually uses:
// nondefault CRUSH tunables and bucket/rule versions, pg_upmap entries,
// pool flags and types, erasure-code plugins, non-default primary
// affinity, and the required osd release.  If pmask is non-null it
// receives the accumulated `mask` ("things we could have").
//
// entity_type tailors the result: erasure-coded pools are not reported
// for CEPH_ENTITY_TYPE_CLIENT, and the erasure-code plugin bits and the
// jewel/kraken release bits are only evaluated for CEPH_ENTITY_TYPE_OSD.
1300 uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1301 {
1302 uint64_t features = 0; // things we actually have
1303 uint64_t mask = 0; // things we could have
1304
// --- properties of the crush map as a whole
1305 if (crush->has_nondefault_tunables())
1306 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1307 if (crush->has_nondefault_tunables2())
1308 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1309 if (crush->has_nondefault_tunables3())
1310 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1311 if (crush->has_v4_buckets())
1312 features |= CEPH_FEATURE_CRUSH_V4;
1313 if (crush->has_nondefault_tunables5())
1314 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1315 if (crush->has_incompat_choose_args())
1316 features |= CEPH_FEATURE_CRUSH_CHOOSE_ARGS;
1317 mask |= CEPH_FEATURES_CRUSH;
1318
// --- any explicit pg_upmap / pg_upmap_items entry requires upmap support
1319 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1320 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1321 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1322
// --- per-pool requirements
1323 for (auto &pool: pools) {
1324 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1325 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1326 }
1327 if (pool.second.is_erasure() &&
1328 entity_type != CEPH_ENTITY_TYPE_CLIENT) { // not for clients
1329 features |= CEPH_FEATURE_OSD_ERASURE_CODES;
1330 }
1331 if (!pool.second.tiers.empty() ||
1332 pool.second.is_tier()) {
1333 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1334 }
// features implied by the crush rule this pool resolves to (if any)
1335 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
1336 pool.second.get_type(),
1337 pool.second.get_size());
1338 if (ruleid >= 0) {
1339 if (crush->is_v2_rule(ruleid))
1340 features |= CEPH_FEATURE_CRUSH_V2;
1341 if (crush->is_v3_rule(ruleid))
1342 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1343 if (crush->is_v5_rule(ruleid))
1344 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1345 }
1346 }
// --- erasure-code plugins named by any profile; evaluated for osds only
1347 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1348 for (auto &erasure_code_profile : erasure_code_profiles) {
1349 auto& profile = erasure_code_profile.second;
1350 const auto& plugin = profile.find("plugin");
1351 if (plugin != profile.end()) {
1352 if (plugin->second == "isa" || plugin->second == "lrc")
1353 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
1354 if (plugin->second == "shec")
1355 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
1356 }
1357 }
1358 }
1359 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1360 if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
1361 mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
1362
// --- primary affinity: one non-default value is enough to need the feature
1363 if (osd_primary_affinity) {
1364 for (int i = 0; i < max_osd; ++i) {
1365 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1366 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1367 break;
1368 }
1369 }
1370 }
1371 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1372
// --- server release bits, gated on require_osd_release; osds only
1373 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1374 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
1375 if (require_osd_release >= CEPH_RELEASE_JEWEL) {
1376 features |= jewel_features;
1377 }
1378 mask |= jewel_features;
1379
1380 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1381 | CEPH_FEATURE_MSG_ADDR2;
1382 if (require_osd_release >= CEPH_RELEASE_KRAKEN) {
1383 features |= kraken_features;
1384 }
1385 mask |= kraken_features;
1386 }
1387
1388 if (pmask)
1389 *pmask = mask;
1390 return features;
1391 }
1392
1393 uint8_t OSDMap::get_min_compat_client() const
1394 {
1395 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1396
1397 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1398 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1399 return CEPH_RELEASE_LUMINOUS; // v12.2.0
1400 }
1401 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1402 return CEPH_RELEASE_JEWEL; // v10.2.0
1403 }
1404 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1405 return CEPH_RELEASE_HAMMER; // v0.94.0
1406 }
1407 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1408 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1409 HAVE_FEATURE(f, OSD_ERASURE_CODES) || // v0.73-498-gbfc86a8
1410 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1411 return CEPH_RELEASE_FIREFLY; // v0.80.0
1412 }
1413 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1414 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1415 return CEPH_RELEASE_DUMPLING; // v0.67.0
1416 }
1417 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1418 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
1419 }
1420 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
1421 }
1422
1423 void OSDMap::_calc_up_osd_features()
1424 {
1425 bool first = true;
1426 cached_up_osd_features = 0;
1427 for (int osd = 0; osd < max_osd; ++osd) {
1428 if (!is_up(osd))
1429 continue;
1430 const osd_xinfo_t &xi = get_xinfo(osd);
1431 if (first) {
1432 cached_up_osd_features = xi.features;
1433 first = false;
1434 } else {
1435 cached_up_osd_features &= xi.features;
1436 }
1437 }
1438 }
1439
1440 uint64_t OSDMap::get_up_osd_features() const
1441 {
1442 return cached_up_osd_features;
1443 }
1444
// Make map n share sub-structures with map o wherever their contents are
// equal, by pointing n's members at o's instead of keeping its own copy.
// Covers the four osd address tables, the crush map, pg_temp,
// primary_temp and the osd uuid table.  Does nothing when both maps have
// the same epoch.
// NOTE(review): the members are assigned like shared handles
// (presumably shared_ptr) -- confirm against the class declaration.
1445 void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1446 {
1447 if (o->epoch == n->epoch)
1448 return;
1449
// diff counts every reason the address tables cannot be shared wholesale:
// a different max_osd, or any individual slot that is missing on either
// side or holds a different address.
1450 int diff = 0;
1451
1452 // do addrs match?
1453 if (o->max_osd != n->max_osd)
1454 diff++;
// only the slots present in both maps are compared; equal entries are
// shared one by one even if the tables as a whole turn out to differ
1455 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1456 if ( n->osd_addrs->client_addr[i] && o->osd_addrs->client_addr[i] &&
1457 *n->osd_addrs->client_addr[i] == *o->osd_addrs->client_addr[i])
1458 n->osd_addrs->client_addr[i] = o->osd_addrs->client_addr[i];
1459 else
1460 diff++;
1461 if ( n->osd_addrs->cluster_addr[i] && o->osd_addrs->cluster_addr[i] &&
1462 *n->osd_addrs->cluster_addr[i] == *o->osd_addrs->cluster_addr[i])
1463 n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
1464 else
1465 diff++;
1466 if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
1467 *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
1468 n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
1469 else
1470 diff++;
1471 if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
1472 *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
1473 n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
1474 else
1475 diff++;
1476 }
1477 if (diff == 0) {
1478 // zoinks, no differences at all!
1479 n->osd_addrs = o->osd_addrs;
1480 }
1481
1482 // does crush match?
// compared by encoding both maps with the same feature set and checking
// the resulting buffers for equality
1483 bufferlist oc, nc;
1484 ::encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1485 ::encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1486 if (oc.contents_equal(nc)) {
1487 n->crush = o->crush;
1488 }
1489
1490 // does pg_temp match?
1491 if (*o->pg_temp == *n->pg_temp)
1492 n->pg_temp = o->pg_temp;
1493
1494 // does primary_temp match?
// sizes are compared first, before the element-wise comparison
1495 if (o->primary_temp->size() == n->primary_temp->size()) {
1496 if (*o->primary_temp == *n->primary_temp)
1497 n->primary_temp = o->primary_temp;
1498 }
1499
1500 // do uuids match?
1501 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1502 *o->osd_uuid == *n->osd_uuid)
1503 n->osd_uuid = o->osd_uuid;
1504 }
1505
// Queue removals in pending_inc for pg_temp and primary_temp entries that
// are no longer useful.
//
// A scratch map (osdmap with pending_inc applied) is built first and its
// temp mappings are examined:
//  - pg_temp entries are dropped when their pool does not exist in
//    osdmap, when every listed osd is down, or when they equal the raw up
//    mapping;
//  - primary_temp entries are dropped when the primary is down, or when
//    the primary would be the same without the temp mapping.
// A removal is expressed as an empty vector in new_pg_temp, or -1 in
// new_primary_temp.
1506 void OSDMap::clean_temps(CephContext *cct,
1507 const OSDMap& osdmap, Incremental *pending_inc)
1508 {
1509 ldout(cct, 10) << __func__ << dendl;
// tmpmap is what the map will look like once pending_inc is applied
1510 OSDMap tmpmap;
1511 tmpmap.deepish_copy_from(osdmap);
1512 tmpmap.apply_incremental(*pending_inc);
1513
1514 for (auto pg : *tmpmap.pg_temp) {
1515 // if pool does not exist, remove any existing pg_temps associated with
1516 // it. we don't care about pg_temps on the pending_inc either; if there
1517 // are new_pg_temp entries on the pending, clear them out just as well.
1518 if (!osdmap.have_pg_pool(pg.first.pool())) {
1519 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1520 << " for nonexistent pool " << pg.first.pool() << dendl;
1521 pending_inc->new_pg_temp[pg.first].clear();
1522 continue;
1523 }
1524 // all osds down?
// (the scan stops at the first osd that is not down, so num_up is 0 or 1)
1525 unsigned num_up = 0;
1526 for (auto o : pg.second) {
1527 if (!tmpmap.is_down(o)) {
1528 ++num_up;
1529 break;
1530 }
1531 }
1532 if (num_up == 0) {
1533 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1534 << " with all down osds" << pg.second << dendl;
1535 pending_inc->new_pg_temp[pg.first].clear();
1536 continue;
1537 }
1538 // redundant pg_temp?
1539 vector<int> raw_up;
1540 int primary;
1541 tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
1542 if (vectors_equal(raw_up, pg.second)) {
1543 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1544 << pg.second << " that matches raw_up mapping" << dendl;
// if the committed map already has this entry, queue its removal;
// otherwise it only exists in pending_inc, so just drop it from there
1545 if (osdmap.pg_temp->count(pg.first))
1546 pending_inc->new_pg_temp[pg.first].clear();
1547 else
1548 pending_inc->new_pg_temp.erase(pg.first);
1549 }
1550 }
1551
1552 for (auto &pg : *tmpmap.primary_temp) {
1553 // primary down?
1554 if (tmpmap.is_down(pg.second)) {
1555 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1556 << " to down " << pg.second << dendl;
1557 pending_inc->new_primary_temp[pg.first] = -1;
1558 continue;
1559 }
1560 // redundant primary_temp?
// compare the primary with temp mappings applied (pg_to_acting_osds)
// against the one pg_to_raw_up reports
1561 vector<int> real_up, templess_up;
1562 int real_primary, templess_primary;
1563 pg_t pgid = pg.first;
1564 tmpmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1565 tmpmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
1566 if (real_primary == templess_primary){
1567 ldout(cct, 10) << __func__ << " removing primary_temp "
1568 << pgid << " -> " << real_primary
1569 << " (unnecessary/redundant)" << dendl;
// same committed-vs-pending distinction as for pg_temp above
1570 if (osdmap.primary_temp->count(pgid))
1571 pending_inc->new_primary_temp[pgid] = -1;
1572 else
1573 pending_inc->new_primary_temp.erase(pgid);
1574 }
1575 }
1576 }
1577
// Apply one incremental to this map, advancing it by exactly one epoch.
//
// Returns -EINVAL when inc.fsid does not match this map's fsid (an
// incremental for epoch 1 instead sets the fsid), and 0 otherwise.
// Asserts that inc.epoch == epoch + 1.
//
// If inc carries a full map, that map is decoded in place of applying the
// individual changes.  Otherwise each group of changes is applied in the
// order written below; the new crush map is deliberately handled last.
// The osd counters and the cached up-osd feature set are refreshed at the
// end.
1578 int OSDMap::apply_incremental(const Incremental &inc)
1579 {
1580 new_blacklist_entries = false;
1581 if (inc.epoch == 1)
1582 fsid = inc.fsid;
1583 else if (inc.fsid != fsid)
1584 return -EINVAL;
1585
1586 assert(inc.epoch == epoch+1);
1587
1588 epoch++;
1589 modified = inc.modified;
1590
1591 // full map?
1592 if (inc.fullmap.length()) {
1593 bufferlist bl(inc.fullmap);
1594 decode(bl);
1595 return 0;
1596 }
1597
1598 // nope, incremental.
1599 if (inc.new_flags >= 0) {
1600 flags = inc.new_flags;
1601 // the below is just to cover a newly-upgraded luminous mon
1602 // cluster that has to set require_jewel_osds or
1603 // require_kraken_osds before the osds can be upgraded to
1604 // luminous.
1605 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
1606 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
1607 require_osd_release = CEPH_RELEASE_KRAKEN;
1608 }
1609 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
1610 if (require_osd_release < CEPH_RELEASE_JEWEL) {
1611 require_osd_release = CEPH_RELEASE_JEWEL;
1612 }
1613 }
1614 }
1615
1616 if (inc.new_max_osd >= 0)
1617 set_max_osd(inc.new_max_osd);
1618
1619 if (inc.new_pool_max != -1)
1620 pool_max = inc.new_pool_max;
1621
// new or updated pools are stamped with the epoch being applied
1622 for (const auto &pool : inc.new_pools) {
1623 pools[pool.first] = pool.second;
1624 pools[pool.first].last_change = epoch;
1625 }
1626
// keep pool_name (id -> name) and name_pool (name -> id) in step; on a
// rename the old name is removed from name_pool first
1627 for (const auto &pname : inc.new_pool_names) {
1628 auto pool_name_entry = pool_name.find(pname.first);
1629 if (pool_name_entry != pool_name.end()) {
1630 name_pool.erase(pool_name_entry->second);
1631 pool_name_entry->second = pname.second;
1632 } else {
1633 pool_name[pname.first] = pname.second;
1634 }
1635 name_pool[pname.second] = pname.first;
1636 }
1637
// deleted pools disappear from all three tables
1638 for (const auto &pool : inc.old_pools) {
1639 pools.erase(pool);
1640 name_pool.erase(pool_name[pool]);
1641 pool_name.erase(pool);
1642 }
1643
1644 for (const auto &weight : inc.new_weight) {
1645 set_weight(weight.first, weight.second);
1646
1647 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1648 // xinfo old_weight.
1649 if (weight.second) {
1650 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
1651 osd_xinfo[weight.first].old_weight = 0;
1652 }
1653 }
1654
1655 for (const auto &primary_affinity : inc.new_primary_affinity) {
1656 set_primary_affinity(primary_affinity.first, primary_affinity.second);
1657 }
1658
1659 // erasure_code_profiles
1660 for (const auto &profile : inc.old_erasure_code_profiles)
1661 erasure_code_profiles.erase(profile);
1662
1663 for (const auto &profile : inc.new_erasure_code_profiles) {
1664 set_erasure_code_profile(profile.first, profile.second);
1665 }
1666
1667 // up/down
// each new_state value is a mask that is XORed into the osd's state; a
// value of 0 is treated as CEPH_OSD_UP
1668 for (const auto &state : inc.new_state) {
1669 const auto osd = state.first;
1670 int s = state.second ? state.second : CEPH_OSD_UP;
// UP currently set and UP in the mask: the bit is about to be cleared,
// so record when the osd went down
1671 if ((osd_state[osd] & CEPH_OSD_UP) &&
1672 (s & CEPH_OSD_UP)) {
1673 osd_info[osd].down_at = epoch;
1674 osd_xinfo[osd].down_stamp = modified;
1675 }
// EXISTS currently set and EXISTS in the mask: the osd is being removed,
// so reset every per-osd record instead of just toggling bits
1676 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
1677 (s & CEPH_OSD_EXISTS)) {
1678 // osd is destroyed; clear out anything interesting.
1679 (*osd_uuid)[osd] = uuid_d();
1680 osd_info[osd] = osd_info_t();
1681 osd_xinfo[osd] = osd_xinfo_t();
1682 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1683 osd_addrs->client_addr[osd].reset(new entity_addr_t());
1684 osd_addrs->cluster_addr[osd].reset(new entity_addr_t());
1685 osd_addrs->hb_front_addr[osd].reset(new entity_addr_t());
1686 osd_addrs->hb_back_addr[osd].reset(new entity_addr_t());
1687 osd_state[osd] = 0;
1688 } else {
1689 osd_state[osd] ^= s;
1690 }
1691 }
1692
// osds coming up: mark them EXISTS|UP and record their addresses
1693 for (const auto &client : inc.new_up_client) {
1694 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1695 osd_addrs->client_addr[client.first].reset(new entity_addr_t(client.second));
1696 if (inc.new_hb_back_up.empty())
1697 osd_addrs->hb_back_addr[client.first].reset(new entity_addr_t(client.second)); //this is a backward-compatibility hack
1698 else
// NOTE(review): find() is dereferenced without an end() check; this
// assumes a non-empty new_hb_back_up has an entry for every osd in
// new_up_client -- confirm with the code that builds the incremental.
1699 osd_addrs->hb_back_addr[client.first].reset(
1700 new entity_addr_t(inc.new_hb_back_up.find(client.first)->second));
// the front heartbeat address is optional; clear it when none is given
1701 const auto j = inc.new_hb_front_up.find(client.first);
1702 if (j != inc.new_hb_front_up.end())
1703 osd_addrs->hb_front_addr[client.first].reset(new entity_addr_t(j->second));
1704 else
1705 osd_addrs->hb_front_addr[client.first].reset();
1706
1707 osd_info[client.first].up_from = epoch;
1708 }
1709
1710 for (const auto &cluster : inc.new_up_cluster)
1711 osd_addrs->cluster_addr[cluster.first].reset(new entity_addr_t(cluster.second));
1712
1713 // info
1714 for (const auto &thru : inc.new_up_thru)
1715 osd_info[thru.first].up_thru = thru.second;
1716
1717 for (const auto &interval : inc.new_last_clean_interval) {
1718 osd_info[interval.first].last_clean_begin = interval.second.first;
1719 osd_info[interval.first].last_clean_end = interval.second.second;
1720 }
1721
1722 for (const auto &lost : inc.new_lost)
1723 osd_info[lost.first].lost_at = lost.second;
1724
1725 // xinfo
1726 for (const auto &xinfo : inc.new_xinfo)
1727 osd_xinfo[xinfo.first] = xinfo.second;
1728
1729 // uuid
1730 for (const auto &uuid : inc.new_uuid)
1731 (*osd_uuid)[uuid.first] = uuid.second;
1732
1733 // pg rebuild
// an empty osd list means "remove this pg_temp entry"
1734 for (const auto &pg : inc.new_pg_temp) {
1735 if (pg.second.empty())
1736 pg_temp->erase(pg.first);
1737 else
1738 pg_temp->set(pg.first, pg.second);
1739 }
1740 if (!inc.new_pg_temp.empty()) {
1741 // make sure pg_temp is efficiently stored
1742 pg_temp->rebuild();
1743 }
1744
// a primary of -1 means "remove this primary_temp entry"
1745 for (const auto &pg : inc.new_primary_temp) {
1746 if (pg.second == -1)
1747 primary_temp->erase(pg.first);
1748 else
1749 (*primary_temp)[pg.first] = pg.second;
1750 }
1751
// upmap tables: additions are applied before removals
1752 for (auto& p : inc.new_pg_upmap) {
1753 pg_upmap[p.first] = p.second;
1754 }
1755 for (auto& pg : inc.old_pg_upmap) {
1756 pg_upmap.erase(pg);
1757 }
1758 for (auto& p : inc.new_pg_upmap_items) {
1759 pg_upmap_items[p.first] = p.second;
1760 }
1761 for (auto& pg : inc.old_pg_upmap_items) {
1762 pg_upmap_items.erase(pg);
1763 }
1764
1765 // blacklist
// new_blacklist_entries was reset at the top of this function and is
// raised only when this incremental adds entries
1766 if (!inc.new_blacklist.empty()) {
1767 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
1768 new_blacklist_entries = true;
1769 }
1770 for (const auto &addr : inc.old_blacklist)
1771 blacklist.erase(addr);
1772
1773 // cluster snapshot?
// the snapshot marker is cleared again by any incremental without one
1774 if (inc.cluster_snapshot.length()) {
1775 cluster_snapshot = inc.cluster_snapshot;
1776 cluster_snapshot_epoch = inc.epoch;
1777 } else {
1778 cluster_snapshot.clear();
1779 cluster_snapshot_epoch = 0;
1780 }
1781
// a negative ratio in the incremental leaves the current value in place
1782 if (inc.new_nearfull_ratio >= 0) {
1783 nearfull_ratio = inc.new_nearfull_ratio;
1784 }
1785 if (inc.new_backfillfull_ratio >= 0) {
1786 backfillfull_ratio = inc.new_backfillfull_ratio;
1787 }
1788 if (inc.new_full_ratio >= 0) {
1789 full_ratio = inc.new_full_ratio;
1790 }
1791 if (inc.new_require_min_compat_client > 0) {
1792 require_min_compat_client = inc.new_require_min_compat_client;
1793 }
// once luminous or later is required, the legacy require_* flag bits are
// dropped from flags
1794 if (inc.new_require_osd_release >= 0) {
1795 require_osd_release = inc.new_require_osd_release;
1796 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1797 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
1798 }
1799 }
1800
1801 // do new crush map last (after up/down stuff)
1802 if (inc.crush.length()) {
1803 bufferlist bl(inc.crush);
1804 auto blp = bl.begin();
1805 crush.reset(new CrushWrapper);
1806 crush->decode(blp);
1807 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1808 // only increment if this is a luminous-encoded osdmap, lest
1809 // the mon's crush_version diverge from what the osds or others
1810 // are decoding and applying on their end. if we won't encode
1811 // it in the canonical version, don't change it.
1812 ++crush_version;
1813 }
1814 }
1815
// refresh the derived counters / caches now that all changes are in
1816 calc_num_osds();
1817 _calc_up_osd_features();
1818 return 0;
1819 }
1820
1821 // mapping
1822 int OSDMap::map_to_pg(
1823 int64_t poolid,
1824 const string& name,
1825 const string& key,
1826 const string& nspace,
1827 pg_t *pg) const
1828 {
1829 // calculate ps (placement seed)
1830 const pg_pool_t *pool = get_pg_pool(poolid);
1831 if (!pool)
1832 return -ENOENT;
1833 ps_t ps;
1834 if (!key.empty())
1835 ps = pool->hash_key(key, nspace);
1836 else
1837 ps = pool->hash_key(name, nspace);
1838 *pg = pg_t(ps, poolid);
1839 return 0;
1840 }
1841
1842 int OSDMap::object_locator_to_pg(
1843 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
1844 {
1845 if (loc.hash >= 0) {
1846 if (!get_pg_pool(loc.get_pool())) {
1847 return -ENOENT;
1848 }
1849 pg = pg_t(loc.hash, loc.get_pool());
1850 return 0;
1851 }
1852 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
1853 }
1854
1855 ceph_object_layout OSDMap::make_object_layout(
1856 object_t oid, int pg_pool, string nspace) const
1857 {
1858 object_locator_t loc(pg_pool, nspace);
1859
1860 ceph_object_layout ol;
1861 pg_t pgid = object_locator_to_pg(oid, loc);
1862 ol.ol_pgid = pgid.get_old_pg().v;
1863 ol.ol_stripe_unit = 0;
1864 return ol;
1865 }
1866
1867 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
1868 vector<int>& osds) const
1869 {
1870 if (pool.can_shift_osds()) {
1871 unsigned removed = 0;
1872 for (unsigned i = 0; i < osds.size(); i++) {
1873 if (!exists(osds[i])) {
1874 removed++;
1875 continue;
1876 }
1877 if (removed) {
1878 osds[i - removed] = osds[i];
1879 }
1880 }
1881 if (removed)
1882 osds.resize(osds.size() - removed);
1883 } else {
1884 for (auto& osd : osds) {
1885 if (!exists(osd))
1886 osd = CRUSH_ITEM_NONE;
1887 }
1888 }
1889 }
1890
1891 void OSDMap::_pg_to_raw_osds(
1892 const pg_pool_t& pool, pg_t pg,
1893 vector<int> *osds,
1894 ps_t *ppps) const
1895 {
1896 // map to osds[]
1897 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
1898 unsigned size = pool.get_size();
1899
1900 // what crush rule?
1901 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
1902 if (ruleno >= 0)
1903 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
1904
1905 _remove_nonexistent_osds(pool, *osds);
1906
1907 if (ppps)
1908 *ppps = pps;
1909 }
1910
1911 int OSDMap::_pick_primary(const vector<int>& osds) const
1912 {
1913 for (auto osd : osds) {
1914 if (osd != CRUSH_ITEM_NONE) {
1915 return osd;
1916 }
1917 }
1918 return -1;
1919 }
1920
// Apply explicit upmap overrides for a pg to its raw osd list, in place.
//
// Two tables are consulted, keyed by the pg obtained from
// pi.raw_pg_to_pg(raw_pg):
//  - pg_upmap replaces the whole list, unless one of its targets has
//    weight 0, in which case the function returns without applying either
//    table;
//  - pg_upmap_items then applies individual (from, to) substitutions.
1921 void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
1922 {
1923 pg_t pg = pi.raw_pg_to_pg(raw_pg);
1924 auto p = pg_upmap.find(pg);
1925 if (p != pg_upmap.end()) {
1926 // make sure targets aren't marked out
// (ids of CRUSH_ITEM_NONE or >= max_osd are not looked up in osd_weight)
1927 for (auto osd : p->second) {
1928 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd_weight[osd] == 0) {
1929 // reject/ignore the explicit mapping
1930 return;
1931 }
1932 }
1933 *raw = vector<int>(p->second.begin(), p->second.end());
1934 // continue to check and apply pg_upmap_items if any
1935 }
1936
1937 auto q = pg_upmap_items.find(pg);
1938 if (q != pg_upmap_items.end()) {
1939 // NOTE: this approach does not allow a bidirectional swap,
1940 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
// each pair r is (osd to replace, replacement); pairs are applied in
// order against the list as modified by earlier pairs
1941 for (auto& r : q->second) {
1942 // make sure the replacement value doesn't already appear
1943 bool exists = false;
1944 ssize_t pos = -1;
1945 for (unsigned i = 0; i < raw->size(); ++i) {
1946 int osd = (*raw)[i];
1947 if (osd == r.second) {
1948 exists = true;
1949 break;
1950 }
1951 // ignore mapping if target is marked out (or invalid osd id)
// pos remembers the first position holding r.first; it is only set when
// the replacement is not an in-range osd with weight 0
1952 if (osd == r.first &&
1953 pos < 0 &&
1954 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
1955 osd_weight[r.second] == 0)) {
1956 pos = i;
1957 }
1958 }
// substitute only if the source was found and the target is not already
// in the list
1959 if (!exists && pos >= 0) {
1960 (*raw)[pos] = r.second;
1961 }
1962 }
1963 }
1964 }
1965
1966 // pg -> (up osd list)
1967 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
1968 vector<int> *up) const
1969 {
1970 if (pool.can_shift_osds()) {
1971 // shift left
1972 up->clear();
1973 up->reserve(raw.size());
1974 for (unsigned i=0; i<raw.size(); i++) {
1975 if (!exists(raw[i]) || is_down(raw[i]))
1976 continue;
1977 up->push_back(raw[i]);
1978 }
1979 } else {
1980 // set down/dne devices to NONE
1981 up->resize(raw.size());
1982 for (int i = raw.size() - 1; i >= 0; --i) {
1983 if (!exists(raw[i]) || is_down(raw[i])) {
1984 (*up)[i] = CRUSH_ITEM_NONE;
1985 } else {
1986 (*up)[i] = raw[i];
1987 }
1988 }
1989 }
1990 }
1991
1992 void OSDMap::_apply_primary_affinity(ps_t seed,
1993 const pg_pool_t& pool,
1994 vector<int> *osds,
1995 int *primary) const
1996 {
1997 // do we have any non-default primary_affinity values for these osds?
1998 if (!osd_primary_affinity)
1999 return;
2000
2001 bool any = false;
2002 for (const auto osd : *osds) {
2003 if (osd != CRUSH_ITEM_NONE &&
2004 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2005 any = true;
2006 break;
2007 }
2008 }
2009 if (!any)
2010 return;
2011
2012 // pick the primary. feed both the seed (for the pg) and the osd
2013 // into the hash/rng so that a proportional fraction of an osd's pgs
2014 // get rejected as primary.
2015 int pos = -1;
2016 for (unsigned i = 0; i < osds->size(); ++i) {
2017 int o = (*osds)[i];
2018 if (o == CRUSH_ITEM_NONE)
2019 continue;
2020 unsigned a = (*osd_primary_affinity)[o];
2021 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2022 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2023 seed, o) >> 16) >= a) {
2024 // we chose not to use this primary. note it anyway as a
2025 // fallback in case we don't pick anyone else, but keep looking.
2026 if (pos < 0)
2027 pos = i;
2028 } else {
2029 pos = i;
2030 break;
2031 }
2032 }
2033 if (pos < 0)
2034 return;
2035
2036 *primary = (*osds)[pos];
2037
2038 if (pool.can_shift_osds() && pos > 0) {
2039 // move the new primary to the front.
2040 for (int i = pos; i > 0; --i) {
2041 (*osds)[i] = (*osds)[i-1];
2042 }
2043 (*osds)[0] = *primary;
2044 }
2045 }
2046
2047 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2048 vector<int> *temp_pg, int *temp_primary) const
2049 {
2050 pg = pool.raw_pg_to_pg(pg);
2051 const auto p = pg_temp->find(pg);
2052 temp_pg->clear();
2053 if (p != pg_temp->end()) {
2054 for (unsigned i=0; i<p->second.size(); i++) {
2055 if (!exists(p->second[i]) || is_down(p->second[i])) {
2056 if (pool.can_shift_osds()) {
2057 continue;
2058 } else {
2059 temp_pg->push_back(CRUSH_ITEM_NONE);
2060 }
2061 } else {
2062 temp_pg->push_back(p->second[i]);
2063 }
2064 }
2065 }
2066 const auto &pp = primary_temp->find(pg);
2067 *temp_primary = -1;
2068 if (pp != primary_temp->end()) {
2069 *temp_primary = pp->second;
2070 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2071 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2072 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2073 *temp_primary = (*temp_pg)[i];
2074 break;
2075 }
2076 }
2077 }
2078 }
2079
2080 void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
2081 {
2082 *primary = -1;
2083 raw->clear();
2084 const pg_pool_t *pool = get_pg_pool(pg.pool());
2085 if (!pool)
2086 return;
2087 _pg_to_raw_osds(*pool, pg, raw, NULL);
2088 if (primary)
2089 *primary = _pick_primary(*raw);
2090 }
2091
2092 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2093 {
2094 const pg_pool_t *pool = get_pg_pool(pg.pool());
2095 if (!pool) {
2096 if (primary)
2097 *primary = -1;
2098 if (up)
2099 up->clear();
2100 return;
2101 }
2102 vector<int> raw;
2103 ps_t pps;
2104 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2105 _apply_upmap(*pool, pg, &raw);
2106 _raw_to_up_osds(*pool, raw, up);
2107 *primary = _pick_primary(raw);
2108 _apply_primary_affinity(pps, *pool, up, primary);
2109 }
2110
// Compute the up set, acting set and their primaries for a pg.
//
// Every output pointer is optional; null outputs are skipped.  When the
// pool does not exist, or when raw_pg_to_pg is false and the pg's seed is
// not below the pool's pg_num, the vectors are cleared and the primaries
// are set to -1.
//
// The acting set is the pg_temp mapping when there is one, and the up set
// otherwise.
2111 void OSDMap::_pg_to_up_acting_osds(
2112 const pg_t& pg, vector<int> *up, int *up_primary,
2113 vector<int> *acting, int *acting_primary,
2114 bool raw_pg_to_pg) const
2115 {
2116 const pg_pool_t *pool = get_pg_pool(pg.pool());
2117 if (!pool ||
2118 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2119 if (up)
2120 up->clear();
2121 if (up_primary)
2122 *up_primary = -1;
2123 if (acting)
2124 acting->clear();
2125 if (acting_primary)
2126 *acting_primary = -1;
2127 return;
2128 }
2129 vector<int> raw;
2130 vector<int> _up;
2131 vector<int> _acting;
2132 int _up_primary;
2133 int _acting_primary;
2134 ps_t pps;
// start from the temp overrides; _acting stays empty without a pg_temp
2135 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
// the up set is only computed when it is needed: the caller asked for it,
// or there is no pg_temp and acting has to fall back to it
2136 if (_acting.empty() || up || up_primary) {
2137 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2138 _apply_upmap(*pool, pg, &raw);
2139 _raw_to_up_osds(*pool, raw, &_up);
2140 _up_primary = _pick_primary(_up);
2141 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2142 if (_acting.empty()) {
2143 _acting = _up;
// an acting primary already supplied by _get_temp_osds is kept
2144 if (_acting_primary == -1) {
2145 _acting_primary = _up_primary;
2146 }
2147 }
2148
2149 if (up)
2150 up->swap(_up);
2151 if (up_primary)
2152 *up_primary = _up_primary;
2153 }
2154
2155 if (acting)
2156 acting->swap(_acting);
2157 if (acting_primary)
2158 *acting_primary = _acting_primary;
2159 }
2160
2161 int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
2162 {
2163 if (!nrep)
2164 nrep = acting.size();
2165 for (int i=0; i<nrep; i++)
2166 if (acting[i] == osd)
2167 return i;
2168 return -1;
2169 }
2170
2171 int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2172 {
2173 return calc_pg_rank(osd, acting, nrep);
2174 }
2175
2176 bool OSDMap::primary_changed(
2177 int oldprimary,
2178 const vector<int> &oldacting,
2179 int newprimary,
2180 const vector<int> &newacting)
2181 {
2182 if (oldacting.empty() && newacting.empty())
2183 return false; // both still empty
2184 if (oldacting.empty() ^ newacting.empty())
2185 return true; // was empty, now not, or vice versa
2186 if (oldprimary != newprimary)
2187 return true; // primary changed
2188 if (calc_pg_rank(oldprimary, oldacting) !=
2189 calc_pg_rank(newprimary, newacting))
2190 return true;
2191 return false; // same primary (tho replicas may have changed)
2192 }
2193
2194
2195 // serialize, unserialize
// Append the version-5 client encoding of this map to bl.
//
// Several containers are encoded by hand rather than through ::encode on
// the container: pool ids and pool_max are narrowed to __u32, and pg ids
// are written in their old_pg_t form.  Pools, client addresses and the
// crush map are encoded with a feature set of 0.
// The order of the fields below is the wire format; do not reorder.
2196 void OSDMap::encode_client_old(bufferlist& bl) const
2197 {
2198 __u16 v = 5;
2199 ::encode(v, bl);
2200
2201 // base
2202 ::encode(fsid, bl);
2203 ::encode(epoch, bl);
2204 ::encode(created, bl);
2205 ::encode(modified, bl);
2206
2207 // for ::encode(pools, bl);
// (element count followed by each key/value, with the key as __u32)
2208 __u32 n = pools.size();
2209 ::encode(n, bl);
2210
2211 for (const auto &pool : pools) {
2212 n = pool.first;
2213 ::encode(n, bl);
2214 ::encode(pool.second, bl, 0);
2215 }
2216 // for ::encode(pool_name, bl);
2217 n = pool_name.size();
2218 ::encode(n, bl);
2219 for (const auto &pname : pool_name) {
2220 n = pname.first;
2221 ::encode(n, bl);
2222 ::encode(pname.second, bl);
2223 }
2224 // for ::encode(pool_max, bl);
2225 n = pool_max;
2226 ::encode(n, bl);
2227
2228 ::encode(flags, bl);
2229
2230 ::encode(max_osd, bl);
// osd_state is written as a count followed by one byte per osd, so only
// the low 8 bits of each state value are encoded
2231 {
2232 uint32_t n = osd_state.size();
2233 ::encode(n, bl);
2234 for (auto s : osd_state) {
2235 ::encode((uint8_t)s, bl);
2236 }
2237 }
2238 ::encode(osd_weight, bl);
2239 ::encode(osd_addrs->client_addr, bl, 0);
2240
2241 // for ::encode(pg_temp, bl);
2242 n = pg_temp->size();
2243 ::encode(n, bl);
2244 for (const auto pg : *pg_temp) {
2245 old_pg_t opg = pg.first.get_old_pg();
2246 ::encode(opg, bl);
2247 ::encode(pg.second, bl);
2248 }
2249
2250 // crush
// the crush map is encoded into its own buffer, which is then encoded
// into bl as a single value
2251 bufferlist cbl;
2252 crush->encode(cbl, 0 /* legacy (no) features */);
2253 ::encode(cbl, bl);
2254 }
2255
2256 void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2257 {
2258 if ((features & CEPH_FEATURE_PGID64) == 0) {
2259 encode_client_old(bl);
2260 return;
2261 }
2262
2263 __u16 v = 6;
2264 ::encode(v, bl);
2265
2266 // base
2267 ::encode(fsid, bl);
2268 ::encode(epoch, bl);
2269 ::encode(created, bl);
2270 ::encode(modified, bl);
2271
2272 ::encode(pools, bl, features);
2273 ::encode(pool_name, bl);
2274 ::encode(pool_max, bl);
2275
2276 ::encode(flags, bl);
2277
2278 ::encode(max_osd, bl);
2279 {
2280 uint32_t n = osd_state.size();
2281 ::encode(n, bl);
2282 for (auto s : osd_state) {
2283 ::encode((uint8_t)s, bl);
2284 }
2285 }
2286 ::encode(osd_weight, bl);
2287 ::encode(osd_addrs->client_addr, bl, features);
2288
2289 ::encode(*pg_temp, bl);
2290
2291 // crush
2292 bufferlist cbl;
2293 crush->encode(cbl, 0 /* legacy (no) features */);
2294 ::encode(cbl, bl);
2295
2296 // extended
2297 __u16 ev = 10;
2298 ::encode(ev, bl);
2299 ::encode(osd_addrs->hb_back_addr, bl, features);
2300 ::encode(osd_info, bl);
2301 ::encode(blacklist, bl, features);
2302 ::encode(osd_addrs->cluster_addr, bl, features);
2303 ::encode(cluster_snapshot_epoch, bl);
2304 ::encode(cluster_snapshot, bl);
2305 ::encode(*osd_uuid, bl);
2306 ::encode(osd_xinfo, bl);
2307 ::encode(osd_addrs->hb_front_addr, bl, features);
2308 }
2309
2310 void OSDMap::encode(bufferlist& bl, uint64_t features) const
2311 {
2312 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2313 encode_classic(bl, features);
2314 return;
2315 }
2316
2317 // only a select set of callers should *ever* be encoding new
2318 // OSDMaps. others should be passing around the canonical encoded
2319 // buffers from on high. select out those callers by passing in an
2320 // "impossible" feature bit.
2321 assert(features & CEPH_FEATURE_RESERVED);
2322 features &= ~CEPH_FEATURE_RESERVED;
2323
2324 size_t start_offset = bl.length();
2325 size_t tail_offset;
2326 buffer::list::iterator crc_it;
2327
2328 // meta-encoding: how we include client-used and osd-specific data
2329 ENCODE_START(8, 7, bl);
2330
2331 {
2332 uint8_t v = 6;
2333 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2334 v = 3;
2335 }
2336 ENCODE_START(v, 1, bl); // client-usable data
2337 // base
2338 ::encode(fsid, bl);
2339 ::encode(epoch, bl);
2340 ::encode(created, bl);
2341 ::encode(modified, bl);
2342
2343 ::encode(pools, bl, features);
2344 ::encode(pool_name, bl);
2345 ::encode(pool_max, bl);
2346
2347 if (v < 4) {
2348 decltype(flags) f = flags;
2349 if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
2350 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS;
2351 else if (require_osd_release == CEPH_RELEASE_KRAKEN)
2352 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
2353 else if (require_osd_release == CEPH_RELEASE_JEWEL)
2354 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
2355 ::encode(f, bl);
2356 } else {
2357 ::encode(flags, bl);
2358 }
2359
2360 ::encode(max_osd, bl);
2361 if (v >= 5) {
2362 ::encode(osd_state, bl);
2363 } else {
2364 uint32_t n = osd_state.size();
2365 ::encode(n, bl);
2366 for (auto s : osd_state) {
2367 ::encode((uint8_t)s, bl);
2368 }
2369 }
2370 ::encode(osd_weight, bl);
2371 ::encode(osd_addrs->client_addr, bl, features);
2372
2373 ::encode(*pg_temp, bl);
2374 ::encode(*primary_temp, bl);
2375 if (osd_primary_affinity) {
2376 ::encode(*osd_primary_affinity, bl);
2377 } else {
2378 vector<__u32> v;
2379 ::encode(v, bl);
2380 }
2381
2382 // crush
2383 bufferlist cbl;
2384 crush->encode(cbl, features);
2385 ::encode(cbl, bl);
2386 ::encode(erasure_code_profiles, bl);
2387
2388 if (v >= 4) {
2389 ::encode(pg_upmap, bl);
2390 ::encode(pg_upmap_items, bl);
2391 } else {
2392 assert(pg_upmap.empty());
2393 assert(pg_upmap_items.empty());
2394 }
2395 if (v >= 6) {
2396 ::encode(crush_version, bl);
2397 }
2398 ENCODE_FINISH(bl); // client-usable data
2399 }
2400
2401 {
2402 uint8_t target_v = 5;
2403 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2404 target_v = 1;
2405 }
2406 ENCODE_START(target_v, 1, bl); // extended, osd-only data
2407 ::encode(osd_addrs->hb_back_addr, bl, features);
2408 ::encode(osd_info, bl);
2409 {
2410 // put this in a sorted, ordered map<> so that we encode in a
2411 // deterministic order.
2412 map<entity_addr_t,utime_t> blacklist_map;
2413 for (const auto &addr : blacklist)
2414 blacklist_map.insert(make_pair(addr.first, addr.second));
2415 ::encode(blacklist_map, bl, features);
2416 }
2417 ::encode(osd_addrs->cluster_addr, bl, features);
2418 ::encode(cluster_snapshot_epoch, bl);
2419 ::encode(cluster_snapshot, bl);
2420 ::encode(*osd_uuid, bl);
2421 ::encode(osd_xinfo, bl);
2422 ::encode(osd_addrs->hb_front_addr, bl, features);
2423 if (target_v >= 2) {
2424 ::encode(nearfull_ratio, bl);
2425 ::encode(full_ratio, bl);
2426 ::encode(backfillfull_ratio, bl);
2427 }
2428 // 4 was string-based new_require_min_compat_client
2429 if (target_v >= 5) {
2430 ::encode(require_min_compat_client, bl);
2431 ::encode(require_osd_release, bl);
2432 }
2433 ENCODE_FINISH(bl); // osd-only data
2434 }
2435
2436 ::encode((uint32_t)0, bl); // dummy crc
2437 crc_it = bl.end();
2438 crc_it.advance(-4);
2439 tail_offset = bl.length();
2440
2441 ENCODE_FINISH(bl); // meta-encoding wrapper
2442
2443 // fill in crc
2444 bufferlist front;
2445 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
2446 crc = front.crc32c(-1);
2447 if (tail_offset < bl.length()) {
2448 bufferlist tail;
2449 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2450 crc = tail.crc32c(crc);
2451 }
2452 ceph_le32 crc_le;
2453 crc_le = crc;
2454 crc_it.copy_in(4, (char*)&crc_le);
2455 crc_defined = true;
2456 }
2457
2458 void OSDMap::decode(bufferlist& bl)
2459 {
2460 auto p = bl.begin();
2461 decode(p);
2462 }
2463
2464 void OSDMap::decode_classic(bufferlist::iterator& p)
2465 {
2466 __u32 n, t;
2467 __u16 v;
2468 ::decode(v, p);
2469
2470 // base
2471 ::decode(fsid, p);
2472 ::decode(epoch, p);
2473 ::decode(created, p);
2474 ::decode(modified, p);
2475
2476 if (v < 6) {
2477 if (v < 4) {
2478 int32_t max_pools = 0;
2479 ::decode(max_pools, p);
2480 pool_max = max_pools;
2481 }
2482 pools.clear();
2483 ::decode(n, p);
2484 while (n--) {
2485 ::decode(t, p);
2486 ::decode(pools[t], p);
2487 }
2488 if (v == 4) {
2489 ::decode(n, p);
2490 pool_max = n;
2491 } else if (v == 5) {
2492 pool_name.clear();
2493 ::decode(n, p);
2494 while (n--) {
2495 ::decode(t, p);
2496 ::decode(pool_name[t], p);
2497 }
2498 ::decode(n, p);
2499 pool_max = n;
2500 }
2501 } else {
2502 ::decode(pools, p);
2503 ::decode(pool_name, p);
2504 ::decode(pool_max, p);
2505 }
2506 // kludge around some old bug that zeroed out pool_max (#2307)
2507 if (pools.size() && pool_max < pools.rbegin()->first) {
2508 pool_max = pools.rbegin()->first;
2509 }
2510
2511 ::decode(flags, p);
2512
2513 ::decode(max_osd, p);
2514 {
2515 vector<uint8_t> os;
2516 ::decode(os, p);
2517 osd_state.resize(os.size());
2518 for (unsigned i = 0; i < os.size(); ++i) {
2519 osd_state[i] = os[i];
2520 }
2521 }
2522 ::decode(osd_weight, p);
2523 ::decode(osd_addrs->client_addr, p);
2524 if (v <= 5) {
2525 pg_temp->clear();
2526 ::decode(n, p);
2527 while (n--) {
2528 old_pg_t opg;
2529 ::decode_raw(opg, p);
2530 mempool::osdmap::vector<int32_t> v;
2531 ::decode(v, p);
2532 pg_temp->set(pg_t(opg), v);
2533 }
2534 } else {
2535 ::decode(*pg_temp, p);
2536 }
2537
2538 // crush
2539 bufferlist cbl;
2540 ::decode(cbl, p);
2541 auto cblp = cbl.begin();
2542 crush->decode(cblp);
2543
2544 // extended
2545 __u16 ev = 0;
2546 if (v >= 5)
2547 ::decode(ev, p);
2548 ::decode(osd_addrs->hb_back_addr, p);
2549 ::decode(osd_info, p);
2550 if (v < 5)
2551 ::decode(pool_name, p);
2552
2553 ::decode(blacklist, p);
2554 if (ev >= 6)
2555 ::decode(osd_addrs->cluster_addr, p);
2556 else
2557 osd_addrs->cluster_addr.resize(osd_addrs->client_addr.size());
2558
2559 if (ev >= 7) {
2560 ::decode(cluster_snapshot_epoch, p);
2561 ::decode(cluster_snapshot, p);
2562 }
2563
2564 if (ev >= 8) {
2565 ::decode(*osd_uuid, p);
2566 } else {
2567 osd_uuid->resize(max_osd);
2568 }
2569 if (ev >= 9)
2570 ::decode(osd_xinfo, p);
2571 else
2572 osd_xinfo.resize(max_osd);
2573
2574 if (ev >= 10)
2575 ::decode(osd_addrs->hb_front_addr, p);
2576 else
2577 osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
2578
2579 osd_primary_affinity.reset();
2580
2581 post_decode();
2582 }
2583
2584 void OSDMap::decode(bufferlist::iterator& bl)
2585 {
2586 /**
2587 * Older encodings of the OSDMap had a single struct_v which
2588 * covered the whole encoding, and was prior to our modern
2589 * stuff which includes a compatv and a size. So if we see
2590 * a struct_v < 7, we must rewind to the beginning and use our
2591 * classic decoder.
2592 */
2593 size_t start_offset = bl.get_off();
2594 size_t tail_offset = 0;
2595 bufferlist crc_front, crc_tail;
2596
2597 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
2598 if (struct_v < 7) {
2599 int struct_v_size = sizeof(struct_v);
2600 bl.advance(-struct_v_size);
2601 decode_classic(bl);
2602 return;
2603 }
2604 /**
2605 * Since we made it past that hurdle, we can use our normal paths.
2606 */
2607 {
2608 DECODE_START(6, bl); // client-usable data
2609 // base
2610 ::decode(fsid, bl);
2611 ::decode(epoch, bl);
2612 ::decode(created, bl);
2613 ::decode(modified, bl);
2614
2615 ::decode(pools, bl);
2616 ::decode(pool_name, bl);
2617 ::decode(pool_max, bl);
2618
2619 ::decode(flags, bl);
2620
2621 ::decode(max_osd, bl);
2622 if (struct_v >= 5) {
2623 ::decode(osd_state, bl);
2624 } else {
2625 vector<uint8_t> os;
2626 ::decode(os, bl);
2627 osd_state.resize(os.size());
2628 for (unsigned i = 0; i < os.size(); ++i) {
2629 osd_state[i] = os[i];
2630 }
2631 }
2632 ::decode(osd_weight, bl);
2633 ::decode(osd_addrs->client_addr, bl);
2634
2635 ::decode(*pg_temp, bl);
2636 ::decode(*primary_temp, bl);
2637 if (struct_v >= 2) {
2638 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
2639 ::decode(*osd_primary_affinity, bl);
2640 if (osd_primary_affinity->empty())
2641 osd_primary_affinity.reset();
2642 } else {
2643 osd_primary_affinity.reset();
2644 }
2645
2646 // crush
2647 bufferlist cbl;
2648 ::decode(cbl, bl);
2649 auto cblp = cbl.begin();
2650 crush->decode(cblp);
2651 if (struct_v >= 3) {
2652 ::decode(erasure_code_profiles, bl);
2653 } else {
2654 erasure_code_profiles.clear();
2655 }
2656 if (struct_v >= 4) {
2657 ::decode(pg_upmap, bl);
2658 ::decode(pg_upmap_items, bl);
2659 } else {
2660 pg_upmap.clear();
2661 pg_upmap_items.clear();
2662 }
2663 if (struct_v >= 6) {
2664 ::decode(crush_version, bl);
2665 }
2666 DECODE_FINISH(bl); // client-usable data
2667 }
2668
2669 {
2670 DECODE_START(5, bl); // extended, osd-only data
2671 ::decode(osd_addrs->hb_back_addr, bl);
2672 ::decode(osd_info, bl);
2673 ::decode(blacklist, bl);
2674 ::decode(osd_addrs->cluster_addr, bl);
2675 ::decode(cluster_snapshot_epoch, bl);
2676 ::decode(cluster_snapshot, bl);
2677 ::decode(*osd_uuid, bl);
2678 ::decode(osd_xinfo, bl);
2679 ::decode(osd_addrs->hb_front_addr, bl);
2680 if (struct_v >= 2) {
2681 ::decode(nearfull_ratio, bl);
2682 ::decode(full_ratio, bl);
2683 } else {
2684 nearfull_ratio = 0;
2685 full_ratio = 0;
2686 }
2687 if (struct_v >= 3) {
2688 ::decode(backfillfull_ratio, bl);
2689 } else {
2690 backfillfull_ratio = 0;
2691 }
2692 if (struct_v == 4) {
2693 string r;
2694 ::decode(r, bl);
2695 if (r.length())
2696 require_min_compat_client = ceph_release_from_name(r.c_str());
2697 }
2698 if (struct_v >= 5) {
2699 ::decode(require_min_compat_client, bl);
2700 ::decode(require_osd_release, bl);
2701 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2702 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2703 }
2704 } else {
2705 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
2706 // only for compat with post-kraken pre-luminous test clusters
2707 require_osd_release = CEPH_RELEASE_LUMINOUS;
2708 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2709 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2710 require_osd_release = CEPH_RELEASE_KRAKEN;
2711 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2712 require_osd_release = CEPH_RELEASE_JEWEL;
2713 } else {
2714 require_osd_release = 0;
2715 }
2716 }
2717 DECODE_FINISH(bl); // osd-only data
2718 }
2719
2720 if (struct_v >= 8) {
2721 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
2722 ::decode(crc, bl);
2723 tail_offset = bl.get_off();
2724 crc_defined = true;
2725 } else {
2726 crc_defined = false;
2727 crc = 0;
2728 }
2729
2730 DECODE_FINISH(bl); // wrapper
2731
2732 if (tail_offset) {
2733 // verify crc
2734 uint32_t actual = crc_front.crc32c(-1);
2735 if (tail_offset < bl.get_off()) {
2736 bufferlist tail;
2737 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
2738 actual = tail.crc32c(actual);
2739 }
2740 if (crc != actual) {
2741 ostringstream ss;
2742 ss << "bad crc, actual " << actual << " != expected " << crc;
2743 string s = ss.str();
2744 throw buffer::malformed_input(s.c_str());
2745 }
2746 }
2747
2748 post_decode();
2749 }
2750
2751 void OSDMap::post_decode()
2752 {
2753 // index pool names
2754 name_pool.clear();
2755 for (const auto &pname : pool_name) {
2756 name_pool[pname.second] = pname.first;
2757 }
2758
2759 calc_num_osds();
2760 _calc_up_osd_features();
2761 }
2762
2763 void OSDMap::dump_erasure_code_profiles(
2764 const mempool::osdmap::map<string,map<string,string>>& profiles,
2765 Formatter *f)
2766 {
2767 f->open_object_section("erasure_code_profiles");
2768 for (const auto &profile : profiles) {
2769 f->open_object_section(profile.first.c_str());
2770 for (const auto &profm : profile.second) {
2771 f->dump_string(profm.first.c_str(), profm.second.c_str());
2772 }
2773 f->close_section();
2774 }
2775 f->close_section();
2776 }
2777
2778 void OSDMap::dump(Formatter *f) const
2779 {
2780 f->dump_int("epoch", get_epoch());
2781 f->dump_stream("fsid") << get_fsid();
2782 f->dump_stream("created") << get_created();
2783 f->dump_stream("modified") << get_modified();
2784 f->dump_string("flags", get_flag_string());
2785 f->dump_unsigned("crush_version", get_crush_version());
2786 f->dump_float("full_ratio", full_ratio);
2787 f->dump_float("backfillfull_ratio", backfillfull_ratio);
2788 f->dump_float("nearfull_ratio", nearfull_ratio);
2789 f->dump_string("cluster_snapshot", get_cluster_snapshot());
2790 f->dump_int("pool_max", get_pool_max());
2791 f->dump_int("max_osd", get_max_osd());
2792 f->dump_string("require_min_compat_client",
2793 ceph_release_name(require_min_compat_client));
2794 f->dump_string("min_compat_client",
2795 ceph_release_name(get_min_compat_client()));
2796 f->dump_string("require_osd_release",
2797 ceph_release_name(require_osd_release));
2798
2799 f->open_array_section("pools");
2800 for (const auto &pool : pools) {
2801 std::string name("<unknown>");
2802 const auto &pni = pool_name.find(pool.first);
2803 if (pni != pool_name.end())
2804 name = pni->second;
2805 f->open_object_section("pool");
2806 f->dump_int("pool", pool.first);
2807 f->dump_string("pool_name", name);
2808 pool.second.dump(f);
2809 f->close_section();
2810 }
2811 f->close_section();
2812
2813 f->open_array_section("osds");
2814 for (int i=0; i<get_max_osd(); i++)
2815 if (exists(i)) {
2816 f->open_object_section("osd_info");
2817 f->dump_int("osd", i);
2818 f->dump_stream("uuid") << get_uuid(i);
2819 f->dump_int("up", is_up(i));
2820 f->dump_int("in", is_in(i));
2821 f->dump_float("weight", get_weightf(i));
2822 f->dump_float("primary_affinity", get_primary_affinityf(i));
2823 get_info(i).dump(f);
2824 f->dump_stream("public_addr") << get_addr(i);
2825 f->dump_stream("cluster_addr") << get_cluster_addr(i);
2826 f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
2827 f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
2828
2829 set<string> st;
2830 get_state(i, st);
2831 f->open_array_section("state");
2832 for (const auto &state : st)
2833 f->dump_string("state", state);
2834 f->close_section();
2835
2836 f->close_section();
2837 }
2838 f->close_section();
2839
2840 f->open_array_section("osd_xinfo");
2841 for (int i=0; i<get_max_osd(); i++) {
2842 if (exists(i)) {
2843 f->open_object_section("xinfo");
2844 f->dump_int("osd", i);
2845 osd_xinfo[i].dump(f);
2846 f->close_section();
2847 }
2848 }
2849 f->close_section();
2850
2851 f->open_array_section("pg_upmap");
2852 for (auto& p : pg_upmap) {
2853 f->open_object_section("mapping");
2854 f->dump_stream("pgid") << p.first;
2855 f->open_array_section("osds");
2856 for (auto q : p.second) {
2857 f->dump_int("osd", q);
2858 }
2859 f->close_section();
2860 f->close_section();
2861 }
2862 f->close_section();
2863 f->open_array_section("pg_upmap_items");
2864 for (auto& p : pg_upmap_items) {
2865 f->open_object_section("mapping");
2866 f->dump_stream("pgid") << p.first;
2867 f->open_array_section("mappings");
2868 for (auto& q : p.second) {
2869 f->open_object_section("mapping");
2870 f->dump_int("from", q.first);
2871 f->dump_int("to", q.second);
2872 f->close_section();
2873 }
2874 f->close_section();
2875 f->close_section();
2876 }
2877 f->close_section();
2878 f->open_array_section("pg_temp");
2879 pg_temp->dump(f);
2880 f->close_section();
2881
2882 f->open_array_section("primary_temp");
2883 for (const auto &pg : *primary_temp) {
2884 f->dump_stream("pgid") << pg.first;
2885 f->dump_int("osd", pg.second);
2886 }
2887 f->close_section(); // primary_temp
2888
2889 f->open_object_section("blacklist");
2890 for (const auto &addr : blacklist) {
2891 stringstream ss;
2892 ss << addr.first;
2893 f->dump_stream(ss.str().c_str()) << addr.second;
2894 }
2895 f->close_section();
2896
2897 dump_erasure_code_profiles(erasure_code_profiles, f);
2898 }
2899
2900 void OSDMap::generate_test_instances(list<OSDMap*>& o)
2901 {
2902 o.push_back(new OSDMap);
2903
2904 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
2905 o.push_back(new OSDMap);
2906 uuid_d fsid;
2907 o.back()->build_simple(cct, 1, fsid, 16);
2908 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
2909 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
2910 cct->put();
2911 }
2912
2913 string OSDMap::get_flag_string(unsigned f)
2914 {
2915 string s;
2916 if ( f& CEPH_OSDMAP_NEARFULL)
2917 s += ",nearfull";
2918 if (f & CEPH_OSDMAP_FULL)
2919 s += ",full";
2920 if (f & CEPH_OSDMAP_PAUSERD)
2921 s += ",pauserd";
2922 if (f & CEPH_OSDMAP_PAUSEWR)
2923 s += ",pausewr";
2924 if (f & CEPH_OSDMAP_PAUSEREC)
2925 s += ",pauserec";
2926 if (f & CEPH_OSDMAP_NOUP)
2927 s += ",noup";
2928 if (f & CEPH_OSDMAP_NODOWN)
2929 s += ",nodown";
2930 if (f & CEPH_OSDMAP_NOOUT)
2931 s += ",noout";
2932 if (f & CEPH_OSDMAP_NOIN)
2933 s += ",noin";
2934 if (f & CEPH_OSDMAP_NOBACKFILL)
2935 s += ",nobackfill";
2936 if (f & CEPH_OSDMAP_NOREBALANCE)
2937 s += ",norebalance";
2938 if (f & CEPH_OSDMAP_NORECOVER)
2939 s += ",norecover";
2940 if (f & CEPH_OSDMAP_NOSCRUB)
2941 s += ",noscrub";
2942 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
2943 s += ",nodeep-scrub";
2944 if (f & CEPH_OSDMAP_NOTIERAGENT)
2945 s += ",notieragent";
2946 if (f & CEPH_OSDMAP_SORTBITWISE)
2947 s += ",sortbitwise";
2948 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
2949 s += ",require_jewel_osds";
2950 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
2951 s += ",require_kraken_osds";
2952 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
2953 s += ",require_luminous_osds";
2954 if (s.length())
2955 s.erase(0, 1);
2956 return s;
2957 }
2958
2959 string OSDMap::get_flag_string() const
2960 {
2961 return get_flag_string(flags);
2962 }
2963
// Simple (item, depth, weight) record; all fields default to zero.
struct qi {
  int item = 0;
  int depth = 0;
  float weight = 0;

  qi() = default;
  qi(int i, int d, float w)
    : item(i), depth(d), weight(w)
  {}
};
2971
2972 void OSDMap::print_pools(ostream& out) const
2973 {
2974 for (const auto &pool : pools) {
2975 std::string name("<unknown>");
2976 const auto &pni = pool_name.find(pool.first);
2977 if (pni != pool_name.end())
2978 name = pni->second;
2979 out << "pool " << pool.first
2980 << " '" << name
2981 << "' " << pool.second << "\n";
2982
2983 for (const auto &snap : pool.second.snaps)
2984 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
2985
2986 if (!pool.second.removed_snaps.empty())
2987 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
2988 }
2989 out << std::endl;
2990 }
2991
2992 void OSDMap::print(ostream& out) const
2993 {
2994 out << "epoch " << get_epoch() << "\n"
2995 << "fsid " << get_fsid() << "\n"
2996 << "created " << get_created() << "\n"
2997 << "modified " << get_modified() << "\n";
2998
2999 out << "flags " << get_flag_string() << "\n";
3000 out << "crush_version " << get_crush_version() << "\n";
3001 out << "full_ratio " << full_ratio << "\n";
3002 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
3003 out << "nearfull_ratio " << nearfull_ratio << "\n";
3004 if (require_min_compat_client > 0) {
3005 out << "require_min_compat_client "
3006 << ceph_release_name(require_min_compat_client) << "\n";
3007 }
3008 out << "min_compat_client " << ceph_release_name(get_min_compat_client())
3009 << "\n";
3010 if (require_osd_release > 0) {
3011 out << "require_osd_release " << ceph_release_name(require_osd_release)
3012 << "\n";
3013 }
3014 if (get_cluster_snapshot().length())
3015 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3016 out << "\n";
3017
3018 print_pools(out);
3019
3020 out << "max_osd " << get_max_osd() << "\n";
3021 for (int i=0; i<get_max_osd(); i++) {
3022 if (exists(i)) {
3023 out << "osd." << i;
3024 out << (is_up(i) ? " up ":" down");
3025 out << (is_in(i) ? " in ":" out");
3026 out << " weight " << get_weightf(i);
3027 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
3028 out << " primary_affinity " << get_primary_affinityf(i);
3029 const osd_info_t& info(get_info(i));
3030 out << " " << info;
3031 out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
3032 << " " << get_hb_front_addr(i);
3033 set<string> st;
3034 get_state(i, st);
3035 out << " " << st;
3036 if (!get_uuid(i).is_zero())
3037 out << " " << get_uuid(i);
3038 out << "\n";
3039 }
3040 }
3041 out << std::endl;
3042
3043 for (auto& p : pg_upmap) {
3044 out << "pg_upmap " << p.first << " " << p.second << "\n";
3045 }
3046 for (auto& p : pg_upmap_items) {
3047 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3048 }
3049
3050 for (const auto pg : *pg_temp)
3051 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3052
3053 for (const auto pg : *primary_temp)
3054 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3055
3056 for (const auto &addr : blacklist)
3057 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
3058
3059 // ignore pg_swap_primary
3060 }
3061
3062 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3063 public:
3064 typedef CrushTreeDumper::Dumper<TextTable> Parent;
3065
3066 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3067 unsigned f)
3068 : Parent(crush), osdmap(osdmap_), filter(f) { }
3069
3070 bool should_dump_leaf(int i) const override {
3071 if (((filter & OSDMap::DUMP_UP) && !osdmap->is_up(i)) ||
3072 ((filter & OSDMap::DUMP_DOWN) && !osdmap->is_down(i)) ||
3073 ((filter & OSDMap::DUMP_IN) && !osdmap->is_in(i)) ||
3074 ((filter & OSDMap::DUMP_OUT) && !osdmap->is_out(i))) {
3075 return false;
3076 }
3077 return true;
3078 }
3079
3080 bool should_dump_empty_bucket() const override {
3081 return !filter;
3082 }
3083
3084 void dump(TextTable *tbl) {
3085 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
3086 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
3087 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3088 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
3089 tbl->define_column("UP/DOWN", TextTable::LEFT, TextTable::RIGHT);
3090 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
3091 tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
3092
3093 Parent::dump(tbl);
3094
3095 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3096 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3097 dump_item(CrushTreeDumper::Item(i, 0, 0), tbl);
3098 }
3099 }
3100 }
3101
3102 protected:
3103 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
3104 const char *c = crush->get_item_class(qi.id);
3105 if (!c)
3106 c = "";
3107 *tbl << qi.id
3108 << c
3109 << weightf_t(qi.weight);
3110
3111 ostringstream name;
3112 for (int k = 0; k < qi.depth; k++)
3113 name << " ";
3114 if (qi.is_bucket()) {
3115 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3116 << crush->get_item_name(qi.id);
3117 } else {
3118 name << "osd." << qi.id;
3119 }
3120 *tbl << name.str();
3121
3122 if (!qi.is_bucket()) {
3123 if (!osdmap->exists(qi.id)) {
3124 *tbl << "DNE"
3125 << 0;
3126 } else {
3127 *tbl << (osdmap->is_up(qi.id) ? "up" : "down")
3128 << weightf_t(osdmap->get_weightf(qi.id))
3129 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3130 }
3131 }
3132 *tbl << TextTable::endrow;
3133 }
3134
3135 private:
3136 const OSDMap *osdmap;
3137 const unsigned filter;
3138 };
3139
3140 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3141 public:
3142 typedef CrushTreeDumper::FormattingDumper Parent;
3143
3144 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3145 unsigned f)
3146 : Parent(crush), osdmap(osdmap_), filter(f) { }
3147
3148 bool should_dump_leaf(int i) const override {
3149 if (((filter & OSDMap::DUMP_UP) && !osdmap->is_up(i)) ||
3150 ((filter & OSDMap::DUMP_DOWN) && !osdmap->is_down(i)) ||
3151 ((filter & OSDMap::DUMP_IN) && !osdmap->is_in(i)) ||
3152 ((filter & OSDMap::DUMP_OUT) && !osdmap->is_out(i))) {
3153 return false;
3154 }
3155 return true;
3156 }
3157
3158 bool should_dump_empty_bucket() const override {
3159 return !filter;
3160 }
3161
3162 void dump(Formatter *f) {
3163 f->open_array_section("nodes");
3164 Parent::dump(f);
3165 f->close_section();
3166 f->open_array_section("stray");
3167 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3168 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
3169 dump_item(CrushTreeDumper::Item(i, 0, 0), f);
3170 }
3171 f->close_section();
3172 }
3173
3174 protected:
3175 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3176 Parent::dump_item_fields(qi, f);
3177 if (!qi.is_bucket())
3178 {
3179 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
3180 f->dump_string("status", osdmap->is_up(qi.id) ? "up" : "down");
3181 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3182 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3183 }
3184 }
3185
3186 private:
3187 const OSDMap *osdmap;
3188 const unsigned filter;
3189 };
3190
3191 void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter) const
3192 {
3193 if (f) {
3194 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f);
3195 } else {
3196 assert(out);
3197 TextTable tbl;
3198 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl);
3199 *out << tbl;
3200 }
3201 }
3202
3203 void OSDMap::print_summary(Formatter *f, ostream& out,
3204 const string& prefix) const
3205 {
3206 if (f) {
3207 f->open_object_section("osdmap");
3208 f->dump_int("epoch", get_epoch());
3209 f->dump_int("num_osds", get_num_osds());
3210 f->dump_int("num_up_osds", get_num_up_osds());
3211 f->dump_int("num_in_osds", get_num_in_osds());
3212 f->dump_bool("full", test_flag(CEPH_OSDMAP_FULL) ? true : false);
3213 f->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ? true : false);
3214 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3215 f->close_section();
3216 } else {
3217 out << get_num_osds() << " osds: "
3218 << get_num_up_osds() << " up, "
3219 << get_num_in_osds() << " in";
3220 if (get_num_pg_temp())
3221 out << "; " << get_num_pg_temp() << " remapped pgs";
3222 out << "\n";
3223 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
3224 if (important_flags)
3225 out << prefix << "flags " << get_flag_string(important_flags) << "\n";
3226 }
3227 }
3228
3229 void OSDMap::print_oneline_summary(ostream& out) const
3230 {
3231 out << "e" << get_epoch() << ": "
3232 << get_num_osds() << " total, "
3233 << get_num_up_osds() << " up, "
3234 << get_num_in_osds() << " in";
3235 if (test_flag(CEPH_OSDMAP_FULL))
3236 out << " full";
3237 else if (test_flag(CEPH_OSDMAP_NEARFULL))
3238 out << " nearfull";
3239 }
3240
3241 bool OSDMap::crush_ruleset_in_use(int ruleset) const
3242 {
3243 for (const auto &pool : pools) {
3244 if (pool.second.crush_rule == ruleset)
3245 return true;
3246 }
3247 return false;
3248 }
3249
3250 int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
3251 int nosd, int pg_bits, int pgp_bits,
3252 bool default_pool)
3253 {
3254 ldout(cct, 10) << "build_simple on " << nosd
3255 << " osds" << dendl;
3256 epoch = e;
3257 set_fsid(fsid);
3258 created = modified = ceph_clock_now();
3259
3260 if (nosd >= 0) {
3261 set_max_osd(nosd);
3262 } else {
3263 // count osds
3264 int maxosd = 0;
3265 const md_config_t *conf = cct->_conf;
3266 vector<string> sections;
3267 conf->get_all_sections(sections);
3268
3269 for (auto &section : sections) {
3270 if (section.find("osd.") != 0)
3271 continue;
3272
3273 const char *begin = section.c_str() + 4;
3274 char *end = (char*)begin;
3275 int o = strtol(begin, &end, 10);
3276 if (*end != '\0')
3277 continue;
3278
3279 if (o > cct->_conf->mon_max_osd) {
3280 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
3281 return -ERANGE;
3282 }
3283
3284 if (o > maxosd)
3285 maxosd = o;
3286 }
3287
3288 set_max_osd(maxosd + 1);
3289 }
3290
3291
3292 stringstream ss;
3293 int r;
3294 if (nosd >= 0)
3295 r = build_simple_crush_map(cct, *crush, nosd, &ss);
3296 else
3297 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
3298 assert(r == 0);
3299
3300 int poolbase = get_max_osd() ? get_max_osd() : 1;
3301
3302 int const default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
3303 assert(default_replicated_rule >= 0);
3304
3305 if (default_pool) {
3306 // pgp_num <= pg_num
3307 if (pgp_bits > pg_bits)
3308 pgp_bits = pg_bits;
3309
3310 vector<string> pool_names;
3311 pool_names.push_back("rbd");
3312 for (auto &plname : pool_names) {
3313 int64_t pool = ++pool_max;
3314 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
3315 pools[pool].flags = cct->_conf->osd_pool_default_flags;
3316 if (cct->_conf->osd_pool_default_flag_hashpspool)
3317 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
3318 if (cct->_conf->osd_pool_default_flag_nodelete)
3319 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
3320 if (cct->_conf->osd_pool_default_flag_nopgchange)
3321 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
3322 if (cct->_conf->osd_pool_default_flag_nosizechange)
3323 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
3324 pools[pool].size = cct->_conf->osd_pool_default_size;
3325 pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
3326 pools[pool].crush_rule = default_replicated_rule;
3327 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
3328 pools[pool].set_pg_num(poolbase << pg_bits);
3329 pools[pool].set_pgp_num(poolbase << pgp_bits);
3330 pools[pool].last_change = epoch;
3331 pool_name[pool] = plname;
3332 name_pool[plname] = pool;
3333 }
3334 }
3335
3336 for (int i=0; i<get_max_osd(); i++) {
3337 set_state(i, 0);
3338 set_weight(i, CEPH_OSD_OUT);
3339 }
3340
3341 map<string,string> profile_map;
3342 r = get_erasure_code_profile_default(cct, profile_map, &ss);
3343 if (r < 0) {
3344 lderr(cct) << ss.str() << dendl;
3345 return r;
3346 }
3347 set_erasure_code_profile("default", profile_map);
3348 return 0;
3349 }
3350
3351 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
3352 map<string,string> &profile_map,
3353 ostream *ss)
3354 {
3355 int r = get_json_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
3356 *ss,
3357 &profile_map);
3358 return r;
3359 }
3360
3361 int OSDMap::_build_crush_types(CrushWrapper& crush)
3362 {
3363 crush.set_type_name(0, "osd");
3364 crush.set_type_name(1, "host");
3365 crush.set_type_name(2, "chassis");
3366 crush.set_type_name(3, "rack");
3367 crush.set_type_name(4, "row");
3368 crush.set_type_name(5, "pdu");
3369 crush.set_type_name(6, "pod");
3370 crush.set_type_name(7, "room");
3371 crush.set_type_name(8, "datacenter");
3372 crush.set_type_name(9, "region");
3373 crush.set_type_name(10, "root");
3374 return 10;
3375 }
3376
3377 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
3378 int nosd, ostream *ss)
3379 {
3380 crush.create();
3381
3382 // root
3383 int root_type = _build_crush_types(crush);
3384 int rootid;
3385 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
3386 root_type, 0, NULL, NULL, &rootid);
3387 assert(r == 0);
3388 crush.set_item_name(rootid, "default");
3389
3390 for (int o=0; o<nosd; o++) {
3391 map<string,string> loc;
3392 loc["host"] = "localhost";
3393 loc["rack"] = "localrack";
3394 loc["root"] = "default";
3395 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
3396 char name[32];
3397 snprintf(name, sizeof(name), "osd.%d", o);
3398 crush.insert_item(cct, o, 1.0, name, loc);
3399 }
3400
3401 build_simple_crush_rules(cct, crush, "default", ss);
3402
3403 crush.finalize();
3404
3405 return 0;
3406 }
3407
3408 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
3409 CrushWrapper& crush,
3410 ostream *ss)
3411 {
3412 const md_config_t *conf = cct->_conf;
3413
3414 crush.create();
3415
3416 // root
3417 int root_type = _build_crush_types(crush);
3418 int rootid;
3419 int r = crush.add_bucket(0, 0,
3420 CRUSH_HASH_DEFAULT,
3421 root_type, 0, NULL, NULL, &rootid);
3422 assert(r == 0);
3423 crush.set_item_name(rootid, "default");
3424
3425 // add osds
3426 vector<string> sections;
3427 conf->get_all_sections(sections);
3428
3429 for (auto &section : sections) {
3430 if (section.find("osd.") != 0)
3431 continue;
3432
3433 const char *begin = section.c_str() + 4;
3434 char *end = (char*)begin;
3435 int o = strtol(begin, &end, 10);
3436 if (*end != '\0')
3437 continue;
3438
3439 string host, rack, row, room, dc, pool;
3440 vector<string> sectiontmp;
3441 sectiontmp.push_back("osd");
3442 sectiontmp.push_back(section);
3443 conf->get_val_from_conf_file(sectiontmp, "host", host, false);
3444 conf->get_val_from_conf_file(sectiontmp, "rack", rack, false);
3445 conf->get_val_from_conf_file(sectiontmp, "row", row, false);
3446 conf->get_val_from_conf_file(sectiontmp, "room", room, false);
3447 conf->get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
3448 conf->get_val_from_conf_file(sectiontmp, "root", pool, false);
3449
3450 if (host.length() == 0)
3451 host = "unknownhost";
3452 if (rack.length() == 0)
3453 rack = "unknownrack";
3454
3455 map<string,string> loc;
3456 loc["host"] = host;
3457 loc["rack"] = rack;
3458 if (row.size())
3459 loc["row"] = row;
3460 if (room.size())
3461 loc["room"] = room;
3462 if (dc.size())
3463 loc["datacenter"] = dc;
3464 loc["root"] = "default";
3465
3466 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
3467 crush.insert_item(cct, o, 1.0, section, loc);
3468 }
3469
3470 build_simple_crush_rules(cct, crush, "default", ss);
3471
3472 crush.finalize();
3473
3474 return 0;
3475 }
3476
3477
3478 int OSDMap::build_simple_crush_rules(
3479 CephContext *cct,
3480 CrushWrapper& crush,
3481 const string& root,
3482 ostream *ss)
3483 {
3484 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
3485 string failure_domain =
3486 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
3487
3488 int r;
3489 r = crush.add_simple_rule_at(
3490 "replicated_rule", root, failure_domain, "",
3491 "firstn", pg_pool_t::TYPE_REPLICATED,
3492 crush_rule, ss);
3493 if (r < 0)
3494 return r;
3495 // do not add an erasure rule by default or else we will implicitly
3496 // require the crush_v2 feature of clients
3497 return 0;
3498 }
3499
3500 int OSDMap::summarize_mapping_stats(
3501 OSDMap *newmap,
3502 const set<int64_t> *pools,
3503 std::string *out,
3504 Formatter *f) const
3505 {
3506 set<int64_t> ls;
3507 if (pools) {
3508 ls = *pools;
3509 } else {
3510 for (auto &p : get_pools())
3511 ls.insert(p.first);
3512 }
3513
3514 unsigned total_pg = 0;
3515 unsigned moved_pg = 0;
3516 vector<unsigned> base_by_osd(get_max_osd(), 0);
3517 vector<unsigned> new_by_osd(get_max_osd(), 0);
3518 for (int64_t pool_id : ls) {
3519 const pg_pool_t *pi = get_pg_pool(pool_id);
3520 vector<int> up, up2;
3521 int up_primary;
3522 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
3523 pg_t pgid(ps, pool_id, -1);
3524 total_pg += pi->get_size();
3525 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
3526 for (int osd : up) {
3527 if (osd >= 0 && osd < get_max_osd())
3528 ++base_by_osd[osd];
3529 }
3530 if (newmap) {
3531 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
3532 for (int osd : up2) {
3533 if (osd >= 0 && osd < get_max_osd())
3534 ++new_by_osd[osd];
3535 }
3536 if (pi->type == pg_pool_t::TYPE_ERASURE) {
3537 for (unsigned i=0; i<up.size(); ++i) {
3538 if (up[i] != up2[i]) {
3539 ++moved_pg;
3540 }
3541 }
3542 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
3543 for (int osd : up) {
3544 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
3545 ++moved_pg;
3546 }
3547 }
3548 } else {
3549 assert(0 == "unhandled pool type");
3550 }
3551 }
3552 }
3553 }
3554
3555 unsigned num_up_in = 0;
3556 for (int osd = 0; osd < get_max_osd(); ++osd) {
3557 if (is_up(osd) && is_in(osd))
3558 ++num_up_in;
3559 }
3560 if (!num_up_in) {
3561 return -EINVAL;
3562 }
3563
3564 float avg_pg = (float)total_pg / (float)num_up_in;
3565 float base_stddev = 0, new_stddev = 0;
3566 int min = -1, max = -1;
3567 unsigned min_base_pg = 0, max_base_pg = 0;
3568 unsigned min_new_pg = 0, max_new_pg = 0;
3569 for (int osd = 0; osd < get_max_osd(); ++osd) {
3570 if (is_up(osd) && is_in(osd)) {
3571 float base_diff = (float)base_by_osd[osd] - avg_pg;
3572 base_stddev += base_diff * base_diff;
3573 float new_diff = (float)new_by_osd[osd] - avg_pg;
3574 new_stddev += new_diff * new_diff;
3575 if (min < 0 || base_by_osd[osd] < min_base_pg) {
3576 min = osd;
3577 min_base_pg = base_by_osd[osd];
3578 min_new_pg = new_by_osd[osd];
3579 }
3580 if (max < 0 || base_by_osd[osd] > max_base_pg) {
3581 max = osd;
3582 max_base_pg = base_by_osd[osd];
3583 max_new_pg = new_by_osd[osd];
3584 }
3585 }
3586 }
3587 base_stddev = sqrt(base_stddev / num_up_in);
3588 new_stddev = sqrt(new_stddev / num_up_in);
3589
3590 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
3591
3592 ostringstream ss;
3593 if (f)
3594 f->open_object_section("utilization");
3595 if (newmap) {
3596 if (f) {
3597 f->dump_unsigned("moved_pgs", moved_pg);
3598 f->dump_unsigned("total_pgs", total_pg);
3599 } else {
3600 float percent = 0;
3601 if (total_pg)
3602 percent = (float)moved_pg * 100.0 / (float)total_pg;
3603 ss << "moved " << moved_pg << " / " << total_pg
3604 << " (" << percent << "%)\n";
3605 }
3606 }
3607 if (f) {
3608 f->dump_float("avg_pgs", avg_pg);
3609 f->dump_float("std_dev", base_stddev);
3610 f->dump_float("expected_baseline_std_dev", edev);
3611 if (newmap)
3612 f->dump_float("new_std_dev", new_stddev);
3613 } else {
3614 ss << "avg " << avg_pg << "\n";
3615 ss << "stddev " << base_stddev;
3616 if (newmap)
3617 ss << " -> " << new_stddev;
3618 ss << " (expected baseline " << edev << ")\n";
3619 }
3620 if (min >= 0) {
3621 if (f) {
3622 f->dump_unsigned("min_osd", min);
3623 f->dump_unsigned("min_osd_pgs", min_base_pg);
3624 if (newmap)
3625 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
3626 } else {
3627 ss << "min osd." << min << " with " << min_base_pg;
3628 if (newmap)
3629 ss << " -> " << min_new_pg;
3630 ss << " pgs (" << (float)min_base_pg / avg_pg;
3631 if (newmap)
3632 ss << " -> " << (float)min_new_pg / avg_pg;
3633 ss << " * mean)\n";
3634 }
3635 }
3636 if (max >= 0) {
3637 if (f) {
3638 f->dump_unsigned("max_osd", max);
3639 f->dump_unsigned("max_osd_pgs", max_base_pg);
3640 if (newmap)
3641 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
3642 } else {
3643 ss << "max osd." << max << " with " << max_base_pg;
3644 if (newmap)
3645 ss << " -> " << max_new_pg;
3646 ss << " pgs (" << (float)max_base_pg / avg_pg;
3647 if (newmap)
3648 ss << " -> " << (float)max_new_pg / avg_pg;
3649 ss << " * mean)\n";
3650 }
3651 }
3652 if (f)
3653 f->close_section();
3654 if (out)
3655 *out = ss.str();
3656 return 0;
3657 }
3658
3659
3660 int OSDMap::clean_pg_upmaps(
3661 CephContext *cct,
3662 Incremental *pending_inc)
3663 {
3664 ldout(cct, 10) << __func__ << dendl;
3665 int changed = 0;
3666 for (auto& p : pg_upmap) {
3667 vector<int> raw;
3668 int primary;
3669 pg_to_raw_osds(p.first, &raw, &primary);
3670 if (vectors_equal(raw, p.second)) {
3671 ldout(cct, 10) << " removing redundant pg_upmap " << p.first << " "
3672 << p.second << dendl;
3673 pending_inc->old_pg_upmap.insert(p.first);
3674 ++changed;
3675 }
3676 }
3677 for (auto& p : pg_upmap_items) {
3678 vector<int> raw;
3679 int primary;
3680 pg_to_raw_osds(p.first, &raw, &primary);
3681 mempool::osdmap::vector<pair<int,int>> newmap;
3682 for (auto& q : p.second) {
3683 if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
3684 newmap.push_back(q);
3685 }
3686 }
3687 if (newmap.empty()) {
3688 ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
3689 << p.second << dendl;
3690 pending_inc->old_pg_upmap_items.insert(p.first);
3691 ++changed;
3692 } else if (newmap != p.second) {
3693 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
3694 << p.first << " " << p.second << " -> " << newmap << dendl;
3695 pending_inc->new_pg_upmap_items[p.first] = newmap;
3696 ++changed;
3697 }
3698 }
3699 return changed;
3700 }
3701
3702 bool OSDMap::try_pg_upmap(
3703 CephContext *cct,
3704 pg_t pg, ///< pg to potentially remap
3705 const set<int>& overfull, ///< osds we'd want to evacuate
3706 const vector<int>& underfull, ///< osds to move to, in order of preference
3707 vector<int> *orig,
3708 vector<int> *out) ///< resulting alternative mapping
3709 {
3710 const pg_pool_t *pool = get_pg_pool(pg.pool());
3711 if (!pool)
3712 return false;
3713 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
3714 pool->get_size());
3715 if (rule < 0)
3716 return false;
3717
3718 // get original mapping
3719 _pg_to_raw_osds(*pool, pg, orig, NULL);
3720
3721 // make sure there is something there to remap
3722 bool any = false;
3723 for (auto osd : *orig) {
3724 if (overfull.count(osd)) {
3725 any = true;
3726 break;
3727 }
3728 }
3729 if (!any) {
3730 return false;
3731 }
3732
3733 int r = crush->try_remap_rule(
3734 cct,
3735 rule,
3736 pool->get_size(),
3737 overfull, underfull,
3738 *orig,
3739 out);
3740 if (r < 0)
3741 return false;
3742 if (*out == *orig)
3743 return false;
3744 return true;
3745 }
3746
3747 int OSDMap::calc_pg_upmaps(
3748 CephContext *cct,
3749 float max_deviation_ratio,
3750 int max,
3751 const set<int64_t>& only_pools_orig,
3752 OSDMap::Incremental *pending_inc)
3753 {
3754 set<int64_t> only_pools;
3755 if (only_pools_orig.empty()) {
3756 for (auto& i : pools) {
3757 only_pools.insert(i.first);
3758 }
3759 } else {
3760 only_pools = only_pools_orig;
3761 }
3762 OSDMap tmp;
3763 tmp.deepish_copy_from(*this);
3764 float start_deviation = 0;
3765 float end_deviation = 0;
3766 int num_changed = 0;
3767 while (true) {
3768 map<int,set<pg_t>> pgs_by_osd;
3769 int total_pgs = 0;
3770 float osd_weight_total = 0;
3771 map<int,float> osd_weight;
3772 for (auto& i : pools) {
3773 if (!only_pools.empty() && !only_pools.count(i.first))
3774 continue;
3775 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
3776 pg_t pg(ps, i.first);
3777 vector<int> up;
3778 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
3779 for (auto osd : up) {
3780 if (osd != CRUSH_ITEM_NONE)
3781 pgs_by_osd[osd].insert(pg);
3782 }
3783 }
3784 total_pgs += i.second.get_size() * i.second.get_pg_num();
3785
3786 map<int,float> pmap;
3787 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
3788 i.second.get_type(),
3789 i.second.get_size());
3790 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
3791 ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
3792 for (auto p : pmap) {
3793 osd_weight[p.first] += p.second;
3794 osd_weight_total += p.second;
3795 }
3796 }
3797 for (auto& i : osd_weight) {
3798 int pgs = 0;
3799 auto p = pgs_by_osd.find(i.first);
3800 if (p != pgs_by_osd.end())
3801 pgs = p->second.size();
3802 else
3803 pgs_by_osd.emplace(i.first, set<pg_t>());
3804 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
3805 << " pgs " << pgs << dendl;
3806 }
3807
3808 if (osd_weight_total == 0) {
3809 lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
3810 break;
3811 }
3812 float pgs_per_weight = total_pgs / osd_weight_total;
3813 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
3814 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
3815
3816 // osd deviation
3817 float total_deviation = 0;
3818 map<int,float> osd_deviation; // osd, deviation(pgs)
3819 multimap<float,int> deviation_osd; // deviation(pgs), osd
3820 set<int> overfull;
3821 for (auto& i : pgs_by_osd) {
3822 float target = osd_weight[i.first] * pgs_per_weight;
3823 float deviation = (float)i.second.size() - target;
3824 ldout(cct, 20) << " osd." << i.first
3825 << "\tpgs " << i.second.size()
3826 << "\ttarget " << target
3827 << "\tdeviation " << deviation
3828 << dendl;
3829 osd_deviation[i.first] = deviation;
3830 deviation_osd.insert(make_pair(deviation, i.first));
3831 if (deviation >= 1.0)
3832 overfull.insert(i.first);
3833 total_deviation += abs(deviation);
3834 }
3835 if (num_changed == 0) {
3836 start_deviation = total_deviation;
3837 }
3838 end_deviation = total_deviation;
3839
3840 // build underfull, sorted from least-full to most-average
3841 vector<int> underfull;
3842 for (auto i = deviation_osd.begin();
3843 i != deviation_osd.end();
3844 ++i) {
3845 if (i->first >= -.999)
3846 break;
3847 underfull.push_back(i->second);
3848 }
3849 ldout(cct, 10) << " total_deviation " << total_deviation
3850 << " overfull " << overfull
3851 << " underfull " << underfull << dendl;
3852 if (overfull.empty() || underfull.empty())
3853 break;
3854
3855 // pick fullest
3856 bool restart = false;
3857 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
3858 int osd = p->second;
3859 float deviation = p->first;
3860 float target = osd_weight[osd] * pgs_per_weight;
3861 assert(target > 0);
3862 if (deviation/target < max_deviation_ratio) {
3863 ldout(cct, 10) << " osd." << osd
3864 << " target " << target
3865 << " deviation " << deviation
3866 << " -> ratio " << deviation/target
3867 << " < max ratio " << max_deviation_ratio << dendl;
3868 break;
3869 }
3870 int num_to_move = deviation;
3871 ldout(cct, 10) << " osd." << osd << " move " << num_to_move << dendl;
3872 if (num_to_move < 1)
3873 break;
3874
3875 set<pg_t>& pgs = pgs_by_osd[osd];
3876
3877 // look for remaps we can un-remap
3878 for (auto pg : pgs) {
3879 auto p = tmp.pg_upmap_items.find(pg);
3880 if (p != tmp.pg_upmap_items.end()) {
3881 for (auto q : p->second) {
3882 if (q.second == osd) {
3883 ldout(cct, 10) << " dropping pg_upmap_items " << pg
3884 << " " << p->second << dendl;
3885 tmp.pg_upmap_items.erase(p);
3886 pending_inc->old_pg_upmap_items.insert(pg);
3887 ++num_changed;
3888 restart = true;
3889 }
3890 }
3891 }
3892 if (restart)
3893 break;
3894 } // pg loop
3895 if (restart)
3896 break;
3897
3898 for (auto pg : pgs) {
3899 if (tmp.pg_upmap.count(pg) ||
3900 tmp.pg_upmap_items.count(pg)) {
3901 ldout(cct, 20) << " already remapped " << pg << dendl;
3902 continue;
3903 }
3904 ldout(cct, 10) << " trying " << pg << dendl;
3905 vector<int> orig, out;
3906 if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
3907 continue;
3908 }
3909 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
3910 if (orig.size() != out.size()) {
3911 continue;
3912 }
3913 assert(orig != out);
3914 auto& rmi = tmp.pg_upmap_items[pg];
3915 for (unsigned i = 0; i < out.size(); ++i) {
3916 if (orig[i] != out[i]) {
3917 rmi.push_back(make_pair(orig[i], out[i]));
3918 }
3919 }
3920 pending_inc->new_pg_upmap_items[pg] = rmi;
3921 ldout(cct, 10) << " " << pg << " pg_upmap_items " << rmi << dendl;
3922 restart = true;
3923 ++num_changed;
3924 break;
3925 } // pg loop
3926 if (restart)
3927 break;
3928 } // osd loop
3929
3930 if (!restart) {
3931 ldout(cct, 10) << " failed to find any changes to make" << dendl;
3932 break;
3933 }
3934 if (--max == 0) {
3935 ldout(cct, 10) << " hit max iterations, stopping" << dendl;
3936 break;
3937 }
3938 }
3939 ldout(cct, 10) << " start deviation " << start_deviation << dendl;
3940 ldout(cct, 10) << " end deviation " << end_deviation << dendl;
3941 return num_changed;
3942 }
3943
3944 int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
3945 {
3946 return crush->get_leaves(name, osds);
3947 }
3948
3949 template <typename F>
3950 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
3951 public:
3952 typedef CrushTreeDumper::Dumper<F> Parent;
3953
3954 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3955 const PGStatService *pgs_, bool tree_) :
3956 Parent(crush),
3957 osdmap(osdmap_),
3958 pgs(pgs_),
3959 tree(tree_),
3960 average_util(average_utilization()),
3961 min_var(-1),
3962 max_var(-1),
3963 stddev(0),
3964 sum(0) {
3965 }
3966
3967 protected:
3968 void dump_stray(F *f) {
3969 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3970 if (osdmap->exists(i) && !this->is_touched(i))
3971 dump_item(CrushTreeDumper::Item(i, 0, 0), f);
3972 }
3973 }
3974
3975 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
3976 if (!tree && qi.is_bucket())
3977 return;
3978
3979 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
3980 int64_t kb = 0, kb_used = 0, kb_avail = 0;
3981 double util = 0;
3982 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_avail))
3983 if (kb_used && kb)
3984 util = 100.0 * (double)kb_used / (double)kb;
3985
3986 double var = 1.0;
3987 if (average_util)
3988 var = util / average_util;
3989
3990 size_t num_pgs = qi.is_bucket() ? 0 : pgs->get_num_pg_by_osd(qi.id);
3991
3992 dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, num_pgs, f);
3993
3994 if (!qi.is_bucket() && reweight > 0) {
3995 if (min_var < 0 || var < min_var)
3996 min_var = var;
3997 if (max_var < 0 || var > max_var)
3998 max_var = var;
3999
4000 double dev = util - average_util;
4001 dev *= dev;
4002 stddev += reweight * dev;
4003 sum += reweight;
4004 }
4005 }
4006
4007 virtual void dump_item(const CrushTreeDumper::Item &qi,
4008 float &reweight,
4009 int64_t kb,
4010 int64_t kb_used,
4011 int64_t kb_avail,
4012 double& util,
4013 double& var,
4014 const size_t num_pgs,
4015 F *f) = 0;
4016
4017 double dev() {
4018 return sum > 0 ? sqrt(stddev / sum) : 0;
4019 }
4020
4021 double average_utilization() {
4022 int64_t kb = 0, kb_used = 0;
4023 for (int i = 0; i < osdmap->get_max_osd(); i++) {
4024 if (!osdmap->exists(i) || osdmap->get_weight(i) == 0)
4025 continue;
4026 int64_t kb_i, kb_used_i, kb_avail_i;
4027 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_avail_i)) {
4028 kb += kb_i;
4029 kb_used += kb_used_i;
4030 }
4031 }
4032 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
4033 }
4034
4035 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
4036 int64_t* kb_avail) const {
4037 const osd_stat_t *p = pgs->get_osd_stat(id);
4038 if (!p) return false;
4039 *kb = p->kb;
4040 *kb_used = p->kb_used;
4041 *kb_avail = p->kb_avail;
4042 return *kb > 0;
4043 }
4044
4045 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
4046 int64_t* kb_avail) const {
4047 if (id >= 0) {
4048 if (osdmap->is_out(id)) {
4049 *kb = 0;
4050 *kb_used = 0;
4051 *kb_avail = 0;
4052 return true;
4053 }
4054 return get_osd_utilization(id, kb, kb_used, kb_avail);
4055 }
4056
4057 *kb = 0;
4058 *kb_used = 0;
4059 *kb_avail = 0;
4060
4061 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
4062 int item = osdmap->crush->get_bucket_item(id, k);
4063 int64_t kb_i = 0, kb_used_i = 0, kb_avail_i = 0;
4064 if (!get_bucket_utilization(item, &kb_i, &kb_used_i, &kb_avail_i))
4065 return false;
4066 *kb += kb_i;
4067 *kb_used += kb_used_i;
4068 *kb_avail += kb_avail_i;
4069 }
4070 return *kb > 0;
4071 }
4072
4073 protected:
4074 const OSDMap *osdmap;
4075 const PGStatService *pgs;
4076 bool tree;
4077 double average_util;
4078 double min_var;
4079 double max_var;
4080 double stddev;
4081 double sum;
4082 };
4083
4084
4085 class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
4086 public:
4087 typedef OSDUtilizationDumper<TextTable> Parent;
4088
4089 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4090 const PGStatService *pgs, bool tree) :
4091 Parent(crush, osdmap, pgs, tree) {}
4092
4093 void dump(TextTable *tbl) {
4094 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
4095 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
4096 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4097 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
4098 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
4099 tbl->define_column("USE", TextTable::LEFT, TextTable::RIGHT);
4100 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
4101 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
4102 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
4103 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
4104 if (tree)
4105 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
4106
4107 Parent::dump(tbl);
4108
4109 dump_stray(tbl);
4110
4111 *tbl << ""
4112 << ""
4113 << "" << "TOTAL"
4114 << si_t(pgs->get_osd_sum().kb << 10)
4115 << si_t(pgs->get_osd_sum().kb_used << 10)
4116 << si_t(pgs->get_osd_sum().kb_avail << 10)
4117 << lowprecision_t(average_util)
4118 << ""
4119 << TextTable::endrow;
4120 }
4121
4122 protected:
4123 struct lowprecision_t {
4124 float v;
4125 explicit lowprecision_t(float _v) : v(_v) {}
4126 };
4127 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
4128
4129 using OSDUtilizationDumper<TextTable>::dump_item;
4130 void dump_item(const CrushTreeDumper::Item &qi,
4131 float &reweight,
4132 int64_t kb,
4133 int64_t kb_used,
4134 int64_t kb_avail,
4135 double& util,
4136 double& var,
4137 const size_t num_pgs,
4138 TextTable *tbl) override {
4139 const char *c = crush->get_item_class(qi.id);
4140 if (!c)
4141 c = "";
4142 *tbl << qi.id
4143 << c
4144 << weightf_t(qi.weight)
4145 << weightf_t(reweight)
4146 << si_t(kb << 10)
4147 << si_t(kb_used << 10)
4148 << si_t(kb_avail << 10)
4149 << lowprecision_t(util)
4150 << lowprecision_t(var);
4151
4152 if (qi.is_bucket()) {
4153 *tbl << "-";
4154 } else {
4155 *tbl << num_pgs;
4156 }
4157
4158 if (tree) {
4159 ostringstream name;
4160 for (int k = 0; k < qi.depth; k++)
4161 name << " ";
4162 if (qi.is_bucket()) {
4163 int type = crush->get_bucket_type(qi.id);
4164 name << crush->get_type_name(type) << " "
4165 << crush->get_item_name(qi.id);
4166 } else {
4167 name << "osd." << qi.id;
4168 }
4169 *tbl << name.str();
4170 }
4171
4172 *tbl << TextTable::endrow;
4173 }
4174
4175 public:
4176 string summary() {
4177 ostringstream out;
4178 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
4179 << "/" << lowprecision_t(max_var) << " "
4180 << "STDDEV: " << lowprecision_t(dev());
4181 return out.str();
4182 }
4183 };
4184
4185 ostream& operator<<(ostream& out,
4186 const OSDUtilizationPlainDumper::lowprecision_t& v)
4187 {
4188 if (v.v < -0.01) {
4189 return out << "-";
4190 } else if (v.v < 0.001) {
4191 return out << "0";
4192 } else {
4193 std::streamsize p = out.precision();
4194 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
4195 }
4196 }
4197
4198 class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
4199 public:
4200 typedef OSDUtilizationDumper<Formatter> Parent;
4201
4202 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4203 const PGStatService *pgs, bool tree) :
4204 Parent(crush, osdmap, pgs, tree) {}
4205
4206 void dump(Formatter *f) {
4207 f->open_array_section("nodes");
4208 Parent::dump(f);
4209 f->close_section();
4210
4211 f->open_array_section("stray");
4212 dump_stray(f);
4213 f->close_section();
4214 }
4215
4216 protected:
4217 using OSDUtilizationDumper<Formatter>::dump_item;
4218 void dump_item(const CrushTreeDumper::Item &qi,
4219 float &reweight,
4220 int64_t kb,
4221 int64_t kb_used,
4222 int64_t kb_avail,
4223 double& util,
4224 double& var,
4225 const size_t num_pgs,
4226 Formatter *f) override {
4227 f->open_object_section("item");
4228 CrushTreeDumper::dump_item_fields(crush, qi, f);
4229 f->dump_float("reweight", reweight);
4230 f->dump_int("kb", kb);
4231 f->dump_int("kb_used", kb_used);
4232 f->dump_int("kb_avail", kb_avail);
4233 f->dump_float("utilization", util);
4234 f->dump_float("var", var);
4235 f->dump_unsigned("pgs", num_pgs);
4236 CrushTreeDumper::dump_bucket_children(crush, qi, f);
4237 f->close_section();
4238 }
4239
4240 public:
4241 void summary(Formatter *f) {
4242 f->open_object_section("summary");
4243 f->dump_int("total_kb", pgs->get_osd_sum().kb);
4244 f->dump_int("total_kb_used", pgs->get_osd_sum().kb_used);
4245 f->dump_int("total_kb_avail", pgs->get_osd_sum().kb_avail);
4246 f->dump_float("average_utilization", average_util);
4247 f->dump_float("min_var", min_var);
4248 f->dump_float("max_var", max_var);
4249 f->dump_float("dev", dev());
4250 f->close_section();
4251 }
4252 };
4253
4254 void print_osd_utilization(const OSDMap& osdmap,
4255 const PGStatService *pgstat,
4256 ostream& out,
4257 Formatter *f,
4258 bool tree)
4259 {
4260 const CrushWrapper *crush = osdmap.crush.get();
4261 if (f) {
4262 f->open_object_section("df");
4263 OSDUtilizationFormatDumper d(crush, &osdmap, pgstat, tree);
4264 d.dump(f);
4265 d.summary(f);
4266 f->close_section();
4267 f->flush(out);
4268 } else {
4269 OSDUtilizationPlainDumper d(crush, &osdmap, pgstat, tree);
4270 TextTable tbl;
4271 d.dump(&tbl);
4272 out << tbl << d.summary() << "\n";
4273 }
4274 }
4275
4276 void OSDMap::check_health(health_check_map_t *checks) const
4277 {
4278 int num_osds = get_num_osds();
4279
4280 // OSD_DOWN
4281 // OSD_$subtree_DOWN
4282 // OSD_ORPHAN
4283 if (num_osds >= 0) {
4284 int num_in_osds = 0;
4285 int num_down_in_osds = 0;
4286 set<int> osds;
4287 set<int> down_in_osds;
4288 set<int> up_in_osds;
4289 set<int> subtree_up;
4290 unordered_map<int, set<int> > subtree_type_down;
4291 unordered_map<int, int> num_osds_subtree;
4292 int max_type = crush->get_max_type_id();
4293
4294 for (int i = 0; i < get_max_osd(); i++) {
4295 if (!exists(i)) {
4296 if (crush->item_exists(i)) {
4297 osds.insert(i);
4298 }
4299 continue;
4300 }
4301 if (is_out(i))
4302 continue;
4303 ++num_in_osds;
4304 if (down_in_osds.count(i) || up_in_osds.count(i))
4305 continue;
4306 if (!is_up(i)) {
4307 down_in_osds.insert(i);
4308 int parent_id = 0;
4309 int current = i;
4310 for (int type = 0; type <= max_type; type++) {
4311 if (!crush->get_type_name(type))
4312 continue;
4313 int r = crush->get_immediate_parent_id(current, &parent_id);
4314 if (r == -ENOENT)
4315 break;
4316 // break early if this parent is already marked as up
4317 if (subtree_up.count(parent_id))
4318 break;
4319 type = crush->get_bucket_type(parent_id);
4320 if (!subtree_type_is_down(
4321 g_ceph_context, parent_id, type,
4322 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
4323 break;
4324 current = parent_id;
4325 }
4326 }
4327 }
4328
4329 // calculate the number of down osds in each down subtree and
4330 // store it in num_osds_subtree
4331 for (int type = 1; type <= max_type; type++) {
4332 if (!crush->get_type_name(type))
4333 continue;
4334 for (auto j = subtree_type_down[type].begin();
4335 j != subtree_type_down[type].end();
4336 ++j) {
4337 list<int> children;
4338 int num = 0;
4339 int num_children = crush->get_children(*j, &children);
4340 if (num_children == 0)
4341 continue;
4342 for (auto l = children.begin(); l != children.end(); ++l) {
4343 if (*l >= 0) {
4344 ++num;
4345 } else if (num_osds_subtree[*l] > 0) {
4346 num = num + num_osds_subtree[*l];
4347 }
4348 }
4349 num_osds_subtree[*j] = num;
4350 }
4351 }
4352 num_down_in_osds = down_in_osds.size();
4353 assert(num_down_in_osds <= num_in_osds);
4354 if (num_down_in_osds > 0) {
4355 // summary of down subtree types and osds
4356 for (int type = max_type; type > 0; type--) {
4357 if (!crush->get_type_name(type))
4358 continue;
4359 if (subtree_type_down[type].size() > 0) {
4360 ostringstream ss;
4361 ss << subtree_type_down[type].size() << " "
4362 << crush->get_type_name(type);
4363 if (subtree_type_down[type].size() > 1) {
4364 ss << "s";
4365 }
4366 int sum_down_osds = 0;
4367 for (auto j = subtree_type_down[type].begin();
4368 j != subtree_type_down[type].end();
4369 ++j) {
4370 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
4371 }
4372 ss << " (" << sum_down_osds << " osds) down";
4373 string err = string("OSD_") +
4374 string(crush->get_type_name(type)) + "_DOWN";
4375 boost::to_upper(err);
4376 auto& d = checks->add(err, HEALTH_WARN, ss.str());
4377 for (auto j = subtree_type_down[type].rbegin();
4378 j != subtree_type_down[type].rend();
4379 ++j) {
4380 ostringstream ss;
4381 ss << crush->get_type_name(type);
4382 ss << " ";
4383 ss << crush->get_item_name(*j);
4384 // at the top level, do not print location
4385 if (type != max_type) {
4386 ss << " (";
4387 ss << crush->get_full_location_ordered_string(*j);
4388 ss << ")";
4389 }
4390 int num = num_osds_subtree[*j];
4391 ss << " (" << num << " osds)";
4392 ss << " is down";
4393 d.detail.push_back(ss.str());
4394 }
4395 }
4396 }
4397 ostringstream ss;
4398 ss << down_in_osds.size() << " osds down";
4399 auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
4400 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
4401 ostringstream ss;
4402 ss << "osd." << *it << " (";
4403 ss << crush->get_full_location_ordered_string(*it);
4404 ss << ") is down";
4405 d.detail.push_back(ss.str());
4406 }
4407 }
4408
4409 if (!osds.empty()) {
4410 ostringstream ss;
4411 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
4412 auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
4413 for (auto osd : osds) {
4414 ostringstream ss;
4415 ss << "osd." << osd << " exists in crush map but not in osdmap";
4416 d.detail.push_back(ss.str());
4417 }
4418 }
4419 }
4420
4421 // OSD_OUT_OF_ORDER_FULL
4422 {
4423 // An osd could configure failsafe ratio, to something different
4424 // but for now assume it is the same here.
4425 float fsr = g_conf->osd_failsafe_full_ratio;
4426 if (fsr > 1.0) fsr /= 100;
4427 float fr = get_full_ratio();
4428 float br = get_backfillfull_ratio();
4429 float nr = get_nearfull_ratio();
4430
4431 list<string> detail;
4432 // These checks correspond to how OSDService::check_full_status() in an OSD
4433 // handles the improper setting of these values.
4434 if (br < nr) {
4435 ostringstream ss;
4436 ss << "backfillfull_ratio (" << br
4437 << ") < nearfull_ratio (" << nr << "), increased";
4438 detail.push_back(ss.str());
4439 br = nr;
4440 }
4441 if (fr < br) {
4442 ostringstream ss;
4443 ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
4444 << "), increased";
4445 detail.push_back(ss.str());
4446 fr = br;
4447 }
4448 if (fsr < fr) {
4449 ostringstream ss;
4450 ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
4451 << "), increased";
4452 detail.push_back(ss.str());
4453 }
4454 if (!detail.empty()) {
4455 auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
4456 "full ratio(s) out of order");
4457 d.detail.swap(detail);
4458 }
4459 }
4460
4461 // OSD_FULL
4462 // OSD_NEARFULL
4463 // OSD_BACKFILLFULL
4464 // OSD_FAILSAFE_FULL
4465 {
4466 set<int> full, backfillfull, nearfull;
4467 get_full_osd_counts(&full, &backfillfull, &nearfull);
4468 if (full.size()) {
4469 ostringstream ss;
4470 ss << full.size() << " full osd(s)";
4471 auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
4472 for (auto& i: full) {
4473 ostringstream ss;
4474 ss << "osd." << i << " is full";
4475 d.detail.push_back(ss.str());
4476 }
4477 }
4478 if (backfillfull.size()) {
4479 ostringstream ss;
4480 ss << backfillfull.size() << " backfillfull osd(s)";
4481 auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
4482 for (auto& i: backfillfull) {
4483 ostringstream ss;
4484 ss << "osd." << i << " is backfill full";
4485 d.detail.push_back(ss.str());
4486 }
4487 }
4488 if (nearfull.size()) {
4489 ostringstream ss;
4490 ss << nearfull.size() << " nearfull osd(s)";
4491 auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
4492 for (auto& i: nearfull) {
4493 ostringstream ss;
4494 ss << "osd." << i << " is near full";
4495 d.detail.push_back(ss.str());
4496 }
4497 }
4498 }
4499
4500 // OSDMAP_FLAGS
4501 {
4502 // warn about flags
4503 uint64_t warn_flags =
4504 CEPH_OSDMAP_FULL |
4505 CEPH_OSDMAP_PAUSERD |
4506 CEPH_OSDMAP_PAUSEWR |
4507 CEPH_OSDMAP_PAUSEREC |
4508 CEPH_OSDMAP_NOUP |
4509 CEPH_OSDMAP_NODOWN |
4510 CEPH_OSDMAP_NOIN |
4511 CEPH_OSDMAP_NOOUT |
4512 CEPH_OSDMAP_NOBACKFILL |
4513 CEPH_OSDMAP_NORECOVER |
4514 CEPH_OSDMAP_NOSCRUB |
4515 CEPH_OSDMAP_NODEEP_SCRUB |
4516 CEPH_OSDMAP_NOTIERAGENT |
4517 CEPH_OSDMAP_NOREBALANCE;
4518 if (test_flag(warn_flags)) {
4519 ostringstream ss;
4520 ss << get_flag_string(get_flags() & warn_flags)
4521 << " flag(s) set";
4522 checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
4523 }
4524 }
4525
4526 // OSD_FLAGS
4527 {
4528 list<string> detail;
4529 const unsigned flags =
4530 CEPH_OSD_NOUP |
4531 CEPH_OSD_NOIN |
4532 CEPH_OSD_NODOWN |
4533 CEPH_OSD_NOOUT;
4534 for (int i = 0; i < max_osd; ++i) {
4535 if (osd_state[i] & flags) {
4536 ostringstream ss;
4537 set<string> states;
4538 OSDMap::calc_state_set(osd_state[i] & flags, states);
4539 ss << "osd." << i << " has flags " << states;
4540 detail.push_back(ss.str());
4541 }
4542 }
4543 if (!detail.empty()) {
4544 ostringstream ss;
4545 ss << detail.size() << " osd(s) have {NOUP,NODOWN,NOIN,NOOUT} flags set";
4546 auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
4547 d.detail.swap(detail);
4548 }
4549 }
4550
4551 // OLD_CRUSH_TUNABLES
4552 if (g_conf->mon_warn_on_legacy_crush_tunables) {
4553 string min = crush->get_min_required_version();
4554 if (min < g_conf->mon_crush_min_required_version) {
4555 ostringstream ss;
4556 ss << "crush map has legacy tunables (require " << min
4557 << ", min is " << g_conf->mon_crush_min_required_version << ")";
4558 auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
4559 d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
4560 }
4561 }
4562
4563 // OLD_CRUSH_STRAW_CALC_VERSION
4564 if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
4565 if (crush->get_straw_calc_version() == 0) {
4566 ostringstream ss;
4567 ss << "crush map has straw_calc_version=0";
4568 auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
4569 d.detail.push_back(
4570 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
4571 }
4572 }
4573
4574 // CACHE_POOL_NO_HIT_SET
4575 if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
4576 list<string> detail;
4577 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
4578 p != pools.end();
4579 ++p) {
4580 const pg_pool_t& info = p->second;
4581 if (info.cache_mode_requires_hit_set() &&
4582 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
4583 ostringstream ss;
4584 ss << "pool '" << get_pool_name(p->first)
4585 << "' with cache_mode " << info.get_cache_mode_name()
4586 << " needs hit_set_type to be set but it is not";
4587 detail.push_back(ss.str());
4588 }
4589 }
4590 if (!detail.empty()) {
4591 ostringstream ss;
4592 ss << detail.size() << " cache pools are missing hit_sets";
4593 auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
4594 d.detail.swap(detail);
4595 }
4596 }
4597
4598 // OSD_NO_SORTBITWISE
4599 if (!test_flag(CEPH_OSDMAP_SORTBITWISE) &&
4600 (get_up_osd_features() &
4601 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
4602 ostringstream ss;
4603 ss << "no legacy OSD present but 'sortbitwise' flag is not set";
4604 checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
4605 }
4606
4607 // OSD_UPGRADE_FINISHED
4608 // none of these (yet) since we don't run until luminous upgrade is done.
4609
4610 // POOL_FULL
4611 {
4612 list<string> detail;
4613 for (auto it : get_pools()) {
4614 const pg_pool_t &pool = it.second;
4615 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
4616 const string& pool_name = get_pool_name(it.first);
4617 stringstream ss;
4618 ss << "pool '" << pool_name << "' is full";
4619 detail.push_back(ss.str());
4620 }
4621 }
4622 if (!detail.empty()) {
4623 ostringstream ss;
4624 ss << detail.size() << " pool(s) full";
4625 auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
4626 d.detail.swap(detail);
4627 }
4628 }
4629 }