]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSDMap.cc
594ff82fae28dd10584bd798ad1d569d77cd6972
[ceph.git] / ceph / src / osd / OSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include "OSDMap.h"
19 #include <algorithm>
20 #include "common/config.h"
21 #include "common/Formatter.h"
22 #include "common/TextTable.h"
23 #include "include/ceph_features.h"
24 #include "include/str_map.h"
25
26 #include "common/code_environment.h"
27
28 #include "crush/CrushTreeDumper.h"
29 #include "common/Clock.h"
30 #include "mon/PGStatService.h"
31
32 #define dout_subsys ceph_subsys_osd
33
34 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
35 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
36
37
38 // ----------------------------------
39 // osd_info_t
40
41 void osd_info_t::dump(Formatter *f) const
42 {
43 f->dump_int("last_clean_begin", last_clean_begin);
44 f->dump_int("last_clean_end", last_clean_end);
45 f->dump_int("up_from", up_from);
46 f->dump_int("up_thru", up_thru);
47 f->dump_int("down_at", down_at);
48 f->dump_int("lost_at", lost_at);
49 }
50
51 void osd_info_t::encode(bufferlist& bl) const
52 {
53 __u8 struct_v = 1;
54 ::encode(struct_v, bl);
55 ::encode(last_clean_begin, bl);
56 ::encode(last_clean_end, bl);
57 ::encode(up_from, bl);
58 ::encode(up_thru, bl);
59 ::encode(down_at, bl);
60 ::encode(lost_at, bl);
61 }
62
63 void osd_info_t::decode(bufferlist::iterator& bl)
64 {
65 __u8 struct_v;
66 ::decode(struct_v, bl);
67 ::decode(last_clean_begin, bl);
68 ::decode(last_clean_end, bl);
69 ::decode(up_from, bl);
70 ::decode(up_thru, bl);
71 ::decode(down_at, bl);
72 ::decode(lost_at, bl);
73 }
74
75 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
76 {
77 o.push_back(new osd_info_t);
78 o.push_back(new osd_info_t);
79 o.back()->last_clean_begin = 1;
80 o.back()->last_clean_end = 2;
81 o.back()->up_from = 30;
82 o.back()->up_thru = 40;
83 o.back()->down_at = 5;
84 o.back()->lost_at = 6;
85 }
86
87 ostream& operator<<(ostream& out, const osd_info_t& info)
88 {
89 out << "up_from " << info.up_from
90 << " up_thru " << info.up_thru
91 << " down_at " << info.down_at
92 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
93 if (info.lost_at)
94 out << " lost_at " << info.lost_at;
95 return out;
96 }
97
98 // ----------------------------------
99 // osd_xinfo_t
100
101 void osd_xinfo_t::dump(Formatter *f) const
102 {
103 f->dump_stream("down_stamp") << down_stamp;
104 f->dump_float("laggy_probability", laggy_probability);
105 f->dump_int("laggy_interval", laggy_interval);
106 f->dump_int("features", features);
107 f->dump_unsigned("old_weight", old_weight);
108 }
109
110 void osd_xinfo_t::encode(bufferlist& bl) const
111 {
112 ENCODE_START(3, 1, bl);
113 ::encode(down_stamp, bl);
114 __u32 lp = laggy_probability * 0xfffffffful;
115 ::encode(lp, bl);
116 ::encode(laggy_interval, bl);
117 ::encode(features, bl);
118 ::encode(old_weight, bl);
119 ENCODE_FINISH(bl);
120 }
121
122 void osd_xinfo_t::decode(bufferlist::iterator& bl)
123 {
124 DECODE_START(3, bl);
125 ::decode(down_stamp, bl);
126 __u32 lp;
127 ::decode(lp, bl);
128 laggy_probability = (float)lp / (float)0xffffffff;
129 ::decode(laggy_interval, bl);
130 if (struct_v >= 2)
131 ::decode(features, bl);
132 else
133 features = 0;
134 if (struct_v >= 3)
135 ::decode(old_weight, bl);
136 else
137 old_weight = 0;
138 DECODE_FINISH(bl);
139 }
140
141 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
142 {
143 o.push_back(new osd_xinfo_t);
144 o.push_back(new osd_xinfo_t);
145 o.back()->down_stamp = utime_t(2, 3);
146 o.back()->laggy_probability = .123;
147 o.back()->laggy_interval = 123456;
148 o.back()->old_weight = 0x7fff;
149 }
150
151 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
152 {
153 return out << "down_stamp " << xi.down_stamp
154 << " laggy_probability " << xi.laggy_probability
155 << " laggy_interval " << xi.laggy_interval
156 << " old_weight " << xi.old_weight;
157 }
158
159 // ----------------------------------
160 // OSDMap::Incremental
161
162 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
163 {
164 int n = 0;
165 for (auto &weight : new_weight) {
166 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
167 n++; // marked out
168 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
169 n--; // marked in
170 }
171 return n;
172 }
173
174 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
175 {
176 int n = 0;
177 for (auto &state : new_state) { //
178 if (state.second & CEPH_OSD_UP) {
179 if (previous->is_up(state.first))
180 n++; // marked down
181 else
182 n--; // marked up
183 }
184 }
185 return n;
186 }
187
188 int OSDMap::Incremental::identify_osd(uuid_d u) const
189 {
190 for (auto &uuid : new_uuid)
191 if (uuid.second == u)
192 return uuid.first;
193 return -1;
194 }
195
196 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
197 const OSDMap& osdmap)
198 {
199 assert(epoch == osdmap.get_epoch() + 1);
200
201 for (auto &new_pool : new_pools) {
202 if (!new_pool.second.tiers.empty()) {
203 pg_pool_t& base = new_pool.second;
204
205 for (const auto &tier_pool : base.tiers) {
206 const auto &r = new_pools.find(tier_pool);
207 pg_pool_t *tier = 0;
208 if (r == new_pools.end()) {
209 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
210 if (!orig) {
211 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
212 return -EIO;
213 }
214 tier = get_new_pool(tier_pool, orig);
215 } else {
216 tier = &r->second;
217 }
218 if (tier->tier_of != new_pool.first) {
219 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
220 return -EIO;
221 }
222
223 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
224 << tier_pool << dendl;
225 tier->snap_seq = base.snap_seq;
226 tier->snap_epoch = base.snap_epoch;
227 tier->snaps = base.snaps;
228 tier->removed_snaps = base.removed_snaps;
229 }
230 }
231 }
232 return 0;
233 }
234
235
236 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
237 {
238 if (id >= 0)
239 return is_down(id);
240
241 if (down_cache &&
242 down_cache->count(id)) {
243 return true;
244 }
245
246 list<int> children;
247 crush->get_children(id, &children);
248 for (const auto &child : children) {
249 if (!subtree_is_down(child, down_cache)) {
250 return false;
251 }
252 }
253 if (down_cache) {
254 down_cache->insert(id);
255 }
256 return true;
257 }
258
259 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
260 {
261 // use a stack-local down_cache if we didn't get one from the
262 // caller. then at least this particular call will avoid duplicated
263 // work.
264 set<int> local_down_cache;
265 if (!down_cache) {
266 down_cache = &local_down_cache;
267 }
268
269 int current = id;
270 while (true) {
271 int type;
272 if (current >= 0) {
273 type = 0;
274 } else {
275 type = crush->get_bucket_type(current);
276 }
277 assert(type >= 0);
278
279 if (!subtree_is_down(current, down_cache)) {
280 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
281 return false;
282 }
283
284 // is this a big enough subtree to be marked as down?
285 if (type >= subtree_type) {
286 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
287 return true;
288 }
289
290 int r = crush->get_immediate_parent_id(current, &current);
291 if (r < 0) {
292 return false;
293 }
294 }
295 }
296
297 bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
298 set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const
299 {
300 if (id >= 0) {
301 bool is_down_ret = is_down(id);
302 if (!is_out(id)) {
303 if (is_down_ret) {
304 down_in_osds->insert(id);
305 } else {
306 up_in_osds->insert(id);
307 }
308 }
309 return is_down_ret;
310 }
311
312 if (subtree_type_down &&
313 (*subtree_type_down)[subtree_type].count(id)) {
314 return true;
315 }
316
317 list<int> children;
318 crush->get_children(id, &children);
319 for (const auto &child : children) {
320 if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
321 subtree_up->insert(id);
322 return false;
323 }
324 }
325 if (subtree_type_down) {
326 (*subtree_type_down)[subtree_type].insert(id);
327 }
328 return true;
329 }
330
331 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
332 {
333 __u16 v = 5;
334 ::encode(v, bl);
335 ::encode(fsid, bl);
336 ::encode(epoch, bl);
337 ::encode(modified, bl);
338 int32_t new_t = new_pool_max;
339 ::encode(new_t, bl);
340 ::encode(new_flags, bl);
341 ::encode(fullmap, bl);
342 ::encode(crush, bl);
343
344 ::encode(new_max_osd, bl);
345 // for ::encode(new_pools, bl);
346 __u32 n = new_pools.size();
347 ::encode(n, bl);
348 for (const auto &new_pool : new_pools) {
349 n = new_pool.first;
350 ::encode(n, bl);
351 ::encode(new_pool.second, bl, 0);
352 }
353 // for ::encode(new_pool_names, bl);
354 n = new_pool_names.size();
355 ::encode(n, bl);
356
357 for (const auto &new_pool_name : new_pool_names) {
358 n = new_pool_name.first;
359 ::encode(n, bl);
360 ::encode(new_pool_name.second, bl);
361 }
362 // for ::encode(old_pools, bl);
363 n = old_pools.size();
364 ::encode(n, bl);
365 for (auto &old_pool : old_pools) {
366 n = old_pool;
367 ::encode(n, bl);
368 }
369 ::encode(new_up_client, bl, 0);
370 {
371 // legacy is map<int32_t,uint8_t>
372 uint32_t n = new_state.size();
373 ::encode(n, bl);
374 for (auto p : new_state) {
375 ::encode(p.first, bl);
376 ::encode((uint8_t)p.second, bl);
377 }
378 }
379 ::encode(new_weight, bl);
380 // for ::encode(new_pg_temp, bl);
381 n = new_pg_temp.size();
382 ::encode(n, bl);
383
384 for (const auto &pg_temp : new_pg_temp) {
385 old_pg_t opg = pg_temp.first.get_old_pg();
386 ::encode(opg, bl);
387 ::encode(pg_temp.second, bl);
388 }
389 }
390
391 void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
392 {
393 if ((features & CEPH_FEATURE_PGID64) == 0) {
394 encode_client_old(bl);
395 return;
396 }
397
398 // base
399 __u16 v = 6;
400 ::encode(v, bl);
401 ::encode(fsid, bl);
402 ::encode(epoch, bl);
403 ::encode(modified, bl);
404 ::encode(new_pool_max, bl);
405 ::encode(new_flags, bl);
406 ::encode(fullmap, bl);
407 ::encode(crush, bl);
408
409 ::encode(new_max_osd, bl);
410 ::encode(new_pools, bl, features);
411 ::encode(new_pool_names, bl);
412 ::encode(old_pools, bl);
413 ::encode(new_up_client, bl, features);
414 {
415 uint32_t n = new_state.size();
416 ::encode(n, bl);
417 for (auto p : new_state) {
418 ::encode(p.first, bl);
419 ::encode((uint8_t)p.second, bl);
420 }
421 }
422 ::encode(new_weight, bl);
423 ::encode(new_pg_temp, bl);
424
425 // extended
426 __u16 ev = 10;
427 ::encode(ev, bl);
428 ::encode(new_hb_back_up, bl, features);
429 ::encode(new_up_thru, bl);
430 ::encode(new_last_clean_interval, bl);
431 ::encode(new_lost, bl);
432 ::encode(new_blacklist, bl, features);
433 ::encode(old_blacklist, bl, features);
434 ::encode(new_up_cluster, bl, features);
435 ::encode(cluster_snapshot, bl);
436 ::encode(new_uuid, bl);
437 ::encode(new_xinfo, bl);
438 ::encode(new_hb_front_up, bl, features);
439 }
440
441 void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
442 {
443 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
444 encode_classic(bl, features);
445 return;
446 }
447
448 // only a select set of callers should *ever* be encoding new
449 // OSDMaps. others should be passing around the canonical encoded
450 // buffers from on high. select out those callers by passing in an
451 // "impossible" feature bit.
452 assert(features & CEPH_FEATURE_RESERVED);
453 features &= ~CEPH_FEATURE_RESERVED;
454
455 size_t start_offset = bl.length();
456 size_t tail_offset;
457 buffer::list::iterator crc_it;
458
459 // meta-encoding: how we include client-used and osd-specific data
460 ENCODE_START(8, 7, bl);
461
462 {
463 uint8_t v = 5;
464 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
465 v = 3;
466 }
467 ENCODE_START(v, 1, bl); // client-usable data
468 ::encode(fsid, bl);
469 ::encode(epoch, bl);
470 ::encode(modified, bl);
471 ::encode(new_pool_max, bl);
472 ::encode(new_flags, bl);
473 ::encode(fullmap, bl);
474 ::encode(crush, bl);
475
476 ::encode(new_max_osd, bl);
477 ::encode(new_pools, bl, features);
478 ::encode(new_pool_names, bl);
479 ::encode(old_pools, bl);
480 ::encode(new_up_client, bl, features);
481 if (v >= 5) {
482 ::encode(new_state, bl);
483 } else {
484 uint32_t n = new_state.size();
485 ::encode(n, bl);
486 for (auto p : new_state) {
487 ::encode(p.first, bl);
488 ::encode((uint8_t)p.second, bl);
489 }
490 }
491 ::encode(new_weight, bl);
492 ::encode(new_pg_temp, bl);
493 ::encode(new_primary_temp, bl);
494 ::encode(new_primary_affinity, bl);
495 ::encode(new_erasure_code_profiles, bl);
496 ::encode(old_erasure_code_profiles, bl);
497 if (v >= 4) {
498 ::encode(new_pg_upmap, bl);
499 ::encode(old_pg_upmap, bl);
500 ::encode(new_pg_upmap_items, bl);
501 ::encode(old_pg_upmap_items, bl);
502 }
503 ENCODE_FINISH(bl); // client-usable data
504 }
505
506 {
507 uint8_t target_v = 6;
508 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
509 target_v = 2;
510 }
511 ENCODE_START(target_v, 1, bl); // extended, osd-only data
512 ::encode(new_hb_back_up, bl, features);
513 ::encode(new_up_thru, bl);
514 ::encode(new_last_clean_interval, bl);
515 ::encode(new_lost, bl);
516 ::encode(new_blacklist, bl, features);
517 ::encode(old_blacklist, bl, features);
518 ::encode(new_up_cluster, bl, features);
519 ::encode(cluster_snapshot, bl);
520 ::encode(new_uuid, bl);
521 ::encode(new_xinfo, bl);
522 ::encode(new_hb_front_up, bl, features);
523 ::encode(features, bl); // NOTE: features arg, not the member
524 if (target_v >= 3) {
525 ::encode(new_nearfull_ratio, bl);
526 ::encode(new_full_ratio, bl);
527 ::encode(new_backfillfull_ratio, bl);
528 }
529 // 5 was string-based new_require_min_compat_client
530 if (target_v >= 6) {
531 ::encode(new_require_min_compat_client, bl);
532 ::encode(new_require_osd_release, bl);
533 }
534 ENCODE_FINISH(bl); // osd-only data
535 }
536
537 ::encode((uint32_t)0, bl); // dummy inc_crc
538 crc_it = bl.end();
539 crc_it.advance(-4);
540 tail_offset = bl.length();
541
542 ::encode(full_crc, bl);
543
544 ENCODE_FINISH(bl); // meta-encoding wrapper
545
546 // fill in crc
547 bufferlist front;
548 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
549 inc_crc = front.crc32c(-1);
550 bufferlist tail;
551 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
552 inc_crc = tail.crc32c(inc_crc);
553 ceph_le32 crc_le;
554 crc_le = inc_crc;
555 crc_it.copy_in(4, (char*)&crc_le);
556 have_crc = true;
557 }
558
559 void OSDMap::Incremental::decode_classic(bufferlist::iterator &p)
560 {
561 __u32 n, t;
562 // base
563 __u16 v;
564 ::decode(v, p);
565 ::decode(fsid, p);
566 ::decode(epoch, p);
567 ::decode(modified, p);
568 if (v == 4 || v == 5) {
569 ::decode(n, p);
570 new_pool_max = n;
571 } else if (v >= 6)
572 ::decode(new_pool_max, p);
573 ::decode(new_flags, p);
574 ::decode(fullmap, p);
575 ::decode(crush, p);
576
577 ::decode(new_max_osd, p);
578 if (v < 6) {
579 new_pools.clear();
580 ::decode(n, p);
581 while (n--) {
582 ::decode(t, p);
583 ::decode(new_pools[t], p);
584 }
585 } else {
586 ::decode(new_pools, p);
587 }
588 if (v == 5) {
589 new_pool_names.clear();
590 ::decode(n, p);
591 while (n--) {
592 ::decode(t, p);
593 ::decode(new_pool_names[t], p);
594 }
595 } else if (v >= 6) {
596 ::decode(new_pool_names, p);
597 }
598 if (v < 6) {
599 old_pools.clear();
600 ::decode(n, p);
601 while (n--) {
602 ::decode(t, p);
603 old_pools.insert(t);
604 }
605 } else {
606 ::decode(old_pools, p);
607 }
608 ::decode(new_up_client, p);
609 {
610 map<int32_t,uint8_t> ns;
611 ::decode(ns, p);
612 for (auto q : ns) {
613 new_state[q.first] = q.second;
614 }
615 }
616 ::decode(new_weight, p);
617
618 if (v < 6) {
619 new_pg_temp.clear();
620 ::decode(n, p);
621 while (n--) {
622 old_pg_t opg;
623 ::decode_raw(opg, p);
624 ::decode(new_pg_temp[pg_t(opg)], p);
625 }
626 } else {
627 ::decode(new_pg_temp, p);
628 }
629
630 // decode short map, too.
631 if (v == 5 && p.end())
632 return;
633
634 // extended
635 __u16 ev = 0;
636 if (v >= 5)
637 ::decode(ev, p);
638 ::decode(new_hb_back_up, p);
639 if (v < 5)
640 ::decode(new_pool_names, p);
641 ::decode(new_up_thru, p);
642 ::decode(new_last_clean_interval, p);
643 ::decode(new_lost, p);
644 ::decode(new_blacklist, p);
645 ::decode(old_blacklist, p);
646 if (ev >= 6)
647 ::decode(new_up_cluster, p);
648 if (ev >= 7)
649 ::decode(cluster_snapshot, p);
650 if (ev >= 8)
651 ::decode(new_uuid, p);
652 if (ev >= 9)
653 ::decode(new_xinfo, p);
654 if (ev >= 10)
655 ::decode(new_hb_front_up, p);
656 }
657
658 void OSDMap::Incremental::decode(bufferlist::iterator& bl)
659 {
660 /**
661 * Older encodings of the Incremental had a single struct_v which
662 * covered the whole encoding, and was prior to our modern
663 * stuff which includes a compatv and a size. So if we see
664 * a struct_v < 7, we must rewind to the beginning and use our
665 * classic decoder.
666 */
667 size_t start_offset = bl.get_off();
668 size_t tail_offset = 0;
669 bufferlist crc_front, crc_tail;
670
671 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
672 if (struct_v < 7) {
673 int struct_v_size = sizeof(struct_v);
674 bl.advance(-struct_v_size);
675 decode_classic(bl);
676 encode_features = 0;
677 if (struct_v >= 6)
678 encode_features = CEPH_FEATURE_PGID64;
679 else
680 encode_features = 0;
681 return;
682 }
683 {
684 DECODE_START(5, bl); // client-usable data
685 ::decode(fsid, bl);
686 ::decode(epoch, bl);
687 ::decode(modified, bl);
688 ::decode(new_pool_max, bl);
689 ::decode(new_flags, bl);
690 ::decode(fullmap, bl);
691 ::decode(crush, bl);
692
693 ::decode(new_max_osd, bl);
694 ::decode(new_pools, bl);
695 ::decode(new_pool_names, bl);
696 ::decode(old_pools, bl);
697 ::decode(new_up_client, bl);
698 if (struct_v >= 5) {
699 ::decode(new_state, bl);
700 } else {
701 map<int32_t,uint8_t> ns;
702 ::decode(ns, bl);
703 for (auto q : ns) {
704 new_state[q.first] = q.second;
705 }
706 }
707 ::decode(new_weight, bl);
708 ::decode(new_pg_temp, bl);
709 ::decode(new_primary_temp, bl);
710 if (struct_v >= 2)
711 ::decode(new_primary_affinity, bl);
712 else
713 new_primary_affinity.clear();
714 if (struct_v >= 3) {
715 ::decode(new_erasure_code_profiles, bl);
716 ::decode(old_erasure_code_profiles, bl);
717 } else {
718 new_erasure_code_profiles.clear();
719 old_erasure_code_profiles.clear();
720 }
721 if (struct_v >= 4) {
722 ::decode(new_pg_upmap, bl);
723 ::decode(old_pg_upmap, bl);
724 ::decode(new_pg_upmap_items, bl);
725 ::decode(old_pg_upmap_items, bl);
726 }
727 DECODE_FINISH(bl); // client-usable data
728 }
729
730 {
731 DECODE_START(6, bl); // extended, osd-only data
732 ::decode(new_hb_back_up, bl);
733 ::decode(new_up_thru, bl);
734 ::decode(new_last_clean_interval, bl);
735 ::decode(new_lost, bl);
736 ::decode(new_blacklist, bl);
737 ::decode(old_blacklist, bl);
738 ::decode(new_up_cluster, bl);
739 ::decode(cluster_snapshot, bl);
740 ::decode(new_uuid, bl);
741 ::decode(new_xinfo, bl);
742 ::decode(new_hb_front_up, bl);
743 if (struct_v >= 2)
744 ::decode(encode_features, bl);
745 else
746 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
747 if (struct_v >= 3) {
748 ::decode(new_nearfull_ratio, bl);
749 ::decode(new_full_ratio, bl);
750 } else {
751 new_nearfull_ratio = -1;
752 new_full_ratio = -1;
753 }
754 if (struct_v >= 4) {
755 ::decode(new_backfillfull_ratio, bl);
756 } else {
757 new_backfillfull_ratio = -1;
758 }
759 if (struct_v == 5) {
760 string r;
761 ::decode(r, bl);
762 if (r.length()) {
763 new_require_min_compat_client = ceph_release_from_name(r.c_str());
764 }
765 }
766 if (struct_v >= 6) {
767 ::decode(new_require_min_compat_client, bl);
768 ::decode(new_require_osd_release, bl);
769 } else {
770 if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
771 // only for compat with post-kraken pre-luminous test clusters
772 new_require_osd_release = CEPH_RELEASE_LUMINOUS;
773 new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
774 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
775 new_require_osd_release = CEPH_RELEASE_KRAKEN;
776 } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
777 new_require_osd_release = CEPH_RELEASE_JEWEL;
778 } else {
779 new_require_osd_release = -1;
780 }
781 }
782 DECODE_FINISH(bl); // osd-only data
783 }
784
785 if (struct_v >= 8) {
786 have_crc = true;
787 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
788 ::decode(inc_crc, bl);
789 tail_offset = bl.get_off();
790 ::decode(full_crc, bl);
791 } else {
792 have_crc = false;
793 full_crc = 0;
794 inc_crc = 0;
795 }
796
797 DECODE_FINISH(bl); // wrapper
798
799 if (have_crc) {
800 // verify crc
801 uint32_t actual = crc_front.crc32c(-1);
802 if (tail_offset < bl.get_off()) {
803 bufferlist tail;
804 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
805 actual = tail.crc32c(actual);
806 }
807 if (inc_crc != actual) {
808 ostringstream ss;
809 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
810 string s = ss.str();
811 throw buffer::malformed_input(s.c_str());
812 }
813 }
814 }
815
816 void OSDMap::Incremental::dump(Formatter *f) const
817 {
818 f->dump_int("epoch", epoch);
819 f->dump_stream("fsid") << fsid;
820 f->dump_stream("modified") << modified;
821 f->dump_int("new_pool_max", new_pool_max);
822 f->dump_int("new_flags", new_flags);
823 f->dump_float("new_full_ratio", new_full_ratio);
824 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
825 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
826 f->dump_int("new_require_min_compat_client", new_require_min_compat_client);
827 f->dump_int("new_require_osd_release", new_require_osd_release);
828
829 if (fullmap.length()) {
830 f->open_object_section("full_map");
831 OSDMap full;
832 bufferlist fbl = fullmap; // kludge around constness.
833 auto p = fbl.begin();
834 full.decode(p);
835 full.dump(f);
836 f->close_section();
837 }
838 if (crush.length()) {
839 f->open_object_section("crush");
840 CrushWrapper c;
841 bufferlist tbl = crush; // kludge around constness.
842 auto p = tbl.begin();
843 c.decode(p);
844 c.dump(f);
845 f->close_section();
846 }
847
848 f->dump_int("new_max_osd", new_max_osd);
849
850 f->open_array_section("new_pools");
851
852 for (const auto &new_pool : new_pools) {
853 f->open_object_section("pool");
854 f->dump_int("pool", new_pool.first);
855 new_pool.second.dump(f);
856 f->close_section();
857 }
858 f->close_section();
859 f->open_array_section("new_pool_names");
860
861 for (const auto &new_pool_name : new_pool_names) {
862 f->open_object_section("pool_name");
863 f->dump_int("pool", new_pool_name.first);
864 f->dump_string("name", new_pool_name.second);
865 f->close_section();
866 }
867 f->close_section();
868 f->open_array_section("old_pools");
869
870 for (const auto &old_pool : old_pools)
871 f->dump_int("pool", old_pool);
872 f->close_section();
873
874 f->open_array_section("new_up_osds");
875
876 for (const auto &upclient : new_up_client) {
877 f->open_object_section("osd");
878 f->dump_int("osd", upclient.first);
879 f->dump_stream("public_addr") << upclient.second;
880 f->dump_stream("cluster_addr") << new_up_cluster.find(upclient.first)->second;
881 f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(upclient.first)->second;
882 map<int32_t, entity_addr_t>::const_iterator q;
883 if ((q = new_hb_front_up.find(upclient.first)) != new_hb_front_up.end())
884 f->dump_stream("heartbeat_front_addr") << q->second;
885 f->close_section();
886 }
887 f->close_section();
888
889 f->open_array_section("new_weight");
890
891 for (const auto &weight : new_weight) {
892 f->open_object_section("osd");
893 f->dump_int("osd", weight.first);
894 f->dump_int("weight", weight.second);
895 f->close_section();
896 }
897 f->close_section();
898
899 f->open_array_section("osd_state_xor");
900 for (const auto &ns : new_state) {
901 f->open_object_section("osd");
902 f->dump_int("osd", ns.first);
903 set<string> st;
904 calc_state_set(new_state.find(ns.first)->second, st);
905 f->open_array_section("state_xor");
906 for (auto &state : st)
907 f->dump_string("state", state);
908 f->close_section();
909 }
910 f->close_section();
911
912 f->open_array_section("new_pg_temp");
913
914 for (const auto &pg_temp : new_pg_temp) {
915 f->open_object_section("pg");
916 f->dump_stream("pgid") << pg_temp.first;
917 f->open_array_section("osds");
918
919 for (const auto &osd : pg_temp.second)
920 f->dump_int("osd", osd);
921 f->close_section();
922 f->close_section();
923 }
924 f->close_section();
925
926 f->open_array_section("primary_temp");
927
928 for (const auto &primary_temp : new_primary_temp) {
929 f->dump_stream("pgid") << primary_temp.first;
930 f->dump_int("osd", primary_temp.second);
931 }
932 f->close_section(); // primary_temp
933
934 f->open_array_section("new_pg_upmap");
935 for (auto& i : new_pg_upmap) {
936 f->open_object_section("mapping");
937 f->dump_stream("pgid") << i.first;
938 f->open_array_section("osds");
939 for (auto osd : i.second) {
940 f->dump_int("osd", osd);
941 }
942 f->close_section();
943 f->close_section();
944 }
945 f->close_section();
946 f->open_array_section("old_pg_upmap");
947 for (auto& i : old_pg_upmap) {
948 f->dump_stream("pgid") << i;
949 }
950 f->close_section();
951
952 f->open_array_section("new_pg_upmap_items");
953 for (auto& i : new_pg_upmap_items) {
954 f->open_object_section("mapping");
955 f->dump_stream("pgid") << i.first;
956 f->open_array_section("mappings");
957 for (auto& p : i.second) {
958 f->open_object_section("mapping");
959 f->dump_int("from", p.first);
960 f->dump_int("to", p.second);
961 f->close_section();
962 }
963 f->close_section();
964 f->close_section();
965 }
966 f->close_section();
967 f->open_array_section("old_pg_upmap_items");
968 for (auto& i : old_pg_upmap_items) {
969 f->dump_stream("pgid") << i;
970 }
971 f->close_section();
972
973 f->open_array_section("new_up_thru");
974
975 for (const auto &up_thru : new_up_thru) {
976 f->open_object_section("osd");
977 f->dump_int("osd", up_thru.first);
978 f->dump_int("up_thru", up_thru.second);
979 f->close_section();
980 }
981 f->close_section();
982
983 f->open_array_section("new_lost");
984
985 for (const auto &lost : new_lost) {
986 f->open_object_section("osd");
987 f->dump_int("osd", lost.first);
988 f->dump_int("epoch_lost", lost.second);
989 f->close_section();
990 }
991 f->close_section();
992
993 f->open_array_section("new_last_clean_interval");
994
995 for (const auto &last_clean_interval : new_last_clean_interval) {
996 f->open_object_section("osd");
997 f->dump_int("osd", last_clean_interval.first);
998 f->dump_int("first", last_clean_interval.second.first);
999 f->dump_int("last", last_clean_interval.second.second);
1000 f->close_section();
1001 }
1002 f->close_section();
1003
1004 f->open_array_section("new_blacklist");
1005 for (const auto &blist : new_blacklist) {
1006 stringstream ss;
1007 ss << blist.first;
1008 f->dump_stream(ss.str().c_str()) << blist.second;
1009 }
1010 f->close_section();
1011 f->open_array_section("old_blacklist");
1012 for (const auto &blist : old_blacklist)
1013 f->dump_stream("addr") << blist;
1014 f->close_section();
1015
1016 f->open_array_section("new_xinfo");
1017 for (const auto &xinfo : new_xinfo) {
1018 f->open_object_section("xinfo");
1019 f->dump_int("osd", xinfo.first);
1020 xinfo.second.dump(f);
1021 f->close_section();
1022 }
1023 f->close_section();
1024
1025 if (cluster_snapshot.size())
1026 f->dump_string("cluster_snapshot", cluster_snapshot);
1027
1028 f->open_array_section("new_uuid");
1029 for (const auto &uuid : new_uuid) {
1030 f->open_object_section("osd");
1031 f->dump_int("osd", uuid.first);
1032 f->dump_stream("uuid") << uuid.second;
1033 f->close_section();
1034 }
1035 f->close_section();
1036
1037 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
1038 f->open_array_section("old_erasure_code_profiles");
1039 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
1040 f->dump_string("old", erasure_code_profile.c_str());
1041 }
1042 f->close_section();
1043 }
1044
1045 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
1046 {
1047 o.push_back(new Incremental);
1048 }
1049
1050 // ----------------------------------
1051 // OSDMap
1052
1053 void OSDMap::set_epoch(epoch_t e)
1054 {
1055 epoch = e;
1056 for (auto &pool : pools)
1057 pool.second.last_change = e;
1058 }
1059
1060 bool OSDMap::is_blacklisted(const entity_addr_t& a) const
1061 {
1062 if (blacklist.empty())
1063 return false;
1064
1065 // this specific instance?
1066 if (blacklist.count(a))
1067 return true;
1068
1069 // is entire ip blacklisted?
1070 if (a.is_ip()) {
1071 entity_addr_t b = a;
1072 b.set_port(0);
1073 b.set_nonce(0);
1074 if (blacklist.count(b)) {
1075 return true;
1076 }
1077 }
1078
1079 return false;
1080 }
1081
1082 void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
1083 {
1084 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
1085 }
1086
1087 void OSDMap::get_blacklist(std::set<entity_addr_t> *bl) const
1088 {
1089 for (const auto &i : blacklist) {
1090 bl->insert(i.first);
1091 }
1092 }
1093
1094 void OSDMap::set_max_osd(int m)
1095 {
1096 int o = max_osd;
1097 max_osd = m;
1098 osd_state.resize(m);
1099 osd_weight.resize(m);
1100 for (; o<max_osd; o++) {
1101 osd_state[o] = 0;
1102 osd_weight[o] = CEPH_OSD_OUT;
1103 }
1104 osd_info.resize(m);
1105 osd_xinfo.resize(m);
1106 osd_addrs->client_addr.resize(m);
1107 osd_addrs->cluster_addr.resize(m);
1108 osd_addrs->hb_back_addr.resize(m);
1109 osd_addrs->hb_front_addr.resize(m);
1110 osd_uuid->resize(m);
1111 if (osd_primary_affinity)
1112 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1113
1114 calc_num_osds();
1115 }
1116
1117 int OSDMap::calc_num_osds()
1118 {
1119 num_osd = 0;
1120 num_up_osd = 0;
1121 num_in_osd = 0;
1122 for (int i=0; i<max_osd; i++) {
1123 if (osd_state[i] & CEPH_OSD_EXISTS) {
1124 ++num_osd;
1125 if (osd_state[i] & CEPH_OSD_UP) {
1126 ++num_up_osd;
1127 }
1128 if (get_weight(i) != CEPH_OSD_OUT) {
1129 ++num_in_osd;
1130 }
1131 }
1132 }
1133 return num_osd;
1134 }
1135
1136 void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
1137 {
1138 *full = 0;
1139 *backfill = 0;
1140 *nearfull = 0;
1141 for (int i = 0; i < max_osd; ++i) {
1142 if (exists(i) && is_up(i) && is_in(i)) {
1143 if (osd_state[i] & CEPH_OSD_FULL)
1144 ++(*full);
1145 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1146 ++(*backfill);
1147 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1148 ++(*nearfull);
1149 }
1150 }
1151 }
1152
1153 static bool get_osd_utilization(
1154 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1155 int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail)
1156 {
1157 auto p = osd_stat.find(id);
1158 if (p == osd_stat.end())
1159 return false;
1160 *kb = p->second.kb;
1161 *kb_used = p->second.kb_used;
1162 *kb_avail = p->second.kb_avail;
1163 return *kb > 0;
1164 }
1165
1166 void OSDMap::get_full_osd_util(
1167 const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
1168 map<int, float> *full, map<int, float> *backfill, map<int, float> *nearfull) const
1169 {
1170 full->clear();
1171 backfill->clear();
1172 nearfull->clear();
1173 for (int i = 0; i < max_osd; ++i) {
1174 if (exists(i) && is_up(i) && is_in(i)) {
1175 int64_t kb, kb_used, kb_avail;
1176 if (osd_state[i] & CEPH_OSD_FULL) {
1177 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1178 full->emplace(i, (float)kb_used / (float)kb);
1179 } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) {
1180 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1181 backfill->emplace(i, (float)kb_used / (float)kb);
1182 } else if (osd_state[i] & CEPH_OSD_NEARFULL) {
1183 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1184 nearfull->emplace(i, (float)kb_used / (float)kb);
1185 }
1186 }
1187 }
1188 }
1189
1190 void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
1191 set<int> *nearfull) const
1192 {
1193 full->clear();
1194 backfill->clear();
1195 nearfull->clear();
1196 for (int i = 0; i < max_osd; ++i) {
1197 if (exists(i) && is_up(i) && is_in(i)) {
1198 if (osd_state[i] & CEPH_OSD_FULL)
1199 full->emplace(i);
1200 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1201 backfill->emplace(i);
1202 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1203 nearfull->emplace(i);
1204 }
1205 }
1206 }
1207
1208 void OSDMap::get_all_osds(set<int32_t>& ls) const
1209 {
1210 for (int i=0; i<max_osd; i++)
1211 if (exists(i))
1212 ls.insert(i);
1213 }
1214
1215 void OSDMap::get_up_osds(set<int32_t>& ls) const
1216 {
1217 for (int i = 0; i < max_osd; i++) {
1218 if (is_up(i))
1219 ls.insert(i);
1220 }
1221 }
1222
1223 void OSDMap::get_out_osds(set<int32_t>& ls) const
1224 {
1225 for (int i = 0; i < max_osd; i++) {
1226 if (is_out(i))
1227 ls.insert(i);
1228 }
1229 }
1230
1231 void OSDMap::calc_state_set(int state, set<string>& st)
1232 {
1233 unsigned t = state;
1234 for (unsigned s = 1; t; s <<= 1) {
1235 if (t & s) {
1236 t &= ~s;
1237 st.insert(ceph_osd_state_name(s));
1238 }
1239 }
1240 }
1241
1242 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1243 {
1244 float max = 0;
1245 for (const auto &weight : weights) {
1246 if (weight.second > max)
1247 max = weight.second;
1248 }
1249
1250 for (const auto &weight : weights) {
1251 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1252 }
1253 }
1254
1255 int OSDMap::identify_osd(const entity_addr_t& addr) const
1256 {
1257 for (int i=0; i<max_osd; i++)
1258 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr))
1259 return i;
1260 return -1;
1261 }
1262
1263 int OSDMap::identify_osd(const uuid_d& u) const
1264 {
1265 for (int i=0; i<max_osd; i++)
1266 if (exists(i) && get_uuid(i) == u)
1267 return i;
1268 return -1;
1269 }
1270
1271 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1272 {
1273 for (int i=0; i<max_osd; i++)
1274 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr ||
1275 get_hb_back_addr(i) == addr || get_hb_front_addr(i) == addr))
1276 return i;
1277 return -1;
1278 }
1279
1280 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1281 {
1282 for (int i=0; i<max_osd; i++)
1283 if (exists(i) && (get_addr(i).is_same_host(ip) || get_cluster_addr(i).is_same_host(ip)))
1284 return i;
1285 return -1;
1286 }
1287
1288
1289 uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1290 {
1291 uint64_t features = 0; // things we actually have
1292 uint64_t mask = 0; // things we could have
1293
1294 if (crush->has_nondefault_tunables())
1295 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1296 if (crush->has_nondefault_tunables2())
1297 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1298 if (crush->has_nondefault_tunables3())
1299 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1300 if (crush->has_v4_buckets())
1301 features |= CEPH_FEATURE_CRUSH_V4;
1302 if (crush->has_nondefault_tunables5())
1303 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1304 if (crush->has_incompat_choose_args())
1305 features |= CEPH_FEATURE_CRUSH_CHOOSE_ARGS;
1306 mask |= CEPH_FEATURES_CRUSH;
1307
1308 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1309 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1310 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1311
1312 for (auto &pool: pools) {
1313 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1314 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1315 }
1316 if (pool.second.is_erasure() &&
1317 entity_type != CEPH_ENTITY_TYPE_CLIENT) { // not for clients
1318 features |= CEPH_FEATURE_OSD_ERASURE_CODES;
1319 }
1320 if (!pool.second.tiers.empty() ||
1321 pool.second.is_tier()) {
1322 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1323 }
1324 int ruleid = crush->find_rule(pool.second.get_crush_rule(),
1325 pool.second.get_type(),
1326 pool.second.get_size());
1327 if (ruleid >= 0) {
1328 if (crush->is_v2_rule(ruleid))
1329 features |= CEPH_FEATURE_CRUSH_V2;
1330 if (crush->is_v3_rule(ruleid))
1331 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1332 if (crush->is_v5_rule(ruleid))
1333 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1334 }
1335 }
1336 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1337 for (auto &erasure_code_profile : erasure_code_profiles) {
1338 auto& profile = erasure_code_profile.second;
1339 const auto& plugin = profile.find("plugin");
1340 if (plugin != profile.end()) {
1341 if (plugin->second == "isa" || plugin->second == "lrc")
1342 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
1343 if (plugin->second == "shec")
1344 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
1345 }
1346 }
1347 }
1348 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1349 if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
1350 mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
1351
1352 if (osd_primary_affinity) {
1353 for (int i = 0; i < max_osd; ++i) {
1354 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1355 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1356 break;
1357 }
1358 }
1359 }
1360 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1361
1362 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1363 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
1364 if (require_osd_release >= CEPH_RELEASE_JEWEL) {
1365 features |= jewel_features;
1366 }
1367 mask |= jewel_features;
1368
1369 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1370 | CEPH_FEATURE_MSG_ADDR2;
1371 if (require_osd_release >= CEPH_RELEASE_KRAKEN) {
1372 features |= kraken_features;
1373 }
1374 mask |= kraken_features;
1375 }
1376
1377 if (pmask)
1378 *pmask = mask;
1379 return features;
1380 }
1381
1382 uint8_t OSDMap::get_min_compat_client() const
1383 {
1384 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1385
1386 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1387 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
1388 return CEPH_RELEASE_LUMINOUS; // v12.2.0
1389 }
1390 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1391 return CEPH_RELEASE_JEWEL; // v10.2.0
1392 }
1393 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1394 return CEPH_RELEASE_HAMMER; // v0.94.0
1395 }
1396 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1397 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1398 HAVE_FEATURE(f, OSD_ERASURE_CODES) || // v0.73-498-gbfc86a8
1399 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1400 return CEPH_RELEASE_FIREFLY; // v0.80.0
1401 }
1402 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1403 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1404 return CEPH_RELEASE_DUMPLING; // v0.67.0
1405 }
1406 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1407 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
1408 }
1409 return CEPH_RELEASE_ARGONAUT; // v0.48argonaut-206-g6f381af
1410 }
1411
1412 void OSDMap::_calc_up_osd_features()
1413 {
1414 bool first = true;
1415 cached_up_osd_features = 0;
1416 for (int osd = 0; osd < max_osd; ++osd) {
1417 if (!is_up(osd))
1418 continue;
1419 const osd_xinfo_t &xi = get_xinfo(osd);
1420 if (first) {
1421 cached_up_osd_features = xi.features;
1422 first = false;
1423 } else {
1424 cached_up_osd_features &= xi.features;
1425 }
1426 }
1427 }
1428
// Return the feature word last stored in cached_up_osd_features (which
// _calc_up_osd_features() computes as the AND over all up osds).  Nothing
// is recomputed here.
1429 uint64_t OSDMap::get_up_osd_features() const
1430 {
1431 return cached_up_osd_features;
1432 }
1433
1434 void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1435 {
1436 if (o->epoch == n->epoch)
1437 return;
1438
1439 int diff = 0;
1440
1441 // do addrs match?
1442 if (o->max_osd != n->max_osd)
1443 diff++;
1444 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1445 if ( n->osd_addrs->client_addr[i] && o->osd_addrs->client_addr[i] &&
1446 *n->osd_addrs->client_addr[i] == *o->osd_addrs->client_addr[i])
1447 n->osd_addrs->client_addr[i] = o->osd_addrs->client_addr[i];
1448 else
1449 diff++;
1450 if ( n->osd_addrs->cluster_addr[i] && o->osd_addrs->cluster_addr[i] &&
1451 *n->osd_addrs->cluster_addr[i] == *o->osd_addrs->cluster_addr[i])
1452 n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
1453 else
1454 diff++;
1455 if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
1456 *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
1457 n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
1458 else
1459 diff++;
1460 if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
1461 *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
1462 n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
1463 else
1464 diff++;
1465 }
1466 if (diff == 0) {
1467 // zoinks, no differences at all!
1468 n->osd_addrs = o->osd_addrs;
1469 }
1470
1471 // does crush match?
1472 bufferlist oc, nc;
1473 ::encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1474 ::encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1475 if (oc.contents_equal(nc)) {
1476 n->crush = o->crush;
1477 }
1478
1479 // does pg_temp match?
1480 if (*o->pg_temp == *n->pg_temp)
1481 n->pg_temp = o->pg_temp;
1482
1483 // does primary_temp match?
1484 if (o->primary_temp->size() == n->primary_temp->size()) {
1485 if (*o->primary_temp == *n->primary_temp)
1486 n->primary_temp = o->primary_temp;
1487 }
1488
1489 // do uuids match?
1490 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1491 *o->osd_uuid == *n->osd_uuid)
1492 n->osd_uuid = o->osd_uuid;
1493 }
1494
// Queue removals in *pending_inc for pg_temp and primary_temp entries that
// no longer serve a purpose.  The decision is made against a scratch copy
// of osdmap with *pending_inc already applied; osdmap itself is only read.
//
// Removal encoding (as consumed by apply_incremental): an empty vector in
// new_pg_temp erases the pg_temp entry, and -1 in new_primary_temp erases
// the primary_temp entry.
1495 void OSDMap::clean_temps(CephContext *cct,
1496 const OSDMap& osdmap, Incremental *pending_inc)
1497 {
1498 ldout(cct, 10) << __func__ << dendl;
// tmpmap = osdmap + pending_inc, i.e. the state after the pending change
1499 OSDMap tmpmap;
1500 tmpmap.deepish_copy_from(osdmap);
1501 tmpmap.apply_incremental(*pending_inc);
1502
1503 for (auto pg : *tmpmap.pg_temp) {
1504 // if pool does not exist, remove any existing pg_temps associated with
1505 // it. we don't care about pg_temps on the pending_inc either; if there
1506 // are new_pg_temp entries on the pending, clear them out just as well.
1507 if (!osdmap.have_pg_pool(pg.first.pool())) {
1508 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1509 << " for nonexistent pool " << pg.first.pool() << dendl;
1510 pending_inc->new_pg_temp[pg.first].clear();
1511 continue;
1512 }
1513 // all osds down?
// num_up only needs to tell "none" from "at least one", hence the break
1514 unsigned num_up = 0;
1515 for (auto o : pg.second) {
1516 if (!tmpmap.is_down(o)) {
1517 ++num_up;
1518 break;
1519 }
1520 }
1521 if (num_up == 0) {
1522 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1523 << " with all down osds" << pg.second << dendl;
1524 pending_inc->new_pg_temp[pg.first].clear();
1525 continue;
1526 }
1527 // redundant pg_temp?
// the entry is redundant when it equals the up set computed without temps
1528 vector<int> raw_up;
1529 int primary;
1530 tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
1531 if (vectors_equal(raw_up, pg.second)) {
1532 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1533 << pg.second << " that matches raw_up mapping" << dendl;
// if the base map already has the entry, queue an explicit removal;
// otherwise it only exists in pending_inc, so just drop the pending add
1534 if (osdmap.pg_temp->count(pg.first))
1535 pending_inc->new_pg_temp[pg.first].clear();
1536 else
1537 pending_inc->new_pg_temp.erase(pg.first);
1538 }
1539 }
1540
1541 for (auto &pg : *tmpmap.primary_temp) {
1542 // primary down?
1543 if (tmpmap.is_down(pg.second)) {
1544 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1545 << " to down " << pg.second << dendl;
1546 pending_inc->new_primary_temp[pg.first] = -1;
1547 continue;
1548 }
1549 // redundant primary_temp?
// compare the acting primary (temps applied) with the primary computed by
// pg_to_raw_up (no temps); equal means the primary_temp changes nothing
1550 vector<int> real_up, templess_up;
1551 int real_primary, templess_primary;
1552 pg_t pgid = pg.first;
1553 tmpmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1554 tmpmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
1555 if (real_primary == templess_primary){
1556 ldout(cct, 10) << __func__ << " removing primary_temp "
1557 << pgid << " -> " << real_primary
1558 << " (unnecessary/redundant)" << dendl;
// same base-map vs pending-only distinction as for pg_temp above
1559 if (osdmap.primary_temp->count(pgid))
1560 pending_inc->new_primary_temp[pgid] = -1;
1561 else
1562 pending_inc->new_primary_temp.erase(pgid);
1563 }
1564 }
1565 }
1566
// Apply one Incremental to this map, advancing epoch by exactly one.
//
// Returns -EINVAL when inc.fsid differs from the map's fsid (an incremental
// for epoch 1 instead sets the fsid), 0 otherwise.  inc.epoch must equal
// epoch+1 (asserted).  If inc carries a full map, that map is decoded over
// this object and none of the individual fields below are applied.
// The cached osd counts and up-osd features are recomputed at the end.
1567 int OSDMap::apply_incremental(const Incremental &inc)
1568 {
1569 new_blacklist_entries = false;
1570 if (inc.epoch == 1)
1571 fsid = inc.fsid;
1572 else if (inc.fsid != fsid)
1573 return -EINVAL;
1574
1575 assert(inc.epoch == epoch+1);
1576
1577 epoch++;
1578 modified = inc.modified;
1579
1580 // full map?
1581 if (inc.fullmap.length()) {
1582 bufferlist bl(inc.fullmap);
1583 decode(bl);
1584 return 0;
1585 }
1586
1587 // nope, incremental.
// a negative new_flags means "flags unchanged"
1588 if (inc.new_flags >= 0) {
1589 flags = inc.new_flags;
1590 // the below is just to cover a newly-upgraded luminous mon
1591 // cluster that has to set require_jewel_osds or
1592 // require_kraken_osds before the osds can be upgraded to
1593 // luminous.
1594 if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
1595 if (require_osd_release < CEPH_RELEASE_KRAKEN) {
1596 require_osd_release = CEPH_RELEASE_KRAKEN;
1597 }
1598 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
1599 if (require_osd_release < CEPH_RELEASE_JEWEL) {
1600 require_osd_release = CEPH_RELEASE_JEWEL;
1601 }
1602 }
1603 }
1604
// negative new_max_osd / new_pool_max of -1 mean "unchanged"
1605 if (inc.new_max_osd >= 0)
1606 set_max_osd(inc.new_max_osd);
1607
1608 if (inc.new_pool_max != -1)
1609 pool_max = inc.new_pool_max;
1610
// new or updated pools are stamped with the epoch being applied
1611 for (const auto &pool : inc.new_pools) {
1612 pools[pool.first] = pool.second;
1613 pools[pool.first].last_change = epoch;
1614 }
1615
// keep pool_name (id -> name) and name_pool (name -> id) in step; on a
// rename the old name is dropped from name_pool first
1616 for (const auto &pname : inc.new_pool_names) {
1617 auto pool_name_entry = pool_name.find(pname.first);
1618 if (pool_name_entry != pool_name.end()) {
1619 name_pool.erase(pool_name_entry->second);
1620 pool_name_entry->second = pname.second;
1621 } else {
1622 pool_name[pname.first] = pname.second;
1623 }
1624 name_pool[pname.second] = pname.first;
1625 }
1626
// deleted pools: remove from all three tables
1627 for (const auto &pool : inc.old_pools) {
1628 pools.erase(pool);
1629 name_pool.erase(pool_name[pool]);
1630 pool_name.erase(pool);
1631 }
1632
1633 for (const auto &weight : inc.new_weight) {
1634 set_weight(weight.first, weight.second);
1635
1636 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1637 // xinfo old_weight.
1638 if (weight.second) {
1639 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
1640 osd_xinfo[weight.first].old_weight = 0;
1641 }
1642 }
1643
1644 for (const auto &primary_affinity : inc.new_primary_affinity) {
1645 set_primary_affinity(primary_affinity.first, primary_affinity.second);
1646 }
1647
1648 // erasure_code_profiles
1649 for (const auto &profile : inc.old_erasure_code_profiles)
1650 erasure_code_profiles.erase(profile);
1651
1652 for (const auto &profile : inc.new_erasure_code_profiles) {
1653 set_erasure_code_profile(profile.first, profile.second);
1654 }
1655
1656 // up/down
// new_state holds bits to XOR into osd_state; a value of 0 is treated as
// CEPH_OSD_UP.  Toggling a bit that is currently set clears it, which is
// how "mark down" (UP) and "destroy" (EXISTS) are expressed.
1657 for (const auto &state : inc.new_state) {
1658 const auto osd = state.first;
1659 int s = state.second ? state.second : CEPH_OSD_UP;
// currently up and UP is being toggled: record when it went down
1660 if ((osd_state[osd] & CEPH_OSD_UP) &&
1661 (s & CEPH_OSD_UP)) {
1662 osd_info[osd].down_at = epoch;
1663 osd_xinfo[osd].down_stamp = modified;
1664 }
1665 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
1666 (s & CEPH_OSD_EXISTS)) {
1667 // osd is destroyed; clear out anything interesting.
1668 (*osd_uuid)[osd] = uuid_d();
1669 osd_info[osd] = osd_info_t();
1670 osd_xinfo[osd] = osd_xinfo_t();
1671 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1672 osd_addrs->client_addr[osd].reset(new entity_addr_t());
1673 osd_addrs->cluster_addr[osd].reset(new entity_addr_t());
1674 osd_addrs->hb_front_addr[osd].reset(new entity_addr_t());
1675 osd_addrs->hb_back_addr[osd].reset(new entity_addr_t());
1676 osd_state[osd] = 0;
1677 } else {
1678 osd_state[osd] ^= s;
1679 }
1680 }
1681
// osds coming up: set EXISTS|UP, install their addresses, stamp up_from
1682 for (const auto &client : inc.new_up_client) {
1683 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1684 osd_addrs->client_addr[client.first].reset(new entity_addr_t(client.second));
1685 if (inc.new_hb_back_up.empty())
1686 osd_addrs->hb_back_addr[client.first].reset(new entity_addr_t(client.second)); //this is a backward-compatibility hack
1687 else
// NOTE(review): the find() result is dereferenced without an end() check;
// this presumably relies on new_hb_back_up having an entry for every osd
// in new_up_client whenever it is non-empty -- confirm with the producer.
1688 osd_addrs->hb_back_addr[client.first].reset(
1689 new entity_addr_t(inc.new_hb_back_up.find(client.first)->second));
// the front heartbeat address is optional; absent means a null pointer
1690 const auto j = inc.new_hb_front_up.find(client.first);
1691 if (j != inc.new_hb_front_up.end())
1692 osd_addrs->hb_front_addr[client.first].reset(new entity_addr_t(j->second));
1693 else
1694 osd_addrs->hb_front_addr[client.first].reset();
1695
1696 osd_info[client.first].up_from = epoch;
1697 }
1698
1699 for (const auto &cluster : inc.new_up_cluster)
1700 osd_addrs->cluster_addr[cluster.first].reset(new entity_addr_t(cluster.second));
1701
1702 // info
1703 for (const auto &thru : inc.new_up_thru)
1704 osd_info[thru.first].up_thru = thru.second;
1705
1706 for (const auto &interval : inc.new_last_clean_interval) {
1707 osd_info[interval.first].last_clean_begin = interval.second.first;
1708 osd_info[interval.first].last_clean_end = interval.second.second;
1709 }
1710
1711 for (const auto &lost : inc.new_lost)
1712 osd_info[lost.first].lost_at = lost.second;
1713
1714 // xinfo
1715 for (const auto &xinfo : inc.new_xinfo)
1716 osd_xinfo[xinfo.first] = xinfo.second;
1717
1718 // uuid
1719 for (const auto &uuid : inc.new_uuid)
1720 (*osd_uuid)[uuid.first] = uuid.second;
1721
1722 // pg rebuild
// an empty vector is the "remove this pg_temp" marker
1723 for (const auto &pg : inc.new_pg_temp) {
1724 if (pg.second.empty())
1725 pg_temp->erase(pg.first);
1726 else
1727 pg_temp->set(pg.first, pg.second);
1728 }
1729 if (!inc.new_pg_temp.empty()) {
1730 // make sure pg_temp is efficiently stored
1731 pg_temp->rebuild();
1732 }
1733
// -1 is the "remove this primary_temp" marker
1734 for (const auto &pg : inc.new_primary_temp) {
1735 if (pg.second == -1)
1736 primary_temp->erase(pg.first);
1737 else
1738 (*primary_temp)[pg.first] = pg.second;
1739 }
1740
// explicit pg mappings: additions are applied before removals in each pair
1741 for (auto& p : inc.new_pg_upmap) {
1742 pg_upmap[p.first] = p.second;
1743 }
1744 for (auto& pg : inc.old_pg_upmap) {
1745 pg_upmap.erase(pg);
1746 }
1747 for (auto& p : inc.new_pg_upmap_items) {
1748 pg_upmap_items[p.first] = p.second;
1749 }
1750 for (auto& pg : inc.old_pg_upmap_items) {
1751 pg_upmap_items.erase(pg);
1752 }
1753
1754 // blacklist
// new_blacklist_entries was reset at the top; it is set only by additions
1755 if (!inc.new_blacklist.empty()) {
1756 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
1757 new_blacklist_entries = true;
1758 }
1759 for (const auto &addr : inc.old_blacklist)
1760 blacklist.erase(addr);
1761
1762 // cluster snapshot?
// an incremental without a snapshot name clears any previous one
1763 if (inc.cluster_snapshot.length()) {
1764 cluster_snapshot = inc.cluster_snapshot;
1765 cluster_snapshot_epoch = inc.epoch;
1766 } else {
1767 cluster_snapshot.clear();
1768 cluster_snapshot_epoch = 0;
1769 }
1770
// negative ratios mean "unchanged"
1771 if (inc.new_nearfull_ratio >= 0) {
1772 nearfull_ratio = inc.new_nearfull_ratio;
1773 }
1774 if (inc.new_backfillfull_ratio >= 0) {
1775 backfillfull_ratio = inc.new_backfillfull_ratio;
1776 }
1777 if (inc.new_full_ratio >= 0) {
1778 full_ratio = inc.new_full_ratio;
1779 }
1780 if (inc.new_require_min_compat_client > 0) {
1781 require_min_compat_client = inc.new_require_min_compat_client;
1782 }
1783 if (inc.new_require_osd_release >= 0) {
1784 require_osd_release = inc.new_require_osd_release;
// from luminous on, the legacy require flags are dropped from flags
1785 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1786 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
1787 }
1788 }
1789
1790 // do new crush map last (after up/down stuff)
1791 if (inc.crush.length()) {
1792 bufferlist bl(inc.crush);
1793 auto blp = bl.begin();
1794 crush.reset(new CrushWrapper);
1795 crush->decode(blp);
1796 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1797 // only increment if this is a luminous-encoded osdmap, lest
1798 // the mon's crush_version diverge from what the osds or others
1799 // are decoding and applying on their end. if we won't encode
1800 // it in the canonical version, don't change it.
1801 ++crush_version;
1802 }
1803 }
1804
// refresh the values derived from osd_state / osd_xinfo
1805 calc_num_osds();
1806 _calc_up_osd_features();
1807 return 0;
1808 }
1809
1810 // mapping
1811 int OSDMap::map_to_pg(
1812 int64_t poolid,
1813 const string& name,
1814 const string& key,
1815 const string& nspace,
1816 pg_t *pg) const
1817 {
1818 // calculate ps (placement seed)
1819 const pg_pool_t *pool = get_pg_pool(poolid);
1820 if (!pool)
1821 return -ENOENT;
1822 ps_t ps;
1823 if (!key.empty())
1824 ps = pool->hash_key(key, nspace);
1825 else
1826 ps = pool->hash_key(name, nspace);
1827 *pg = pg_t(ps, poolid);
1828 return 0;
1829 }
1830
1831 int OSDMap::object_locator_to_pg(
1832 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
1833 {
1834 if (loc.hash >= 0) {
1835 if (!get_pg_pool(loc.get_pool())) {
1836 return -ENOENT;
1837 }
1838 pg = pg_t(loc.hash, loc.get_pool());
1839 return 0;
1840 }
1841 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
1842 }
1843
1844 ceph_object_layout OSDMap::make_object_layout(
1845 object_t oid, int pg_pool, string nspace) const
1846 {
1847 object_locator_t loc(pg_pool, nspace);
1848
1849 ceph_object_layout ol;
1850 pg_t pgid = object_locator_to_pg(oid, loc);
1851 ol.ol_pgid = pgid.get_old_pg().v;
1852 ol.ol_stripe_unit = 0;
1853 return ol;
1854 }
1855
1856 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
1857 vector<int>& osds) const
1858 {
1859 if (pool.can_shift_osds()) {
1860 unsigned removed = 0;
1861 for (unsigned i = 0; i < osds.size(); i++) {
1862 if (!exists(osds[i])) {
1863 removed++;
1864 continue;
1865 }
1866 if (removed) {
1867 osds[i - removed] = osds[i];
1868 }
1869 }
1870 if (removed)
1871 osds.resize(osds.size() - removed);
1872 } else {
1873 for (auto& osd : osds) {
1874 if (!exists(osd))
1875 osd = CRUSH_ITEM_NONE;
1876 }
1877 }
1878 }
1879
1880 void OSDMap::_pg_to_raw_osds(
1881 const pg_pool_t& pool, pg_t pg,
1882 vector<int> *osds,
1883 ps_t *ppps) const
1884 {
1885 // map to osds[]
1886 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
1887 unsigned size = pool.get_size();
1888
1889 // what crush rule?
1890 int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
1891 if (ruleno >= 0)
1892 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
1893
1894 _remove_nonexistent_osds(pool, *osds);
1895
1896 if (ppps)
1897 *ppps = pps;
1898 }
1899
1900 int OSDMap::_pick_primary(const vector<int>& osds) const
1901 {
1902 for (auto osd : osds) {
1903 if (osd != CRUSH_ITEM_NONE) {
1904 return osd;
1905 }
1906 }
1907 return -1;
1908 }
1909
1910 void OSDMap::_apply_remap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
1911 {
1912 pg_t pg = pi.raw_pg_to_pg(raw_pg);
1913 auto p = pg_upmap.find(pg);
1914 if (p != pg_upmap.end()) {
1915 // make sure targets aren't marked out
1916 for (auto osd : p->second) {
1917 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd_weight[osd] == 0) {
1918 // reject/ignore the explicit mapping
1919 return;
1920 }
1921 }
1922 *raw = vector<int>(p->second.begin(), p->second.end());
1923 return;
1924 }
1925
1926 auto q = pg_upmap_items.find(pg);
1927 if (q != pg_upmap_items.end()) {
1928 // NOTE: this approach does not allow a bidirectional swap,
1929 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
1930 for (auto& r : q->second) {
1931 // make sure the replacement value doesn't already appear
1932 bool exists = false;
1933 ssize_t pos = -1;
1934 for (unsigned i = 0; i < raw->size(); ++i) {
1935 int osd = (*raw)[i];
1936 if (osd == r.second) {
1937 exists = true;
1938 break;
1939 }
1940 // ignore mapping if target is marked out (or invalid osd id)
1941 if (osd == r.first &&
1942 pos < 0 &&
1943 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
1944 osd_weight[r.second] == 0)) {
1945 pos = i;
1946 }
1947 }
1948 if (!exists && pos >= 0) {
1949 (*raw)[pos] = r.second;
1950 return;
1951 }
1952 }
1953 }
1954 }
1955
1956 // pg -> (up osd list)
1957 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
1958 vector<int> *up) const
1959 {
1960 if (pool.can_shift_osds()) {
1961 // shift left
1962 up->clear();
1963 up->reserve(raw.size());
1964 for (unsigned i=0; i<raw.size(); i++) {
1965 if (!exists(raw[i]) || is_down(raw[i]))
1966 continue;
1967 up->push_back(raw[i]);
1968 }
1969 } else {
1970 // set down/dne devices to NONE
1971 up->resize(raw.size());
1972 for (int i = raw.size() - 1; i >= 0; --i) {
1973 if (!exists(raw[i]) || is_down(raw[i])) {
1974 (*up)[i] = CRUSH_ITEM_NONE;
1975 } else {
1976 (*up)[i] = raw[i];
1977 }
1978 }
1979 }
1980 }
1981
// Possibly choose a different primary for a pg based on per-osd primary
// affinity.  Does nothing (osds and *primary untouched) when there is no
// affinity table, when every listed osd has the default affinity, or when
// the list contains no real osd.  Otherwise *primary is overwritten and,
// for pools that can shift osds, the chosen osd is moved to the front of
// *osds.
//   seed    - per-pg value mixed into the hash together with the osd id
//   pool    - consulted only for can_shift_osds()
//   osds    - candidate list; may contain CRUSH_ITEM_NONE
//   primary - in/out: current primary, replaced when a choice is made
1982 void OSDMap::_apply_primary_affinity(ps_t seed,
1983 const pg_pool_t& pool,
1984 vector<int> *osds,
1985 int *primary) const
1986 {
1987 // do we have any non-default primary_affinity values for these osds?
1988 if (!osd_primary_affinity)
1989 return;
1990
1991 bool any = false;
1992 for (const auto osd : *osds) {
1993 if (osd != CRUSH_ITEM_NONE &&
1994 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1995 any = true;
1996 break;
1997 }
1998 }
1999 if (!any)
2000 return;
2001
2002 // pick the primary. feed both the seed (for the pg) and the osd
2003 // into the hash/rng so that a proportional fraction of an osd's pgs
2004 // get rejected as primary.
// pos = index of the chosen osd; stays -1 if the list has no real osd
2005 int pos = -1;
2006 for (unsigned i = 0; i < osds->size(); ++i) {
2007 int o = (*osds)[i];
2008 if (o == CRUSH_ITEM_NONE)
2009 continue;
2010 unsigned a = (*osd_primary_affinity)[o];
// reject this osd when its affinity is below the maximum and the upper
// 16 bits of hash(seed, osd) are >= the affinity value
2011 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2012 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2013 seed, o) >> 16) >= a) {
2014 // we chose not to use this primary. note it anyway as a
2015 // fallback in case we don't pick anyone else, but keep looking.
2016 if (pos < 0)
2017 pos = i;
2018 } else {
// accepted: take it and stop looking
2019 pos = i;
2020 break;
2021 }
2022 }
2023 if (pos < 0)
2024 return;
2025
2026 *primary = (*osds)[pos];
2027
2028 if (pool.can_shift_osds() && pos > 0) {
2029 // move the new primary to the front.
// entries [0, pos) each move one slot to the right, keeping their order
2030 for (int i = pos; i > 0; --i) {
2031 (*osds)[i] = (*osds)[i-1];
2032 }
2033 (*osds)[0] = *primary;
2034 }
2035 }
2036
2037 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
2038 vector<int> *temp_pg, int *temp_primary) const
2039 {
2040 pg = pool.raw_pg_to_pg(pg);
2041 const auto p = pg_temp->find(pg);
2042 temp_pg->clear();
2043 if (p != pg_temp->end()) {
2044 for (unsigned i=0; i<p->second.size(); i++) {
2045 if (!exists(p->second[i]) || is_down(p->second[i])) {
2046 if (pool.can_shift_osds()) {
2047 continue;
2048 } else {
2049 temp_pg->push_back(CRUSH_ITEM_NONE);
2050 }
2051 } else {
2052 temp_pg->push_back(p->second[i]);
2053 }
2054 }
2055 }
2056 const auto &pp = primary_temp->find(pg);
2057 *temp_primary = -1;
2058 if (pp != primary_temp->end()) {
2059 *temp_primary = pp->second;
2060 } else if (!temp_pg->empty()) { // apply pg_temp's primary
2061 for (unsigned i = 0; i < temp_pg->size(); ++i) {
2062 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
2063 *temp_primary = (*temp_pg)[i];
2064 break;
2065 }
2066 }
2067 }
2068 }
2069
2070 void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
2071 {
2072 *primary = -1;
2073 raw->clear();
2074 const pg_pool_t *pool = get_pg_pool(pg.pool());
2075 if (!pool)
2076 return;
2077 _pg_to_raw_osds(*pool, pg, raw, NULL);
2078 if (primary)
2079 *primary = _pick_primary(*raw);
2080 }
2081
2082 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
2083 {
2084 const pg_pool_t *pool = get_pg_pool(pg.pool());
2085 if (!pool) {
2086 if (primary)
2087 *primary = -1;
2088 if (up)
2089 up->clear();
2090 return;
2091 }
2092 vector<int> raw;
2093 ps_t pps;
2094 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2095 _apply_remap(*pool, pg, &raw);
2096 _raw_to_up_osds(*pool, raw, up);
2097 *primary = _pick_primary(raw);
2098 _apply_primary_affinity(pps, *pool, up, primary);
2099 }
2100
// Compute the up and acting sets (and their primaries) for a pg.  Every
// output pointer is optional; null ones are skipped.
//
// For an unknown pool, or when raw_pg_to_pg is false and pg.ps() is not
// below the pool's pg_num, the outputs are cleared / set to -1.
// Otherwise the acting set comes from pg_temp / primary_temp when present
// and falls back to the up set.
2101 void OSDMap::_pg_to_up_acting_osds(
2102 const pg_t& pg, vector<int> *up, int *up_primary,
2103 vector<int> *acting, int *acting_primary,
2104 bool raw_pg_to_pg) const
2105 {
2106 const pg_pool_t *pool = get_pg_pool(pg.pool());
2107 if (!pool ||
2108 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
2109 if (up)
2110 up->clear();
2111 if (up_primary)
2112 *up_primary = -1;
2113 if (acting)
2114 acting->clear();
2115 if (acting_primary)
2116 *acting_primary = -1;
2117 return;
2118 }
2119 vector<int> raw;
2120 vector<int> _up;
2121 vector<int> _acting;
2122 int _up_primary;
2123 int _acting_primary;
2124 ps_t pps;
// temps first: they decide whether the up set has to be computed at all
2125 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
// the crush-based up set is needed when the caller asked for it or when
// there is no pg_temp to serve as the acting set
2126 if (_acting.empty() || up || up_primary) {
2127 _pg_to_raw_osds(*pool, pg, &raw, &pps);
2128 _apply_remap(*pool, pg, &raw);
2129 _raw_to_up_osds(*pool, raw, &_up);
2130 _up_primary = _pick_primary(_up);
2131 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
2132 if (_acting.empty()) {
// no pg_temp: acting follows up; a primary_temp (if any) still wins
2133 _acting = _up;
2134 if (_acting_primary == -1) {
2135 _acting_primary = _up_primary;
2136 }
2137 }
2138
// _up is swapped out only after it has been copied into _acting above
2139 if (up)
2140 up->swap(_up);
2141 if (up_primary)
2142 *up_primary = _up_primary;
2143 }
2144
2145 if (acting)
2146 acting->swap(_acting);
2147 if (acting_primary)
2148 *acting_primary = _acting_primary;
2149 }
2150
2151 int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
2152 {
2153 if (!nrep)
2154 nrep = acting.size();
2155 for (int i=0; i<nrep; i++)
2156 if (acting[i] == osd)
2157 return i;
2158 return -1;
2159 }
2160
// Forwards to calc_pg_rank() with the same arguments and returns its
// result unchanged.
2161 int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2162 {
2163 return calc_pg_rank(osd, acting, nrep);
2164 }
2165
2166 bool OSDMap::primary_changed(
2167 int oldprimary,
2168 const vector<int> &oldacting,
2169 int newprimary,
2170 const vector<int> &newacting)
2171 {
2172 if (oldacting.empty() && newacting.empty())
2173 return false; // both still empty
2174 if (oldacting.empty() ^ newacting.empty())
2175 return true; // was empty, now not, or vice versa
2176 if (oldprimary != newprimary)
2177 return true; // primary changed
2178 if (calc_pg_rank(oldprimary, oldacting) !=
2179 calc_pg_rank(newprimary, newacting))
2180 return true;
2181 return false; // same primary (tho replicas may have changed)
2182 }
2183
2184
2185 // serialize, unserialize
// Append the version-5 encoding of this map to bl.  Containers whose native
// encoding would differ are written by hand: pool ids, pool-name ids and
// pool_max go through a __u32, osd_state entries are written as uint8_t,
// and pg_temp keys are converted with get_old_pg().  The field order below
// is the wire format; do not reorder.
2186 void OSDMap::encode_client_old(bufferlist& bl) const
2187 {
2188 __u16 v = 5;
2189 ::encode(v, bl);
2190
2191 // base
2192 ::encode(fsid, bl);
2193 ::encode(epoch, bl);
2194 ::encode(created, bl);
2195 ::encode(modified, bl);
2196
2197 // for ::encode(pools, bl);
// count, then (id as __u32, pool encoded with feature bits 0) per pool
2198 __u32 n = pools.size();
2199 ::encode(n, bl);
2200
2201 for (const auto &pool : pools) {
2202 n = pool.first;
2203 ::encode(n, bl);
2204 ::encode(pool.second, bl, 0);
2205 }
2206 // for ::encode(pool_name, bl);
// count, then (id as __u32, name) per entry
2207 n = pool_name.size();
2208 ::encode(n, bl);
2209 for (const auto &pname : pool_name) {
2210 n = pname.first;
2211 ::encode(n, bl);
2212 ::encode(pname.second, bl);
2213 }
2214 // for ::encode(pool_max, bl);
2215 n = pool_max;
2216 ::encode(n, bl);
2217
2218 ::encode(flags, bl);
2219
2220 ::encode(max_osd, bl);
2221 {
// osd_state: count followed by one byte per osd (value cast to uint8_t)
2222 uint32_t n = osd_state.size();
2223 ::encode(n, bl);
2224 for (auto s : osd_state) {
2225 ::encode((uint8_t)s, bl);
2226 }
2227 }
2228 ::encode(osd_weight, bl);
2229 ::encode(osd_addrs->client_addr, bl, 0);
2230
2231 // for ::encode(pg_temp, bl);
// count, then (old-style pg id, osd list) per entry
2232 n = pg_temp->size();
2233 ::encode(n, bl);
2234 for (const auto pg : *pg_temp) {
2235 old_pg_t opg = pg.first.get_old_pg();
2236 ::encode(opg, bl);
2237 ::encode(pg.second, bl);
2238 }
2239
2240 // crush
// the crush map is encoded into its own buffer, which is then appended
2241 bufferlist cbl;
2242 crush->encode(cbl, 0 /* legacy (no) features */);
2243 ::encode(cbl, bl);
2244 }
2245
2246 void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2247 {
2248 if ((features & CEPH_FEATURE_PGID64) == 0) {
2249 encode_client_old(bl);
2250 return;
2251 }
2252
2253 __u16 v = 6;
2254 ::encode(v, bl);
2255
2256 // base
2257 ::encode(fsid, bl);
2258 ::encode(epoch, bl);
2259 ::encode(created, bl);
2260 ::encode(modified, bl);
2261
2262 ::encode(pools, bl, features);
2263 ::encode(pool_name, bl);
2264 ::encode(pool_max, bl);
2265
2266 ::encode(flags, bl);
2267
2268 ::encode(max_osd, bl);
2269 {
2270 uint32_t n = osd_state.size();
2271 ::encode(n, bl);
2272 for (auto s : osd_state) {
2273 ::encode((uint8_t)s, bl);
2274 }
2275 }
2276 ::encode(osd_weight, bl);
2277 ::encode(osd_addrs->client_addr, bl, features);
2278
2279 ::encode(*pg_temp, bl);
2280
2281 // crush
2282 bufferlist cbl;
2283 crush->encode(cbl, 0 /* legacy (no) features */);
2284 ::encode(cbl, bl);
2285
2286 // extended
2287 __u16 ev = 10;
2288 ::encode(ev, bl);
2289 ::encode(osd_addrs->hb_back_addr, bl, features);
2290 ::encode(osd_info, bl);
2291 ::encode(blacklist, bl, features);
2292 ::encode(osd_addrs->cluster_addr, bl, features);
2293 ::encode(cluster_snapshot_epoch, bl);
2294 ::encode(cluster_snapshot, bl);
2295 ::encode(*osd_uuid, bl);
2296 ::encode(osd_xinfo, bl);
2297 ::encode(osd_addrs->hb_front_addr, bl, features);
2298 }
2299
// Serialize this map into bl in the newest layout that `features` allows.
//
//  - Without CEPH_FEATURE_OSDMAP_ENC the work is handed to encode_classic().
//  - Otherwise the caller must also pass CEPH_FEATURE_RESERVED (asserted
//    below); that bit is stripped before any feature-dependent encoding.
//
// Layout written here: a wrapper (v8, compat 7) holding a client-usable
// section, an osd-only section and a trailing 32-bit crc32c. The crc covers
// everything this call appended to bl except the 4 crc bytes themselves.
// On return the `crc` member holds that value and crc_defined is true.
2300 void OSDMap::encode(bufferlist& bl, uint64_t features) const
2301 {
2302 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2303 encode_classic(bl, features);
2304 return;
2305 }
2306
2307 // only a select set of callers should *ever* be encoding new
2308 // OSDMaps. others should be passing around the canonical encoded
2309 // buffers from on high. select out those callers by passing in an
2310 // "impossible" feature bit.
2311 assert(features & CEPH_FEATURE_RESERVED);
2312 features &= ~CEPH_FEATURE_RESERVED;
2313
// start_offset: where this encoding begins inside bl (bl may already hold data)
// tail_offset:  length of bl right after the crc placeholder was appended
// crc_it:       iterator positioned on the first byte of the crc placeholder
2314 size_t start_offset = bl.length();
2315 size_t tail_offset;
2316 buffer::list::iterator crc_it;
2317
2318 // meta-encoding: how we include client-used and osd-specific data
2319 ENCODE_START(8, 7, bl);
2320
2321 {
// client section version: 6, or 3 when the target lacks SERVER_LUMINOUS
2322 uint8_t v = 6;
2323 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2324 v = 3;
2325 }
2326 ENCODE_START(v, 1, bl); // client-usable data
2327 // base
2328 ::encode(fsid, bl);
2329 ::encode(epoch, bl);
2330 ::encode(created, bl);
2331 ::encode(modified, bl);
2332
2333 ::encode(pools, bl, features);
2334 ::encode(pool_name, bl);
2335 ::encode(pool_max, bl);
2336
// pre-v4 layout has no require_osd_release field (it is only written in
// the osd-only section when target_v >= 5), so fold the release back into
// the legacy REQUIRE_* flag bits on a copy of flags
2337 if (v < 4) {
2338 decltype(flags) f = flags;
2339 if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
2340 f |= CEPH_OSDMAP_REQUIRE_LUMINOUS;
2341 else if (require_osd_release == CEPH_RELEASE_KRAKEN)
2342 f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
2343 else if (require_osd_release == CEPH_RELEASE_JEWEL)
2344 f |= CEPH_OSDMAP_REQUIRE_JEWEL;
2345 ::encode(f, bl);
2346 } else {
2347 ::encode(flags, bl);
2348 }
2349
2350 ::encode(max_osd, bl);
// v >= 5 encodes osd_state with its native element type; older layouts get
// a count followed by each state cast down to a single byte
2351 if (v >= 5) {
2352 ::encode(osd_state, bl);
2353 } else {
2354 uint32_t n = osd_state.size();
2355 ::encode(n, bl);
2356 for (auto s : osd_state) {
2357 ::encode((uint8_t)s, bl);
2358 }
2359 }
2360 ::encode(osd_weight, bl);
2361 ::encode(osd_addrs->client_addr, bl, features);
2362
2363 ::encode(*pg_temp, bl);
2364 ::encode(*primary_temp, bl);
// an unset osd_primary_affinity is written as an empty vector
// (note: this local `v` shadows the section version only inside the else)
2365 if (osd_primary_affinity) {
2366 ::encode(*osd_primary_affinity, bl);
2367 } else {
2368 vector<__u32> v;
2369 ::encode(v, bl);
2370 }
2371
2372 // crush
2373 bufferlist cbl;
2374 crush->encode(cbl, features);
2375 ::encode(cbl, bl);
2376 ::encode(erasure_code_profiles, bl);
2377
// upmap entries only exist in the v4+ layout; for older targets nothing is
// written, so the maps are required to be empty
2378 if (v >= 4) {
2379 ::encode(pg_upmap, bl);
2380 ::encode(pg_upmap_items, bl);
2381 } else {
2382 assert(pg_upmap.empty());
2383 assert(pg_upmap_items.empty());
2384 }
2385 if (v >= 6) {
2386 ::encode(crush_version, bl);
2387 }
2388 ENCODE_FINISH(bl); // client-usable data
2389 }
2390
2391 {
// osd-only section version: 5, or 1 when the target lacks SERVER_LUMINOUS
2392 uint8_t target_v = 5;
2393 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2394 target_v = 1;
2395 }
2396 ENCODE_START(target_v, 1, bl); // extended, osd-only data
2397 ::encode(osd_addrs->hb_back_addr, bl, features);
2398 ::encode(osd_info, bl);
2399 {
2400 // put this in a sorted, ordered map<> so that we encode in a
2401 // deterministic order.
2402 map<entity_addr_t,utime_t> blacklist_map;
2403 for (const auto &addr : blacklist)
2404 blacklist_map.insert(make_pair(addr.first, addr.second));
2405 ::encode(blacklist_map, bl, features);
2406 }
2407 ::encode(osd_addrs->cluster_addr, bl, features);
2408 ::encode(cluster_snapshot_epoch, bl);
2409 ::encode(cluster_snapshot, bl);
2410 ::encode(*osd_uuid, bl);
2411 ::encode(osd_xinfo, bl);
2412 ::encode(osd_addrs->hb_front_addr, bl, features);
2413 if (target_v >= 2) {
2414 ::encode(nearfull_ratio, bl);
2415 ::encode(full_ratio, bl);
2416 ::encode(backfillfull_ratio, bl);
2417 }
2418 // 4 was string-based new_require_min_compat_client
2419 if (target_v >= 5) {
2420 ::encode(require_min_compat_client, bl);
2421 ::encode(require_osd_release, bl);
2422 }
2423 ENCODE_FINISH(bl); // osd-only data
2424 }
2425
// reserve 4 bytes for the crc, remember where they are (4 bytes back from
// the current end) and where the data following them starts
2426 ::encode((uint32_t)0, bl); // dummy crc
2427 crc_it = bl.end();
2428 crc_it.advance(-4);
2429 tail_offset = bl.length();
2430
// NOTE(review): the crc is only computed after this ENCODE_FINISH,
// presumably because the macro (defined elsewhere) patches the wrapper
// header that lies inside the crc'd range -- confirm against encoding.h
2431 ENCODE_FINISH(bl); // meta-encoding wrapper
2432
2433 // fill in crc
// front = [start_offset, crc placeholder); crc32c seeded with -1
2434 bufferlist front;
2435 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
2436 crc = front.crc32c(-1);
// tail = anything appended after the placeholder; chained onto the front crc
2437 if (tail_offset < bl.length()) {
2438 bufferlist tail;
2439 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2440 crc = tail.crc32c(crc);
2441 }
// overwrite the placeholder in place with the crc stored as ceph_le32
2442 ceph_le32 crc_le;
2443 crc_le = crc;
2444 crc_it.copy_in(4, (char*)&crc_le);
2445 crc_defined = true;
2446 }
2447
2448 void OSDMap::decode(bufferlist& bl)
2449 {
2450 auto p = bl.begin();
2451 decode(p);
2452 }
2453
// Decode a map written in one of the "classic" layouts, i.e. those that
// start with a bare 16-bit version `v` and carry no compat/length header.
// The body version `v` and, for v >= 5, the extended-section version `ev`
// gate which fields are present; fields absent from the stream are sized
// or reset to defaults instead. Ends by clearing osd_primary_affinity
// (never read from the stream here) and calling post_decode().
2454 void OSDMap::decode_classic(bufferlist::iterator& p)
2455 {
// n, t: scratch count / key used by the hand-rolled 32-bit map decoders below
2456 __u32 n, t;
2457 __u16 v;
2458 ::decode(v, p);
2459
2460 // base
2461 ::decode(fsid, p);
2462 ::decode(epoch, p);
2463 ::decode(created, p);
2464 ::decode(modified, p);
2465
// v < 6: pools (and, for v == 5, pool names) are keyed by 32-bit ids and
// decoded entry by entry; where pool_max sits in the stream depends on v
2466 if (v < 6) {
2467 if (v < 4) {
2468 int32_t max_pools = 0;
2469 ::decode(max_pools, p);
2470 pool_max = max_pools;
2471 }
2472 pools.clear();
2473 ::decode(n, p);
2474 while (n--) {
2475 ::decode(t, p);
2476 ::decode(pools[t], p);
2477 }
2478 if (v == 4) {
2479 ::decode(n, p);
2480 pool_max = n;
2481 } else if (v == 5) {
2482 pool_name.clear();
2483 ::decode(n, p);
2484 while (n--) {
2485 ::decode(t, p);
2486 ::decode(pool_name[t], p);
2487 }
2488 ::decode(n, p);
2489 pool_max = n;
2490 }
2491 } else {
2492 ::decode(pools, p);
2493 ::decode(pool_name, p);
2494 ::decode(pool_max, p);
2495 }
2496 // kludge around some old bug that zeroed out pool_max (#2307)
2497 if (pools.size() && pool_max < pools.rbegin()->first) {
2498 pool_max = pools.rbegin()->first;
2499 }
2500
2501 ::decode(flags, p);
2502
2503 ::decode(max_osd, p);
2504 {
// states are stored one byte per osd; copy them element-wise into osd_state
2505 vector<uint8_t> os;
2506 ::decode(os, p);
2507 osd_state.resize(os.size());
2508 for (unsigned i = 0; i < os.size(); ++i) {
2509 osd_state[i] = os[i];
2510 }
2511 }
2512 ::decode(osd_weight, p);
2513 ::decode(osd_addrs->client_addr, p);
// v <= 5 keys pg_temp by old_pg_t (read with decode_raw) and needs
// converting to pg_t; the local `v` below shadows the version inside the loop
2514 if (v <= 5) {
2515 pg_temp->clear();
2516 ::decode(n, p);
2517 while (n--) {
2518 old_pg_t opg;
2519 ::decode_raw(opg, p);
2520 mempool::osdmap::vector<int32_t> v;
2521 ::decode(v, p);
2522 pg_temp->set(pg_t(opg), v);
2523 }
2524 } else {
2525 ::decode(*pg_temp, p);
2526 }
2527
2528 // crush
2529 bufferlist cbl;
2530 ::decode(cbl, p);
2531 auto cblp = cbl.begin();
2532 crush->decode(cblp);
2533
2534 // extended
// ev stays 0 for v < 5, which disables every `ev >= N` branch below
2535 __u16 ev = 0;
2536 if (v >= 5)
2537 ::decode(ev, p);
2538 ::decode(osd_addrs->hb_back_addr, p);
2539 ::decode(osd_info, p);
// before v5 the pool names live here, in the extended section
2540 if (v < 5)
2541 ::decode(pool_name, p);
2542
2543 ::decode(blacklist, p);
// for each optional field: decode it when ev is new enough, otherwise size
// the container to match its counterpart (or max_osd) with default entries
2544 if (ev >= 6)
2545 ::decode(osd_addrs->cluster_addr, p);
2546 else
2547 osd_addrs->cluster_addr.resize(osd_addrs->client_addr.size());
2548
2549 if (ev >= 7) {
2550 ::decode(cluster_snapshot_epoch, p);
2551 ::decode(cluster_snapshot, p);
2552 }
2553
2554 if (ev >= 8) {
2555 ::decode(*osd_uuid, p);
2556 } else {
2557 osd_uuid->resize(max_osd);
2558 }
2559 if (ev >= 9)
2560 ::decode(osd_xinfo, p);
2561 else
2562 osd_xinfo.resize(max_osd);
2563
2564 if (ev >= 10)
2565 ::decode(osd_addrs->hb_front_addr, p);
2566 else
2567 osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
2568
2569 osd_primary_affinity.reset();
2570
2571 post_decode();
2572 }
2573
// Decode a map from the current position of bl.
//
// Streams whose leading version is < 7 are re-read from the start by
// decode_classic(). Otherwise the wrapper is expected to contain a
// client-usable section, an osd-only section and, for wrapper version >= 8,
// a crc32c that is verified against the bytes consumed.
//
// Throws buffer::malformed_input on crc mismatch. Ends with post_decode().
2574 void OSDMap::decode(bufferlist::iterator& bl)
2575 {
2576 /**
2577 * Older encodings of the OSDMap had a single struct_v which
2578 * covered the whole encoding, and was prior to our modern
2579 * stuff which includes a compatv and a size. So if we see
2580 * a struct_v < 7, we must rewind to the beginning and use our
2581 * classic decoder.
2582 */
// start_offset: iterator offset at entry; tail_offset stays 0 unless a crc
// field is read, and doubles as the "verify crc" switch further down
2583 size_t start_offset = bl.get_off();
2584 size_t tail_offset = 0;
2585 bufferlist crc_front, crc_tail;
2586
2587 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
2588 if (struct_v < 7) {
// step back over the version just consumed so decode_classic sees it again
2589 int struct_v_size = sizeof(struct_v);
2590 bl.advance(-struct_v_size);
2591 decode_classic(bl);
2592 return;
2593 }
2594 /**
2595 * Since we made it past that hurdle, we can use our normal paths.
2596 */
2597 {
// NOTE(review): inside this block `struct_v` is presumably the version
// declared by this DECODE_START (macro defined elsewhere), i.e. the client
// section's version rather than the wrapper's -- confirm
2598 DECODE_START(6, bl); // client-usable data
2599 // base
2600 ::decode(fsid, bl);
2601 ::decode(epoch, bl);
2602 ::decode(created, bl);
2603 ::decode(modified, bl);
2604
2605 ::decode(pools, bl);
2606 ::decode(pool_name, bl);
2607 ::decode(pool_max, bl);
2608
2609 ::decode(flags, bl);
2610
2611 ::decode(max_osd, bl);
// before v5 the states were stored one byte per osd
2612 if (struct_v >= 5) {
2613 ::decode(osd_state, bl);
2614 } else {
2615 vector<uint8_t> os;
2616 ::decode(os, bl);
2617 osd_state.resize(os.size());
2618 for (unsigned i = 0; i < os.size(); ++i) {
2619 osd_state[i] = os[i];
2620 }
2621 }
2622 ::decode(osd_weight, bl);
2623 ::decode(osd_addrs->client_addr, bl);
2624
2625 ::decode(*pg_temp, bl);
2626 ::decode(*primary_temp, bl);
// an empty affinity vector on the wire means "not set": drop the object
2627 if (struct_v >= 2) {
2628 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
2629 ::decode(*osd_primary_affinity, bl);
2630 if (osd_primary_affinity->empty())
2631 osd_primary_affinity.reset();
2632 } else {
2633 osd_primary_affinity.reset();
2634 }
2635
2636 // crush
2637 bufferlist cbl;
2638 ::decode(cbl, bl);
2639 auto cblp = cbl.begin();
2640 crush->decode(cblp);
2641 if (struct_v >= 3) {
2642 ::decode(erasure_code_profiles, bl);
2643 } else {
2644 erasure_code_profiles.clear();
2645 }
2646 if (struct_v >= 4) {
2647 ::decode(pg_upmap, bl);
2648 ::decode(pg_upmap_items, bl);
2649 } else {
2650 pg_upmap.clear();
2651 pg_upmap_items.clear();
2652 }
// note: crush_version is left untouched when the stream predates v6
2653 if (struct_v >= 6) {
2654 ::decode(crush_version, bl);
2655 }
2656 DECODE_FINISH(bl); // client-usable data
2657 }
2658
2659 {
2660 DECODE_START(5, bl); // extended, osd-only data
2661 ::decode(osd_addrs->hb_back_addr, bl);
2662 ::decode(osd_info, bl);
2663 ::decode(blacklist, bl);
2664 ::decode(osd_addrs->cluster_addr, bl);
2665 ::decode(cluster_snapshot_epoch, bl);
2666 ::decode(cluster_snapshot, bl);
2667 ::decode(*osd_uuid, bl);
2668 ::decode(osd_xinfo, bl);
2669 ::decode(osd_addrs->hb_front_addr, bl);
// ratios default to 0 when the stream is too old to carry them
2670 if (struct_v >= 2) {
2671 ::decode(nearfull_ratio, bl);
2672 ::decode(full_ratio, bl);
2673 } else {
2674 nearfull_ratio = 0;
2675 full_ratio = 0;
2676 }
2677 if (struct_v >= 3) {
2678 ::decode(backfillfull_ratio, bl);
2679 } else {
2680 backfillfull_ratio = 0;
2681 }
// v4 only: the min compat client was stored as a release *name*
2682 if (struct_v == 4) {
2683 string r;
2684 ::decode(r, bl);
2685 if (r.length())
2686 require_min_compat_client = ceph_release_from_name(r.c_str());
2687 }
// v5+: release requirements are explicit fields, and the legacy REQUIRE_*
// flag bits are cleared once the release is luminous or later.
// older: derive require_osd_release from the legacy flag bits instead.
2688 if (struct_v >= 5) {
2689 ::decode(require_min_compat_client, bl);
2690 ::decode(require_osd_release, bl);
2691 if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
2692 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2693 }
2694 } else {
2695 if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
2696 // only for compat with post-kraken pre-luminous test clusters
2697 require_osd_release = CEPH_RELEASE_LUMINOUS;
2698 flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
2699 } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
2700 require_osd_release = CEPH_RELEASE_KRAKEN;
2701 } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
2702 require_osd_release = CEPH_RELEASE_JEWEL;
2703 } else {
2704 require_osd_release = 0;
2705 }
2706 }
2707 DECODE_FINISH(bl); // osd-only data
2708 }
2709
// back at wrapper scope. v8+ carries a crc: capture everything consumed so
// far as crc_front, read the stored crc, and note where the tail begins
2710 if (struct_v >= 8) {
2711 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
2712 ::decode(crc, bl);
2713 tail_offset = bl.get_off();
2714 crc_defined = true;
2715 } else {
2716 crc_defined = false;
2717 crc = 0;
2718 }
2719
2720 DECODE_FINISH(bl); // wrapper
2721
2722 if (tail_offset) {
2723 // verify crc
// same scheme as the encoder: crc32c seeded with -1 over the front, then
// chained over any bytes between the crc field and the current position
2724 uint32_t actual = crc_front.crc32c(-1);
2725 if (tail_offset < bl.get_off()) {
2726 bufferlist tail;
2727 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
2728 actual = tail.crc32c(actual);
2729 }
2730 if (crc != actual) {
2731 ostringstream ss;
2732 ss << "bad crc, actual " << actual << " != expected " << crc;
2733 string s = ss.str();
2734 throw buffer::malformed_input(s.c_str());
2735 }
2736 }
2737
2738 post_decode();
2739 }
2740
2741 void OSDMap::post_decode()
2742 {
2743 // index pool names
2744 name_pool.clear();
2745 for (const auto &pname : pool_name) {
2746 name_pool[pname.second] = pname.first;
2747 }
2748
2749 calc_num_osds();
2750 _calc_up_osd_features();
2751 }
2752
2753 void OSDMap::dump_erasure_code_profiles(
2754 const mempool::osdmap::map<string,map<string,string>>& profiles,
2755 Formatter *f)
2756 {
2757 f->open_object_section("erasure_code_profiles");
2758 for (const auto &profile : profiles) {
2759 f->open_object_section(profile.first.c_str());
2760 for (const auto &profm : profile.second) {
2761 f->dump_string(profm.first.c_str(), profm.second.c_str());
2762 }
2763 f->close_section();
2764 }
2765 f->close_section();
2766 }
2767
2768 void OSDMap::dump(Formatter *f) const
2769 {
2770 f->dump_int("epoch", get_epoch());
2771 f->dump_stream("fsid") << get_fsid();
2772 f->dump_stream("created") << get_created();
2773 f->dump_stream("modified") << get_modified();
2774 f->dump_string("flags", get_flag_string());
2775 f->dump_unsigned("crush_version", get_crush_version());
2776 f->dump_float("full_ratio", full_ratio);
2777 f->dump_float("backfillfull_ratio", backfillfull_ratio);
2778 f->dump_float("nearfull_ratio", nearfull_ratio);
2779 f->dump_string("cluster_snapshot", get_cluster_snapshot());
2780 f->dump_int("pool_max", get_pool_max());
2781 f->dump_int("max_osd", get_max_osd());
2782 f->dump_string("require_min_compat_client",
2783 ceph_release_name(require_min_compat_client));
2784 f->dump_string("min_compat_client",
2785 ceph_release_name(get_min_compat_client()));
2786 f->dump_string("require_osd_release",
2787 ceph_release_name(require_osd_release));
2788
2789 f->open_array_section("pools");
2790 for (const auto &pool : pools) {
2791 std::string name("<unknown>");
2792 const auto &pni = pool_name.find(pool.first);
2793 if (pni != pool_name.end())
2794 name = pni->second;
2795 f->open_object_section("pool");
2796 f->dump_int("pool", pool.first);
2797 f->dump_string("pool_name", name);
2798 pool.second.dump(f);
2799 f->close_section();
2800 }
2801 f->close_section();
2802
2803 f->open_array_section("osds");
2804 for (int i=0; i<get_max_osd(); i++)
2805 if (exists(i)) {
2806 f->open_object_section("osd_info");
2807 f->dump_int("osd", i);
2808 f->dump_stream("uuid") << get_uuid(i);
2809 f->dump_int("up", is_up(i));
2810 f->dump_int("in", is_in(i));
2811 f->dump_float("weight", get_weightf(i));
2812 f->dump_float("primary_affinity", get_primary_affinityf(i));
2813 get_info(i).dump(f);
2814 f->dump_stream("public_addr") << get_addr(i);
2815 f->dump_stream("cluster_addr") << get_cluster_addr(i);
2816 f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
2817 f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
2818
2819 set<string> st;
2820 get_state(i, st);
2821 f->open_array_section("state");
2822 for (const auto &state : st)
2823 f->dump_string("state", state);
2824 f->close_section();
2825
2826 f->close_section();
2827 }
2828 f->close_section();
2829
2830 f->open_array_section("osd_xinfo");
2831 for (int i=0; i<get_max_osd(); i++) {
2832 if (exists(i)) {
2833 f->open_object_section("xinfo");
2834 f->dump_int("osd", i);
2835 osd_xinfo[i].dump(f);
2836 f->close_section();
2837 }
2838 }
2839 f->close_section();
2840
2841 f->open_array_section("pg_upmap");
2842 for (auto& p : pg_upmap) {
2843 f->open_object_section("mapping");
2844 f->dump_stream("pgid") << p.first;
2845 f->open_array_section("osds");
2846 for (auto q : p.second) {
2847 f->dump_int("osd", q);
2848 }
2849 f->close_section();
2850 f->close_section();
2851 }
2852 f->close_section();
2853 f->open_array_section("pg_upmap_items");
2854 for (auto& p : pg_upmap_items) {
2855 f->open_object_section("mapping");
2856 f->dump_stream("pgid") << p.first;
2857 f->open_array_section("mappings");
2858 for (auto& q : p.second) {
2859 f->open_object_section("mapping");
2860 f->dump_int("from", q.first);
2861 f->dump_int("to", q.second);
2862 f->close_section();
2863 }
2864 f->close_section();
2865 f->close_section();
2866 }
2867 f->close_section();
2868 f->open_array_section("pg_temp");
2869 pg_temp->dump(f);
2870 f->close_section();
2871
2872 f->open_array_section("primary_temp");
2873 for (const auto &pg : *primary_temp) {
2874 f->dump_stream("pgid") << pg.first;
2875 f->dump_int("osd", pg.second);
2876 }
2877 f->close_section(); // primary_temp
2878
2879 f->open_object_section("blacklist");
2880 for (const auto &addr : blacklist) {
2881 stringstream ss;
2882 ss << addr.first;
2883 f->dump_stream(ss.str().c_str()) << addr.second;
2884 }
2885 f->close_section();
2886
2887 dump_erasure_code_profiles(erasure_code_profiles, f);
2888 }
2889
2890 void OSDMap::generate_test_instances(list<OSDMap*>& o)
2891 {
2892 o.push_back(new OSDMap);
2893
2894 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
2895 o.push_back(new OSDMap);
2896 uuid_d fsid;
2897 o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
2898 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
2899 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
2900 cct->put();
2901 }
2902
2903 string OSDMap::get_flag_string(unsigned f)
2904 {
2905 string s;
2906 if ( f& CEPH_OSDMAP_NEARFULL)
2907 s += ",nearfull";
2908 if (f & CEPH_OSDMAP_FULL)
2909 s += ",full";
2910 if (f & CEPH_OSDMAP_PAUSERD)
2911 s += ",pauserd";
2912 if (f & CEPH_OSDMAP_PAUSEWR)
2913 s += ",pausewr";
2914 if (f & CEPH_OSDMAP_PAUSEREC)
2915 s += ",pauserec";
2916 if (f & CEPH_OSDMAP_NOUP)
2917 s += ",noup";
2918 if (f & CEPH_OSDMAP_NODOWN)
2919 s += ",nodown";
2920 if (f & CEPH_OSDMAP_NOOUT)
2921 s += ",noout";
2922 if (f & CEPH_OSDMAP_NOIN)
2923 s += ",noin";
2924 if (f & CEPH_OSDMAP_NOBACKFILL)
2925 s += ",nobackfill";
2926 if (f & CEPH_OSDMAP_NOREBALANCE)
2927 s += ",norebalance";
2928 if (f & CEPH_OSDMAP_NORECOVER)
2929 s += ",norecover";
2930 if (f & CEPH_OSDMAP_NOSCRUB)
2931 s += ",noscrub";
2932 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
2933 s += ",nodeep-scrub";
2934 if (f & CEPH_OSDMAP_NOTIERAGENT)
2935 s += ",notieragent";
2936 if (f & CEPH_OSDMAP_SORTBITWISE)
2937 s += ",sortbitwise";
2938 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
2939 s += ",require_jewel_osds";
2940 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
2941 s += ",require_kraken_osds";
2942 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
2943 s += ",require_luminous_osds";
2944 if (s.length())
2945 s.erase(0, 1);
2946 return s;
2947 }
2948
2949 string OSDMap::get_flag_string() const
2950 {
2951 return get_flag_string(flags);
2952 }
2953
/**
 * Small value type: an item id, its depth and a weight.
 * Default construction zeroes all three fields.
 */
struct qi {
  int item;
  int depth;
  float weight;

  qi(int i, int d, float w) : item(i), depth(d), weight(w) {}
  qi() : qi(0, 0, 0) {}
};
2961
2962 void OSDMap::print_pools(ostream& out) const
2963 {
2964 for (const auto &pool : pools) {
2965 std::string name("<unknown>");
2966 const auto &pni = pool_name.find(pool.first);
2967 if (pni != pool_name.end())
2968 name = pni->second;
2969 out << "pool " << pool.first
2970 << " '" << name
2971 << "' " << pool.second << "\n";
2972
2973 for (const auto &snap : pool.second.snaps)
2974 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
2975
2976 if (!pool.second.removed_snaps.empty())
2977 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
2978 }
2979 out << std::endl;
2980 }
2981
2982 void OSDMap::print(ostream& out) const
2983 {
2984 out << "epoch " << get_epoch() << "\n"
2985 << "fsid " << get_fsid() << "\n"
2986 << "created " << get_created() << "\n"
2987 << "modified " << get_modified() << "\n";
2988
2989 out << "flags " << get_flag_string() << "\n";
2990 out << "crush_version " << get_crush_version() << "\n";
2991 out << "full_ratio " << full_ratio << "\n";
2992 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
2993 out << "nearfull_ratio " << nearfull_ratio << "\n";
2994 if (require_min_compat_client > 0) {
2995 out << "require_min_compat_client "
2996 << ceph_release_name(require_min_compat_client) << "\n";
2997 }
2998 out << "min_compat_client " << ceph_release_name(get_min_compat_client())
2999 << "\n";
3000 if (get_cluster_snapshot().length())
3001 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
3002 out << "\n";
3003
3004 print_pools(out);
3005
3006 out << "max_osd " << get_max_osd() << "\n";
3007 for (int i=0; i<get_max_osd(); i++) {
3008 if (exists(i)) {
3009 out << "osd." << i;
3010 out << (is_up(i) ? " up ":" down");
3011 out << (is_in(i) ? " in ":" out");
3012 out << " weight " << get_weightf(i);
3013 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
3014 out << " primary_affinity " << get_primary_affinityf(i);
3015 const osd_info_t& info(get_info(i));
3016 out << " " << info;
3017 out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
3018 << " " << get_hb_front_addr(i);
3019 set<string> st;
3020 get_state(i, st);
3021 out << " " << st;
3022 if (!get_uuid(i).is_zero())
3023 out << " " << get_uuid(i);
3024 out << "\n";
3025 }
3026 }
3027 out << std::endl;
3028
3029 for (auto& p : pg_upmap) {
3030 out << "pg_upmap " << p.first << " " << p.second << "\n";
3031 }
3032 for (auto& p : pg_upmap_items) {
3033 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
3034 }
3035
3036 for (const auto pg : *pg_temp)
3037 out << "pg_temp " << pg.first << " " << pg.second << "\n";
3038
3039 for (const auto pg : *primary_temp)
3040 out << "primary_temp " << pg.first << " " << pg.second << "\n";
3041
3042 for (const auto &addr : blacklist)
3043 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
3044
3045 // ignore pg_swap_primary
3046 }
3047
3048 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
3049 public:
3050 typedef CrushTreeDumper::Dumper<TextTable> Parent;
3051
3052 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3053 unsigned f)
3054 : Parent(crush), osdmap(osdmap_), filter(f) { }
3055
3056 bool should_dump_leaf(int i) const override {
3057 if (((filter & OSDMap::DUMP_UP) && !osdmap->is_up(i)) ||
3058 ((filter & OSDMap::DUMP_DOWN) && !osdmap->is_down(i)) ||
3059 ((filter & OSDMap::DUMP_IN) && !osdmap->is_in(i)) ||
3060 ((filter & OSDMap::DUMP_OUT) && !osdmap->is_out(i))) {
3061 return false;
3062 }
3063 return true;
3064 }
3065
3066 bool should_dump_empty_bucket() const override {
3067 return !filter;
3068 }
3069
3070 void dump(TextTable *tbl) {
3071 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
3072 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
3073 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
3074 tbl->define_column("UP/DOWN", TextTable::LEFT, TextTable::RIGHT);
3075 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
3076 tbl->define_column("PRIMARY-AFFINITY", TextTable::LEFT, TextTable::RIGHT);
3077
3078 Parent::dump(tbl);
3079
3080 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3081 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
3082 dump_item(CrushTreeDumper::Item(i, 0, 0), tbl);
3083 }
3084 }
3085 }
3086
3087 protected:
3088 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
3089
3090 *tbl << qi.id
3091 << weightf_t(qi.weight);
3092
3093 ostringstream name;
3094 for (int k = 0; k < qi.depth; k++)
3095 name << " ";
3096 if (qi.is_bucket()) {
3097 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
3098 << crush->get_item_name(qi.id);
3099 } else {
3100 name << "osd." << qi.id;
3101 }
3102 *tbl << name.str();
3103
3104 if (!qi.is_bucket()) {
3105 if (!osdmap->exists(qi.id)) {
3106 *tbl << "DNE"
3107 << 0;
3108 } else {
3109 *tbl << (osdmap->is_up(qi.id) ? "up" : "down")
3110 << weightf_t(osdmap->get_weightf(qi.id))
3111 << weightf_t(osdmap->get_primary_affinityf(qi.id));
3112 }
3113 }
3114 *tbl << TextTable::endrow;
3115 }
3116
3117 private:
3118 const OSDMap *osdmap;
3119 const unsigned filter;
3120 };
3121
3122 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
3123 public:
3124 typedef CrushTreeDumper::FormattingDumper Parent;
3125
3126 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3127 unsigned f)
3128 : Parent(crush), osdmap(osdmap_), filter(f) { }
3129
3130 bool should_dump_leaf(int i) const override {
3131 if (((filter & OSDMap::DUMP_UP) && !osdmap->is_up(i)) ||
3132 ((filter & OSDMap::DUMP_DOWN) && !osdmap->is_down(i)) ||
3133 ((filter & OSDMap::DUMP_IN) && !osdmap->is_in(i)) ||
3134 ((filter & OSDMap::DUMP_OUT) && !osdmap->is_out(i))) {
3135 return false;
3136 }
3137 return true;
3138 }
3139
3140 bool should_dump_empty_bucket() const override {
3141 return !filter;
3142 }
3143
3144 void dump(Formatter *f) {
3145 f->open_array_section("nodes");
3146 Parent::dump(f);
3147 f->close_section();
3148 f->open_array_section("stray");
3149 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3150 if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
3151 dump_item(CrushTreeDumper::Item(i, 0, 0), f);
3152 }
3153 f->close_section();
3154 }
3155
3156 protected:
3157 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
3158 Parent::dump_item_fields(qi, f);
3159 if (!qi.is_bucket())
3160 {
3161 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
3162 f->dump_string("status", osdmap->is_up(qi.id) ? "up" : "down");
3163 f->dump_float("reweight", osdmap->get_weightf(qi.id));
3164 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
3165 }
3166 }
3167
3168 private:
3169 const OSDMap *osdmap;
3170 const unsigned filter;
3171 };
3172
3173 void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter) const
3174 {
3175 if (f) {
3176 OSDTreeFormattingDumper(crush.get(), this, filter).dump(f);
3177 } else {
3178 assert(out);
3179 TextTable tbl;
3180 OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl);
3181 *out << tbl;
3182 }
3183 }
3184
3185 void OSDMap::print_summary(Formatter *f, ostream& out) const
3186 {
3187 if (f) {
3188 f->open_object_section("osdmap");
3189 f->dump_int("epoch", get_epoch());
3190 f->dump_int("num_osds", get_num_osds());
3191 f->dump_int("num_up_osds", get_num_up_osds());
3192 f->dump_int("num_in_osds", get_num_in_osds());
3193 f->dump_bool("full", test_flag(CEPH_OSDMAP_FULL) ? true : false);
3194 f->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ? true : false);
3195 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3196 f->close_section();
3197 } else {
3198 out << get_num_osds() << " osds: "
3199 << get_num_up_osds() << " up, "
3200 << get_num_in_osds() << " in";
3201 if (get_num_pg_temp())
3202 out << "; " << get_num_pg_temp() << " remapped pgs";
3203 out << "\n";
3204 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
3205 if (important_flags)
3206 out << " flags " << get_flag_string(important_flags) << "\n";
3207 }
3208 }
3209
3210 void OSDMap::print_oneline_summary(ostream& out) const
3211 {
3212 out << "e" << get_epoch() << ": "
3213 << get_num_osds() << " total, "
3214 << get_num_up_osds() << " up, "
3215 << get_num_in_osds() << " in";
3216 if (test_flag(CEPH_OSDMAP_FULL))
3217 out << " full";
3218 else if (test_flag(CEPH_OSDMAP_NEARFULL))
3219 out << " nearfull";
3220 }
3221
3222 bool OSDMap::crush_ruleset_in_use(int ruleset) const
3223 {
3224 for (const auto &pool : pools) {
3225 if (pool.second.crush_rule == ruleset)
3226 return true;
3227 }
3228 return false;
3229 }
3230
3231 int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
3232 int nosd, int pg_bits, int pgp_bits)
3233 {
3234 ldout(cct, 10) << "build_simple on " << num_osd
3235 << " osds with " << pg_bits << " pg bits per osd, "
3236 << dendl;
3237 epoch = e;
3238 set_fsid(fsid);
3239 created = modified = ceph_clock_now();
3240
3241 if (nosd >= 0) {
3242 set_max_osd(nosd);
3243 } else {
3244 // count osds
3245 int maxosd = 0;
3246 const md_config_t *conf = cct->_conf;
3247 vector<string> sections;
3248 conf->get_all_sections(sections);
3249
3250 for (auto &section : sections) {
3251 if (section.find("osd.") != 0)
3252 continue;
3253
3254 const char *begin = section.c_str() + 4;
3255 char *end = (char*)begin;
3256 int o = strtol(begin, &end, 10);
3257 if (*end != '\0')
3258 continue;
3259
3260 if (o > cct->_conf->mon_max_osd) {
3261 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
3262 return -ERANGE;
3263 }
3264
3265 if (o > maxosd)
3266 maxosd = o;
3267 }
3268
3269 set_max_osd(maxosd + 1);
3270 }
3271
3272 // pgp_num <= pg_num
3273 if (pgp_bits > pg_bits)
3274 pgp_bits = pg_bits;
3275
3276 vector<string> pool_names;
3277 pool_names.push_back("rbd");
3278
3279 stringstream ss;
3280 int r;
3281 if (nosd >= 0)
3282 r = build_simple_crush_map(cct, *crush, nosd, &ss);
3283 else
3284 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
3285 assert(r == 0);
3286
3287 int poolbase = get_max_osd() ? get_max_osd() : 1;
3288
3289 int const default_replicated_rule =
3290 crush->get_osd_pool_default_crush_replicated_ruleset(cct);
3291 assert(default_replicated_rule >= 0);
3292
3293 for (auto &plname : pool_names) {
3294 int64_t pool = ++pool_max;
3295 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
3296 pools[pool].flags = cct->_conf->osd_pool_default_flags;
3297 if (cct->_conf->osd_pool_default_flag_hashpspool)
3298 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
3299 if (cct->_conf->osd_pool_default_flag_nodelete)
3300 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
3301 if (cct->_conf->osd_pool_default_flag_nopgchange)
3302 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
3303 if (cct->_conf->osd_pool_default_flag_nosizechange)
3304 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
3305 pools[pool].size = cct->_conf->osd_pool_default_size;
3306 pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
3307 pools[pool].crush_rule = default_replicated_rule;
3308 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
3309 pools[pool].set_pg_num(poolbase << pg_bits);
3310 pools[pool].set_pgp_num(poolbase << pgp_bits);
3311 pools[pool].last_change = epoch;
3312 pool_name[pool] = plname;
3313 name_pool[plname] = pool;
3314 }
3315
3316 for (int i=0; i<get_max_osd(); i++) {
3317 set_state(i, 0);
3318 set_weight(i, CEPH_OSD_OUT);
3319 }
3320
3321 map<string,string> profile_map;
3322 r = get_erasure_code_profile_default(cct, profile_map, &ss);
3323 if (r < 0) {
3324 lderr(cct) << ss.str() << dendl;
3325 return r;
3326 }
3327 set_erasure_code_profile("default", profile_map);
3328 return 0;
3329 }
3330
3331 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
3332 map<string,string> &profile_map,
3333 ostream *ss)
3334 {
3335 int r = get_json_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
3336 *ss,
3337 &profile_map);
3338 return r;
3339 }
3340
3341 int OSDMap::_build_crush_types(CrushWrapper& crush)
3342 {
3343 crush.set_type_name(0, "osd");
3344 crush.set_type_name(1, "host");
3345 crush.set_type_name(2, "chassis");
3346 crush.set_type_name(3, "rack");
3347 crush.set_type_name(4, "row");
3348 crush.set_type_name(5, "pdu");
3349 crush.set_type_name(6, "pod");
3350 crush.set_type_name(7, "room");
3351 crush.set_type_name(8, "datacenter");
3352 crush.set_type_name(9, "region");
3353 crush.set_type_name(10, "root");
3354 return 10;
3355 }
3356
3357 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
3358 int nosd, ostream *ss)
3359 {
3360 crush.create();
3361
3362 // root
3363 int root_type = _build_crush_types(crush);
3364 int rootid;
3365 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
3366 root_type, 0, NULL, NULL, &rootid);
3367 assert(r == 0);
3368 crush.set_item_name(rootid, "default");
3369
3370 for (int o=0; o<nosd; o++) {
3371 map<string,string> loc;
3372 loc["host"] = "localhost";
3373 loc["rack"] = "localrack";
3374 loc["root"] = "default";
3375 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
3376 char name[32];
3377 snprintf(name, sizeof(name), "osd.%d", o);
3378 crush.insert_item(cct, o, 1.0, name, loc);
3379 }
3380
3381 build_simple_crush_rules(cct, crush, "default", ss);
3382
3383 crush.finalize();
3384
3385 return 0;
3386 }
3387
3388 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
3389 CrushWrapper& crush,
3390 ostream *ss)
3391 {
3392 const md_config_t *conf = cct->_conf;
3393
3394 crush.create();
3395
3396 // root
3397 int root_type = _build_crush_types(crush);
3398 int rootid;
3399 int r = crush.add_bucket(0, 0,
3400 CRUSH_HASH_DEFAULT,
3401 root_type, 0, NULL, NULL, &rootid);
3402 assert(r == 0);
3403 crush.set_item_name(rootid, "default");
3404
3405 // add osds
3406 vector<string> sections;
3407 conf->get_all_sections(sections);
3408
3409 for (auto &section : sections) {
3410 if (section.find("osd.") != 0)
3411 continue;
3412
3413 const char *begin = section.c_str() + 4;
3414 char *end = (char*)begin;
3415 int o = strtol(begin, &end, 10);
3416 if (*end != '\0')
3417 continue;
3418
3419 string host, rack, row, room, dc, pool;
3420 vector<string> sectiontmp;
3421 sectiontmp.push_back("osd");
3422 sectiontmp.push_back(section);
3423 conf->get_val_from_conf_file(sectiontmp, "host", host, false);
3424 conf->get_val_from_conf_file(sectiontmp, "rack", rack, false);
3425 conf->get_val_from_conf_file(sectiontmp, "row", row, false);
3426 conf->get_val_from_conf_file(sectiontmp, "room", room, false);
3427 conf->get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
3428 conf->get_val_from_conf_file(sectiontmp, "root", pool, false);
3429
3430 if (host.length() == 0)
3431 host = "unknownhost";
3432 if (rack.length() == 0)
3433 rack = "unknownrack";
3434
3435 map<string,string> loc;
3436 loc["host"] = host;
3437 loc["rack"] = rack;
3438 if (row.size())
3439 loc["row"] = row;
3440 if (room.size())
3441 loc["room"] = room;
3442 if (dc.size())
3443 loc["datacenter"] = dc;
3444 loc["root"] = "default";
3445
3446 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
3447 crush.insert_item(cct, o, 1.0, section, loc);
3448 }
3449
3450 build_simple_crush_rules(cct, crush, "default", ss);
3451
3452 crush.finalize();
3453
3454 return 0;
3455 }
3456
3457
3458 int OSDMap::build_simple_crush_rules(
3459 CephContext *cct,
3460 CrushWrapper& crush,
3461 const string& root,
3462 ostream *ss)
3463 {
3464 int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
3465 string failure_domain =
3466 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
3467
3468 int r;
3469 r = crush.add_simple_rule_at(
3470 "replicated_rule", root, failure_domain,
3471 "firstn", pg_pool_t::TYPE_REPLICATED,
3472 crush_rule, ss);
3473 if (r < 0)
3474 return r;
3475 // do not add an erasure rule by default or else we will implicitly
3476 // require the crush_v2 feature of clients
3477 return 0;
3478 }
3479
3480 int OSDMap::summarize_mapping_stats(
3481 OSDMap *newmap,
3482 const set<int64_t> *pools,
3483 std::string *out,
3484 Formatter *f) const
3485 {
3486 set<int64_t> ls;
3487 if (pools) {
3488 ls = *pools;
3489 } else {
3490 for (auto &p : get_pools())
3491 ls.insert(p.first);
3492 }
3493
3494 unsigned total_pg = 0;
3495 unsigned moved_pg = 0;
3496 vector<unsigned> base_by_osd(get_max_osd(), 0);
3497 vector<unsigned> new_by_osd(get_max_osd(), 0);
3498 for (int64_t pool_id : ls) {
3499 const pg_pool_t *pi = get_pg_pool(pool_id);
3500 vector<int> up, up2;
3501 int up_primary;
3502 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
3503 pg_t pgid(ps, pool_id, -1);
3504 total_pg += pi->get_size();
3505 pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
3506 for (int osd : up) {
3507 if (osd >= 0 && osd < get_max_osd())
3508 ++base_by_osd[osd];
3509 }
3510 if (newmap) {
3511 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
3512 for (int osd : up2) {
3513 if (osd >= 0 && osd < get_max_osd())
3514 ++new_by_osd[osd];
3515 }
3516 if (pi->type == pg_pool_t::TYPE_ERASURE) {
3517 for (unsigned i=0; i<up.size(); ++i) {
3518 if (up[i] != up2[i]) {
3519 ++moved_pg;
3520 }
3521 }
3522 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
3523 for (int osd : up) {
3524 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
3525 ++moved_pg;
3526 }
3527 }
3528 } else {
3529 assert(0 == "unhandled pool type");
3530 }
3531 }
3532 }
3533 }
3534
3535 unsigned num_up_in = 0;
3536 for (int osd = 0; osd < get_max_osd(); ++osd) {
3537 if (is_up(osd) && is_in(osd))
3538 ++num_up_in;
3539 }
3540 if (!num_up_in) {
3541 return -EINVAL;
3542 }
3543
3544 float avg_pg = (float)total_pg / (float)num_up_in;
3545 float base_stddev = 0, new_stddev = 0;
3546 int min = -1, max = -1;
3547 unsigned min_base_pg = 0, max_base_pg = 0;
3548 unsigned min_new_pg = 0, max_new_pg = 0;
3549 for (int osd = 0; osd < get_max_osd(); ++osd) {
3550 if (is_up(osd) && is_in(osd)) {
3551 float base_diff = (float)base_by_osd[osd] - avg_pg;
3552 base_stddev += base_diff * base_diff;
3553 float new_diff = (float)new_by_osd[osd] - avg_pg;
3554 new_stddev += new_diff * new_diff;
3555 if (min < 0 || base_by_osd[osd] < min_base_pg) {
3556 min = osd;
3557 min_base_pg = base_by_osd[osd];
3558 min_new_pg = new_by_osd[osd];
3559 }
3560 if (max < 0 || base_by_osd[osd] > max_base_pg) {
3561 max = osd;
3562 max_base_pg = base_by_osd[osd];
3563 max_new_pg = new_by_osd[osd];
3564 }
3565 }
3566 }
3567 base_stddev = sqrt(base_stddev / num_up_in);
3568 new_stddev = sqrt(new_stddev / num_up_in);
3569
3570 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
3571
3572 ostringstream ss;
3573 if (f)
3574 f->open_object_section("utilization");
3575 if (newmap) {
3576 if (f) {
3577 f->dump_unsigned("moved_pgs", moved_pg);
3578 f->dump_unsigned("total_pgs", total_pg);
3579 } else {
3580 float percent = 0;
3581 if (total_pg)
3582 percent = (float)moved_pg * 100.0 / (float)total_pg;
3583 ss << "moved " << moved_pg << " / " << total_pg
3584 << " (" << percent << "%)\n";
3585 }
3586 }
3587 if (f) {
3588 f->dump_float("avg_pgs", avg_pg);
3589 f->dump_float("std_dev", base_stddev);
3590 f->dump_float("expected_baseline_std_dev", edev);
3591 if (newmap)
3592 f->dump_float("new_std_dev", new_stddev);
3593 } else {
3594 ss << "avg " << avg_pg << "\n";
3595 ss << "stddev " << base_stddev;
3596 if (newmap)
3597 ss << " -> " << new_stddev;
3598 ss << " (expected baseline " << edev << ")\n";
3599 }
3600 if (min >= 0) {
3601 if (f) {
3602 f->dump_unsigned("min_osd", min);
3603 f->dump_unsigned("min_osd_pgs", min_base_pg);
3604 if (newmap)
3605 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
3606 } else {
3607 ss << "min osd." << min << " with " << min_base_pg;
3608 if (newmap)
3609 ss << " -> " << min_new_pg;
3610 ss << " pgs (" << (float)min_base_pg / avg_pg;
3611 if (newmap)
3612 ss << " -> " << (float)min_new_pg / avg_pg;
3613 ss << " * mean)\n";
3614 }
3615 }
3616 if (max >= 0) {
3617 if (f) {
3618 f->dump_unsigned("max_osd", max);
3619 f->dump_unsigned("max_osd_pgs", max_base_pg);
3620 if (newmap)
3621 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
3622 } else {
3623 ss << "max osd." << max << " with " << max_base_pg;
3624 if (newmap)
3625 ss << " -> " << max_new_pg;
3626 ss << " pgs (" << (float)max_base_pg / avg_pg;
3627 if (newmap)
3628 ss << " -> " << (float)max_new_pg / avg_pg;
3629 ss << " * mean)\n";
3630 }
3631 }
3632 if (f)
3633 f->close_section();
3634 if (out)
3635 *out = ss.str();
3636 return 0;
3637 }
3638
3639
3640 int OSDMap::clean_pg_upmaps(
3641 CephContext *cct,
3642 Incremental *pending_inc)
3643 {
3644 ldout(cct, 10) << __func__ << dendl;
3645 int changed = 0;
3646 for (auto& p : pg_upmap) {
3647 vector<int> raw;
3648 int primary;
3649 pg_to_raw_osds(p.first, &raw, &primary);
3650 if (vectors_equal(raw, p.second)) {
3651 ldout(cct, 10) << " removing redundant pg_upmap " << p.first << " "
3652 << p.second << dendl;
3653 pending_inc->old_pg_upmap.insert(p.first);
3654 ++changed;
3655 }
3656 }
3657 for (auto& p : pg_upmap_items) {
3658 vector<int> raw;
3659 int primary;
3660 pg_to_raw_osds(p.first, &raw, &primary);
3661 mempool::osdmap::vector<pair<int,int>> newmap;
3662 for (auto& q : p.second) {
3663 if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
3664 newmap.push_back(q);
3665 }
3666 }
3667 if (newmap.empty()) {
3668 ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
3669 << p.second << dendl;
3670 pending_inc->old_pg_upmap_items.insert(p.first);
3671 ++changed;
3672 } else if (newmap != p.second) {
3673 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
3674 << p.first << " " << p.second << " -> " << newmap << dendl;
3675 pending_inc->new_pg_upmap_items[p.first] = newmap;
3676 ++changed;
3677 }
3678 }
3679 return changed;
3680 }
3681
3682 bool OSDMap::try_pg_upmap(
3683 CephContext *cct,
3684 pg_t pg, ///< pg to potentially remap
3685 const set<int>& overfull, ///< osds we'd want to evacuate
3686 const vector<int>& underfull, ///< osds to move to, in order of preference
3687 vector<int> *orig,
3688 vector<int> *out) ///< resulting alternative mapping
3689 {
3690 const pg_pool_t *pool = get_pg_pool(pg.pool());
3691 if (!pool)
3692 return false;
3693 int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
3694 pool->get_size());
3695 if (rule < 0)
3696 return false;
3697
3698 // get original mapping
3699 _pg_to_raw_osds(*pool, pg, orig, NULL);
3700
3701 // make sure there is something there to remap
3702 bool any = false;
3703 for (auto osd : *orig) {
3704 if (overfull.count(osd)) {
3705 any = true;
3706 break;
3707 }
3708 }
3709 if (!any) {
3710 return false;
3711 }
3712
3713 int r = crush->try_remap_rule(
3714 cct,
3715 rule,
3716 pool->get_size(),
3717 overfull, underfull,
3718 *orig,
3719 out);
3720 if (r < 0)
3721 return false;
3722 if (*out == *orig)
3723 return false;
3724 return true;
3725 }
3726
3727 int OSDMap::calc_pg_upmaps(
3728 CephContext *cct,
3729 float max_deviation_ratio,
3730 int max,
3731 const set<int64_t>& only_pools_orig,
3732 OSDMap::Incremental *pending_inc)
3733 {
3734 set<int64_t> only_pools;
3735 if (only_pools_orig.empty()) {
3736 for (auto& i : pools) {
3737 only_pools.insert(i.first);
3738 }
3739 } else {
3740 only_pools = only_pools_orig;
3741 }
3742 OSDMap tmp;
3743 tmp.deepish_copy_from(*this);
3744 float start_deviation = 0;
3745 float end_deviation = 0;
3746 int num_changed = 0;
3747 while (true) {
3748 map<int,set<pg_t>> pgs_by_osd;
3749 int total_pgs = 0;
3750 float osd_weight_total = 0;
3751 map<int,float> osd_weight;
3752 for (auto& i : pools) {
3753 if (!only_pools.empty() && !only_pools.count(i.first))
3754 continue;
3755 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
3756 pg_t pg(ps, i.first);
3757 vector<int> up;
3758 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
3759 for (auto osd : up) {
3760 if (osd != CRUSH_ITEM_NONE)
3761 pgs_by_osd[osd].insert(pg);
3762 }
3763 }
3764 total_pgs += i.second.get_size() * i.second.get_pg_num();
3765
3766 map<int,float> pmap;
3767 int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
3768 i.second.get_type(),
3769 i.second.get_size());
3770 tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
3771 ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
3772 for (auto p : pmap) {
3773 osd_weight[p.first] += p.second;
3774 osd_weight_total += p.second;
3775 }
3776 }
3777 for (auto& i : osd_weight) {
3778 int pgs = 0;
3779 auto p = pgs_by_osd.find(i.first);
3780 if (p != pgs_by_osd.end())
3781 pgs = p->second.size();
3782 else
3783 pgs_by_osd.emplace(i.first, set<pg_t>());
3784 ldout(cct, 20) << " osd." << i.first << " weight " << i.second
3785 << " pgs " << pgs << dendl;
3786 }
3787
3788 float pgs_per_weight = total_pgs / osd_weight_total;
3789 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
3790 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
3791
3792 // osd deviation
3793 float total_deviation = 0;
3794 map<int,float> osd_deviation; // osd, deviation(pgs)
3795 multimap<float,int> deviation_osd; // deviation(pgs), osd
3796 set<int> overfull;
3797 for (auto& i : pgs_by_osd) {
3798 float target = osd_weight[i.first] * pgs_per_weight;
3799 float deviation = (float)i.second.size() - target;
3800 ldout(cct, 20) << " osd." << i.first
3801 << "\tpgs " << i.second.size()
3802 << "\ttarget " << target
3803 << "\tdeviation " << deviation
3804 << dendl;
3805 osd_deviation[i.first] = deviation;
3806 deviation_osd.insert(make_pair(deviation, i.first));
3807 if (deviation >= 1.0)
3808 overfull.insert(i.first);
3809 total_deviation += abs(deviation);
3810 }
3811 if (num_changed == 0) {
3812 start_deviation = total_deviation;
3813 }
3814 end_deviation = total_deviation;
3815
3816 // build underfull, sorted from least-full to most-average
3817 vector<int> underfull;
3818 for (auto i = deviation_osd.begin();
3819 i != deviation_osd.end();
3820 ++i) {
3821 if (i->first >= -.999)
3822 break;
3823 underfull.push_back(i->second);
3824 }
3825 ldout(cct, 10) << " total_deviation " << total_deviation
3826 << " overfull " << overfull
3827 << " underfull " << underfull << dendl;
3828 if (overfull.empty() || underfull.empty())
3829 break;
3830
3831 // pick fullest
3832 bool restart = false;
3833 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
3834 int osd = p->second;
3835 float deviation = p->first;
3836 float target = osd_weight[osd] * pgs_per_weight;
3837 if (deviation/target < max_deviation_ratio) {
3838 ldout(cct, 10) << " osd." << osd
3839 << " target " << target
3840 << " deviation " << deviation
3841 << " -> ratio " << deviation/target
3842 << " < max ratio " << max_deviation_ratio << dendl;
3843 break;
3844 }
3845 int num_to_move = deviation;
3846 ldout(cct, 10) << " osd." << osd << " move " << num_to_move << dendl;
3847 if (num_to_move < 1)
3848 break;
3849
3850 set<pg_t>& pgs = pgs_by_osd[osd];
3851
3852 // look for remaps we can un-remap
3853 for (auto pg : pgs) {
3854 auto p = tmp.pg_upmap_items.find(pg);
3855 if (p != tmp.pg_upmap_items.end()) {
3856 for (auto q : p->second) {
3857 if (q.second == osd) {
3858 ldout(cct, 10) << " dropping pg_upmap_items " << pg
3859 << " " << p->second << dendl;
3860 tmp.pg_upmap_items.erase(p);
3861 pending_inc->old_pg_upmap_items.insert(pg);
3862 ++num_changed;
3863 restart = true;
3864 }
3865 }
3866 }
3867 if (restart)
3868 break;
3869 } // pg loop
3870 if (restart)
3871 break;
3872
3873 for (auto pg : pgs) {
3874 if (tmp.pg_upmap.count(pg) ||
3875 tmp.pg_upmap_items.count(pg)) {
3876 ldout(cct, 20) << " already remapped " << pg << dendl;
3877 continue;
3878 }
3879 ldout(cct, 10) << " trying " << pg << dendl;
3880 vector<int> orig, out;
3881 if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
3882 continue;
3883 }
3884 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
3885 if (orig.size() != out.size()) {
3886 continue;
3887 }
3888 assert(orig != out);
3889 auto& rmi = tmp.pg_upmap_items[pg];
3890 for (unsigned i = 0; i < out.size(); ++i) {
3891 if (orig[i] != out[i]) {
3892 rmi.push_back(make_pair(orig[i], out[i]));
3893 }
3894 }
3895 pending_inc->new_pg_upmap_items[pg] = rmi;
3896 ldout(cct, 10) << " " << pg << " pg_upmap_items " << rmi << dendl;
3897 restart = true;
3898 ++num_changed;
3899 break;
3900 } // pg loop
3901 if (restart)
3902 break;
3903 } // osd loop
3904
3905 if (!restart) {
3906 ldout(cct, 10) << " failed to find any changes to make" << dendl;
3907 break;
3908 }
3909 if (--max == 0) {
3910 ldout(cct, 10) << " hit max iterations, stopping" << dendl;
3911 break;
3912 }
3913 }
3914 ldout(cct, 10) << " start deviation " << start_deviation << dendl;
3915 ldout(cct, 10) << " end deviation " << end_deviation << dendl;
3916 return num_changed;
3917 }
3918
3919 int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
3920 {
3921 return crush->get_leaves(name, osds);
3922 }
3923
3924 template <typename F>
3925 class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
3926 public:
3927 typedef CrushTreeDumper::Dumper<F> Parent;
3928
3929 OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
3930 const PGStatService *pgs_, bool tree_) :
3931 Parent(crush),
3932 osdmap(osdmap_),
3933 pgs(pgs_),
3934 tree(tree_),
3935 average_util(average_utilization()),
3936 min_var(-1),
3937 max_var(-1),
3938 stddev(0),
3939 sum(0) {
3940 }
3941
3942 protected:
3943 void dump_stray(F *f) {
3944 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3945 if (osdmap->exists(i) && !this->is_touched(i))
3946 dump_item(CrushTreeDumper::Item(i, 0, 0), f);
3947 }
3948 }
3949
3950 void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
3951 if (!tree && qi.is_bucket())
3952 return;
3953
3954 float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
3955 int64_t kb = 0, kb_used = 0, kb_avail = 0;
3956 double util = 0;
3957 if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_avail))
3958 if (kb_used && kb)
3959 util = 100.0 * (double)kb_used / (double)kb;
3960
3961 double var = 1.0;
3962 if (average_util)
3963 var = util / average_util;
3964
3965 size_t num_pgs = qi.is_bucket() ? 0 : pgs->get_num_pg_by_osd(qi.id);
3966
3967 dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, num_pgs, f);
3968
3969 if (!qi.is_bucket() && reweight > 0) {
3970 if (min_var < 0 || var < min_var)
3971 min_var = var;
3972 if (max_var < 0 || var > max_var)
3973 max_var = var;
3974
3975 double dev = util - average_util;
3976 dev *= dev;
3977 stddev += reweight * dev;
3978 sum += reweight;
3979 }
3980 }
3981
3982 virtual void dump_item(const CrushTreeDumper::Item &qi,
3983 float &reweight,
3984 int64_t kb,
3985 int64_t kb_used,
3986 int64_t kb_avail,
3987 double& util,
3988 double& var,
3989 const size_t num_pgs,
3990 F *f) = 0;
3991
3992 double dev() {
3993 return sum > 0 ? sqrt(stddev / sum) : 0;
3994 }
3995
3996 double average_utilization() {
3997 int64_t kb = 0, kb_used = 0;
3998 for (int i = 0; i < osdmap->get_max_osd(); i++) {
3999 if (!osdmap->exists(i) || osdmap->get_weight(i) == 0)
4000 continue;
4001 int64_t kb_i, kb_used_i, kb_avail_i;
4002 if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_avail_i)) {
4003 kb += kb_i;
4004 kb_used += kb_used_i;
4005 }
4006 }
4007 return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
4008 }
4009
4010 bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
4011 int64_t* kb_avail) const {
4012 const osd_stat_t *p = pgs->get_osd_stat(id);
4013 if (!p) return false;
4014 *kb = p->kb;
4015 *kb_used = p->kb_used;
4016 *kb_avail = p->kb_avail;
4017 return *kb > 0;
4018 }
4019
4020 bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
4021 int64_t* kb_avail) const {
4022 if (id >= 0) {
4023 if (osdmap->is_out(id)) {
4024 *kb = 0;
4025 *kb_used = 0;
4026 *kb_avail = 0;
4027 return true;
4028 }
4029 return get_osd_utilization(id, kb, kb_used, kb_avail);
4030 }
4031
4032 *kb = 0;
4033 *kb_used = 0;
4034 *kb_avail = 0;
4035
4036 for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
4037 int item = osdmap->crush->get_bucket_item(id, k);
4038 int64_t kb_i = 0, kb_used_i = 0, kb_avail_i = 0;
4039 if (!get_bucket_utilization(item, &kb_i, &kb_used_i, &kb_avail_i))
4040 return false;
4041 *kb += kb_i;
4042 *kb_used += kb_used_i;
4043 *kb_avail += kb_avail_i;
4044 }
4045 return *kb > 0;
4046 }
4047
4048 protected:
4049 const OSDMap *osdmap;
4050 const PGStatService *pgs;
4051 bool tree;
4052 double average_util;
4053 double min_var;
4054 double max_var;
4055 double stddev;
4056 double sum;
4057 };
4058
4059
4060 class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
4061 public:
4062 typedef OSDUtilizationDumper<TextTable> Parent;
4063
4064 OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4065 const PGStatService *pgs, bool tree) :
4066 Parent(crush, osdmap, pgs, tree) {}
4067
4068 void dump(TextTable *tbl) {
4069 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
4070 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
4071 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
4072 tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
4073 tbl->define_column("USE", TextTable::LEFT, TextTable::RIGHT);
4074 tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
4075 tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
4076 tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
4077 tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
4078 if (tree)
4079 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
4080
4081 Parent::dump(tbl);
4082
4083 dump_stray(tbl);
4084
4085 *tbl << "" << "" << "TOTAL"
4086 << si_t(pgs->get_osd_sum().kb << 10)
4087 << si_t(pgs->get_osd_sum().kb_used << 10)
4088 << si_t(pgs->get_osd_sum().kb_avail << 10)
4089 << lowprecision_t(average_util)
4090 << ""
4091 << TextTable::endrow;
4092 }
4093
4094 protected:
4095 struct lowprecision_t {
4096 float v;
4097 explicit lowprecision_t(float _v) : v(_v) {}
4098 };
4099 friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
4100
4101 using OSDUtilizationDumper<TextTable>::dump_item;
4102 void dump_item(const CrushTreeDumper::Item &qi,
4103 float &reweight,
4104 int64_t kb,
4105 int64_t kb_used,
4106 int64_t kb_avail,
4107 double& util,
4108 double& var,
4109 const size_t num_pgs,
4110 TextTable *tbl) override {
4111 *tbl << qi.id
4112 << weightf_t(qi.weight)
4113 << weightf_t(reweight)
4114 << si_t(kb << 10)
4115 << si_t(kb_used << 10)
4116 << si_t(kb_avail << 10)
4117 << lowprecision_t(util)
4118 << lowprecision_t(var);
4119
4120 if (qi.is_bucket()) {
4121 *tbl << "-";
4122 } else {
4123 *tbl << num_pgs;
4124 }
4125
4126 if (tree) {
4127 ostringstream name;
4128 for (int k = 0; k < qi.depth; k++)
4129 name << " ";
4130 if (qi.is_bucket()) {
4131 int type = crush->get_bucket_type(qi.id);
4132 name << crush->get_type_name(type) << " "
4133 << crush->get_item_name(qi.id);
4134 } else {
4135 name << "osd." << qi.id;
4136 }
4137 *tbl << name.str();
4138 }
4139
4140 *tbl << TextTable::endrow;
4141 }
4142
4143 public:
4144 string summary() {
4145 ostringstream out;
4146 out << "MIN/MAX VAR: " << lowprecision_t(min_var)
4147 << "/" << lowprecision_t(max_var) << " "
4148 << "STDDEV: " << lowprecision_t(dev());
4149 return out.str();
4150 }
4151 };
4152
4153 ostream& operator<<(ostream& out,
4154 const OSDUtilizationPlainDumper::lowprecision_t& v)
4155 {
4156 if (v.v < -0.01) {
4157 return out << "-";
4158 } else if (v.v < 0.001) {
4159 return out << "0";
4160 } else {
4161 std::streamsize p = out.precision();
4162 return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
4163 }
4164 }
4165
4166 class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
4167 public:
4168 typedef OSDUtilizationDumper<Formatter> Parent;
4169
4170 OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
4171 const PGStatService *pgs, bool tree) :
4172 Parent(crush, osdmap, pgs, tree) {}
4173
4174 void dump(Formatter *f) {
4175 f->open_array_section("nodes");
4176 Parent::dump(f);
4177 f->close_section();
4178
4179 f->open_array_section("stray");
4180 dump_stray(f);
4181 f->close_section();
4182 }
4183
4184 protected:
4185 using OSDUtilizationDumper<Formatter>::dump_item;
4186 void dump_item(const CrushTreeDumper::Item &qi,
4187 float &reweight,
4188 int64_t kb,
4189 int64_t kb_used,
4190 int64_t kb_avail,
4191 double& util,
4192 double& var,
4193 const size_t num_pgs,
4194 Formatter *f) override {
4195 f->open_object_section("item");
4196 CrushTreeDumper::dump_item_fields(crush, qi, f);
4197 f->dump_float("reweight", reweight);
4198 f->dump_int("kb", kb);
4199 f->dump_int("kb_used", kb_used);
4200 f->dump_int("kb_avail", kb_avail);
4201 f->dump_float("utilization", util);
4202 f->dump_float("var", var);
4203 f->dump_unsigned("pgs", num_pgs);
4204 CrushTreeDumper::dump_bucket_children(crush, qi, f);
4205 f->close_section();
4206 }
4207
4208 public:
4209 void summary(Formatter *f) {
4210 f->open_object_section("summary");
4211 f->dump_int("total_kb", pgs->get_osd_sum().kb);
4212 f->dump_int("total_kb_used", pgs->get_osd_sum().kb_used);
4213 f->dump_int("total_kb_avail", pgs->get_osd_sum().kb_avail);
4214 f->dump_float("average_utilization", average_util);
4215 f->dump_float("min_var", min_var);
4216 f->dump_float("max_var", max_var);
4217 f->dump_float("dev", dev());
4218 f->close_section();
4219 }
4220 };
4221
4222 void print_osd_utilization(const OSDMap& osdmap,
4223 const PGStatService *pgstat,
4224 ostream& out,
4225 Formatter *f,
4226 bool tree)
4227 {
4228 const CrushWrapper *crush = osdmap.crush.get();
4229 if (f) {
4230 f->open_object_section("df");
4231 OSDUtilizationFormatDumper d(crush, &osdmap, pgstat, tree);
4232 d.dump(f);
4233 d.summary(f);
4234 f->close_section();
4235 f->flush(out);
4236 } else {
4237 OSDUtilizationPlainDumper d(crush, &osdmap, pgstat, tree);
4238 TextTable tbl;
4239 d.dump(&tbl);
4240 out << tbl << d.summary() << "\n";
4241 }
4242 }