]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSDMap.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / osd / OSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include "OSDMap.h"
19 #include <algorithm>
20 #include "common/config.h"
21 #include "common/Formatter.h"
22 #include "common/TextTable.h"
23 #include "include/ceph_features.h"
24 #include "include/str_map.h"
25
26 #include "common/code_environment.h"
27
28 #include "crush/CrushTreeDumper.h"
29 #include "common/Clock.h"
30
// Select ceph_subsys_osd as this file's debug-output subsystem.
// NOTE(review): presumably consumed by the ldout/lderr logging macros used
// further down -- confirm against the dout headers.
#define dout_subsys ceph_subsys_osd

// Invoke the mempool object-factory macro for OSDMap and OSDMap::Incremental.
// NOTE(review): the macro is defined elsewhere; the arguments look like
// (type, factory name, pool name), with both types sharing the "osdmap"
// pool -- confirm against the mempool header.
MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
35
36
37 // ----------------------------------
38 // osd_info_t
39
40 void osd_info_t::dump(Formatter *f) const
41 {
42 f->dump_int("last_clean_begin", last_clean_begin);
43 f->dump_int("last_clean_end", last_clean_end);
44 f->dump_int("up_from", up_from);
45 f->dump_int("up_thru", up_thru);
46 f->dump_int("down_at", down_at);
47 f->dump_int("lost_at", lost_at);
48 }
49
50 void osd_info_t::encode(bufferlist& bl) const
51 {
52 __u8 struct_v = 1;
53 ::encode(struct_v, bl);
54 ::encode(last_clean_begin, bl);
55 ::encode(last_clean_end, bl);
56 ::encode(up_from, bl);
57 ::encode(up_thru, bl);
58 ::encode(down_at, bl);
59 ::encode(lost_at, bl);
60 }
61
62 void osd_info_t::decode(bufferlist::iterator& bl)
63 {
64 __u8 struct_v;
65 ::decode(struct_v, bl);
66 ::decode(last_clean_begin, bl);
67 ::decode(last_clean_end, bl);
68 ::decode(up_from, bl);
69 ::decode(up_thru, bl);
70 ::decode(down_at, bl);
71 ::decode(lost_at, bl);
72 }
73
74 void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
75 {
76 o.push_back(new osd_info_t);
77 o.push_back(new osd_info_t);
78 o.back()->last_clean_begin = 1;
79 o.back()->last_clean_end = 2;
80 o.back()->up_from = 30;
81 o.back()->up_thru = 40;
82 o.back()->down_at = 5;
83 o.back()->lost_at = 6;
84 }
85
86 ostream& operator<<(ostream& out, const osd_info_t& info)
87 {
88 out << "up_from " << info.up_from
89 << " up_thru " << info.up_thru
90 << " down_at " << info.down_at
91 << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
92 if (info.lost_at)
93 out << " lost_at " << info.lost_at;
94 return out;
95 }
96
97 // ----------------------------------
98 // osd_xinfo_t
99
100 void osd_xinfo_t::dump(Formatter *f) const
101 {
102 f->dump_stream("down_stamp") << down_stamp;
103 f->dump_float("laggy_probability", laggy_probability);
104 f->dump_int("laggy_interval", laggy_interval);
105 f->dump_int("features", features);
106 f->dump_unsigned("old_weight", old_weight);
107 }
108
109 void osd_xinfo_t::encode(bufferlist& bl) const
110 {
111 ENCODE_START(3, 1, bl);
112 ::encode(down_stamp, bl);
113 __u32 lp = laggy_probability * 0xfffffffful;
114 ::encode(lp, bl);
115 ::encode(laggy_interval, bl);
116 ::encode(features, bl);
117 ::encode(old_weight, bl);
118 ENCODE_FINISH(bl);
119 }
120
121 void osd_xinfo_t::decode(bufferlist::iterator& bl)
122 {
123 DECODE_START(3, bl);
124 ::decode(down_stamp, bl);
125 __u32 lp;
126 ::decode(lp, bl);
127 laggy_probability = (float)lp / (float)0xffffffff;
128 ::decode(laggy_interval, bl);
129 if (struct_v >= 2)
130 ::decode(features, bl);
131 else
132 features = 0;
133 if (struct_v >= 3)
134 ::decode(old_weight, bl);
135 else
136 old_weight = 0;
137 DECODE_FINISH(bl);
138 }
139
140 void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
141 {
142 o.push_back(new osd_xinfo_t);
143 o.push_back(new osd_xinfo_t);
144 o.back()->down_stamp = utime_t(2, 3);
145 o.back()->laggy_probability = .123;
146 o.back()->laggy_interval = 123456;
147 o.back()->old_weight = 0x7fff;
148 }
149
150 ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
151 {
152 return out << "down_stamp " << xi.down_stamp
153 << " laggy_probability " << xi.laggy_probability
154 << " laggy_interval " << xi.laggy_interval
155 << " old_weight " << xi.old_weight;
156 }
157
158 // ----------------------------------
159 // OSDMap::Incremental
160
161 int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
162 {
163 int n = 0;
164 for (auto &weight : new_weight) {
165 if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
166 n++; // marked out
167 else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
168 n--; // marked in
169 }
170 return n;
171 }
172
173 int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
174 {
175 int n = 0;
176 for (auto &state : new_state) { //
177 if (state.second & CEPH_OSD_UP) {
178 if (previous->is_up(state.first))
179 n++; // marked down
180 else
181 n--; // marked up
182 }
183 }
184 return n;
185 }
186
187 int OSDMap::Incremental::identify_osd(uuid_d u) const
188 {
189 for (auto &uuid : new_uuid)
190 if (uuid.second == u)
191 return uuid.first;
192 return -1;
193 }
194
195 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
196 const OSDMap& osdmap)
197 {
198 assert(epoch == osdmap.get_epoch() + 1);
199
200 for (auto &new_pool : new_pools) {
201 if (!new_pool.second.tiers.empty()) {
202 pg_pool_t& base = new_pool.second;
203
204 for (const auto &tier_pool : base.tiers) {
205 const auto &r = new_pools.find(tier_pool);
206 pg_pool_t *tier = 0;
207 if (r == new_pools.end()) {
208 const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
209 if (!orig) {
210 lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
211 return -EIO;
212 }
213 tier = get_new_pool(tier_pool, orig);
214 } else {
215 tier = &r->second;
216 }
217 if (tier->tier_of != new_pool.first) {
218 lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
219 return -EIO;
220 }
221
222 ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
223 << tier_pool << dendl;
224 tier->snap_seq = base.snap_seq;
225 tier->snap_epoch = base.snap_epoch;
226 tier->snaps = base.snaps;
227 tier->removed_snaps = base.removed_snaps;
228 }
229 }
230 }
231 return 0;
232 }
233
234
235 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
236 {
237 if (id >= 0)
238 return is_down(id);
239
240 if (down_cache &&
241 down_cache->count(id)) {
242 return true;
243 }
244
245 list<int> children;
246 crush->get_children(id, &children);
247 for (const auto &child : children) {
248 if (!subtree_is_down(child, down_cache)) {
249 return false;
250 }
251 }
252 if (down_cache) {
253 down_cache->insert(id);
254 }
255 return true;
256 }
257
258 bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
259 {
260 // use a stack-local down_cache if we didn't get one from the
261 // caller. then at least this particular call will avoid duplicated
262 // work.
263 set<int> local_down_cache;
264 if (!down_cache) {
265 down_cache = &local_down_cache;
266 }
267
268 int current = id;
269 while (true) {
270 int type;
271 if (current >= 0) {
272 type = 0;
273 } else {
274 type = crush->get_bucket_type(current);
275 }
276 assert(type >= 0);
277
278 if (!subtree_is_down(current, down_cache)) {
279 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
280 return false;
281 }
282
283 // is this a big enough subtree to be marked as down?
284 if (type >= subtree_type) {
285 ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
286 return true;
287 }
288
289 int r = crush->get_immediate_parent_id(current, &current);
290 if (r < 0) {
291 return false;
292 }
293 }
294 }
295
296 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
297 {
298 __u16 v = 5;
299 ::encode(v, bl);
300 ::encode(fsid, bl);
301 ::encode(epoch, bl);
302 ::encode(modified, bl);
303 int32_t new_t = new_pool_max;
304 ::encode(new_t, bl);
305 ::encode(new_flags, bl);
306 ::encode(fullmap, bl);
307 ::encode(crush, bl);
308
309 ::encode(new_max_osd, bl);
310 // for ::encode(new_pools, bl);
311 __u32 n = new_pools.size();
312 ::encode(n, bl);
313 for (const auto &new_pool : new_pools) {
314 n = new_pool.first;
315 ::encode(n, bl);
316 ::encode(new_pool.second, bl, 0);
317 }
318 // for ::encode(new_pool_names, bl);
319 n = new_pool_names.size();
320 ::encode(n, bl);
321
322 for (const auto &new_pool_name : new_pool_names) {
323 n = new_pool_name.first;
324 ::encode(n, bl);
325 ::encode(new_pool_name.second, bl);
326 }
327 // for ::encode(old_pools, bl);
328 n = old_pools.size();
329 ::encode(n, bl);
330 for (auto &old_pool : old_pools) {
331 n = old_pool;
332 ::encode(n, bl);
333 }
334 ::encode(new_up_client, bl, 0);
335 ::encode(new_state, bl);
336 ::encode(new_weight, bl);
337 // for ::encode(new_pg_temp, bl);
338 n = new_pg_temp.size();
339 ::encode(n, bl);
340
341 for (const auto &pg_temp : new_pg_temp) {
342 old_pg_t opg = pg_temp.first.get_old_pg();
343 ::encode(opg, bl);
344 ::encode(pg_temp.second, bl);
345 }
346 }
347
348 void OSDMap::Incremental::encode_classic(bufferlist& bl, uint64_t features) const
349 {
350 if ((features & CEPH_FEATURE_PGID64) == 0) {
351 encode_client_old(bl);
352 return;
353 }
354
355 // base
356 __u16 v = 6;
357 ::encode(v, bl);
358 ::encode(fsid, bl);
359 ::encode(epoch, bl);
360 ::encode(modified, bl);
361 ::encode(new_pool_max, bl);
362 ::encode(new_flags, bl);
363 ::encode(fullmap, bl);
364 ::encode(crush, bl);
365
366 ::encode(new_max_osd, bl);
367 ::encode(new_pools, bl, features);
368 ::encode(new_pool_names, bl);
369 ::encode(old_pools, bl);
370 ::encode(new_up_client, bl, features);
371 ::encode(new_state, bl);
372 ::encode(new_weight, bl);
373 ::encode(new_pg_temp, bl);
374
375 // extended
376 __u16 ev = 10;
377 ::encode(ev, bl);
378 ::encode(new_hb_back_up, bl, features);
379 ::encode(new_up_thru, bl);
380 ::encode(new_last_clean_interval, bl);
381 ::encode(new_lost, bl);
382 ::encode(new_blacklist, bl, features);
383 ::encode(old_blacklist, bl, features);
384 ::encode(new_up_cluster, bl, features);
385 ::encode(cluster_snapshot, bl);
386 ::encode(new_uuid, bl);
387 ::encode(new_xinfo, bl);
388 ::encode(new_hb_front_up, bl, features);
389 }
390
// Append this incremental to bl.
//
// Without CEPH_FEATURE_OSDMAP_ENC in features, encode_classic() is used.
// Otherwise the output is a wrapper (v8, compat 7) containing, in order:
//   1. the client-usable section,
//   2. the extended, osd-only section,
//   3. a 32-bit inc_crc,
//   4. full_crc.
// inc_crc is written as a zero placeholder first and patched afterwards with
// the crc32c of everything this call appended except those four bytes.
//
// Although the method is const, it assigns inc_crc and have_crc.
void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
{
  if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
    encode_classic(bl, features);
    return;
  }

  // only a select set of callers should *ever* be encoding new
  // OSDMaps. others should be passing around the canonical encoded
  // buffers from on high. select out those callers by passing in an
  // "impossible" feature bit.
  assert(features & CEPH_FEATURE_RESERVED);
  features &= ~CEPH_FEATURE_RESERVED;

  // start_offset: where this encoding begins inside bl.
  // tail_offset:  first byte after the inc_crc placeholder.
  // crc_it:       positioned on the inc_crc placeholder, for patching.
  size_t start_offset = bl.length();
  size_t tail_offset;
  buffer::list::iterator crc_it;

  // meta-encoding: how we include client-used and osd-specific data
  ENCODE_START(8, 7, bl);

  {
    // v4 adds the pg_upmap fields; without SERVER_LUMINOUS in features the
    // section is written as v3 and those fields are left out.
    uint8_t v = 4;
    if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
      v = 3;
    }
    ENCODE_START(v, 1, bl); // client-usable data
    ::encode(fsid, bl);
    ::encode(epoch, bl);
    ::encode(modified, bl);
    ::encode(new_pool_max, bl);
    ::encode(new_flags, bl);
    ::encode(fullmap, bl);
    ::encode(crush, bl);

    ::encode(new_max_osd, bl);
    ::encode(new_pools, bl, features);
    ::encode(new_pool_names, bl);
    ::encode(old_pools, bl);
    ::encode(new_up_client, bl, features);
    ::encode(new_state, bl);
    ::encode(new_weight, bl);
    ::encode(new_pg_temp, bl);
    ::encode(new_primary_temp, bl);
    ::encode(new_primary_affinity, bl);
    ::encode(new_erasure_code_profiles, bl);
    ::encode(old_erasure_code_profiles, bl);
    if (v >= 4) {
      ::encode(new_pg_upmap, bl);
      ::encode(old_pg_upmap, bl);
      ::encode(new_pg_upmap_items, bl);
      ::encode(old_pg_upmap_items, bl);
    }
    ENCODE_FINISH(bl); // client-usable data
  }

  {
    // Without SERVER_LUMINOUS in features the section is written as v2,
    // which leaves out the ratios and new_require_min_compat_client.
    uint8_t target_v = 5;
    if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
      target_v = 2;
    }
    ENCODE_START(target_v, 1, bl); // extended, osd-only data
    ::encode(new_hb_back_up, bl, features);
    ::encode(new_up_thru, bl);
    ::encode(new_last_clean_interval, bl);
    ::encode(new_lost, bl);
    ::encode(new_blacklist, bl, features);
    ::encode(old_blacklist, bl, features);
    ::encode(new_up_cluster, bl, features);
    ::encode(cluster_snapshot, bl);
    ::encode(new_uuid, bl);
    ::encode(new_xinfo, bl);
    ::encode(new_hb_front_up, bl, features);
    ::encode(features, bl); // NOTE: features arg, not the member
    if (target_v >= 3) {
      // decode() reads these under struct_v >= 3, >= 3, >= 4 and >= 5
      // respectively; target_v is either 2 or 5 here, so one check suffices.
      ::encode(new_nearfull_ratio, bl);
      ::encode(new_full_ratio, bl);
      ::encode(new_backfillfull_ratio, bl);
      ::encode(new_require_min_compat_client, bl);
    }
    ENCODE_FINISH(bl); // osd-only data
  }

  // Reserve the 4 crc bytes and remember where they are.
  ::encode((uint32_t)0, bl); // dummy inc_crc
  crc_it = bl.end();
  crc_it.advance(-4);
  tail_offset = bl.length();

  ::encode(full_crc, bl);

  ENCODE_FINISH(bl); // meta-encoding wrapper

  // fill in crc
  // front = [start_offset, placeholder), tail = (placeholder, end); the
  // checksum is chained over both so the placeholder itself is excluded.
  bufferlist front;
  front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
  inc_crc = front.crc32c(-1);
  bufferlist tail;
  tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
  inc_crc = tail.crc32c(inc_crc);
  ceph_le32 crc_le;
  crc_le = inc_crc;
  crc_it.copy_in(4, (char*)&crc_le);   // overwrite the placeholder in place
  have_crc = true;
}
495
// Decode an incremental written in one of the older, single-version layouts
// (see encode_classic() / encode_client_old()).  The leading 16-bit version
// v selects the base-section layout; a second 16-bit version ev, present
// when v >= 5, gates the optional fields of the extended section.
void OSDMap::Incremental::decode_classic(bufferlist::iterator &p)
{
  // n: element counts / 32-bit scratch value; t: 32-bit keys of the
  // hand-decoded containers below.
  __u32 n, t;
  // base
  __u16 v;
  ::decode(v, p);
  ::decode(fsid, p);
  ::decode(epoch, p);
  ::decode(modified, p);
  if (v == 4 || v == 5) {
    // these versions carry new_pool_max as a 32-bit value
    ::decode(n, p);
    new_pool_max = n;
  } else if (v >= 6)
    ::decode(new_pool_max, p);
  // (for any other v, new_pool_max is not read at all)
  ::decode(new_flags, p);
  ::decode(fullmap, p);
  ::decode(crush, p);

  ::decode(new_max_osd, p);
  if (v < 6) {
    // count followed by (32-bit pool id, pool) pairs
    new_pools.clear();
    ::decode(n, p);
    while (n--) {
      ::decode(t, p);
      ::decode(new_pools[t], p);
    }
  } else {
    ::decode(new_pools, p);
  }
  if (v == 5) {
    // count followed by (32-bit pool id, name) pairs
    new_pool_names.clear();
    ::decode(n, p);
    while (n--) {
      ::decode(t, p);
      ::decode(new_pool_names[t], p);
    }
  } else if (v >= 6) {
    ::decode(new_pool_names, p);
  }
  // (for v < 5 the pool names are read in the extended section below)
  if (v < 6) {
    // count followed by 32-bit pool ids
    old_pools.clear();
    ::decode(n, p);
    while (n--) {
      ::decode(t, p);
      old_pools.insert(t);
    }
  } else {
    ::decode(old_pools, p);
  }
  ::decode(new_up_client, p);
  ::decode(new_state, p);
  ::decode(new_weight, p);

  if (v < 6) {
    // count followed by (old_pg_t, osd list) pairs; keys are converted to
    // pg_t
    new_pg_temp.clear();
    ::decode(n, p);
    while (n--) {
      old_pg_t opg;
      ::decode_raw(opg, p);
      ::decode(new_pg_temp[pg_t(opg)], p);
    }
  } else {
    ::decode(new_pg_temp, p);
  }

  // decode short map, too.
  // (a v5 buffer may end here, which is exactly what encode_client_old()
  // produces)
  if (v == 5 && p.end())
    return;

  // extended
  __u16 ev = 0;
  if (v >= 5)
    ::decode(ev, p);
  ::decode(new_hb_back_up, p);
  if (v < 5)
    ::decode(new_pool_names, p);
  ::decode(new_up_thru, p);
  ::decode(new_last_clean_interval, p);
  ::decode(new_lost, p);
  ::decode(new_blacklist, p);
  ::decode(old_blacklist, p);
  // each of the following fields exists only from the given ev onwards
  if (ev >= 6)
    ::decode(new_up_cluster, p);
  if (ev >= 7)
    ::decode(cluster_snapshot, p);
  if (ev >= 8)
    ::decode(new_uuid, p);
  if (ev >= 9)
    ::decode(new_xinfo, p);
  if (ev >= 10)
    ::decode(new_hb_front_up, p);
}
588
589 void OSDMap::Incremental::decode(bufferlist::iterator& bl)
590 {
591 /**
592 * Older encodings of the Incremental had a single struct_v which
593 * covered the whole encoding, and was prior to our modern
594 * stuff which includes a compatv and a size. So if we see
595 * a struct_v < 7, we must rewind to the beginning and use our
596 * classic decoder.
597 */
598 size_t start_offset = bl.get_off();
599 size_t tail_offset = 0;
600 bufferlist crc_front, crc_tail;
601
602 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
603 if (struct_v < 7) {
604 int struct_v_size = sizeof(struct_v);
605 bl.advance(-struct_v_size);
606 decode_classic(bl);
607 encode_features = 0;
608 if (struct_v >= 6)
609 encode_features = CEPH_FEATURE_PGID64;
610 else
611 encode_features = 0;
612 return;
613 }
614 {
615 DECODE_START(4, bl); // client-usable data
616 ::decode(fsid, bl);
617 ::decode(epoch, bl);
618 ::decode(modified, bl);
619 ::decode(new_pool_max, bl);
620 ::decode(new_flags, bl);
621 ::decode(fullmap, bl);
622 ::decode(crush, bl);
623
624 ::decode(new_max_osd, bl);
625 ::decode(new_pools, bl);
626 ::decode(new_pool_names, bl);
627 ::decode(old_pools, bl);
628 ::decode(new_up_client, bl);
629 ::decode(new_state, bl);
630 ::decode(new_weight, bl);
631 ::decode(new_pg_temp, bl);
632 ::decode(new_primary_temp, bl);
633 if (struct_v >= 2)
634 ::decode(new_primary_affinity, bl);
635 else
636 new_primary_affinity.clear();
637 if (struct_v >= 3) {
638 ::decode(new_erasure_code_profiles, bl);
639 ::decode(old_erasure_code_profiles, bl);
640 } else {
641 new_erasure_code_profiles.clear();
642 old_erasure_code_profiles.clear();
643 }
644 if (struct_v >= 4) {
645 ::decode(new_pg_upmap, bl);
646 ::decode(old_pg_upmap, bl);
647 ::decode(new_pg_upmap_items, bl);
648 ::decode(old_pg_upmap_items, bl);
649 }
650 DECODE_FINISH(bl); // client-usable data
651 }
652
653 {
654 DECODE_START(5, bl); // extended, osd-only data
655 ::decode(new_hb_back_up, bl);
656 ::decode(new_up_thru, bl);
657 ::decode(new_last_clean_interval, bl);
658 ::decode(new_lost, bl);
659 ::decode(new_blacklist, bl);
660 ::decode(old_blacklist, bl);
661 ::decode(new_up_cluster, bl);
662 ::decode(cluster_snapshot, bl);
663 ::decode(new_uuid, bl);
664 ::decode(new_xinfo, bl);
665 ::decode(new_hb_front_up, bl);
666 if (struct_v >= 2)
667 ::decode(encode_features, bl);
668 else
669 encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
670 if (struct_v >= 3) {
671 ::decode(new_nearfull_ratio, bl);
672 ::decode(new_full_ratio, bl);
673 } else {
674 new_nearfull_ratio = -1;
675 new_full_ratio = -1;
676 }
677 if (struct_v >= 4) {
678 ::decode(new_backfillfull_ratio, bl);
679 } else {
680 new_backfillfull_ratio = -1;
681 }
682 if (struct_v >= 5)
683 ::decode(new_require_min_compat_client, bl);
684 DECODE_FINISH(bl); // osd-only data
685 }
686
687 if (struct_v >= 8) {
688 have_crc = true;
689 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
690 ::decode(inc_crc, bl);
691 tail_offset = bl.get_off();
692 ::decode(full_crc, bl);
693 } else {
694 have_crc = false;
695 full_crc = 0;
696 inc_crc = 0;
697 }
698
699 DECODE_FINISH(bl); // wrapper
700
701 if (have_crc) {
702 // verify crc
703 uint32_t actual = crc_front.crc32c(-1);
704 if (tail_offset < bl.get_off()) {
705 bufferlist tail;
706 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
707 actual = tail.crc32c(actual);
708 }
709 if (inc_crc != actual) {
710 ostringstream ss;
711 ss << "bad crc, actual " << actual << " != expected " << inc_crc;
712 string s = ss.str();
713 throw buffer::malformed_input(s.c_str());
714 }
715 }
716 }
717
718 void OSDMap::Incremental::dump(Formatter *f) const
719 {
720 f->dump_int("epoch", epoch);
721 f->dump_stream("fsid") << fsid;
722 f->dump_stream("modified") << modified;
723 f->dump_int("new_pool_max", new_pool_max);
724 f->dump_int("new_flags", new_flags);
725 f->dump_float("new_full_ratio", new_full_ratio);
726 f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
727 f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
728 f->dump_string("new_require_min_compat_client", new_require_min_compat_client);
729
730 if (fullmap.length()) {
731 f->open_object_section("full_map");
732 OSDMap full;
733 bufferlist fbl = fullmap; // kludge around constness.
734 auto p = fbl.begin();
735 full.decode(p);
736 full.dump(f);
737 f->close_section();
738 }
739 if (crush.length()) {
740 f->open_object_section("crush");
741 CrushWrapper c;
742 bufferlist tbl = crush; // kludge around constness.
743 auto p = tbl.begin();
744 c.decode(p);
745 c.dump(f);
746 f->close_section();
747 }
748
749 f->dump_int("new_max_osd", new_max_osd);
750
751 f->open_array_section("new_pools");
752
753 for (const auto &new_pool : new_pools) {
754 f->open_object_section("pool");
755 f->dump_int("pool", new_pool.first);
756 new_pool.second.dump(f);
757 f->close_section();
758 }
759 f->close_section();
760 f->open_array_section("new_pool_names");
761
762 for (const auto &new_pool_name : new_pool_names) {
763 f->open_object_section("pool_name");
764 f->dump_int("pool", new_pool_name.first);
765 f->dump_string("name", new_pool_name.second);
766 f->close_section();
767 }
768 f->close_section();
769 f->open_array_section("old_pools");
770
771 for (const auto &old_pool : old_pools)
772 f->dump_int("pool", old_pool);
773 f->close_section();
774
775 f->open_array_section("new_up_osds");
776
777 for (const auto &upclient : new_up_client) {
778 f->open_object_section("osd");
779 f->dump_int("osd", upclient.first);
780 f->dump_stream("public_addr") << upclient.second;
781 f->dump_stream("cluster_addr") << new_up_cluster.find(upclient.first)->second;
782 f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(upclient.first)->second;
783 map<int32_t, entity_addr_t>::const_iterator q;
784 if ((q = new_hb_front_up.find(upclient.first)) != new_hb_front_up.end())
785 f->dump_stream("heartbeat_front_addr") << q->second;
786 f->close_section();
787 }
788 f->close_section();
789
790 f->open_array_section("new_weight");
791
792 for (const auto &weight : new_weight) {
793 f->open_object_section("osd");
794 f->dump_int("osd", weight.first);
795 f->dump_int("weight", weight.second);
796 f->close_section();
797 }
798 f->close_section();
799
800 f->open_array_section("osd_state_xor");
801 for (const auto &ns : new_state) {
802 f->open_object_section("osd");
803 f->dump_int("osd", ns.first);
804 set<string> st;
805 calc_state_set(new_state.find(ns.first)->second, st);
806 f->open_array_section("state_xor");
807 for (auto &state : st)
808 f->dump_string("state", state);
809 f->close_section();
810 }
811 f->close_section();
812
813 f->open_array_section("new_pg_temp");
814
815 for (const auto &pg_temp : new_pg_temp) {
816 f->open_object_section("pg");
817 f->dump_stream("pgid") << pg_temp.first;
818 f->open_array_section("osds");
819
820 for (const auto &osd : pg_temp.second)
821 f->dump_int("osd", osd);
822 f->close_section();
823 f->close_section();
824 }
825 f->close_section();
826
827 f->open_array_section("primary_temp");
828
829 for (const auto &primary_temp : new_primary_temp) {
830 f->dump_stream("pgid") << primary_temp.first;
831 f->dump_int("osd", primary_temp.second);
832 }
833 f->close_section(); // primary_temp
834
835 f->open_array_section("new_pg_upmap");
836 for (auto& i : new_pg_upmap) {
837 f->open_object_section("mapping");
838 f->dump_stream("pgid") << i.first;
839 f->open_array_section("osds");
840 for (auto osd : i.second) {
841 f->dump_int("osd", osd);
842 }
843 f->close_section();
844 f->close_section();
845 }
846 f->close_section();
847 f->open_array_section("old_pg_upmap");
848 for (auto& i : old_pg_upmap) {
849 f->dump_stream("pgid") << i;
850 }
851 f->close_section();
852
853 f->open_array_section("new_pg_upmap_items");
854 for (auto& i : new_pg_upmap_items) {
855 f->open_object_section("mapping");
856 f->dump_stream("pgid") << i.first;
857 f->open_array_section("mappings");
858 for (auto& p : i.second) {
859 f->open_object_section("mapping");
860 f->dump_int("from", p.first);
861 f->dump_int("to", p.second);
862 f->close_section();
863 }
864 f->close_section();
865 f->close_section();
866 }
867 f->close_section();
868 f->open_array_section("old_pg_upmap_items");
869 for (auto& i : old_pg_upmap_items) {
870 f->dump_stream("pgid") << i;
871 }
872 f->close_section();
873
874 f->open_array_section("new_up_thru");
875
876 for (const auto &up_thru : new_up_thru) {
877 f->open_object_section("osd");
878 f->dump_int("osd", up_thru.first);
879 f->dump_int("up_thru", up_thru.second);
880 f->close_section();
881 }
882 f->close_section();
883
884 f->open_array_section("new_lost");
885
886 for (const auto &lost : new_lost) {
887 f->open_object_section("osd");
888 f->dump_int("osd", lost.first);
889 f->dump_int("epoch_lost", lost.second);
890 f->close_section();
891 }
892 f->close_section();
893
894 f->open_array_section("new_last_clean_interval");
895
896 for (const auto &last_clean_interval : new_last_clean_interval) {
897 f->open_object_section("osd");
898 f->dump_int("osd", last_clean_interval.first);
899 f->dump_int("first", last_clean_interval.second.first);
900 f->dump_int("last", last_clean_interval.second.second);
901 f->close_section();
902 }
903 f->close_section();
904
905 f->open_array_section("new_blacklist");
906 for (const auto &blist : new_blacklist) {
907 stringstream ss;
908 ss << blist.first;
909 f->dump_stream(ss.str().c_str()) << blist.second;
910 }
911 f->close_section();
912 f->open_array_section("old_blacklist");
913 for (const auto &blist : old_blacklist)
914 f->dump_stream("addr") << blist;
915 f->close_section();
916
917 f->open_array_section("new_xinfo");
918 for (const auto &xinfo : new_xinfo) {
919 f->open_object_section("xinfo");
920 f->dump_int("osd", xinfo.first);
921 xinfo.second.dump(f);
922 f->close_section();
923 }
924 f->close_section();
925
926 if (cluster_snapshot.size())
927 f->dump_string("cluster_snapshot", cluster_snapshot);
928
929 f->open_array_section("new_uuid");
930 for (const auto &uuid : new_uuid) {
931 f->open_object_section("osd");
932 f->dump_int("osd", uuid.first);
933 f->dump_stream("uuid") << uuid.second;
934 f->close_section();
935 }
936 f->close_section();
937
938 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
939 f->open_array_section("old_erasure_code_profiles");
940 for (const auto &erasure_code_profile : old_erasure_code_profiles) {
941 f->dump_string("old", erasure_code_profile.c_str());
942 }
943 f->close_section();
944 }
945
946 void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
947 {
948 o.push_back(new Incremental);
949 }
950
951 // ----------------------------------
952 // OSDMap
953
954 void OSDMap::set_epoch(epoch_t e)
955 {
956 epoch = e;
957 for (auto &pool : pools)
958 pool.second.last_change = e;
959 }
960
961 bool OSDMap::is_blacklisted(const entity_addr_t& a) const
962 {
963 if (blacklist.empty())
964 return false;
965
966 // this specific instance?
967 if (blacklist.count(a))
968 return true;
969
970 // is entire ip blacklisted?
971 if (a.is_ip()) {
972 entity_addr_t b = a;
973 b.set_port(0);
974 b.set_nonce(0);
975 if (blacklist.count(b)) {
976 return true;
977 }
978 }
979
980 return false;
981 }
982
983 void OSDMap::get_blacklist(list<pair<entity_addr_t,utime_t> > *bl) const
984 {
985 std::copy(blacklist.begin(), blacklist.end(), std::back_inserter(*bl));
986 }
987
988 void OSDMap::set_max_osd(int m)
989 {
990 int o = max_osd;
991 max_osd = m;
992 osd_state.resize(m);
993 osd_weight.resize(m);
994 for (; o<max_osd; o++) {
995 osd_state[o] = 0;
996 osd_weight[o] = CEPH_OSD_OUT;
997 }
998 osd_info.resize(m);
999 osd_xinfo.resize(m);
1000 osd_addrs->client_addr.resize(m);
1001 osd_addrs->cluster_addr.resize(m);
1002 osd_addrs->hb_back_addr.resize(m);
1003 osd_addrs->hb_front_addr.resize(m);
1004 osd_uuid->resize(m);
1005 if (osd_primary_affinity)
1006 osd_primary_affinity->resize(m, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1007
1008 calc_num_osds();
1009 }
1010
1011 int OSDMap::calc_num_osds()
1012 {
1013 num_osd = 0;
1014 num_up_osd = 0;
1015 num_in_osd = 0;
1016 for (int i=0; i<max_osd; i++) {
1017 if (osd_state[i] & CEPH_OSD_EXISTS) {
1018 ++num_osd;
1019 if (osd_state[i] & CEPH_OSD_UP) {
1020 ++num_up_osd;
1021 }
1022 if (get_weight(i) != CEPH_OSD_OUT) {
1023 ++num_in_osd;
1024 }
1025 }
1026 }
1027 return num_osd;
1028 }
1029
1030 void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
1031 {
1032 *full = 0;
1033 *backfill = 0;
1034 *nearfull = 0;
1035 for (int i = 0; i < max_osd; ++i) {
1036 if (exists(i) && is_up(i) && is_in(i)) {
1037 if (osd_state[i] & CEPH_OSD_FULL)
1038 ++(*full);
1039 else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
1040 ++(*backfill);
1041 else if (osd_state[i] & CEPH_OSD_NEARFULL)
1042 ++(*nearfull);
1043 }
1044 }
1045 }
1046
1047 static bool get_osd_utilization(const ceph::unordered_map<int32_t,osd_stat_t> &osd_stat,
1048 int id, int64_t* kb, int64_t* kb_used, int64_t* kb_avail) {
1049 auto p = osd_stat.find(id);
1050 if (p == osd_stat.end())
1051 return false;
1052 *kb = p->second.kb;
1053 *kb_used = p->second.kb_used;
1054 *kb_avail = p->second.kb_avail;
1055 return *kb > 0;
1056 }
1057
1058 void OSDMap::get_full_osd_util(const ceph::unordered_map<int32_t,osd_stat_t> &osd_stat,
1059 map<int, float> *full, map<int, float> *backfill, map<int, float> *nearfull) const
1060 {
1061 full->clear();
1062 backfill->clear();
1063 nearfull->clear();
1064 for (int i = 0; i < max_osd; ++i) {
1065 if (exists(i) && is_up(i) && is_in(i)) {
1066 int64_t kb, kb_used, kb_avail;
1067 if (osd_state[i] & CEPH_OSD_FULL) {
1068 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1069 full->emplace(i, (float)kb_used / (float)kb);
1070 } else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) {
1071 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1072 backfill->emplace(i, (float)kb_used / (float)kb);
1073 } else if (osd_state[i] & CEPH_OSD_NEARFULL) {
1074 if (get_osd_utilization(osd_stat, i, &kb, &kb_used, &kb_avail))
1075 nearfull->emplace(i, (float)kb_used / (float)kb);
1076 }
1077 }
1078 }
1079 }
1080
1081 void OSDMap::get_all_osds(set<int32_t>& ls) const
1082 {
1083 for (int i=0; i<max_osd; i++)
1084 if (exists(i))
1085 ls.insert(i);
1086 }
1087
1088 void OSDMap::get_up_osds(set<int32_t>& ls) const
1089 {
1090 for (int i = 0; i < max_osd; i++) {
1091 if (is_up(i))
1092 ls.insert(i);
1093 }
1094 }
1095
1096 void OSDMap::calc_state_set(int state, set<string>& st)
1097 {
1098 unsigned t = state;
1099 for (unsigned s = 1; t; s <<= 1) {
1100 if (t & s) {
1101 t &= ~s;
1102 st.insert(ceph_osd_state_name(s));
1103 }
1104 }
1105 }
1106
1107 void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
1108 {
1109 float max = 0;
1110 for (const auto &weight : weights) {
1111 if (weight.second > max)
1112 max = weight.second;
1113 }
1114
1115 for (const auto &weight : weights) {
1116 inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
1117 }
1118 }
1119
1120 int OSDMap::identify_osd(const entity_addr_t& addr) const
1121 {
1122 for (int i=0; i<max_osd; i++)
1123 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr))
1124 return i;
1125 return -1;
1126 }
1127
1128 int OSDMap::identify_osd(const uuid_d& u) const
1129 {
1130 for (int i=0; i<max_osd; i++)
1131 if (exists(i) && get_uuid(i) == u)
1132 return i;
1133 return -1;
1134 }
1135
1136 int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
1137 {
1138 for (int i=0; i<max_osd; i++)
1139 if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr ||
1140 get_hb_back_addr(i) == addr || get_hb_front_addr(i) == addr))
1141 return i;
1142 return -1;
1143 }
1144
1145 int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
1146 {
1147 for (int i=0; i<max_osd; i++)
1148 if (exists(i) && (get_addr(i).is_same_host(ip) || get_cluster_addr(i).is_same_host(ip)))
1149 return i;
1150 return -1;
1151 }
1152
1153
1154 uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
1155 {
1156 uint64_t features = 0; // things we actually have
1157 uint64_t mask = 0; // things we could have
1158
1159 if (crush->has_nondefault_tunables())
1160 features |= CEPH_FEATURE_CRUSH_TUNABLES;
1161 if (crush->has_nondefault_tunables2())
1162 features |= CEPH_FEATURE_CRUSH_TUNABLES2;
1163 if (crush->has_nondefault_tunables3())
1164 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1165 if (crush->has_v4_buckets())
1166 features |= CEPH_FEATURE_CRUSH_V4;
1167 if (crush->has_nondefault_tunables5())
1168 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1169 if (crush->has_incompat_chooseargs())
1170 features |= CEPH_FEATURE_CRUSH_CHOOSEARGS;
1171 mask |= CEPH_FEATURES_CRUSH;
1172
1173 if (!pg_upmap.empty() || !pg_upmap_items.empty())
1174 features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1175 mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
1176
1177 for (auto &pool: pools) {
1178 if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
1179 features |= CEPH_FEATURE_OSDHASHPSPOOL;
1180 }
1181 if (pool.second.is_erasure() &&
1182 entity_type != CEPH_ENTITY_TYPE_CLIENT) { // not for clients
1183 features |= CEPH_FEATURE_OSD_ERASURE_CODES;
1184 }
1185 if (!pool.second.tiers.empty() ||
1186 pool.second.is_tier()) {
1187 features |= CEPH_FEATURE_OSD_CACHEPOOL;
1188 }
1189 int ruleid = crush->find_rule(pool.second.get_crush_ruleset(),
1190 pool.second.get_type(),
1191 pool.second.get_size());
1192 if (ruleid >= 0) {
1193 if (crush->is_v2_rule(ruleid))
1194 features |= CEPH_FEATURE_CRUSH_V2;
1195 if (crush->is_v3_rule(ruleid))
1196 features |= CEPH_FEATURE_CRUSH_TUNABLES3;
1197 if (crush->is_v5_rule(ruleid))
1198 features |= CEPH_FEATURE_CRUSH_TUNABLES5;
1199 }
1200 }
1201 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1202 for (auto &erasure_code_profile : erasure_code_profiles) {
1203 auto& profile = erasure_code_profile.second;
1204 const auto& plugin = profile.find("plugin");
1205 if (plugin != profile.end()) {
1206 if (plugin->second == "isa" || plugin->second == "lrc")
1207 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
1208 if (plugin->second == "shec")
1209 features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
1210 }
1211 }
1212 }
1213 mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
1214 if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
1215 mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
1216
1217 if (osd_primary_affinity) {
1218 for (int i = 0; i < max_osd; ++i) {
1219 if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1220 features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1221 break;
1222 }
1223 }
1224 }
1225 mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
1226
1227 if (entity_type == CEPH_ENTITY_TYPE_OSD) {
1228 const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
1229 if (test_flag(CEPH_OSDMAP_REQUIRE_JEWEL)) {
1230 features |= jewel_features;
1231 }
1232 mask |= jewel_features;
1233
1234 const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
1235 | CEPH_FEATURE_MSG_ADDR2;
1236 if (test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN)) {
1237 features |= kraken_features;
1238 }
1239 mask |= kraken_features;
1240 }
1241
1242 if (pmask)
1243 *pmask = mask;
1244 return features;
1245 }
1246
1247 pair<string,string> OSDMap::get_min_compat_client() const
1248 {
1249 uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
1250
1251 if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
1252 HAVE_FEATURE(f, CRUSH_CHOOSEARGS)) { // v12.0.1-2172-gef1ef28
1253 return make_pair("luminous", "12.2.0");
1254 }
1255 if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
1256 return make_pair("jewel", "10.2.0");
1257 }
1258 if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
1259 return make_pair("hammer", "0.94");
1260 }
1261 if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
1262 HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
1263 HAVE_FEATURE(f, OSD_ERASURE_CODES) || // v0.73-498-gbfc86a8
1264 HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
1265 return make_pair("firefly", "0.80");
1266 }
1267 if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
1268 HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
1269 return make_pair("dumpling", "0.67");
1270 }
1271 if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
1272 return make_pair("argonaut", "0.48argonaut-207");
1273 }
1274 return make_pair("argonaut", "0.48");
1275 }
1276
1277 void OSDMap::_calc_up_osd_features()
1278 {
1279 bool first = true;
1280 cached_up_osd_features = 0;
1281 for (int osd = 0; osd < max_osd; ++osd) {
1282 if (!is_up(osd))
1283 continue;
1284 const osd_xinfo_t &xi = get_xinfo(osd);
1285 if (first) {
1286 cached_up_osd_features = xi.features;
1287 first = false;
1288 } else {
1289 cached_up_osd_features &= xi.features;
1290 }
1291 }
1292 }
1293
1294 uint64_t OSDMap::get_up_osd_features() const
1295 {
1296 return cached_up_osd_features;
1297 }
1298
1299 void OSDMap::dedup(const OSDMap *o, OSDMap *n)
1300 {
1301 if (o->epoch == n->epoch)
1302 return;
1303
1304 int diff = 0;
1305
1306 // do addrs match?
1307 if (o->max_osd != n->max_osd)
1308 diff++;
1309 for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
1310 if ( n->osd_addrs->client_addr[i] && o->osd_addrs->client_addr[i] &&
1311 *n->osd_addrs->client_addr[i] == *o->osd_addrs->client_addr[i])
1312 n->osd_addrs->client_addr[i] = o->osd_addrs->client_addr[i];
1313 else
1314 diff++;
1315 if ( n->osd_addrs->cluster_addr[i] && o->osd_addrs->cluster_addr[i] &&
1316 *n->osd_addrs->cluster_addr[i] == *o->osd_addrs->cluster_addr[i])
1317 n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
1318 else
1319 diff++;
1320 if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
1321 *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
1322 n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
1323 else
1324 diff++;
1325 if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
1326 *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
1327 n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
1328 else
1329 diff++;
1330 }
1331 if (diff == 0) {
1332 // zoinks, no differences at all!
1333 n->osd_addrs = o->osd_addrs;
1334 }
1335
1336 // does crush match?
1337 bufferlist oc, nc;
1338 ::encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1339 ::encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
1340 if (oc.contents_equal(nc)) {
1341 n->crush = o->crush;
1342 }
1343
1344 // does pg_temp match?
1345 if (o->pg_temp->size() == n->pg_temp->size()) {
1346 if (*o->pg_temp == *n->pg_temp)
1347 n->pg_temp = o->pg_temp;
1348 }
1349
1350 // does primary_temp match?
1351 if (o->primary_temp->size() == n->primary_temp->size()) {
1352 if (*o->primary_temp == *n->primary_temp)
1353 n->primary_temp = o->primary_temp;
1354 }
1355
1356 // do uuids match?
1357 if (o->osd_uuid->size() == n->osd_uuid->size() &&
1358 *o->osd_uuid == *n->osd_uuid)
1359 n->osd_uuid = o->osd_uuid;
1360 }
1361
1362 void OSDMap::clean_temps(CephContext *cct,
1363 const OSDMap& osdmap, Incremental *pending_inc)
1364 {
1365 ldout(cct, 10) << __func__ << dendl;
1366 OSDMap tmpmap;
1367 tmpmap.deepish_copy_from(osdmap);
1368 tmpmap.apply_incremental(*pending_inc);
1369
1370 for (auto pg : *tmpmap.pg_temp) {
1371 // if pool does not exist, remove any existing pg_temps associated with
1372 // it. we don't care about pg_temps on the pending_inc either; if there
1373 // are new_pg_temp entries on the pending, clear them out just as well.
1374 if (!osdmap.have_pg_pool(pg.first.pool())) {
1375 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1376 << " for nonexistent pool " << pg.first.pool() << dendl;
1377 pending_inc->new_pg_temp[pg.first].clear();
1378 continue;
1379 }
1380 // all osds down?
1381 unsigned num_up = 0;
1382 for (auto o : pg.second) {
1383 if (!tmpmap.is_down(o)) {
1384 ++num_up;
1385 break;
1386 }
1387 }
1388 if (num_up == 0) {
1389 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
1390 << " with all down osds" << pg.second << dendl;
1391 pending_inc->new_pg_temp[pg.first].clear();
1392 continue;
1393 }
1394 // redundant pg_temp?
1395 vector<int> raw_up;
1396 int primary;
1397 tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
1398 if (vectors_equal(raw_up, pg.second)) {
1399 ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
1400 << pg.second << " that matches raw_up mapping" << dendl;
1401 if (osdmap.pg_temp->count(pg.first))
1402 pending_inc->new_pg_temp[pg.first].clear();
1403 else
1404 pending_inc->new_pg_temp.erase(pg.first);
1405 }
1406 }
1407
1408 for (auto &pg : *tmpmap.primary_temp) {
1409 // primary down?
1410 if (tmpmap.is_down(pg.second)) {
1411 ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
1412 << " to down " << pg.second << dendl;
1413 pending_inc->new_primary_temp[pg.first] = -1;
1414 continue;
1415 }
1416 // redundant primary_temp?
1417 vector<int> real_up, templess_up;
1418 int real_primary, templess_primary;
1419 pg_t pgid = pg.first;
1420 tmpmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
1421 tmpmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
1422 if (real_primary == templess_primary){
1423 ldout(cct, 10) << __func__ << " removing primary_temp "
1424 << pgid << " -> " << real_primary
1425 << " (unnecessary/redundant)" << dendl;
1426 if (osdmap.primary_temp->count(pgid))
1427 pending_inc->new_primary_temp[pgid] = -1;
1428 else
1429 pending_inc->new_primary_temp.erase(pgid);
1430 }
1431 }
1432 }
1433
1434 int OSDMap::apply_incremental(const Incremental &inc)
1435 {
1436 new_blacklist_entries = false;
1437 if (inc.epoch == 1)
1438 fsid = inc.fsid;
1439 else if (inc.fsid != fsid)
1440 return -EINVAL;
1441
1442 assert(inc.epoch == epoch+1);
1443
1444 epoch++;
1445 modified = inc.modified;
1446
1447 // full map?
1448 if (inc.fullmap.length()) {
1449 bufferlist bl(inc.fullmap);
1450 decode(bl);
1451 return 0;
1452 }
1453
1454 // nope, incremental.
1455 if (inc.new_flags >= 0)
1456 flags = inc.new_flags;
1457
1458 if (inc.new_max_osd >= 0)
1459 set_max_osd(inc.new_max_osd);
1460
1461 if (inc.new_pool_max != -1)
1462 pool_max = inc.new_pool_max;
1463
1464 for (const auto &pool : inc.new_pools) {
1465 pools[pool.first] = pool.second;
1466 pools[pool.first].last_change = epoch;
1467 }
1468
1469 for (const auto &pname : inc.new_pool_names) {
1470 auto pool_name_entry = pool_name.find(pname.first);
1471 if (pool_name_entry != pool_name.end()) {
1472 name_pool.erase(pool_name_entry->second);
1473 pool_name_entry->second = pname.second;
1474 } else {
1475 pool_name[pname.first] = pname.second;
1476 }
1477 name_pool[pname.second] = pname.first;
1478 }
1479
1480 for (const auto &pool : inc.old_pools) {
1481 pools.erase(pool);
1482 name_pool.erase(pool_name[pool]);
1483 pool_name.erase(pool);
1484 }
1485
1486 for (const auto &weight : inc.new_weight) {
1487 set_weight(weight.first, weight.second);
1488
1489 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1490 // xinfo old_weight.
1491 if (weight.second) {
1492 osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
1493 osd_xinfo[weight.first].old_weight = 0;
1494 }
1495 }
1496
1497 for (const auto &primary_affinity : inc.new_primary_affinity) {
1498 set_primary_affinity(primary_affinity.first, primary_affinity.second);
1499 }
1500
1501 // erasure_code_profiles
1502 for (const auto &profile : inc.old_erasure_code_profiles)
1503 erasure_code_profiles.erase(profile);
1504
1505 for (const auto &profile : inc.new_erasure_code_profiles) {
1506 set_erasure_code_profile(profile.first, profile.second);
1507 }
1508
1509 // up/down
1510 for (const auto &state : inc.new_state) {
1511 const auto osd = state.first;
1512 int s = state.second ? state.second : CEPH_OSD_UP;
1513 if ((osd_state[osd] & CEPH_OSD_UP) &&
1514 (s & CEPH_OSD_UP)) {
1515 osd_info[osd].down_at = epoch;
1516 osd_xinfo[osd].down_stamp = modified;
1517 }
1518 if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
1519 (s & CEPH_OSD_EXISTS)) {
1520 // osd is destroyed; clear out anything interesting.
1521 (*osd_uuid)[osd] = uuid_d();
1522 osd_info[osd] = osd_info_t();
1523 osd_xinfo[osd] = osd_xinfo_t();
1524 set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1525 osd_addrs->client_addr[osd].reset(new entity_addr_t());
1526 osd_addrs->cluster_addr[osd].reset(new entity_addr_t());
1527 osd_addrs->hb_front_addr[osd].reset(new entity_addr_t());
1528 osd_addrs->hb_back_addr[osd].reset(new entity_addr_t());
1529 osd_state[osd] = 0;
1530 } else {
1531 osd_state[osd] ^= s;
1532 }
1533 }
1534
1535 for (const auto &client : inc.new_up_client) {
1536 osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1537 osd_addrs->client_addr[client.first].reset(new entity_addr_t(client.second));
1538 if (inc.new_hb_back_up.empty())
1539 osd_addrs->hb_back_addr[client.first].reset(new entity_addr_t(client.second)); //this is a backward-compatibility hack
1540 else
1541 osd_addrs->hb_back_addr[client.first].reset(
1542 new entity_addr_t(inc.new_hb_back_up.find(client.first)->second));
1543 const auto j = inc.new_hb_front_up.find(client.first);
1544 if (j != inc.new_hb_front_up.end())
1545 osd_addrs->hb_front_addr[client.first].reset(new entity_addr_t(j->second));
1546 else
1547 osd_addrs->hb_front_addr[client.first].reset();
1548
1549 osd_info[client.first].up_from = epoch;
1550 }
1551
1552 for (const auto &cluster : inc.new_up_cluster)
1553 osd_addrs->cluster_addr[cluster.first].reset(new entity_addr_t(cluster.second));
1554
1555 // info
1556 for (const auto &thru : inc.new_up_thru)
1557 osd_info[thru.first].up_thru = thru.second;
1558
1559 for (const auto &interval : inc.new_last_clean_interval) {
1560 osd_info[interval.first].last_clean_begin = interval.second.first;
1561 osd_info[interval.first].last_clean_end = interval.second.second;
1562 }
1563
1564 for (const auto &lost : inc.new_lost)
1565 osd_info[lost.first].lost_at = lost.second;
1566
1567 // xinfo
1568 for (const auto &xinfo : inc.new_xinfo)
1569 osd_xinfo[xinfo.first] = xinfo.second;
1570
1571 // uuid
1572 for (const auto &uuid : inc.new_uuid)
1573 (*osd_uuid)[uuid.first] = uuid.second;
1574
1575 // pg rebuild
1576 for (const auto &pg : inc.new_pg_temp) {
1577 if (pg.second.empty())
1578 pg_temp->erase(pg.first);
1579 else
1580 (*pg_temp)[pg.first] = pg.second;
1581 }
1582
1583 for (const auto &pg : inc.new_primary_temp) {
1584 if (pg.second == -1)
1585 primary_temp->erase(pg.first);
1586 else
1587 (*primary_temp)[pg.first] = pg.second;
1588 }
1589
1590 for (auto& p : inc.new_pg_upmap) {
1591 pg_upmap[p.first] = p.second;
1592 }
1593 for (auto& pg : inc.old_pg_upmap) {
1594 pg_upmap.erase(pg);
1595 }
1596 for (auto& p : inc.new_pg_upmap_items) {
1597 pg_upmap_items[p.first] = p.second;
1598 }
1599 for (auto& pg : inc.old_pg_upmap_items) {
1600 pg_upmap_items.erase(pg);
1601 }
1602
1603 // blacklist
1604 if (!inc.new_blacklist.empty()) {
1605 blacklist.insert(inc.new_blacklist.begin(),inc.new_blacklist.end());
1606 new_blacklist_entries = true;
1607 }
1608 for (const auto &addr : inc.old_blacklist)
1609 blacklist.erase(addr);
1610
1611 // cluster snapshot?
1612 if (inc.cluster_snapshot.length()) {
1613 cluster_snapshot = inc.cluster_snapshot;
1614 cluster_snapshot_epoch = inc.epoch;
1615 } else {
1616 cluster_snapshot.clear();
1617 cluster_snapshot_epoch = 0;
1618 }
1619
1620 if (inc.new_nearfull_ratio >= 0) {
1621 nearfull_ratio = inc.new_nearfull_ratio;
1622 }
1623 if (inc.new_backfillfull_ratio >= 0) {
1624 backfillfull_ratio = inc.new_backfillfull_ratio;
1625 }
1626 if (inc.new_full_ratio >= 0) {
1627 full_ratio = inc.new_full_ratio;
1628 }
1629 if (inc.new_require_min_compat_client.length()) {
1630 require_min_compat_client = inc.new_require_min_compat_client;
1631 }
1632
1633 // do new crush map last (after up/down stuff)
1634 if (inc.crush.length()) {
1635 bufferlist bl(inc.crush);
1636 auto blp = bl.begin();
1637 crush.reset(new CrushWrapper);
1638 crush->decode(blp);
1639 }
1640
1641 calc_num_osds();
1642 _calc_up_osd_features();
1643 return 0;
1644 }
1645
1646 // mapping
1647 int OSDMap::map_to_pg(
1648 int64_t poolid,
1649 const string& name,
1650 const string& key,
1651 const string& nspace,
1652 pg_t *pg) const
1653 {
1654 // calculate ps (placement seed)
1655 const pg_pool_t *pool = get_pg_pool(poolid);
1656 if (!pool)
1657 return -ENOENT;
1658 ps_t ps;
1659 if (!key.empty())
1660 ps = pool->hash_key(key, nspace);
1661 else
1662 ps = pool->hash_key(name, nspace);
1663 *pg = pg_t(ps, poolid);
1664 return 0;
1665 }
1666
1667 int OSDMap::object_locator_to_pg(
1668 const object_t& oid, const object_locator_t& loc, pg_t &pg) const
1669 {
1670 if (loc.hash >= 0) {
1671 if (!get_pg_pool(loc.get_pool())) {
1672 return -ENOENT;
1673 }
1674 pg = pg_t(loc.hash, loc.get_pool());
1675 return 0;
1676 }
1677 return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
1678 }
1679
1680 ceph_object_layout OSDMap::make_object_layout(
1681 object_t oid, int pg_pool, string nspace) const
1682 {
1683 object_locator_t loc(pg_pool, nspace);
1684
1685 ceph_object_layout ol;
1686 pg_t pgid = object_locator_to_pg(oid, loc);
1687 ol.ol_pgid = pgid.get_old_pg().v;
1688 ol.ol_stripe_unit = 0;
1689 return ol;
1690 }
1691
1692 void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
1693 vector<int>& osds) const
1694 {
1695 if (pool.can_shift_osds()) {
1696 unsigned removed = 0;
1697 for (unsigned i = 0; i < osds.size(); i++) {
1698 if (!exists(osds[i])) {
1699 removed++;
1700 continue;
1701 }
1702 if (removed) {
1703 osds[i - removed] = osds[i];
1704 }
1705 }
1706 if (removed)
1707 osds.resize(osds.size() - removed);
1708 } else {
1709 for (auto& osd : osds) {
1710 if (!exists(osd))
1711 osd = CRUSH_ITEM_NONE;
1712 }
1713 }
1714 }
1715
1716 int OSDMap::_pg_to_raw_osds(
1717 const pg_pool_t& pool, pg_t pg,
1718 vector<int> *osds,
1719 ps_t *ppps) const
1720 {
1721 // map to osds[]
1722 ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
1723 unsigned size = pool.get_size();
1724
1725 // what crush rule?
1726 int ruleno = crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), size);
1727 if (ruleno >= 0)
1728 crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
1729
1730 _remove_nonexistent_osds(pool, *osds);
1731
1732 if (ppps)
1733 *ppps = pps;
1734
1735 return osds->size();
1736 }
1737
1738 int OSDMap::_pick_primary(const vector<int>& osds) const
1739 {
1740 for (auto osd : osds) {
1741 if (osd != CRUSH_ITEM_NONE) {
1742 return osd;
1743 }
1744 }
1745 return -1;
1746 }
1747
1748 void OSDMap::_apply_remap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
1749 {
1750 pg_t pg = pi.raw_pg_to_pg(raw_pg);
1751 auto p = pg_upmap.find(pg);
1752 if (p != pg_upmap.end()) {
1753 // make sure targets aren't marked out
1754 for (auto osd : p->second) {
1755 if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd_weight[osd] == 0) {
1756 // reject/ignore the explicit mapping
1757 return;
1758 }
1759 }
1760 *raw = vector<int>(p->second.begin(), p->second.end());
1761 return;
1762 }
1763
1764 auto q = pg_upmap_items.find(pg);
1765 if (q != pg_upmap_items.end()) {
1766 // NOTE: this approach does not allow a bidirectional swap,
1767 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
1768 for (auto& r : q->second) {
1769 // make sure the replacement value doesn't already appear
1770 bool exists = false;
1771 ssize_t pos = -1;
1772 for (unsigned i = 0; i < raw->size(); ++i) {
1773 int osd = (*raw)[i];
1774 if (osd == r.second) {
1775 exists = true;
1776 break;
1777 }
1778 // ignore mapping if target is marked out (or invalid osd id)
1779 if (osd == r.first &&
1780 pos < 0 &&
1781 !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
1782 osd_weight[r.second] == 0)) {
1783 pos = i;
1784 }
1785 }
1786 if (!exists && pos >= 0) {
1787 (*raw)[pos] = r.second;
1788 return;
1789 }
1790 }
1791 }
1792 }
1793
1794 // pg -> (up osd list)
1795 void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
1796 vector<int> *up) const
1797 {
1798 if (pool.can_shift_osds()) {
1799 // shift left
1800 up->clear();
1801 up->reserve(raw.size());
1802 for (unsigned i=0; i<raw.size(); i++) {
1803 if (!exists(raw[i]) || is_down(raw[i]))
1804 continue;
1805 up->push_back(raw[i]);
1806 }
1807 } else {
1808 // set down/dne devices to NONE
1809 up->resize(raw.size());
1810 for (int i = raw.size() - 1; i >= 0; --i) {
1811 if (!exists(raw[i]) || is_down(raw[i])) {
1812 (*up)[i] = CRUSH_ITEM_NONE;
1813 } else {
1814 (*up)[i] = raw[i];
1815 }
1816 }
1817 }
1818 }
1819
1820 void OSDMap::_apply_primary_affinity(ps_t seed,
1821 const pg_pool_t& pool,
1822 vector<int> *osds,
1823 int *primary) const
1824 {
1825 // do we have any non-default primary_affinity values for these osds?
1826 if (!osd_primary_affinity)
1827 return;
1828
1829 bool any = false;
1830 for (const auto osd : *osds) {
1831 if (osd != CRUSH_ITEM_NONE &&
1832 (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1833 any = true;
1834 break;
1835 }
1836 }
1837 if (!any)
1838 return;
1839
1840 // pick the primary. feed both the seed (for the pg) and the osd
1841 // into the hash/rng so that a proportional fraction of an osd's pgs
1842 // get rejected as primary.
1843 int pos = -1;
1844 for (unsigned i = 0; i < osds->size(); ++i) {
1845 int o = (*osds)[i];
1846 if (o == CRUSH_ITEM_NONE)
1847 continue;
1848 unsigned a = (*osd_primary_affinity)[o];
1849 if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
1850 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
1851 seed, o) >> 16) >= a) {
1852 // we chose not to use this primary. note it anyway as a
1853 // fallback in case we don't pick anyone else, but keep looking.
1854 if (pos < 0)
1855 pos = i;
1856 } else {
1857 pos = i;
1858 break;
1859 }
1860 }
1861 if (pos < 0)
1862 return;
1863
1864 *primary = (*osds)[pos];
1865
1866 if (pool.can_shift_osds() && pos > 0) {
1867 // move the new primary to the front.
1868 for (int i = pos; i > 0; --i) {
1869 (*osds)[i] = (*osds)[i-1];
1870 }
1871 (*osds)[0] = *primary;
1872 }
1873 }
1874
1875 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
1876 vector<int> *temp_pg, int *temp_primary) const
1877 {
1878 pg = pool.raw_pg_to_pg(pg);
1879 const auto p = pg_temp->find(pg);
1880 temp_pg->clear();
1881 if (p != pg_temp->end()) {
1882 for (unsigned i=0; i<p->second.size(); i++) {
1883 if (!exists(p->second[i]) || is_down(p->second[i])) {
1884 if (pool.can_shift_osds()) {
1885 continue;
1886 } else {
1887 temp_pg->push_back(CRUSH_ITEM_NONE);
1888 }
1889 } else {
1890 temp_pg->push_back(p->second[i]);
1891 }
1892 }
1893 }
1894 const auto &pp = primary_temp->find(pg);
1895 *temp_primary = -1;
1896 if (pp != primary_temp->end()) {
1897 *temp_primary = pp->second;
1898 } else if (!temp_pg->empty()) { // apply pg_temp's primary
1899 for (unsigned i = 0; i < temp_pg->size(); ++i) {
1900 if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
1901 *temp_primary = (*temp_pg)[i];
1902 break;
1903 }
1904 }
1905 }
1906 }
1907
1908 int OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
1909 {
1910 *primary = -1;
1911 raw->clear();
1912 const pg_pool_t *pool = get_pg_pool(pg.pool());
1913 if (!pool)
1914 return 0;
1915 int r = _pg_to_raw_osds(*pool, pg, raw, NULL);
1916 if (primary)
1917 *primary = _pick_primary(*raw);
1918 return r;
1919 }
1920
1921 void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
1922 {
1923 const pg_pool_t *pool = get_pg_pool(pg.pool());
1924 if (!pool) {
1925 if (primary)
1926 *primary = -1;
1927 if (up)
1928 up->clear();
1929 return;
1930 }
1931 vector<int> raw;
1932 ps_t pps;
1933 _pg_to_raw_osds(*pool, pg, &raw, &pps);
1934 _apply_remap(*pool, pg, &raw);
1935 _raw_to_up_osds(*pool, raw, up);
1936 *primary = _pick_primary(raw);
1937 _apply_primary_affinity(pps, *pool, up, primary);
1938 }
1939
1940 void OSDMap::_pg_to_up_acting_osds(
1941 const pg_t& pg, vector<int> *up, int *up_primary,
1942 vector<int> *acting, int *acting_primary,
1943 bool raw_pg_to_pg) const
1944 {
1945 const pg_pool_t *pool = get_pg_pool(pg.pool());
1946 if (!pool ||
1947 (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
1948 if (up)
1949 up->clear();
1950 if (up_primary)
1951 *up_primary = -1;
1952 if (acting)
1953 acting->clear();
1954 if (acting_primary)
1955 *acting_primary = -1;
1956 return;
1957 }
1958 vector<int> raw;
1959 vector<int> _up;
1960 vector<int> _acting;
1961 int _up_primary;
1962 int _acting_primary;
1963 ps_t pps;
1964 _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
1965 if (_acting.empty() || up || up_primary) {
1966 _pg_to_raw_osds(*pool, pg, &raw, &pps);
1967 _apply_remap(*pool, pg, &raw);
1968 _raw_to_up_osds(*pool, raw, &_up);
1969 _up_primary = _pick_primary(_up);
1970 _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
1971 if (_acting.empty()) {
1972 _acting = _up;
1973 if (_acting_primary == -1) {
1974 _acting_primary = _up_primary;
1975 }
1976 }
1977
1978 if (up)
1979 up->swap(_up);
1980 if (up_primary)
1981 *up_primary = _up_primary;
1982 }
1983
1984 if (acting)
1985 acting->swap(_acting);
1986 if (acting_primary)
1987 *acting_primary = _acting_primary;
1988 }
1989
1990 int OSDMap::calc_pg_rank(int osd, const vector<int>& acting, int nrep)
1991 {
1992 if (!nrep)
1993 nrep = acting.size();
1994 for (int i=0; i<nrep; i++)
1995 if (acting[i] == osd)
1996 return i;
1997 return -1;
1998 }
1999
2000 int OSDMap::calc_pg_role(int osd, const vector<int>& acting, int nrep)
2001 {
2002 return calc_pg_rank(osd, acting, nrep);
2003 }
2004
2005 bool OSDMap::primary_changed(
2006 int oldprimary,
2007 const vector<int> &oldacting,
2008 int newprimary,
2009 const vector<int> &newacting)
2010 {
2011 if (oldacting.empty() && newacting.empty())
2012 return false; // both still empty
2013 if (oldacting.empty() ^ newacting.empty())
2014 return true; // was empty, now not, or vice versa
2015 if (oldprimary != newprimary)
2016 return true; // primary changed
2017 if (calc_pg_rank(oldprimary, oldacting) !=
2018 calc_pg_rank(newprimary, newacting))
2019 return true;
2020 return false; // same primary (tho replicas may have changed)
2021 }
2022
2023
2024 // serialize, unserialize
2025 void OSDMap::encode_client_old(bufferlist& bl) const
2026 {
2027 __u16 v = 5;
2028 ::encode(v, bl);
2029
2030 // base
2031 ::encode(fsid, bl);
2032 ::encode(epoch, bl);
2033 ::encode(created, bl);
2034 ::encode(modified, bl);
2035
2036 // for ::encode(pools, bl);
2037 __u32 n = pools.size();
2038 ::encode(n, bl);
2039
2040 for (const auto &pool : pools) {
2041 n = pool.first;
2042 ::encode(n, bl);
2043 ::encode(pool.second, bl, 0);
2044 }
2045 // for ::encode(pool_name, bl);
2046 n = pool_name.size();
2047 ::encode(n, bl);
2048 for (const auto &pname : pool_name) {
2049 n = pname.first;
2050 ::encode(n, bl);
2051 ::encode(pname.second, bl);
2052 }
2053 // for ::encode(pool_max, bl);
2054 n = pool_max;
2055 ::encode(n, bl);
2056
2057 ::encode(flags, bl);
2058
2059 ::encode(max_osd, bl);
2060 ::encode(osd_state, bl);
2061 ::encode(osd_weight, bl);
2062 ::encode(osd_addrs->client_addr, bl, 0);
2063
2064 // for ::encode(pg_temp, bl);
2065 n = pg_temp->size();
2066 ::encode(n, bl);
2067 for (const auto pg : *pg_temp) {
2068 old_pg_t opg = pg.first.get_old_pg();
2069 ::encode(opg, bl);
2070 ::encode(pg.second, bl);
2071 }
2072
2073 // crush
2074 bufferlist cbl;
2075 crush->encode(cbl, 0 /* legacy (no) features */);
2076 ::encode(cbl, bl);
2077 }
2078
2079 void OSDMap::encode_classic(bufferlist& bl, uint64_t features) const
2080 {
2081 if ((features & CEPH_FEATURE_PGID64) == 0) {
2082 encode_client_old(bl);
2083 return;
2084 }
2085
2086 __u16 v = 6;
2087 ::encode(v, bl);
2088
2089 // base
2090 ::encode(fsid, bl);
2091 ::encode(epoch, bl);
2092 ::encode(created, bl);
2093 ::encode(modified, bl);
2094
2095 ::encode(pools, bl, features);
2096 ::encode(pool_name, bl);
2097 ::encode(pool_max, bl);
2098
2099 ::encode(flags, bl);
2100
2101 ::encode(max_osd, bl);
2102 ::encode(osd_state, bl);
2103 ::encode(osd_weight, bl);
2104 ::encode(osd_addrs->client_addr, bl, features);
2105
2106 ::encode(*pg_temp, bl);
2107
2108 // crush
2109 bufferlist cbl;
2110 crush->encode(cbl, 0 /* legacy (no) features */);
2111 ::encode(cbl, bl);
2112
2113 // extended
2114 __u16 ev = 10;
2115 ::encode(ev, bl);
2116 ::encode(osd_addrs->hb_back_addr, bl, features);
2117 ::encode(osd_info, bl);
2118 ::encode(blacklist, bl, features);
2119 ::encode(osd_addrs->cluster_addr, bl, features);
2120 ::encode(cluster_snapshot_epoch, bl);
2121 ::encode(cluster_snapshot, bl);
2122 ::encode(*osd_uuid, bl);
2123 ::encode(osd_xinfo, bl);
2124 ::encode(osd_addrs->hb_front_addr, bl, features);
2125 }
2126
2127 void OSDMap::encode(bufferlist& bl, uint64_t features) const
2128 {
2129 if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
2130 encode_classic(bl, features);
2131 return;
2132 }
2133
2134 // only a select set of callers should *ever* be encoding new
2135 // OSDMaps. others should be passing around the canonical encoded
2136 // buffers from on high. select out those callers by passing in an
2137 // "impossible" feature bit.
2138 assert(features & CEPH_FEATURE_RESERVED);
2139 features &= ~CEPH_FEATURE_RESERVED;
2140
2141 size_t start_offset = bl.length();
2142 size_t tail_offset;
2143 buffer::list::iterator crc_it;
2144
2145 // meta-encoding: how we include client-used and osd-specific data
2146 ENCODE_START(8, 7, bl);
2147
2148 {
2149 uint8_t v = 4;
2150 if (!HAVE_FEATURE(features, OSDMAP_PG_UPMAP)) {
2151 v = 3;
2152 }
2153 ENCODE_START(v, 1, bl); // client-usable data
2154 // base
2155 ::encode(fsid, bl);
2156 ::encode(epoch, bl);
2157 ::encode(created, bl);
2158 ::encode(modified, bl);
2159
2160 ::encode(pools, bl, features);
2161 ::encode(pool_name, bl);
2162 ::encode(pool_max, bl);
2163
2164 ::encode(flags, bl);
2165
2166 ::encode(max_osd, bl);
2167 ::encode(osd_state, bl);
2168 ::encode(osd_weight, bl);
2169 ::encode(osd_addrs->client_addr, bl, features);
2170
2171 ::encode(*pg_temp, bl);
2172 ::encode(*primary_temp, bl);
2173 if (osd_primary_affinity) {
2174 ::encode(*osd_primary_affinity, bl);
2175 } else {
2176 vector<__u32> v;
2177 ::encode(v, bl);
2178 }
2179
2180 // crush
2181 bufferlist cbl;
2182 crush->encode(cbl, features);
2183 ::encode(cbl, bl);
2184 ::encode(erasure_code_profiles, bl);
2185
2186 if (v >= 4) {
2187 ::encode(pg_upmap, bl);
2188 ::encode(pg_upmap_items, bl);
2189 } else {
2190 assert(pg_upmap.empty());
2191 assert(pg_upmap_items.empty());
2192 }
2193 ENCODE_FINISH(bl); // client-usable data
2194 }
2195
2196 {
2197 uint8_t target_v = 4;
2198 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2199 target_v = 1;
2200 }
2201 ENCODE_START(target_v, 1, bl); // extended, osd-only data
2202 ::encode(osd_addrs->hb_back_addr, bl, features);
2203 ::encode(osd_info, bl);
2204 {
2205 // put this in a sorted, ordered map<> so that we encode in a
2206 // deterministic order.
2207 map<entity_addr_t,utime_t> blacklist_map;
2208 for (const auto &addr : blacklist)
2209 blacklist_map.insert(make_pair(addr.first, addr.second));
2210 ::encode(blacklist_map, bl, features);
2211 }
2212 ::encode(osd_addrs->cluster_addr, bl, features);
2213 ::encode(cluster_snapshot_epoch, bl);
2214 ::encode(cluster_snapshot, bl);
2215 ::encode(*osd_uuid, bl);
2216 ::encode(osd_xinfo, bl);
2217 ::encode(osd_addrs->hb_front_addr, bl, features);
2218 if (target_v >= 2) {
2219 ::encode(nearfull_ratio, bl);
2220 ::encode(full_ratio, bl);
2221 ::encode(backfillfull_ratio, bl);
2222 ::encode(require_min_compat_client, bl);
2223 }
2224 ENCODE_FINISH(bl); // osd-only data
2225 }
2226
2227 ::encode((uint32_t)0, bl); // dummy crc
2228 crc_it = bl.end();
2229 crc_it.advance(-4);
2230 tail_offset = bl.length();
2231
2232 ENCODE_FINISH(bl); // meta-encoding wrapper
2233
2234 // fill in crc
2235 bufferlist front;
2236 front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
2237 crc = front.crc32c(-1);
2238 if (tail_offset < bl.length()) {
2239 bufferlist tail;
2240 tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
2241 crc = tail.crc32c(crc);
2242 }
2243 ceph_le32 crc_le;
2244 crc_le = crc;
2245 crc_it.copy_in(4, (char*)&crc_le);
2246 crc_defined = true;
2247 }
2248
2249 void OSDMap::decode(bufferlist& bl)
2250 {
2251 auto p = bl.begin();
2252 decode(p);
2253 }
2254
2255 void OSDMap::decode_classic(bufferlist::iterator& p)
2256 {
2257 __u32 n, t;
2258 __u16 v;
2259 ::decode(v, p);
2260
2261 // base
2262 ::decode(fsid, p);
2263 ::decode(epoch, p);
2264 ::decode(created, p);
2265 ::decode(modified, p);
2266
2267 if (v < 6) {
2268 if (v < 4) {
2269 int32_t max_pools = 0;
2270 ::decode(max_pools, p);
2271 pool_max = max_pools;
2272 }
2273 pools.clear();
2274 ::decode(n, p);
2275 while (n--) {
2276 ::decode(t, p);
2277 ::decode(pools[t], p);
2278 }
2279 if (v == 4) {
2280 ::decode(n, p);
2281 pool_max = n;
2282 } else if (v == 5) {
2283 pool_name.clear();
2284 ::decode(n, p);
2285 while (n--) {
2286 ::decode(t, p);
2287 ::decode(pool_name[t], p);
2288 }
2289 ::decode(n, p);
2290 pool_max = n;
2291 }
2292 } else {
2293 ::decode(pools, p);
2294 ::decode(pool_name, p);
2295 ::decode(pool_max, p);
2296 }
2297 // kludge around some old bug that zeroed out pool_max (#2307)
2298 if (pools.size() && pool_max < pools.rbegin()->first) {
2299 pool_max = pools.rbegin()->first;
2300 }
2301
2302 ::decode(flags, p);
2303
2304 ::decode(max_osd, p);
2305 ::decode(osd_state, p);
2306 ::decode(osd_weight, p);
2307 ::decode(osd_addrs->client_addr, p);
2308 if (v <= 5) {
2309 pg_temp->clear();
2310 ::decode(n, p);
2311 while (n--) {
2312 old_pg_t opg;
2313 ::decode_raw(opg, p);
2314 ::decode((*pg_temp)[pg_t(opg)], p);
2315 }
2316 } else {
2317 ::decode(*pg_temp, p);
2318 }
2319
2320 // crush
2321 bufferlist cbl;
2322 ::decode(cbl, p);
2323 auto cblp = cbl.begin();
2324 crush->decode(cblp);
2325
2326 // extended
2327 __u16 ev = 0;
2328 if (v >= 5)
2329 ::decode(ev, p);
2330 ::decode(osd_addrs->hb_back_addr, p);
2331 ::decode(osd_info, p);
2332 if (v < 5)
2333 ::decode(pool_name, p);
2334
2335 ::decode(blacklist, p);
2336 if (ev >= 6)
2337 ::decode(osd_addrs->cluster_addr, p);
2338 else
2339 osd_addrs->cluster_addr.resize(osd_addrs->client_addr.size());
2340
2341 if (ev >= 7) {
2342 ::decode(cluster_snapshot_epoch, p);
2343 ::decode(cluster_snapshot, p);
2344 }
2345
2346 if (ev >= 8) {
2347 ::decode(*osd_uuid, p);
2348 } else {
2349 osd_uuid->resize(max_osd);
2350 }
2351 if (ev >= 9)
2352 ::decode(osd_xinfo, p);
2353 else
2354 osd_xinfo.resize(max_osd);
2355
2356 if (ev >= 10)
2357 ::decode(osd_addrs->hb_front_addr, p);
2358 else
2359 osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
2360
2361 osd_primary_affinity.reset();
2362
2363 post_decode();
2364 }
2365
2366 void OSDMap::decode(bufferlist::iterator& bl)
2367 {
2368 /**
2369 * Older encodings of the OSDMap had a single struct_v which
2370 * covered the whole encoding, and was prior to our modern
2371 * stuff which includes a compatv and a size. So if we see
2372 * a struct_v < 7, we must rewind to the beginning and use our
2373 * classic decoder.
2374 */
2375 size_t start_offset = bl.get_off();
2376 size_t tail_offset = 0;
2377 bufferlist crc_front, crc_tail;
2378
2379 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
2380 if (struct_v < 7) {
2381 int struct_v_size = sizeof(struct_v);
2382 bl.advance(-struct_v_size);
2383 decode_classic(bl);
2384 return;
2385 }
2386 /**
2387 * Since we made it past that hurdle, we can use our normal paths.
2388 */
2389 {
2390 DECODE_START(4, bl); // client-usable data
2391 // base
2392 ::decode(fsid, bl);
2393 ::decode(epoch, bl);
2394 ::decode(created, bl);
2395 ::decode(modified, bl);
2396
2397 ::decode(pools, bl);
2398 ::decode(pool_name, bl);
2399 ::decode(pool_max, bl);
2400
2401 ::decode(flags, bl);
2402
2403 ::decode(max_osd, bl);
2404 ::decode(osd_state, bl);
2405 ::decode(osd_weight, bl);
2406 ::decode(osd_addrs->client_addr, bl);
2407
2408 ::decode(*pg_temp, bl);
2409 ::decode(*primary_temp, bl);
2410 if (struct_v >= 2) {
2411 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
2412 ::decode(*osd_primary_affinity, bl);
2413 if (osd_primary_affinity->empty())
2414 osd_primary_affinity.reset();
2415 } else {
2416 osd_primary_affinity.reset();
2417 }
2418
2419 // crush
2420 bufferlist cbl;
2421 ::decode(cbl, bl);
2422 auto cblp = cbl.begin();
2423 crush->decode(cblp);
2424 if (struct_v >= 3) {
2425 ::decode(erasure_code_profiles, bl);
2426 } else {
2427 erasure_code_profiles.clear();
2428 }
2429 if (struct_v >= 4) {
2430 ::decode(pg_upmap, bl);
2431 ::decode(pg_upmap_items, bl);
2432 } else {
2433 pg_upmap.clear();
2434 pg_upmap_items.clear();
2435 }
2436 DECODE_FINISH(bl); // client-usable data
2437 }
2438
2439 {
2440 DECODE_START(4, bl); // extended, osd-only data
2441 ::decode(osd_addrs->hb_back_addr, bl);
2442 ::decode(osd_info, bl);
2443 ::decode(blacklist, bl);
2444 ::decode(osd_addrs->cluster_addr, bl);
2445 ::decode(cluster_snapshot_epoch, bl);
2446 ::decode(cluster_snapshot, bl);
2447 ::decode(*osd_uuid, bl);
2448 ::decode(osd_xinfo, bl);
2449 ::decode(osd_addrs->hb_front_addr, bl);
2450 if (struct_v >= 2) {
2451 ::decode(nearfull_ratio, bl);
2452 ::decode(full_ratio, bl);
2453 } else {
2454 nearfull_ratio = 0;
2455 full_ratio = 0;
2456 }
2457 if (struct_v >= 3) {
2458 ::decode(backfillfull_ratio, bl);
2459 } else {
2460 backfillfull_ratio = 0;
2461 }
2462 if (struct_v >= 4)
2463 ::decode(require_min_compat_client, bl);
2464 DECODE_FINISH(bl); // osd-only data
2465 }
2466
2467 if (struct_v >= 8) {
2468 crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
2469 ::decode(crc, bl);
2470 tail_offset = bl.get_off();
2471 crc_defined = true;
2472 } else {
2473 crc_defined = false;
2474 crc = 0;
2475 }
2476
2477 DECODE_FINISH(bl); // wrapper
2478
2479 if (tail_offset) {
2480 // verify crc
2481 uint32_t actual = crc_front.crc32c(-1);
2482 if (tail_offset < bl.get_off()) {
2483 bufferlist tail;
2484 tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
2485 actual = tail.crc32c(actual);
2486 }
2487 if (crc != actual) {
2488 ostringstream ss;
2489 ss << "bad crc, actual " << actual << " != expected " << crc;
2490 string s = ss.str();
2491 throw buffer::malformed_input(s.c_str());
2492 }
2493 }
2494
2495 post_decode();
2496 }
2497
2498 void OSDMap::post_decode()
2499 {
2500 // index pool names
2501 name_pool.clear();
2502 for (const auto &pname : pool_name) {
2503 name_pool[pname.second] = pname.first;
2504 }
2505
2506 calc_num_osds();
2507 _calc_up_osd_features();
2508 }
2509
2510 void OSDMap::dump_erasure_code_profiles(
2511 const mempool::osdmap::map<string,map<string,string>>& profiles,
2512 Formatter *f)
2513 {
2514 f->open_object_section("erasure_code_profiles");
2515 for (const auto &profile : profiles) {
2516 f->open_object_section(profile.first.c_str());
2517 for (const auto &profm : profile.second) {
2518 f->dump_string(profm.first.c_str(), profm.second.c_str());
2519 }
2520 f->close_section();
2521 }
2522 f->close_section();
2523 }
2524
2525 void OSDMap::dump(Formatter *f) const
2526 {
2527 f->dump_int("epoch", get_epoch());
2528 f->dump_stream("fsid") << get_fsid();
2529 f->dump_stream("created") << get_created();
2530 f->dump_stream("modified") << get_modified();
2531 f->dump_string("flags", get_flag_string());
2532 f->dump_float("full_ratio", full_ratio);
2533 f->dump_float("backfillfull_ratio", backfillfull_ratio);
2534 f->dump_float("nearfull_ratio", nearfull_ratio);
2535 f->dump_string("cluster_snapshot", get_cluster_snapshot());
2536 f->dump_int("pool_max", get_pool_max());
2537 f->dump_int("max_osd", get_max_osd());
2538 f->dump_string("require_min_compat_client", require_min_compat_client);
2539 auto mv = get_min_compat_client();
2540 f->dump_string("min_compat_client", mv.first);
2541 f->dump_string("min_compat_client_version", mv.second);
2542
2543 f->open_array_section("pools");
2544 for (const auto &pool : pools) {
2545 std::string name("<unknown>");
2546 const auto &pni = pool_name.find(pool.first);
2547 if (pni != pool_name.end())
2548 name = pni->second;
2549 f->open_object_section("pool");
2550 f->dump_int("pool", pool.first);
2551 f->dump_string("pool_name", name);
2552 pool.second.dump(f);
2553 f->close_section();
2554 }
2555 f->close_section();
2556
2557 f->open_array_section("osds");
2558 for (int i=0; i<get_max_osd(); i++)
2559 if (exists(i)) {
2560 f->open_object_section("osd_info");
2561 f->dump_int("osd", i);
2562 f->dump_stream("uuid") << get_uuid(i);
2563 f->dump_int("up", is_up(i));
2564 f->dump_int("in", is_in(i));
2565 f->dump_float("weight", get_weightf(i));
2566 f->dump_float("primary_affinity", get_primary_affinityf(i));
2567 get_info(i).dump(f);
2568 f->dump_stream("public_addr") << get_addr(i);
2569 f->dump_stream("cluster_addr") << get_cluster_addr(i);
2570 f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
2571 f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
2572
2573 set<string> st;
2574 get_state(i, st);
2575 f->open_array_section("state");
2576 for (const auto &state : st)
2577 f->dump_string("state", state);
2578 f->close_section();
2579
2580 f->close_section();
2581 }
2582 f->close_section();
2583
2584 f->open_array_section("osd_xinfo");
2585 for (int i=0; i<get_max_osd(); i++) {
2586 if (exists(i)) {
2587 f->open_object_section("xinfo");
2588 f->dump_int("osd", i);
2589 osd_xinfo[i].dump(f);
2590 f->close_section();
2591 }
2592 }
2593 f->close_section();
2594
2595 f->open_array_section("pg_upmap");
2596 for (auto& p : pg_upmap) {
2597 f->open_object_section("mapping");
2598 f->dump_stream("pgid") << p.first;
2599 f->open_array_section("osds");
2600 for (auto q : p.second) {
2601 f->dump_int("osd", q);
2602 }
2603 f->close_section();
2604 f->close_section();
2605 }
2606 f->close_section();
2607 f->open_array_section("pg_upmap_items");
2608 for (auto& p : pg_upmap_items) {
2609 f->open_object_section("mapping");
2610 f->dump_stream("pgid") << p.first;
2611 f->open_array_section("mappings");
2612 for (auto& q : p.second) {
2613 f->open_object_section("mapping");
2614 f->dump_int("from", q.first);
2615 f->dump_int("to", q.second);
2616 f->close_section();
2617 }
2618 f->close_section();
2619 f->close_section();
2620 }
2621 f->close_section();
2622 f->open_array_section("pg_temp");
2623 for (const auto &pg : *pg_temp) {
2624 f->open_object_section("osds");
2625 f->dump_stream("pgid") << pg.first;
2626 f->open_array_section("osds");
2627 for (const auto osd : pg.second)
2628 f->dump_int("osd", osd);
2629 f->close_section();
2630 f->close_section();
2631 }
2632 f->close_section();
2633
2634 f->open_array_section("primary_temp");
2635 for (const auto &pg : *primary_temp) {
2636 f->dump_stream("pgid") << pg.first;
2637 f->dump_int("osd", pg.second);
2638 }
2639 f->close_section(); // primary_temp
2640
2641 f->open_object_section("blacklist");
2642 for (const auto &addr : blacklist) {
2643 stringstream ss;
2644 ss << addr.first;
2645 f->dump_stream(ss.str().c_str()) << addr.second;
2646 }
2647 f->close_section();
2648
2649 dump_erasure_code_profiles(erasure_code_profiles, f);
2650 }
2651
2652 void OSDMap::generate_test_instances(list<OSDMap*>& o)
2653 {
2654 o.push_back(new OSDMap);
2655
2656 CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
2657 o.push_back(new OSDMap);
2658 uuid_d fsid;
2659 o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
2660 o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
2661 o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
2662 cct->put();
2663 }
2664
2665 string OSDMap::get_flag_string(unsigned f)
2666 {
2667 string s;
2668 if ( f& CEPH_OSDMAP_NEARFULL)
2669 s += ",nearfull";
2670 if (f & CEPH_OSDMAP_FULL)
2671 s += ",full";
2672 if (f & CEPH_OSDMAP_PAUSERD)
2673 s += ",pauserd";
2674 if (f & CEPH_OSDMAP_PAUSEWR)
2675 s += ",pausewr";
2676 if (f & CEPH_OSDMAP_PAUSEREC)
2677 s += ",pauserec";
2678 if (f & CEPH_OSDMAP_NOUP)
2679 s += ",noup";
2680 if (f & CEPH_OSDMAP_NODOWN)
2681 s += ",nodown";
2682 if (f & CEPH_OSDMAP_NOOUT)
2683 s += ",noout";
2684 if (f & CEPH_OSDMAP_NOIN)
2685 s += ",noin";
2686 if (f & CEPH_OSDMAP_NOBACKFILL)
2687 s += ",nobackfill";
2688 if (f & CEPH_OSDMAP_NOREBALANCE)
2689 s += ",norebalance";
2690 if (f & CEPH_OSDMAP_NORECOVER)
2691 s += ",norecover";
2692 if (f & CEPH_OSDMAP_NOSCRUB)
2693 s += ",noscrub";
2694 if (f & CEPH_OSDMAP_NODEEP_SCRUB)
2695 s += ",nodeep-scrub";
2696 if (f & CEPH_OSDMAP_NOTIERAGENT)
2697 s += ",notieragent";
2698 if (f & CEPH_OSDMAP_SORTBITWISE)
2699 s += ",sortbitwise";
2700 if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
2701 s += ",require_jewel_osds";
2702 if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
2703 s += ",require_kraken_osds";
2704 if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
2705 s += ",require_luminous_osds";
2706 if (s.length())
2707 s.erase(0, 1);
2708 return s;
2709 }
2710
2711 string OSDMap::get_flag_string() const
2712 {
2713 return get_flag_string(flags);
2714 }
2715
// Plain (item, depth, weight) triple; all fields default to zero.
struct qi {
  int item = 0;
  int depth = 0;
  float weight = 0;

  qi() = default;
  qi(int i, int d, float w) : item(i), depth(d), weight(w) {}
};
2723
2724 void OSDMap::print_pools(ostream& out) const
2725 {
2726 for (const auto &pool : pools) {
2727 std::string name("<unknown>");
2728 const auto &pni = pool_name.find(pool.first);
2729 if (pni != pool_name.end())
2730 name = pni->second;
2731 out << "pool " << pool.first
2732 << " '" << name
2733 << "' " << pool.second << "\n";
2734
2735 for (const auto &snap : pool.second.snaps)
2736 out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
2737
2738 if (!pool.second.removed_snaps.empty())
2739 out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
2740 }
2741 out << std::endl;
2742 }
2743
2744 void OSDMap::print(ostream& out) const
2745 {
2746 out << "epoch " << get_epoch() << "\n"
2747 << "fsid " << get_fsid() << "\n"
2748 << "created " << get_created() << "\n"
2749 << "modified " << get_modified() << "\n";
2750
2751 out << "flags " << get_flag_string() << "\n";
2752 out << "full_ratio " << full_ratio << "\n";
2753 out << "backfillfull_ratio " << backfillfull_ratio << "\n";
2754 out << "nearfull_ratio " << nearfull_ratio << "\n";
2755 if (require_min_compat_client.length()) {
2756 out << "require_min_compat_client " << require_min_compat_client << "\n";
2757 }
2758 auto mv = get_min_compat_client();
2759 out << "min_compat_client " << mv.first << " " << mv.second << "\n";
2760 if (get_cluster_snapshot().length())
2761 out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
2762 out << "\n";
2763
2764 print_pools(out);
2765
2766 out << "max_osd " << get_max_osd() << "\n";
2767 for (int i=0; i<get_max_osd(); i++) {
2768 if (exists(i)) {
2769 out << "osd." << i;
2770 out << (is_up(i) ? " up ":" down");
2771 out << (is_in(i) ? " in ":" out");
2772 out << " weight " << get_weightf(i);
2773 if (get_primary_affinity(i) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
2774 out << " primary_affinity " << get_primary_affinityf(i);
2775 const osd_info_t& info(get_info(i));
2776 out << " " << info;
2777 out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
2778 << " " << get_hb_front_addr(i);
2779 set<string> st;
2780 get_state(i, st);
2781 out << " " << st;
2782 if (!get_uuid(i).is_zero())
2783 out << " " << get_uuid(i);
2784 out << "\n";
2785 }
2786 }
2787 out << std::endl;
2788
2789 for (auto& p : pg_upmap) {
2790 out << "pg_upmap " << p.first << " " << p.second << "\n";
2791 }
2792 for (auto& p : pg_upmap_items) {
2793 out << "pg_upmap_items " << p.first << " " << p.second << "\n";
2794 }
2795
2796 for (const auto pg : *pg_temp)
2797 out << "pg_temp " << pg.first << " " << pg.second << "\n";
2798
2799 for (const auto pg : *primary_temp)
2800 out << "primary_temp " << pg.first << " " << pg.second << "\n";
2801
2802 for (const auto &addr : blacklist)
2803 out << "blacklist " << addr.first << " expires " << addr.second << "\n";
2804
2805 // ignore pg_swap_primary
2806 }
2807
2808 class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
2809 public:
2810 typedef CrushTreeDumper::Dumper<TextTable> Parent;
2811 OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_)
2812 : Parent(crush), osdmap(osdmap_) {}
2813
2814 void dump(TextTable *tbl) {
2815 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
2816 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
2817 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
2818 tbl->define_column("UP/DOWN", TextTable::LEFT, TextTable::RIGHT);
2819 tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
2820 tbl->define_column("PRIMARY-AFFINITY", TextTable::LEFT, TextTable::RIGHT);
2821
2822 Parent::dump(tbl);
2823
2824 for (int i = 0; i < osdmap->get_max_osd(); i++) {
2825 if (osdmap->exists(i) && !is_touched(i))
2826 dump_item(CrushTreeDumper::Item(i, 0, 0), tbl);
2827 }
2828 }
2829
2830 protected:
2831 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
2832
2833 *tbl << qi.id
2834 << weightf_t(qi.weight);
2835
2836 ostringstream name;
2837 for (int k = 0; k < qi.depth; k++)
2838 name << " ";
2839 if (qi.is_bucket()) {
2840 name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
2841 << crush->get_item_name(qi.id);
2842 } else {
2843 name << "osd." << qi.id;
2844 }
2845 *tbl << name.str();
2846
2847 if (!qi.is_bucket()) {
2848 if (!osdmap->exists(qi.id)) {
2849 *tbl << "DNE"
2850 << 0;
2851 } else {
2852 *tbl << (osdmap->is_up(qi.id) ? "up" : "down")
2853 << weightf_t(osdmap->get_weightf(qi.id))
2854 << weightf_t(osdmap->get_primary_affinityf(qi.id));
2855 }
2856 }
2857 *tbl << TextTable::endrow;
2858 }
2859
2860 private:
2861 const OSDMap *osdmap;
2862 };
2863
2864 class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
2865 public:
2866 typedef CrushTreeDumper::FormattingDumper Parent;
2867
2868 OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_)
2869 : Parent(crush), osdmap(osdmap_) {}
2870
2871 void dump(Formatter *f) {
2872 f->open_array_section("nodes");
2873 Parent::dump(f);
2874 f->close_section();
2875 f->open_array_section("stray");
2876 for (int i = 0; i < osdmap->get_max_osd(); i++) {
2877 if (osdmap->exists(i) && !is_touched(i))
2878 dump_item(CrushTreeDumper::Item(i, 0, 0), f);
2879 }
2880 f->close_section();
2881 }
2882
2883 protected:
2884 void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
2885 Parent::dump_item_fields(qi, f);
2886 if (!qi.is_bucket())
2887 {
2888 f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
2889 f->dump_string("status", osdmap->is_up(qi.id) ? "up" : "down");
2890 f->dump_float("reweight", osdmap->get_weightf(qi.id));
2891 f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
2892 }
2893 }
2894
2895 private:
2896 const OSDMap *osdmap;
2897 };
2898
2899 void OSDMap::print_tree(Formatter *f, ostream *out) const
2900 {
2901 if (f)
2902 OSDTreeFormattingDumper(crush.get(), this).dump(f);
2903 else {
2904 assert(out);
2905 TextTable tbl;
2906 OSDTreePlainDumper(crush.get(), this).dump(&tbl);
2907 *out << tbl;
2908 }
2909 }
2910
2911 void OSDMap::print_summary(Formatter *f, ostream& out) const
2912 {
2913 if (f) {
2914 f->open_object_section("osdmap");
2915 f->dump_int("epoch", get_epoch());
2916 f->dump_int("num_osds", get_num_osds());
2917 f->dump_int("num_up_osds", get_num_up_osds());
2918 f->dump_int("num_in_osds", get_num_in_osds());
2919 f->dump_bool("full", test_flag(CEPH_OSDMAP_FULL) ? true : false);
2920 f->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL) ? true : false);
2921 f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
2922 f->close_section();
2923 } else {
2924 out << " osdmap e" << get_epoch() << ": "
2925 << get_num_osds() << " osds: "
2926 << get_num_up_osds() << " up, "
2927 << get_num_in_osds() << " in";
2928 if (get_num_pg_temp())
2929 out << "; " << get_num_pg_temp() << " remapped pgs";
2930 out << "\n";
2931 uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
2932 if (important_flags)
2933 out << " flags " << get_flag_string(important_flags) << "\n";
2934 }
2935 }
2936
2937 void OSDMap::print_oneline_summary(ostream& out) const
2938 {
2939 out << "e" << get_epoch() << ": "
2940 << get_num_osds() << " osds: "
2941 << get_num_up_osds() << " up, "
2942 << get_num_in_osds() << " in";
2943 if (test_flag(CEPH_OSDMAP_FULL))
2944 out << " full";
2945 else if (test_flag(CEPH_OSDMAP_NEARFULL))
2946 out << " nearfull";
2947 }
2948
2949 bool OSDMap::crush_ruleset_in_use(int ruleset) const
2950 {
2951 for (const auto &pool : pools) {
2952 if (pool.second.crush_ruleset == ruleset)
2953 return true;
2954 }
2955 return false;
2956 }
2957
2958 int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
2959 int nosd, int pg_bits, int pgp_bits)
2960 {
2961 ldout(cct, 10) << "build_simple on " << num_osd
2962 << " osds with " << pg_bits << " pg bits per osd, "
2963 << dendl;
2964 epoch = e;
2965 set_fsid(fsid);
2966 created = modified = ceph_clock_now();
2967
2968 if (nosd >= 0) {
2969 set_max_osd(nosd);
2970 } else {
2971 // count osds
2972 int maxosd = 0;
2973 const md_config_t *conf = cct->_conf;
2974 vector<string> sections;
2975 conf->get_all_sections(sections);
2976
2977 for (auto &section : sections) {
2978 if (section.find("osd.") != 0)
2979 continue;
2980
2981 const char *begin = section.c_str() + 4;
2982 char *end = (char*)begin;
2983 int o = strtol(begin, &end, 10);
2984 if (*end != '\0')
2985 continue;
2986
2987 if (o > cct->_conf->mon_max_osd) {
2988 lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
2989 return -ERANGE;
2990 }
2991
2992 if (o > maxosd)
2993 maxosd = o;
2994 }
2995
2996 set_max_osd(maxosd + 1);
2997 }
2998
2999 // pgp_num <= pg_num
3000 if (pgp_bits > pg_bits)
3001 pgp_bits = pg_bits;
3002
3003 vector<string> pool_names;
3004 pool_names.push_back("rbd");
3005
3006 stringstream ss;
3007 int r;
3008 if (nosd >= 0)
3009 r = build_simple_crush_map(cct, *crush, nosd, &ss);
3010 else
3011 r = build_simple_crush_map_from_conf(cct, *crush, &ss);
3012 assert(r == 0);
3013
3014 int poolbase = get_max_osd() ? get_max_osd() : 1;
3015
3016 int const default_replicated_ruleset = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
3017 assert(default_replicated_ruleset >= 0);
3018
3019 for (auto &plname : pool_names) {
3020 int64_t pool = ++pool_max;
3021 pools[pool].type = pg_pool_t::TYPE_REPLICATED;
3022 pools[pool].flags = cct->_conf->osd_pool_default_flags;
3023 if (cct->_conf->osd_pool_default_flag_hashpspool)
3024 pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
3025 if (cct->_conf->osd_pool_default_flag_nodelete)
3026 pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
3027 if (cct->_conf->osd_pool_default_flag_nopgchange)
3028 pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
3029 if (cct->_conf->osd_pool_default_flag_nosizechange)
3030 pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
3031 pools[pool].size = cct->_conf->osd_pool_default_size;
3032 pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
3033 pools[pool].crush_ruleset = default_replicated_ruleset;
3034 pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
3035 pools[pool].set_pg_num(poolbase << pg_bits);
3036 pools[pool].set_pgp_num(poolbase << pgp_bits);
3037 pools[pool].last_change = epoch;
3038 pool_name[pool] = plname;
3039 name_pool[plname] = pool;
3040 }
3041
3042 for (int i=0; i<get_max_osd(); i++) {
3043 set_state(i, 0);
3044 set_weight(i, CEPH_OSD_OUT);
3045 }
3046
3047 map<string,string> profile_map;
3048 r = get_erasure_code_profile_default(cct, profile_map, &ss);
3049 if (r < 0) {
3050 lderr(cct) << ss.str() << dendl;
3051 return r;
3052 }
3053 set_erasure_code_profile("default", profile_map);
3054 return 0;
3055 }
3056
3057 int OSDMap::get_erasure_code_profile_default(CephContext *cct,
3058 map<string,string> &profile_map,
3059 ostream *ss)
3060 {
3061 int r = get_json_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
3062 *ss,
3063 &profile_map);
3064 return r;
3065 }
3066
3067 int OSDMap::_build_crush_types(CrushWrapper& crush)
3068 {
3069 crush.set_type_name(0, "osd");
3070 crush.set_type_name(1, "host");
3071 crush.set_type_name(2, "chassis");
3072 crush.set_type_name(3, "rack");
3073 crush.set_type_name(4, "row");
3074 crush.set_type_name(5, "pdu");
3075 crush.set_type_name(6, "pod");
3076 crush.set_type_name(7, "room");
3077 crush.set_type_name(8, "datacenter");
3078 crush.set_type_name(9, "region");
3079 crush.set_type_name(10, "root");
3080 return 10;
3081 }
3082
3083 int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
3084 int nosd, ostream *ss)
3085 {
3086 crush.create();
3087
3088 // root
3089 int root_type = _build_crush_types(crush);
3090 int rootid;
3091 int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
3092 root_type, 0, NULL, NULL, &rootid);
3093 assert(r == 0);
3094 crush.set_item_name(rootid, "default");
3095
3096 for (int o=0; o<nosd; o++) {
3097 map<string,string> loc;
3098 loc["host"] = "localhost";
3099 loc["rack"] = "localrack";
3100 loc["root"] = "default";
3101 ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
3102 char name[32];
3103 snprintf(name, sizeof(name), "osd.%d", o);
3104 crush.insert_item(cct, o, 1.0, name, loc);
3105 }
3106
3107 build_simple_crush_rulesets(cct, crush, "default", ss);
3108
3109 crush.finalize();
3110
3111 return 0;
3112 }
3113
3114 int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
3115 CrushWrapper& crush,
3116 ostream *ss)
3117 {
3118 const md_config_t *conf = cct->_conf;
3119
3120 crush.create();
3121
3122 // root
3123 int root_type = _build_crush_types(crush);
3124 int rootid;
3125 int r = crush.add_bucket(0, 0,
3126 CRUSH_HASH_DEFAULT,
3127 root_type, 0, NULL, NULL, &rootid);
3128 assert(r == 0);
3129 crush.set_item_name(rootid, "default");
3130
3131 // add osds
3132 vector<string> sections;
3133 conf->get_all_sections(sections);
3134
3135 for (auto &section : sections) {
3136 if (section.find("osd.") != 0)
3137 continue;
3138
3139 const char *begin = section.c_str() + 4;
3140 char *end = (char*)begin;
3141 int o = strtol(begin, &end, 10);
3142 if (*end != '\0')
3143 continue;
3144
3145 string host, rack, row, room, dc, pool;
3146 vector<string> sectiontmp;
3147 sectiontmp.push_back("osd");
3148 sectiontmp.push_back(section);
3149 conf->get_val_from_conf_file(sectiontmp, "host", host, false);
3150 conf->get_val_from_conf_file(sectiontmp, "rack", rack, false);
3151 conf->get_val_from_conf_file(sectiontmp, "row", row, false);
3152 conf->get_val_from_conf_file(sectiontmp, "room", room, false);
3153 conf->get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
3154 conf->get_val_from_conf_file(sectiontmp, "root", pool, false);
3155
3156 if (host.length() == 0)
3157 host = "unknownhost";
3158 if (rack.length() == 0)
3159 rack = "unknownrack";
3160
3161 map<string,string> loc;
3162 loc["host"] = host;
3163 loc["rack"] = rack;
3164 if (row.size())
3165 loc["row"] = row;
3166 if (room.size())
3167 loc["room"] = room;
3168 if (dc.size())
3169 loc["datacenter"] = dc;
3170 loc["root"] = "default";
3171
3172 ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
3173 crush.insert_item(cct, o, 1.0, section, loc);
3174 }
3175
3176 build_simple_crush_rulesets(cct, crush, "default", ss);
3177
3178 crush.finalize();
3179
3180 return 0;
3181 }
3182
3183
3184 int OSDMap::build_simple_crush_rulesets(CephContext *cct,
3185 CrushWrapper& crush,
3186 const string& root,
3187 ostream *ss)
3188 {
3189 int crush_ruleset =
3190 crush._get_osd_pool_default_crush_replicated_ruleset(cct, true);
3191 string failure_domain =
3192 crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
3193
3194 if (crush_ruleset == CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
3195 crush_ruleset = -1; // create ruleset 0 by default
3196
3197 int r;
3198 r = crush.add_simple_ruleset_at("replicated_ruleset", root, failure_domain,
3199 "firstn", pg_pool_t::TYPE_REPLICATED,
3200 crush_ruleset, ss);
3201 if (r < 0)
3202 return r;
3203 // do not add an erasure rule by default or else we will implicitly
3204 // require the crush_v2 feature of clients
3205 return 0;
3206 }
3207
3208 int OSDMap::summarize_mapping_stats(
3209 OSDMap *newmap,
3210 const set<int64_t> *pools,
3211 std::string *out,
3212 Formatter *f) const
3213 {
3214 set<int64_t> ls;
3215 if (pools) {
3216 ls = *pools;
3217 } else {
3218 for (auto &p : get_pools())
3219 ls.insert(p.first);
3220 }
3221
3222 unsigned total_pg = 0;
3223 unsigned moved_pg = 0;
3224 vector<unsigned> base_by_osd(get_max_osd(), 0);
3225 vector<unsigned> new_by_osd(get_max_osd(), 0);
3226 for (int64_t pool_id : ls) {
3227 const pg_pool_t *pi = get_pg_pool(pool_id);
3228 vector<int> up, up2, acting;
3229 int up_primary, acting_primary;
3230 for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
3231 pg_t pgid(ps, pool_id, -1);
3232 total_pg += pi->get_size();
3233 pg_to_up_acting_osds(pgid, &up, &up_primary,
3234 &acting, &acting_primary);
3235 for (int osd : up) {
3236 if (osd >= 0 && osd < get_max_osd())
3237 ++base_by_osd[osd];
3238 }
3239 if (newmap) {
3240 newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary,
3241 &acting, &acting_primary);
3242 for (int osd : up2) {
3243 if (osd >= 0 && osd < get_max_osd())
3244 ++new_by_osd[osd];
3245 }
3246 if (pi->type == pg_pool_t::TYPE_ERASURE) {
3247 for (unsigned i=0; i<up.size(); ++i) {
3248 if (up[i] != up2[i]) {
3249 ++moved_pg;
3250 }
3251 }
3252 } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
3253 for (int osd : up) {
3254 if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
3255 ++moved_pg;
3256 }
3257 }
3258 } else {
3259 assert(0 == "unhandled pool type");
3260 }
3261 }
3262 }
3263 }
3264
3265 unsigned num_up_in = 0;
3266 for (int osd = 0; osd < get_max_osd(); ++osd) {
3267 if (is_up(osd) && is_in(osd))
3268 ++num_up_in;
3269 }
3270 if (!num_up_in) {
3271 return -EINVAL;
3272 }
3273
3274 float avg_pg = (float)total_pg / (float)num_up_in;
3275 float base_stddev = 0, new_stddev = 0;
3276 int min = -1, max = -1;
3277 unsigned min_base_pg = 0, max_base_pg = 0;
3278 unsigned min_new_pg = 0, max_new_pg = 0;
3279 for (int osd = 0; osd < get_max_osd(); ++osd) {
3280 if (is_up(osd) && is_in(osd)) {
3281 float base_diff = (float)base_by_osd[osd] - avg_pg;
3282 base_stddev += base_diff * base_diff;
3283 float new_diff = (float)new_by_osd[osd] - avg_pg;
3284 new_stddev += new_diff * new_diff;
3285 if (min < 0 || base_by_osd[osd] < min_base_pg) {
3286 min = osd;
3287 min_base_pg = base_by_osd[osd];
3288 min_new_pg = new_by_osd[osd];
3289 }
3290 if (max < 0 || base_by_osd[osd] > max_base_pg) {
3291 max = osd;
3292 max_base_pg = base_by_osd[osd];
3293 max_new_pg = new_by_osd[osd];
3294 }
3295 }
3296 }
3297 base_stddev = sqrt(base_stddev / num_up_in);
3298 new_stddev = sqrt(new_stddev / num_up_in);
3299
3300 float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
3301
3302 ostringstream ss;
3303 if (f)
3304 f->open_object_section("utilization");
3305 if (newmap) {
3306 if (f) {
3307 f->dump_unsigned("moved_pgs", moved_pg);
3308 f->dump_unsigned("total_pgs", total_pg);
3309 } else {
3310 float percent = 0;
3311 if (total_pg)
3312 percent = (float)moved_pg * 100.0 / (float)total_pg;
3313 ss << "moved " << moved_pg << " / " << total_pg
3314 << " (" << percent << "%)\n";
3315 }
3316 }
3317 if (f) {
3318 f->dump_float("avg_pgs", avg_pg);
3319 f->dump_float("std_dev", base_stddev);
3320 f->dump_float("expected_baseline_std_dev", edev);
3321 if (newmap)
3322 f->dump_float("new_std_dev", new_stddev);
3323 } else {
3324 ss << "avg " << avg_pg << "\n";
3325 ss << "stddev " << base_stddev;
3326 if (newmap)
3327 ss << " -> " << new_stddev;
3328 ss << " (expected baseline " << edev << ")\n";
3329 }
3330 if (min >= 0) {
3331 if (f) {
3332 f->dump_unsigned("min_osd", min);
3333 f->dump_unsigned("min_osd_pgs", min_base_pg);
3334 if (newmap)
3335 f->dump_unsigned("new_min_osd_pgs", min_new_pg);
3336 } else {
3337 ss << "min osd." << min << " with " << min_base_pg;
3338 if (newmap)
3339 ss << " -> " << min_new_pg;
3340 ss << " pgs (" << (float)min_base_pg / avg_pg;
3341 if (newmap)
3342 ss << " -> " << (float)min_new_pg / avg_pg;
3343 ss << " * mean)\n";
3344 }
3345 }
3346 if (max >= 0) {
3347 if (f) {
3348 f->dump_unsigned("max_osd", max);
3349 f->dump_unsigned("max_osd_pgs", max_base_pg);
3350 if (newmap)
3351 f->dump_unsigned("new_max_osd_pgs", max_new_pg);
3352 } else {
3353 ss << "max osd." << max << " with " << max_base_pg;
3354 if (newmap)
3355 ss << " -> " << max_new_pg;
3356 ss << " pgs (" << (float)max_base_pg / avg_pg;
3357 if (newmap)
3358 ss << " -> " << (float)max_new_pg / avg_pg;
3359 ss << " * mean)\n";
3360 }
3361 }
3362 if (f)
3363 f->close_section();
3364 if (out)
3365 *out = ss.str();
3366 return 0;
3367 }
3368
3369
3370 int OSDMap::clean_pg_upmaps(
3371 CephContext *cct,
3372 Incremental *pending_inc)
3373 {
3374 ldout(cct, 10) << __func__ << dendl;
3375 int changed = 0;
3376 for (auto& p : pg_upmap) {
3377 vector<int> raw;
3378 int primary;
3379 pg_to_raw_osds(p.first, &raw, &primary);
3380 if (vectors_equal(raw, p.second)) {
3381 ldout(cct, 10) << " removing redundant pg_upmap " << p.first << " "
3382 << p.second << dendl;
3383 pending_inc->old_pg_upmap.insert(p.first);
3384 ++changed;
3385 }
3386 }
3387 for (auto& p : pg_upmap_items) {
3388 vector<int> raw;
3389 int primary;
3390 pg_to_raw_osds(p.first, &raw, &primary);
3391 mempool::osdmap::vector<pair<int,int>> newmap;
3392 for (auto& q : p.second) {
3393 if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
3394 newmap.push_back(q);
3395 }
3396 }
3397 if (newmap.empty()) {
3398 ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
3399 << p.second << dendl;
3400 pending_inc->old_pg_upmap_items.insert(p.first);
3401 ++changed;
3402 } else if (newmap != p.second) {
3403 ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
3404 << p.first << " " << p.second << " -> " << newmap << dendl;
3405 pending_inc->new_pg_upmap_items[p.first] = newmap;
3406 ++changed;
3407 }
3408 }
3409 return changed;
3410 }
3411
3412 bool OSDMap::try_pg_upmap(
3413 CephContext *cct,
3414 pg_t pg, ///< pg to potentially remap
3415 const set<int>& overfull, ///< osds we'd want to evacuate
3416 const vector<int>& underfull, ///< osds to move to, in order of preference
3417 vector<int> *orig,
3418 vector<int> *out) ///< resulting alternative mapping
3419 {
3420 const pg_pool_t *pool = get_pg_pool(pg.pool());
3421 if (!pool)
3422 return false;
3423 int rule = crush->find_rule(pool->get_crush_ruleset(), pool->get_type(),
3424 pool->get_size());
3425 if (rule < 0)
3426 return false;
3427
3428 // get original mapping
3429 _pg_to_raw_osds(*pool, pg, orig, NULL);
3430
3431 // make sure there is something there to remap
3432 bool any = false;
3433 for (auto osd : *orig) {
3434 if (overfull.count(osd)) {
3435 any = true;
3436 break;
3437 }
3438 }
3439 if (!any) {
3440 return false;
3441 }
3442
3443 int r = crush->try_remap_rule(
3444 cct,
3445 rule,
3446 pool->get_size(),
3447 overfull, underfull,
3448 *orig,
3449 out);
3450 if (r < 0)
3451 return false;
3452 if (*out == *orig)
3453 return false;
3454 return true;
3455 }
3456
3457 int OSDMap::calc_pg_upmaps(
3458 CephContext *cct,
3459 float max_deviation,
3460 int max,
3461 const set<int64_t>& only_pools,
3462 OSDMap::Incremental *pending_inc)
3463 {
3464 OSDMap tmp;
3465 tmp.deepish_copy_from(*this);
3466 int num_changed = 0;
3467 while (true) {
3468 map<int,set<pg_t>> pgs_by_osd;
3469 int total_pgs = 0;
3470 for (auto& i : pools) {
3471 if (!only_pools.empty() && !only_pools.count(i.first))
3472 continue;
3473 for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
3474 pg_t pg(ps, i.first);
3475 vector<int> up;
3476 tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
3477 for (auto osd : up) {
3478 if (osd != CRUSH_ITEM_NONE)
3479 pgs_by_osd[osd].insert(pg);
3480 }
3481 }
3482 total_pgs += i.second.get_size() * i.second.get_pg_num();
3483 }
3484 float osd_weight_total = 0;
3485 map<int,float> osd_weight;
3486 for (auto& i : pgs_by_osd) {
3487 float w = crush->get_item_weightf(i.first);
3488 osd_weight[i.first] = w;
3489 osd_weight_total += w;
3490 ldout(cct, 20) << " osd." << i.first << " weight " << w
3491 << " pgs " << i.second.size() << dendl;
3492 }
3493
3494 // NOTE: we assume we touch all osds with CRUSH!
3495 float pgs_per_weight = total_pgs / osd_weight_total;
3496 ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
3497 ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
3498
3499 // osd deviation
3500 map<int,float> osd_deviation; // osd, deviation(pgs)
3501 multimap<float,int> deviation_osd; // deviation(pgs), osd
3502 set<int> overfull;
3503 for (auto& i : pgs_by_osd) {
3504 float target = osd_weight[i.first] * pgs_per_weight;
3505 float deviation = (float)i.second.size() - target;
3506 ldout(cct, 20) << " osd." << i.first
3507 << "\tpgs " << i.second.size()
3508 << "\ttarget " << target
3509 << "\tdeviation " << deviation
3510 << dendl;
3511 osd_deviation[i.first] = deviation;
3512 deviation_osd.insert(make_pair(deviation, i.first));
3513 if (deviation > 0)
3514 overfull.insert(i.first);
3515 }
3516
3517 // build underfull, sorted from least-full to most-average
3518 vector<int> underfull;
3519 for (auto i = deviation_osd.begin();
3520 i != deviation_osd.end();
3521 ++i) {
3522 if (i->first >= -.999)
3523 break;
3524 underfull.push_back(i->second);
3525 }
3526 ldout(cct, 10) << " overfull " << overfull
3527 << " underfull " << underfull << dendl;
3528 if (overfull.empty() || underfull.empty())
3529 break;
3530
3531 // pick fullest
3532 bool restart = false;
3533 for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
3534 int osd = p->second;
3535 float target = osd_weight[osd] * pgs_per_weight;
3536 float deviation = deviation_osd.rbegin()->first;
3537 if (deviation/target < max_deviation) {
3538 ldout(cct, 10) << " osd." << osd
3539 << " target " << target
3540 << " deviation " << deviation
3541 << " -> " << deviation/target
3542 << " < max " << max_deviation << dendl;
3543 break;
3544 }
3545 int num_to_move = deviation;
3546 ldout(cct, 10) << " osd." << osd << " move " << num_to_move << dendl;
3547 if (num_to_move < 1)
3548 break;
3549
3550 set<pg_t>& pgs = pgs_by_osd[osd];
3551
3552 // look for remaps we can un-remap
3553 for (auto pg : pgs) {
3554 auto p = tmp.pg_upmap_items.find(pg);
3555 if (p != tmp.pg_upmap_items.end()) {
3556 for (auto q : p->second) {
3557 if (q.second == osd) {
3558 ldout(cct, 10) << " dropping pg_upmap_items " << pg
3559 << " " << p->second << dendl;
3560 tmp.pg_upmap_items.erase(p);
3561 pending_inc->old_pg_upmap_items.insert(pg);
3562 ++num_changed;
3563 restart = true;
3564 }
3565 }
3566 }
3567 if (restart)
3568 break;
3569 } // pg loop
3570 if (restart)
3571 break;
3572
3573 for (auto pg : pgs) {
3574 if (tmp.pg_upmap.count(pg) ||
3575 tmp.pg_upmap_items.count(pg)) {
3576 ldout(cct, 20) << " already remapped " << pg << dendl;
3577 continue;
3578 }
3579 ldout(cct, 10) << " trying " << pg << dendl;
3580 vector<int> orig, out;
3581 if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
3582 continue;
3583 }
3584 ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
3585 if (orig.size() != out.size()) {
3586 continue;
3587 }
3588 assert(orig != out);
3589 auto& rmi = tmp.pg_upmap_items[pg];
3590 for (unsigned i = 0; i < out.size(); ++i) {
3591 if (orig[i] != out[i]) {
3592 rmi.push_back(make_pair(orig[i], out[i]));
3593 }
3594 }
3595 pending_inc->new_pg_upmap_items[pg] = rmi;
3596 ldout(cct, 10) << " " << pg << " pg_upmap_items " << rmi << dendl;
3597 restart = true;
3598 ++num_changed;
3599 break;
3600 } // pg loop
3601 if (restart)
3602 break;
3603 } // osd loop
3604
3605 if (!restart) {
3606 ldout(cct, 10) << " failed to find any changes to make" << dendl;
3607 break;
3608 }
3609 if (--max == 0) {
3610 ldout(cct, 10) << " hit max iterations, stopping" << dendl;
3611 break;
3612 }
3613 }
3614 return num_changed;
3615 }