]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSDMap.h
import ceph quincy 17.2.1
[ceph.git] / ceph / src / osd / OSDMap.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #ifndef CEPH_OSDMAP_H
20 #define CEPH_OSDMAP_H
21
22 /*
23 * describe properties of the OSD cluster.
24 * disks, disk groups, total # osds,
25 *
26 */
27 #include <vector>
28 #include <list>
29 #include <set>
30 #include <map>
31 #include <memory>
32
33 #include <boost/smart_ptr/local_shared_ptr.hpp>
34 #include "include/btree_map.h"
35 #include "include/common_fwd.h"
36 #include "include/types.h"
37 #include "common/ceph_releases.h"
38 #include "osd_types.h"
39
40 //#include "include/ceph_features.h"
41 #include "crush/CrushWrapper.h"
42
43 // forward declaration
44 class CrushWrapper;
45 class health_check_map_t;
46
47 /*
48 * we track up to two intervals during which the osd was alive and
49 * healthy. the most recent is [up_from,up_thru), where up_thru is
50 * the last epoch the osd is known to have _started_. i.e., a lower
51 * bound on the actual osd death. down_at (if it is > up_from) is an
52 * upper bound on the actual osd death.
53 *
54 * the second is the last_clean interval [begin,end). in that case,
55 * the last interval is the last epoch known to have been either
56 * _finished_, or during which the osd cleanly shut down. when
57 * possible, we push this forward to the epoch the osd was eventually
58 * marked down.
59 *
60 * the lost_at is used to allow build_prior to proceed without waiting
61 * for an osd to recover. In certain cases, progress may be blocked
62 * because an osd is down that may contain updates (i.e., a pg may have
63 * gone rw during an interval). If the osd can't be brought online, we
64 * can force things to proceed knowing that we _might_ be losing some
 * acked writes. If the osd comes back to life later, that's fine too,
66 * but those writes will still be lost (the divergent objects will be
67 * thrown out).
68 */
/**
 * Per-osd liveness bookkeeping (see the block comment above for the
 * full interval semantics): the most recent up interval
 * [up_from, up_thru), an upper bound on actual death (down_at), the
 * last interval that ended with a clean shutdown
 * [last_clean_begin, last_clean_end), and the last epoch (if any) at
 * which the osd's data was declared lost (lost_at).
 */
struct osd_info_t {
  epoch_t last_clean_begin;  // last interval that ended with a clean osd shutdown
  epoch_t last_clean_end;
  epoch_t up_from;   // epoch osd marked up
  epoch_t up_thru;   // lower bound on actual osd death (if > up_from)
  epoch_t down_at;   // upper bound on actual osd death (if > up_from)
  epoch_t lost_at;   // last epoch we decided data was "lost"

  // all epochs start at 0 (i.e. "never")
  osd_info_t() : last_clean_begin(0), last_clean_end(0),
		 up_from(0), up_thru(0), down_at(0), lost_at(0) {}

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<osd_info_t*>& o);
};
WRITE_CLASS_ENCODER(osd_info_t)
86
87 std::ostream& operator<<(std::ostream& out, const osd_info_t& info);
88
/**
 * Extended per-osd info: laggy heuristics, advertised feature bits,
 * the weight held before an automatic mark-out, purged_snaps scrub
 * bookkeeping, and the last epoch the osd was confirmed dead.
 */
struct osd_xinfo_t {
  utime_t down_stamp;      ///< timestamp when we were last marked down
  float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
  __u32 laggy_interval;    ///< average interval between being marked laggy and recovering
  uint64_t features;       ///< features supported by this osd we should know about
  __u32 old_weight;        ///< weight prior to being auto marked out
  utime_t last_purged_snaps_scrub; ///< last scrub of purged_snaps
  epoch_t dead_epoch = 0;  ///< last epoch we were confirmed dead (not just down)

  // dead_epoch is covered by its NSDMI; the utime_t members are assumed
  // to default-construct to zero -- NOTE(review): confirm utime_t's ctor.
  osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
		  features(0), old_weight(0) {}

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<osd_xinfo_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t)
107
108 std::ostream& operator<<(std::ostream& out, const osd_xinfo_t& xi);
109
110
/**
 * PGTempMap -- container for the osdmap's pg_temp table (pg -> explicit
 * vector of osd ids used while a pg is being rebuilt).
 *
 * The active ("#if 1") implementation flattens all entries into a single
 * bufferlist ('data') and keeps a btree index of pg_t -> pointer into
 * that buffer.  Each entry is stored in wire format: a ceph_le32 count
 * followed by 'count' ceph_le32 osd ids.  This avoids a separate heap
 * allocation per entry.  The "#else" branch is the trivial
 * map-of-vectors implementation, kept for reference.
 */
struct PGTempMap {
#if 1
  ceph::buffer::list data;   // flattened, wire-format entries
  typedef btree::btree_map<pg_t,ceph_le32*> map_t;
  map_t map;                 // pg -> pointer to its entry within 'data'

  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    uint32_t n = map.size();
    encode(n, bl);
    for (auto &p : map) {
      encode(p.first, bl);
      // entry is already in wire format: count word + 'count' ids
      bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32));
    }
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    using ceph::decode;
    data.clear();
    map.clear();
    uint32_t n;
    decode(n, p);
    if (!n)
      return;
    auto pstart = p;
    size_t start_off = pstart.get_off();
    // first pass: record each pg and the offset of its osd array
    std::vector<std::pair<pg_t,size_t>> offsets;
    offsets.resize(n);
    for (unsigned i=0; i<n; ++i) {
      pg_t pgid;
      decode(pgid, p);
      offsets[i].first = pgid;
      offsets[i].second = p.get_off() - start_off;
      uint32_t vn;
      decode(vn, p);
      p += vn * sizeof(int32_t);  // skip over the osd ids themselves
    }
    // copy the whole span into our own buffer and force it contiguous,
    // so the raw pointers below stay valid
    size_t len = p.get_off() - start_off;
    pstart.copy(len, data);
    if (data.get_num_buffers() > 1) {
      data.rebuild();
    }
    //map.reserve(n);
    // second pass: point the index entries into the contiguous buffer
    char *start = data.c_str();
    for (auto i : offsets) {
      map.insert(map.end(), std::make_pair(i.first, (ceph_le32*)(start + i.second)));
    }
  }
  // Re-encode and decode in place; compacts 'data' (stale bytes left by
  // erase()/set() overwrites are dropped).
  void rebuild() {
    ceph::buffer::list bl;
    encode(bl);
    auto p = std::cbegin(bl);
    decode(p);
  }
  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
    return
      l.map.size() == r.map.size() &&
      l.data.contents_equal(r.data);
  }

  /// const iterator that materializes each entry into a
  /// pair<pg_t, vector<int32_t>> as it is visited
  class iterator {
    map_t::const_iterator it;
    map_t::const_iterator end;
    std::pair<pg_t,std::vector<int32_t>> current;
    // decode the entry 'it' points at into 'current' (no-op at end)
    void init_current() {
      if (it != end) {
	current.first = it->first;
	ceph_assert(it->second);
	current.second.resize(*it->second);
	ceph_le32 *p = it->second + 1;  // skip the count word
	for (uint32_t n = 0; n < *it->second; ++n, ++p) {
	  current.second[n] = *p;
	}
      }
    }
  public:
    iterator(map_t::const_iterator p,
	     map_t::const_iterator e)
      : it(p), end(e) {
      init_current();
    }

    const std::pair<pg_t,std::vector<int32_t>>& operator*() const {
      return current;
    }
    const std::pair<pg_t,std::vector<int32_t>>* operator->() const {
      return &current;
    }
    friend bool operator==(const iterator& l, const iterator& r) {
      return l.it == r.it;
    }
    friend bool operator!=(const iterator& l, const iterator& r) {
      return l.it != r.it;
    }
    iterator& operator++() {
      ++it;
      if (it != end)
	init_current();
      return *this;
    }
    iterator operator++(int) {
      iterator r = *this;
      ++it;
      if (it != end)
	init_current();
      return r;
    }
  };
  iterator begin() const {
    return iterator(map.begin(), map.end());
  }
  iterator end() const {
    return iterator(map.end(), map.end());
  }
  iterator find(pg_t pgid) const {
    return iterator(map.find(pgid), map.end());
  }
  size_t size() const {
    return map.size();
  }
  size_t count(pg_t pgid) const {
    return map.count(pgid);
  }
  void erase(pg_t pgid) {
    // only drops the index entry; the encoded bytes stay in 'data'
    // until the next rebuild()/decode()
    map.erase(pgid);
  }
  void clear() {
    map.clear();
    data.clear();
  }
  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
    using ceph::encode;
    size_t need = sizeof(ceph_le32) * (1 + v.size());
    // NOTE(review): when the new entry would fit in the append buffer's
    // unused tail, the tail is zero-filled first -- apparently to push
    // the encode below into a fresh buffer so data.back() is that fresh
    // buffer and the pointer arithmetic beneath is valid.  Confirm the
    // '<' (rather than '>=') is intentional.
    if (need < data.get_append_buffer_unused_tail_length()) {
      ceph::buffer::ptr z(data.get_append_buffer_unused_tail_length());
      z.zero();
      data.append(z.c_str(), z.length());
    }
    encode(v, data);
    // index the start of the just-appended wire-format entry
    map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
  }
  mempool::osdmap::vector<int32_t> get(pg_t pgid) {
    mempool::osdmap::vector<int32_t> v;
    ceph_le32 *p = map[pgid];
    size_t n = *p++;  // count word precedes the ids
    v.resize(n);
    for (size_t i = 0; i < n; ++i, ++p) {
      v[i] = *p;
    }
    return v;
  }
#else
  // trivial implementation
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;

  void encode(ceph::buffer::list& bl) const {
    encode(pg_temp, bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    decode(pg_temp, p);
  }
  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
    return
      l.pg_temp.size() == r.pg_temp.size() &&
      l.pg_temp == r.pg_temp;
  }

  class iterator {
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
  public:
    iterator(mempool::osdmap::map<pg_t,
	     mempool::osdmap::vector<int32_t> >::const_iterator p)
      : it(p) {}

    std::pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
      return *it;
    }
    const std::pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
      return &*it;
    }
    friend bool operator==(const iterator& l, const iterator& r) {
      return l.it == r.it;
    }
    friend bool operator!=(const iterator& l, const iterator& r) {
      return l.it != r.it;
    }
    iterator& operator++() {
      ++it;
      return *this;
    }
    iterator operator++(int) {
      iterator r = *this;
      ++it;
      return r;
    }
  };
  iterator begin() const {
    return iterator(pg_temp.cbegin());
  }
  iterator end() const {
    return iterator(pg_temp.cend());
  }
  iterator find(pg_t pgid) const {
    return iterator(pg_temp.find(pgid));
  }
  size_t size() const {
    return pg_temp.size();
  }
  size_t count(pg_t pgid) const {
    return pg_temp.count(pgid);
  }
  void erase(pg_t pgid) {
    pg_temp.erase(pgid);
  }
  void clear() {
    pg_temp.clear();
  }
  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
    pg_temp[pgid] = v;
  }
  const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
    return pg_temp.at(pgid);
  }
#endif
  /// dump as an array of { pgid, osds[] } objects
  void dump(ceph::Formatter *f) const {
    for (const auto &pg : *this) {
      f->open_object_section("osds");
      f->dump_stream("pgid") << pg.first;
      f->open_array_section("osds");
      for (const auto osd : pg.second)
	f->dump_int("osd", osd);
      f->close_section();
      f->close_section();
    }
  }
};
WRITE_CLASS_ENCODER(PGTempMap)
347
348 /** OSDMap
349 */
350 class OSDMap {
351 public:
352 MEMPOOL_CLASS_HELPERS();
353
  /**
   * Incremental -- a delta taking the cluster map from epoch-1 to
   * epoch.  It either carries a complete encoded map ('fullmap') or the
   * individual new_/old_ change sets below.
   */
  class Incremental {
  public:
    MEMPOOL_CLASS_HELPERS();

    /// feature bits we were encoded with.  the subsequent OSDMap
    /// encoding should match.
    uint64_t encode_features;
    uuid_d fsid;
    epoch_t epoch;   // new epoch; we are a diff from epoch-1 to epoch
    utime_t modified;
    int64_t new_pool_max; //incremented by the OSDMonitor on each pool create
    int32_t new_flags;
    ceph_release_t new_require_osd_release{0xff};
    uint32_t new_stretch_bucket_count{0};
    uint32_t new_degraded_stretch_mode{0};
    uint32_t new_recovering_stretch_mode{0};
    int32_t new_stretch_mode_bucket{0};
    bool stretch_mode_enabled{false};
    bool change_stretch_mode{false};

    // full (rare)
    ceph::buffer::list fullmap;  // in lieu of below.
    ceph::buffer::list crush;

    // incremental
    int32_t new_max_osd;
    mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
    mempool::osdmap::map<int64_t,std::string> new_pool_names;
    mempool::osdmap::set<int64_t> old_pools;
    mempool::osdmap::map<std::string,std::map<std::string,std::string> > new_erasure_code_profiles;
    mempool::osdmap::vector<std::string> old_erasure_code_profiles;
    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
    mempool::osdmap::map<int32_t,uint32_t> new_state;             // XORed onto previous state.
    mempool::osdmap::map<int32_t,uint32_t> new_weight;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp;  // [] to remove
    mempool::osdmap::map<pg_t, int32_t> new_primary_temp;  // [-1] to remove
    mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
    mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
    mempool::osdmap::map<int32_t,std::pair<epoch_t,epoch_t> > new_last_clean_interval;
    mempool::osdmap::map<int32_t,epoch_t> new_lost;
    mempool::osdmap::map<int32_t,uuid_d> new_uuid;
    mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;

    mempool::osdmap::map<entity_addr_t,utime_t> new_blocklist;
    mempool::osdmap::vector<entity_addr_t> old_blocklist;
    mempool::osdmap::map<entity_addr_t,utime_t> new_range_blocklist;
    mempool::osdmap::vector<entity_addr_t> old_range_blocklist;
    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;

    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> new_pg_upmap_items;
    mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
    mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
    mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;

    mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags;
    mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags;

    std::string cluster_snapshot;

    float new_nearfull_ratio = -1;
    float new_backfillfull_ratio = -1;
    float new_full_ratio = -1;

    ceph_release_t new_require_min_compat_client{0xff};

    utime_t new_last_up_change, new_last_in_change;

    mutable bool have_crc;      ///< crc values are defined
    uint32_t full_crc;  ///< crc of the resulting OSDMap
    mutable uint32_t inc_crc;   ///< crc of this incremental

    int get_net_marked_out(const OSDMap *previous) const;
    int get_net_marked_down(const OSDMap *previous) const;
    int identify_osd(uuid_d u) const;

    void encode_client_old(ceph::buffer::list& bl) const;
    void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
    void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
    void decode_classic(ceph::buffer::list::const_iterator &p);
    void decode(ceph::buffer::list::const_iterator &bl);
    void dump(ceph::Formatter *f) const;
    static void generate_test_instances(std::list<Incremental*>& o);

    explicit Incremental(epoch_t e=0) :
      encode_features(0),
      epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
      have_crc(false), full_crc(0), inc_crc(0) {
    }
    explicit Incremental(ceph::buffer::list &bl) {
      auto p = std::cbegin(bl);
      decode(p);
    }
    explicit Incremental(ceph::buffer::list::const_iterator &p) {
      decode(p);
    }

    /// get a mutable pool entry, seeding it from *orig on first access
    pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
      if (new_pools.count(pool) == 0)
	new_pools[pool] = *orig;
      return &new_pools[pool];
    }
    bool has_erasure_code_profile(const std::string &name) const {
      auto i = new_erasure_code_profiles.find(name);
      return i != new_erasure_code_profiles.end();
    }
    void set_erasure_code_profile(const std::string &name,
				  const std::map<std::string,std::string>& profile) {
      new_erasure_code_profiles[name] = profile;
    }
    // NOTE(review): returns by value; callers pay a copy of the whole map
    mempool::osdmap::map<std::string,std::map<std::string,std::string>> get_erasure_code_profiles() const {
      return new_erasure_code_profiles;
    }

    /// propagate update pools' (snap and other) metadata to any of their tiers
    int propagate_base_properties_to_tiers(CephContext *cct, const OSDMap &base);

    /// filter out osds with any pending state changing
    size_t get_pending_state_osds(std::vector<int> *osds) {
      ceph_assert(osds);
      osds->clear();

      for (auto &p : new_state) {
	osds->push_back(p.first);
      }

      return osds->size();
    }

    bool pending_osd_has_state(int osd, unsigned state) {
      return new_state.count(osd) && (new_state[osd] & state) != 0;
    }

    /// set a pending state bit; returns false if it was already pending
    bool pending_osd_state_set(int osd, unsigned state) {
      if (pending_osd_has_state(osd, state))
	return false;
      new_state[osd] |= state;
      return true;
    }

    // cancel the specified pending osd state if there is any
    // return true on success, false otherwise.
    bool pending_osd_state_clear(int osd, unsigned state) {
      if (!pending_osd_has_state(osd, state)) {
	// never has been set or already has been cancelled.
	return false;
      }

      new_state[osd] &= ~state;
      if (!new_state[osd]) {
	// all flags cleared
	new_state.erase(osd);
      }
      return true;
    }

    /// true if this delta adds 'snap' to pool 'pool's removed-snaps set
    bool in_new_removed_snaps(int64_t pool, snapid_t snap) const {
      auto p = new_removed_snaps.find(pool);
      if (p == new_removed_snaps.end()) {
	return false;
      }
      return p->second.contains(snap);
    }
  };
520
521 private:
522 uuid_d fsid;
523 epoch_t epoch; // what epoch of the osd cluster descriptor is this
524 utime_t created, modified; // epoch start time
525 int32_t pool_max; // the largest pool num, ever
526
527 uint32_t flags;
528
529 int num_osd; // not saved; see calc_num_osds
530 int num_up_osd; // not saved; see calc_num_osds
531 int num_in_osd; // not saved; see calc_num_osds
532
533 int32_t max_osd;
534 std::vector<uint32_t> osd_state;
535
536 mempool::osdmap::map<int32_t,uint32_t> crush_node_flags; // crush node -> CEPH_OSD_* flags
537 mempool::osdmap::map<int32_t,uint32_t> device_class_flags; // device class -> CEPH_OSD_* flags
538
539 utime_t last_up_change, last_in_change;
540
541 // These features affect OSDMap[::Incremental] encoding, or the
542 // encoding of some type embedded therein (CrushWrapper, something
543 // from osd_types, etc.).
544 static constexpr uint64_t SIGNIFICANT_FEATURES =
545 CEPH_FEATUREMASK_PGID64 |
546 CEPH_FEATUREMASK_PGPOOL3 |
547 CEPH_FEATUREMASK_OSDENC |
548 CEPH_FEATUREMASK_OSDMAP_ENC |
549 CEPH_FEATUREMASK_OSD_POOLRESEND |
550 CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
551 CEPH_FEATUREMASK_MSG_ADDR2 |
552 CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
553 CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
554 CEPH_FEATUREMASK_SERVER_LUMINOUS |
555 CEPH_FEATUREMASK_SERVER_MIMIC |
556 CEPH_FEATUREMASK_SERVER_NAUTILUS |
557 CEPH_FEATUREMASK_SERVER_OCTOPUS;
558
559 struct addrs_s {
560 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs;
561 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs;
562 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs;
563 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs;
564 };
565 std::shared_ptr<addrs_s> osd_addrs;
566
567 entity_addrvec_t _blank_addrvec;
568
569 mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
570 mempool::osdmap::vector<osd_info_t> osd_info;
571 std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
572 std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
573 std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
574
575 // remap (post-CRUSH, pre-up)
576 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
577 mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
578
579 mempool::osdmap::map<int64_t,pg_pool_t> pools;
580 mempool::osdmap::map<int64_t,std::string> pool_name;
581 mempool::osdmap::map<std::string, std::map<std::string,std::string>> erasure_code_profiles;
582 mempool::osdmap::map<std::string,int64_t, std::less<>> name_pool;
583
584 std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
585 mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;
586
  /// Pre-parsed form of a range-blocklist entry (address + mask),
  /// presumably so matches() can test addresses without re-parsing the
  /// entity_addr_t each time -- see OSDMap.cc for the definitions.
  class range_bits {
    struct ip6 {
      uint64_t upper_64_bits, lower_64_bits;
      uint64_t upper_mask, lower_mask;
    };
    struct ip4 {
      uint32_t ip_32_bits;
      uint32_t mask;
    };
    union {
      ip6 ipv6;
      ip4 ipv4;
    } bits;
    bool ipv6;  // discriminates which member of 'bits' is active
    static void get_ipv6_bytes(unsigned const char *addr,
			       uint64_t *upper, uint64_t *lower);
  public:
    range_bits();
    range_bits(const entity_addr_t& addr);  // NOTE(review): non-explicit single-arg ctor
    void parse(const entity_addr_t& addr);
    bool matches(const entity_addr_t& addr) const;
  };
609 mempool::osdmap::unordered_map<entity_addr_t,utime_t> blocklist;
610 mempool::osdmap::map<entity_addr_t,utime_t> range_blocklist;
611 mempool::osdmap::map<entity_addr_t,range_bits> calculated_ranges;
612
613 /// queue of snaps to remove
614 mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue;
615
616 /// removed_snaps additions this epoch
617 mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
618
619 /// removed_snaps removals this epoch
620 mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
621
622 epoch_t cluster_snapshot_epoch;
623 std::string cluster_snapshot;
624 bool new_blocklist_entries;
625
626 float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
627
628 /// min compat client we want to support
629 ceph_release_t require_min_compat_client{ceph_release_t::unknown};
630
631 public:
632 /// require osds to run at least this release
633 ceph_release_t require_osd_release{ceph_release_t::unknown};
634
635 private:
636 mutable uint64_t cached_up_osd_features;
637
638 mutable bool crc_defined;
639 mutable uint32_t crc;
640
641 void _calc_up_osd_features();
642
643 public:
644 bool have_crc() const { return crc_defined; }
645 uint32_t get_crc() const { return crc; }
646
647 std::shared_ptr<CrushWrapper> crush; // hierarchical map
648 bool stretch_mode_enabled; // we are in stretch mode, requiring multiple sites
649 uint32_t stretch_bucket_count; // number of sites we expect to be in
650 uint32_t degraded_stretch_mode; // 0 if not degraded; else count of up sites
651 uint32_t recovering_stretch_mode; // 0 if not recovering; else 1
652 int32_t stretch_mode_bucket; // the bucket type we're stretched across
653 private:
654 uint32_t crush_version = 1;
655
656 friend class OSDMonitor;
657
658 public:
  // Default-construct an empty map at epoch 0.  The shared_ptr-held
  // tables (osd_addrs, pg_temp, primary_temp, osd_uuid, crush) are
  // always allocated, never null; osd_primary_affinity is the exception
  // and is created lazily by set_primary_affinity().
  OSDMap() : epoch(0),
	     pool_max(0),
	     flags(0),
	     num_osd(0), num_up_osd(0), num_in_osd(0),
	     max_osd(0),
	     osd_addrs(std::make_shared<addrs_s>()),
	     pg_temp(std::make_shared<PGTempMap>()),
	     primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
	     osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
	     cluster_snapshot_epoch(0),
	     new_blocklist_entries(false),
	     cached_up_osd_features(0),
	     crc_defined(false), crc(0),
	     crush(std::make_shared<CrushWrapper>()),
	     stretch_mode_enabled(false), stretch_bucket_count(0),
	     degraded_stretch_mode(0), recovering_stretch_mode(0), stretch_mode_bucket(0) {
  }
676
677 private:
678 OSDMap(const OSDMap& other) = default;
679 OSDMap& operator=(const OSDMap& other) = default;
680 public:
681
682 /// return feature mask subset that is relevant to OSDMap encoding
683 static uint64_t get_significant_features(uint64_t features) {
684 return SIGNIFICANT_FEATURES & features;
685 }
686
687 uint64_t get_encoding_features() const;
688
689 void deepish_copy_from(const OSDMap& o) {
690 *this = o;
691 primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
692 pg_temp.reset(new PGTempMap(*o.pg_temp));
693 osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));
694
695 if (o.osd_primary_affinity)
696 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));
697
698 // NOTE: this still references shared entity_addrvec_t's.
699 osd_addrs.reset(new addrs_s(*o.osd_addrs));
700
701 // NOTE: we do not copy crush. note that apply_incremental will
702 // allocate a new CrushWrapper, though.
703 }
704
  // map info
  const uuid_d& get_fsid() const { return fsid; }
  // NOTE(review): takes a non-const reference but only reads it; could be const&
  void set_fsid(uuid_d& f) { fsid = f; }

  epoch_t get_epoch() const { return epoch; }
  void inc_epoch() { epoch++; }

  void set_epoch(epoch_t e);

  uint32_t get_crush_version() const {
    return crush_version;
  }

  /* stamps etc */
  const utime_t& get_created() const { return created; }
  const utime_t& get_modified() const { return modified; }

  bool is_blocklisted(const entity_addr_t& a, CephContext *cct=nullptr) const;
  bool is_blocklisted(const entity_addrvec_t& a, CephContext *cct=nullptr) const;
  void get_blocklist(std::list<std::pair<entity_addr_t,utime_t > > *bl,
		     std::list<std::pair<entity_addr_t,utime_t> > *rl) const;
  void get_blocklist(std::set<entity_addr_t> *bl,
		     std::set<entity_addr_t> *rl) const;

  /// the snapshot name, but only if it was taken at this very epoch;
  /// otherwise an empty string
  std::string get_cluster_snapshot() const {
    if (cluster_snapshot_epoch == epoch)
      return cluster_snapshot;
    return std::string();
  }
750
751
  /***** cluster state *****/
  /* osds */
  int get_max_osd() const { return max_osd; }
  void set_max_osd(int m);

  // the three counters below are cached; see calc_num_osds()
  unsigned get_num_osds() const {
    return num_osd;
  }
  unsigned get_num_up_osds() const {
    return num_up_osd;
  }
  unsigned get_num_in_osds() const {
    return num_in_osd;
  }
  /// recalculate cached values for get_num{,_up,_in}_osds
  int calc_num_osds();

  void get_all_osds(std::set<int32_t>& ls) const;
  void get_up_osds(std::set<int32_t>& ls) const;
  void get_out_existing_osds(std::set<int32_t>& ls) const;
  unsigned get_num_pg_temp() const {
    return pg_temp->size();
  }

  int get_flags() const { return flags; }
  bool test_flag(int f) const { return flags & f; }
  void set_flag(int f) { flags |= f; }
  void clear_flag(int f) { flags &= ~f; }

  void get_flag_set(std::set<std::string> *flagset) const;

  static void calc_state_set(int state, std::set<std::string>& st);

  // NOTE(review): the asserts below check only the upper bound; a
  // negative osd id would index out of range (cf. exists(), which does
  // check osd >= 0) -- callers are expected to pass valid ids.
  int get_state(int o) const {
    ceph_assert(o < max_osd);
    return osd_state[o];
  }
  int get_state(int o, std::set<std::string>& st) const {
    ceph_assert(o < max_osd);
    unsigned t = osd_state[o];
    calc_state_set(t, st);
    return osd_state[o];
  }
  void set_state(int o, unsigned s) {
    ceph_assert(o < max_osd);
    osd_state[o] = s;
  }
  void set_weight(int o, unsigned w) {
    ceph_assert(o < max_osd);
    osd_weight[o] = w;
    if (w)
      osd_state[o] |= CEPH_OSD_EXISTS;  // any nonzero weight implies existence
  }
  unsigned get_weight(int o) const {
    ceph_assert(o < max_osd);
    return osd_weight[o];
  }
  /// weight as a float: 16.16 fixed point, so 0x10000 (CEPH_OSD_IN) -> 1.0
  float get_weightf(int o) const {
    return (float)get_weight(o) / (float)CEPH_OSD_IN;
  }
  void adjust_osd_weights(const std::map<int,double>& weights, Incremental& inc) const;

  void set_primary_affinity(int o, int w) {
    ceph_assert(o < max_osd);
    // the affinity table is created lazily on first write, with every
    // osd at the default affinity
    if (!osd_primary_affinity)
      osd_primary_affinity.reset(
	new mempool::osdmap::vector<__u32>(
	  max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
    (*osd_primary_affinity)[o] = w;
  }
  unsigned get_primary_affinity(int o) const {
    ceph_assert(o < max_osd);
    if (!osd_primary_affinity)
      return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
    return (*osd_primary_affinity)[o];
  }
  float get_primary_affinityf(int o) const {
    return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
  }
831
  bool has_erasure_code_profile(const std::string &name) const {
    auto i = erasure_code_profiles.find(name);
    return i != erasure_code_profiles.end();
  }
  int get_erasure_code_profile_default(CephContext *cct,
				       std::map<std::string,std::string> &profile_map,
				       std::ostream *ss);
  /// insert or overwrite the named profile
  void set_erasure_code_profile(const std::string &name,
				const std::map<std::string,std::string>& profile) {
    erasure_code_profiles[name] = profile;
  }
  /// look up a profile; returns a reference to a static empty map when
  /// the name is unknown (so the reference is always valid)
  const std::map<std::string,std::string> &get_erasure_code_profile(
    const std::string &name) const {
    static std::map<std::string,std::string> empty;
    auto i = erasure_code_profiles.find(name);
    if (i == erasure_code_profiles.end())
      return empty;
    else
      return i->second;
  }
  const mempool::osdmap::map<std::string,std::map<std::string,std::string>> &get_erasure_code_profiles() const {
    return erasure_code_profiles;
  }
855
  bool exists(int osd) const {
    //assert(osd >= 0);
    return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
  }

  bool is_destroyed(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
  }

  bool is_up(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
  }

  bool has_been_up_since(int osd, epoch_t epoch) const {
    return is_up(osd) && get_up_from(osd) <= epoch;
  }

  // note: a nonexistent osd is "down"
  bool is_down(int osd) const {
    return !is_up(osd);
  }

  bool is_stop(int osd) const {
    return exists(osd) && is_down(osd) &&
	   (osd_state[osd] & CEPH_OSD_STOP);
  }

  // note: a nonexistent osd is "out"
  bool is_out(int osd) const {
    return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
  }

  bool is_in(int osd) const {
    return !is_out(osd);
  }

  /// "dead" = confirmed dead since it last came up; stronger than "down"
  bool is_dead(int osd) const {
    if (!exists(osd)) {
      return false; // unclear if they know they are removed from map
    }
    return get_xinfo(osd).dead_epoch > get_info(osd).up_from;
  }

  unsigned get_osd_crush_node_flags(int osd) const;
  unsigned get_crush_node_flags(int id) const;
  unsigned get_device_class_flags(int id) const;

  bool is_noup_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
  }

  bool is_nodown_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
  }

  bool is_noin_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
  }

  bool is_noout_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
  }

  // Each is_noX() below honors the flag at any of four scopes, checked
  // in order: the whole map, the individual osd, any containing crush
  // node, and the osd's device class.
  bool is_noup(int osd) const {
    if (test_flag(CEPH_OSDMAP_NOUP)) // global?
      return true;
    if (is_noup_by_osd(osd)) // by osd?
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node?
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
	get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class?
      return true;
    return false;
  }

  bool is_nodown(int osd) const {
    if (test_flag(CEPH_OSDMAP_NODOWN))
      return true;
    if (is_nodown_by_osd(osd))
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN)
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
	get_device_class_flags(class_id) & CEPH_OSD_NODOWN)
      return true;
    return false;
  }

  bool is_noin(int osd) const {
    if (test_flag(CEPH_OSDMAP_NOIN))
      return true;
    if (is_noin_by_osd(osd))
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN)
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
	get_device_class_flags(class_id) & CEPH_OSD_NOIN)
      return true;
    return false;
  }

  bool is_noout(int osd) const {
    if (test_flag(CEPH_OSDMAP_NOOUT))
      return true;
    if (is_noout_by_osd(osd))
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT)
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
	get_device_class_flags(class_id) & CEPH_OSD_NOOUT)
      return true;
    return false;
  }
968
  /**
   * check if an entire crush subtree is down
   */
  bool subtree_is_down(int id, std::set<int> *down_cache) const;
  bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, std::set<int> *down_cache) const;

  bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, std::set<int> *down_in_osds, std::set<int> *up_in_osds,
                            std::set<int> *subtree_up, std::unordered_map<int, std::set<int> > *subtree_type_down) const;

  // look up an osd by address or uuid; a negative return means "not
  // found" (see have_addr() below, which tests identify_osd() >= 0)
  int identify_osd(const entity_addr_t& addr) const;
  int identify_osd(const uuid_d& u) const;
  int identify_osd_on_all_channels(const entity_addr_t& addr) const;

  bool have_addr(const entity_addr_t& addr) const {
    return identify_osd(addr) >= 0;
  }
  int find_osd_on_ip(const entity_addr_t& ip) const;
986
  // Accessors for the per-osd address vectors.  Each returns the
  // recorded addrvec, or _blank_addrvec when none is recorded for the
  // (existing) osd.
  const entity_addrvec_t& get_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->client_addrs[osd] ?
      *osd_addrs->client_addrs[osd] : _blank_addrvec;
  }
  const entity_addrvec_t& get_most_recent_addrs(int osd) const {
    // currently a straight alias for get_addrs()
    return get_addrs(osd);
  }
  const entity_addrvec_t &get_cluster_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->cluster_addrs[osd] ?
      *osd_addrs->cluster_addrs[osd] : _blank_addrvec;
  }
  const entity_addrvec_t &get_hb_back_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->hb_back_addrs[osd] ?
      *osd_addrs->hb_back_addrs[osd] : _blank_addrvec;
  }
  const entity_addrvec_t &get_hb_front_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->hb_front_addrs[osd] ?
      *osd_addrs->hb_front_addrs[osd] : _blank_addrvec;
  }
1010
  const uuid_d& get_uuid(int osd) const {
    ceph_assert(exists(osd));
    return (*osd_uuid)[osd];
  }

  // liveness-interval epochs (see the interval discussion at the top
  // of this file)
  const epoch_t& get_up_from(int osd) const {
    ceph_assert(exists(osd));
    return osd_info[osd].up_from;
  }
  const epoch_t& get_up_thru(int osd) const {
    ceph_assert(exists(osd));
    return osd_info[osd].up_thru;
  }
  const epoch_t& get_down_at(int osd) const {
    ceph_assert(exists(osd));
    return osd_info[osd].down_at;
  }
  const osd_info_t& get_info(int osd) const {
    // NOTE(review): unlike the accessors above, this (and get_xinfo)
    // only bounds-checks the upper end rather than requiring
    // exists(osd) -- presumably intentional; confirm before tightening
    ceph_assert(osd < max_osd);
    return osd_info[osd];
  }

  const osd_xinfo_t& get_xinfo(int osd) const {
    ceph_assert(osd < max_osd);
    return osd_xinfo[osd];
  }
1037
1038 int get_next_up_osd_after(int n) const {
1039 if (get_max_osd() == 0)
1040 return -1;
1041 for (int i = n + 1; i != n; ++i) {
1042 if (i >= get_max_osd())
1043 i = 0;
1044 if (i == n)
1045 break;
1046 if (is_up(i))
1047 return i;
1048 }
1049 return -1;
1050 }
1051
1052 int get_previous_up_osd_before(int n) const {
1053 if (get_max_osd() == 0)
1054 return -1;
1055 for (int i = n - 1; i != n; --i) {
1056 if (i < 0)
1057 i = get_max_osd() - 1;
1058 if (i == n)
1059 break;
1060 if (is_up(i))
1061 return i;
1062 }
1063 return -1;
1064 }
1065
1066
  // pick up to 'limit' random up osds under the named subtree type,
  // excluding 'n' (whoami) and anything in 'skip'; results go in *want
  void get_random_up_osds_by_subtree(int n,     // whoami
                                     std::string &subtree,
                                     int limit, // how many
                                     std::set<int> skip,
                                     std::set<int> *want) const;

  /**
   * get feature bits required by the current structure
   *
   * @param entity_type [in] what entity type we are asking about
   * @param mask [out] set of all possible map-related features we could set
   * @return feature bits used by this map
   */
  uint64_t get_features(int entity_type, uint64_t *mask) const;

  /**
   * get oldest *client* version (firefly, hammer, etc.) that can connect given
   * the feature bits required (according to get_features()).
   */
  ceph_release_t get_min_compat_client() const;

  /**
   * gets the required minimum *client* version that can connect to the cluster.
   */
  ceph_release_t get_require_min_compat_client() const;

  /**
   * get intersection of features supported by up osds
   */
  uint64_t get_up_osd_features() const;

  // collect the pgs that currently carry upmap entries
  void get_upmap_pgs(std::vector<pg_t> *upmap_pgs) const;
  // examine the pg upmaps in 'to_check'; entries to drop go to
  // *to_cancel, entries to rewrite to *to_remap (see OSDMap.cc)
  bool check_pg_upmaps(
    CephContext *cct,
    const std::vector<pg_t>& to_check,
    std::vector<pg_t> *to_cancel,
    std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> *to_remap) const;
  // apply the result of check_pg_upmaps() to pending_inc
  void clean_pg_upmaps(
    CephContext *cct,
    Incremental *pending_inc,
    const std::vector<pg_t>& to_cancel,
    const std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>>& to_remap) const;
  bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const;

  int apply_incremental(const Incremental &inc);

  /// try to re-use/reference addrs in oldmap from newmap
  static void dedup(const OSDMap *oldmap, OSDMap *newmap);

  static void clean_temps(CephContext *cct,
                          const OSDMap& oldmap,
                          const OSDMap& nextmap,
                          Incremental *pending_inc);
1120
  // serialize, unserialize
private:
  // legacy ('classic'/old-client) wire encodings -- see OSDMap.cc
  void encode_client_old(ceph::buffer::list& bl) const;
  void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
  void decode_classic(ceph::buffer::list::const_iterator& p);
  void post_decode();
public:
  void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
  void decode(ceph::buffer::list& bl);
  void decode(ceph::buffer::list::const_iterator& bl);
1131
1132
  /**** mapping facilities ****/
  // object name/key/namespace within a pool -> pg; 0 on success (the
  // inline wrapper below asserts the nonzero error case away)
  int map_to_pg(
    int64_t pool,
    const std::string& name,
    const std::string& key,
    const std::string& nspace,
    pg_t *pg) const;
  int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
                           pg_t &pg) const;
  pg_t object_locator_to_pg(const object_t& oid,
                            const object_locator_t& loc) const {
    // convenience wrapper: asserts success instead of returning an error
    pg_t pg;
    int ret = object_locator_to_pg(oid, loc, pg);
    ceph_assert(ret == 0);
    return pg;
  }
1149
1150
  // file-layout -> object placement helpers
  static object_locator_t file_to_object_locator(const file_layout_t& layout) {
    return object_locator_t(layout.pool_id, layout.pool_ns);
  }

  ceph_object_layout file_to_object_layout(object_t oid,
                                           file_layout_t& layout) const {
    return make_object_layout(oid, layout.pool_id, layout.pool_ns);
  }

  ceph_object_layout make_object_layout(object_t oid, int pg_pool,
                                        std::string nspace) const;
1162
1163 int get_pg_num(int pg_pool) const
1164 {
1165 const pg_pool_t *pool = get_pg_pool(pg_pool);
1166 ceph_assert(NULL != pool);
1167 return pool->get_pg_num();
1168 }
1169
1170 bool pg_exists(pg_t pgid) const {
1171 const pg_pool_t *p = get_pg_pool(pgid.pool());
1172 return p && pgid.ps() < p->get_pg_num();
1173 }
1174
1175 int get_pg_pool_min_size(pg_t pgid) const {
1176 if (!pg_exists(pgid)) {
1177 return -ENOENT;
1178 }
1179 const pg_pool_t *p = get_pg_pool(pgid.pool());
1180 ceph_assert(p);
1181 return p->get_min_size();
1182 }
1183
1184 int get_pg_pool_size(pg_t pgid) const {
1185 if (!pg_exists(pgid)) {
1186 return -ENOENT;
1187 }
1188 const pg_pool_t *p = get_pg_pool(pgid.pool());
1189 ceph_assert(p);
1190 return p->get_size();
1191 }
1192
1193 int get_pg_pool_crush_rule(pg_t pgid) const {
1194 if (!pg_exists(pgid)) {
1195 return -ENOENT;
1196 }
1197 const pg_pool_t *p = get_pg_pool(pgid.pool());
1198 ceph_assert(p);
1199 return p->get_crush_rule();
1200 }
1201
private:
  /// pg -> (raw osd list)
  void _pg_to_raw_osds(
    const pg_pool_t& pool, pg_t pg,
    std::vector<int> *osds,
    ps_t *ppps) const;
  /// pick the primary out of a raw osd list
  int _pick_primary(const std::vector<int>& osds) const;
  void _remove_nonexistent_osds(const pg_pool_t& pool, std::vector<int>& osds) const;

  /// apply per-osd primary-affinity adjustments to a raw mapping
  void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
                               std::vector<int> *osds, int *primary) const;

  /// apply pg_upmap[_items] mappings
  void _apply_upmap(const pg_pool_t& pi, pg_t pg, std::vector<int> *raw) const;

  /// pg -> (up osd list)
  void _raw_to_up_osds(const pg_pool_t& pool, const std::vector<int>& raw,
                       std::vector<int> *up) const;


  /**
   * Get the pg and primary temp, if they are specified.
   * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
   * @param temp_primary [out] Will be the value in primary_temp, or a value derived
   * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
   */
  void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
                      std::vector<int> *temp_pg, int *temp_primary) const;

  /**
   * map to up and acting. Fills in whatever fields are non-NULL.
   */
  void _pg_to_up_acting_osds(const pg_t& pg, std::vector<int> *up, int *up_primary,
                             std::vector<int> *acting, int *acting_primary,
                             bool raw_pg_to_pg = true) const;
1237
public:
  /***
   * This is suitable only for looking at raw CRUSH outputs. It skips
   * applying the temp and up checks and should not be used
   * by anybody for data mapping purposes.
   * raw and primary must be non-NULL
   */
  void pg_to_raw_osds(pg_t pg, std::vector<int> *raw, int *primary) const;
  void pg_to_raw_upmap(pg_t pg, std::vector<int> *raw,
                       std::vector<int> *raw_upmap) const;
  /// map a pg to its acting set
  void pg_to_acting_osds(const pg_t& pg, std::vector<int> *acting,
                         int *acting_primary) const {
    _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
  }
  void pg_to_acting_osds(pg_t pg, std::vector<int>& acting) const {
    return pg_to_acting_osds(pg, &acting, NULL);
  }
  /**
   * This does not apply temp overrides and should not be used
   * by anybody for data mapping purposes. Specify both pointers.
   */
  void pg_to_raw_up(pg_t pg, std::vector<int> *up, int *primary) const;
  /**
   * map a pg to its acting set as well as its up set. You must use
   * the acting set for data mapping purposes, but some users will
   * also find the up set useful for things like deciding what to
   * set as pg_temp.
   * Each of these pointers must be non-NULL.
   */
  void pg_to_up_acting_osds(pg_t pg, std::vector<int> *up, int *up_primary,
                            std::vector<int> *acting, int *acting_primary) const {
    _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
  }
  void pg_to_up_acting_osds(pg_t pg, std::vector<int>& up, std::vector<int>& acting) const {
    int up_primary, acting_primary;
    pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
  }
1276 bool pg_is_ec(pg_t pg) const {
1277 auto i = pools.find(pg.pool());
1278 ceph_assert(i != pools.end());
1279 return i->second.is_erasure();
1280 }
1281 bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
1282 auto i = get_pools().find(pgid.pool());
1283 if (i == get_pools().end()) {
1284 return false;
1285 }
1286 if (!i->second.is_erasure()) {
1287 *out = spg_t(pgid);
1288 return true;
1289 }
1290 int primary;
1291 std::vector<int> acting;
1292 pg_to_acting_osds(pgid, &acting, &primary);
1293 for (uint8_t i = 0; i < acting.size(); ++i) {
1294 if (acting[i] == primary) {
1295 *out = spg_t(pgid, shard_id_t(i));
1296 return true;
1297 }
1298 }
1299 return false;
1300 }
1301 bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
1302 auto i = get_pools().find(pgid.pool());
1303 if (i == get_pools().end()) {
1304 return false;
1305 }
1306 std::vector<int> acting;
1307 pg_to_acting_osds(pgid, &acting, primary);
1308 if (i->second.is_erasure()) {
1309 for (uint8_t i = 0; i < acting.size(); ++i) {
1310 if (acting[i] == *primary) {
1311 *out = spg_t(pgid, shard_id_t(i));
1312 return true;
1313 }
1314 }
1315 } else {
1316 *out = spg_t(pgid);
1317 return true;
1318 }
1319 return false;
1320 }
1321
1322 bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
1323 auto p = removed_snaps_queue.find(pool);
1324 if (p == removed_snaps_queue.end()) {
1325 return false;
1326 }
1327 return p->second.contains(snap);
1328 }
1329
  // snap-removal bookkeeping accessors (pool id -> interval set)
  const mempool::osdmap::map<int64_t,snap_interval_set_t>&
  get_removed_snaps_queue() const {
    return removed_snaps_queue;
  }
  const mempool::osdmap::map<int64_t,snap_interval_set_t>&
  get_new_removed_snaps() const {
    return new_removed_snaps;
  }
  const mempool::osdmap::map<int64_t,snap_interval_set_t>&
  get_new_purged_snaps() const {
    return new_purged_snaps;
  }

  // pool name -> pool id, or -ENOENT if there is no such pool
  int64_t lookup_pg_pool_name(std::string_view name) const {
    auto p = name_pool.find(name);
    if (p == name_pool.end())
      return -ENOENT;
    return p->second;
  }
1349
  int64_t get_pool_max() const {
    return pool_max;
  }
  const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
    return pools;
  }
  mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
    return pools;
  }
  // collect the ids of every pool whose crush rule is rule_id
  void get_pool_ids_by_rule(int rule_id, std::set<int64_t> *pool_ids) const {
    ceph_assert(pool_ids);
    for (auto &p: pools) {
      if (p.second.get_crush_rule() == rule_id) {
	pool_ids->insert(p.first);
      }
    }
  }
  void get_pool_ids_by_osd(CephContext *cct,
                           int osd,
                           std::set<int64_t> *pool_ids) const;
  // pool id -> name; asserts the pool is known
  const std::string& get_pool_name(int64_t p) const {
    auto i = pool_name.find(p);
    ceph_assert(i != pool_name.end());
    return i->second;
  }
  const mempool::osdmap::map<int64_t,std::string>& get_pool_names() const {
    return pool_name;
  }
  bool have_pg_pool(int64_t p) const {
    return pools.count(p);
  }
  // pool id -> pg_pool_t, or NULL when the pool does not exist
  const pg_pool_t* get_pg_pool(int64_t p) const {
    auto i = pools.find(p);
    if (i != pools.end())
      return &i->second;
    return NULL;
  }
  // per-pg pool property helpers; the first two assert that the pool
  // exists, get_pool_crush_rule() returns -ENOENT instead
  unsigned get_pg_size(pg_t pg) const {
    auto p = pools.find(pg.pool());
    ceph_assert(p != pools.end());
    return p->second.get_size();
  }
  int get_pg_type(pg_t pg) const {
    auto p = pools.find(pg.pool());
    ceph_assert(p != pools.end());
    return p->second.get_type();
  }
  int get_pool_crush_rule(int64_t pool_id) const {
    auto pool = get_pg_pool(pool_id);
    if (!pool)
      return -ENOENT;
    return pool->get_crush_rule();
  }
1403
1404
1405 pg_t raw_pg_to_pg(pg_t pg) const {
1406 auto p = pools.find(pg.pool());
1407 ceph_assert(p != pools.end());
1408 return p->second.raw_pg_to_pg(pg);
1409 }
1410
1411 // pg -> acting primary osd
1412 int get_pg_acting_primary(pg_t pg) const {
1413 int primary = -1;
1414 _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
1415 return primary;
1416 }
1417
1418 /*
1419 * check whether an spg_t maps to a particular osd
1420 */
1421 bool is_up_acting_osd_shard(spg_t pg, int osd) const {
1422 std::vector<int> up, acting;
1423 _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
1424 if (calc_pg_role(pg_shard_t(osd, pg.shard), acting) >= 0 ||
1425 calc_pg_role(pg_shard_t(osd, pg.shard), up) >= 0) {
1426 return true;
1427 }
1428 return false;
1429 }
1430
1431
1432 static int calc_pg_role_broken(int osd, const std::vector<int>& acting, int nrep=0);
1433 static int calc_pg_role(pg_shard_t who, const std::vector<int>& acting);
1434 static bool primary_changed_broken(
1435 int oldprimary,
1436 const std::vector<int> &oldacting,
1437 int newprimary,
1438 const std::vector<int> &newacting);
1439
1440 /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
1441 int get_pg_acting_role(spg_t pg, int osd) const {
1442 std::vector<int> group;
1443 pg_to_acting_osds(pg.pgid, group);
1444 return calc_pg_role(pg_shard_t(osd, pg.shard), group);
1445 }
1446
1447 bool try_pg_upmap(
1448 CephContext *cct,
1449 pg_t pg, ///< pg to potentially remap
1450 const std::set<int>& overfull, ///< osds we'd want to evacuate
1451 const std::vector<int>& underfull, ///< osds to move to, in order of preference
1452 const std::vector<int>& more_underfull, ///< less full osds to move to, in order of preference
1453 std::vector<int> *orig,
1454 std::vector<int> *out); ///< resulting alternative mapping
1455
1456 int calc_pg_upmaps(
1457 CephContext *cct,
1458 uint32_t max_deviation, ///< max deviation from target (value >= 1)
1459 int max_iterations, ///< max iterations to run
1460 const std::set<int64_t>& pools, ///< [optional] restrict to pool
1461 Incremental *pending_inc,
1462 std::random_device::result_type *p_seed = nullptr ///< [optional] for regression tests
1463 );
1464
private: // Bunch of internal functions used only by calc_pg_upmaps (result of code refactoring)
  /// gather per-pool pg placement and weight info
  float build_pool_pgs_info (
    CephContext *cct,
    const std::set<int64_t>& pools,   ///< [optional] restrict to pool
    const OSDMap& tmp_osd_map,
    int& total_pgs,
    std::map<int, std::set<pg_t>>& pgs_by_osd,
    std::map<int,float>& osd_weight
  );  // return total weight of all OSDs

  /// compute each osd's deviation from its target pg count
  float calc_deviations (
    CephContext *cct,
    const std::map<int,std::set<pg_t>>& pgs_by_osd,
    const std::map<int,float>& osd_weight,
    float pgs_per_weight,
    std::map<int,float>& osd_deviation,
    std::multimap<float,int>& deviation_osd,
    float& stddev
  );  // return current max deviation

  /// split osds into over/underfull buckets by deviation
  void fill_overfull_underfull (
    CephContext *cct,
    const std::multimap<float,int>& deviation_osd,
    int max_deviation,
    std::set<int>& overfull,
    std::set<int>& more_overfull,
    std::vector<int>& underfull,
    std::vector<int>& more_underfull
  );

  /// record the chosen unmaps/upmaps into pending_inc
  int pack_upmap_results(
    CephContext *cct,
    const std::set<pg_t>& to_unmap,
    const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap,
    OSDMap& tmp_osd_map,
    OSDMap::Incremental *pending_inc
  );

  /// rng for candidate shuffling; seedable for regression tests
  std::default_random_engine get_random_engine(
    CephContext *cct,
    std::random_device::result_type *p_seed
  );

  bool try_drop_remap_overfull(
    CephContext *cct,
    const std::vector<pg_t>& pgs,
    const OSDMap& tmp_osd_map,
    int osd,
    std::map<int,std::set<pg_t>>& temp_pgs_by_osd,
    std::set<pg_t>& to_unmap,
    std::map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap
  );

  /// (pg, upmap item list) candidate pairs considered for remapping
  typedef std::vector<std::pair<pg_t, mempool::osdmap::vector<std::pair<int, int>>>>
    candidates_t;

  bool try_drop_remap_underfull(
    CephContext *cct,
    const candidates_t& candidates,
    int osd,
    std::map<int,std::set<pg_t>>& temp_pgs_by_osd,
    std::set<pg_t>& to_unmap,
    std::map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap
  );

  void add_remap_pair(
    CephContext *cct,
    int orig,
    int out,
    pg_t pg,
    size_t pg_pool_size,
    int osd,
    std::set<int>& existing,
    std::map<int,std::set<pg_t>>& temp_pgs_by_osd,
    mempool::osdmap::vector<std::pair<int32_t,int32_t>> new_upmap_items,
    std::map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap
  );

  int find_best_remap (
    CephContext *cct,
    const std::vector<int>& orig,
    const std::vector<int>& out,
    const std::set<int>& existing,
    const std::map<int,float> osd_deviation
  );

  candidates_t build_candidates(
    CephContext *cct,
    const OSDMap& tmp_osd_map,
    const std::set<pg_t> to_skip,
    const std::set<int64_t>& only_pools,
    bool aggressive,
    std::random_device::result_type *p_seed
  );
1559
public:
  int get_osds_by_bucket_name(const std::string &name, std::set<int> *osds) const;

  // true if pg has either a pg_upmap or a pg_upmap_items entry
  bool have_pg_upmaps(pg_t pg) const {
    return pg_upmap.count(pg) ||
      pg_upmap_items.count(pg);
  }
1567
1568 bool check_full(const std::set<pg_shard_t> &missing_on) const {
1569 for (auto shard : missing_on) {
1570 if (get_state(shard.osd) & CEPH_OSD_FULL)
1571 return true;
1572 }
1573 return false;
1574 }
1575
  /*
   * handy helpers to build simple maps...
   */
  /**
   * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
   * it will be initialized with the specified number of OSDs in a
   * single host. If **num_osd** is < 0 the layout of the OSD map will
   * be built by reading the content of the configuration file.
   *
   * @param cct [in] in core ceph context
   * @param e [in] initial epoch
   * @param fsid [in] id of the cluster
   * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
   * @return **0** on success, negative errno on error.
   */
private:
  int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
                            int num_osd, int pg_bits, int pgp_bits,
                            bool default_pool);
public:
  // build_simple_optioned() without a default pool
  int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
                   int num_osd) {
    return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
  }
  // build_simple_optioned() with a default pool using the given pg bits
  int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
                             int num_osd, int pg_bits, int pgp_bits) {
    return build_simple_optioned(cct, e, fsid, num_osd,
                                 pg_bits, pgp_bits, true);
  }
  static int _build_crush_types(CrushWrapper& crush);
  static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
                                    int num_osd, std::ostream *ss);
  static int build_simple_crush_map_from_conf(CephContext *cct,
                                              CrushWrapper& crush,
                                              std::ostream *ss);
  static int build_simple_crush_rules(
    CephContext *cct, CrushWrapper& crush,
    const std::string& root,
    std::ostream *ss);

  bool crush_rule_in_use(int rule_id) const;

  int validate_crush_rules(CrushWrapper *crush, std::ostream *ss) const;

  // drop all pg_temp and primary_temp mappings
  void clear_temp() {
    pg_temp->clear();
    primary_temp->clear();
  }
1624
private:
  void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const;
public:
  // human-readable / structured output helpers
  void print(std::ostream& out) const;
  void print_osd(int id, std::ostream& out) const;
  void print_osds(std::ostream& out) const;
  void print_pools(std::ostream& out) const;
  void print_summary(ceph::Formatter *f, std::ostream& out,
                     const std::string& prefix, bool extra=false) const;
  void print_oneline_summary(std::ostream& out) const;

  // filter bits for print_tree()'s dump_flags
  enum {
    DUMP_IN = 1,         // only 'in' osds
    DUMP_OUT = 2,        // only 'out' osds
    DUMP_UP = 4,         // only 'up' osds
    DUMP_DOWN = 8,       // only 'down' osds
    DUMP_DESTROYED = 16, // only 'destroyed' osds
  };
  void print_tree(ceph::Formatter *f, std::ostream *out,
                  unsigned dump_flags=0, std::string bucket="") const;

  int summarize_mapping_stats(
    OSDMap *newmap,
    const std::set<int64_t> *pools,
    std::string *out,
    ceph::Formatter *f) const;

  std::string get_flag_string() const;
  static std::string get_flag_string(unsigned flags);
  static void dump_erasure_code_profiles(
    const mempool::osdmap::map<std::string,std::map<std::string,std::string> > &profiles,
    ceph::Formatter *f);
  void dump(ceph::Formatter *f) const;
  void dump_osd(int id, ceph::Formatter *f) const;
  void dump_osds(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<OSDMap*>& o);
  bool check_new_blocklist_entries() const { return new_blocklist_entries; }

  void check_health(CephContext *cct, health_check_map_t *checks) const;

  // parse a list of osd id strings into *out
  int parse_osd_id_list(const std::vector<std::string>& ls,
                        std::set<int> *out,
                        std::ostream *ss) const;

  float pool_raw_used_rate(int64_t poolid) const;
  std::optional<std::string> pending_require_osd_release() const;
1672 };
WRITE_CLASS_ENCODER_FEATURES(OSDMap)
WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)

// NOTE(review): seastar (crimson) builds use boost's local_shared_ptr
// here, presumably to avoid atomic refcounting -- confirm in crimson docs
#ifdef WITH_SEASTAR
using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
#else
using OSDMapRef = std::shared_ptr<const OSDMap>;
#endif
1682
// stream an OSDMap as its one-line summary
inline std::ostream& operator<<(std::ostream& out, const OSDMap& m) {
  m.print_oneline_summary(out);
  return out;
}
1687
class PGMap;

// print per-osd utilization, optionally as a tree and filtered
void print_osd_utilization(const OSDMap& osdmap,
                           const PGMap& pgmap,
                           std::ostream& out,
                           ceph::Formatter *f,
                           bool tree,
                           const std::string& filter);
1696
1697 #endif