]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSDMap.h
Import ceph 15.2.8
[ceph.git] / ceph / src / osd / OSDMap.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19#ifndef CEPH_OSDMAP_H
20#define CEPH_OSDMAP_H
21
22/*
23 * describe properties of the OSD cluster.
24 * disks, disk groups, total # osds,
25 *
26 */
7c673cae
FG
27#include <vector>
28#include <list>
29#include <set>
30#include <map>
11fdf7f2 31#include <memory>
9f95a23c
TL
32
33#include <boost/smart_ptr/local_shared_ptr.hpp>
94b18763 34#include "include/btree_map.h"
9f95a23c
TL
35#include "include/common_fwd.h"
36#include "include/types.h"
37#include "common/ceph_releases.h"
38#include "osd_types.h"
39
40//#include "include/ceph_features.h"
41#include "crush/CrushWrapper.h"
7c673cae
FG
42
43// forward declaration
7c673cae 44class CrushWrapper;
224ce89b 45class health_check_map_t;
7c673cae 46
7c673cae
FG
47/*
48 * we track up to two intervals during which the osd was alive and
49 * healthy. the most recent is [up_from,up_thru), where up_thru is
50 * the last epoch the osd is known to have _started_. i.e., a lower
51 * bound on the actual osd death. down_at (if it is > up_from) is an
52 * upper bound on the actual osd death.
53 *
54 * the second is the last_clean interval [first,last]. in that case,
55 * the last interval is the last epoch known to have been either
56 * _finished_, or during which the osd cleanly shut down. when
57 * possible, we push this forward to the epoch the osd was eventually
58 * marked down.
59 *
60 * the lost_at is used to allow build_prior to proceed without waiting
61 * for an osd to recover. In certain cases, progress may be blocked
62 * because an osd is down that may contain updates (i.e., a pg may have
63 * gone rw during an interval). If the osd can't be brought online, we
64 * can force things to proceed knowing that we _might_ be losing some
65 * acked writes. If the osd comes back to life later, that's fine to,
66 * but those writes will still be lost (the divergent objects will be
67 * thrown out).
68 */
69struct osd_info_t {
70 epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown
71 epoch_t last_clean_end;
72 epoch_t up_from; // epoch osd marked up
73 epoch_t up_thru; // lower bound on actual osd death (if > up_from)
74 epoch_t down_at; // upper bound on actual osd death (if > up_from)
75 epoch_t lost_at; // last epoch we decided data was "lost"
76
77 osd_info_t() : last_clean_begin(0), last_clean_end(0),
78 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
79
9f95a23c
TL
80 void dump(ceph::Formatter *f) const;
81 void encode(ceph::buffer::list& bl) const;
82 void decode(ceph::buffer::list::const_iterator& bl);
83 static void generate_test_instances(std::list<osd_info_t*>& o);
7c673cae
FG
84};
85WRITE_CLASS_ENCODER(osd_info_t)
86
9f95a23c 87std::ostream& operator<<(std::ostream& out, const osd_info_t& info);
7c673cae
FG
88
89struct osd_xinfo_t {
90 utime_t down_stamp; ///< timestamp when we were last marked down
91 float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
92 __u32 laggy_interval; ///< average interval between being marked laggy and recovering
93 uint64_t features; ///< features supported by this osd we should know about
94 __u32 old_weight; ///< weight prior to being auto marked out
9f95a23c
TL
95 utime_t last_purged_snaps_scrub; ///< last scrub of purged_snaps
96 epoch_t dead_epoch = 0; ///< last epoch we were confirmed dead (not just down)
7c673cae
FG
97
98 osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
99 features(0), old_weight(0) {}
100
9f95a23c
TL
101 void dump(ceph::Formatter *f) const;
102 void encode(ceph::buffer::list& bl, uint64_t features) const;
103 void decode(ceph::buffer::list::const_iterator& bl);
104 static void generate_test_instances(std::list<osd_xinfo_t*>& o);
7c673cae 105};
9f95a23c 106WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t)
7c673cae 107
9f95a23c 108std::ostream& operator<<(std::ostream& out, const osd_xinfo_t& xi);
7c673cae
FG
109
110
31f18b77
FG
/**
 * PGTempMap: space-efficient container for pg -> temp-osd-vector mappings.
 *
 * The active (#if 1) implementation keeps all entries in one encoded
 * bufferlist ('data') laid out as, per entry, a ceph_le32 count followed by
 * that many ceph_le32 osd ids; 'map' is a btree index from pg_t to a raw
 * pointer at the entry's count word inside 'data'.  This avoids one heap
 * allocation per pg at the cost of pointer-into-buffer bookkeeping: any
 * operation that moves 'data' must rebuild the pointers (see decode()).
 *
 * The #else branch is a straightforward map-of-vectors reference
 * implementation with identical external behavior; it is not compiled.
 */
struct PGTempMap {
#if 1
  // backing store: concatenation of encoded (count, osd...) entries
  ceph::buffer::list data;
  // index: pg -> pointer to the entry's leading count word within 'data'
  typedef btree::btree_map<pg_t,ceph_le32*> map_t;
  map_t map;

  // Encode as: u32 entry count, then per entry the pgid followed by the raw
  // (count + values) ceph_le32 block copied straight out of 'data'.
  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    uint32_t n = map.size();
    encode(n, bl);
    for (auto &p : map) {
      encode(p.first, bl);
      // *p.second is the osd count; copy count word plus values verbatim
      bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32));
    }
  }
  // Decode by first scanning to record each entry's byte offset, then
  // copying the whole scanned region into 'data' and pointing the index at
  // offsets within it.
  void decode(ceph::buffer::list::const_iterator& p) {
    using ceph::decode;
    data.clear();
    map.clear();
    uint32_t n;
    decode(n, p);
    if (!n)
      return;
    auto pstart = p;  // remember start so offsets can be made relative
    size_t start_off = pstart.get_off();
    std::vector<std::pair<pg_t,size_t>> offsets;
    offsets.resize(n);
    for (unsigned i=0; i<n; ++i) {
      pg_t pgid;
      decode(pgid, p);
      offsets[i].first = pgid;
      offsets[i].second = p.get_off() - start_off;  // offset of the count word
      uint32_t vn;
      decode(vn, p);
      p += vn * sizeof(int32_t);  // skip the osd ids; not materialized here
    }
    size_t len = p.get_off() - start_off;
    pstart.copy(len, data);
    // pointers below require a single contiguous buffer
    if (data.get_num_buffers() > 1) {
      data.rebuild();
    }
    //map.reserve(n);
    char *start = data.c_str();
    for (auto i : offsets) {
      map.insert(map.end(), std::make_pair(i.first, (ceph_le32*)(start + i.second)));
    }
  }
  // Compact 'data' (e.g. after many erase()/set() cycles) by round-tripping
  // through encode/decode.
  void rebuild() {
    ceph::buffer::list bl;
    encode(bl);
    auto p = std::cbegin(bl);
    decode(p);
  }
  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
    return
      l.map.size() == r.map.size() &&
      l.data.contents_equal(r.data);
  }

  // Const iterator that materializes each entry into a (pg_t, vector<int32_t>)
  // pair on advance; operator*/-> hand out references to that cached pair, so
  // they are invalidated by the next increment.
  class iterator {
    map_t::const_iterator it;
    map_t::const_iterator end;
    std::pair<pg_t,std::vector<int32_t>> current;
    // decode the raw (count, osd...) block behind 'it' into 'current'
    void init_current() {
      if (it != end) {
        current.first = it->first;
        ceph_assert(it->second);
        current.second.resize(*it->second);
        ceph_le32 *p = it->second + 1;  // values follow the count word
        for (uint32_t n = 0; n < *it->second; ++n, ++p) {
          current.second[n] = *p;
        }
      }
    }
  public:
    iterator(map_t::const_iterator p,
             map_t::const_iterator e)
      : it(p), end(e) {
      init_current();
    }

    const std::pair<pg_t,std::vector<int32_t>>& operator*() const {
      return current;
    }
    const std::pair<pg_t,std::vector<int32_t>>* operator->() const {
      return &current;
    }
    friend bool operator==(const iterator& l, const iterator& r) {
      return l.it == r.it;
    }
    friend bool operator!=(const iterator& l, const iterator& r) {
      return l.it != r.it;
    }
    iterator& operator++() {
      ++it;
      if (it != end)
        init_current();
      return *this;
    }
    iterator operator++(int) {
      iterator r = *this;
      ++it;
      if (it != end)
        init_current();
      return r;
    }
  };
  iterator begin() const {
    return iterator(map.begin(), map.end());
  }
  iterator end() const {
    return iterator(map.end(), map.end());
  }
  iterator find(pg_t pgid) const {
    return iterator(map.find(pgid), map.end());
  }
  size_t size() const {
    return map.size();
  }
  size_t count(pg_t pgid) const {
    return map.count(pgid);
  }
  // NOTE: erase() only drops the index entry; the bytes stay in 'data' until
  // rebuild()/decode() compacts it.
  void erase(pg_t pgid) {
    map.erase(pgid);
  }
  void clear() {
    map.clear();
    data.clear();
  }
  // Insert/replace a mapping.  The entry is appended to 'data' and the index
  // points at its count word.  The zero-fill below consumes the current
  // append buffer's unused tail so the encode lands in a fresh buffer —
  // presumably to keep each entry contiguous so the back()-relative pointer
  // below is valid.  NOTE(review): correctness here depends on bufferlist
  // append-buffer behavior; confirm against include/buffer.h.
  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
    using ceph::encode;
    size_t need = sizeof(ceph_le32) * (1 + v.size());
    if (need < data.get_append_buffer_unused_tail_length()) {
      ceph::buffer::ptr z(data.get_append_buffer_unused_tail_length());
      z.zero();
      data.append(z.c_str(), z.length());
    }
    encode(v, data);
    map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
  }
  // Decode the entry for pgid into a fresh vector.  Precondition: pgid is
  // present (operator[] would otherwise insert a null pointer and *p would
  // dereference it).
  mempool::osdmap::vector<int32_t> get(pg_t pgid) {
    mempool::osdmap::vector<int32_t> v;
    ceph_le32 *p = map[pgid];
    size_t n = *p++;
    v.resize(n);
    for (size_t i = 0; i < n; ++i, ++p) {
      v[i] = *p;
    }
    return v;
  }
#else
  // trivial implementation (reference semantics for the block above)
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;

  void encode(ceph::buffer::list& bl) const {
    encode(pg_temp, bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    decode(pg_temp, p);
  }
  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
    return
      l.pg_temp.size() == r.pg_temp.size() &&
      l.pg_temp == r.pg_temp;
  }

  class iterator {
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
  public:
    iterator(mempool::osdmap::map<pg_t,
             mempool::osdmap::vector<int32_t> >::const_iterator p)
      : it(p) {}

    std::pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
      return *it;
    }
    const std::pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
      return &*it;
    }
    friend bool operator==(const iterator& l, const iterator& r) {
      return l.it == r.it;
    }
    friend bool operator!=(const iterator& l, const iterator& r) {
      return l.it != r.it;
    }
    iterator& operator++() {
      ++it;
      return *this;
    }
    iterator operator++(int) {
      iterator r = *this;
      ++it;
      return r;
    }
  };
  iterator begin() const {
    return iterator(pg_temp.cbegin());
  }
  iterator end() const {
    return iterator(pg_temp.cend());
  }
  iterator find(pg_t pgid) const {
    return iterator(pg_temp.find(pgid));
  }
  size_t size() const {
    return pg_temp.size();
  }
  size_t count(pg_t pgid) const {
    return pg_temp.count(pgid);
  }
  void erase(pg_t pgid) {
    pg_temp.erase(pgid);
  }
  void clear() {
    pg_temp.clear();
  }
  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
    pg_temp[pgid] = v;
  }
  const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
    return pg_temp.at(pgid);
  }
#endif
  // Dump as JSON-ish: one "osds" object per pg with pgid and osd list.
  void dump(ceph::Formatter *f) const {
    for (const auto &pg : *this) {
      f->open_object_section("osds");
      f->dump_stream("pgid") << pg.first;
      f->open_array_section("osds");
      for (const auto osd : pg.second)
        f->dump_int("osd", osd);
      f->close_section();
      f->close_section();
    }
  }
};
WRITE_CLASS_ENCODER(PGTempMap)
347
7c673cae
FG
348/** OSDMap
349 */
350class OSDMap {
351public:
352 MEMPOOL_CLASS_HELPERS();
353
  /**
   * Incremental: a delta from OSDMap epoch-1 to epoch.
   *
   * Monitors build these; OSDs/clients apply them to advance their cached
   * map.  Most members follow a new_*/old_* convention: new_* entries
   * add/replace state in the next epoch, old_* entries remove state.
   * Sentinel values (-1 for ints/floats, 0xff for releases) mean
   * "no change".
   */
  class Incremental {
  public:
    MEMPOOL_CLASS_HELPERS();

    /// feature bits we were encoded with.  the subsequent OSDMap
    /// encoding should match.
    uint64_t encode_features;
    uuid_d fsid;
    epoch_t epoch;   // new epoch; we are a diff from epoch-1 to epoch
    utime_t modified;
    int64_t new_pool_max; //incremented by the OSDMonitor on each pool create
    int32_t new_flags;
    ceph_release_t new_require_osd_release{0xff};  // 0xff == no change

    // full (rare) — a complete replacement map/crush instead of a delta
    ceph::buffer::list fullmap;  // in lieu of below.
    ceph::buffer::list crush;

    // incremental
    int32_t new_max_osd;
    mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
    mempool::osdmap::map<int64_t,std::string> new_pool_names;
    mempool::osdmap::set<int64_t> old_pools;
    mempool::osdmap::map<std::string,std::map<std::string,std::string> > new_erasure_code_profiles;
    mempool::osdmap::vector<std::string> old_erasure_code_profiles;
    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
    mempool::osdmap::map<int32_t,uint32_t> new_state;             // XORed onto previous state.
    mempool::osdmap::map<int32_t,uint32_t> new_weight;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp;     // [] to remove
    mempool::osdmap::map<pg_t, int32_t> new_primary_temp;            // [-1] to remove
    mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
    mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
    mempool::osdmap::map<int32_t,std::pair<epoch_t,epoch_t> > new_last_clean_interval;
    mempool::osdmap::map<int32_t,epoch_t> new_lost;
    mempool::osdmap::map<int32_t,uuid_d> new_uuid;
    mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;

    mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist;
    mempool::osdmap::vector<entity_addr_t> old_blacklist;
    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;

    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> new_pg_upmap_items;
    mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
    mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
    mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;

    mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags;   // crush node -> CEPH_OSD_* flags
    mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags; // device class -> CEPH_OSD_* flags

    std::string cluster_snapshot;

    // ratio sentinels: -1 == no change
    float new_nearfull_ratio = -1;
    float new_backfillfull_ratio = -1;
    float new_full_ratio = -1;

    ceph_release_t new_require_min_compat_client{0xff};  // 0xff == no change

    utime_t new_last_up_change, new_last_in_change;

    mutable bool have_crc;      ///< crc values are defined
    uint32_t full_crc;  ///< crc of the resulting OSDMap
    mutable uint32_t inc_crc;   ///< crc of this incremental

    int get_net_marked_out(const OSDMap *previous) const;
    int get_net_marked_down(const OSDMap *previous) const;
    int identify_osd(uuid_d u) const;

    void encode_client_old(ceph::buffer::list& bl) const;
    void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
    void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
    void decode_classic(ceph::buffer::list::const_iterator &p);
    void decode(ceph::buffer::list::const_iterator &bl);
    void dump(ceph::Formatter *f) const;
    static void generate_test_instances(std::list<Incremental*>& o);

    // Construct an empty delta targeting epoch e (sentinels mean no change).
    explicit Incremental(epoch_t e=0) :
      encode_features(0),
      epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
      have_crc(false), full_crc(0), inc_crc(0) {
    }
    explicit Incremental(ceph::buffer::list &bl) {
      auto p = std::cbegin(bl);
      decode(p);
    }
    explicit Incremental(ceph::buffer::list::const_iterator &p) {
      decode(p);
    }

    // Return a mutable copy of pool 'pool' staged in this incremental,
    // seeding it from *orig on first access.
    pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
      if (new_pools.count(pool) == 0)
        new_pools[pool] = *orig;
      return &new_pools[pool];
    }
    bool has_erasure_code_profile(const std::string &name) const {
      auto i = new_erasure_code_profiles.find(name);
      return i != new_erasure_code_profiles.end();
    }
    void set_erasure_code_profile(const std::string &name,
                                  const std::map<std::string,std::string>& profile) {
      new_erasure_code_profiles[name] = profile;
    }
    // NOTE: returns by value (copies the whole map).
    mempool::osdmap::map<std::string,std::map<std::string,std::string>> get_erasure_code_profiles() const {
      return new_erasure_code_profiles;
    }

    /// propagate update pools' snap metadata to any of their tiers
    int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base);

    /// filter out osds with any pending state changing
    size_t get_pending_state_osds(std::vector<int> *osds) {
      ceph_assert(osds);
      osds->clear();

      for (auto &p : new_state) {
        osds->push_back(p.first);
      }

      return osds->size();
    }

    // true iff 'osd' has any of the bits in 'state' pending
    bool pending_osd_has_state(int osd, unsigned state) {
      return new_state.count(osd) && (new_state[osd] & state) != 0;
    }

    // set a pending state bit; returns false if it was already pending
    bool pending_osd_state_set(int osd, unsigned state) {
      if (pending_osd_has_state(osd, state))
        return false;
      new_state[osd] |= state;
      return true;
    }

    // cancel the specified pending osd state if there is any
    // return true on success, false otherwise.
    bool pending_osd_state_clear(int osd, unsigned state) {
      if (!pending_osd_has_state(osd, state)) {
        // never has been set or already has been cancelled.
        return false;
      }

      new_state[osd] &= ~state;
      if (!new_state[osd]) {
        // all flags cleared
        new_state.erase(osd);
      }
      return true;
    }

    // true iff 'snap' in pool 'pool' is newly removed in this epoch
    bool in_new_removed_snaps(int64_t pool, snapid_t snap) const {
      auto p = new_removed_snaps.find(pool);
      if (p == new_removed_snaps.end()) {
        return false;
      }
      return p->second.contains(snap);
    }
  };
512
513private:
514 uuid_d fsid;
515 epoch_t epoch; // what epoch of the osd cluster descriptor is this
516 utime_t created, modified; // epoch start time
517 int32_t pool_max; // the largest pool num, ever
518
519 uint32_t flags;
520
521 int num_osd; // not saved; see calc_num_osds
522 int num_up_osd; // not saved; see calc_num_osds
523 int num_in_osd; // not saved; see calc_num_osds
524
525 int32_t max_osd;
9f95a23c 526 std::vector<uint32_t> osd_state;
7c673cae 527
81eedcae
TL
528 mempool::osdmap::map<int32_t,uint32_t> crush_node_flags; // crush node -> CEPH_OSD_* flags
529 mempool::osdmap::map<int32_t,uint32_t> device_class_flags; // device class -> CEPH_OSD_* flags
530
11fdf7f2
TL
531 utime_t last_up_change, last_in_change;
532
28e407b8
AA
533 // These features affect OSDMap[::Incremental] encoding, or the
534 // encoding of some type embedded therein (CrushWrapper, something
535 // from osd_types, etc.).
536 static constexpr uint64_t SIGNIFICANT_FEATURES =
537 CEPH_FEATUREMASK_PGID64 |
538 CEPH_FEATUREMASK_PGPOOL3 |
539 CEPH_FEATUREMASK_OSDENC |
540 CEPH_FEATUREMASK_OSDMAP_ENC |
541 CEPH_FEATUREMASK_OSD_POOLRESEND |
542 CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
543 CEPH_FEATUREMASK_MSG_ADDR2 |
544 CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
545 CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
11fdf7f2
TL
546 CEPH_FEATUREMASK_SERVER_LUMINOUS |
547 CEPH_FEATUREMASK_SERVER_MIMIC |
9f95a23c
TL
548 CEPH_FEATUREMASK_SERVER_NAUTILUS |
549 CEPH_FEATUREMASK_SERVER_OCTOPUS;
11fdf7f2 550
7c673cae 551 struct addrs_s {
11fdf7f2
TL
552 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs;
553 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs;
554 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs;
555 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs;
7c673cae 556 };
11fdf7f2
TL
557 std::shared_ptr<addrs_s> osd_addrs;
558
559 entity_addrvec_t _blank_addrvec;
7c673cae
FG
560
561 mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
562 mempool::osdmap::vector<osd_info_t> osd_info;
11fdf7f2
TL
563 std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
564 std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
565 std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
7c673cae
FG
566
567 // remap (post-CRUSH, pre-up)
568 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
9f95a23c 569 mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
7c673cae
FG
570
571 mempool::osdmap::map<int64_t,pg_pool_t> pools;
9f95a23c
TL
572 mempool::osdmap::map<int64_t,std::string> pool_name;
573 mempool::osdmap::map<std::string, std::map<std::string,std::string>> erasure_code_profiles;
574 mempool::osdmap::map<std::string,int64_t> name_pool;
7c673cae 575
11fdf7f2 576 std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
7c673cae
FG
577 mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;
578
579 mempool::osdmap::unordered_map<entity_addr_t,utime_t> blacklist;
580
11fdf7f2
TL
581 /// queue of snaps to remove
582 mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue;
583
584 /// removed_snaps additions this epoch
585 mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
586
587 /// removed_snaps removals this epoch
588 mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
589
7c673cae 590 epoch_t cluster_snapshot_epoch;
9f95a23c 591 std::string cluster_snapshot;
7c673cae
FG
592 bool new_blacklist_entries;
593
594 float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
595
596 /// min compat client we want to support
9f95a23c 597 ceph_release_t require_min_compat_client{ceph_release_t::unknown};
7c673cae 598
31f18b77
FG
599public:
600 /// require osds to run at least this release
9f95a23c 601 ceph_release_t require_osd_release{ceph_release_t::unknown};
31f18b77
FG
602
603private:
7c673cae
FG
604 mutable uint64_t cached_up_osd_features;
605
606 mutable bool crc_defined;
607 mutable uint32_t crc;
608
609 void _calc_up_osd_features();
610
611 public:
612 bool have_crc() const { return crc_defined; }
613 uint32_t get_crc() const { return crc; }
614
11fdf7f2 615 std::shared_ptr<CrushWrapper> crush; // hierarchical map
31f18b77
FG
616private:
617 uint32_t crush_version = 1;
7c673cae
FG
618
619 friend class OSDMonitor;
620
621 public:
  // Construct an empty map at epoch 0.  The shared sub-structures
  // (osd_addrs, pg_temp, primary_temp, osd_uuid, crush) are allocated
  // eagerly so they are never null; deepish_copy_from() below replaces
  // them with deep copies when needed.
  OSDMap() : epoch(0),
             pool_max(0),
             flags(0),
             num_osd(0), num_up_osd(0), num_in_osd(0),
             max_osd(0),
             osd_addrs(std::make_shared<addrs_s>()),
             pg_temp(std::make_shared<PGTempMap>()),
             primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
             osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
             cluster_snapshot_epoch(0),
             new_blacklist_entries(false),
             cached_up_osd_features(0),
             crc_defined(false), crc(0),
             crush(std::make_shared<CrushWrapper>()) {
  }
637
7c673cae
FG
638private:
639 OSDMap(const OSDMap& other) = default;
640 OSDMap& operator=(const OSDMap& other) = default;
641public:
642
28e407b8
AA
643 /// return feature mask subset that is relevant to OSDMap encoding
644 static uint64_t get_significant_features(uint64_t features) {
645 return SIGNIFICANT_FEATURES & features;
646 }
647
648 uint64_t get_encoding_features() const;
649
7c673cae
FG
  // Copy 'o' into *this, then replace the shared-pointer members with deep
  // copies so subsequent mutation of *this cannot affect 'o'.  "Deepish"
  // because two things are deliberately still shared (see NOTEs below).
  void deepish_copy_from(const OSDMap& o) {
    *this = o;
    primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
    pg_temp.reset(new PGTempMap(*o.pg_temp));
    osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));

    // osd_primary_affinity may legitimately be null (defaults apply)
    if (o.osd_primary_affinity)
      osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));

    // NOTE: this still references shared entity_addrvec_t's.
    osd_addrs.reset(new addrs_s(*o.osd_addrs));

    // NOTE: we do not copy crush.  note that apply_incremental will
    // allocate a new CrushWrapper, though.
  }
665
666 // map info
667 const uuid_d& get_fsid() const { return fsid; }
668 void set_fsid(uuid_d& f) { fsid = f; }
669
670 epoch_t get_epoch() const { return epoch; }
671 void inc_epoch() { epoch++; }
672
673 void set_epoch(epoch_t e);
674
31f18b77
FG
675 uint32_t get_crush_version() const {
676 return crush_version;
677 }
678
7c673cae
FG
679 /* stamps etc */
680 const utime_t& get_created() const { return created; }
681 const utime_t& get_modified() const { return modified; }
682
683 bool is_blacklisted(const entity_addr_t& a) const;
11fdf7f2 684 bool is_blacklisted(const entity_addrvec_t& a) const;
9f95a23c 685 void get_blacklist(std::list<std::pair<entity_addr_t,utime_t > > *bl) const;
31f18b77 686 void get_blacklist(std::set<entity_addr_t> *bl) const;
7c673cae 687
9f95a23c 688 std::string get_cluster_snapshot() const {
7c673cae
FG
689 if (cluster_snapshot_epoch == epoch)
690 return cluster_snapshot;
9f95a23c 691 return std::string();
7c673cae
FG
692 }
693
694 float get_full_ratio() const {
695 return full_ratio;
696 }
697 float get_backfillfull_ratio() const {
698 return backfillfull_ratio;
699 }
700 float get_nearfull_ratio() const {
701 return nearfull_ratio;
702 }
3efd9988 703 void get_full_pools(CephContext *cct,
9f95a23c
TL
704 std::set<int64_t> *full,
705 std::set<int64_t> *backfillfull,
706 std::set<int64_t> *nearfull) const;
707 void get_full_osd_counts(std::set<int> *full, std::set<int> *backfill,
708 std::set<int> *nearfull) const;
31f18b77
FG
709
710
7c673cae
FG
711 /***** cluster state *****/
712 /* osds */
713 int get_max_osd() const { return max_osd; }
714 void set_max_osd(int m);
715
716 unsigned get_num_osds() const {
717 return num_osd;
718 }
719 unsigned get_num_up_osds() const {
720 return num_up_osd;
721 }
722 unsigned get_num_in_osds() const {
723 return num_in_osd;
724 }
725 /// recalculate cached values for get_num{,_up,_in}_osds
726 int calc_num_osds();
727
9f95a23c
TL
728 void get_all_osds(std::set<int32_t>& ls) const;
729 void get_up_osds(std::set<int32_t>& ls) const;
81eedcae 730 void get_out_existing_osds(std::set<int32_t>& ls) const;
7c673cae
FG
731 unsigned get_num_pg_temp() const {
732 return pg_temp->size();
733 }
734
735 int get_flags() const { return flags; }
736 bool test_flag(int f) const { return flags & f; }
737 void set_flag(int f) { flags |= f; }
738 void clear_flag(int f) { flags &= ~f; }
739
9f95a23c 740 void get_flag_set(std::set<std::string> *flagset) const;
11fdf7f2 741
9f95a23c 742 static void calc_state_set(int state, std::set<std::string>& st);
7c673cae
FG
743
744 int get_state(int o) const {
11fdf7f2 745 ceph_assert(o < max_osd);
7c673cae
FG
746 return osd_state[o];
747 }
9f95a23c 748 int get_state(int o, std::set<std::string>& st) const {
11fdf7f2 749 ceph_assert(o < max_osd);
7c673cae
FG
750 unsigned t = osd_state[o];
751 calc_state_set(t, st);
752 return osd_state[o];
753 }
754 void set_state(int o, unsigned s) {
11fdf7f2 755 ceph_assert(o < max_osd);
7c673cae
FG
756 osd_state[o] = s;
757 }
758 void set_weight(int o, unsigned w) {
11fdf7f2 759 ceph_assert(o < max_osd);
7c673cae
FG
760 osd_weight[o] = w;
761 if (w)
762 osd_state[o] |= CEPH_OSD_EXISTS;
763 }
764 unsigned get_weight(int o) const {
11fdf7f2 765 ceph_assert(o < max_osd);
7c673cae
FG
766 return osd_weight[o];
767 }
768 float get_weightf(int o) const {
769 return (float)get_weight(o) / (float)CEPH_OSD_IN;
770 }
9f95a23c 771 void adjust_osd_weights(const std::map<int,double>& weights, Incremental& inc) const;
7c673cae
FG
772
773 void set_primary_affinity(int o, int w) {
11fdf7f2 774 ceph_assert(o < max_osd);
7c673cae
FG
775 if (!osd_primary_affinity)
776 osd_primary_affinity.reset(
777 new mempool::osdmap::vector<__u32>(
778 max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
779 (*osd_primary_affinity)[o] = w;
780 }
781 unsigned get_primary_affinity(int o) const {
11fdf7f2 782 ceph_assert(o < max_osd);
7c673cae
FG
783 if (!osd_primary_affinity)
784 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
785 return (*osd_primary_affinity)[o];
786 }
787 float get_primary_affinityf(int o) const {
788 return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
789 }
790
9f95a23c 791 bool has_erasure_code_profile(const std::string &name) const {
7c673cae
FG
792 auto i = erasure_code_profiles.find(name);
793 return i != erasure_code_profiles.end();
794 }
795 int get_erasure_code_profile_default(CephContext *cct,
9f95a23c
TL
796 std::map<std::string,std::string> &profile_map,
797 std::ostream *ss);
798 void set_erasure_code_profile(const std::string &name,
799 const std::map<std::string,std::string>& profile) {
7c673cae
FG
800 erasure_code_profiles[name] = profile;
801 }
9f95a23c
TL
802 const std::map<std::string,std::string> &get_erasure_code_profile(
803 const std::string &name) const {
804 static std::map<std::string,std::string> empty;
7c673cae
FG
805 auto i = erasure_code_profiles.find(name);
806 if (i == erasure_code_profiles.end())
807 return empty;
808 else
809 return i->second;
810 }
9f95a23c 811 const mempool::osdmap::map<std::string,std::map<std::string,std::string>> &get_erasure_code_profiles() const {
7c673cae
FG
812 return erasure_code_profiles;
813 }
814
815 bool exists(int osd) const {
816 //assert(osd >= 0);
817 return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
818 }
819
31f18b77
FG
820 bool is_destroyed(int osd) const {
821 return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
822 }
823
7c673cae
FG
824 bool is_up(int osd) const {
825 return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
826 }
827
828 bool has_been_up_since(int osd, epoch_t epoch) const {
829 return is_up(osd) && get_up_from(osd) <= epoch;
830 }
831
832 bool is_down(int osd) const {
833 return !is_up(osd);
834 }
835
9f95a23c
TL
836 bool is_stop(int osd) const {
837 return exists(osd) && is_down(osd) &&
838 (osd_state[osd] & CEPH_OSD_STOP);
839 }
840
7c673cae
FG
841 bool is_out(int osd) const {
842 return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
843 }
844
845 bool is_in(int osd) const {
846 return !is_out(osd);
847 }
848
9f95a23c
TL
849 bool is_dead(int osd) const {
850 if (!exists(osd)) {
851 return false; // unclear if they know they are removed from map
852 }
853 return get_xinfo(osd).dead_epoch > get_info(osd).up_from;
854 }
855
81eedcae
TL
856 unsigned get_osd_crush_node_flags(int osd) const;
857 unsigned get_crush_node_flags(int id) const;
858 unsigned get_device_class_flags(int id) const;
859
860 bool is_noup_by_osd(int osd) const {
31f18b77
FG
861 return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
862 }
863
81eedcae 864 bool is_nodown_by_osd(int osd) const {
31f18b77
FG
865 return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
866 }
867
81eedcae 868 bool is_noin_by_osd(int osd) const {
31f18b77
FG
869 return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
870 }
871
81eedcae 872 bool is_noout_by_osd(int osd) const {
31f18b77
FG
873 return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
874 }
875
81eedcae
TL
876 bool is_noup(int osd) const {
877 if (test_flag(CEPH_OSDMAP_NOUP)) // global?
878 return true;
879 if (is_noup_by_osd(osd)) // by osd?
880 return true;
881 if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node?
882 return true;
883 if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
884 get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class?
885 return true;
886 return false;
31f18b77
FG
887 }
888
81eedcae
TL
889 bool is_nodown(int osd) const {
890 if (test_flag(CEPH_OSDMAP_NODOWN))
891 return true;
892 if (is_nodown_by_osd(osd))
893 return true;
894 if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN)
895 return true;
896 if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
897 get_device_class_flags(class_id) & CEPH_OSD_NODOWN)
898 return true;
899 return false;
31f18b77
FG
900 }
901
81eedcae
TL
902 bool is_noin(int osd) const {
903 if (test_flag(CEPH_OSDMAP_NOIN))
904 return true;
905 if (is_noin_by_osd(osd))
906 return true;
907 if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN)
908 return true;
909 if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
910 get_device_class_flags(class_id) & CEPH_OSD_NOIN)
911 return true;
912 return false;
31f18b77
FG
913 }
914
81eedcae
TL
915 bool is_noout(int osd) const {
916 if (test_flag(CEPH_OSDMAP_NOOUT))
917 return true;
918 if (is_noout_by_osd(osd))
919 return true;
920 if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT)
921 return true;
922 if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
923 get_device_class_flags(class_id) & CEPH_OSD_NOOUT)
924 return true;
925 return false;
31f18b77
FG
926 }
927
7c673cae
FG
928 /**
929 * check if an entire crush subtree is down
930 */
9f95a23c
TL
931 bool subtree_is_down(int id, std::set<int> *down_cache) const;
932 bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, std::set<int> *down_cache) const;
933
934 bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, std::set<int> *down_in_osds, std::set<int> *up_in_osds,
935 std::set<int> *subtree_up, std::unordered_map<int, std::set<int> > *subtree_type_down) const;
31f18b77 936
7c673cae
FG
937 int identify_osd(const entity_addr_t& addr) const;
938 int identify_osd(const uuid_d& u) const;
939 int identify_osd_on_all_channels(const entity_addr_t& addr) const;
940
941 bool have_addr(const entity_addr_t& addr) const {
942 return identify_osd(addr) >= 0;
943 }
944 int find_osd_on_ip(const entity_addr_t& ip) const;
11fdf7f2
TL
945
946 const entity_addrvec_t& get_addrs(int osd) const {
947 ceph_assert(exists(osd));
948 return osd_addrs->client_addrs[osd] ?
949 *osd_addrs->client_addrs[osd] : _blank_addrvec;
7c673cae 950 }
11fdf7f2
TL
951 const entity_addrvec_t& get_most_recent_addrs(int osd) const {
952 return get_addrs(osd);
7c673cae 953 }
11fdf7f2
TL
954 const entity_addrvec_t &get_cluster_addrs(int osd) const {
955 ceph_assert(exists(osd));
956 return osd_addrs->cluster_addrs[osd] ?
957 *osd_addrs->cluster_addrs[osd] : _blank_addrvec;
7c673cae 958 }
11fdf7f2
TL
959 const entity_addrvec_t &get_hb_back_addrs(int osd) const {
960 ceph_assert(exists(osd));
961 return osd_addrs->hb_back_addrs[osd] ?
962 *osd_addrs->hb_back_addrs[osd] : _blank_addrvec;
7c673cae 963 }
11fdf7f2
TL
964 const entity_addrvec_t &get_hb_front_addrs(int osd) const {
965 ceph_assert(exists(osd));
966 return osd_addrs->hb_front_addrs[osd] ?
967 *osd_addrs->hb_front_addrs[osd] : _blank_addrvec;
7c673cae
FG
968 }
969
970 const uuid_d& get_uuid(int osd) const {
11fdf7f2 971 ceph_assert(exists(osd));
7c673cae
FG
972 return (*osd_uuid)[osd];
973 }
974
975 const epoch_t& get_up_from(int osd) const {
11fdf7f2 976 ceph_assert(exists(osd));
7c673cae
FG
977 return osd_info[osd].up_from;
978 }
979 const epoch_t& get_up_thru(int osd) const {
11fdf7f2 980 ceph_assert(exists(osd));
7c673cae
FG
981 return osd_info[osd].up_thru;
982 }
983 const epoch_t& get_down_at(int osd) const {
11fdf7f2 984 ceph_assert(exists(osd));
7c673cae
FG
985 return osd_info[osd].down_at;
986 }
987 const osd_info_t& get_info(int osd) const {
11fdf7f2 988 ceph_assert(osd < max_osd);
7c673cae
FG
989 return osd_info[osd];
990 }
991
992 const osd_xinfo_t& get_xinfo(int osd) const {
11fdf7f2 993 ceph_assert(osd < max_osd);
7c673cae
FG
994 return osd_xinfo[osd];
995 }
996
997 int get_next_up_osd_after(int n) const {
998 if (get_max_osd() == 0)
999 return -1;
1000 for (int i = n + 1; i != n; ++i) {
1001 if (i >= get_max_osd())
1002 i = 0;
1003 if (i == n)
1004 break;
1005 if (is_up(i))
1006 return i;
1007 }
1008 return -1;
1009 }
1010
1011 int get_previous_up_osd_before(int n) const {
1012 if (get_max_osd() == 0)
1013 return -1;
1014 for (int i = n - 1; i != n; --i) {
1015 if (i < 0)
1016 i = get_max_osd() - 1;
1017 if (i == n)
1018 break;
1019 if (is_up(i))
1020 return i;
1021 }
1022 return -1;
1023 }
1024
11fdf7f2
TL
1025
1026 void get_random_up_osds_by_subtree(int n, // whoami
9f95a23c 1027 std::string &subtree,
11fdf7f2 1028 int limit, // how many
9f95a23c
TL
1029 std::set<int> skip,
1030 std::set<int> *want) const;
11fdf7f2 1031
7c673cae
FG
1032 /**
1033 * get feature bits required by the current structure
1034 *
1035 * @param entity_type [in] what entity type we are asking about
9f95a23c 1036 * @param mask [out] std::set of all possible map-related features we could std::set
7c673cae
FG
1037 * @return feature bits used by this map
1038 */
1039 uint64_t get_features(int entity_type, uint64_t *mask) const;
1040
1041 /**
1042 * get oldest *client* version (firefly, hammer, etc.) that can connect given
1043 * the feature bits required (according to get_features()).
1044 */
9f95a23c 1045 ceph_release_t get_min_compat_client() const;
7c673cae 1046
11fdf7f2
TL
1047 /**
1048 * gets the required minimum *client* version that can connect to the cluster.
1049 */
9f95a23c 1050 ceph_release_t get_require_min_compat_client() const;
11fdf7f2 1051
7c673cae
FG
1052 /**
1053 * get intersection of features supported by up osds
1054 */
1055 uint64_t get_up_osd_features() const;
1056
494da23a
TL
1057 void get_upmap_pgs(vector<pg_t> *upmap_pgs) const;
1058 bool check_pg_upmaps(
1059 CephContext *cct,
1060 const vector<pg_t>& to_check,
1061 vector<pg_t> *to_cancel,
1062 map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const;
1063 void clean_pg_upmaps(
1064 CephContext *cct,
1065 Incremental *pending_inc,
1066 const vector<pg_t>& to_cancel,
1067 const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const;
1068 bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const;
94b18763 1069
7c673cae
FG
1070 int apply_incremental(const Incremental &inc);
1071
1072 /// try to re-use/reference addrs in oldmap from newmap
1073 static void dedup(const OSDMap *oldmap, OSDMap *newmap);
1074
11fdf7f2
TL
1075 static void clean_temps(CephContext *cct,
1076 const OSDMap& oldmap,
1077 const OSDMap& nextmap,
7c673cae
FG
1078 Incremental *pending_inc);
1079
1080 // serialize, unserialize
1081private:
9f95a23c
TL
1082 void encode_client_old(ceph::buffer::list& bl) const;
1083 void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
1084 void decode_classic(ceph::buffer::list::const_iterator& p);
7c673cae
FG
1085 void post_decode();
1086public:
9f95a23c
TL
1087 void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
1088 void decode(ceph::buffer::list& bl);
1089 void decode(ceph::buffer::list::const_iterator& bl);
7c673cae
FG
1090
1091
1092 /**** mapping facilities ****/
1093 int map_to_pg(
1094 int64_t pool,
9f95a23c
TL
1095 const std::string& name,
1096 const std::string& key,
1097 const std::string& nspace,
7c673cae
FG
1098 pg_t *pg) const;
1099 int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
1100 pg_t &pg) const;
1101 pg_t object_locator_to_pg(const object_t& oid,
1102 const object_locator_t& loc) const {
1103 pg_t pg;
1104 int ret = object_locator_to_pg(oid, loc, pg);
11fdf7f2 1105 ceph_assert(ret == 0);
7c673cae
FG
1106 return pg;
1107 }
1108
1109
1110 static object_locator_t file_to_object_locator(const file_layout_t& layout) {
1111 return object_locator_t(layout.pool_id, layout.pool_ns);
1112 }
1113
1114 ceph_object_layout file_to_object_layout(object_t oid,
1115 file_layout_t& layout) const {
1116 return make_object_layout(oid, layout.pool_id, layout.pool_ns);
1117 }
1118
1119 ceph_object_layout make_object_layout(object_t oid, int pg_pool,
9f95a23c 1120 std::string nspace) const;
7c673cae
FG
1121
1122 int get_pg_num(int pg_pool) const
1123 {
1124 const pg_pool_t *pool = get_pg_pool(pg_pool);
11fdf7f2 1125 ceph_assert(NULL != pool);
7c673cae
FG
1126 return pool->get_pg_num();
1127 }
1128
1129 bool pg_exists(pg_t pgid) const {
1130 const pg_pool_t *p = get_pg_pool(pgid.pool());
1131 return p && pgid.ps() < p->get_pg_num();
1132 }
1133
224ce89b
WB
1134 int get_pg_pool_min_size(pg_t pgid) const {
1135 if (!pg_exists(pgid)) {
1136 return -ENOENT;
1137 }
1138 const pg_pool_t *p = get_pg_pool(pgid.pool());
11fdf7f2 1139 ceph_assert(p);
224ce89b
WB
1140 return p->get_min_size();
1141 }
1142
1143 int get_pg_pool_size(pg_t pgid) const {
1144 if (!pg_exists(pgid)) {
1145 return -ENOENT;
1146 }
1147 const pg_pool_t *p = get_pg_pool(pgid.pool());
11fdf7f2 1148 ceph_assert(p);
224ce89b
WB
1149 return p->get_size();
1150 }
1151
94b18763
FG
1152 int get_pg_pool_crush_rule(pg_t pgid) const {
1153 if (!pg_exists(pgid)) {
1154 return -ENOENT;
1155 }
1156 const pg_pool_t *p = get_pg_pool(pgid.pool());
11fdf7f2 1157 ceph_assert(p);
94b18763
FG
1158 return p->get_crush_rule();
1159 }
1160
7c673cae 1161private:
9f95a23c 1162 /// pg -> (raw osd std::list)
31f18b77 1163 void _pg_to_raw_osds(
7c673cae 1164 const pg_pool_t& pool, pg_t pg,
9f95a23c 1165 std::vector<int> *osds,
7c673cae 1166 ps_t *ppps) const;
9f95a23c
TL
1167 int _pick_primary(const std::vector<int>& osds) const;
1168 void _remove_nonexistent_osds(const pg_pool_t& pool, std::vector<int>& osds) const;
7c673cae
FG
1169
1170 void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
9f95a23c 1171 std::vector<int> *osds, int *primary) const;
7c673cae
FG
1172
1173 /// apply pg_upmap[_items] mappings
9f95a23c 1174 void _apply_upmap(const pg_pool_t& pi, pg_t pg, std::vector<int> *raw) const;
7c673cae 1175
9f95a23c
TL
1176 /// pg -> (up osd std::list)
1177 void _raw_to_up_osds(const pg_pool_t& pool, const std::vector<int>& raw,
1178 std::vector<int> *up) const;
7c673cae
FG
1179
1180
1181 /**
1182 * Get the pg and primary temp, if they are specified.
1183 * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
1184 * @param temp_primary [out] Will be the value in primary_temp, or a value derived
1185 * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
1186 */
1187 void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
9f95a23c 1188 std::vector<int> *temp_pg, int *temp_primary) const;
7c673cae
FG
1189
1190 /**
1191 * map to up and acting. Fills in whatever fields are non-NULL.
1192 */
9f95a23c
TL
1193 void _pg_to_up_acting_osds(const pg_t& pg, std::vector<int> *up, int *up_primary,
1194 std::vector<int> *acting, int *acting_primary,
7c673cae
FG
1195 bool raw_pg_to_pg = true) const;
1196
1197public:
1198 /***
1199 * This is suitable only for looking at raw CRUSH outputs. It skips
1200 * applying the temp and up checks and should not be used
1201 * by anybody for data mapping purposes.
1202 * raw and primary must be non-NULL
1203 */
9f95a23c
TL
1204 void pg_to_raw_osds(pg_t pg, std::vector<int> *raw, int *primary) const;
1205 void pg_to_raw_upmap(pg_t pg, std::vector<int> *raw,
1206 std::vector<int> *raw_upmap) const;
7c673cae 1207 /// map a pg to its acting set. @return acting set size
9f95a23c 1208 void pg_to_acting_osds(const pg_t& pg, std::vector<int> *acting,
7c673cae
FG
1209 int *acting_primary) const {
1210 _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
7c673cae 1211 }
9f95a23c 1212 void pg_to_acting_osds(pg_t pg, std::vector<int>& acting) const {
7c673cae
FG
1213 return pg_to_acting_osds(pg, &acting, NULL);
1214 }
1215 /**
1216 * This does not apply temp overrides and should not be used
1217 * by anybody for data mapping purposes. Specify both pointers.
1218 */
9f95a23c 1219 void pg_to_raw_up(pg_t pg, std::vector<int> *up, int *primary) const;
7c673cae
FG
1220 /**
1221 * map a pg to its acting set as well as its up set. You must use
1222 * the acting set for data mapping purposes, but some users will
1223 * also find the up set useful for things like deciding what to
1224 * set as pg_temp.
1225 * Each of these pointers must be non-NULL.
1226 */
9f95a23c
TL
1227 void pg_to_up_acting_osds(pg_t pg, std::vector<int> *up, int *up_primary,
1228 std::vector<int> *acting, int *acting_primary) const {
7c673cae
FG
1229 _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
1230 }
9f95a23c 1231 void pg_to_up_acting_osds(pg_t pg, std::vector<int>& up, std::vector<int>& acting) const {
7c673cae
FG
1232 int up_primary, acting_primary;
1233 pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
1234 }
1235 bool pg_is_ec(pg_t pg) const {
1236 auto i = pools.find(pg.pool());
11fdf7f2
TL
1237 ceph_assert(i != pools.end());
1238 return i->second.is_erasure();
7c673cae
FG
1239 }
1240 bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
1241 auto i = get_pools().find(pgid.pool());
1242 if (i == get_pools().end()) {
1243 return false;
1244 }
11fdf7f2 1245 if (!i->second.is_erasure()) {
7c673cae
FG
1246 *out = spg_t(pgid);
1247 return true;
1248 }
1249 int primary;
9f95a23c 1250 std::vector<int> acting;
7c673cae
FG
1251 pg_to_acting_osds(pgid, &acting, &primary);
1252 for (uint8_t i = 0; i < acting.size(); ++i) {
1253 if (acting[i] == primary) {
1254 *out = spg_t(pgid, shard_id_t(i));
1255 return true;
1256 }
1257 }
1258 return false;
1259 }
11fdf7f2
TL
1260 bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
1261 auto i = get_pools().find(pgid.pool());
1262 if (i == get_pools().end()) {
1263 return false;
1264 }
9f95a23c 1265 std::vector<int> acting;
11fdf7f2
TL
1266 pg_to_acting_osds(pgid, &acting, primary);
1267 if (i->second.is_erasure()) {
1268 for (uint8_t i = 0; i < acting.size(); ++i) {
1269 if (acting[i] == *primary) {
1270 *out = spg_t(pgid, shard_id_t(i));
1271 return true;
1272 }
1273 }
1274 } else {
1275 *out = spg_t(pgid);
1276 return true;
1277 }
1278 return false;
1279 }
1280
9f95a23c
TL
1281 bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
1282 auto p = removed_snaps_queue.find(pool);
1283 if (p == removed_snaps_queue.end()) {
1284 return false;
1285 }
1286 return p->second.contains(snap);
1287 }
1288
11fdf7f2
TL
1289 const mempool::osdmap::map<int64_t,snap_interval_set_t>&
1290 get_removed_snaps_queue() const {
1291 return removed_snaps_queue;
1292 }
1293 const mempool::osdmap::map<int64_t,snap_interval_set_t>&
1294 get_new_removed_snaps() const {
1295 return new_removed_snaps;
1296 }
1297 const mempool::osdmap::map<int64_t,snap_interval_set_t>&
1298 get_new_purged_snaps() const {
1299 return new_purged_snaps;
1300 }
7c673cae 1301
9f95a23c 1302 int64_t lookup_pg_pool_name(const std::string& name) const {
7c673cae
FG
1303 auto p = name_pool.find(name);
1304 if (p == name_pool.end())
1305 return -ENOENT;
1306 return p->second;
1307 }
1308
1309 int64_t get_pool_max() const {
1310 return pool_max;
1311 }
1312 const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
1313 return pools;
1314 }
1315 mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
1316 return pools;
1317 }
9f95a23c 1318 void get_pool_ids_by_rule(int rule_id, std::set<int64_t> *pool_ids) const {
11fdf7f2 1319 ceph_assert(pool_ids);
3efd9988 1320 for (auto &p: pools) {
11fdf7f2 1321 if (p.second.get_crush_rule() == rule_id) {
3efd9988
FG
1322 pool_ids->insert(p.first);
1323 }
1324 }
1325 }
1326 void get_pool_ids_by_osd(CephContext *cct,
1327 int osd,
9f95a23c
TL
1328 std::set<int64_t> *pool_ids) const;
1329 const std::string& get_pool_name(int64_t p) const {
7c673cae 1330 auto i = pool_name.find(p);
11fdf7f2 1331 ceph_assert(i != pool_name.end());
7c673cae
FG
1332 return i->second;
1333 }
9f95a23c 1334 const mempool::osdmap::map<int64_t,std::string>& get_pool_names() const {
c07f9fc5
FG
1335 return pool_name;
1336 }
7c673cae
FG
1337 bool have_pg_pool(int64_t p) const {
1338 return pools.count(p);
1339 }
1340 const pg_pool_t* get_pg_pool(int64_t p) const {
1341 auto i = pools.find(p);
1342 if (i != pools.end())
1343 return &i->second;
1344 return NULL;
1345 }
1346 unsigned get_pg_size(pg_t pg) const {
1347 auto p = pools.find(pg.pool());
11fdf7f2 1348 ceph_assert(p != pools.end());
7c673cae
FG
1349 return p->second.get_size();
1350 }
1351 int get_pg_type(pg_t pg) const {
1352 auto p = pools.find(pg.pool());
11fdf7f2 1353 ceph_assert(p != pools.end());
7c673cae
FG
1354 return p->second.get_type();
1355 }
9f95a23c
TL
1356 int get_pool_crush_rule(int64_t pool_id) const {
1357 auto pool = get_pg_pool(pool_id);
1358 if (!pool)
1359 return -ENOENT;
1360 return pool->get_crush_rule();
1361 }
7c673cae
FG
1362
1363
1364 pg_t raw_pg_to_pg(pg_t pg) const {
1365 auto p = pools.find(pg.pool());
11fdf7f2 1366 ceph_assert(p != pools.end());
7c673cae
FG
1367 return p->second.raw_pg_to_pg(pg);
1368 }
1369
1370 // pg -> acting primary osd
1371 int get_pg_acting_primary(pg_t pg) const {
1372 int primary = -1;
1373 _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
1374 return primary;
1375 }
1376
1377 /*
1378 * check whether an spg_t maps to a particular osd
1379 */
1380 bool is_up_acting_osd_shard(spg_t pg, int osd) const {
9f95a23c 1381 std::vector<int> up, acting;
7c673cae 1382 _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
9f95a23c
TL
1383 if (calc_pg_role(pg_shard_t(osd, pg.shard), acting) >= 0 ||
1384 calc_pg_role(pg_shard_t(osd, pg.shard), up) >= 0) {
1385 return true;
7c673cae
FG
1386 }
1387 return false;
1388 }
1389
1390
9f95a23c
TL
1391 static int calc_pg_role_broken(int osd, const std::vector<int>& acting, int nrep=0);
1392 static int calc_pg_role(pg_shard_t who, const std::vector<int>& acting);
1393 static bool primary_changed_broken(
7c673cae 1394 int oldprimary,
9f95a23c 1395 const std::vector<int> &oldacting,
7c673cae 1396 int newprimary,
9f95a23c 1397 const std::vector<int> &newacting);
7c673cae
FG
1398
1399 /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
9f95a23c
TL
1400 int get_pg_acting_role(spg_t pg, int osd) const {
1401 std::vector<int> group;
1402 pg_to_acting_osds(pg.pgid, group);
1403 return calc_pg_role(pg_shard_t(osd, pg.shard), group);
7c673cae
FG
1404 }
1405
7c673cae
FG
1406 bool try_pg_upmap(
1407 CephContext *cct,
1408 pg_t pg, ///< pg to potentially remap
9f95a23c
TL
1409 const std::set<int>& overfull, ///< osds we'd want to evacuate
1410 const std::vector<int>& underfull, ///< osds to move to, in order of preference
1411 const std::vector<int>& more_underfull, ///< less full osds to move to, in order of preference
1412 std::vector<int> *orig,
1413 std::vector<int> *out); ///< resulting alternative mapping
7c673cae
FG
1414
1415 int calc_pg_upmaps(
1416 CephContext *cct,
92f5a8d4 1417 uint32_t max_deviation, ///< max deviation from target (value >= 1)
7c673cae 1418 int max_iterations, ///< max iterations to run
9f95a23c 1419 const std::set<int64_t>& pools, ///< [optional] restrict to pool
7c673cae
FG
1420 Incremental *pending_inc
1421 );
1422
9f95a23c 1423 int get_osds_by_bucket_name(const std::string &name, std::set<int> *osds) const;
31f18b77 1424
f64942e4
AA
1425 bool have_pg_upmaps(pg_t pg) const {
1426 return pg_upmap.count(pg) ||
1427 pg_upmap_items.count(pg);
1428 }
1429
9f95a23c
TL
1430 bool check_full(const set<pg_shard_t> &missing_on) const {
1431 for (auto shard : missing_on) {
1432 if (get_state(shard.osd) & CEPH_OSD_FULL)
1433 return true;
1434 }
1435 return false;
1436 }
1437
7c673cae
FG
1438 /*
1439 * handy helpers to build simple maps...
1440 */
1441 /**
1442 * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
1443 * it will be initialized with the specified number of OSDs in a
1444 * single host. If **num_osd** is < 0 the layout of the OSD map will
1445 * be built by reading the content of the configuration file.
1446 *
1447 * @param cct [in] in core ceph context
1448 * @param e [in] initial epoch
1449 * @param fsid [in] id of the cluster
1450 * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
1451 * @return **0** on success, negative errno on error.
1452 */
224ce89b
WB
1453private:
1454 int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
1455 int num_osd, int pg_bits, int pgp_bits,
1456 bool default_pool);
1457public:
7c673cae 1458 int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
224ce89b
WB
1459 int num_osd) {
1460 return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
1461 }
1462 int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
1463 int num_osd, int pg_bits, int pgp_bits) {
1464 return build_simple_optioned(cct, e, fsid, num_osd,
1465 pg_bits, pgp_bits, true);
1466 }
7c673cae
FG
1467 static int _build_crush_types(CrushWrapper& crush);
1468 static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
9f95a23c 1469 int num_osd, std::ostream *ss);
7c673cae
FG
1470 static int build_simple_crush_map_from_conf(CephContext *cct,
1471 CrushWrapper& crush,
9f95a23c 1472 std::ostream *ss);
31f18b77
FG
1473 static int build_simple_crush_rules(
1474 CephContext *cct, CrushWrapper& crush,
9f95a23c
TL
1475 const std::string& root,
1476 std::ostream *ss);
7c673cae 1477
3efd9988
FG
1478 bool crush_rule_in_use(int rule_id) const;
1479
9f95a23c 1480 int validate_crush_rules(CrushWrapper *crush, std::ostream *ss) const;
7c673cae
FG
1481
1482 void clear_temp() {
1483 pg_temp->clear();
1484 primary_temp->clear();
1485 }
1486
1487private:
9f95a23c 1488 void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const;
7c673cae 1489public:
9f95a23c
TL
1490 void print(std::ostream& out) const;
1491 void print_osd(int id, std::ostream& out) const;
1492 void print_osds(std::ostream& out) const;
1493 void print_pools(std::ostream& out) const;
1494 void print_summary(ceph::Formatter *f, std::ostream& out,
1495 const std::string& prefix, bool extra=false) const;
1496 void print_oneline_summary(std::ostream& out) const;
31f18b77
FG
1497
1498 enum {
c07f9fc5
FG
1499 DUMP_IN = 1, // only 'in' osds
1500 DUMP_OUT = 2, // only 'out' osds
1501 DUMP_UP = 4, // only 'up' osds
1502 DUMP_DOWN = 8, // only 'down' osds
1503 DUMP_DESTROYED = 16, // only 'destroyed' osds
31f18b77 1504 };
9f95a23c
TL
1505 void print_tree(ceph::Formatter *f, std::ostream *out,
1506 unsigned dump_flags=0, std::string bucket="") const;
7c673cae
FG
1507
1508 int summarize_mapping_stats(
1509 OSDMap *newmap,
9f95a23c 1510 const std::set<int64_t> *pools,
7c673cae 1511 std::string *out,
9f95a23c 1512 ceph::Formatter *f) const;
7c673cae 1513
9f95a23c
TL
1514 std::string get_flag_string() const;
1515 static std::string get_flag_string(unsigned flags);
7c673cae 1516 static void dump_erasure_code_profiles(
9f95a23c
TL
1517 const mempool::osdmap::map<std::string,std::map<std::string,std::string> > &profiles,
1518 ceph::Formatter *f);
1519 void dump(ceph::Formatter *f) const;
1520 void dump_osd(int id, ceph::Formatter *f) const;
1521 void dump_osds(ceph::Formatter *f) const;
1522 static void generate_test_instances(std::list<OSDMap*>& o);
7c673cae 1523 bool check_new_blacklist_entries() const { return new_blacklist_entries; }
224ce89b 1524
92f5a8d4 1525 void check_health(CephContext *cct, health_check_map_t *checks) const;
35e4c445 1526
9f95a23c
TL
1527 int parse_osd_id_list(const std::vector<std::string>& ls,
1528 std::set<int> *out,
1529 std::ostream *ss) const;
11fdf7f2
TL
1530
1531 float pool_raw_used_rate(int64_t poolid) const;
1532
7c673cae
FG
1533};
1534WRITE_CLASS_ENCODER_FEATURES(OSDMap)
1535WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)
1536
9f95a23c
TL
1537#ifdef WITH_SEASTAR
1538using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
1539#else
1540using OSDMapRef = std::shared_ptr<const OSDMap>;
1541#endif
1542
7c673cae 1543
9f95a23c 1544inline std::ostream& operator<<(std::ostream& out, const OSDMap& m) {
7c673cae
FG
1545 m.print_oneline_summary(out);
1546 return out;
1547}
1548
11fdf7f2 1549class PGMap;
31f18b77
FG
1550
1551void print_osd_utilization(const OSDMap& osdmap,
11fdf7f2 1552 const PGMap& pgmap,
9f95a23c
TL
1553 std::ostream& out,
1554 ceph::Formatter *f,
11fdf7f2 1555 bool tree,
9f95a23c 1556 const std::string& filter);
7c673cae
FG
1557
1558#endif