// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */


#ifndef CEPH_OSDMAP_H
#define CEPH_OSDMAP_H

/*
 * describes properties of the OSD cluster:
 *   disks, disk groups, total # osds, ...
 */
#include "include/types.h"
#include "osd_types.h"

//#include "include/ceph_features.h"
#include "crush/CrushWrapper.h"
#include <vector>
#include <list>
#include <set>
#include <map>
#include <memory>
#include "include/btree_map.h"

// forward declaration
class CephContext;
class CrushWrapper;
class health_check_map_t;

/*
 * we track up to two intervals during which the osd was alive and
 * healthy.  the most recent is [up_from,up_thru), where up_thru is
 * the last epoch the osd is known to have _started_, i.e. a lower
 * bound on the actual osd death.  down_at (if it is > up_from) is an
 * upper bound on the actual osd death.
 *
 * the second is the last_clean interval [first,last].  in that case,
 * last is the last epoch known to have been either _finished_, or
 * during which the osd cleanly shut down.  when possible, we push
 * this forward to the epoch the osd was eventually marked down.
 *
 * the lost_at is used to allow build_prior to proceed without waiting
 * for an osd to recover.  In certain cases, progress may be blocked
 * because an osd is down that may contain updates (i.e., a pg may have
 * gone rw during an interval).  If the osd can't be brought online, we
 * can force things to proceed knowing that we _might_ be losing some
 * acked writes.  If the osd comes back to life later, that's fine too,
 * but those writes will still be lost (the divergent objects will be
 * thrown out).
 */
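/*
 * Illustrative timeline (hypothetical epochs): an osd marked up in epoch
 * 10, known to have started through epoch 15, and marked down in epoch 18
 * has up_from=10, up_thru=15, down_at=18; it was certainly alive as of
 * epoch 15 and certainly dead by epoch 18.
 */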
struct osd_info_t {
  epoch_t last_clean_begin;  // last interval that ended with a clean osd shutdown
  epoch_t last_clean_end;
  epoch_t up_from;   // epoch osd marked up
  epoch_t up_thru;   // lower bound on actual osd death (if > up_from)
  epoch_t down_at;   // upper bound on actual osd death (if > up_from)
  epoch_t lost_at;   // last epoch we decided data was "lost"

  osd_info_t() : last_clean_begin(0), last_clean_end(0),
                 up_from(0), up_thru(0), down_at(0), lost_at(0) {}

  void dump(Formatter *f) const;
  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl);
  static void generate_test_instances(list<osd_info_t*>& o);
};
WRITE_CLASS_ENCODER(osd_info_t)

ostream& operator<<(ostream& out, const osd_info_t& info);

struct osd_xinfo_t {
  utime_t down_stamp;       ///< timestamp when we were last marked down
  float laggy_probability;  ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff = definitely laggy
  __u32 laggy_interval;     ///< average interval between being marked laggy and recovering
  uint64_t features;        ///< features supported by this osd we should know about
  __u32 old_weight;         ///< weight prior to being auto marked out

  osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
                  features(0), old_weight(0) {}

  void dump(Formatter *f) const;
  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl);
  static void generate_test_instances(list<osd_xinfo_t*>& o);
};
WRITE_CLASS_ENCODER(osd_xinfo_t)

ostream& operator<<(ostream& out, const osd_xinfo_t& xi);


struct PGTempMap {
#if 1
  bufferlist data;
  typedef btree::btree_map<pg_t,ceph_le32*> map_t;
  map_t map;

  void encode(bufferlist& bl) const {
    using ceph::encode;
    uint32_t n = map.size();
    encode(n, bl);
    for (auto &p : map) {
      encode(p.first, bl);
      bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32));
    }
  }
  void decode(bufferlist::const_iterator& p) {
    using ceph::decode;
    data.clear();
    map.clear();
    uint32_t n;
    decode(n, p);
    if (!n)
      return;
    // first pass: record each entry's pgid and the offset of its osd vector
    auto pstart = p;
    size_t start_off = pstart.get_off();
    vector<pair<pg_t,size_t>> offsets;
    offsets.resize(n);
    for (unsigned i=0; i<n; ++i) {
      pg_t pgid;
      decode(pgid, p);
      offsets[i].first = pgid;
      offsets[i].second = p.get_off() - start_off;
      uint32_t vn;
      decode(vn, p);
      p.advance(vn * sizeof(int32_t));
    }
    // keep the encoded payload in a single contiguous buffer ...
    size_t len = p.get_off() - start_off;
    pstart.copy(len, data);
    if (data.get_num_buffers() > 1) {
      data.rebuild();
    }
    //map.reserve(n);
    // ... and point each map entry directly at its in-buffer vector
    char *start = data.c_str();
    for (auto i : offsets) {
      map.insert(map.end(), make_pair(i.first, (ceph_le32*)(start + i.second)));
    }
  }
  void rebuild() {
    bufferlist bl;
    encode(bl);
    auto p = std::cbegin(bl);
    decode(p);
  }
  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
    return
      l.map.size() == r.map.size() &&
      l.data.contents_equal(r.data);
  }

  class iterator {
    map_t::const_iterator it;
    map_t::const_iterator end;
    pair<pg_t,vector<int32_t>> current;
    void init_current() {
      if (it != end) {
        current.first = it->first;
        ceph_assert(it->second);
        current.second.resize(*it->second);
        ceph_le32 *p = it->second + 1;
        for (uint32_t n = 0; n < *it->second; ++n, ++p) {
          current.second[n] = *p;
        }
      }
    }
  public:
    iterator(map_t::const_iterator p,
             map_t::const_iterator e)
      : it(p), end(e) {
      init_current();
    }

    const pair<pg_t,vector<int32_t>>& operator*() const {
      return current;
    }
    const pair<pg_t,vector<int32_t>>* operator->() const {
      return &current;
    }
    friend bool operator==(const iterator& l, const iterator& r) {
      return l.it == r.it;
    }
    friend bool operator!=(const iterator& l, const iterator& r) {
      return l.it != r.it;
    }
    iterator& operator++() {
      ++it;
      if (it != end)
        init_current();
      return *this;
    }
    iterator operator++(int) {
      iterator r = *this;
      ++it;
      if (it != end)
        init_current();
      return r;
    }
  };
  iterator begin() const {
    return iterator(map.begin(), map.end());
  }
  iterator end() const {
    return iterator(map.end(), map.end());
  }
  iterator find(pg_t pgid) const {
    return iterator(map.find(pgid), map.end());
  }
  size_t size() const {
    return map.size();
  }
  size_t count(pg_t pgid) const {
    return map.count(pgid);
  }
  void erase(pg_t pgid) {
    map.erase(pgid);
  }
  void clear() {
    map.clear();
    data.clear();
  }
  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
    using ceph::encode;
    size_t need = sizeof(ceph_le32) * (1 + v.size());
    if (need < data.get_append_buffer_unused_tail_length()) {
      bufferptr z(data.get_append_buffer_unused_tail_length());
      z.zero();
      data.append(z.c_str(), z.length());
    }
    encode(v, data);
    map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
  }
  mempool::osdmap::vector<int32_t> get(pg_t pgid) {
    mempool::osdmap::vector<int32_t> v;
    ceph_le32 *p = map[pgid];
    size_t n = *p++;
    v.resize(n);
    for (size_t i = 0; i < n; ++i, ++p) {
      v[i] = *p;
    }
    return v;
  }
#else
  // trivial implementation
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;

  void encode(bufferlist& bl) const {
    encode(pg_temp, bl);
  }
  void decode(bufferlist::const_iterator& p) {
    decode(pg_temp, p);
  }
  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
    return
      l.pg_temp.size() == r.pg_temp.size() &&
      l.pg_temp == r.pg_temp;
  }

  class iterator {
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
  public:
    iterator(mempool::osdmap::map<pg_t,
             mempool::osdmap::vector<int32_t> >::const_iterator p)
      : it(p) {}

    pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
      return *it;
    }
    const pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
      return &*it;
    }
    friend bool operator==(const iterator& l, const iterator& r) {
      return l.it == r.it;
    }
    friend bool operator!=(const iterator& l, const iterator& r) {
      return l.it != r.it;
    }
    iterator& operator++() {
      ++it;
      return *this;
    }
    iterator operator++(int) {
      iterator r = *this;
      ++it;
      return r;
    }
  };
  iterator begin() const {
    return iterator(pg_temp.cbegin());
  }
  iterator end() const {
    return iterator(pg_temp.cend());
  }
  iterator find(pg_t pgid) const {
    return iterator(pg_temp.find(pgid));
  }
  size_t size() const {
    return pg_temp.size();
  }
  size_t count(pg_t pgid) const {
    return pg_temp.count(pgid);
  }
  void erase(pg_t pgid) {
    pg_temp.erase(pgid);
  }
  void clear() {
    pg_temp.clear();
  }
  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
    pg_temp[pgid] = v;
  }
  const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
    return pg_temp.at(pgid);
  }
#endif
  void dump(Formatter *f) const {
    for (const auto &pg : *this) {
      f->open_object_section("osds");
      f->dump_stream("pgid") << pg.first;
      f->open_array_section("osds");
      for (const auto osd : pg.second)
        f->dump_int("osd", osd);
      f->close_section();
      f->close_section();
    }
  }
};
WRITE_CLASS_ENCODER(PGTempMap)
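
/*
 * Usage sketch (illustrative; the pgid and osd ids are hypothetical):
 *
 *   PGTempMap m;
 *   m.set(pgid, {0, 2, 5});    // record a temp mapping for one pg
 *   auto osds = m.get(pgid);   // -> {0, 2, 5}
 *   for (auto& p : m)          // iterates decoded (pgid, osd vector) pairs
 *     do_something(p.first, p.second);
 */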

/** OSDMap
 */
class OSDMap {
public:
  MEMPOOL_CLASS_HELPERS();

  typedef interval_set<
    snapid_t,
    mempool::osdmap::flat_map<snapid_t,snapid_t>> snap_interval_set_t;
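  // interval_set stores ranges as (start -> length) pairs; e.g. removed
  // snaps {4,5,6,10} would be held as {4: 3, 10: 1}.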

  class Incremental {
  public:
    MEMPOOL_CLASS_HELPERS();

    /// feature bits we were encoded with.  the subsequent OSDMap
    /// encoding should match.
    uint64_t encode_features;
    uuid_d fsid;
    epoch_t epoch;  // new epoch; we are a diff from epoch-1 to epoch
    utime_t modified;
    int64_t new_pool_max; // incremented by the OSDMonitor on each pool create
    int32_t new_flags;
    int8_t new_require_osd_release = -1;

    // full (rare)
    bufferlist fullmap;  // in lieu of below.
    bufferlist crush;

    // incremental
    int32_t new_max_osd;
    mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
    mempool::osdmap::map<int64_t,string> new_pool_names;
    mempool::osdmap::set<int64_t> old_pools;
    mempool::osdmap::map<string,map<string,string> > new_erasure_code_profiles;
    mempool::osdmap::vector<string> old_erasure_code_profiles;
    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
    mempool::osdmap::map<int32_t,uint32_t> new_state; // XORed onto previous state.
    mempool::osdmap::map<int32_t,uint32_t> new_weight;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove
    mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove
    mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
    mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
    mempool::osdmap::map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
    mempool::osdmap::map<int32_t,epoch_t> new_lost;
    mempool::osdmap::map<int32_t,uuid_d> new_uuid;
    mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;

    mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist;
    mempool::osdmap::vector<entity_addr_t> old_blacklist;
    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;

    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> new_pg_upmap_items;
    mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
    mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
    mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;

    mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags;
    mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags;

    string cluster_snapshot;

    float new_nearfull_ratio = -1;
    float new_backfillfull_ratio = -1;
    float new_full_ratio = -1;

    int8_t new_require_min_compat_client = -1;

    utime_t new_last_up_change, new_last_in_change;

    mutable bool have_crc;    ///< crc values are defined
    uint32_t full_crc;        ///< crc of the resulting OSDMap
    mutable uint32_t inc_crc; ///< crc of this incremental

    int get_net_marked_out(const OSDMap *previous) const;
    int get_net_marked_down(const OSDMap *previous) const;
    int identify_osd(uuid_d u) const;

    void encode_client_old(bufferlist& bl) const;
    void encode_classic(bufferlist& bl, uint64_t features) const;
    void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
    void decode_classic(bufferlist::const_iterator &p);
    void decode(bufferlist::const_iterator &bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<Incremental*>& o);

    explicit Incremental(epoch_t e=0) :
      encode_features(0),
      epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
      have_crc(false), full_crc(0), inc_crc(0) {
    }
    explicit Incremental(bufferlist &bl) {
      auto p = std::cbegin(bl);
      decode(p);
    }
    explicit Incremental(bufferlist::const_iterator &p) {
      decode(p);
    }

    pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
      if (new_pools.count(pool) == 0)
        new_pools[pool] = *orig;
      return &new_pools[pool];
    }
    bool has_erasure_code_profile(const string &name) const {
      auto i = new_erasure_code_profiles.find(name);
      return i != new_erasure_code_profiles.end();
    }
    void set_erasure_code_profile(const string &name,
                                  const map<string,string>& profile) {
      new_erasure_code_profiles[name] = profile;
    }
    mempool::osdmap::map<string,map<string,string>> get_erasure_code_profiles() const {
      return new_erasure_code_profiles;
    }

    /// propagate updated pools' snap metadata to any of their tiers
    int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base);

    /// filter out osds with any pending state changes
    size_t get_pending_state_osds(vector<int> *osds) {
      ceph_assert(osds);
      osds->clear();

      for (auto &p : new_state) {
        osds->push_back(p.first);
      }

      return osds->size();
    }

    bool pending_osd_has_state(int osd, unsigned state) {
      return new_state.count(osd) && (new_state[osd] & state) != 0;
    }

    bool pending_osd_state_set(int osd, unsigned state) {
      if (pending_osd_has_state(osd, state))
        return false;
      new_state[osd] |= state;
      return true;
    }

    // cancel the specified pending osd state if there is any.
    // return true on success, false otherwise.
    bool pending_osd_state_clear(int osd, unsigned state) {
      if (!pending_osd_has_state(osd, state)) {
        // never has been set or already has been cancelled.
        return false;
      }

      new_state[osd] &= ~state;
      if (!new_state[osd]) {
        // all flags cleared
        new_state.erase(osd);
      }
      return true;
    }
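
    /*
     * Sketch: new_state is XORed onto the previous osd_state, so setting
     * a bit here toggles that flag when the incremental is applied.  E.g.,
     * to propose marking a currently-up osd.3 down:
     *
     *   inc.pending_osd_state_set(3, CEPH_OSD_UP);    // queue the toggle
     *   inc.pending_osd_state_clear(3, CEPH_OSD_UP);  // ...or retract it
     */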

  };

private:
  uuid_d fsid;
  epoch_t epoch;             // what epoch of the osd cluster descriptor is this
  utime_t created, modified; // epoch start time
  int32_t pool_max;          // the largest pool num, ever

  uint32_t flags;

  int num_osd;    // not saved; see calc_num_osds
  int num_up_osd; // not saved; see calc_num_osds
  int num_in_osd; // not saved; see calc_num_osds

  int32_t max_osd;
  vector<uint32_t> osd_state;

  mempool::osdmap::map<int32_t,uint32_t> crush_node_flags;   // crush node -> CEPH_OSD_* flags
  mempool::osdmap::map<int32_t,uint32_t> device_class_flags; // device class -> CEPH_OSD_* flags

  utime_t last_up_change, last_in_change;

  // These features affect OSDMap[::Incremental] encoding, or the
  // encoding of some type embedded therein (CrushWrapper, something
  // from osd_types, etc.).
  static constexpr uint64_t SIGNIFICANT_FEATURES =
    CEPH_FEATUREMASK_PGID64 |
    CEPH_FEATUREMASK_PGPOOL3 |
    CEPH_FEATUREMASK_OSDENC |
    CEPH_FEATUREMASK_OSDMAP_ENC |
    CEPH_FEATUREMASK_OSD_POOLRESEND |
    CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
    CEPH_FEATUREMASK_MSG_ADDR2 |
    CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
    CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
    CEPH_FEATUREMASK_SERVER_LUMINOUS |
    CEPH_FEATUREMASK_SERVER_MIMIC |
    CEPH_FEATUREMASK_SERVER_NAUTILUS;

  struct addrs_s {
    mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs;
    mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs;
    mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs;
    mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs;
  };
  std::shared_ptr<addrs_s> osd_addrs;

  entity_addrvec_t _blank_addrvec;

  mempool::osdmap::vector<__u32> osd_weight;  // 16.16 fixed point, 0x10000 = "in", 0 = "out"
  mempool::osdmap::vector<osd_info_t> osd_info;
  std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
  std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
  std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline

  // remap (post-CRUSH, pre-up)
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
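  // Sketch of the two upmap forms: pg_upmap replaces a pg's entire raw
  // CRUSH mapping (e.g. pg -> [3,1,4]), while pg_upmap_items applies
  // pairwise substitutions (e.g. {(3,7)} swaps osd.3 for osd.7 wherever
  // osd.3 appears in the raw mapping).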

  mempool::osdmap::map<int64_t,pg_pool_t> pools;
  mempool::osdmap::map<int64_t,string> pool_name;
  mempool::osdmap::map<string,map<string,string> > erasure_code_profiles;
  mempool::osdmap::map<string,int64_t> name_pool;

  std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
  mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;

  mempool::osdmap::unordered_map<entity_addr_t,utime_t> blacklist;

  /// queue of snaps to remove
  mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue;

  /// removed_snaps additions this epoch
  mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;

  /// removed_snaps removals this epoch
  mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;

  epoch_t cluster_snapshot_epoch;
  string cluster_snapshot;
  bool new_blacklist_entries;

  float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;

  /// min compat client we want to support
  uint8_t require_min_compat_client = 0; // CEPH_RELEASE_*

public:
  /// require osds to run at least this release
  uint8_t require_osd_release = 0; // CEPH_RELEASE_*

private:
  mutable uint64_t cached_up_osd_features;

  mutable bool crc_defined;
  mutable uint32_t crc;

  void _calc_up_osd_features();

public:
  bool have_crc() const { return crc_defined; }
  uint32_t get_crc() const { return crc; }

  std::shared_ptr<CrushWrapper> crush; // hierarchical map
private:
  uint32_t crush_version = 1;

  friend class OSDMonitor;

public:
  OSDMap() : epoch(0),
             pool_max(0),
             flags(0),
             num_osd(0), num_up_osd(0), num_in_osd(0),
             max_osd(0),
             osd_addrs(std::make_shared<addrs_s>()),
             pg_temp(std::make_shared<PGTempMap>()),
             primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
             osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
             cluster_snapshot_epoch(0),
             new_blacklist_entries(false),
             cached_up_osd_features(0),
             crc_defined(false), crc(0),
             crush(std::make_shared<CrushWrapper>()) {
  }

private:
  OSDMap(const OSDMap& other) = default;
  OSDMap& operator=(const OSDMap& other) = default;
public:

  /// return feature mask subset that is relevant to OSDMap encoding
  static uint64_t get_significant_features(uint64_t features) {
    return SIGNIFICANT_FEATURES & features;
  }
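
  // e.g. two feature sets that differ only in bits outside
  // SIGNIFICANT_FEATURES reduce to the same significant subset, so an
  // OSDMap encoded for one can be reused for the other.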

  uint64_t get_encoding_features() const;

  void deepish_copy_from(const OSDMap& o) {
    *this = o;
    primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
    pg_temp.reset(new PGTempMap(*o.pg_temp));
    osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));

    if (o.osd_primary_affinity)
      osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));

    // NOTE: this still references shared entity_addrvec_t's.
    osd_addrs.reset(new addrs_s(*o.osd_addrs));

    // NOTE: we do not copy crush.  note that apply_incremental will
    // allocate a new CrushWrapper, though.
  }

  // map info
  const uuid_d& get_fsid() const { return fsid; }
  void set_fsid(uuid_d& f) { fsid = f; }

  epoch_t get_epoch() const { return epoch; }
  void inc_epoch() { epoch++; }

  void set_epoch(epoch_t e);

  uint32_t get_crush_version() const {
    return crush_version;
  }

  /* stamps etc */
  const utime_t& get_created() const { return created; }
  const utime_t& get_modified() const { return modified; }

  bool is_blacklisted(const entity_addr_t& a) const;
  bool is_blacklisted(const entity_addrvec_t& a) const;
  void get_blacklist(list<pair<entity_addr_t,utime_t > > *bl) const;
  void get_blacklist(std::set<entity_addr_t> *bl) const;

  string get_cluster_snapshot() const {
    if (cluster_snapshot_epoch == epoch)
      return cluster_snapshot;
    return string();
  }

  float get_full_ratio() const {
    return full_ratio;
  }
  float get_backfillfull_ratio() const {
    return backfillfull_ratio;
  }
  float get_nearfull_ratio() const {
    return nearfull_ratio;
  }
  void get_full_pools(CephContext *cct,
                      set<int64_t> *full,
                      set<int64_t> *backfillfull,
                      set<int64_t> *nearfull) const;
  void get_full_osd_counts(set<int> *full, set<int> *backfill,
                           set<int> *nearfull) const;


  /***** cluster state *****/
  /* osds */
  int get_max_osd() const { return max_osd; }
  void set_max_osd(int m);

  unsigned get_num_osds() const {
    return num_osd;
  }
  unsigned get_num_up_osds() const {
    return num_up_osd;
  }
  unsigned get_num_in_osds() const {
    return num_in_osd;
  }
  /// recalculate cached values for get_num{,_up,_in}_osds
  int calc_num_osds();

  void get_all_osds(set<int32_t>& ls) const;
  void get_up_osds(set<int32_t>& ls) const;
  void get_out_osds(set<int32_t>& ls) const;
  void get_out_existing_osds(std::set<int32_t>& ls) const;
  unsigned get_num_pg_temp() const {
    return pg_temp->size();
  }

  int get_flags() const { return flags; }
  bool test_flag(int f) const { return flags & f; }
  void set_flag(int f) { flags |= f; }
  void clear_flag(int f) { flags &= ~f; }

  void get_flag_set(set<string> *flagset) const;

  static void calc_state_set(int state, set<string>& st);

  int get_state(int o) const {
    ceph_assert(o < max_osd);
    return osd_state[o];
  }
  int get_state(int o, set<string>& st) const {
    ceph_assert(o < max_osd);
    unsigned t = osd_state[o];
    calc_state_set(t, st);
    return osd_state[o];
  }
  void set_state(int o, unsigned s) {
    ceph_assert(o < max_osd);
    osd_state[o] = s;
  }
  void set_weight(int o, unsigned w) {
    ceph_assert(o < max_osd);
    osd_weight[o] = w;
    if (w)
      osd_state[o] |= CEPH_OSD_EXISTS;
  }
  unsigned get_weight(int o) const {
    ceph_assert(o < max_osd);
    return osd_weight[o];
  }
  float get_weightf(int o) const {
    return (float)get_weight(o) / (float)CEPH_OSD_IN;
  }
  void adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const;

  void set_primary_affinity(int o, int w) {
    ceph_assert(o < max_osd);
    if (!osd_primary_affinity)
      osd_primary_affinity.reset(
        new mempool::osdmap::vector<__u32>(
          max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
    (*osd_primary_affinity)[o] = w;
  }
  unsigned get_primary_affinity(int o) const {
    ceph_assert(o < max_osd);
    if (!osd_primary_affinity)
      return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
    return (*osd_primary_affinity)[o];
  }
  float get_primary_affinityf(int o) const {
    return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
  }
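
  // Both osd weights and primary affinity use 16.16 fixed point: e.g.
  // set_weight(o, 0x8000) is a weight of 0.5, and get_weightf() maps
  // 0x10000 (CEPH_OSD_IN) back to 1.0.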

  bool has_erasure_code_profile(const string &name) const {
    auto i = erasure_code_profiles.find(name);
    return i != erasure_code_profiles.end();
  }
  int get_erasure_code_profile_default(CephContext *cct,
                                       map<string,string> &profile_map,
                                       ostream *ss);
  void set_erasure_code_profile(const string &name,
                                const map<string,string>& profile) {
    erasure_code_profiles[name] = profile;
  }
  const map<string,string> &get_erasure_code_profile(
    const string &name) const {
    static map<string,string> empty;
    auto i = erasure_code_profiles.find(name);
    if (i == erasure_code_profiles.end())
      return empty;
    else
      return i->second;
  }
  const mempool::osdmap::map<string,map<string,string> > &get_erasure_code_profiles() const {
    return erasure_code_profiles;
  }

  bool exists(int osd) const {
    //assert(osd >= 0);
    return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
  }

  bool is_destroyed(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
  }

  bool is_up(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
  }

  bool has_been_up_since(int osd, epoch_t epoch) const {
    return is_up(osd) && get_up_from(osd) <= epoch;
  }

  bool is_down(int osd) const {
    return !is_up(osd);
  }

  bool is_out(int osd) const {
    return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
  }

  bool is_in(int osd) const {
    return !is_out(osd);
  }

  unsigned get_osd_crush_node_flags(int osd) const;
  unsigned get_crush_node_flags(int id) const;
  unsigned get_device_class_flags(int id) const;

  bool is_noup_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
  }

  bool is_nodown_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
  }

  bool is_noin_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
  }

  bool is_noout_by_osd(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
  }

  bool is_noup(int osd) const {
    if (test_flag(CEPH_OSDMAP_NOUP)) // global?
      return true;
    if (is_noup_by_osd(osd)) // by osd?
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node?
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
        get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class?
      return true;
    return false;
  }

  bool is_nodown(int osd) const {
    if (test_flag(CEPH_OSDMAP_NODOWN))
      return true;
    if (is_nodown_by_osd(osd))
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN)
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
        get_device_class_flags(class_id) & CEPH_OSD_NODOWN)
      return true;
    return false;
  }

  bool is_noin(int osd) const {
    if (test_flag(CEPH_OSDMAP_NOIN))
      return true;
    if (is_noin_by_osd(osd))
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN)
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
        get_device_class_flags(class_id) & CEPH_OSD_NOIN)
      return true;
    return false;
  }

  bool is_noout(int osd) const {
    if (test_flag(CEPH_OSDMAP_NOOUT))
      return true;
    if (is_noout_by_osd(osd))
      return true;
    if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT)
      return true;
    if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
        get_device_class_flags(class_id) & CEPH_OSD_NOOUT)
      return true;
    return false;
  }

  /**
   * check if an entire crush subtree is down
   */
  bool subtree_is_down(int id, set<int> *down_cache) const;
  bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;

  bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
                            set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;

  int identify_osd(const entity_addr_t& addr) const;
  int identify_osd(const uuid_d& u) const;
  int identify_osd_on_all_channels(const entity_addr_t& addr) const;

  bool have_addr(const entity_addr_t& addr) const {
    return identify_osd(addr) >= 0;
  }
  int find_osd_on_ip(const entity_addr_t& ip) const;

  const entity_addrvec_t& get_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->client_addrs[osd] ?
      *osd_addrs->client_addrs[osd] : _blank_addrvec;
  }
  const entity_addrvec_t& get_most_recent_addrs(int osd) const {
    return get_addrs(osd);
  }
  const entity_addrvec_t &get_cluster_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->cluster_addrs[osd] ?
      *osd_addrs->cluster_addrs[osd] : _blank_addrvec;
  }
  const entity_addrvec_t &get_hb_back_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->hb_back_addrs[osd] ?
      *osd_addrs->hb_back_addrs[osd] : _blank_addrvec;
  }
  const entity_addrvec_t &get_hb_front_addrs(int osd) const {
    ceph_assert(exists(osd));
    return osd_addrs->hb_front_addrs[osd] ?
      *osd_addrs->hb_front_addrs[osd] : _blank_addrvec;
  }

  const uuid_d& get_uuid(int osd) const {
    ceph_assert(exists(osd));
    return (*osd_uuid)[osd];
  }

  const epoch_t& get_up_from(int osd) const {
    ceph_assert(exists(osd));
    return osd_info[osd].up_from;
  }
  const epoch_t& get_up_thru(int osd) const {
    ceph_assert(exists(osd));
    return osd_info[osd].up_thru;
  }
  const epoch_t& get_down_at(int osd) const {
    ceph_assert(exists(osd));
    return osd_info[osd].down_at;
  }
  const osd_info_t& get_info(int osd) const {
    ceph_assert(osd < max_osd);
    return osd_info[osd];
  }

  const osd_xinfo_t& get_xinfo(int osd) const {
    ceph_assert(osd < max_osd);
    return osd_xinfo[osd];
  }

  int get_next_up_osd_after(int n) const {
    if (get_max_osd() == 0)
      return -1;
    for (int i = n + 1; i != n; ++i) {
      if (i >= get_max_osd())
        i = 0;
      if (i == n)
        break;
      if (is_up(i))
        return i;
    }
    return -1;
  }

  int get_previous_up_osd_before(int n) const {
    if (get_max_osd() == 0)
      return -1;
    for (int i = n - 1; i != n; --i) {
      if (i < 0)
        i = get_max_osd() - 1;
      if (i == n)
        break;
      if (is_up(i))
        return i;
    }
    return -1;
  }


  void get_random_up_osds_by_subtree(int n,     // whoami
                                     string &subtree,
                                     int limit, // how many
                                     set<int> skip,
                                     set<int> *want) const;

  /**
   * get feature bits required by the current structure
   *
   * @param entity_type [in] what entity type we are asking about
   * @param mask [out] set of all possible map-related features we could set
   * @return feature bits used by this map
   */
  uint64_t get_features(int entity_type, uint64_t *mask) const;

  /**
   * get oldest *client* version (firefly, hammer, etc.) that can connect given
   * the feature bits required (according to get_features()).
   */
  uint8_t get_min_compat_client() const;

  /**
   * gets the required minimum *client* version that can connect to the cluster.
   */
  uint8_t get_require_min_compat_client() const;

  /**
   * get intersection of features supported by up osds
   */
  uint64_t get_up_osd_features() const;

  void get_upmap_pgs(vector<pg_t> *upmap_pgs) const;
  bool check_pg_upmaps(
    CephContext *cct,
    const vector<pg_t>& to_check,
    vector<pg_t> *to_cancel,
    map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const;
  void clean_pg_upmaps(
    CephContext *cct,
    Incremental *pending_inc,
    const vector<pg_t>& to_cancel,
    const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const;
  bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const;

  int apply_incremental(const Incremental &inc);

  /// try to re-use/reference addrs in oldmap from newmap
  static void dedup(const OSDMap *oldmap, OSDMap *newmap);

  static void clean_temps(CephContext *cct,
                          const OSDMap& oldmap,
                          const OSDMap& nextmap,
                          Incremental *pending_inc);

  // serialize, unserialize
private:
  void encode_client_old(bufferlist& bl) const;
  void encode_classic(bufferlist& bl, uint64_t features) const;
  void decode_classic(bufferlist::const_iterator& p);
  void post_decode();
public:
  void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
  void decode(bufferlist& bl);
  void decode(bufferlist::const_iterator& bl);


  /****   mapping facilities   ****/
  int map_to_pg(
    int64_t pool,
    const string& name,
    const string& key,
    const string& nspace,
    pg_t *pg) const;
  int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
                           pg_t &pg) const;
  pg_t object_locator_to_pg(const object_t& oid,
                            const object_locator_t& loc) const {
    pg_t pg;
    int ret = object_locator_to_pg(oid, loc, pg);
    ceph_assert(ret == 0);
    return pg;
  }


  static object_locator_t file_to_object_locator(const file_layout_t& layout) {
    return object_locator_t(layout.pool_id, layout.pool_ns);
  }

  ceph_object_layout file_to_object_layout(object_t oid,
                                           file_layout_t& layout) const {
    return make_object_layout(oid, layout.pool_id, layout.pool_ns);
  }

  ceph_object_layout make_object_layout(object_t oid, int pg_pool,
                                        string nspace) const;

  int get_pg_num(int pg_pool) const
  {
    const pg_pool_t *pool = get_pg_pool(pg_pool);
    ceph_assert(NULL != pool);
    return pool->get_pg_num();
  }

  bool pg_exists(pg_t pgid) const {
    const pg_pool_t *p = get_pg_pool(pgid.pool());
    return p && pgid.ps() < p->get_pg_num();
  }

  int get_pg_pool_min_size(pg_t pgid) const {
    if (!pg_exists(pgid)) {
      return -ENOENT;
    }
    const pg_pool_t *p = get_pg_pool(pgid.pool());
    ceph_assert(p);
    return p->get_min_size();
  }

  int get_pg_pool_size(pg_t pgid) const {
    if (!pg_exists(pgid)) {
      return -ENOENT;
    }
    const pg_pool_t *p = get_pg_pool(pgid.pool());
    ceph_assert(p);
    return p->get_size();
  }

  int get_pg_pool_crush_rule(pg_t pgid) const {
    if (!pg_exists(pgid)) {
      return -ENOENT;
    }
    const pg_pool_t *p = get_pg_pool(pgid.pool());
    ceph_assert(p);
    return p->get_crush_rule();
  }

private:
  /// pg -> (raw osd list)
  void _pg_to_raw_osds(
    const pg_pool_t& pool, pg_t pg,
    vector<int> *osds,
    ps_t *ppps) const;
  int _pick_primary(const vector<int>& osds) const;
  void _remove_nonexistent_osds(const pg_pool_t& pool, vector<int>& osds) const;

  void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
                               vector<int> *osds, int *primary) const;

  /// apply pg_upmap[_items] mappings
  void _apply_upmap(const pg_pool_t& pi, pg_t pg, vector<int> *raw) const;

  /// pg -> (up osd list)
  void _raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
                       vector<int> *up) const;


  /**
   * Get the pg and primary temp, if they are specified.
   * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
   * @param temp_primary [out] Will be the value in primary_temp, or a value derived
   * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
   */
  void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
                      vector<int> *temp_pg, int *temp_primary) const;

  /**
   * map to up and acting. Fills in whatever fields are non-NULL.
   */
  void _pg_to_up_acting_osds(const pg_t& pg, vector<int> *up, int *up_primary,
                             vector<int> *acting, int *acting_primary,
                             bool raw_pg_to_pg = true) const;

public:
  /***
   * This is suitable only for looking at raw CRUSH outputs. It skips
   * applying the temp and up checks and should not be used
   * by anybody for data mapping purposes.
   * raw and primary must be non-NULL
   */
  void pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const;
  void pg_to_raw_upmap(pg_t pg, vector<int> *raw,
                       vector<int> *raw_upmap) const;
  /// map a pg to its acting set
  void pg_to_acting_osds(const pg_t& pg, vector<int> *acting,
                         int *acting_primary) const {
    _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
  }
  void pg_to_acting_osds(pg_t pg, vector<int>& acting) const {
    return pg_to_acting_osds(pg, &acting, NULL);
  }
  /**
   * This does not apply temp overrides and should not be used
   * by anybody for data mapping purposes. Specify both pointers.
   */
  void pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const;
  /**
   * map a pg to its acting set as well as its up set. You must use
   * the acting set for data mapping purposes, but some users will
   * also find the up set useful for things like deciding what to
   * set as pg_temp.
   * Each of these pointers must be non-NULL.
   */
  void pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary,
                            vector<int> *acting, int *acting_primary) const {
    _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
  }
  void pg_to_up_acting_osds(pg_t pg, vector<int>& up, vector<int>& acting) const {
    int up_primary, acting_primary;
    pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
  }
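
  /*
   * Usage sketch (assuming a populated map 'osdmap' and a pg id 'pgid'):
   *
   *   vector<int> up, acting;
   *   int up_primary, acting_primary;
   *   osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary,
   *                               &acting, &acting_primary);
   *   // 'acting' (and acting_primary) is what data mapping must use;
   *   // 'up' is the CRUSH/upmap result before temp overrides.
   */
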
  bool pg_is_ec(pg_t pg) const {
    auto i = pools.find(pg.pool());
    ceph_assert(i != pools.end());
    return i->second.is_erasure();
  }
  bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
    auto i = get_pools().find(pgid.pool());
    if (i == get_pools().end()) {
      return false;
    }
    if (!i->second.is_erasure()) {
      *out = spg_t(pgid);
      return true;
    }
    int primary;
    vector<int> acting;
    pg_to_acting_osds(pgid, &acting, &primary);
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == primary) {
        *out = spg_t(pgid, shard_id_t(i));
        return true;
      }
    }
    return false;
  }
  bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
    auto i = get_pools().find(pgid.pool());
    if (i == get_pools().end()) {
      return false;
    }
    vector<int> acting;
    pg_to_acting_osds(pgid, &acting, primary);
    if (i->second.is_erasure()) {
      for (uint8_t i = 0; i < acting.size(); ++i) {
        if (acting[i] == *primary) {
          *out = spg_t(pgid, shard_id_t(i));
          return true;
        }
      }
    } else {
      *out = spg_t(pgid);
      return true;
    }
    return false;
  }

  const mempool::osdmap::map<int64_t,snap_interval_set_t>&
  get_removed_snaps_queue() const {
    return removed_snaps_queue;
  }
  const mempool::osdmap::map<int64_t,snap_interval_set_t>&
  get_new_removed_snaps() const {
    return new_removed_snaps;
  }
  const mempool::osdmap::map<int64_t,snap_interval_set_t>&
  get_new_purged_snaps() const {
    return new_purged_snaps;
  }

  int64_t lookup_pg_pool_name(const string& name) const {
    auto p = name_pool.find(name);
    if (p == name_pool.end())
      return -ENOENT;
    return p->second;
  }

  int64_t get_pool_max() const {
    return pool_max;
  }
  const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
    return pools;
  }
  mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
    return pools;
  }
  void get_pool_ids_by_rule(int rule_id, set<int64_t> *pool_ids) const {
    ceph_assert(pool_ids);
    for (auto &p: pools) {
      if (p.second.get_crush_rule() == rule_id) {
        pool_ids->insert(p.first);
      }
    }
  }
  void get_pool_ids_by_osd(CephContext *cct,
                           int osd,
                           set<int64_t> *pool_ids) const;
  const string& get_pool_name(int64_t p) const {
    auto i = pool_name.find(p);
    ceph_assert(i != pool_name.end());
    return i->second;
  }
  const mempool::osdmap::map<int64_t,string>& get_pool_names() const {
    return pool_name;
  }
  bool have_pg_pool(int64_t p) const {
    return pools.count(p);
  }
  const pg_pool_t* get_pg_pool(int64_t p) const {
    auto i = pools.find(p);
    if (i != pools.end())
      return &i->second;
    return NULL;
  }
  unsigned get_pg_size(pg_t pg) const {
    auto p = pools.find(pg.pool());
    ceph_assert(p != pools.end());
    return p->second.get_size();
  }
  int get_pg_type(pg_t pg) const {
    auto p = pools.find(pg.pool());
    ceph_assert(p != pools.end());
    return p->second.get_type();
  }


  pg_t raw_pg_to_pg(pg_t pg) const {
    auto p = pools.find(pg.pool());
    ceph_assert(p != pools.end());
    return p->second.raw_pg_to_pg(pg);
  }

  // pg -> acting primary osd
  int get_pg_acting_primary(pg_t pg) const {
    int primary = -1;
    _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
    return primary;
  }

  /*
   * check whether an spg_t maps to a particular osd
   */
  bool is_up_acting_osd_shard(spg_t pg, int osd) const {
    vector<int> up, acting;
    _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
    if (pg.shard == shard_id_t::NO_SHARD) {
      if (calc_pg_role(osd, acting, acting.size()) >= 0 ||
          calc_pg_role(osd, up, up.size()) >= 0)
        return true;
    } else {
      if (pg.shard < (int)acting.size() && acting[pg.shard] == osd)
        return true;
      if (pg.shard < (int)up.size() && up[pg.shard] == osd)
        return true;
    }
    return false;
  }


  /* what replica # is a given osd? 0 primary, -1 for none. */
  static int calc_pg_rank(int osd, const vector<int>& acting, int nrep=0);
  static int calc_pg_role(int osd, const vector<int>& acting, int nrep=0);
  static bool primary_changed(
    int oldprimary,
    const vector<int> &oldacting,
    int newprimary,
    const vector<int> &newacting);

  /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
  int get_pg_acting_rank(pg_t pg, int osd) const {
    vector<int> group;
    pg_to_acting_osds(pg, group);
    return calc_pg_rank(osd, group, group.size());
  }
  /* role is -1 (stray), 0 (primary), 1 (replica) */
  int get_pg_acting_role(const pg_t& pg, int osd) const {
    vector<int> group;
    pg_to_acting_osds(pg, group);
    return calc_pg_role(osd, group, group.size());
  }

  bool osd_is_valid_op_target(pg_t pg, int osd) const {
    int primary;
    vector<int> group;
    pg_to_acting_osds(pg, &group, &primary);
    if (osd == primary)
      return true;
    if (pg_is_ec(pg))
      return false;

    return calc_pg_role(osd, group, group.size()) >= 0;
  }

  bool try_pg_upmap(
    CephContext *cct,
    pg_t pg,                            ///< pg to potentially remap
    const set<int>& overfull,           ///< osds we'd want to evacuate
    const vector<int>& underfull,       ///< osds to move to, in order of preference
    const vector<int>& more_underfull,  ///< less full osds to move to, in order of preference
    vector<int> *orig,
    vector<int> *out);                  ///< resulting alternative mapping

  int calc_pg_upmaps(
    CephContext *cct,
    uint32_t max_deviation,   ///< max deviation from target (value >= 1)
    int max_iterations,       ///< max iterations to run
    const set<int64_t>& pools, ///< [optional] restrict to pool
    Incremental *pending_inc
    );
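
  /*
   * Sketch of typical balancer-style usage (illustrative; epoch handling
   * and error checking omitted):
   *
   *   OSDMap tmp;
   *   tmp.deepish_copy_from(osdmap);           // work on a scratch copy
   *   OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
   *   int changed = tmp.calc_pg_upmaps(cct, 1, 100, pools, &pending_inc);
   *   // 'changed' pg_upmap[_items] entries are now queued in pending_inc
   */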

  int get_osds_by_bucket_name(const string &name, set<int> *osds) const;

  bool have_pg_upmaps(pg_t pg) const {
    return pg_upmap.count(pg) ||
      pg_upmap_items.count(pg);
  }

  /*
   * handy helpers to build simple maps...
   */
  /**
   * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
   * it will be initialized with the specified number of OSDs in a
   * single host. If **num_osd** is < 0 the layout of the OSD map will
   * be built by reading the content of the configuration file.
   *
   * @param cct [in] in core ceph context
   * @param e [in] initial epoch
   * @param fsid [in] id of the cluster
   * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
   * @return **0** on success, negative errno on error.
   */
private:
  int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
                            int num_osd, int pg_bits, int pgp_bits,
                            bool default_pool);
public:
  int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
                   int num_osd) {
    return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
  }
  int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
                             int num_osd, int pg_bits, int pgp_bits) {
    return build_simple_optioned(cct, e, fsid, num_osd,
                                 pg_bits, pgp_bits, true);
  }
  static int _build_crush_types(CrushWrapper& crush);
  static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
                                    int num_osd, ostream *ss);
  static int build_simple_crush_map_from_conf(CephContext *cct,
                                              CrushWrapper& crush,
                                              ostream *ss);
  static int build_simple_crush_rules(
    CephContext *cct, CrushWrapper& crush,
    const string& root,
    ostream *ss);

  bool crush_rule_in_use(int rule_id) const;

  int validate_crush_rules(CrushWrapper *crush, ostream *ss) const;

  void clear_temp() {
    pg_temp->clear();
    primary_temp->clear();
  }

private:
  void print_osd_line(int cur, ostream *out, Formatter *f) const;
public:
  void print(ostream& out) const;
  void print_pools(ostream& out) const;
  void print_summary(Formatter *f, ostream& out, const string& prefix, bool extra=false) const;
  void print_oneline_summary(ostream& out) const;

  enum {
    DUMP_IN = 1,         // only 'in' osds
    DUMP_OUT = 2,        // only 'out' osds
    DUMP_UP = 4,         // only 'up' osds
    DUMP_DOWN = 8,       // only 'down' osds
    DUMP_DESTROYED = 16, // only 'destroyed' osds
  };
  void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0, string bucket="") const;

  int summarize_mapping_stats(
    OSDMap *newmap,
    const set<int64_t> *pools,
    std::string *out,
    Formatter *f) const;

  string get_flag_string() const;
  static string get_flag_string(unsigned flags);
  static void dump_erasure_code_profiles(
    const mempool::osdmap::map<string,map<string,string> > &profiles,
    Formatter *f);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<OSDMap*>& o);
  bool check_new_blacklist_entries() const { return new_blacklist_entries; }

  void check_health(CephContext *cct, health_check_map_t *checks) const;

  int parse_osd_id_list(const vector<string>& ls,
                        set<int> *out,
                        ostream *ss) const;

  float pool_raw_used_rate(int64_t poolid) const;

};
WRITE_CLASS_ENCODER_FEATURES(OSDMap)
WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)

typedef std::shared_ptr<const OSDMap> OSDMapRef;

inline ostream& operator<<(ostream& out, const OSDMap& m) {
  m.print_oneline_summary(out);
  return out;
}

class PGMap;

void print_osd_utilization(const OSDMap& osdmap,
                           const PGMap& pgmap,
                           ostream& out,
                           Formatter *f,
                           bool tree,
                           const string& class_name,
                           const string& item_name);

#endif