1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #ifndef CEPH_OSDMAP_H
20 #define CEPH_OSDMAP_H
21
22 /*
23  * describes properties of the OSD cluster:
24  * disks, disk groups, total # of osds.
25 *
26 */
27 #include "include/types.h"
28 #include "osd_types.h"
29
30 //#include "include/ceph_features.h"
31 #include "crush/CrushWrapper.h"
32 #include <vector>
33 #include <list>
34 #include <set>
35 #include <map>
36 #include <memory>
37 #include "include/btree_map.h"
38
39 // forward declaration
40 class CephContext;
41 class CrushWrapper;
42 class health_check_map_t;
43
44 /*
45 * we track up to two intervals during which the osd was alive and
46 * healthy. the most recent is [up_from,up_thru), where up_thru is
47 * the last epoch the osd is known to have _started_. i.e., a lower
48 * bound on the actual osd death. down_at (if it is > up_from) is an
49 * upper bound on the actual osd death.
50 *
51 * the second is the last_clean interval [first,last]. in that case,
52 * the last interval is the last epoch known to have been either
53 * _finished_, or during which the osd cleanly shut down. when
54 * possible, we push this forward to the epoch the osd was eventually
55 * marked down.
56 *
57 * the lost_at is used to allow build_prior to proceed without waiting
58 * for an osd to recover. In certain cases, progress may be blocked
59 * because an osd is down that may contain updates (i.e., a pg may have
60 * gone rw during an interval). If the osd can't be brought online, we
61 * can force things to proceed knowing that we _might_ be losing some
62  * acked writes. If the osd comes back to life later, that's fine too,
63 * but those writes will still be lost (the divergent objects will be
64 * thrown out).
65 */
66 struct osd_info_t {
67 epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown
68 epoch_t last_clean_end;
69 epoch_t up_from; // epoch osd marked up
70 epoch_t up_thru; // lower bound on actual osd death (if > up_from)
71 epoch_t down_at; // upper bound on actual osd death (if > up_from)
72 epoch_t lost_at; // last epoch we decided data was "lost"
73
74 osd_info_t() : last_clean_begin(0), last_clean_end(0),
75 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
76
77 void dump(Formatter *f) const;
78 void encode(bufferlist& bl) const;
79 void decode(bufferlist::const_iterator& bl);
80 static void generate_test_instances(list<osd_info_t*>& o);
81 };
82 WRITE_CLASS_ENCODER(osd_info_t)
83
84 ostream& operator<<(ostream& out, const osd_info_t& info);
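//
// Worked example of the bounds above (illustrative only, not upstream text):
// an osd with up_from=10, up_thru=15 and down_at=20 came up in epoch 10 and
// died no earlier than epoch 15 (the lower bound) and no later than epoch 20
// (the upper bound).
//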
85
86 struct osd_xinfo_t {
87 utime_t down_stamp; ///< timestamp when we were last marked down
88   float laggy_probability;        ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff = definitely laggy
89 __u32 laggy_interval; ///< average interval between being marked laggy and recovering
90 uint64_t features; ///< features supported by this osd we should know about
91 __u32 old_weight; ///< weight prior to being auto marked out
92
93 osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
94 features(0), old_weight(0) {}
95
96 void dump(Formatter *f) const;
97 void encode(bufferlist& bl) const;
98 void decode(bufferlist::const_iterator& bl);
99 static void generate_test_instances(list<osd_xinfo_t*>& o);
100 };
101 WRITE_CLASS_ENCODER(osd_xinfo_t)
102
103 ostream& operator<<(ostream& out, const osd_xinfo_t& xi);
104
105
106 struct PGTempMap {
107 #if 1
108 bufferlist data;
109 typedef btree::btree_map<pg_t,int32_t*> map_t;
110 map_t map;
111
112 void encode(bufferlist& bl) const {
113 using ceph::encode;
114 uint32_t n = map.size();
115 encode(n, bl);
116 for (auto &p : map) {
117 encode(p.first, bl);
118 bl.append((char*)p.second, (*p.second + 1) * sizeof(int32_t));
119 }
120 }
121 void decode(bufferlist::const_iterator& p) {
122 using ceph::decode;
123 data.clear();
124 map.clear();
125 uint32_t n;
126 decode(n, p);
127 if (!n)
128 return;
129 auto pstart = p;
130 size_t start_off = pstart.get_off();
131 vector<pair<pg_t,size_t>> offsets;
132 offsets.resize(n);
133 for (unsigned i=0; i<n; ++i) {
134 pg_t pgid;
135 decode(pgid, p);
136 offsets[i].first = pgid;
137 offsets[i].second = p.get_off() - start_off;
138 uint32_t vn;
139 decode(vn, p);
140 p.advance(vn * sizeof(int32_t));
141 }
142 size_t len = p.get_off() - start_off;
143 pstart.copy(len, data);
144 if (data.get_num_buffers() > 1) {
145 data.rebuild();
146 }
147 //map.reserve(n);
148 char *start = data.c_str();
149 for (auto i : offsets) {
150 map.insert(map.end(), make_pair(i.first, (int32_t*)(start + i.second)));
151 }
152 }
153 void rebuild() {
154 bufferlist bl;
155 encode(bl);
156 auto p = std::cbegin(bl);
157 decode(p);
158 }
159 friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
160 return
161 l.map.size() == r.map.size() &&
162 l.data.contents_equal(r.data);
163 }
164
165 class iterator {
166 map_t::const_iterator it;
167 map_t::const_iterator end;
168 pair<pg_t,vector<int32_t>> current;
169 void init_current() {
170 if (it != end) {
171 current.first = it->first;
172 ceph_assert(it->second);
173 current.second.resize(*it->second);
174 int32_t *p = it->second + 1;
175 for (int n = 0; n < *it->second; ++n, ++p) {
176 current.second[n] = *p;
177 }
178 }
179 }
180 public:
181 iterator(map_t::const_iterator p,
182 map_t::const_iterator e)
183 : it(p), end(e) {
184 init_current();
185 }
186
187 const pair<pg_t,vector<int32_t>>& operator*() const {
188 return current;
189 }
190 const pair<pg_t,vector<int32_t>>* operator->() const {
191 return &current;
192 }
193 friend bool operator==(const iterator& l, const iterator& r) {
194 return l.it == r.it;
195 }
196 friend bool operator!=(const iterator& l, const iterator& r) {
197 return l.it != r.it;
198 }
199 iterator& operator++() {
200 ++it;
201 if (it != end)
202 init_current();
203 return *this;
204 }
205 iterator operator++(int) {
206 iterator r = *this;
207 ++it;
208 if (it != end)
209 init_current();
210 return r;
211 }
212 };
213 iterator begin() const {
214 return iterator(map.begin(), map.end());
215 }
216 iterator end() const {
217 return iterator(map.end(), map.end());
218 }
219 iterator find(pg_t pgid) const {
220 return iterator(map.find(pgid), map.end());
221 }
222 size_t size() const {
223 return map.size();
224 }
225 size_t count(pg_t pgid) const {
226 return map.count(pgid);
227 }
228 void erase(pg_t pgid) {
229 map.erase(pgid);
230 }
231 void clear() {
232 map.clear();
233 data.clear();
234 }
235 void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
236 using ceph::encode;
237 size_t need = sizeof(int32_t) * (1 + v.size());
238 if (need < data.get_append_buffer_unused_tail_length()) {
239 bufferptr z(data.get_append_buffer_unused_tail_length());
240 z.zero();
241 data.append(z.c_str(), z.length());
242 }
243 encode(v, data);
244 map[pgid] = (int32_t*)(data.back().end_c_str()) - (1 + v.size());
245 }
246 mempool::osdmap::vector<int32_t> get(pg_t pgid) {
247 mempool::osdmap::vector<int32_t> v;
248 int32_t *p = map[pgid];
249 size_t n = *p++;
250 v.resize(n);
251 for (size_t i = 0; i < n; ++i, ++p) {
252 v[i] = *p;
253 }
254 return v;
255 }
256 #else
257 // trivial implementation
258 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;
259
260 void encode(bufferlist& bl) const {
261 encode(pg_temp, bl);
262 }
263 void decode(bufferlist::const_iterator& p) {
264 decode(pg_temp, p);
265 }
266 friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
267 return
268 l.pg_temp.size() == r.pg_temp.size() &&
269 l.pg_temp == r.pg_temp;
270 }
271
272 class iterator {
273 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
274 public:
275 iterator(mempool::osdmap::map<pg_t,
276 mempool::osdmap::vector<int32_t> >::const_iterator p)
277 : it(p) {}
278
279 pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
280 return *it;
281 }
282 const pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
283 return &*it;
284 }
285 friend bool operator==(const iterator& l, const iterator& r) {
286 return l.it == r.it;
287 }
288 friend bool operator!=(const iterator& l, const iterator& r) {
289 return l.it != r.it;
290 }
291 iterator& operator++() {
292 ++it;
293 return *this;
294 }
295 iterator operator++(int) {
296 iterator r = *this;
297 ++it;
298 return r;
299 }
300 };
301 iterator begin() const {
302 return iterator(pg_temp.cbegin());
303 }
304 iterator end() const {
305 return iterator(pg_temp.cend());
306 }
307 iterator find(pg_t pgid) const {
308 return iterator(pg_temp.find(pgid));
309 }
310 size_t size() const {
311 return pg_temp.size();
312 }
313 size_t count(pg_t pgid) const {
314 return pg_temp.count(pgid);
315 }
316 void erase(pg_t pgid) {
317 pg_temp.erase(pgid);
318 }
319 void clear() {
320 pg_temp.clear();
321 }
322 void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
323 pg_temp[pgid] = v;
324 }
325 const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
326 return pg_temp.at(pgid);
327 }
328 #endif
329 void dump(Formatter *f) const {
330 for (const auto &pg : *this) {
331 f->open_object_section("osds");
332 f->dump_stream("pgid") << pg.first;
333 f->open_array_section("osds");
334 for (const auto osd : pg.second)
335 f->dump_int("osd", osd);
336 f->close_section();
337 f->close_section();
338 }
339 }
340 };
341 WRITE_CLASS_ENCODER(PGTempMap)
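//
// A minimal usage sketch of PGTempMap (not part of the upstream header; the
// pool id, pg seed and osd ids below are made up for illustration):
//
//   PGTempMap tmp;
//   mempool::osdmap::vector<int32_t> osds = {3, 1, 4};
//   tmp.set(pg_t(0, 1), osds);                 // temp acting set for pg 1.0
//   ceph_assert(tmp.count(pg_t(0, 1)) == 1);
//   auto v = tmp.get(pg_t(0, 1));              // copies {3, 1, 4} back out
//
//   bufferlist bl;                             // round-trip through the
//   tmp.encode(bl);                            // compact encoding
//   auto p = std::cbegin(bl);
//   tmp.decode(p);
//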
342
343 /** OSDMap
344 */
345 class OSDMap {
346 public:
347 MEMPOOL_CLASS_HELPERS();
348
349 typedef interval_set<
350 snapid_t,
351 mempool::osdmap::flat_map<snapid_t,snapid_t>> snap_interval_set_t;
352
353 class Incremental {
354 public:
355 MEMPOOL_CLASS_HELPERS();
356
357 /// feature bits we were encoded with. the subsequent OSDMap
358 /// encoding should match.
359 uint64_t encode_features;
360 uuid_d fsid;
361 epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch
362 utime_t modified;
363 int64_t new_pool_max; //incremented by the OSDMonitor on each pool create
364 int32_t new_flags;
365 int8_t new_require_osd_release = -1;
366
367 // full (rare)
368 bufferlist fullmap; // in lieu of below.
369 bufferlist crush;
370
371 // incremental
372 int32_t new_max_osd;
373 mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
374 mempool::osdmap::map<int64_t,string> new_pool_names;
375 mempool::osdmap::set<int64_t> old_pools;
376 mempool::osdmap::map<string,map<string,string> > new_erasure_code_profiles;
377 mempool::osdmap::vector<string> old_erasure_code_profiles;
378 mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
379 mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
380 mempool::osdmap::map<int32_t,uint32_t> new_state; // XORed onto previous state.
381 mempool::osdmap::map<int32_t,uint32_t> new_weight;
382 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove
383 mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove
384 mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
385 mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
386 mempool::osdmap::map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
387 mempool::osdmap::map<int32_t,epoch_t> new_lost;
388 mempool::osdmap::map<int32_t,uuid_d> new_uuid;
389 mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;
390
391 mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist;
392 mempool::osdmap::vector<entity_addr_t> old_blacklist;
393 mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
394 mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;
395
396 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
397 mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> new_pg_upmap_items;
398 mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
399 mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
400 mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
401
402 string cluster_snapshot;
403
404 float new_nearfull_ratio = -1;
405 float new_backfillfull_ratio = -1;
406 float new_full_ratio = -1;
407
408 int8_t new_require_min_compat_client = -1;
409
410 utime_t new_last_up_change, new_last_in_change;
411
412 mutable bool have_crc; ///< crc values are defined
413 uint32_t full_crc; ///< crc of the resulting OSDMap
414 mutable uint32_t inc_crc; ///< crc of this incremental
415
416 int get_net_marked_out(const OSDMap *previous) const;
417 int get_net_marked_down(const OSDMap *previous) const;
418 int identify_osd(uuid_d u) const;
419
420 void encode_client_old(bufferlist& bl) const;
421 void encode_classic(bufferlist& bl, uint64_t features) const;
422 void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
423 void decode_classic(bufferlist::const_iterator &p);
424 void decode(bufferlist::const_iterator &bl);
425 void dump(Formatter *f) const;
426 static void generate_test_instances(list<Incremental*>& o);
427
428 explicit Incremental(epoch_t e=0) :
429 encode_features(0),
430 epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
431 have_crc(false), full_crc(0), inc_crc(0) {
432 }
433 explicit Incremental(bufferlist &bl) {
434 auto p = std::cbegin(bl);
435 decode(p);
436 }
437 explicit Incremental(bufferlist::const_iterator &p) {
438 decode(p);
439 }
440
441 pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
442 if (new_pools.count(pool) == 0)
443 new_pools[pool] = *orig;
444 return &new_pools[pool];
445 }
446 bool has_erasure_code_profile(const string &name) const {
447 auto i = new_erasure_code_profiles.find(name);
448 return i != new_erasure_code_profiles.end();
449 }
450 void set_erasure_code_profile(const string &name,
451 const map<string,string>& profile) {
452 new_erasure_code_profiles[name] = profile;
453 }
454 mempool::osdmap::map<string,map<string,string>> get_erasure_code_profiles() const {
455 return new_erasure_code_profiles;
456 }
457
458     /// propagate updated pools' snap metadata to any of their tiers
459 int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base);
460
461     /// get osds with any pending state change
462 size_t get_pending_state_osds(vector<int> *osds) {
463 ceph_assert(osds);
464 osds->clear();
465
466 for (auto &p : new_state) {
467 osds->push_back(p.first);
468 }
469
470 return osds->size();
471 }
472
473 bool pending_osd_has_state(int osd, unsigned state) {
474 return new_state.count(osd) && (new_state[osd] & state) != 0;
475 }
476
477 void pending_osd_state_set(int osd, unsigned state) {
478 new_state[osd] |= state;
479 }
480
481     // cancel the specified pending osd state if there is any;
482     // return true on success, false otherwise.
483 bool pending_osd_state_clear(int osd, unsigned state) {
484 if (!pending_osd_has_state(osd, state)) {
485 // never has been set or already has been cancelled.
486 return false;
487 }
488
489 new_state[osd] &= ~state;
490 if (!new_state[osd]) {
491 // all flags cleared
492 new_state.erase(osd);
493 }
494 return true;
495 }
496
497 };
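  //
  // A minimal sketch of driving an Incremental from caller code (illustrative
  // only, not upstream; 'osdmap' stands for an existing OSDMap, and the osd id
  // and use of CEPH_OSD_NOUP are assumptions):
  //
  //   OSDMap::Incremental inc(osdmap.get_epoch() + 1);
  //   inc.fsid = osdmap.get_fsid();
  //   inc.pending_osd_state_set(0, CEPH_OSD_NOUP);     // queue a state change
  //   inc.pending_osd_state_clear(0, CEPH_OSD_NOUP);   // ...and cancel it again
  //   osdmap.apply_incremental(inc);                   // advance one epoch
  //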
498
499 private:
500 uuid_d fsid;
501 epoch_t epoch; // what epoch of the osd cluster descriptor is this
502 utime_t created, modified; // epoch start time
503 int32_t pool_max; // the largest pool num, ever
504
505 uint32_t flags;
506
507 int num_osd; // not saved; see calc_num_osds
508 int num_up_osd; // not saved; see calc_num_osds
509 int num_in_osd; // not saved; see calc_num_osds
510
511 int32_t max_osd;
512 vector<uint32_t> osd_state;
513
514 utime_t last_up_change, last_in_change;
515
516 // These features affect OSDMap[::Incremental] encoding, or the
517 // encoding of some type embedded therein (CrushWrapper, something
518 // from osd_types, etc.).
519 static constexpr uint64_t SIGNIFICANT_FEATURES =
520 CEPH_FEATUREMASK_PGID64 |
521 CEPH_FEATUREMASK_PGPOOL3 |
522 CEPH_FEATUREMASK_OSDENC |
523 CEPH_FEATUREMASK_OSDMAP_ENC |
524 CEPH_FEATUREMASK_OSD_POOLRESEND |
525 CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
526 CEPH_FEATUREMASK_MSG_ADDR2 |
527 CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
528 CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
529 CEPH_FEATUREMASK_SERVER_LUMINOUS |
530 CEPH_FEATUREMASK_SERVER_MIMIC |
531 CEPH_FEATUREMASK_SERVER_NAUTILUS;
532
533 struct addrs_s {
534 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs;
535 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs;
536 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs;
537 mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs;
538 };
539 std::shared_ptr<addrs_s> osd_addrs;
540
541 entity_addrvec_t _blank_addrvec;
542
543 mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
544 mempool::osdmap::vector<osd_info_t> osd_info;
545 std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
546 std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
547 std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
548
549 // remap (post-CRUSH, pre-up)
550 mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
551 mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
552
553 mempool::osdmap::map<int64_t,pg_pool_t> pools;
554 mempool::osdmap::map<int64_t,string> pool_name;
555 mempool::osdmap::map<string,map<string,string> > erasure_code_profiles;
556 mempool::osdmap::map<string,int64_t> name_pool;
557
558 std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
559 mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;
560
561 mempool::osdmap::unordered_map<entity_addr_t,utime_t> blacklist;
562
563 /// queue of snaps to remove
564 mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue;
565
566 /// removed_snaps additions this epoch
567 mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
568
569 /// removed_snaps removals this epoch
570 mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
571
572 epoch_t cluster_snapshot_epoch;
573 string cluster_snapshot;
574 bool new_blacklist_entries;
575
576 float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
577
578 /// min compat client we want to support
579 uint8_t require_min_compat_client = 0; // CEPH_RELEASE_*
580
581 public:
582 /// require osds to run at least this release
583 uint8_t require_osd_release = 0; // CEPH_RELEASE_*
584
585 private:
586 mutable uint64_t cached_up_osd_features;
587
588 mutable bool crc_defined;
589 mutable uint32_t crc;
590
591 void _calc_up_osd_features();
592
593 public:
594 bool have_crc() const { return crc_defined; }
595 uint32_t get_crc() const { return crc; }
596
597 std::shared_ptr<CrushWrapper> crush; // hierarchical map
598 private:
599 uint32_t crush_version = 1;
600
601 friend class OSDMonitor;
602
603 public:
604 OSDMap() : epoch(0),
605 pool_max(0),
606 flags(0),
607 num_osd(0), num_up_osd(0), num_in_osd(0),
608 max_osd(0),
609 osd_addrs(std::make_shared<addrs_s>()),
610 pg_temp(std::make_shared<PGTempMap>()),
611 primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
612 osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
613 cluster_snapshot_epoch(0),
614 new_blacklist_entries(false),
615 cached_up_osd_features(0),
616 crc_defined(false), crc(0),
617 crush(std::make_shared<CrushWrapper>()) {
618 }
619
620 private:
621 OSDMap(const OSDMap& other) = default;
622 OSDMap& operator=(const OSDMap& other) = default;
623 public:
624
625 /// return feature mask subset that is relevant to OSDMap encoding
626 static uint64_t get_significant_features(uint64_t features) {
627 return SIGNIFICANT_FEATURES & features;
628 }
629
630 uint64_t get_encoding_features() const;
631
632 void deepish_copy_from(const OSDMap& o) {
633 *this = o;
634 primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
635 pg_temp.reset(new PGTempMap(*o.pg_temp));
636 osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));
637
638 if (o.osd_primary_affinity)
639 osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));
640
641 // NOTE: this still references shared entity_addrvec_t's.
642 osd_addrs.reset(new addrs_s(*o.osd_addrs));
643
644 // NOTE: we do not copy crush. note that apply_incremental will
645 // allocate a new CrushWrapper, though.
646 }
647
648 // map info
649 const uuid_d& get_fsid() const { return fsid; }
650 void set_fsid(uuid_d& f) { fsid = f; }
651
652 epoch_t get_epoch() const { return epoch; }
653 void inc_epoch() { epoch++; }
654
655 void set_epoch(epoch_t e);
656
657 uint32_t get_crush_version() const {
658 return crush_version;
659 }
660
661 /* stamps etc */
662 const utime_t& get_created() const { return created; }
663 const utime_t& get_modified() const { return modified; }
664
665 bool is_blacklisted(const entity_addr_t& a) const;
666 bool is_blacklisted(const entity_addrvec_t& a) const;
667 void get_blacklist(list<pair<entity_addr_t,utime_t > > *bl) const;
668 void get_blacklist(std::set<entity_addr_t> *bl) const;
669
670 string get_cluster_snapshot() const {
671 if (cluster_snapshot_epoch == epoch)
672 return cluster_snapshot;
673 return string();
674 }
675
676 float get_full_ratio() const {
677 return full_ratio;
678 }
679 float get_backfillfull_ratio() const {
680 return backfillfull_ratio;
681 }
682 float get_nearfull_ratio() const {
683 return nearfull_ratio;
684 }
685 void get_full_pools(CephContext *cct,
686 set<int64_t> *full,
687 set<int64_t> *backfillfull,
688 set<int64_t> *nearfull) const;
689 void get_full_osd_counts(set<int> *full, set<int> *backfill,
690 set<int> *nearfull) const;
691
692
693 /***** cluster state *****/
694 /* osds */
695 int get_max_osd() const { return max_osd; }
696 void set_max_osd(int m);
697
698 unsigned get_num_osds() const {
699 return num_osd;
700 }
701 unsigned get_num_up_osds() const {
702 return num_up_osd;
703 }
704 unsigned get_num_in_osds() const {
705 return num_in_osd;
706 }
707 /// recalculate cached values for get_num{,_up,_in}_osds
708 int calc_num_osds();
709
710 void get_all_osds(set<int32_t>& ls) const;
711 void get_up_osds(set<int32_t>& ls) const;
712 void get_out_osds(set<int32_t>& ls) const;
713 unsigned get_num_pg_temp() const {
714 return pg_temp->size();
715 }
716
717 int get_flags() const { return flags; }
718 bool test_flag(int f) const { return flags & f; }
719 void set_flag(int f) { flags |= f; }
720 void clear_flag(int f) { flags &= ~f; }
721
722 void get_flag_set(set<string> *flagset) const;
723
724 static void calc_state_set(int state, set<string>& st);
725
726 int get_state(int o) const {
727 ceph_assert(o < max_osd);
728 return osd_state[o];
729 }
730 int get_state(int o, set<string>& st) const {
731 ceph_assert(o < max_osd);
732 unsigned t = osd_state[o];
733 calc_state_set(t, st);
734 return osd_state[o];
735 }
736 void set_state(int o, unsigned s) {
737 ceph_assert(o < max_osd);
738 osd_state[o] = s;
739 }
740 void set_weight(int o, unsigned w) {
741 ceph_assert(o < max_osd);
742 osd_weight[o] = w;
743 if (w)
744 osd_state[o] |= CEPH_OSD_EXISTS;
745 }
746 unsigned get_weight(int o) const {
747 ceph_assert(o < max_osd);
748 return osd_weight[o];
749 }
750 float get_weightf(int o) const {
751 return (float)get_weight(o) / (float)CEPH_OSD_IN;
752 }
753 void adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const;
754
755 void set_primary_affinity(int o, int w) {
756 ceph_assert(o < max_osd);
757 if (!osd_primary_affinity)
758 osd_primary_affinity.reset(
759 new mempool::osdmap::vector<__u32>(
760 max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
761 (*osd_primary_affinity)[o] = w;
762 }
763 unsigned get_primary_affinity(int o) const {
764 ceph_assert(o < max_osd);
765 if (!osd_primary_affinity)
766 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
767 return (*osd_primary_affinity)[o];
768 }
769 float get_primary_affinityf(int o) const {
770 return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
771 }
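  //
  // Illustrative only (the osd id is an assumption): halving osd.3's primary
  // affinity and reading it back as a float:
  //
  //   osdmap.set_primary_affinity(3, CEPH_OSD_MAX_PRIMARY_AFFINITY / 2);
  //   float p = osdmap.get_primary_affinityf(3);   // ~0.5
  //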
772
773 bool has_erasure_code_profile(const string &name) const {
774 auto i = erasure_code_profiles.find(name);
775 return i != erasure_code_profiles.end();
776 }
777 int get_erasure_code_profile_default(CephContext *cct,
778 map<string,string> &profile_map,
779 ostream *ss);
780 void set_erasure_code_profile(const string &name,
781 const map<string,string>& profile) {
782 erasure_code_profiles[name] = profile;
783 }
784 const map<string,string> &get_erasure_code_profile(
785 const string &name) const {
786 static map<string,string> empty;
787 auto i = erasure_code_profiles.find(name);
788 if (i == erasure_code_profiles.end())
789 return empty;
790 else
791 return i->second;
792 }
793 const mempool::osdmap::map<string,map<string,string> > &get_erasure_code_profiles() const {
794 return erasure_code_profiles;
795 }
796
797 bool exists(int osd) const {
798 //assert(osd >= 0);
799 return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
800 }
801
802 bool is_destroyed(int osd) const {
803 return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
804 }
805
806 bool is_up(int osd) const {
807 return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
808 }
809
810 bool has_been_up_since(int osd, epoch_t epoch) const {
811 return is_up(osd) && get_up_from(osd) <= epoch;
812 }
813
814 bool is_down(int osd) const {
815 return !is_up(osd);
816 }
817
818 bool is_out(int osd) const {
819 return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
820 }
821
822 bool is_in(int osd) const {
823 return !is_out(osd);
824 }
825
826 bool is_noup(int osd) const {
827 return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
828 }
829
830 bool is_nodown(int osd) const {
831 return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
832 }
833
834 bool is_noin(int osd) const {
835 return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
836 }
837
838 bool is_noout(int osd) const {
839 return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
840 }
841
842 void get_noup_osds(vector<int> *osds) const {
843 ceph_assert(osds);
844 osds->clear();
845
846 for (int i = 0; i < max_osd; i++) {
847 if (is_noup(i)) {
848 osds->push_back(i);
849 }
850 }
851 }
852
853 void get_nodown_osds(vector<int> *osds) const {
854 ceph_assert(osds);
855 osds->clear();
856
857 for (int i = 0; i < max_osd; i++) {
858 if (is_nodown(i)) {
859 osds->push_back(i);
860 }
861 }
862 }
863
864 void get_noin_osds(vector<int> *osds) const {
865 ceph_assert(osds);
866 osds->clear();
867
868 for (int i = 0; i < max_osd; i++) {
869 if (is_noin(i)) {
870 osds->push_back(i);
871 }
872 }
873 }
874
875 void get_noout_osds(vector<int> *osds) const {
876 ceph_assert(osds);
877 osds->clear();
878
879 for (int i = 0; i < max_osd; i++) {
880 if (is_noout(i)) {
881 osds->push_back(i);
882 }
883 }
884 }
885
886 /**
887 * check if an entire crush subtree is down
888 */
889 bool subtree_is_down(int id, set<int> *down_cache) const;
890 bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
891
892 bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
893 set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;
894
895 int identify_osd(const entity_addr_t& addr) const;
896 int identify_osd(const uuid_d& u) const;
897 int identify_osd_on_all_channels(const entity_addr_t& addr) const;
898
899 bool have_addr(const entity_addr_t& addr) const {
900 return identify_osd(addr) >= 0;
901 }
902 int find_osd_on_ip(const entity_addr_t& ip) const;
903
904 const entity_addrvec_t& get_addrs(int osd) const {
905 ceph_assert(exists(osd));
906 return osd_addrs->client_addrs[osd] ?
907 *osd_addrs->client_addrs[osd] : _blank_addrvec;
908 }
909 const entity_addrvec_t& get_most_recent_addrs(int osd) const {
910 return get_addrs(osd);
911 }
912 const entity_addrvec_t &get_cluster_addrs(int osd) const {
913 ceph_assert(exists(osd));
914 return osd_addrs->cluster_addrs[osd] ?
915 *osd_addrs->cluster_addrs[osd] : _blank_addrvec;
916 }
917 const entity_addrvec_t &get_hb_back_addrs(int osd) const {
918 ceph_assert(exists(osd));
919 return osd_addrs->hb_back_addrs[osd] ?
920 *osd_addrs->hb_back_addrs[osd] : _blank_addrvec;
921 }
922 const entity_addrvec_t &get_hb_front_addrs(int osd) const {
923 ceph_assert(exists(osd));
924 return osd_addrs->hb_front_addrs[osd] ?
925 *osd_addrs->hb_front_addrs[osd] : _blank_addrvec;
926 }
927
928 const uuid_d& get_uuid(int osd) const {
929 ceph_assert(exists(osd));
930 return (*osd_uuid)[osd];
931 }
932
933 const epoch_t& get_up_from(int osd) const {
934 ceph_assert(exists(osd));
935 return osd_info[osd].up_from;
936 }
937 const epoch_t& get_up_thru(int osd) const {
938 ceph_assert(exists(osd));
939 return osd_info[osd].up_thru;
940 }
941 const epoch_t& get_down_at(int osd) const {
942 ceph_assert(exists(osd));
943 return osd_info[osd].down_at;
944 }
945 const osd_info_t& get_info(int osd) const {
946 ceph_assert(osd < max_osd);
947 return osd_info[osd];
948 }
949
950 const osd_xinfo_t& get_xinfo(int osd) const {
951 ceph_assert(osd < max_osd);
952 return osd_xinfo[osd];
953 }
954
955 int get_next_up_osd_after(int n) const {
956 if (get_max_osd() == 0)
957 return -1;
958 for (int i = n + 1; i != n; ++i) {
959 if (i >= get_max_osd())
960 i = 0;
961 if (i == n)
962 break;
963 if (is_up(i))
964 return i;
965 }
966 return -1;
967 }
968
969 int get_previous_up_osd_before(int n) const {
970 if (get_max_osd() == 0)
971 return -1;
972 for (int i = n - 1; i != n; --i) {
973 if (i < 0)
974 i = get_max_osd() - 1;
975 if (i == n)
976 break;
977 if (is_up(i))
978 return i;
979 }
980 return -1;
981 }
982
983
984 void get_random_up_osds_by_subtree(int n, // whoami
985 string &subtree,
986 int limit, // how many
987 set<int> skip,
988 set<int> *want) const;
989
990 /**
991 * get feature bits required by the current structure
992 *
993 * @param entity_type [in] what entity type we are asking about
994 * @param mask [out] set of all possible map-related features we could set
995 * @return feature bits used by this map
996 */
997 uint64_t get_features(int entity_type, uint64_t *mask) const;
998
999 /**
1000 * get oldest *client* version (firefly, hammer, etc.) that can connect given
1001 * the feature bits required (according to get_features()).
1002 */
1003 uint8_t get_min_compat_client() const;
1004
1005 /**
1006 * gets the required minimum *client* version that can connect to the cluster.
1007 */
1008 uint8_t get_require_min_compat_client() const;
1009
1010 /**
1011 * get intersection of features supported by up osds
1012 */
1013 uint64_t get_up_osd_features() const;
1014
1015 void maybe_remove_pg_upmaps(CephContext *cct,
1016 const OSDMap& oldmap,
1017 const OSDMap& nextmap,
1018 Incremental *pending_inc);
1019
1020 int apply_incremental(const Incremental &inc);
1021
1022 /// try to re-use/reference addrs in oldmap from newmap
1023 static void dedup(const OSDMap *oldmap, OSDMap *newmap);
1024
1025 static void clean_temps(CephContext *cct,
1026 const OSDMap& oldmap,
1027 const OSDMap& nextmap,
1028 Incremental *pending_inc);
1029
1030 // serialize, unserialize
1031 private:
1032 void encode_client_old(bufferlist& bl) const;
1033 void encode_classic(bufferlist& bl, uint64_t features) const;
1034 void decode_classic(bufferlist::const_iterator& p);
1035 void post_decode();
1036 public:
1037 void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
1038 void decode(bufferlist& bl);
1039 void decode(bufferlist::const_iterator& bl);
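  //
  // A hedged sketch of the decode/re-encode round trip (illustrative only;
  // where the bufferlist 'bl' comes from is an assumption):
  //
  //   OSDMap m;
  //   m.decode(bl);                               // bl: full map, e.g. from the mon
  //   bufferlist out;
  //   m.encode(out, m.get_encoding_features());   // re-encode for peers
  //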
1040
1041
1042 /**** mapping facilities ****/
1043 int map_to_pg(
1044 int64_t pool,
1045 const string& name,
1046 const string& key,
1047 const string& nspace,
1048 pg_t *pg) const;
1049 int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
1050 pg_t &pg) const;
1051 pg_t object_locator_to_pg(const object_t& oid,
1052 const object_locator_t& loc) const {
1053 pg_t pg;
1054 int ret = object_locator_to_pg(oid, loc, pg);
1055 ceph_assert(ret == 0);
1056 return pg;
1057 }
1058
1059
1060 static object_locator_t file_to_object_locator(const file_layout_t& layout) {
1061 return object_locator_t(layout.pool_id, layout.pool_ns);
1062 }
1063
1064 ceph_object_layout file_to_object_layout(object_t oid,
1065 file_layout_t& layout) const {
1066 return make_object_layout(oid, layout.pool_id, layout.pool_ns);
1067 }
1068
1069 ceph_object_layout make_object_layout(object_t oid, int pg_pool,
1070 string nspace) const;
1071
1072 int get_pg_num(int pg_pool) const
1073 {
1074 const pg_pool_t *pool = get_pg_pool(pg_pool);
1075 ceph_assert(NULL != pool);
1076 return pool->get_pg_num();
1077 }
1078
1079 bool pg_exists(pg_t pgid) const {
1080 const pg_pool_t *p = get_pg_pool(pgid.pool());
1081 return p && pgid.ps() < p->get_pg_num();
1082 }
1083
1084 int get_pg_pool_min_size(pg_t pgid) const {
1085 if (!pg_exists(pgid)) {
1086 return -ENOENT;
1087 }
1088 const pg_pool_t *p = get_pg_pool(pgid.pool());
1089 ceph_assert(p);
1090 return p->get_min_size();
1091 }
1092
1093 int get_pg_pool_size(pg_t pgid) const {
1094 if (!pg_exists(pgid)) {
1095 return -ENOENT;
1096 }
1097 const pg_pool_t *p = get_pg_pool(pgid.pool());
1098 ceph_assert(p);
1099 return p->get_size();
1100 }
1101
1102 int get_pg_pool_crush_rule(pg_t pgid) const {
1103 if (!pg_exists(pgid)) {
1104 return -ENOENT;
1105 }
1106 const pg_pool_t *p = get_pg_pool(pgid.pool());
1107 ceph_assert(p);
1108 return p->get_crush_rule();
1109 }
1110
1111 private:
1112 /// pg -> (raw osd list)
1113 void _pg_to_raw_osds(
1114 const pg_pool_t& pool, pg_t pg,
1115 vector<int> *osds,
1116 ps_t *ppps) const;
1117 int _pick_primary(const vector<int>& osds) const;
1118 void _remove_nonexistent_osds(const pg_pool_t& pool, vector<int>& osds) const;
1119
1120 void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
1121 vector<int> *osds, int *primary) const;
1122
1123 /// apply pg_upmap[_items] mappings
1124 void _apply_upmap(const pg_pool_t& pi, pg_t pg, vector<int> *raw) const;
1125
1126 /// pg -> (up osd list)
1127 void _raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
1128 vector<int> *up) const;
1129
1130
1131 /**
1132 * Get the pg and primary temp, if they are specified.
1133 * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
1134 * @param temp_primary [out] Will be the value in primary_temp, or a value derived
1135 * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
1136 */
1137 void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
1138 vector<int> *temp_pg, int *temp_primary) const;
1139
1140 /**
1141 * map to up and acting. Fills in whatever fields are non-NULL.
1142 */
1143 void _pg_to_up_acting_osds(const pg_t& pg, vector<int> *up, int *up_primary,
1144 vector<int> *acting, int *acting_primary,
1145 bool raw_pg_to_pg = true) const;
1146
1147 public:
1148 /***
1149 * This is suitable only for looking at raw CRUSH outputs. It skips
1150 * applying the temp and up checks and should not be used
1151 * by anybody for data mapping purposes.
1152 * raw and primary must be non-NULL
1153 */
1154 void pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const;
1155 void pg_to_raw_upmap(pg_t pg, vector<int> *raw_upmap) const;
1156 /// map a pg to its acting set. @return acting set size
1157 void pg_to_acting_osds(const pg_t& pg, vector<int> *acting,
1158 int *acting_primary) const {
1159 _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
1160 }
1161 void pg_to_acting_osds(pg_t pg, vector<int>& acting) const {
1162 return pg_to_acting_osds(pg, &acting, NULL);
1163 }
1164 /**
1165 * This does not apply temp overrides and should not be used
1166 * by anybody for data mapping purposes. Specify both pointers.
1167 */
1168 void pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const;
1169 /**
1170 * map a pg to its acting set as well as its up set. You must use
1171 * the acting set for data mapping purposes, but some users will
1172 * also find the up set useful for things like deciding what to
1173 * set as pg_temp.
1174 * Each of these pointers must be non-NULL.
1175 */
1176 void pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary,
1177 vector<int> *acting, int *acting_primary) const {
1178 _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
1179 }
1180 void pg_to_up_acting_osds(pg_t pg, vector<int>& up, vector<int>& acting) const {
1181 int up_primary, acting_primary;
1182 pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
1183 }
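  //
  // A minimal mapping sketch (illustrative only; 'osdmap' is an existing map
  // and the pg id is an assumption):
  //
  //   vector<int> up, acting;
  //   int up_primary, acting_primary;
  //   osdmap.pg_to_up_acting_osds(pg_t(0, 1), &up, &up_primary,
  //                               &acting, &acting_primary);
  //   // use 'acting'/'acting_primary' for data mapping; 'up' is handy for
  //   // things like deciding what to set as pg_temp (see comment above).
  //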
1184 bool pg_is_ec(pg_t pg) const {
1185 auto i = pools.find(pg.pool());
1186 ceph_assert(i != pools.end());
1187 return i->second.is_erasure();
1188 }
1189 bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
1190 auto i = get_pools().find(pgid.pool());
1191 if (i == get_pools().end()) {
1192 return false;
1193 }
1194 if (!i->second.is_erasure()) {
1195 *out = spg_t(pgid);
1196 return true;
1197 }
1198 int primary;
1199 vector<int> acting;
1200 pg_to_acting_osds(pgid, &acting, &primary);
1201 for (uint8_t i = 0; i < acting.size(); ++i) {
1202 if (acting[i] == primary) {
1203 *out = spg_t(pgid, shard_id_t(i));
1204 return true;
1205 }
1206 }
1207 return false;
1208 }
1209 bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
1210 auto i = get_pools().find(pgid.pool());
1211 if (i == get_pools().end()) {
1212 return false;
1213 }
1214 vector<int> acting;
1215 pg_to_acting_osds(pgid, &acting, primary);
1216 if (i->second.is_erasure()) {
1217 for (uint8_t i = 0; i < acting.size(); ++i) {
1218 if (acting[i] == *primary) {
1219 *out = spg_t(pgid, shard_id_t(i));
1220 return true;
1221 }
1222 }
1223 } else {
1224 *out = spg_t(pgid);
1225 return true;
1226 }
1227 return false;
1228 }
1229
1230 const mempool::osdmap::map<int64_t,snap_interval_set_t>&
1231 get_removed_snaps_queue() const {
1232 return removed_snaps_queue;
1233 }
1234 const mempool::osdmap::map<int64_t,snap_interval_set_t>&
1235 get_new_removed_snaps() const {
1236 return new_removed_snaps;
1237 }
1238 const mempool::osdmap::map<int64_t,snap_interval_set_t>&
1239 get_new_purged_snaps() const {
1240 return new_purged_snaps;
1241 }
1242
1243 int64_t lookup_pg_pool_name(const string& name) const {
1244 auto p = name_pool.find(name);
1245 if (p == name_pool.end())
1246 return -ENOENT;
1247 return p->second;
1248 }
1249
1250 int64_t get_pool_max() const {
1251 return pool_max;
1252 }
1253 const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
1254 return pools;
1255 }
1256 mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
1257 return pools;
1258 }
1259 void get_pool_ids_by_rule(int rule_id, set<int64_t> *pool_ids) const {
1260 ceph_assert(pool_ids);
1261 for (auto &p: pools) {
1262 if (p.second.get_crush_rule() == rule_id) {
1263 pool_ids->insert(p.first);
1264 }
1265 }
1266 }
1267 void get_pool_ids_by_osd(CephContext *cct,
1268 int osd,
1269 set<int64_t> *pool_ids) const;
1270 const string& get_pool_name(int64_t p) const {
1271 auto i = pool_name.find(p);
1272 ceph_assert(i != pool_name.end());
1273 return i->second;
1274 }
1275 const mempool::osdmap::map<int64_t,string>& get_pool_names() const {
1276 return pool_name;
1277 }
1278 bool have_pg_pool(int64_t p) const {
1279 return pools.count(p);
1280 }
1281 const pg_pool_t* get_pg_pool(int64_t p) const {
1282 auto i = pools.find(p);
1283 if (i != pools.end())
1284 return &i->second;
1285 return NULL;
1286 }
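  //
  // Illustrative pool lookup (the pool name "rbd" is an assumption):
  //
  //   int64_t pid = osdmap.lookup_pg_pool_name("rbd");
  //   if (pid >= 0 && osdmap.have_pg_pool(pid)) {
  //     const pg_pool_t *pi = osdmap.get_pg_pool(pid);
  //     unsigned pg_num = pi->get_pg_num();
  //   }
  //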
1287 unsigned get_pg_size(pg_t pg) const {
1288 auto p = pools.find(pg.pool());
1289 ceph_assert(p != pools.end());
1290 return p->second.get_size();
1291 }
1292 int get_pg_type(pg_t pg) const {
1293 auto p = pools.find(pg.pool());
1294 ceph_assert(p != pools.end());
1295 return p->second.get_type();
1296 }
1297
1298
1299 pg_t raw_pg_to_pg(pg_t pg) const {
1300 auto p = pools.find(pg.pool());
1301 ceph_assert(p != pools.end());
1302 return p->second.raw_pg_to_pg(pg);
1303 }
1304
1305 // pg -> acting primary osd
1306 int get_pg_acting_primary(pg_t pg) const {
1307 int primary = -1;
1308 _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
1309 return primary;
1310 }
1311
1312 /*
1313 * check whether an spg_t maps to a particular osd
1314 */
1315 bool is_up_acting_osd_shard(spg_t pg, int osd) const {
1316 vector<int> up, acting;
1317 _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
1318 if (pg.shard == shard_id_t::NO_SHARD) {
1319 if (calc_pg_role(osd, acting, acting.size()) >= 0 ||
1320 calc_pg_role(osd, up, up.size()) >= 0)
1321 return true;
1322 } else {
1323 if (pg.shard < (int)acting.size() && acting[pg.shard] == osd)
1324 return true;
1325 if (pg.shard < (int)up.size() && up[pg.shard] == osd)
1326 return true;
1327 }
1328 return false;
1329 }
1330
1331
1332 /* what replica # is a given osd? 0 primary, -1 for none. */
1333 static int calc_pg_rank(int osd, const vector<int>& acting, int nrep=0);
1334 static int calc_pg_role(int osd, const vector<int>& acting, int nrep=0);
1335 static bool primary_changed(
1336 int oldprimary,
1337 const vector<int> &oldacting,
1338 int newprimary,
1339 const vector<int> &newacting);
1340
1341 /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
1342 int get_pg_acting_rank(pg_t pg, int osd) const {
1343 vector<int> group;
1344 pg_to_acting_osds(pg, group);
1345 return calc_pg_rank(osd, group, group.size());
1346 }
1347 /* role is -1 (stray), 0 (primary), 1 (replica) */
1348 int get_pg_acting_role(const pg_t& pg, int osd) const {
1349 vector<int> group;
1350 pg_to_acting_osds(pg, group);
1351 return calc_pg_role(osd, group, group.size());
1352 }
1353
1354 bool osd_is_valid_op_target(pg_t pg, int osd) const {
1355 int primary;
1356 vector<int> group;
1357 pg_to_acting_osds(pg, &group, &primary);
1358 if (osd == primary)
1359 return true;
1360 if (pg_is_ec(pg))
1361 return false;
1362
1363 return calc_pg_role(osd, group, group.size()) >= 0;
1364 }
1365
1366 int clean_pg_upmaps(
1367 CephContext *cct,
1368 Incremental *pending_inc) const;
1369
1370 bool try_pg_upmap(
1371 CephContext *cct,
1372 pg_t pg, ///< pg to potentially remap
1373 const set<int>& overfull, ///< osds we'd want to evacuate
1374 const vector<int>& underfull, ///< osds to move to, in order of preference
1375 vector<int> *orig,
1376 vector<int> *out); ///< resulting alternative mapping
1377
1378 int calc_pg_upmaps(
1379 CephContext *cct,
1380 float max_deviation, ///< max deviation from target (value < 1.0)
1381 int max_iterations, ///< max iterations to run
1382     const set<int64_t>& pools,        ///< [optional] restrict to these pools
1383 Incremental *pending_inc
1384 );
1385
1386 int get_osds_by_bucket_name(const string &name, set<int> *osds) const;
1387
1388 bool have_pg_upmaps(pg_t pg) const {
1389 return pg_upmap.count(pg) ||
1390 pg_upmap_items.count(pg);
1391 }
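  //
  // A hedged sketch of asking the map for upmap optimizations (illustrative
  // only; 'cct' is an available CephContext*, and max_deviation=0.01 with
  // max_iterations=100 are assumptions):
  //
  //   OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
  //   set<int64_t> only_pools;                  // empty set: consider all pools
  //   osdmap.calc_pg_upmaps(cct, 0.01, 100, only_pools, &pending_inc);
  //   // pending_inc now carries the proposed pg_upmap_items changes, if any.
  //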
1392
1393 /*
1394 * handy helpers to build simple maps...
1395 */
1396 /**
1397 * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
1398 * it will be initialized with the specified number of OSDs in a
1399 * single host. If **num_osd** is < 0 the layout of the OSD map will
1400 * be built by reading the content of the configuration file.
1401 *
1402    * @param cct [in] core ceph context
1403 * @param e [in] initial epoch
1404 * @param fsid [in] id of the cluster
1405 * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
1406 * @return **0** on success, negative errno on error.
1407 */
1408 private:
1409 int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
1410 int num_osd, int pg_bits, int pgp_bits,
1411 bool default_pool);
1412 public:
1413 int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
1414 int num_osd) {
1415 return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
1416 }
1417 int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
1418 int num_osd, int pg_bits, int pgp_bits) {
1419 return build_simple_optioned(cct, e, fsid, num_osd,
1420 pg_bits, pgp_bits, true);
1421 }
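  //
  // A minimal bootstrap sketch (illustrative only; 'cct' is an available
  // CephContext*, and the epoch and osd count are assumptions):
  //
  //   uuid_d fsid;
  //   fsid.generate_random();
  //   OSDMap m;
  //   m.build_simple(cct, 1, fsid, 10);   // epoch 1, 10 OSDs on a single host
  //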
1422 static int _build_crush_types(CrushWrapper& crush);
1423 static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
1424 int num_osd, ostream *ss);
1425 static int build_simple_crush_map_from_conf(CephContext *cct,
1426 CrushWrapper& crush,
1427 ostream *ss);
1428 static int build_simple_crush_rules(
1429 CephContext *cct, CrushWrapper& crush,
1430 const string& root,
1431 ostream *ss);
1432
1433 bool crush_rule_in_use(int rule_id) const;
1434
1435 int validate_crush_rules(CrushWrapper *crush, ostream *ss) const;
1436
1437 void clear_temp() {
1438 pg_temp->clear();
1439 primary_temp->clear();
1440 }
1441
1442 private:
1443 void print_osd_line(int cur, ostream *out, Formatter *f) const;
1444 public:
1445 void print(ostream& out) const;
1446 void print_pools(ostream& out) const;
1447 void print_summary(Formatter *f, ostream& out, const string& prefix, bool extra=false) const;
1448 void print_oneline_summary(ostream& out) const;
1449
1450 enum {
1451 DUMP_IN = 1, // only 'in' osds
1452 DUMP_OUT = 2, // only 'out' osds
1453 DUMP_UP = 4, // only 'up' osds
1454 DUMP_DOWN = 8, // only 'down' osds
1455 DUMP_DESTROYED = 16, // only 'destroyed' osds
1456 };
1457 void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0, string bucket="") const;
1458
1459 int summarize_mapping_stats(
1460 OSDMap *newmap,
1461 const set<int64_t> *pools,
1462 std::string *out,
1463 Formatter *f) const;
1464
1465 string get_flag_string() const;
1466 static string get_flag_string(unsigned flags);
1467 static void dump_erasure_code_profiles(
1468 const mempool::osdmap::map<string,map<string,string> > &profiles,
1469 Formatter *f);
1470 void dump(Formatter *f) const;
1471 static void generate_test_instances(list<OSDMap*>& o);
1472 bool check_new_blacklist_entries() const { return new_blacklist_entries; }
1473
1474 void check_health(health_check_map_t *checks) const;
1475
1476 int parse_osd_id_list(const vector<string>& ls,
1477 set<int> *out,
1478 ostream *ss) const;
1479
1480 float pool_raw_used_rate(int64_t poolid) const;
1481
1482 };
1483 WRITE_CLASS_ENCODER_FEATURES(OSDMap)
1484 WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)
1485
1486 typedef std::shared_ptr<const OSDMap> OSDMapRef;
1487
1488 inline ostream& operator<<(ostream& out, const OSDMap& m) {
1489 m.print_oneline_summary(out);
1490 return out;
1491 }
1492
1493 class PGMap;
1494
1495 void print_osd_utilization(const OSDMap& osdmap,
1496 const PGMap& pgmap,
1497 ostream& out,
1498 Formatter *f,
1499 bool tree,
1500 const string& class_name,
1501 const string& item_name);
1502
1503 #endif