1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
23 * describe properties of the OSD cluster.
24 * disks, disk groups, total # osds,
27 #include "include/types.h"
28 #include "osd_types.h"
30 //#include "include/ceph_features.h"
31 #include "crush/CrushWrapper.h"
36 #include "include/memory.h"
37 #include "include/btree_map.h"
40 // forward declaration
43 class health_check_map_t
;
// FIXME C++11 does not have std::equal for two differently-typed containers.
// use this until we move to c++14
//
// Byte-wise equality of two contiguous containers holding identically
// laid-out elements (e.g. std::vector vs a mempool vector of the same
// element type).
// Fixes vs. the original: arguments are taken by const reference instead
// of by value (the old signature copied both containers on every call),
// and empty containers are handled explicitly so we never form &a[0] on
// an empty vector (undefined behavior).
template<typename A, typename B>
bool vectors_equal(const A& a, const B& b)
{
  return
    a.size() == b.size() &&
    (a.empty() ||
     memcmp((const char*)&a[0], (const char*)&b[0],
	    sizeof(a[0]) * a.size()) == 0);
}
58 * we track up to two intervals during which the osd was alive and
59 * healthy. the most recent is [up_from,up_thru), where up_thru is
60 * the last epoch the osd is known to have _started_. i.e., a lower
61 * bound on the actual osd death. down_at (if it is > up_from) is an
62 * upper bound on the actual osd death.
64 * the second is the last_clean interval [first,last]. in that case,
65 * the last interval is the last epoch known to have been either
66 * _finished_, or during which the osd cleanly shut down. when
67 * possible, we push this forward to the epoch the osd was eventually
70 * the lost_at is used to allow build_prior to proceed without waiting
71 * for an osd to recover. In certain cases, progress may be blocked
72 * because an osd is down that may contain updates (i.e., a pg may have
73 * gone rw during an interval). If the osd can't be brought online, we
74 * can force things to proceed knowing that we _might_ be losing some
75 * acked writes. If the osd comes back to life later, that's fine too,
76 * but those writes will still be lost (the divergent objects will be
80 epoch_t last_clean_begin
; // last interval that ended with a clean osd shutdown
81 epoch_t last_clean_end
;
82 epoch_t up_from
; // epoch osd marked up
83 epoch_t up_thru
; // lower bound on actual osd death (if > up_from)
84 epoch_t down_at
; // upper bound on actual osd death (if > up_from)
85 epoch_t lost_at
; // last epoch we decided data was "lost"
87 osd_info_t() : last_clean_begin(0), last_clean_end(0),
88 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
90 void dump(Formatter
*f
) const;
91 void encode(bufferlist
& bl
) const;
92 void decode(bufferlist::iterator
& bl
);
93 static void generate_test_instances(list
<osd_info_t
*>& o
);
95 WRITE_CLASS_ENCODER(osd_info_t
)
97 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
);
100 utime_t down_stamp
; ///< timestamp when we were last marked down
101 float laggy_probability
; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
102 __u32 laggy_interval
; ///< average interval between being marked laggy and recovering
103 uint64_t features
; ///< features supported by this osd we should know about
104 __u32 old_weight
; ///< weight prior to being auto marked out
106 osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
107 features(0), old_weight(0) {}
109 void dump(Formatter
*f
) const;
110 void encode(bufferlist
& bl
) const;
111 void decode(bufferlist::iterator
& bl
);
112 static void generate_test_instances(list
<osd_xinfo_t
*>& o
);
114 WRITE_CLASS_ENCODER(osd_xinfo_t
)
116 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
);
122 typedef btree::btree_map
<pg_t
,int32_t*> map_t
;
125 void encode(bufferlist
& bl
) const {
126 uint32_t n
= map
.size();
128 for (auto &p
: map
) {
129 ::encode(p
.first
, bl
);
130 bl
.append((char*)p
.second
, (*p
.second
+ 1) * sizeof(int32_t));
133 void decode(bufferlist::iterator
& p
) {
140 bufferlist::iterator pstart
= p
;
141 size_t start_off
= pstart
.get_off();
142 vector
<pair
<pg_t
,size_t>> offsets
;
144 for (unsigned i
=0; i
<n
; ++i
) {
147 offsets
[i
].first
= pgid
;
148 offsets
[i
].second
= p
.get_off() - start_off
;
151 p
.advance(vn
* sizeof(int32_t));
153 size_t len
= p
.get_off() - start_off
;
154 pstart
.copy(len
, data
);
155 if (data
.get_num_buffers() > 1) {
159 char *start
= data
.c_str();
160 for (auto i
: offsets
) {
161 map
.insert(map
.end(), make_pair(i
.first
, (int32_t*)(start
+ i
.second
)));
170 friend bool operator==(const PGTempMap
& l
, const PGTempMap
& r
) {
172 l
.map
.size() == r
.map
.size() &&
173 l
.data
.contents_equal(r
.data
);
177 map_t::const_iterator it
;
178 map_t::const_iterator end
;
179 pair
<pg_t
,vector
<int32_t>> current
;
180 void init_current() {
182 current
.first
= it
->first
;
184 current
.second
.resize(*it
->second
);
185 int32_t *p
= it
->second
+ 1;
186 for (int n
= 0; n
< *it
->second
; ++n
, ++p
) {
187 current
.second
[n
] = *p
;
192 iterator(map_t::const_iterator p
,
193 map_t::const_iterator e
)
198 const pair
<pg_t
,vector
<int32_t>>& operator*() const {
201 const pair
<pg_t
,vector
<int32_t>>* operator->() const {
204 friend bool operator==(const iterator
& l
, const iterator
& r
) {
207 friend bool operator!=(const iterator
& l
, const iterator
& r
) {
210 iterator
& operator++() {
216 iterator
operator++(int) {
224 iterator
begin() const {
225 return iterator(map
.begin(), map
.end());
227 iterator
end() const {
228 return iterator(map
.end(), map
.end());
230 iterator
find(pg_t pgid
) const {
231 return iterator(map
.find(pgid
), map
.end());
233 size_t size() const {
236 size_t count(pg_t pgid
) const {
237 return map
.count(pgid
);
239 void erase(pg_t pgid
) {
246 void set(pg_t pgid
, const mempool::osdmap::vector
<int32_t>& v
) {
247 size_t need
= sizeof(int32_t) * (1 + v
.size());
248 if (need
< data
.get_append_buffer_unused_tail_length()) {
249 bufferptr
z(data
.get_append_buffer_unused_tail_length());
251 data
.append(z
.c_str(), z
.length());
254 map
[pgid
] = (int32_t*)(data
.back().end_c_str()) - (1 + v
.size());
256 mempool::osdmap::vector
<int32_t> get(pg_t pgid
) {
257 mempool::osdmap::vector
<int32_t> v
;
258 int32_t *p
= map
[pgid
];
261 for (size_t i
= 0; i
< n
; ++i
, ++p
) {
267 // trivial implementation
268 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> > pg_temp
;
270 void encode(bufferlist
& bl
) const {
271 ::encode(pg_temp
, bl
);
273 void decode(bufferlist::iterator
& p
) {
274 ::decode(pg_temp
, p
);
276 friend bool operator==(const PGTempMap
& l
, const PGTempMap
& r
) {
278 l
.pg_temp
.size() == r
.pg_temp
.size() &&
279 l
.pg_temp
== r
.pg_temp
;
283 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> >::const_iterator it
;
285 iterator(mempool::osdmap::map
<pg_t
,
286 mempool::osdmap::vector
<int32_t> >::const_iterator p
)
289 pair
<pg_t
,const mempool::osdmap::vector
<int32_t>&> operator*() const {
292 const pair
<const pg_t
,mempool::osdmap::vector
<int32_t>>* operator->() const {
295 friend bool operator==(const iterator
& l
, const iterator
& r
) {
298 friend bool operator!=(const iterator
& l
, const iterator
& r
) {
301 iterator
& operator++() {
305 iterator
operator++(int) {
311 iterator
begin() const {
312 return iterator(pg_temp
.cbegin());
314 iterator
end() const {
315 return iterator(pg_temp
.cend());
317 iterator
find(pg_t pgid
) const {
318 return iterator(pg_temp
.find(pgid
));
320 size_t size() const {
321 return pg_temp
.size();
323 size_t count(pg_t pgid
) const {
324 return pg_temp
.count(pgid
);
326 void erase(pg_t pgid
) {
332 void set(pg_t pgid
, const mempool::osdmap::vector
<int32_t>& v
) {
335 const mempool::osdmap::vector
<int32_t>& get(pg_t pgid
) {
336 return pg_temp
.at(pgid
);
339 void dump(Formatter
*f
) const {
340 for (const auto &pg
: *this) {
341 f
->open_object_section("osds");
342 f
->dump_stream("pgid") << pg
.first
;
343 f
->open_array_section("osds");
344 for (const auto osd
: pg
.second
)
345 f
->dump_int("osd", osd
);
351 WRITE_CLASS_ENCODER(PGTempMap
)
357 MEMPOOL_CLASS_HELPERS();
361 MEMPOOL_CLASS_HELPERS();
363 /// feature bits we were encoded with. the subsequent OSDMap
364 /// encoding should match.
365 uint64_t encode_features
;
367 epoch_t epoch
; // new epoch; we are a diff from epoch-1 to epoch
369 int64_t new_pool_max
; //incremented by the OSDMonitor on each pool create
371 int8_t new_require_osd_release
= -1;
374 bufferlist fullmap
; // in lieu of below.
379 mempool::osdmap::map
<int64_t,pg_pool_t
> new_pools
;
380 mempool::osdmap::map
<int64_t,string
> new_pool_names
;
381 mempool::osdmap::set
<int64_t> old_pools
;
382 mempool::osdmap::map
<string
,map
<string
,string
> > new_erasure_code_profiles
;
383 mempool::osdmap::vector
<string
> old_erasure_code_profiles
;
384 mempool::osdmap::map
<int32_t,entity_addr_t
> new_up_client
;
385 mempool::osdmap::map
<int32_t,entity_addr_t
> new_up_cluster
;
386 mempool::osdmap::map
<int32_t,uint32_t> new_state
; // XORed onto previous state.
387 mempool::osdmap::map
<int32_t,uint32_t> new_weight
;
388 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> > new_pg_temp
; // [] to remove
389 mempool::osdmap::map
<pg_t
, int32_t> new_primary_temp
; // [-1] to remove
390 mempool::osdmap::map
<int32_t,uint32_t> new_primary_affinity
;
391 mempool::osdmap::map
<int32_t,epoch_t
> new_up_thru
;
392 mempool::osdmap::map
<int32_t,pair
<epoch_t
,epoch_t
> > new_last_clean_interval
;
393 mempool::osdmap::map
<int32_t,epoch_t
> new_lost
;
394 mempool::osdmap::map
<int32_t,uuid_d
> new_uuid
;
395 mempool::osdmap::map
<int32_t,osd_xinfo_t
> new_xinfo
;
397 mempool::osdmap::map
<entity_addr_t
,utime_t
> new_blacklist
;
398 mempool::osdmap::vector
<entity_addr_t
> old_blacklist
;
399 mempool::osdmap::map
<int32_t, entity_addr_t
> new_hb_back_up
;
400 mempool::osdmap::map
<int32_t, entity_addr_t
> new_hb_front_up
;
402 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>> new_pg_upmap
;
403 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<pair
<int32_t,int32_t>>> new_pg_upmap_items
;
404 mempool::osdmap::set
<pg_t
> old_pg_upmap
, old_pg_upmap_items
;
406 string cluster_snapshot
;
408 float new_nearfull_ratio
= -1;
409 float new_backfillfull_ratio
= -1;
410 float new_full_ratio
= -1;
412 int8_t new_require_min_compat_client
= -1;
414 mutable bool have_crc
; ///< crc values are defined
415 uint32_t full_crc
; ///< crc of the resulting OSDMap
416 mutable uint32_t inc_crc
; ///< crc of this incremental
418 int get_net_marked_out(const OSDMap
*previous
) const;
419 int get_net_marked_down(const OSDMap
*previous
) const;
420 int identify_osd(uuid_d u
) const;
422 void encode_client_old(bufferlist
& bl
) const;
423 void encode_classic(bufferlist
& bl
, uint64_t features
) const;
424 void encode(bufferlist
& bl
, uint64_t features
=CEPH_FEATURES_ALL
) const;
425 void decode_classic(bufferlist::iterator
&p
);
426 void decode(bufferlist::iterator
&bl
);
427 void dump(Formatter
*f
) const;
428 static void generate_test_instances(list
<Incremental
*>& o
);
430 explicit Incremental(epoch_t e
=0) :
432 epoch(e
), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
433 have_crc(false), full_crc(0), inc_crc(0) {
434 memset(&fsid
, 0, sizeof(fsid
));
436 explicit Incremental(bufferlist
&bl
) {
437 bufferlist::iterator p
= bl
.begin();
440 explicit Incremental(bufferlist::iterator
&p
) {
444 pg_pool_t
*get_new_pool(int64_t pool
, const pg_pool_t
*orig
) {
445 if (new_pools
.count(pool
) == 0)
446 new_pools
[pool
] = *orig
;
447 return &new_pools
[pool
];
449 bool has_erasure_code_profile(const string
&name
) const {
450 auto i
= new_erasure_code_profiles
.find(name
);
451 return i
!= new_erasure_code_profiles
.end();
453 void set_erasure_code_profile(const string
&name
,
454 const map
<string
,string
>& profile
) {
455 new_erasure_code_profiles
[name
] = profile
;
458 /// propagate updated pools' snap metadata to any of their tiers
459 int propagate_snaps_to_tiers(CephContext
*cct
, const OSDMap
&base
);
461 /// filter out osds with any pending state changing
462 size_t get_pending_state_osds(vector
<int> *osds
) {
466 for (auto &p
: new_state
) {
467 osds
->push_back(p
.first
);
473 bool pending_osd_has_state(int osd
, unsigned state
) {
474 return new_state
.count(osd
) && (new_state
[osd
] & state
) != 0;
477 void pending_osd_state_set(int osd
, unsigned state
) {
478 new_state
[osd
] |= state
;
481 // cancel the specified pending osd state if there is any
482 // return ture on success, false otherwise.
483 bool pending_osd_state_clear(int osd
, unsigned state
) {
484 if (!pending_osd_has_state(osd
, state
)) {
485 // never has been set or already has been cancelled.
489 new_state
[osd
] &= ~state
;
497 epoch_t epoch
; // what epoch of the osd cluster descriptor is this
498 utime_t created
, modified
; // epoch start time
499 int32_t pool_max
; // the largest pool num, ever
503 int num_osd
; // not saved; see calc_num_osds
504 int num_up_osd
; // not saved; see calc_num_osds
505 int num_in_osd
; // not saved; see calc_num_osds
508 vector
<uint32_t> osd_state
;
510 // These features affect OSDMap[::Incremental] encoding, or the
511 // encoding of some type embedded therein (CrushWrapper, something
512 // from osd_types, etc.).
513 static constexpr uint64_t SIGNIFICANT_FEATURES
=
514 CEPH_FEATUREMASK_PGID64
|
515 CEPH_FEATUREMASK_PGPOOL3
|
516 CEPH_FEATUREMASK_OSDENC
|
517 CEPH_FEATUREMASK_OSDMAP_ENC
|
518 CEPH_FEATUREMASK_OSD_POOLRESEND
|
519 CEPH_FEATUREMASK_NEW_OSDOP_ENCODING
|
520 CEPH_FEATUREMASK_MSG_ADDR2
|
521 CEPH_FEATUREMASK_CRUSH_TUNABLES5
|
522 CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
|
523 CEPH_FEATUREMASK_SERVER_LUMINOUS
;
525 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > client_addr
;
526 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > cluster_addr
;
527 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > hb_back_addr
;
528 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > hb_front_addr
;
531 ceph::shared_ptr
<addrs_s
> osd_addrs
;
533 mempool::osdmap::vector
<__u32
> osd_weight
; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
534 mempool::osdmap::vector
<osd_info_t
> osd_info
;
535 ceph::shared_ptr
<PGTempMap
> pg_temp
; // temp pg mapping (e.g. while we rebuild)
536 ceph::shared_ptr
< mempool::osdmap::map
<pg_t
,int32_t > > primary_temp
; // temp primary mapping (e.g. while we rebuild)
537 ceph::shared_ptr
< mempool::osdmap::vector
<__u32
> > osd_primary_affinity
; ///< 16.16 fixed point, 0x10000 = baseline
539 // remap (post-CRUSH, pre-up)
540 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>> pg_upmap
; ///< remap pg
541 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<pair
<int32_t,int32_t>>> pg_upmap_items
; ///< remap osds in up set
543 mempool::osdmap::map
<int64_t,pg_pool_t
> pools
;
544 mempool::osdmap::map
<int64_t,string
> pool_name
;
545 mempool::osdmap::map
<string
,map
<string
,string
> > erasure_code_profiles
;
546 mempool::osdmap::map
<string
,int64_t> name_pool
;
548 ceph::shared_ptr
< mempool::osdmap::vector
<uuid_d
> > osd_uuid
;
549 mempool::osdmap::vector
<osd_xinfo_t
> osd_xinfo
;
551 mempool::osdmap::unordered_map
<entity_addr_t
,utime_t
> blacklist
;
553 epoch_t cluster_snapshot_epoch
;
554 string cluster_snapshot
;
555 bool new_blacklist_entries
;
557 float full_ratio
= 0, backfillfull_ratio
= 0, nearfull_ratio
= 0;
559 /// min compat client we want to support
560 uint8_t require_min_compat_client
= 0; // CEPH_RELEASE_*
563 /// require osds to run at least this release
564 uint8_t require_osd_release
= 0; // CEPH_RELEASE_*
567 mutable uint64_t cached_up_osd_features
;
569 mutable bool crc_defined
;
570 mutable uint32_t crc
;
572 void _calc_up_osd_features();
575 bool have_crc() const { return crc_defined
; }
576 uint32_t get_crc() const { return crc
; }
578 ceph::shared_ptr
<CrushWrapper
> crush
; // hierarchical map
580 uint32_t crush_version
= 1;
582 friend class OSDMonitor
;
588 num_osd(0), num_up_osd(0), num_in_osd(0),
590 osd_addrs(std::make_shared
<addrs_s
>()),
591 pg_temp(std::make_shared
<PGTempMap
>()),
592 primary_temp(std::make_shared
<mempool::osdmap::map
<pg_t
,int32_t>>()),
593 osd_uuid(std::make_shared
<mempool::osdmap::vector
<uuid_d
>>()),
594 cluster_snapshot_epoch(0),
595 new_blacklist_entries(false),
596 cached_up_osd_features(0),
597 crc_defined(false), crc(0),
598 crush(std::make_shared
<CrushWrapper
>()) {
599 memset(&fsid
, 0, sizeof(fsid
));
604 OSDMap(const OSDMap
& other
) = default;
605 OSDMap
& operator=(const OSDMap
& other
) = default;
608 /// return feature mask subset that is relevant to OSDMap encoding
609 static uint64_t get_significant_features(uint64_t features
) {
610 return SIGNIFICANT_FEATURES
& features
;
613 uint64_t get_encoding_features() const;
615 void deepish_copy_from(const OSDMap
& o
) {
617 primary_temp
.reset(new mempool::osdmap::map
<pg_t
,int32_t>(*o
.primary_temp
));
618 pg_temp
.reset(new PGTempMap(*o
.pg_temp
));
619 osd_uuid
.reset(new mempool::osdmap::vector
<uuid_d
>(*o
.osd_uuid
));
621 if (o
.osd_primary_affinity
)
622 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>(*o
.osd_primary_affinity
));
624 // NOTE: this still references shared entity_addr_t's.
625 osd_addrs
.reset(new addrs_s(*o
.osd_addrs
));
627 // NOTE: we do not copy crush. note that apply_incremental will
628 // allocate a new CrushWrapper, though.
632 const uuid_d
& get_fsid() const { return fsid
; }
633 void set_fsid(uuid_d
& f
) { fsid
= f
; }
635 epoch_t
get_epoch() const { return epoch
; }
636 void inc_epoch() { epoch
++; }
638 void set_epoch(epoch_t e
);
640 uint32_t get_crush_version() const {
641 return crush_version
;
645 const utime_t
& get_created() const { return created
; }
646 const utime_t
& get_modified() const { return modified
; }
648 bool is_blacklisted(const entity_addr_t
& a
) const;
649 void get_blacklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const;
650 void get_blacklist(std::set
<entity_addr_t
> *bl
) const;
652 string
get_cluster_snapshot() const {
653 if (cluster_snapshot_epoch
== epoch
)
654 return cluster_snapshot
;
658 float get_full_ratio() const {
661 float get_backfillfull_ratio() const {
662 return backfillfull_ratio
;
664 float get_nearfull_ratio() const {
665 return nearfull_ratio
;
667 void get_full_osd_util(
668 const mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
669 map
<int, float> *full
,
670 map
<int, float> *backfill
,
671 map
<int, float> *nearfull
) const;
672 void get_full_pools(CephContext
*cct
,
674 set
<int64_t> *backfillfull
,
675 set
<int64_t> *nearfull
) const;
676 void get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
677 set
<int> *nearfull
) const;
680 /***** cluster state *****/
682 int get_max_osd() const { return max_osd
; }
683 void set_max_osd(int m
);
685 unsigned get_num_osds() const {
688 unsigned get_num_up_osds() const {
691 unsigned get_num_in_osds() const {
694 /// recalculate cached values for get_num{,_up,_in}_osds
697 void get_all_osds(set
<int32_t>& ls
) const;
698 void get_up_osds(set
<int32_t>& ls
) const;
699 void get_out_osds(set
<int32_t>& ls
) const;
700 unsigned get_num_pg_temp() const {
701 return pg_temp
->size();
704 int get_flags() const { return flags
; }
705 bool test_flag(int f
) const { return flags
& f
; }
706 void set_flag(int f
) { flags
|= f
; }
707 void clear_flag(int f
) { flags
&= ~f
; }
709 static void calc_state_set(int state
, set
<string
>& st
);
711 int get_state(int o
) const {
715 int get_state(int o
, set
<string
>& st
) const {
717 unsigned t
= osd_state
[o
];
718 calc_state_set(t
, st
);
721 void set_state(int o
, unsigned s
) {
725 void set_weight(int o
, unsigned w
) {
729 osd_state
[o
] |= CEPH_OSD_EXISTS
;
731 unsigned get_weight(int o
) const {
733 return osd_weight
[o
];
735 float get_weightf(int o
) const {
736 return (float)get_weight(o
) / (float)CEPH_OSD_IN
;
738 void adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const;
740 void set_primary_affinity(int o
, int w
) {
742 if (!osd_primary_affinity
)
743 osd_primary_affinity
.reset(
744 new mempool::osdmap::vector
<__u32
>(
745 max_osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
));
746 (*osd_primary_affinity
)[o
] = w
;
748 unsigned get_primary_affinity(int o
) const {
750 if (!osd_primary_affinity
)
751 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
;
752 return (*osd_primary_affinity
)[o
];
754 float get_primary_affinityf(int o
) const {
755 return (float)get_primary_affinity(o
) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY
;
758 bool has_erasure_code_profile(const string
&name
) const {
759 auto i
= erasure_code_profiles
.find(name
);
760 return i
!= erasure_code_profiles
.end();
762 int get_erasure_code_profile_default(CephContext
*cct
,
763 map
<string
,string
> &profile_map
,
765 void set_erasure_code_profile(const string
&name
,
766 const map
<string
,string
>& profile
) {
767 erasure_code_profiles
[name
] = profile
;
769 const map
<string
,string
> &get_erasure_code_profile(
770 const string
&name
) const {
771 static map
<string
,string
> empty
;
772 auto i
= erasure_code_profiles
.find(name
);
773 if (i
== erasure_code_profiles
.end())
778 const mempool::osdmap::map
<string
,map
<string
,string
> > &get_erasure_code_profiles() const {
779 return erasure_code_profiles
;
782 bool exists(int osd
) const {
784 return osd
>= 0 && osd
< max_osd
&& (osd_state
[osd
] & CEPH_OSD_EXISTS
);
787 bool is_destroyed(int osd
) const {
788 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_DESTROYED
);
791 bool is_up(int osd
) const {
792 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_UP
);
795 bool has_been_up_since(int osd
, epoch_t epoch
) const {
796 return is_up(osd
) && get_up_from(osd
) <= epoch
;
799 bool is_down(int osd
) const {
803 bool is_out(int osd
) const {
804 return !exists(osd
) || get_weight(osd
) == CEPH_OSD_OUT
;
807 bool is_in(int osd
) const {
811 bool is_noup(int osd
) const {
812 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NOUP
);
815 bool is_nodown(int osd
) const {
816 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NODOWN
);
819 bool is_noin(int osd
) const {
820 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NOIN
);
823 bool is_noout(int osd
) const {
824 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NOOUT
);
827 void get_noup_osds(vector
<int> *osds
) const {
831 for (int i
= 0; i
< max_osd
; i
++) {
838 void get_nodown_osds(vector
<int> *osds
) const {
842 for (int i
= 0; i
< max_osd
; i
++) {
849 void get_noin_osds(vector
<int> *osds
) const {
853 for (int i
= 0; i
< max_osd
; i
++) {
860 void get_noout_osds(vector
<int> *osds
) const {
864 for (int i
= 0; i
< max_osd
; i
++) {
872 * check if an entire crush subtree is down
874 bool subtree_is_down(int id
, set
<int> *down_cache
) const;
875 bool containing_subtree_is_down(CephContext
*cct
, int osd
, int subtree_type
, set
<int> *down_cache
) const;
877 bool subtree_type_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_in_osds
, set
<int> *up_in_osds
,
878 set
<int> *subtree_up
, unordered_map
<int, set
<int> > *subtree_type_down
) const;
880 int identify_osd(const entity_addr_t
& addr
) const;
881 int identify_osd(const uuid_d
& u
) const;
882 int identify_osd_on_all_channels(const entity_addr_t
& addr
) const;
884 bool have_addr(const entity_addr_t
& addr
) const {
885 return identify_osd(addr
) >= 0;
887 int find_osd_on_ip(const entity_addr_t
& ip
) const;
888 const entity_addr_t
&get_addr(int osd
) const {
890 return osd_addrs
->client_addr
[osd
] ? *osd_addrs
->client_addr
[osd
] : osd_addrs
->blank
;
892 const entity_addr_t
&get_cluster_addr(int osd
) const {
894 if (!osd_addrs
->cluster_addr
[osd
] || *osd_addrs
->cluster_addr
[osd
] == entity_addr_t())
895 return get_addr(osd
);
896 return *osd_addrs
->cluster_addr
[osd
];
898 const entity_addr_t
&get_hb_back_addr(int osd
) const {
900 return osd_addrs
->hb_back_addr
[osd
] ? *osd_addrs
->hb_back_addr
[osd
] : osd_addrs
->blank
;
902 const entity_addr_t
&get_hb_front_addr(int osd
) const {
904 return osd_addrs
->hb_front_addr
[osd
] ? *osd_addrs
->hb_front_addr
[osd
] : osd_addrs
->blank
;
906 entity_inst_t
get_most_recent_inst(int osd
) const {
908 return entity_inst_t(entity_name_t::OSD(osd
), get_addr(osd
));
910 entity_inst_t
get_inst(int osd
) const {
912 return get_most_recent_inst(osd
);
914 entity_inst_t
get_cluster_inst(int osd
) const {
916 return entity_inst_t(entity_name_t::OSD(osd
), get_cluster_addr(osd
));
918 entity_inst_t
get_hb_back_inst(int osd
) const {
920 return entity_inst_t(entity_name_t::OSD(osd
), get_hb_back_addr(osd
));
922 entity_inst_t
get_hb_front_inst(int osd
) const {
924 return entity_inst_t(entity_name_t::OSD(osd
), get_hb_front_addr(osd
));
927 const uuid_d
& get_uuid(int osd
) const {
929 return (*osd_uuid
)[osd
];
932 const epoch_t
& get_up_from(int osd
) const {
934 return osd_info
[osd
].up_from
;
936 const epoch_t
& get_up_thru(int osd
) const {
938 return osd_info
[osd
].up_thru
;
940 const epoch_t
& get_down_at(int osd
) const {
942 return osd_info
[osd
].down_at
;
944 const osd_info_t
& get_info(int osd
) const {
945 assert(osd
< max_osd
);
946 return osd_info
[osd
];
949 const osd_xinfo_t
& get_xinfo(int osd
) const {
950 assert(osd
< max_osd
);
951 return osd_xinfo
[osd
];
954 int get_next_up_osd_after(int n
) const {
955 if (get_max_osd() == 0)
957 for (int i
= n
+ 1; i
!= n
; ++i
) {
958 if (i
>= get_max_osd())
968 int get_previous_up_osd_before(int n
) const {
969 if (get_max_osd() == 0)
971 for (int i
= n
- 1; i
!= n
; --i
) {
973 i
= get_max_osd() - 1;
983 * get feature bits required by the current structure
985 * @param entity_type [in] what entity type we are asking about
986 * @param mask [out] set of all possible map-related features we could set
987 * @return feature bits used by this map
989 uint64_t get_features(int entity_type
, uint64_t *mask
) const;
992 * get oldest *client* version (firefly, hammer, etc.) that can connect given
993 * the feature bits required (according to get_features()).
995 uint8_t get_min_compat_client() const;
998 * get intersection of features supported by up osds
1000 uint64_t get_up_osd_features() const;
1002 void maybe_remove_pg_upmaps(CephContext
*cct
,
1003 const OSDMap
& osdmap
,
1004 Incremental
*pending_inc
);
1006 int apply_incremental(const Incremental
&inc
);
1008 /// try to re-use/reference addrs in oldmap from newmap
1009 static void dedup(const OSDMap
*oldmap
, OSDMap
*newmap
);
1011 static void clean_temps(CephContext
*cct
, const OSDMap
& osdmap
,
1012 Incremental
*pending_inc
);
1014 // serialize, unserialize
1016 void encode_client_old(bufferlist
& bl
) const;
1017 void encode_classic(bufferlist
& bl
, uint64_t features
) const;
1018 void decode_classic(bufferlist::iterator
& p
);
1021 void encode(bufferlist
& bl
, uint64_t features
=CEPH_FEATURES_ALL
) const;
1022 void decode(bufferlist
& bl
);
1023 void decode(bufferlist::iterator
& bl
);
1026 /**** mapping facilities ****/
1031 const string
& nspace
,
1033 int object_locator_to_pg(const object_t
& oid
, const object_locator_t
& loc
,
1035 pg_t
object_locator_to_pg(const object_t
& oid
,
1036 const object_locator_t
& loc
) const {
1038 int ret
= object_locator_to_pg(oid
, loc
, pg
);
1044 static object_locator_t
file_to_object_locator(const file_layout_t
& layout
) {
1045 return object_locator_t(layout
.pool_id
, layout
.pool_ns
);
1048 ceph_object_layout
file_to_object_layout(object_t oid
,
1049 file_layout_t
& layout
) const {
1050 return make_object_layout(oid
, layout
.pool_id
, layout
.pool_ns
);
1053 ceph_object_layout
make_object_layout(object_t oid
, int pg_pool
,
1054 string nspace
) const;
1056 int get_pg_num(int pg_pool
) const
1058 const pg_pool_t
*pool
= get_pg_pool(pg_pool
);
1059 assert(NULL
!= pool
);
1060 return pool
->get_pg_num();
1063 bool pg_exists(pg_t pgid
) const {
1064 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1065 return p
&& pgid
.ps() < p
->get_pg_num();
1068 int get_pg_pool_min_size(pg_t pgid
) const {
1069 if (!pg_exists(pgid
)) {
1072 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1074 return p
->get_min_size();
1077 int get_pg_pool_size(pg_t pgid
) const {
1078 if (!pg_exists(pgid
)) {
1081 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1083 return p
->get_size();
1086 int get_pg_pool_crush_rule(pg_t pgid
) const {
1087 if (!pg_exists(pgid
)) {
1090 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1092 return p
->get_crush_rule();
1096 /// pg -> (raw osd list)
1097 void _pg_to_raw_osds(
1098 const pg_pool_t
& pool
, pg_t pg
,
1101 int _pick_primary(const vector
<int>& osds
) const;
1102 void _remove_nonexistent_osds(const pg_pool_t
& pool
, vector
<int>& osds
) const;
1104 void _apply_primary_affinity(ps_t seed
, const pg_pool_t
& pool
,
1105 vector
<int> *osds
, int *primary
) const;
1107 /// apply pg_upmap[_items] mappings
1108 void _apply_upmap(const pg_pool_t
& pi
, pg_t pg
, vector
<int> *raw
) const;
1110 /// pg -> (up osd list)
1111 void _raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
1112 vector
<int> *up
) const;
1116 * Get the pg and primary temp, if they are specified.
1117 * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
1118 * @param temp_primary [out] Will be the value in primary_temp, or a value derived
1119 * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
1121 void _get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
1122 vector
<int> *temp_pg
, int *temp_primary
) const;
1125 * map to up and acting. Fills in whatever fields are non-NULL.
1127 void _pg_to_up_acting_osds(const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
1128 vector
<int> *acting
, int *acting_primary
,
1129 bool raw_pg_to_pg
= true) const;
1133 * This is suitable only for looking at raw CRUSH outputs. It skips
1134 * applying the temp and up checks and should not be used
1135 * by anybody for data mapping purposes.
1136 * raw and primary must be non-NULL
1138 void pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const;
1139 /// map a pg to its acting set. @return acting set size
1140 void pg_to_acting_osds(const pg_t
& pg
, vector
<int> *acting
,
1141 int *acting_primary
) const {
1142 _pg_to_up_acting_osds(pg
, NULL
, NULL
, acting
, acting_primary
);
1144 void pg_to_acting_osds(pg_t pg
, vector
<int>& acting
) const {
1145 return pg_to_acting_osds(pg
, &acting
, NULL
);
1148 * This does not apply temp overrides and should not be used
1149 * by anybody for data mapping purposes. Specify both pointers.
1151 void pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const;
1153 * map a pg to its acting set as well as its up set. You must use
1154 * the acting set for data mapping purposes, but some users will
1155 * also find the up set useful for things like deciding what to
1157 * Each of these pointers must be non-NULL.
1159 void pg_to_up_acting_osds(pg_t pg
, vector
<int> *up
, int *up_primary
,
1160 vector
<int> *acting
, int *acting_primary
) const {
1161 _pg_to_up_acting_osds(pg
, up
, up_primary
, acting
, acting_primary
);
1163 void pg_to_up_acting_osds(pg_t pg
, vector
<int>& up
, vector
<int>& acting
) const {
1164 int up_primary
, acting_primary
;
1165 pg_to_up_acting_osds(pg
, &up
, &up_primary
, &acting
, &acting_primary
);
1167 bool pg_is_ec(pg_t pg
) const {
1168 auto i
= pools
.find(pg
.pool());
1169 assert(i
!= pools
.end());
1170 return i
->second
.ec_pool();
1172 bool get_primary_shard(const pg_t
& pgid
, spg_t
*out
) const {
1173 auto i
= get_pools().find(pgid
.pool());
1174 if (i
== get_pools().end()) {
1177 if (!i
->second
.ec_pool()) {
1183 pg_to_acting_osds(pgid
, &acting
, &primary
);
1184 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
1185 if (acting
[i
] == primary
) {
1186 *out
= spg_t(pgid
, shard_id_t(i
));
1193 int64_t lookup_pg_pool_name(const string
& name
) const {
1194 auto p
= name_pool
.find(name
);
1195 if (p
== name_pool
.end())
1200 int64_t get_pool_max() const {
1203 const mempool::osdmap::map
<int64_t,pg_pool_t
>& get_pools() const {
1206 mempool::osdmap::map
<int64_t,pg_pool_t
>& get_pools() {
1209 void get_pool_ids_by_rule(int rule_id
, set
<int64_t> *pool_ids
) const {
1211 for (auto &p
: pools
) {
1212 if ((int)p
.second
.get_crush_rule() == rule_id
) {
1213 pool_ids
->insert(p
.first
);
1217 void get_pool_ids_by_osd(CephContext
*cct
,
1219 set
<int64_t> *pool_ids
) const;
1220 const string
& get_pool_name(int64_t p
) const {
1221 auto i
= pool_name
.find(p
);
1222 assert(i
!= pool_name
.end());
1225 const mempool::osdmap::map
<int64_t,string
>& get_pool_names() const {
1228 bool have_pg_pool(int64_t p
) const {
1229 return pools
.count(p
);
1231 const pg_pool_t
* get_pg_pool(int64_t p
) const {
1232 auto i
= pools
.find(p
);
1233 if (i
!= pools
.end())
1237 unsigned get_pg_size(pg_t pg
) const {
1238 auto p
= pools
.find(pg
.pool());
1239 assert(p
!= pools
.end());
1240 return p
->second
.get_size();
1242 int get_pg_type(pg_t pg
) const {
1243 auto p
= pools
.find(pg
.pool());
1244 assert(p
!= pools
.end());
1245 return p
->second
.get_type();
1249 pg_t
raw_pg_to_pg(pg_t pg
) const {
1250 auto p
= pools
.find(pg
.pool());
1251 assert(p
!= pools
.end());
1252 return p
->second
.raw_pg_to_pg(pg
);
1255 // pg -> acting primary osd
1256 int get_pg_acting_primary(pg_t pg
) const {
1258 _pg_to_up_acting_osds(pg
, nullptr, nullptr, nullptr, &primary
);
1263 * check whether an spg_t maps to a particular osd
1265 bool is_up_acting_osd_shard(spg_t pg
, int osd
) const {
1266 vector
<int> up
, acting
;
1267 _pg_to_up_acting_osds(pg
.pgid
, &up
, NULL
, &acting
, NULL
, false);
1268 if (pg
.shard
== shard_id_t::NO_SHARD
) {
1269 if (calc_pg_role(osd
, acting
, acting
.size()) >= 0 ||
1270 calc_pg_role(osd
, up
, up
.size()) >= 0)
1273 if (pg
.shard
< (int)acting
.size() && acting
[pg
.shard
] == osd
)
1275 if (pg
.shard
< (int)up
.size() && up
[pg
.shard
] == osd
)
1282 /* what replica # is a given osd? 0 primary, -1 for none. */
1283 static int calc_pg_rank(int osd
, const vector
<int>& acting
, int nrep
=0);
1284 static int calc_pg_role(int osd
, const vector
<int>& acting
, int nrep
=0);
1285 static bool primary_changed(
1287 const vector
<int> &oldacting
,
1289 const vector
<int> &newacting
);
1291 /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
1292 int get_pg_acting_rank(pg_t pg
, int osd
) const {
1294 pg_to_acting_osds(pg
, group
);
1295 return calc_pg_rank(osd
, group
, group
.size());
1297 /* role is -1 (stray), 0 (primary), 1 (replica) */
1298 int get_pg_acting_role(const pg_t
& pg
, int osd
) const {
1300 pg_to_acting_osds(pg
, group
);
1301 return calc_pg_role(osd
, group
, group
.size());
1304 bool osd_is_valid_op_target(pg_t pg
, int osd
) const {
1307 pg_to_acting_osds(pg
, &group
, &primary
);
1313 return calc_pg_role(osd
, group
, group
.size()) >= 0;
1316 int clean_pg_upmaps(
1318 Incremental
*pending_inc
);
1322 pg_t pg
, ///< pg to potentially remap
1323 const set
<int>& overfull
, ///< osds we'd want to evacuate
1324 const vector
<int>& underfull
, ///< osds to move to, in order of preference
1326 vector
<int> *out
); ///< resulting alternative mapping
1330 float max_deviation
, ///< max deviation from target (value < 1.0)
1331 int max_iterations
, ///< max iterations to run
1332 const set
<int64_t>& pools
, ///< [optional] restrict to pool
1333 Incremental
*pending_inc
1336 int get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const;
1339 * handy helpers to build simple maps...
1342 * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
1343 * it will be initialized with the specified number of OSDs in a
1344 * single host. If **num_osd** is < 0 the layout of the OSD map will
1345 * be built by reading the content of the configuration file.
1347 * @param cct [in] in core ceph context
1348 * @param e [in] initial epoch
1349 * @param fsid [in] id of the cluster
1350 * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
1351 * @return **0** on success, negative errno on error.
1354 int build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
1355 int num_osd
, int pg_bits
, int pgp_bits
,
1358 int build_simple(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
1360 return build_simple_optioned(cct
, e
, fsid
, num_osd
, 0, 0, false);
1362 int build_simple_with_pool(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
1363 int num_osd
, int pg_bits
, int pgp_bits
) {
1364 return build_simple_optioned(cct
, e
, fsid
, num_osd
,
1365 pg_bits
, pgp_bits
, true);
1367 static int _build_crush_types(CrushWrapper
& crush
);
1368 static int build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
1369 int num_osd
, ostream
*ss
);
1370 static int build_simple_crush_map_from_conf(CephContext
*cct
,
1371 CrushWrapper
& crush
,
1373 static int build_simple_crush_rules(
1374 CephContext
*cct
, CrushWrapper
& crush
,
1378 bool crush_rule_in_use(int rule_id
) const;
1380 int validate_crush_rules(CrushWrapper
*crush
, ostream
*ss
) const;
1384 primary_temp
->clear();
1388 void print_osd_line(int cur
, ostream
*out
, Formatter
*f
) const;
1390 void print(ostream
& out
) const;
1391 void print_pools(ostream
& out
) const;
1392 void print_summary(Formatter
*f
, ostream
& out
, const string
& prefix
) const;
1393 void print_oneline_summary(ostream
& out
) const;
1396 DUMP_IN
= 1, // only 'in' osds
1397 DUMP_OUT
= 2, // only 'out' osds
1398 DUMP_UP
= 4, // only 'up' osds
1399 DUMP_DOWN
= 8, // only 'down' osds
1400 DUMP_DESTROYED
= 16, // only 'destroyed' osds
1402 void print_tree(Formatter
*f
, ostream
*out
, unsigned dump_flags
=0) const;
1404 int summarize_mapping_stats(
1406 const set
<int64_t> *pools
,
1408 Formatter
*f
) const;
1410 string
get_flag_string() const;
1411 static string
get_flag_string(unsigned flags
);
1412 static void dump_erasure_code_profiles(
1413 const mempool::osdmap::map
<string
,map
<string
,string
> > &profiles
,
1415 void dump(Formatter
*f
) const;
1416 static void generate_test_instances(list
<OSDMap
*>& o
);
1417 bool check_new_blacklist_entries() const { return new_blacklist_entries
; }
1419 void check_health(health_check_map_t
*checks
) const;
1421 int parse_osd_id_list(const vector
<string
>& ls
,
1425 WRITE_CLASS_ENCODER_FEATURES(OSDMap
)
1426 WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental
)
1428 typedef ceph::shared_ptr
<const OSDMap
> OSDMapRef
;
1430 inline ostream
& operator<<(ostream
& out
, const OSDMap
& m
) {
1431 m
.print_oneline_summary(out
);
// forward declaration for print_osd_utilization()
class PGStatService;
1437 void print_osd_utilization(const OSDMap
& osdmap
,
1438 const PGStatService
*pgstat
,