1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
23 * describe properties of the OSD cluster.
24 * disks, disk groups, total # osds,
27 #include "include/types.h"
28 #include "osd_types.h"
30 //#include "include/ceph_features.h"
31 #include "crush/CrushWrapper.h"
36 #include "include/memory.h"
39 // forward declaration
43 // FIXME C++11 does not have std::equal for two differently-typed containers.
44 // use this until we move to c++14
/**
 * Compare two contiguous containers for byte-wise equality.
 *
 * Stand-in for a std::equal that accepts two differently-typed containers.
 * The element storage is compared with memcmp(), using sizeof(a[0]) as the
 * element width, so both containers are expected to hold plain elements of
 * the same size.
 *
 * @param a first container (needs size(), empty() and operator[])
 * @param b second container (needs size() and operator[])
 * @return true when both hold the same number of elements and their
 *         storage is byte-for-byte identical
 */
template<typename A, typename B>
bool vectors_equal(const A& a, const B& b)
{
  // Arguments are taken by const reference so that a comparison does not
  // copy both containers.
  if (a.size() != b.size())
    return false;
  // Nothing to compare; this also avoids forming &a[0] on an empty container.
  if (a.empty())
    return true;
  return memcmp(&a[0], &b[0], sizeof(a[0]) * a.size()) == 0;
}
56 * we track up to two intervals during which the osd was alive and
57 * healthy. the most recent is [up_from,up_thru), where up_thru is
58 * the last epoch the osd is known to have _started_. i.e., a lower
59 * bound on the actual osd death. down_at (if it is > up_from) is an
60 * upper bound on the actual osd death.
62 * the second is the last_clean interval [first,last]. in that case,
63 * the last interval is the last epoch known to have been either
64 * _finished_, or during which the osd cleanly shut down. when
65 * possible, we push this forward to the epoch the osd was eventually
68 * the lost_at is used to allow build_prior to proceed without waiting
69 * for an osd to recover. In certain cases, progress may be blocked
70 * because an osd is down that may contain updates (i.e., a pg may have
71 * gone rw during an interval). If the osd can't be brought online, we
72 * can force things to proceed knowing that we _might_ be losing some
73 * acked writes. If the osd comes back to life later, that's fine too,
74 * but those writes will still be lost (the divergent objects will be
78 epoch_t last_clean_begin
; // last interval that ended with a clean osd shutdown
79 epoch_t last_clean_end
;
80 epoch_t up_from
; // epoch osd marked up
81 epoch_t up_thru
; // lower bound on actual osd death (if > up_from)
82 epoch_t down_at
; // upper bound on actual osd death (if > up_from)
83 epoch_t lost_at
; // last epoch we decided data was "lost"
85 osd_info_t() : last_clean_begin(0), last_clean_end(0),
86 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
88 void dump(Formatter
*f
) const;
89 void encode(bufferlist
& bl
) const;
90 void decode(bufferlist::iterator
& bl
);
91 static void generate_test_instances(list
<osd_info_t
*>& o
);
93 WRITE_CLASS_ENCODER(osd_info_t
)
95 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
);
98 utime_t down_stamp
; ///< timestamp when we were last marked down
99 float laggy_probability
; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
100 __u32 laggy_interval
; ///< average interval between being marked laggy and recovering
101 uint64_t features
; ///< features supported by this osd we should know about
102 __u32 old_weight
; ///< weight prior to being auto marked out
104 osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
105 features(0), old_weight(0) {}
107 void dump(Formatter
*f
) const;
108 void encode(bufferlist
& bl
) const;
109 void decode(bufferlist::iterator
& bl
);
110 static void generate_test_instances(list
<osd_xinfo_t
*>& o
);
112 WRITE_CLASS_ENCODER(osd_xinfo_t
)
114 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
);
121 MEMPOOL_CLASS_HELPERS();
125 MEMPOOL_CLASS_HELPERS();
127 /// feature bits we were encoded with. the subsequent OSDMap
128 /// encoding should match.
129 uint64_t encode_features
;
131 epoch_t epoch
; // new epoch; we are a diff from epoch-1 to epoch
133 int64_t new_pool_max
; //incremented by the OSDMonitor on each pool create
137 bufferlist fullmap
; // in lieu of below.
142 mempool::osdmap::map
<int64_t,pg_pool_t
> new_pools
;
143 mempool::osdmap::map
<int64_t,string
> new_pool_names
;
144 mempool::osdmap::set
<int64_t> old_pools
;
145 mempool::osdmap::map
<string
,map
<string
,string
> > new_erasure_code_profiles
;
146 mempool::osdmap::vector
<string
> old_erasure_code_profiles
;
147 mempool::osdmap::map
<int32_t,entity_addr_t
> new_up_client
;
148 mempool::osdmap::map
<int32_t,entity_addr_t
> new_up_cluster
;
149 mempool::osdmap::map
<int32_t,uint8_t> new_state
; // XORed onto previous state.
150 mempool::osdmap::map
<int32_t,uint32_t> new_weight
;
151 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> > new_pg_temp
; // [] to remove
152 mempool::osdmap::map
<pg_t
, int32_t> new_primary_temp
; // [-1] to remove
153 mempool::osdmap::map
<int32_t,uint32_t> new_primary_affinity
;
154 mempool::osdmap::map
<int32_t,epoch_t
> new_up_thru
;
155 mempool::osdmap::map
<int32_t,pair
<epoch_t
,epoch_t
> > new_last_clean_interval
;
156 mempool::osdmap::map
<int32_t,epoch_t
> new_lost
;
157 mempool::osdmap::map
<int32_t,uuid_d
> new_uuid
;
158 mempool::osdmap::map
<int32_t,osd_xinfo_t
> new_xinfo
;
160 mempool::osdmap::map
<entity_addr_t
,utime_t
> new_blacklist
;
161 mempool::osdmap::vector
<entity_addr_t
> old_blacklist
;
162 mempool::osdmap::map
<int32_t, entity_addr_t
> new_hb_back_up
;
163 mempool::osdmap::map
<int32_t, entity_addr_t
> new_hb_front_up
;
165 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>> new_pg_upmap
;
166 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<pair
<int32_t,int32_t>>> new_pg_upmap_items
;
167 mempool::osdmap::set
<pg_t
> old_pg_upmap
, old_pg_upmap_items
;
169 string cluster_snapshot
;
171 float new_nearfull_ratio
= -1;
172 float new_backfillfull_ratio
= -1;
173 float new_full_ratio
= -1;
175 string new_require_min_compat_client
;
177 mutable bool have_crc
; ///< crc values are defined
178 uint32_t full_crc
; ///< crc of the resulting OSDMap
179 mutable uint32_t inc_crc
; ///< crc of this incremental
181 int get_net_marked_out(const OSDMap
*previous
) const;
182 int get_net_marked_down(const OSDMap
*previous
) const;
183 int identify_osd(uuid_d u
) const;
185 void encode_client_old(bufferlist
& bl
) const;
186 void encode_classic(bufferlist
& bl
, uint64_t features
) const;
187 void encode(bufferlist
& bl
, uint64_t features
=CEPH_FEATURES_ALL
) const;
188 void decode_classic(bufferlist::iterator
&p
);
189 void decode(bufferlist::iterator
&bl
);
190 void dump(Formatter
*f
) const;
191 static void generate_test_instances(list
<Incremental
*>& o
);
193 explicit Incremental(epoch_t e
=0) :
195 epoch(e
), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
196 have_crc(false), full_crc(0), inc_crc(0) {
197 memset(&fsid
, 0, sizeof(fsid
));
199 explicit Incremental(bufferlist
&bl
) {
200 bufferlist::iterator p
= bl
.begin();
203 explicit Incremental(bufferlist::iterator
&p
) {
207 pg_pool_t
*get_new_pool(int64_t pool
, const pg_pool_t
*orig
) {
208 if (new_pools
.count(pool
) == 0)
209 new_pools
[pool
] = *orig
;
210 return &new_pools
[pool
];
212 bool has_erasure_code_profile(const string
&name
) const {
213 auto i
= new_erasure_code_profiles
.find(name
);
214 return i
!= new_erasure_code_profiles
.end();
216 void set_erasure_code_profile(const string
&name
,
217 const map
<string
,string
>& profile
) {
218 new_erasure_code_profiles
[name
] = profile
;
221 /// propagate updated pools' snap metadata to any of their tiers
222 int propagate_snaps_to_tiers(CephContext
*cct
, const OSDMap
&base
);
227 epoch_t epoch
; // what epoch of the osd cluster descriptor is this
228 utime_t created
, modified
; // epoch start time
229 int32_t pool_max
; // the largest pool num, ever
233 int num_osd
; // not saved; see calc_num_osds
234 int num_up_osd
; // not saved; see calc_num_osds
235 int num_in_osd
; // not saved; see calc_num_osds
238 vector
<uint8_t> osd_state
;
241 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > client_addr
;
242 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > cluster_addr
;
243 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > hb_back_addr
;
244 mempool::osdmap::vector
<ceph::shared_ptr
<entity_addr_t
> > hb_front_addr
;
247 ceph::shared_ptr
<addrs_s
> osd_addrs
;
249 mempool::osdmap::vector
<__u32
> osd_weight
; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
250 mempool::osdmap::vector
<osd_info_t
> osd_info
;
251 ceph::shared_ptr
< mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> > > pg_temp
; // temp pg mapping (e.g. while we rebuild)
252 ceph::shared_ptr
< mempool::osdmap::map
<pg_t
,int32_t > > primary_temp
; // temp primary mapping (e.g. while we rebuild)
253 ceph::shared_ptr
< mempool::osdmap::vector
<__u32
> > osd_primary_affinity
; ///< 16.16 fixed point, 0x10000 = baseline
255 // remap (post-CRUSH, pre-up)
256 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>> pg_upmap
; ///< remap pg
257 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<pair
<int32_t,int32_t>>> pg_upmap_items
; ///< remap osds in up set
259 mempool::osdmap::map
<int64_t,pg_pool_t
> pools
;
260 mempool::osdmap::map
<int64_t,string
> pool_name
;
261 mempool::osdmap::map
<string
,map
<string
,string
> > erasure_code_profiles
;
262 mempool::osdmap::map
<string
,int64_t> name_pool
;
264 ceph::shared_ptr
< mempool::osdmap::vector
<uuid_d
> > osd_uuid
;
265 mempool::osdmap::vector
<osd_xinfo_t
> osd_xinfo
;
267 mempool::osdmap::unordered_map
<entity_addr_t
,utime_t
> blacklist
;
269 epoch_t cluster_snapshot_epoch
;
270 string cluster_snapshot
;
271 bool new_blacklist_entries
;
273 float full_ratio
= 0, backfillfull_ratio
= 0, nearfull_ratio
= 0;
275 /// min compat client we want to support
276 string require_min_compat_client
;
278 mutable uint64_t cached_up_osd_features
;
280 mutable bool crc_defined
;
281 mutable uint32_t crc
;
283 void _calc_up_osd_features();
286 bool have_crc() const { return crc_defined
; }
287 uint32_t get_crc() const { return crc
; }
289 ceph::shared_ptr
<CrushWrapper
> crush
; // hierarchical map
291 friend class OSDMonitor
;
297 num_osd(0), num_up_osd(0), num_in_osd(0),
299 osd_addrs(std::make_shared
<addrs_s
>()),
300 pg_temp(std::make_shared
<mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>>>()),
301 primary_temp(std::make_shared
<mempool::osdmap::map
<pg_t
,int32_t>>()),
302 osd_uuid(std::make_shared
<mempool::osdmap::vector
<uuid_d
>>()),
303 cluster_snapshot_epoch(0),
304 new_blacklist_entries(false),
305 cached_up_osd_features(0),
306 crc_defined(false), crc(0),
307 crush(std::make_shared
<CrushWrapper
>()) {
308 memset(&fsid
, 0, sizeof(fsid
));
313 OSDMap(const OSDMap
& other
) = default;
314 OSDMap
& operator=(const OSDMap
& other
) = default;
317 void deepish_copy_from(const OSDMap
& o
) {
319 primary_temp
.reset(new mempool::osdmap::map
<pg_t
,int32_t>(*o
.primary_temp
));
320 pg_temp
.reset(new mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> >(*o
.pg_temp
));
321 osd_uuid
.reset(new mempool::osdmap::vector
<uuid_d
>(*o
.osd_uuid
));
323 if (o
.osd_primary_affinity
)
324 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>(*o
.osd_primary_affinity
));
326 // NOTE: this still references shared entity_addr_t's.
327 osd_addrs
.reset(new addrs_s(*o
.osd_addrs
));
329 // NOTE: we do not copy crush. note that apply_incremental will
330 // allocate a new CrushWrapper, though.
334 const uuid_d
& get_fsid() const { return fsid
; }
335 void set_fsid(uuid_d
& f
) { fsid
= f
; }
337 epoch_t
get_epoch() const { return epoch
; }
338 void inc_epoch() { epoch
++; }
340 void set_epoch(epoch_t e
);
343 const utime_t
& get_created() const { return created
; }
344 const utime_t
& get_modified() const { return modified
; }
346 bool is_blacklisted(const entity_addr_t
& a
) const;
347 void get_blacklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const;
349 string
get_cluster_snapshot() const {
350 if (cluster_snapshot_epoch
== epoch
)
351 return cluster_snapshot
;
355 float get_full_ratio() const {
358 float get_backfillfull_ratio() const {
359 return backfillfull_ratio
;
361 float get_nearfull_ratio() const {
362 return nearfull_ratio
;
364 void count_full_nearfull_osds(int *full
, int *backfill
, int *nearfull
) const;
365 void get_full_osd_util(
366 const ceph::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
367 map
<int, float> *full
,
368 map
<int, float> *backfill
,
369 map
<int, float> *nearfull
) const;
371 /***** cluster state *****/
373 int get_max_osd() const { return max_osd
; }
374 void set_max_osd(int m
);
376 unsigned get_num_osds() const {
379 unsigned get_num_up_osds() const {
382 unsigned get_num_in_osds() const {
385 /// recalculate cached values for get_num{,_up,_in}_osds
388 void get_all_osds(set
<int32_t>& ls
) const;
389 void get_up_osds(set
<int32_t>& ls
) const;
390 unsigned get_num_pg_temp() const {
391 return pg_temp
->size();
394 int get_flags() const { return flags
; }
395 bool test_flag(int f
) const { return flags
& f
; }
396 void set_flag(int f
) { flags
|= f
; }
397 void clear_flag(int f
) { flags
&= ~f
; }
399 static void calc_state_set(int state
, set
<string
>& st
);
401 int get_state(int o
) const {
405 int get_state(int o
, set
<string
>& st
) const {
407 unsigned t
= osd_state
[o
];
408 calc_state_set(t
, st
);
411 void set_state(int o
, unsigned s
) {
415 void set_weight(int o
, unsigned w
) {
419 osd_state
[o
] |= CEPH_OSD_EXISTS
;
421 unsigned get_weight(int o
) const {
423 return osd_weight
[o
];
425 float get_weightf(int o
) const {
426 return (float)get_weight(o
) / (float)CEPH_OSD_IN
;
428 void adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const;
430 void set_primary_affinity(int o
, int w
) {
432 if (!osd_primary_affinity
)
433 osd_primary_affinity
.reset(
434 new mempool::osdmap::vector
<__u32
>(
435 max_osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
));
436 (*osd_primary_affinity
)[o
] = w
;
438 unsigned get_primary_affinity(int o
) const {
440 if (!osd_primary_affinity
)
441 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
;
442 return (*osd_primary_affinity
)[o
];
444 float get_primary_affinityf(int o
) const {
445 return (float)get_primary_affinity(o
) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY
;
448 bool has_erasure_code_profile(const string
&name
) const {
449 auto i
= erasure_code_profiles
.find(name
);
450 return i
!= erasure_code_profiles
.end();
452 int get_erasure_code_profile_default(CephContext
*cct
,
453 map
<string
,string
> &profile_map
,
455 void set_erasure_code_profile(const string
&name
,
456 const map
<string
,string
>& profile
) {
457 erasure_code_profiles
[name
] = profile
;
459 const map
<string
,string
> &get_erasure_code_profile(
460 const string
&name
) const {
461 static map
<string
,string
> empty
;
462 auto i
= erasure_code_profiles
.find(name
);
463 if (i
== erasure_code_profiles
.end())
468 const mempool::osdmap::map
<string
,map
<string
,string
> > &get_erasure_code_profiles() const {
469 return erasure_code_profiles
;
472 bool exists(int osd
) const {
474 return osd
>= 0 && osd
< max_osd
&& (osd_state
[osd
] & CEPH_OSD_EXISTS
);
477 bool is_up(int osd
) const {
478 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_UP
);
481 bool has_been_up_since(int osd
, epoch_t epoch
) const {
482 return is_up(osd
) && get_up_from(osd
) <= epoch
;
485 bool is_down(int osd
) const {
489 bool is_out(int osd
) const {
490 return !exists(osd
) || get_weight(osd
) == CEPH_OSD_OUT
;
493 bool is_in(int osd
) const {
498 * check if an entire crush subtree is down
500 bool subtree_is_down(int id
, set
<int> *down_cache
) const;
501 bool containing_subtree_is_down(CephContext
*cct
, int osd
, int subtree_type
, set
<int> *down_cache
) const;
503 int identify_osd(const entity_addr_t
& addr
) const;
504 int identify_osd(const uuid_d
& u
) const;
505 int identify_osd_on_all_channels(const entity_addr_t
& addr
) const;
507 bool have_addr(const entity_addr_t
& addr
) const {
508 return identify_osd(addr
) >= 0;
510 int find_osd_on_ip(const entity_addr_t
& ip
) const;
511 const entity_addr_t
&get_addr(int osd
) const {
513 return osd_addrs
->client_addr
[osd
] ? *osd_addrs
->client_addr
[osd
] : osd_addrs
->blank
;
515 const entity_addr_t
&get_cluster_addr(int osd
) const {
517 if (!osd_addrs
->cluster_addr
[osd
] || *osd_addrs
->cluster_addr
[osd
] == entity_addr_t())
518 return get_addr(osd
);
519 return *osd_addrs
->cluster_addr
[osd
];
521 const entity_addr_t
&get_hb_back_addr(int osd
) const {
523 return osd_addrs
->hb_back_addr
[osd
] ? *osd_addrs
->hb_back_addr
[osd
] : osd_addrs
->blank
;
525 const entity_addr_t
&get_hb_front_addr(int osd
) const {
527 return osd_addrs
->hb_front_addr
[osd
] ? *osd_addrs
->hb_front_addr
[osd
] : osd_addrs
->blank
;
529 entity_inst_t
get_most_recent_inst(int osd
) const {
531 return entity_inst_t(entity_name_t::OSD(osd
), get_addr(osd
));
533 entity_inst_t
get_inst(int osd
) const {
535 return get_most_recent_inst(osd
);
537 entity_inst_t
get_cluster_inst(int osd
) const {
539 return entity_inst_t(entity_name_t::OSD(osd
), get_cluster_addr(osd
));
541 entity_inst_t
get_hb_back_inst(int osd
) const {
543 return entity_inst_t(entity_name_t::OSD(osd
), get_hb_back_addr(osd
));
545 entity_inst_t
get_hb_front_inst(int osd
) const {
547 return entity_inst_t(entity_name_t::OSD(osd
), get_hb_front_addr(osd
));
550 const uuid_d
& get_uuid(int osd
) const {
552 return (*osd_uuid
)[osd
];
555 const epoch_t
& get_up_from(int osd
) const {
557 return osd_info
[osd
].up_from
;
559 const epoch_t
& get_up_thru(int osd
) const {
561 return osd_info
[osd
].up_thru
;
563 const epoch_t
& get_down_at(int osd
) const {
565 return osd_info
[osd
].down_at
;
567 const osd_info_t
& get_info(int osd
) const {
568 assert(osd
< max_osd
);
569 return osd_info
[osd
];
572 const osd_xinfo_t
& get_xinfo(int osd
) const {
573 assert(osd
< max_osd
);
574 return osd_xinfo
[osd
];
577 int get_next_up_osd_after(int n
) const {
578 if (get_max_osd() == 0)
580 for (int i
= n
+ 1; i
!= n
; ++i
) {
581 if (i
>= get_max_osd())
591 int get_previous_up_osd_before(int n
) const {
592 if (get_max_osd() == 0)
594 for (int i
= n
- 1; i
!= n
; --i
) {
596 i
= get_max_osd() - 1;
606 * get feature bits required by the current structure
608 * @param entity_type [in] what entity type we are asking about
609 * @param mask [out] set of all possible map-related features we could set
610 * @return feature bits used by this map
612 uint64_t get_features(int entity_type
, uint64_t *mask
) const;
615 * get oldest *client* version (firefly, hammer, etc.) that can connect given
616 * the feature bits required (according to get_features()).
618 pair
<string
,string
> get_min_compat_client() const;
621 * get intersection of features supported by up osds
623 uint64_t get_up_osd_features() const;
625 int apply_incremental(const Incremental
&inc
);
627 /// try to re-use/reference addrs in oldmap from newmap
628 static void dedup(const OSDMap
*oldmap
, OSDMap
*newmap
);
630 static void clean_temps(CephContext
*cct
, const OSDMap
& osdmap
,
631 Incremental
*pending_inc
);
633 // serialize, unserialize
635 void encode_client_old(bufferlist
& bl
) const;
636 void encode_classic(bufferlist
& bl
, uint64_t features
) const;
637 void decode_classic(bufferlist::iterator
& p
);
640 void encode(bufferlist
& bl
, uint64_t features
=CEPH_FEATURES_ALL
) const;
641 void decode(bufferlist
& bl
);
642 void decode(bufferlist::iterator
& bl
);
645 /**** mapping facilities ****/
650 const string
& nspace
,
652 int object_locator_to_pg(const object_t
& oid
, const object_locator_t
& loc
,
654 pg_t
object_locator_to_pg(const object_t
& oid
,
655 const object_locator_t
& loc
) const {
657 int ret
= object_locator_to_pg(oid
, loc
, pg
);
663 static object_locator_t
file_to_object_locator(const file_layout_t
& layout
) {
664 return object_locator_t(layout
.pool_id
, layout
.pool_ns
);
667 ceph_object_layout
file_to_object_layout(object_t oid
,
668 file_layout_t
& layout
) const {
669 return make_object_layout(oid
, layout
.pool_id
, layout
.pool_ns
);
672 ceph_object_layout
make_object_layout(object_t oid
, int pg_pool
,
673 string nspace
) const;
675 int get_pg_num(int pg_pool
) const
677 const pg_pool_t
*pool
= get_pg_pool(pg_pool
);
678 assert(NULL
!= pool
);
679 return pool
->get_pg_num();
682 bool pg_exists(pg_t pgid
) const {
683 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
684 return p
&& pgid
.ps() < p
->get_pg_num();
688 /// pg -> (raw osd list)
690 const pg_pool_t
& pool
, pg_t pg
,
693 int _pick_primary(const vector
<int>& osds
) const;
694 void _remove_nonexistent_osds(const pg_pool_t
& pool
, vector
<int>& osds
) const;
696 void _apply_primary_affinity(ps_t seed
, const pg_pool_t
& pool
,
697 vector
<int> *osds
, int *primary
) const;
699 /// apply pg_upmap[_items] mappings
700 void _apply_remap(const pg_pool_t
& pi
, pg_t pg
, vector
<int> *raw
) const;
702 /// pg -> (up osd list)
703 void _raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
704 vector
<int> *up
) const;
708 * Get the pg and primary temp, if they are specified.
709 * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
710 * @param temp_primary [out] Will be the value in primary_temp, or a value derived
711 * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
713 void _get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
714 vector
<int> *temp_pg
, int *temp_primary
) const;
717 * map to up and acting. Fills in whatever fields are non-NULL.
719 void _pg_to_up_acting_osds(const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
720 vector
<int> *acting
, int *acting_primary
,
721 bool raw_pg_to_pg
= true) const;
725 * This is suitable only for looking at raw CRUSH outputs. It skips
726 * applying the temp and up checks and should not be used
727 * by anybody for data mapping purposes.
728 * raw and primary must be non-NULL
730 int pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const;
731 /// map a pg to its acting set. @return acting set size
732 int pg_to_acting_osds(const pg_t
& pg
, vector
<int> *acting
,
733 int *acting_primary
) const {
734 _pg_to_up_acting_osds(pg
, NULL
, NULL
, acting
, acting_primary
);
735 return acting
->size();
737 int pg_to_acting_osds(pg_t pg
, vector
<int>& acting
) const {
738 return pg_to_acting_osds(pg
, &acting
, NULL
);
741 * This does not apply temp overrides and should not be used
742 * by anybody for data mapping purposes. Specify both pointers.
744 void pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const;
746 * map a pg to its acting set as well as its up set. You must use
747 * the acting set for data mapping purposes, but some users will
748 * also find the up set useful for things like deciding what to
750 * Each of these pointers must be non-NULL.
752 void pg_to_up_acting_osds(pg_t pg
, vector
<int> *up
, int *up_primary
,
753 vector
<int> *acting
, int *acting_primary
) const {
754 _pg_to_up_acting_osds(pg
, up
, up_primary
, acting
, acting_primary
);
756 void pg_to_up_acting_osds(pg_t pg
, vector
<int>& up
, vector
<int>& acting
) const {
757 int up_primary
, acting_primary
;
758 pg_to_up_acting_osds(pg
, &up
, &up_primary
, &acting
, &acting_primary
);
760 bool pg_is_ec(pg_t pg
) const {
761 auto i
= pools
.find(pg
.pool());
762 assert(i
!= pools
.end());
763 return i
->second
.ec_pool();
765 bool get_primary_shard(const pg_t
& pgid
, spg_t
*out
) const {
766 auto i
= get_pools().find(pgid
.pool());
767 if (i
== get_pools().end()) {
770 if (!i
->second
.ec_pool()) {
776 pg_to_acting_osds(pgid
, &acting
, &primary
);
777 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
778 if (acting
[i
] == primary
) {
779 *out
= spg_t(pgid
, shard_id_t(i
));
786 int64_t lookup_pg_pool_name(const string
& name
) const {
787 auto p
= name_pool
.find(name
);
788 if (p
== name_pool
.end())
793 int64_t get_pool_max() const {
796 const mempool::osdmap::map
<int64_t,pg_pool_t
>& get_pools() const {
799 mempool::osdmap::map
<int64_t,pg_pool_t
>& get_pools() {
802 const string
& get_pool_name(int64_t p
) const {
803 auto i
= pool_name
.find(p
);
804 assert(i
!= pool_name
.end());
807 bool have_pg_pool(int64_t p
) const {
808 return pools
.count(p
);
810 const pg_pool_t
* get_pg_pool(int64_t p
) const {
811 auto i
= pools
.find(p
);
812 if (i
!= pools
.end())
816 unsigned get_pg_size(pg_t pg
) const {
817 auto p
= pools
.find(pg
.pool());
818 assert(p
!= pools
.end());
819 return p
->second
.get_size();
821 int get_pg_type(pg_t pg
) const {
822 auto p
= pools
.find(pg
.pool());
823 assert(p
!= pools
.end());
824 return p
->second
.get_type();
828 pg_t
raw_pg_to_pg(pg_t pg
) const {
829 auto p
= pools
.find(pg
.pool());
830 assert(p
!= pools
.end());
831 return p
->second
.raw_pg_to_pg(pg
);
834 // pg -> acting primary osd
835 int get_pg_acting_primary(pg_t pg
) const {
837 _pg_to_up_acting_osds(pg
, nullptr, nullptr, nullptr, &primary
);
842 * check whether an spg_t maps to a particular osd
844 bool is_up_acting_osd_shard(spg_t pg
, int osd
) const {
845 vector
<int> up
, acting
;
846 _pg_to_up_acting_osds(pg
.pgid
, &up
, NULL
, &acting
, NULL
, false);
847 if (pg
.shard
== shard_id_t::NO_SHARD
) {
848 if (calc_pg_role(osd
, acting
, acting
.size()) >= 0 ||
849 calc_pg_role(osd
, up
, up
.size()) >= 0)
852 if (pg
.shard
< (int)acting
.size() && acting
[pg
.shard
] == osd
)
854 if (pg
.shard
< (int)up
.size() && up
[pg
.shard
] == osd
)
861 /* what replica # is a given osd? 0 primary, -1 for none. */
862 static int calc_pg_rank(int osd
, const vector
<int>& acting
, int nrep
=0);
863 static int calc_pg_role(int osd
, const vector
<int>& acting
, int nrep
=0);
864 static bool primary_changed(
866 const vector
<int> &oldacting
,
868 const vector
<int> &newacting
);
870 /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
871 int get_pg_acting_rank(pg_t pg
, int osd
) const {
873 int nrep
= pg_to_acting_osds(pg
, group
);
874 return calc_pg_rank(osd
, group
, nrep
);
876 /* role is -1 (stray), 0 (primary), 1 (replica) */
877 int get_pg_acting_role(const pg_t
& pg
, int osd
) const {
879 int nrep
= pg_to_acting_osds(pg
, group
);
880 return calc_pg_role(osd
, group
, nrep
);
883 bool osd_is_valid_op_target(pg_t pg
, int osd
) const {
886 int nrep
= pg_to_acting_osds(pg
, &group
, &primary
);
892 return calc_pg_role(osd
, group
, nrep
) >= 0;
897 Incremental
*pending_inc
);
901 pg_t pg
, ///< pg to potentially remap
902 const set
<int>& overfull
, ///< osds we'd want to evacuate
903 const vector
<int>& underfull
, ///< osds to move to, in order of preference
905 vector
<int> *out
); ///< resulting alternative mapping
909 float max_deviation
, ///< max deviation from target (value < 1.0)
910 int max_iterations
, ///< max iterations to run
911 const set
<int64_t>& pools
, ///< [optional] restrict to pool
912 Incremental
*pending_inc
916 * handy helpers to build simple maps...
919 * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
920 * it will be initialized with the specified number of OSDs in a
921 * single host. If **num_osd** is < 0 the layout of the OSD map will
922 * be built by reading the content of the configuration file.
924 * @param cct [in] in core ceph context
925 * @param e [in] initial epoch
926 * @param fsid [in] id of the cluster
927 * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
928 * @return **0** on success, negative errno on error.
930 int build_simple(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
931 int num_osd
, int pg_bits
, int pgp_bits
);
932 static int _build_crush_types(CrushWrapper
& crush
);
933 static int build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
934 int num_osd
, ostream
*ss
);
935 static int build_simple_crush_map_from_conf(CephContext
*cct
,
938 static int build_simple_crush_rulesets(CephContext
*cct
, CrushWrapper
& crush
,
942 bool crush_ruleset_in_use(int ruleset
) const;
946 primary_temp
->clear();
950 void print_osd_line(int cur
, ostream
*out
, Formatter
*f
) const;
952 void print(ostream
& out
) const;
953 void print_pools(ostream
& out
) const;
954 void print_summary(Formatter
*f
, ostream
& out
) const;
955 void print_oneline_summary(ostream
& out
) const;
956 void print_tree(Formatter
*f
, ostream
*out
) const;
958 int summarize_mapping_stats(
960 const set
<int64_t> *pools
,
964 string
get_flag_string() const;
965 static string
get_flag_string(unsigned flags
);
966 static void dump_erasure_code_profiles(
967 const mempool::osdmap::map
<string
,map
<string
,string
> > &profiles
,
969 void dump(Formatter
*f
) const;
970 static void generate_test_instances(list
<OSDMap
*>& o
);
971 bool check_new_blacklist_entries() const { return new_blacklist_entries
; }
973 WRITE_CLASS_ENCODER_FEATURES(OSDMap
)
974 WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental
)
976 typedef ceph::shared_ptr
<const OSDMap
> OSDMapRef
;
978 inline ostream
& operator<<(ostream
& out
, const OSDMap
& m
) {
979 m
.print_oneline_summary(out
);