1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
/* Object Store Device (OSD) Monitor */
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
28 #include "include/types.h"
29 #include "include/encoding.h"
30 #include "common/simple_cache.hpp"
31 #include "common/PriorityCache.h"
32 #include "msg/Messenger.h"
34 #include "osd/OSDMap.h"
35 #include "osd/OSDMapMapping.h"
37 #include "CreatingPGs.h"
38 #include "PaxosService.h"
40 #include "erasure-code/ErasureCodeInterface.h"
41 #include "mon/MonOpRequest.h"
42 #include <boost/functional/hash.hpp>
50 /// information about a particular peer's failure reports for one osd
51 struct failure_reporter_t
{
52 utime_t failed_since
; ///< when they think it failed
53 MonOpRequestRef op
; ///< failure op request
55 failure_reporter_t() {}
56 failure_reporter_t(utime_t s
, MonOpRequestRef op
)
57 : failed_since(s
), op(op
) {}
58 ~failure_reporter_t() { }
61 /// information about all failure reports for one osd
62 struct failure_info_t
{
63 std::map
<int, failure_reporter_t
> reporters
; ///< reporter -> failed_since etc
64 utime_t max_failed_since
; ///< most recent failed_since
68 utime_t
get_failed_since() {
69 if (max_failed_since
== utime_t() && !reporters
.empty()) {
70 // the old max must have canceled; recalculate.
71 for (auto p
= reporters
.begin(); p
!= reporters
.end(); ++p
)
72 if (p
->second
.failed_since
> max_failed_since
)
73 max_failed_since
= p
->second
.failed_since
;
75 return max_failed_since
;
78 // set the message for the latest report.
79 void add_report(int who
, utime_t failed_since
, MonOpRequestRef op
) {
80 [[maybe_unused
]] auto [it
, new_reporter
] =
81 reporters
.insert_or_assign(who
, failure_reporter_t
{failed_since
, op
});
83 if (max_failed_since
!= utime_t() && max_failed_since
< failed_since
) {
84 max_failed_since
= failed_since
;
89 void take_report_messages(std::list
<MonOpRequestRef
>& ls
) {
90 for (auto p
= reporters
.begin(); p
!= reporters
.end(); ++p
) {
92 ls
.push_back(p
->second
.op
);
98 void cancel_report(int who
) {
100 max_failed_since
= utime_t();
105 class LastEpochClean
{
107 std::vector
<epoch_t
> epoch_by_pg
;
108 ps_t next_missing
= 0;
109 epoch_t floor
= std::numeric_limits
<epoch_t
>::max();
110 void report(unsigned pg_num
, ps_t pg
, epoch_t last_epoch_clean
);
112 std::map
<uint64_t, Lec
> report_by_pool
;
114 void report(unsigned pg_num
, const pg_t
& pg
, epoch_t last_epoch_clean
);
115 void remove_pool(uint64_t pool
);
116 epoch_t
get_lower_bound(const OSDMap
& latest
) const;
118 void dump(Formatter
*f
) const;
122 struct osdmap_manifest_t
{
123 // all the maps we have pinned -- i.e., won't be removed unless
124 // they are inside a trim interval.
125 std::set
<version_t
> pinned
;
127 osdmap_manifest_t() {}
129 version_t
get_last_pinned() const
131 auto it
= pinned
.crbegin();
132 if (it
== pinned
.crend()) {
138 version_t
get_first_pinned() const
140 auto it
= pinned
.cbegin();
141 if (it
== pinned
.cend()) {
147 bool is_pinned(version_t v
) const
149 return pinned
.find(v
) != pinned
.end();
152 void pin(version_t v
)
157 version_t
get_lower_closest_pinned(version_t v
) const {
158 auto p
= pinned
.lower_bound(v
);
159 if (p
== pinned
.cend()) {
162 if (p
== pinned
.cbegin()) {
170 void encode(ceph::buffer::list
& bl
) const
172 ENCODE_START(1, 1, bl
);
177 void decode(ceph::buffer::list::const_iterator
& bl
)
184 void decode(ceph::buffer::list
& bl
) {
185 auto p
= bl
.cbegin();
189 void dump(ceph::Formatter
*f
) {
190 f
->dump_unsigned("first_pinned", get_first_pinned());
191 f
->dump_unsigned("last_pinned", get_last_pinned());
192 f
->open_array_section("pinned_maps");
193 for (auto& i
: pinned
) {
194 f
->dump_unsigned("epoch", i
);
199 WRITE_CLASS_ENCODER(osdmap_manifest_t
);
201 class OSDMonitor
: public PaxosService
,
202 public md_config_obs_t
{
209 const char** get_tracked_conf_keys() const override
;
210 void handle_conf_change(const ConfigProxy
& conf
,
211 const std::set
<std::string
> &changed
) override
;
213 OSDMap::Incremental pending_inc
;
214 std::map
<int, ceph::buffer::list
> pending_metadata
;
215 std::set
<int> pending_metadata_rm
;
216 std::map
<int, failure_info_t
> failure_info
;
217 std::map
<int,utime_t
> down_pending_out
; // osd down -> out
218 bool priority_convert
= false;
219 std::map
<int64_t,std::set
<snapid_t
>> pending_pseudo_purged_snaps
;
220 std::shared_ptr
<PriorityCache::PriCache
> rocksdb_binned_kv_cache
= nullptr;
221 std::shared_ptr
<PriorityCache::Manager
> pcm
= nullptr;
222 ceph::mutex balancer_lock
= ceph::make_mutex("OSDMonitor::balancer_lock");
224 std::map
<int,double> osd_weight
;
225 std::set
<int32_t> filestore_osds
;
227 using osdmap_key_t
= std::pair
<version_t
, uint64_t>;
228 using osdmap_cache_t
= SimpleLRU
<osdmap_key_t
,
230 std::less
<osdmap_key_t
>,
231 boost::hash
<osdmap_key_t
>>;
232 osdmap_cache_t inc_osd_cache
;
233 osdmap_cache_t full_osd_cache
;
235 bool has_osdmap_manifest
;
236 osdmap_manifest_t osdmap_manifest
;
238 bool check_failures(utime_t now
);
239 bool check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
);
240 utime_t
get_grace_time(utime_t now
, int target_osd
, failure_info_t
& fi
) const;
241 bool is_failure_stale(utime_t now
, failure_info_t
& fi
) const;
242 void force_failure(int target_osd
, int by
);
244 bool _have_pending_crush();
245 CrushWrapper
&_get_stable_crush();
246 CrushWrapper
_get_pending_crush();
254 struct CleanUpmapJob
: public ParallelPGMapper::Job
{
256 const OSDMap
& osdmap
;
257 OSDMap::Incremental
& pending_inc
;
258 // lock to protect pending_inc form changing
259 // when checking is done
260 ceph::mutex pending_inc_lock
=
261 ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
263 CleanUpmapJob(CephContext
*cct
, const OSDMap
& om
, OSDMap::Incremental
& pi
)
264 : ParallelPGMapper::Job(&om
),
269 void process(const std::vector
<pg_t
>& to_check
) override
{
270 std::vector
<pg_t
> to_cancel
;
271 std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int,int>>> to_remap
;
272 osdmap
.check_pg_upmaps(cct
, to_check
, &to_cancel
, &to_remap
);
273 // don't bother taking lock if nothing changes
274 if (!to_cancel
.empty() || !to_remap
.empty()) {
275 std::lock_guard
l(pending_inc_lock
);
276 osdmap
.clean_pg_upmaps(cct
, &pending_inc
, to_cancel
, to_remap
);
280 void process(int64_t poolid
, unsigned ps_begin
, unsigned ps_end
) override
{}
281 void complete() override
{}
282 }; // public as this will need to be accessible from TestTestOSDMap.cc
286 void create_initial() override
;
287 void get_store_prefixes(std::set
<std::string
>& s
) const override
;
290 void update_from_paxos(bool *need_bootstrap
) override
;
291 void create_pending() override
; // prepare a new pending
292 void encode_pending(MonitorDBStore::TransactionRef t
) override
;
293 void on_active() override
;
294 void on_restart() override
;
295 void on_shutdown() override
;
297 /* osdmap full map prune */
298 void load_osdmap_manifest();
299 bool should_prune() const;
300 void _prune_update_trimmed(
301 MonitorDBStore::TransactionRef tx
,
303 void prune_init(osdmap_manifest_t
& manifest
);
304 bool _prune_sanitize_options() const;
305 bool is_prune_enabled() const;
306 bool is_prune_supported() const;
307 bool do_prune(MonitorDBStore::TransactionRef tx
);
309 // Priority cache control
310 uint32_t mon_osd_cache_size
= 0; ///< Number of cached OSDMaps
311 uint64_t rocksdb_cache_size
= 0; ///< Cache for kv Db
312 double cache_kv_ratio
= 0; ///< Cache ratio dedicated to kv
313 double cache_inc_ratio
= 0; ///< Cache ratio dedicated to inc
314 double cache_full_ratio
= 0; ///< Cache ratio dedicated to full
315 uint64_t mon_memory_base
= 0; ///< Mon base memory for cache autotuning
316 double mon_memory_fragmentation
= 0; ///< Expected memory fragmentation
317 uint64_t mon_memory_target
= 0; ///< Mon target memory for cache autotuning
318 uint64_t mon_memory_min
= 0; ///< Min memory to cache osdmaps
319 bool mon_memory_autotune
= false; ///< Cache auto tune setting
320 int register_cache_with_pcm();
321 int _set_cache_sizes();
322 int _set_cache_ratios();
323 void _set_new_cache_sizes();
324 void _set_cache_autotuning();
325 int _update_mon_cache_settings();
327 friend struct OSDMemCache
;
328 friend struct IncCache
;
329 friend struct FullCache
;
332 * we haven't delegated full version stashing to paxosservice for some time
333 * now, making this function useless in current context.
335 void encode_full(MonitorDBStore::TransactionRef t
) override
{ }
337 * do not let paxosservice periodically stash full osdmaps, or we will break our
338 * locally-managed full maps. (update_from_paxos loads the latest and writes them
339 * out going forward from there, but if we just synced that may mean we skip some.)
341 bool should_stash_full() override
{
346 * hook into trim to include the oldest full map in the trim transaction
348 * This ensures that anyone post-sync will have enough to rebuild their
351 void encode_trim_extra(MonitorDBStore::TransactionRef tx
, version_t first
) override
;
353 void update_msgr_features();
355 * check if the cluster supports the features required by the
356 * given crush map. Outputs the daemons which don't support it
357 * to the stringstream.
359 * @returns true if the map is passable, false otherwise
361 bool validate_crush_against_features(const CrushWrapper
*newcrush
,
362 std::stringstream
&ss
);
363 void check_osdmap_subs();
364 void share_map_with_random_osd();
366 ceph::mutex prime_pg_temp_lock
=
367 ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
368 struct PrimeTempJob
: public ParallelPGMapper::Job
{
370 PrimeTempJob(const OSDMap
& om
, OSDMonitor
*m
)
371 : ParallelPGMapper::Job(&om
), osdmon(m
) {}
372 void process(int64_t pool
, unsigned ps_begin
, unsigned ps_end
) override
{
373 for (unsigned ps
= ps_begin
; ps
< ps_end
; ++ps
) {
375 osdmon
->prime_pg_temp(*osdmap
, pgid
);
378 void process(const std::vector
<pg_t
>& pgs
) override
{}
379 void complete() override
{}
381 void maybe_prime_pg_temp();
382 void prime_pg_temp(const OSDMap
& next
, pg_t pgid
);
384 ParallelPGMapper mapper
; ///< for background pg work
385 OSDMapMapping mapping
; ///< pg <-> osd mappings
386 std::unique_ptr
<ParallelPGMapper::Job
> mapping_job
; ///< background mapping job
387 void start_mapping();
389 void update_logger();
391 void handle_query(PaxosServiceMessage
*m
);
392 bool preprocess_query(MonOpRequestRef op
) override
; // true if processed.
393 bool prepare_update(MonOpRequestRef op
) override
;
394 bool should_propose(double &delay
) override
;
396 version_t
get_trim_to() const override
;
398 bool can_mark_down(int o
);
399 bool can_mark_up(int o
);
400 bool can_mark_out(int o
);
401 bool can_mark_in(int o
);
404 MOSDMap
*build_latest_full(uint64_t features
);
405 MOSDMap
*build_incremental(epoch_t first
, epoch_t last
, uint64_t features
);
406 void send_full(MonOpRequestRef op
);
407 void send_incremental(MonOpRequestRef op
, epoch_t first
);
410 * Make sure the existing (up) OSDs support the given features
411 * @return 0 on success, or an error code if any OSDs re missing features.
412 * @param ss Filled in with ane explanation of failure, if any
414 int check_cluster_features(uint64_t features
, std::stringstream
&ss
);
415 // @param req an optional op request, if the osdmaps are replies to it. so
416 // @c Monitor::send_reply() can mark_event with it.
417 void send_incremental(epoch_t first
, MonSession
*session
, bool onetime
,
418 MonOpRequestRef req
= MonOpRequestRef());
421 void print_utilization(std::ostream
&out
, ceph::Formatter
*f
, bool tree
) const;
423 bool check_source(MonOpRequestRef op
, uuid_d fsid
);
425 bool preprocess_get_osdmap(MonOpRequestRef op
);
427 bool preprocess_mark_me_down(MonOpRequestRef op
);
429 friend class C_AckMarkedDown
;
430 bool preprocess_failure(MonOpRequestRef op
);
431 bool prepare_failure(MonOpRequestRef op
);
432 bool prepare_mark_me_down(MonOpRequestRef op
);
433 void process_failures();
434 void take_all_failures(std::list
<MonOpRequestRef
>& ls
);
436 bool preprocess_mark_me_dead(MonOpRequestRef op
);
437 bool prepare_mark_me_dead(MonOpRequestRef op
);
439 bool preprocess_full(MonOpRequestRef op
);
440 bool prepare_full(MonOpRequestRef op
);
442 bool preprocess_boot(MonOpRequestRef op
);
443 bool prepare_boot(MonOpRequestRef op
);
444 void _booted(MonOpRequestRef op
, bool logit
);
446 void update_up_thru(int from
, epoch_t up_thru
);
447 bool preprocess_alive(MonOpRequestRef op
);
448 bool prepare_alive(MonOpRequestRef op
);
449 void _reply_map(MonOpRequestRef op
, epoch_t e
);
451 bool preprocess_pgtemp(MonOpRequestRef op
);
452 bool prepare_pgtemp(MonOpRequestRef op
);
454 bool preprocess_pg_created(MonOpRequestRef op
);
455 bool prepare_pg_created(MonOpRequestRef op
);
457 bool preprocess_pg_ready_to_merge(MonOpRequestRef op
);
458 bool prepare_pg_ready_to_merge(MonOpRequestRef op
);
460 int _check_remove_pool(int64_t pool_id
, const pg_pool_t
&pool
, std::ostream
*ss
);
461 bool _check_become_tier(
462 int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
463 int64_t base_pool_id
, const pg_pool_t
*base_pool
,
464 int *err
, std::ostream
*ss
) const;
465 bool _check_remove_tier(
466 int64_t base_pool_id
, const pg_pool_t
*base_pool
, const pg_pool_t
*tier_pool
,
467 int *err
, std::ostream
*ss
) const;
469 int _prepare_remove_pool(int64_t pool
, std::ostream
*ss
, bool no_fake
);
470 int _prepare_rename_pool(int64_t pool
, std::string newname
);
472 bool enforce_pool_op_caps(MonOpRequestRef op
);
473 bool preprocess_pool_op (MonOpRequestRef op
);
474 bool preprocess_pool_op_create (MonOpRequestRef op
);
475 bool prepare_pool_op (MonOpRequestRef op
);
476 bool prepare_pool_op_create (MonOpRequestRef op
);
477 bool prepare_pool_op_delete(MonOpRequestRef op
);
478 int crush_rename_bucket(const std::string
& srcname
,
479 const std::string
& dstname
,
481 void check_legacy_ec_plugin(const std::string
& plugin
,
482 const std::string
& profile
) const;
483 int normalize_profile(const std::string
& profilename
,
484 ceph::ErasureCodeProfile
&profile
,
487 int crush_rule_create_erasure(const std::string
&name
,
488 const std::string
&profile
,
491 int get_crush_rule(const std::string
&rule_name
,
494 int get_erasure_code(const std::string
&erasure_code_profile
,
495 ceph::ErasureCodeInterfaceRef
*erasure_code
,
496 std::ostream
*ss
) const;
497 int prepare_pool_crush_rule(const unsigned pool_type
,
498 const std::string
&erasure_code_profile
,
499 const std::string
&rule_name
,
502 bool erasure_code_profile_in_use(
503 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
504 const std::string
&profile
,
506 int parse_erasure_code_profile(const std::vector
<std::string
> &erasure_code_profile
,
507 std::map
<std::string
,std::string
> *erasure_code_profile_map
,
509 int prepare_pool_size(const unsigned pool_type
,
510 const std::string
&erasure_code_profile
,
512 unsigned *size
, unsigned *min_size
,
514 int prepare_pool_stripe_width(const unsigned pool_type
,
515 const std::string
&erasure_code_profile
,
516 unsigned *stripe_width
,
518 int check_pg_num(int64_t pool
, int pg_num
, int size
, int crush_rule
, std::ostream
* ss
);
519 int prepare_new_pool(std::string
& name
,
521 const std::string
&crush_rule_name
,
522 unsigned pg_num
, unsigned pgp_num
,
526 const uint64_t target_size_bytes
,
527 const float target_size_ratio
,
528 const std::string
&erasure_code_profile
,
529 const unsigned pool_type
,
530 const uint64_t expected_num_objects
,
531 FastReadType fast_read
,
532 const std::string
& pg_autoscale_mode
,
535 int prepare_new_pool(MonOpRequestRef op
);
537 void set_pool_flags(int64_t pool_id
, uint64_t flags
);
538 void clear_pool_flags(int64_t pool_id
, uint64_t flags
);
539 bool update_pools_status();
541 bool _is_removed_snap(int64_t pool_id
, snapid_t snapid
);
542 bool _is_pending_removed_snap(int64_t pool_id
, snapid_t snapid
);
544 std::string
make_purged_snap_epoch_key(epoch_t epoch
);
545 std::string
make_purged_snap_key(int64_t pool
, snapid_t snap
);
546 std::string
make_purged_snap_key_value(int64_t pool
, snapid_t snap
, snapid_t num
,
547 epoch_t epoch
, ceph::buffer::list
*v
);
549 bool try_prune_purged_snaps();
550 int lookup_purged_snap(int64_t pool
, snapid_t snap
,
551 snapid_t
*begin
, snapid_t
*end
);
553 void insert_purged_snap_update(
555 snapid_t start
, snapid_t end
,
557 MonitorDBStore::TransactionRef t
);
559 bool prepare_set_flag(MonOpRequestRef op
, int flag
);
560 bool prepare_unset_flag(MonOpRequestRef op
, int flag
);
562 void _pool_op_reply(MonOpRequestRef op
,
563 int ret
, epoch_t epoch
, ceph::buffer::list
*blp
=NULL
);
565 struct C_Booted
: public C_MonOp
{
568 C_Booted(OSDMonitor
*cm
, MonOpRequestRef op_
, bool l
=true) :
569 C_MonOp(op_
), cmon(cm
), logit(l
) {}
570 void _finish(int r
) override
{
572 cmon
->_booted(op
, logit
);
573 else if (r
== -ECANCELED
)
575 else if (r
== -EAGAIN
)
578 ceph_abort_msg("bad C_Booted return value");
582 struct C_ReplyMap
: public C_MonOp
{
585 C_ReplyMap(OSDMonitor
*o
, MonOpRequestRef op_
, epoch_t ee
)
586 : C_MonOp(op_
), osdmon(o
), e(ee
) {}
587 void _finish(int r
) override
{
589 osdmon
->_reply_map(op
, e
);
590 else if (r
== -ECANCELED
)
592 else if (r
== -EAGAIN
)
593 osdmon
->dispatch(op
);
595 ceph_abort_msg("bad C_ReplyMap return value");
598 struct C_PoolOp
: public C_MonOp
{
602 ceph::buffer::list reply_data
;
603 C_PoolOp(OSDMonitor
* osd
, MonOpRequestRef op_
, int rc
, int e
, ceph::buffer::list
*rd
=NULL
) :
604 C_MonOp(op_
), osdmon(osd
), replyCode(rc
), epoch(e
) {
608 void _finish(int r
) override
{
610 osdmon
->_pool_op_reply(op
, replyCode
, epoch
, &reply_data
);
611 else if (r
== -ECANCELED
)
613 else if (r
== -EAGAIN
)
614 osdmon
->dispatch(op
);
616 ceph_abort_msg("bad C_PoolOp return value");
620 bool preprocess_remove_snaps(MonOpRequestRef op
);
621 bool prepare_remove_snaps(MonOpRequestRef op
);
623 bool preprocess_get_purged_snaps(MonOpRequestRef op
);
625 int load_metadata(int osd
, std::map
<std::string
, std::string
>& m
,
627 void count_metadata(const std::string
& field
, ceph::Formatter
*f
);
629 void reencode_incremental_map(ceph::buffer::list
& bl
, uint64_t features
);
630 void reencode_full_map(ceph::buffer::list
& bl
, uint64_t features
);
632 void count_metadata(const std::string
& field
, std::map
<std::string
,int> *out
);
633 void get_versions(std::map
<std::string
, std::list
<std::string
>> &versions
);
634 void get_filestore_osd_list();
635 void check_for_filestore_osds(health_check_map_t
*checks
);
637 int get_osd_objectstore_type(int osd
, std::string
*type
);
638 bool is_pool_currently_all_bluestore(int64_t pool_id
, const pg_pool_t
&pool
,
641 // when we last received PG stats from each osd and the osd's osd_beacon_report_interval
642 std::map
<int, std::pair
<utime_t
, int>> last_osd_report
;
643 // TODO: use last_osd_report to store the osd report epochs, once we don't
644 // need to upgrade from pre-luminous releases.
645 std::map
<int,epoch_t
> osd_epochs
;
646 LastEpochClean last_epoch_clean
;
647 bool preprocess_beacon(MonOpRequestRef op
);
648 bool prepare_beacon(MonOpRequestRef op
);
649 epoch_t
get_min_last_epoch_clean() const;
651 friend class C_UpdateCreatingPGs
;
652 std::map
<int, std::map
<epoch_t
, std::set
<spg_t
>>> creating_pgs_by_osd_epoch
;
653 std::vector
<pg_t
> pending_created_pgs
;
654 // the epoch when the pg mapping was calculated
655 epoch_t creating_pgs_epoch
= 0;
656 creating_pgs_t creating_pgs
;
657 mutable std::mutex creating_pgs_lock
;
659 creating_pgs_t
update_pending_pgs(const OSDMap::Incremental
& inc
,
660 const OSDMap
& nextmap
);
661 unsigned scan_for_creating_pgs(
662 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
663 const mempool::osdmap::set
<int64_t>& removed_pools
,
665 creating_pgs_t
* creating_pgs
) const;
666 std::pair
<int32_t, pg_t
> get_parent_pg(pg_t pgid
) const;
667 void update_creating_pgs();
668 void check_pg_creates_subs();
669 epoch_t
send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const;
671 int32_t _allocate_osd_id(int32_t* existing_id
);
673 int get_grace_interval_threshold();
674 bool grace_interval_threshold_exceeded(int last_failed
);
675 void set_default_laggy_params(int target_osd
);
678 OSDMonitor(CephContext
*cct
, Monitor
&mn
, Paxos
&p
, const std::string
& service_name
);
680 void tick() override
; // check state, take actions
682 bool preprocess_command(MonOpRequestRef op
);
683 bool prepare_command(MonOpRequestRef op
);
684 bool prepare_command_impl(MonOpRequestRef op
, const cmdmap_t
& cmdmap
);
686 int validate_osd_create(
689 const bool check_osd_exists
,
690 int32_t* existing_id
,
691 std::stringstream
& ss
);
692 int prepare_command_osd_create(
695 int32_t* existing_id
,
696 std::stringstream
& ss
);
697 void do_osd_create(const int32_t id
, const uuid_d
& uuid
,
698 const std::string
& device_class
,
700 int prepare_command_osd_purge(int32_t id
, std::stringstream
& ss
);
701 int prepare_command_osd_destroy(int32_t id
, std::stringstream
& ss
);
702 int _prepare_command_osd_crush_remove(
703 CrushWrapper
&newcrush
,
708 void do_osd_crush_remove(CrushWrapper
& newcrush
);
709 int prepare_command_osd_crush_remove(
710 CrushWrapper
&newcrush
,
715 int prepare_command_osd_remove(int32_t id
);
716 int prepare_command_osd_new(
718 const cmdmap_t
& cmdmap
,
719 const std::map
<std::string
,std::string
>& secrets
,
720 std::stringstream
&ss
,
723 int prepare_command_pool_set(const cmdmap_t
& cmdmap
,
724 std::stringstream
& ss
);
726 int prepare_command_pool_application(const std::string
&prefix
,
727 const cmdmap_t
& cmdmap
,
728 std::stringstream
& ss
);
729 int preprocess_command_pool_application(const std::string
&prefix
,
730 const cmdmap_t
& cmdmap
,
731 std::stringstream
& ss
,
733 int _command_pool_application(const std::string
&prefix
,
734 const cmdmap_t
& cmdmap
,
735 std::stringstream
& ss
,
739 bool handle_osd_timeouts(const utime_t
&now
,
740 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
);
742 void send_latest(MonOpRequestRef op
, epoch_t start
=0);
743 void send_latest_now_nodelete(MonOpRequestRef op
, epoch_t start
=0) {
744 op
->mark_osdmon_event(__func__
);
745 send_incremental(op
, start
);
748 int get_version(version_t ver
, ceph::buffer::list
& bl
) override
;
749 int get_version(version_t ver
, uint64_t feature
, ceph::buffer::list
& bl
);
751 int get_version_full(version_t ver
, uint64_t feature
, ceph::buffer::list
& bl
);
752 int get_version_full(version_t ver
, ceph::buffer::list
& bl
) override
;
753 int get_inc(version_t ver
, OSDMap::Incremental
& inc
);
754 int get_full_from_pinned_map(version_t ver
, ceph::buffer::list
& bl
);
756 epoch_t
blocklist(const entity_addrvec_t
& av
, utime_t until
);
757 epoch_t
blocklist(entity_addr_t a
, utime_t until
);
759 void dump_info(ceph::Formatter
*f
);
760 int dump_osd_metadata(int osd
, ceph::Formatter
*f
, std::ostream
*err
);
761 void print_nodes(ceph::Formatter
*f
);
763 void check_osdmap_sub(Subscription
*sub
);
764 void check_pg_creates_sub(Subscription
*sub
);
766 void do_application_enable(int64_t pool_id
, const std::string
&app_name
,
767 const std::string
&app_key
="",
768 const std::string
&app_value
="",
770 void do_set_pool_opt(int64_t pool_id
, pool_opts_t::key_t opt
,
771 pool_opts_t::value_t
);
773 void add_flag(int flag
) {
774 if (!(osdmap
.flags
& flag
)) {
775 if (pending_inc
.new_flags
< 0)
776 pending_inc
.new_flags
= osdmap
.flags
;
777 pending_inc
.new_flags
|= flag
;
781 void remove_flag(int flag
) {
782 if(osdmap
.flags
& flag
) {
783 if (pending_inc
.new_flags
< 0)
784 pending_inc
.new_flags
= osdmap
.flags
;
785 pending_inc
.new_flags
&= ~flag
;
788 void convert_pool_priorities(void);
790 * Find the pools which are requested to be put into stretch mode,
791 * validate that they are allowed to be in stretch mode (eg, are replicated)
792 * and place copies of them in the pools set.
793 * This does not make any changes to the pools or state; it's just
794 * a safety-check-and-collect function.
796 void try_enable_stretch_mode_pools(std::stringstream
& ss
, bool *okay
,
798 std::set
<pg_pool_t
*>* pools
,
799 const std::string
& new_crush_rule
);
801 * Check validity of inputs and OSD/CRUSH state to
802 * engage stretch mode. Designed to be used with
803 * MonmapMonitor::try_enable_stretch_mode() where we call both twice,
804 * first with commit=false to validate.
805 * @param ss: a stringstream to write errors into
806 * @param okay: Filled to true if okay, false if validation fails
807 * @param errcode: filled with -errno if there's a problem
808 * @param commit: true if we should commit the change, false if just testing
809 * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
810 * @param bucket_count: The number of buckets required in peering.
811 * Currently must be 2.
812 * @param pools: The pg_pool_ts which are being set to stretch mode (obtained
813 * from try_enable_stretch_mode_pools()).
814 * @param new_crush_rule: The crush rule to set the pools to.
816 void try_enable_stretch_mode(std::stringstream
& ss
, bool *okay
,
817 int *errcode
, bool commit
,
818 const std::string
& dividing_bucket
,
819 uint32_t bucket_count
,
820 const std::set
<pg_pool_t
*>& pools
,
821 const std::string
& new_crush_rule
);
823 * Check the input dead_buckets mapping (buckets->dead monitors) to see
824 * if the OSDs are also down. If so, fill in really_down_buckets and
825 * really_down_mons and return true; else return false.
827 bool check_for_dead_crush_zones(const std::map
<std::string
,std::set
<std::string
>>& dead_buckets
,
828 std::set
<int> *really_down_buckets
,
829 std::set
<std::string
> *really_down_mons
);
831 * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set
832 * and using the live_zones (should presently be size 1)
834 void trigger_degraded_stretch_mode(const std::set
<int>& dead_buckets
,
835 const std::set
<std::string
>& live_zones
);
837 * This is just to maintain stretch_recovery_triggered; below
839 void set_degraded_stretch_mode();
841 * Set recovery stretch mode in the OSDMap, resetting pool size back to normal
843 void trigger_recovery_stretch_mode();
845 * This is just to maintain stretch_recovery_triggered; below
847 void set_recovery_stretch_mode();
849 * This is just to maintain stretch_recovery_triggered; below
851 void set_healthy_stretch_mode();
853 * Tells the OSD there's a new pg digest, in case it's interested.
854 * (It's interested when in recovering stretch mode.)
856 void notify_new_pg_digest();
858 * Check if we can exit recovery stretch mode and go back to normal.
859 * @param force If true, we will force the exit through once it is legal,
860 * without regard to the reported PG status.
862 void try_end_recovery_stretch_mode(bool force
);
864 * Sets the osdmap and pg_pool_t values back to healthy stretch mode status.
866 void trigger_healthy_stretch_mode();
868 * Obtain the crush rule being used for stretch pools.
869 * Note that right now this is heuristic and simply selects the
870 * most-used rule on replicated stretch pools.
871 * @return the crush rule ID, or a negative errno
873 int get_replicated_stretch_crush_rule();
875 utime_t stretch_recovery_triggered
; // what time we committed a switch to recovery mode