// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

/* Object Store Device (OSD) Monitor
 */
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
27 #include "include/types.h"
28 #include "include/encoding.h"
29 #include "common/simple_cache.hpp"
30 #include "msg/Messenger.h"
32 #include "osd/OSDMap.h"
33 #include "osd/OSDMapMapping.h"
35 #include "CreatingPGs.h"
36 #include "PaxosService.h"
43 #include "erasure-code/ErasureCodeInterface.h"
44 #include "mon/MonOpRequest.h"
45 #include <boost/functional/hash.hpp>
46 // re-include our assert to clobber the system one; fix dout:
47 #include "include/ceph_assert.h"
49 /// information about a particular peer's failure reports for one osd
50 struct failure_reporter_t
{
51 utime_t failed_since
; ///< when they think it failed
52 MonOpRequestRef op
; ///< failure op request
54 failure_reporter_t() {}
55 explicit failure_reporter_t(utime_t s
) : failed_since(s
) {}
56 ~failure_reporter_t() { }
59 /// information about all failure reports for one osd
60 struct failure_info_t
{
61 map
<int, failure_reporter_t
> reporters
; ///< reporter -> failed_since etc
62 utime_t max_failed_since
; ///< most recent failed_since
66 utime_t
get_failed_since() {
67 if (max_failed_since
== utime_t() && !reporters
.empty()) {
68 // the old max must have canceled; recalculate.
69 for (map
<int, failure_reporter_t
>::iterator p
= reporters
.begin();
72 if (p
->second
.failed_since
> max_failed_since
)
73 max_failed_since
= p
->second
.failed_since
;
75 return max_failed_since
;
78 // set the message for the latest report. return any old op request we had,
79 // if any, so we can discard it.
80 MonOpRequestRef
add_report(int who
, utime_t failed_since
,
82 map
<int, failure_reporter_t
>::iterator p
= reporters
.find(who
);
83 if (p
== reporters
.end()) {
84 if (max_failed_since
!= utime_t() && max_failed_since
< failed_since
)
85 max_failed_since
= failed_since
;
86 p
= reporters
.insert(map
<int, failure_reporter_t
>::value_type(who
, failure_reporter_t(failed_since
))).first
;
89 MonOpRequestRef ret
= p
->second
.op
;
94 void take_report_messages(list
<MonOpRequestRef
>& ls
) {
95 for (map
<int, failure_reporter_t
>::iterator p
= reporters
.begin();
99 ls
.push_back(p
->second
.op
);
100 p
->second
.op
.reset();
105 MonOpRequestRef
cancel_report(int who
) {
106 map
<int, failure_reporter_t
>::iterator p
= reporters
.find(who
);
107 if (p
== reporters
.end())
108 return MonOpRequestRef();
109 MonOpRequestRef ret
= p
->second
.op
;
111 max_failed_since
= utime_t();
117 class LastEpochClean
{
119 vector
<epoch_t
> epoch_by_pg
;
120 ps_t next_missing
= 0;
121 epoch_t floor
= std::numeric_limits
<epoch_t
>::max();
122 void report(ps_t pg
, epoch_t last_epoch_clean
);
124 std::map
<uint64_t, Lec
> report_by_pool
;
126 void report(const pg_t
& pg
, epoch_t last_epoch_clean
);
127 void remove_pool(uint64_t pool
);
128 epoch_t
get_lower_bound(const OSDMap
& latest
) const;
132 struct osdmap_manifest_t
{
133 // all the maps we have pinned -- i.e., won't be removed unless
134 // they are inside a trim interval.
135 set
<version_t
> pinned
;
137 osdmap_manifest_t() {}
139 version_t
get_last_pinned() const
141 set
<version_t
>::const_reverse_iterator it
= pinned
.crbegin();
142 if (it
== pinned
.crend()) {
148 version_t
get_first_pinned() const
150 set
<version_t
>::const_iterator it
= pinned
.cbegin();
151 if (it
== pinned
.cend()) {
157 bool is_pinned(version_t v
) const
159 return pinned
.find(v
) != pinned
.end();
162 void pin(version_t v
)
167 version_t
get_lower_closest_pinned(version_t v
) const {
168 set
<version_t
>::const_iterator p
= pinned
.lower_bound(v
);
169 if (p
== pinned
.cend()) {
172 if (p
== pinned
.cbegin()) {
180 void encode(bufferlist
& bl
) const
182 ENCODE_START(1, 1, bl
);
187 void decode(bufferlist::const_iterator
& bl
)
194 void decode(bufferlist
& bl
) {
195 auto p
= bl
.cbegin();
199 void dump(Formatter
*f
) {
200 f
->dump_unsigned("first_pinned", get_first_pinned());
201 f
->dump_unsigned("last_pinned", get_last_pinned());
202 f
->open_array_section("pinned_maps");
203 for (auto& i
: pinned
) {
204 f
->dump_unsigned("epoch", i
);
209 WRITE_CLASS_ENCODER(osdmap_manifest_t
);
211 class OSDMonitor
: public PaxosService
{
218 OSDMap::Incremental pending_inc
;
219 map
<int, bufferlist
> pending_metadata
;
220 set
<int> pending_metadata_rm
;
221 map
<int, failure_info_t
> failure_info
;
222 map
<int,utime_t
> down_pending_out
; // osd down -> out
224 map
<int,double> osd_weight
;
226 using osdmap_key_t
= std::pair
<version_t
, uint64_t>;
227 using osdmap_cache_t
= SimpleLRU
<osdmap_key_t
,
229 std::less
<osdmap_key_t
>,
230 boost::hash
<osdmap_key_t
>>;
231 osdmap_cache_t inc_osd_cache
;
232 osdmap_cache_t full_osd_cache
;
234 bool has_osdmap_manifest
;
235 osdmap_manifest_t osdmap_manifest
;
237 bool check_failures(utime_t now
);
238 bool check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
);
239 void force_failure(int target_osd
, int by
);
241 bool _have_pending_crush();
242 CrushWrapper
&_get_stable_crush();
243 void _get_pending_crush(CrushWrapper
& newcrush
);
253 void create_initial() override
;
254 void get_store_prefixes(std::set
<string
>& s
) const override
;
257 void update_from_paxos(bool *need_bootstrap
) override
;
258 void create_pending() override
; // prepare a new pending
259 void encode_pending(MonitorDBStore::TransactionRef t
) override
;
260 void on_active() override
;
261 void on_restart() override
;
262 void on_shutdown() override
;
264 /* osdmap full map prune */
265 void load_osdmap_manifest();
266 bool should_prune() const;
267 void _prune_update_trimmed(
268 MonitorDBStore::TransactionRef tx
,
270 void prune_init(osdmap_manifest_t
& manifest
);
271 bool _prune_sanitize_options() const;
272 bool is_prune_enabled() const;
273 bool is_prune_supported() const;
274 bool do_prune(MonitorDBStore::TransactionRef tx
);
277 * we haven't delegated full version stashing to paxosservice for some time
278 * now, making this function useless in current context.
280 void encode_full(MonitorDBStore::TransactionRef t
) override
{ }
282 * do not let paxosservice periodically stash full osdmaps, or we will break our
283 * locally-managed full maps. (update_from_paxos loads the latest and writes them
284 * out going forward from there, but if we just synced that may mean we skip some.)
286 bool should_stash_full() override
{
291 * hook into trim to include the oldest full map in the trim transaction
293 * This ensures that anyone post-sync will have enough to rebuild their
296 void encode_trim_extra(MonitorDBStore::TransactionRef tx
, version_t first
) override
;
298 void update_msgr_features();
299 int check_cluster_features(uint64_t features
, stringstream
&ss
);
301 * check if the cluster supports the features required by the
302 * given crush map. Outputs the daemons which don't support it
303 * to the stringstream.
305 * @returns true if the map is passable, false otherwise
307 bool validate_crush_against_features(const CrushWrapper
*newcrush
,
309 void check_osdmap_subs();
310 void share_map_with_random_osd();
312 Mutex prime_pg_temp_lock
= {"OSDMonitor::prime_pg_temp_lock"};
313 struct PrimeTempJob
: public ParallelPGMapper::Job
{
315 PrimeTempJob(const OSDMap
& om
, OSDMonitor
*m
)
316 : ParallelPGMapper::Job(&om
), osdmon(m
) {}
317 void process(int64_t pool
, unsigned ps_begin
, unsigned ps_end
) override
{
318 for (unsigned ps
= ps_begin
; ps
< ps_end
; ++ps
) {
320 osdmon
->prime_pg_temp(*osdmap
, pgid
);
323 void complete() override
{}
325 void maybe_prime_pg_temp();
326 void prime_pg_temp(const OSDMap
& next
, pg_t pgid
);
328 ParallelPGMapper mapper
; ///< for background pg work
329 OSDMapMapping mapping
; ///< pg <-> osd mappings
330 unique_ptr
<ParallelPGMapper::Job
> mapping_job
; ///< background mapping job
331 void start_mapping();
333 void update_logger();
335 void handle_query(PaxosServiceMessage
*m
);
336 bool preprocess_query(MonOpRequestRef op
) override
; // true if processed.
337 bool prepare_update(MonOpRequestRef op
) override
;
338 bool should_propose(double &delay
) override
;
340 version_t
get_trim_to() const override
;
342 bool can_mark_down(int o
);
343 bool can_mark_up(int o
);
344 bool can_mark_out(int o
);
345 bool can_mark_in(int o
);
348 MOSDMap
*build_latest_full(uint64_t features
);
349 MOSDMap
*build_incremental(epoch_t first
, epoch_t last
, uint64_t features
);
350 void send_full(MonOpRequestRef op
);
351 void send_incremental(MonOpRequestRef op
, epoch_t first
);
353 // @param req an optional op request, if the osdmaps are replies to it. so
354 // @c Monitor::send_reply() can mark_event with it.
355 void send_incremental(epoch_t first
, MonSession
*session
, bool onetime
,
356 MonOpRequestRef req
= MonOpRequestRef());
359 void print_utilization(ostream
&out
, Formatter
*f
, bool tree
) const;
361 bool check_source(MonOpRequestRef op
, uuid_d fsid
);
363 bool preprocess_get_osdmap(MonOpRequestRef op
);
365 bool preprocess_mark_me_down(MonOpRequestRef op
);
367 friend class C_AckMarkedDown
;
368 bool preprocess_failure(MonOpRequestRef op
);
369 bool prepare_failure(MonOpRequestRef op
);
370 bool prepare_mark_me_down(MonOpRequestRef op
);
371 void process_failures();
372 void take_all_failures(list
<MonOpRequestRef
>& ls
);
374 bool preprocess_full(MonOpRequestRef op
);
375 bool prepare_full(MonOpRequestRef op
);
377 bool preprocess_boot(MonOpRequestRef op
);
378 bool prepare_boot(MonOpRequestRef op
);
379 void _booted(MonOpRequestRef op
, bool logit
);
381 void update_up_thru(int from
, epoch_t up_thru
);
382 bool preprocess_alive(MonOpRequestRef op
);
383 bool prepare_alive(MonOpRequestRef op
);
384 void _reply_map(MonOpRequestRef op
, epoch_t e
);
386 bool preprocess_pgtemp(MonOpRequestRef op
);
387 bool prepare_pgtemp(MonOpRequestRef op
);
389 bool preprocess_pg_created(MonOpRequestRef op
);
390 bool prepare_pg_created(MonOpRequestRef op
);
392 bool preprocess_pg_ready_to_merge(MonOpRequestRef op
);
393 bool prepare_pg_ready_to_merge(MonOpRequestRef op
);
395 int _check_remove_pool(int64_t pool_id
, const pg_pool_t
&pool
, ostream
*ss
);
396 bool _check_become_tier(
397 int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
398 int64_t base_pool_id
, const pg_pool_t
*base_pool
,
399 int *err
, ostream
*ss
) const;
400 bool _check_remove_tier(
401 int64_t base_pool_id
, const pg_pool_t
*base_pool
, const pg_pool_t
*tier_pool
,
402 int *err
, ostream
*ss
) const;
404 int _prepare_remove_pool(int64_t pool
, ostream
*ss
, bool no_fake
);
405 int _prepare_rename_pool(int64_t pool
, string newname
);
407 bool enforce_pool_op_caps(MonOpRequestRef op
);
408 bool preprocess_pool_op (MonOpRequestRef op
);
409 bool preprocess_pool_op_create (MonOpRequestRef op
);
410 bool prepare_pool_op (MonOpRequestRef op
);
411 bool prepare_pool_op_create (MonOpRequestRef op
);
412 bool prepare_pool_op_delete(MonOpRequestRef op
);
413 int crush_rename_bucket(const string
& srcname
,
414 const string
& dstname
,
416 void check_legacy_ec_plugin(const string
& plugin
,
417 const string
& profile
) const;
418 int normalize_profile(const string
& profilename
,
419 ErasureCodeProfile
&profile
,
422 int crush_rule_create_erasure(const string
&name
,
423 const string
&profile
,
426 int get_crush_rule(const string
&rule_name
,
429 int get_erasure_code(const string
&erasure_code_profile
,
430 ErasureCodeInterfaceRef
*erasure_code
,
432 int prepare_pool_crush_rule(const unsigned pool_type
,
433 const string
&erasure_code_profile
,
434 const string
&rule_name
,
437 bool erasure_code_profile_in_use(
438 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
439 const string
&profile
,
441 int parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
442 map
<string
,string
> *erasure_code_profile_map
,
444 int prepare_pool_size(const unsigned pool_type
,
445 const string
&erasure_code_profile
,
447 unsigned *size
, unsigned *min_size
,
449 int prepare_pool_stripe_width(const unsigned pool_type
,
450 const string
&erasure_code_profile
,
451 unsigned *stripe_width
,
453 int check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
* ss
);
454 int prepare_new_pool(string
& name
,
456 const string
&crush_rule_name
,
457 unsigned pg_num
, unsigned pgp_num
,
460 const uint64_t target_size_bytes
,
461 const float target_size_ratio
,
462 const string
&erasure_code_profile
,
463 const unsigned pool_type
,
464 const uint64_t expected_num_objects
,
465 FastReadType fast_read
,
467 int prepare_new_pool(MonOpRequestRef op
);
469 void set_pool_flags(int64_t pool_id
, uint64_t flags
);
470 void clear_pool_flags(int64_t pool_id
, uint64_t flags
);
471 bool update_pools_status();
473 string
make_snap_epoch_key(int64_t pool
, epoch_t epoch
);
474 string
make_snap_key(int64_t pool
, snapid_t snap
);
475 string
make_snap_key_value(int64_t pool
, snapid_t snap
, snapid_t num
,
476 epoch_t epoch
, bufferlist
*v
);
477 string
make_snap_purged_key(int64_t pool
, snapid_t snap
);
478 string
make_snap_purged_key_value(int64_t pool
, snapid_t snap
, snapid_t num
,
479 epoch_t epoch
, bufferlist
*v
);
480 bool try_prune_purged_snaps();
481 int lookup_pruned_snap(int64_t pool
, snapid_t snap
,
482 snapid_t
*begin
, snapid_t
*end
);
484 bool prepare_set_flag(MonOpRequestRef op
, int flag
);
485 bool prepare_unset_flag(MonOpRequestRef op
, int flag
);
487 void _pool_op_reply(MonOpRequestRef op
,
488 int ret
, epoch_t epoch
, bufferlist
*blp
=NULL
);
490 struct C_Booted
: public C_MonOp
{
493 C_Booted(OSDMonitor
*cm
, MonOpRequestRef op_
, bool l
=true) :
494 C_MonOp(op_
), cmon(cm
), logit(l
) {}
495 void _finish(int r
) override
{
497 cmon
->_booted(op
, logit
);
498 else if (r
== -ECANCELED
)
500 else if (r
== -EAGAIN
)
503 ceph_abort_msg("bad C_Booted return value");
507 struct C_ReplyMap
: public C_MonOp
{
510 C_ReplyMap(OSDMonitor
*o
, MonOpRequestRef op_
, epoch_t ee
)
511 : C_MonOp(op_
), osdmon(o
), e(ee
) {}
512 void _finish(int r
) override
{
514 osdmon
->_reply_map(op
, e
);
515 else if (r
== -ECANCELED
)
517 else if (r
== -EAGAIN
)
518 osdmon
->dispatch(op
);
520 ceph_abort_msg("bad C_ReplyMap return value");
523 struct C_PoolOp
: public C_MonOp
{
527 bufferlist reply_data
;
528 C_PoolOp(OSDMonitor
* osd
, MonOpRequestRef op_
, int rc
, int e
, bufferlist
*rd
=NULL
) :
529 C_MonOp(op_
), osdmon(osd
), replyCode(rc
), epoch(e
) {
533 void _finish(int r
) override
{
535 osdmon
->_pool_op_reply(op
, replyCode
, epoch
, &reply_data
);
536 else if (r
== -ECANCELED
)
538 else if (r
== -EAGAIN
)
539 osdmon
->dispatch(op
);
541 ceph_abort_msg("bad C_PoolOp return value");
545 bool preprocess_remove_snaps(MonOpRequestRef op
);
546 bool prepare_remove_snaps(MonOpRequestRef op
);
548 int load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
);
549 void count_metadata(const string
& field
, Formatter
*f
);
551 void reencode_incremental_map(bufferlist
& bl
, uint64_t features
);
552 void reencode_full_map(bufferlist
& bl
, uint64_t features
);
554 void count_metadata(const string
& field
, map
<string
,int> *out
);
556 int get_osd_objectstore_type(int osd
, std::string
*type
);
557 bool is_pool_currently_all_bluestore(int64_t pool_id
, const pg_pool_t
&pool
,
560 // when we last received PG stats from each osd
561 map
<int,utime_t
> last_osd_report
;
562 // TODO: use last_osd_report to store the osd report epochs, once we don't
563 // need to upgrade from pre-luminous releases.
564 map
<int,epoch_t
> osd_epochs
;
565 LastEpochClean last_epoch_clean
;
566 bool preprocess_beacon(MonOpRequestRef op
);
567 bool prepare_beacon(MonOpRequestRef op
);
568 epoch_t
get_min_last_epoch_clean() const;
570 friend class C_UpdateCreatingPGs
;
571 std::map
<int, std::map
<epoch_t
, std::set
<spg_t
>>> creating_pgs_by_osd_epoch
;
572 std::vector
<pg_t
> pending_created_pgs
;
573 // the epoch when the pg mapping was calculated
574 epoch_t creating_pgs_epoch
= 0;
575 creating_pgs_t creating_pgs
;
576 mutable std::mutex creating_pgs_lock
;
578 creating_pgs_t
update_pending_pgs(const OSDMap::Incremental
& inc
,
579 const OSDMap
& nextmap
);
580 unsigned scan_for_creating_pgs(
581 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
582 const mempool::osdmap::set
<int64_t>& removed_pools
,
584 creating_pgs_t
* creating_pgs
) const;
585 pair
<int32_t, pg_t
> get_parent_pg(pg_t pgid
) const;
586 void update_creating_pgs();
587 void check_pg_creates_subs();
588 epoch_t
send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const;
590 int32_t _allocate_osd_id(int32_t* existing_id
);
593 OSDMonitor(CephContext
*cct
, Monitor
*mn
, Paxos
*p
, const string
& service_name
);
595 void tick() override
; // check state, take actions
597 bool preprocess_command(MonOpRequestRef op
);
598 bool prepare_command(MonOpRequestRef op
);
599 bool prepare_command_impl(MonOpRequestRef op
, const cmdmap_t
& cmdmap
);
601 int validate_osd_create(
604 const bool check_osd_exists
,
605 int32_t* existing_id
,
607 int prepare_command_osd_create(
610 int32_t* existing_id
,
612 void do_osd_create(const int32_t id
, const uuid_d
& uuid
,
613 const string
& device_class
,
615 int prepare_command_osd_purge(int32_t id
, stringstream
& ss
);
616 int prepare_command_osd_destroy(int32_t id
, stringstream
& ss
);
617 int _prepare_command_osd_crush_remove(
618 CrushWrapper
&newcrush
,
623 void do_osd_crush_remove(CrushWrapper
& newcrush
);
624 int prepare_command_osd_crush_remove(
625 CrushWrapper
&newcrush
,
630 int prepare_command_osd_remove(int32_t id
);
631 int prepare_command_osd_new(
633 const cmdmap_t
& cmdmap
,
634 const map
<string
,string
>& secrets
,
638 int prepare_command_pool_set(const cmdmap_t
& cmdmap
,
641 int prepare_command_pool_application(const string
&prefix
,
642 const cmdmap_t
& cmdmap
,
644 int preprocess_command_pool_application(const string
&prefix
,
645 const cmdmap_t
& cmdmap
,
648 int _command_pool_application(const string
&prefix
,
649 const cmdmap_t
& cmdmap
,
654 bool handle_osd_timeouts(const utime_t
&now
,
655 std::map
<int,utime_t
> &last_osd_report
);
657 void send_latest(MonOpRequestRef op
, epoch_t start
=0);
658 void send_latest_now_nodelete(MonOpRequestRef op
, epoch_t start
=0) {
659 op
->mark_osdmon_event(__func__
);
660 send_incremental(op
, start
);
663 void get_removed_snaps_range(
664 epoch_t start
, epoch_t end
,
665 mempool::osdmap::map
<int64_t,OSDMap::snap_interval_set_t
> *gap_removed_snaps
);
667 int get_version(version_t ver
, bufferlist
& bl
) override
;
668 int get_version(version_t ver
, uint64_t feature
, bufferlist
& bl
);
670 int get_version_full(version_t ver
, uint64_t feature
, bufferlist
& bl
);
671 int get_version_full(version_t ver
, bufferlist
& bl
) override
;
672 int get_inc(version_t ver
, OSDMap::Incremental
& inc
);
673 int get_full_from_pinned_map(version_t ver
, bufferlist
& bl
);
675 epoch_t
blacklist(const entity_addrvec_t
& av
, utime_t until
);
676 epoch_t
blacklist(entity_addr_t a
, utime_t until
);
678 void dump_info(Formatter
*f
);
679 int dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
);
680 void print_nodes(Formatter
*f
);
682 void check_osdmap_sub(Subscription
*sub
);
683 void check_pg_creates_sub(Subscription
*sub
);
685 void do_application_enable(int64_t pool_id
, const std::string
&app_name
,
686 const std::string
&app_key
="",
687 const std::string
&app_value
="");
689 void add_flag(int flag
) {
690 if (!(osdmap
.flags
& flag
)) {
691 if (pending_inc
.new_flags
< 0)
692 pending_inc
.new_flags
= osdmap
.flags
;
693 pending_inc
.new_flags
|= flag
;
697 void remove_flag(int flag
) {
698 if(osdmap
.flags
& flag
) {
699 if (pending_inc
.new_flags
< 0)
700 pending_inc
.new_flags
= osdmap
.flags
;
701 pending_inc
.new_flags
&= ~flag
;