1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include <boost/container/flat_set.hpp>
28 #include "include/mempool.h"
30 // re-include our assert to clobber boost's
31 #include "include/ceph_assert.h"
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
39 #include "common/Timer.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46 #include "PGPeeringEvent.h"
48 #include "mgr/OSDPerfMetricTypes.h"
57 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
58 //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
71 typedef OpRequest::Ref OpRequestRef
;
74 class DynamicPerfStats
;
80 using state_history_entry
= std::tuple
<utime_t
, utime_t
, const char*>;
81 using embedded_state
= std::pair
<utime_t
, const char*>;
83 struct PGStateInstance
{
84 // Time spent in pg states
86 void setepoch(const epoch_t current_epoch
) {
87 this_epoch
= current_epoch
;
90 void enter_state(const utime_t entime
, const char* state
) {
91 embedded_states
.push(std::make_pair(entime
, state
));
94 void exit_state(const utime_t extime
) {
95 embedded_state this_state
= embedded_states
.top();
96 state_history
.push_back(state_history_entry
{
97 this_state
.first
, extime
, this_state
.second
});
98 embedded_states
.pop();
103 std::vector
<state_history_entry
> state_history
;
104 std::stack
<embedded_state
> embedded_states
;
107 class PGStateHistory
{
108 // Member access protected with the PG lock
110 PGStateHistory() : buffer(10) {}
112 void enter(PG
* pg
, const utime_t entime
, const char* state
);
114 void exit(const char* state
);
120 void set_pg_in_destructor() { pg_in_destructor
= true; }
122 void dump(Formatter
* f
) const;
124 string
get_current_state() {
125 if (pi
== nullptr) return "unknown";
126 return std::get
<1>(pi
->embedded_states
.top());
130 bool pg_in_destructor
= false;
131 PG
* thispg
= nullptr;
132 std::unique_ptr
<PGStateInstance
> tmppi
;
133 PGStateInstance
* pi
= nullptr;
134 boost::circular_buffer
<std::unique_ptr
<PGStateInstance
>> buffer
;
139 #include "common/tracked_int_ptr.hpp"
140 uint64_t get_with_id(PG
*pg
);
141 void put_with_id(PG
*pg
, uint64_t id
);
142 typedef TrackedIntPtr
<PG
> PGRef
;
144 typedef boost::intrusive_ptr
<PG
> PGRef
;
147 class PGRecoveryStats
{
148 struct per_state_info
{
149 uint64_t enter
, exit
; // enter/exit counts
151 utime_t event_time
; // time spent processing events
152 utime_t total_time
; // total time in state
153 utime_t min_time
, max_time
;
155 // cppcheck-suppress unreachableCode
156 per_state_info() : enter(0), exit(0), events(0) {}
158 map
<const char *,per_state_info
> info
;
162 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
165 std::lock_guard
l(lock
);
168 void dump(ostream
& out
) {
169 std::lock_guard
l(lock
);
170 for (map
<const char *,per_state_info
>::iterator p
= info
.begin(); p
!= info
.end(); ++p
) {
171 per_state_info
& i
= p
->second
;
172 out
<< i
.enter
<< "\t" << i
.exit
<< "\t"
173 << i
.events
<< "\t" << i
.event_time
<< "\t"
174 << i
.total_time
<< "\t"
175 << i
.min_time
<< "\t" << i
.max_time
<< "\t"
180 void dump_formatted(Formatter
*f
) {
181 std::lock_guard
l(lock
);
182 f
->open_array_section("pg_recovery_stats");
183 for (map
<const char *,per_state_info
>::iterator p
= info
.begin();
184 p
!= info
.end(); ++p
) {
185 per_state_info
& i
= p
->second
;
186 f
->open_object_section("recovery_state");
187 f
->dump_int("enter", i
.enter
);
188 f
->dump_int("exit", i
.exit
);
189 f
->dump_int("events", i
.events
);
190 f
->dump_stream("event_time") << i
.event_time
;
191 f
->dump_stream("total_time") << i
.total_time
;
192 f
->dump_stream("min_time") << i
.min_time
;
193 f
->dump_stream("max_time") << i
.max_time
;
194 vector
<string
> states
;
195 get_str_vec(p
->first
, "/", states
);
196 f
->open_array_section("nested_states");
197 for (vector
<string
>::iterator st
= states
.begin();
198 st
!= states
.end(); ++st
) {
199 f
->dump_string("state", *st
);
207 void log_enter(const char *s
) {
208 std::lock_guard
l(lock
);
211 void log_exit(const char *s
, utime_t dur
, uint64_t events
, utime_t event_dur
) {
212 std::lock_guard
l(lock
);
213 per_state_info
&i
= info
[s
];
216 if (dur
> i
.max_time
)
218 if (dur
< i
.min_time
|| i
.min_time
== utime_t())
221 i
.event_time
+= event_dur
;
227 epoch_t cached_epoch
;
232 SnapContext snapc
; // the default pool snapc, ready to go.
234 // these two sets are for < mimic only
235 interval_set
<snapid_t
> cached_removed_snaps
; // current removed_snaps set
236 interval_set
<snapid_t
> newly_removed_snaps
; // newly removed in the last epoch
238 PGPool(CephContext
* cct
, OSDMapRef map
, int64_t i
, const pg_pool_t
& info
,
241 cached_epoch(map
->get_epoch()),
245 snapc
= info
.get_snap_context();
246 if (map
->require_osd_release
< CEPH_RELEASE_MIMIC
) {
247 info
.build_removed_snaps(cached_removed_snaps
);
251 void update(CephContext
*cct
, OSDMapRef map
);
254 /** PG - Replica Placement Group
258 class PG
: public DoutPrefixProvider
{
264 ObjectStore::CollectionHandle ch
;
269 std::ostream
& gen_prefix(std::ostream
& out
) const override
;
270 CephContext
*get_cct() const override
{
273 unsigned get_subsys() const override
{
274 return ceph_subsys_osd
;
277 const OSDMapRef
& get_osdmap() const {
278 ceph_assert(is_locked());
279 ceph_assert(osdmap_ref
);
282 epoch_t
get_osdmap_epoch() const {
283 return osdmap_ref
->get_epoch();
286 void lock_suspend_timeout(ThreadPool::TPHandle
&handle
) {
287 handle
.suspend_tp_timeout();
289 handle
.reset_tp_timeout();
291 void lock(bool no_lockdep
= false) const;
292 void unlock() const {
293 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
294 ceph_assert(!dirty_info
);
295 ceph_assert(!dirty_big_info
);
298 bool is_locked() const {
299 return _lock
.is_locked();
302 const spg_t
& get_pgid() const {
306 const PGPool
& get_pool() const {
309 uint64_t get_last_user_version() const {
310 return info
.last_user_version
;
312 const pg_history_t
& get_history() const {
315 bool get_need_up_thru() const {
318 epoch_t
get_same_interval_since() const {
319 return info
.history
.same_interval_since
;
322 void set_last_scrub_stamp(utime_t t
) {
323 info
.stats
.last_scrub_stamp
= t
;
324 info
.history
.last_scrub_stamp
= t
;
327 void set_last_deep_scrub_stamp(utime_t t
) {
328 info
.stats
.last_deep_scrub_stamp
= t
;
329 info
.history
.last_deep_scrub_stamp
= t
;
332 bool is_deleting() const {
335 bool is_deleted() const {
338 bool is_replica() const {
341 bool is_primary() const {
342 return pg_whoami
== primary
;
344 bool pg_has_reset_since(epoch_t e
) {
345 ceph_assert(is_locked());
346 return deleted
|| e
< get_last_peering_reset();
349 bool is_ec_pg() const {
350 return pool
.info
.is_erasure();
352 int get_role() const {
355 const vector
<int> get_acting() const {
358 int get_acting_primary() const {
361 pg_shard_t
get_primary() const {
364 const vector
<int> get_up() const {
367 int get_up_primary() const {
368 return up_primary
.osd
;
370 const PastIntervals
& get_past_intervals() const {
371 return past_intervals
;
374 /// initialize created PG
377 const vector
<int>& up
,
379 const vector
<int>& acting
,
381 const pg_history_t
& history
,
382 const PastIntervals
& pim
,
384 ObjectStore::Transaction
*t
);
386 /// read existing pg state off disk
387 void read_state(ObjectStore
*store
);
388 static int peek_map_epoch(ObjectStore
*store
, spg_t pgid
, epoch_t
*pepoch
);
390 static int get_latest_struct_v() {
391 return latest_struct_v
;
393 static int get_compat_struct_v() {
394 return compat_struct_v
;
396 static int read_info(
397 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
,
398 pg_info_t
&info
, PastIntervals
&past_intervals
,
400 static bool _has_removal_flag(ObjectStore
*store
, spg_t pgid
);
402 void rm_backoff(BackoffRef b
);
404 void update_snap_mapper_bits(uint32_t bits
) {
405 snap_mapper
.update_bits(bits
);
407 void start_split_stats(const set
<spg_t
>& childpgs
, vector
<object_stat_sum_t
> *v
);
408 virtual void split_colls(
412 const pg_pool_t
*pool
,
413 ObjectStore::Transaction
*t
) = 0;
414 void split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
);
415 void merge_from(map
<spg_t
,PGRef
>& sources
, RecoveryCtx
*rctx
,
417 const pg_merge_meta_t
& last_pg_merge_meta
);
418 void finish_split_stats(const object_stat_sum_t
& stats
, ObjectStore::Transaction
*t
);
420 void scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
);
422 bool is_scrub_registered();
423 void reg_next_scrub();
424 void unreg_next_scrub();
426 void on_info_history_change();
428 void scrub_requested(bool deep
, bool repair
, bool need_auto
= false);
430 bool is_forced_recovery_or_backfill() const {
431 return get_state() & (PG_STATE_FORCED_RECOVERY
| PG_STATE_FORCED_BACKFILL
);
433 bool set_force_recovery(bool b
);
434 bool set_force_backfill(bool b
);
436 void queue_peering_event(PGPeeringEventRef evt
);
437 void do_peering_event(PGPeeringEventRef evt
, RecoveryCtx
*rcx
);
438 void queue_null(epoch_t msg_epoch
, epoch_t query_epoch
);
439 void queue_flushed(epoch_t started_at
);
440 void handle_advance_map(
441 OSDMapRef osdmap
, OSDMapRef lastmap
,
442 vector
<int>& newup
, int up_primary
,
443 vector
<int>& newacting
, int acting_primary
,
445 void handle_activate_map(RecoveryCtx
*rctx
);
446 void handle_initialize(RecoveryCtx
*rctx
);
447 void handle_query_state(Formatter
*f
);
450 * @param ops_begun returns how many recovery ops the function started
451 * @returns true if any useful work was accomplished; false otherwise
453 virtual bool start_recovery_ops(
455 ThreadPool::TPHandle
&handle
,
456 uint64_t *ops_begun
) = 0;
458 // more work after the above, but with a RecoveryCtx
459 void find_unfound(epoch_t queued
, RecoveryCtx
*rctx
);
461 virtual void get_watchers(std::list
<obj_watch_item_t
> *ls
) = 0;
463 void dump_pgstate_history(Formatter
*f
);
464 void dump_missing(Formatter
*f
);
466 void get_pg_stats(std::function
<void(const pg_stat_t
&, epoch_t lec
)> f
);
467 void with_heartbeat_peers(std::function
<void(int)> f
);
470 virtual void on_shutdown() = 0;
472 bool get_must_scrub() const {
473 return scrubber
.must_scrub
;
477 virtual void do_request(
479 ThreadPool::TPHandle
&handle
481 virtual void clear_cache() = 0;
482 virtual int get_cache_obj_count() = 0;
484 virtual void snap_trimmer(epoch_t epoch_queued
) = 0;
485 virtual int do_command(
493 virtual bool agent_work(int max
) = 0;
494 virtual bool agent_work(int max
, int agent_flush_quota
) = 0;
495 virtual void agent_stop() = 0;
496 virtual void agent_delay() = 0;
497 virtual void agent_clear() = 0;
498 virtual void agent_choose_mode_restart() = 0;
500 virtual void on_removal(ObjectStore::Transaction
*t
) = 0;
502 void _delete_some(ObjectStore::Transaction
*t
);
504 virtual void set_dynamic_perf_stats_queries(
505 const std::list
<OSDPerfMetricQuery
> &queries
) {
507 virtual void get_dynamic_perf_stats(DynamicPerfStats
*stats
) {
510 // reference counting
512 uint64_t get_with_id();
513 void put_with_id(uint64_t);
514 void dump_live_ids();
516 void get(const char* tag
);
517 void put(const char* tag
);
523 PG(OSDService
*o
, OSDMapRef curmap
,
524 const PGPool
&pool
, spg_t p
);
528 explicit PG(const PG
& rhs
) = delete;
529 PG
& operator=(const PG
& rhs
) = delete;
536 OSDShard
*osd_shard
= nullptr;
537 OSDShardPGSlot
*pg_slot
= nullptr;
542 OSDMapRef osdmap_ref
;
546 // locking and reference counting.
547 // I destroy myself when the reference count hits zero.
548 // lock() should be called before doing anything.
549 // get() should be called on pointer copy (to another thread, etc.).
550 // put() should be called on destruction of some previously copied pointer.
551 // unlock() when done with the current pointer (_most common_).
552 mutable Mutex _lock
= {"PG::_lock"};
554 std::atomic
<unsigned int> ref
{0};
557 Mutex _ref_id_lock
= {"PG::_ref_id_lock"};
558 map
<uint64_t, string
> _live_ids
;
559 map
<string
, uint64_t> _tag_counts
;
560 uint64_t _ref_id
= 0;
562 friend uint64_t get_with_id(PG
*pg
) { return pg
->get_with_id(); }
563 friend void put_with_id(PG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
567 friend void intrusive_ptr_add_ref(PG
*pg
) {
570 friend void intrusive_ptr_release(PG
*pg
) {
575 // =====================
579 SnapMapper snap_mapper
;
580 bool eio_errors_to_process
= false;
582 virtual PGBackend
*get_pgbackend() = 0;
583 virtual const PGBackend
* get_pgbackend() const = 0;
587 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
588 IsPGRecoverablePredicate
*get_is_recoverable_predicate() const {
589 return get_pgbackend()->get_is_recoverable_predicate();
592 epoch_t last_persisted_osdmap
;
594 void requeue_map_waiters();
596 void update_osdmap_ref(OSDMapRef newmap
) {
597 ceph_assert(_lock
.is_locked_by_me());
598 osdmap_ref
= std::move(newmap
);
604 bool deleting
; // true while in removing or OSD is shutting down
605 atomic
<bool> deleted
= {false};
607 ZTracer::Endpoint trace_endpoint
;
611 bool dirty_info
, dirty_big_info
;
615 pg_info_t info
; ///< current pg info
616 pg_info_t last_written_info
; ///< last written info
617 __u8 info_struct_v
= 0;
618 static const __u8 latest_struct_v
= 10;
619 // v10 is the new past_intervals encoding
620 // v9 was fastinfo_key addition
621 // v8 was the move to a per-pg pgmeta object
622 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
623 // (first appeared in cuttlefish).
624 static const __u8 compat_struct_v
= 10;
625 void upgrade(ObjectStore
*store
);
629 ghobject_t pgmeta_oid
;
631 // ------------------
636 // a loc_count indicates how many locations we know in each of
637 // these distinct sets
640 int other
= 0; //< other
642 friend bool operator<(const loc_count_t
& l
,
643 const loc_count_t
& r
) {
644 return (l
.up
< r
.up
||
646 (l
.other
< r
.other
)));
648 friend ostream
& operator<<(ostream
& out
, const loc_count_t
& l
) {
649 ceph_assert(l
.up
>= 0);
650 ceph_assert(l
.other
>= 0);
651 return out
<< "(" << l
.up
<< "+" << l
.other
<< ")";
658 loc_count_t
_get_count(const set
<pg_shard_t
>& shards
) {
660 for (auto s
: shards
) {
661 if (pg
->upset
.count(s
)) {
670 map
<hobject_t
, pg_missing_item
> needs_recovery_map
;
671 map
<hobject_t
, set
<pg_shard_t
> > missing_loc
;
672 set
<pg_shard_t
> missing_loc_sources
;
674 // for every entry in missing_loc, we count how many of each type of shard we have,
675 // and maintain totals here. The sum of the values for this map will always equal
676 // missing_loc.size().
677 map
< shard_id_t
, map
<loc_count_t
,int> > missing_by_count
;
679 void pgs_by_shard_id(const set
<pg_shard_t
>& s
, map
< shard_id_t
, set
<pg_shard_t
> >& pgsbs
) {
680 if (pg
->get_osdmap()->pg_is_ec(pg
->info
.pgid
.pgid
)) {
681 int num_shards
= pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
);
682 // For completely missing shards initialize with empty set<pg_shard_t>
683 for (int i
= 0 ; i
< num_shards
; ++i
) {
688 pgsbs
[pgs
.shard
].insert(pgs
);
690 pgsbs
[shard_id_t::NO_SHARD
] = s
;
694 void _inc_count(const set
<pg_shard_t
>& s
) {
695 map
< shard_id_t
, set
<pg_shard_t
> > pgsbs
;
696 pgs_by_shard_id(s
, pgsbs
);
697 for (auto shard
: pgsbs
)
698 ++missing_by_count
[shard
.first
][_get_count(shard
.second
)];
700 void _dec_count(const set
<pg_shard_t
>& s
) {
701 map
< shard_id_t
, set
<pg_shard_t
> > pgsbs
;
702 pgs_by_shard_id(s
, pgsbs
);
703 for (auto shard
: pgsbs
) {
704 auto p
= missing_by_count
[shard
.first
].find(_get_count(shard
.second
));
705 ceph_assert(p
!= missing_by_count
[shard
.first
].end());
706 if (--p
->second
== 0) {
707 missing_by_count
[shard
.first
].erase(p
);
713 set
<pg_shard_t
> empty_set
;
715 boost::scoped_ptr
<IsPGReadablePredicate
> is_readable
;
716 boost::scoped_ptr
<IsPGRecoverablePredicate
> is_recoverable
;
717 explicit MissingLoc(PG
*pg
)
719 void set_backend_predicates(
720 IsPGReadablePredicate
*_is_readable
,
721 IsPGRecoverablePredicate
*_is_recoverable
) {
722 is_readable
.reset(_is_readable
);
723 is_recoverable
.reset(_is_recoverable
);
725 std::ostream
& gen_prefix(std::ostream
& out
) const {
726 return pg
->gen_prefix(out
);
729 const hobject_t
&hoid
,
730 eversion_t
*v
= 0) const {
731 map
<hobject_t
, pg_missing_item
>::const_iterator i
=
732 needs_recovery_map
.find(hoid
);
733 if (i
== needs_recovery_map
.end())
739 bool is_deleted(const hobject_t
&hoid
) const {
740 auto i
= needs_recovery_map
.find(hoid
);
741 if (i
== needs_recovery_map
.end())
743 return i
->second
.is_delete();
745 bool is_unfound(const hobject_t
&hoid
) const {
746 auto it
= needs_recovery_map
.find(hoid
);
747 if (it
== needs_recovery_map
.end()) {
750 if (it
->second
.is_delete()) {
753 auto mit
= missing_loc
.find(hoid
);
754 return mit
== missing_loc
.end() || !(*is_recoverable
)(mit
->second
);
756 bool readable_with_acting(
757 const hobject_t
&hoid
,
758 const set
<pg_shard_t
> &acting
) const;
759 uint64_t num_unfound() const {
761 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
762 needs_recovery_map
.begin();
763 i
!= needs_recovery_map
.end();
765 if (i
->second
.is_delete())
767 auto mi
= missing_loc
.find(i
->first
);
768 if (mi
== missing_loc
.end() || !(*is_recoverable
)(mi
->second
))
774 bool have_unfound() const {
775 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
776 needs_recovery_map
.begin();
777 i
!= needs_recovery_map
.end();
779 if (i
->second
.is_delete())
781 auto mi
= missing_loc
.find(i
->first
);
782 if (mi
== missing_loc
.end() || !(*is_recoverable
)(mi
->second
))
788 needs_recovery_map
.clear();
790 missing_loc_sources
.clear();
791 missing_by_count
.clear();
794 void add_location(const hobject_t
&hoid
, pg_shard_t location
) {
795 auto p
= missing_loc
.find(hoid
);
796 if (p
== missing_loc
.end()) {
797 p
= missing_loc
.emplace(hoid
, set
<pg_shard_t
>()).first
;
799 _dec_count(p
->second
);
801 p
->second
.insert(location
);
802 _inc_count(p
->second
);
804 void remove_location(const hobject_t
&hoid
, pg_shard_t location
) {
805 auto p
= missing_loc
.find(hoid
);
806 if (p
!= missing_loc
.end()) {
807 _dec_count(p
->second
);
808 p
->second
.erase(location
);
809 if (p
->second
.empty()) {
810 missing_loc
.erase(p
);
812 _inc_count(p
->second
);
817 void clear_location(const hobject_t
&hoid
) {
818 auto p
= missing_loc
.find(hoid
);
819 if (p
!= missing_loc
.end()) {
820 _dec_count(p
->second
);
821 missing_loc
.erase(p
);
825 void add_active_missing(const pg_missing_t
&missing
) {
826 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
827 missing
.get_items().begin();
828 i
!= missing
.get_items().end();
830 map
<hobject_t
, pg_missing_item
>::const_iterator j
=
831 needs_recovery_map
.find(i
->first
);
832 if (j
== needs_recovery_map
.end()) {
833 needs_recovery_map
.insert(*i
);
835 lgeneric_dout(pg
->cct
, 0) << this << " " << pg
->info
.pgid
<< " unexpected need for "
836 << i
->first
<< " have " << j
->second
837 << " tried to add " << i
->second
<< dendl
;
838 ceph_assert(i
->second
.need
== j
->second
.need
);
843 void add_missing(const hobject_t
&hoid
, eversion_t need
, eversion_t have
, bool is_delete
=false) {
844 needs_recovery_map
[hoid
] = pg_missing_item(need
, have
, is_delete
);
846 void revise_need(const hobject_t
&hoid
, eversion_t need
) {
847 auto it
= needs_recovery_map
.find(hoid
);
848 ceph_assert(it
!= needs_recovery_map
.end());
849 it
->second
.need
= need
;
852 /// Adds info about a possible recovery source
853 bool add_source_info(
854 pg_shard_t source
, ///< [in] source
855 const pg_info_t
&oinfo
, ///< [in] info
856 const pg_missing_t
&omissing
, ///< [in] (optional) missing
857 ThreadPool::TPHandle
* handle
///< [in] ThreadPool handle
858 ); ///< @return whether a new object location was discovered
860 /// Adds recovery sources in batch
861 void add_batch_sources_info(
862 const set
<pg_shard_t
> &sources
, ///< [in] a set of resources which can be used for all objects
863 ThreadPool::TPHandle
* handle
///< [in] ThreadPool handle
866 /// Uses osdmap to update structures for now down sources
867 void check_recovery_sources(const OSDMapRef
& osdmap
);
869 /// Call when hoid is no longer missing in acting set
870 void recovered(const hobject_t
&hoid
) {
871 needs_recovery_map
.erase(hoid
);
872 auto p
= missing_loc
.find(hoid
);
873 if (p
!= missing_loc
.end()) {
874 _dec_count(p
->second
);
875 missing_loc
.erase(p
);
879 /// Call to update structures for hoid after a change
881 const hobject_t
&hoid
,
883 const set
<pg_shard_t
> to_recover
,
884 const pg_info_t
&info
,
885 const pg_missing_t
&missing
,
886 const map
<pg_shard_t
, pg_missing_t
> &pmissing
,
887 const map
<pg_shard_t
, pg_info_t
> &pinfo
) {
889 boost::optional
<pg_missing_item
> item
;
890 auto miter
= missing
.get_items().find(hoid
);
891 if (miter
!= missing
.get_items().end()) {
892 item
= miter
->second
;
894 for (auto &&i
: to_recover
) {
897 auto pmiter
= pmissing
.find(i
);
898 ceph_assert(pmiter
!= pmissing
.end());
899 miter
= pmiter
->second
.get_items().find(hoid
);
900 if (miter
!= pmiter
->second
.get_items().end()) {
901 item
= miter
->second
;
907 return; // recovered!
909 needs_recovery_map
[hoid
] = *item
;
910 if (item
->is_delete())
913 missing_loc
.insert(make_pair(hoid
, set
<pg_shard_t
>())).first
;
914 ceph_assert(info
.last_backfill
.is_max());
915 ceph_assert(info
.last_update
>= item
->need
);
916 if (!missing
.is_missing(hoid
))
917 mliter
->second
.insert(self
);
918 for (auto &&i
: pmissing
) {
921 auto pinfoiter
= pinfo
.find(i
.first
);
922 ceph_assert(pinfoiter
!= pinfo
.end());
923 if (item
->need
<= pinfoiter
->second
.last_update
&&
924 hoid
<= pinfoiter
->second
.last_backfill
&&
925 !i
.second
.is_missing(hoid
))
926 mliter
->second
.insert(i
.first
);
928 _inc_count(mliter
->second
);
931 const set
<pg_shard_t
> &get_locations(const hobject_t
&hoid
) const {
932 auto it
= missing_loc
.find(hoid
);
933 return it
== missing_loc
.end() ? empty_set
: it
->second
;
935 const map
<hobject_t
, set
<pg_shard_t
>> &get_missing_locs() const {
938 const map
<hobject_t
, pg_missing_item
> &get_needs_recovery() const {
939 return needs_recovery_map
;
941 const map
< shard_id_t
, map
<loc_count_t
,int> > &get_missing_by_count() const {
942 return missing_by_count
;
946 PastIntervals past_intervals
;
948 interval_set
<snapid_t
> snap_trimq
;
950 /* You should not use these items without taking their respective queue locks
951 * (if they have one) */
952 xlist
<PG
*>::item stat_queue_item
;
954 bool recovery_queued
;
956 int recovery_ops_active
;
957 set
<pg_shard_t
> waiting_on_backfill
;
958 #ifdef DEBUG_RECOVERY_OIDS
959 multiset
<hobject_t
> recovering_oids
;
963 int role
; // 0 = primary, 1 = replica, -1=none.
964 uint64_t state
; // PG_STATE_*
966 bool send_notify
; ///< true if we are non-primary and should notify the primary
969 eversion_t last_update_ondisk
; // last_update that has committed; ONLY DEFINED WHEN is_active()
970 eversion_t last_complete_ondisk
; // last_complete that has committed.
971 eversion_t last_update_applied
;
973 // entries <= last_rollback_info_trimmed_to_applied have been trimmed
974 eversion_t last_rollback_info_trimmed_to_applied
;
979 pg_shard_t pg_whoami
;
980 pg_shard_t up_primary
;
981 vector
<int> up
, acting
, want_acting
;
982 // acting_recovery_backfill contains shards that are acting,
983 // async recovery targets, or backfill targets.
984 set
<pg_shard_t
> acting_recovery_backfill
, actingset
, upset
;
985 map
<pg_shard_t
,eversion_t
> peer_last_complete_ondisk
;
986 eversion_t min_last_complete_ondisk
; // up: min over last_complete_ondisk, peer_last_complete_ondisk
987 eversion_t pg_trim_to
;
989 set
<int> blocked_by
; ///< osds we are blocked by (for pg stats)
992 // [primary only] content recovery state
993 struct BufferedRecoveryMessages
{
994 map
<int, map
<spg_t
, pg_query_t
> > query_map
;
995 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > info_map
;
996 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > notify_list
;
1000 bool dne() { return info
.dne(); }
1001 struct RecoveryCtx
{
1003 map
<int, map
<spg_t
, pg_query_t
> > *query_map
;
1004 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
;
1005 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
;
1006 ObjectStore::Transaction
*transaction
;
1007 ThreadPool::TPHandle
* handle
;
1008 RecoveryCtx(map
<int, map
<spg_t
, pg_query_t
> > *query_map
,
1010 vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
,
1012 vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
,
1013 ObjectStore::Transaction
*transaction
)
1014 : query_map(query_map
), info_map(info_map
),
1015 notify_list(notify_list
),
1016 transaction(transaction
),
1019 RecoveryCtx(BufferedRecoveryMessages
&buf
, RecoveryCtx
&rctx
)
1020 : query_map(&(buf
.query_map
)),
1021 info_map(&(buf
.info_map
)),
1022 notify_list(&(buf
.notify_list
)),
1023 transaction(rctx
.transaction
),
1024 handle(rctx
.handle
) {}
1026 void accept_buffered_messages(BufferedRecoveryMessages
&m
) {
1027 ceph_assert(query_map
);
1028 ceph_assert(info_map
);
1029 ceph_assert(notify_list
);
1030 for (map
<int, map
<spg_t
, pg_query_t
> >::iterator i
= m
.query_map
.begin();
1031 i
!= m
.query_map
.end();
1033 map
<spg_t
, pg_query_t
> &omap
= (*query_map
)[i
->first
];
1034 for (map
<spg_t
, pg_query_t
>::iterator j
= i
->second
.begin();
1035 j
!= i
->second
.end();
1037 omap
[j
->first
] = j
->second
;
1040 for (map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator i
1041 = m
.info_map
.begin();
1042 i
!= m
.info_map
.end();
1044 vector
<pair
<pg_notify_t
, PastIntervals
> > &ovec
=
1045 (*info_map
)[i
->first
];
1046 ovec
.reserve(ovec
.size() + i
->second
.size());
1047 ovec
.insert(ovec
.end(), i
->second
.begin(), i
->second
.end());
1049 for (map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator i
1050 = m
.notify_list
.begin();
1051 i
!= m
.notify_list
.end();
1053 vector
<pair
<pg_notify_t
, PastIntervals
> > &ovec
=
1054 (*notify_list
)[i
->first
];
1055 ovec
.reserve(ovec
.size() + i
->second
.size());
1056 ovec
.insert(ovec
.end(), i
->second
.begin(), i
->second
.end());
1060 void send_notify(pg_shard_t to
,
1061 const pg_notify_t
&info
, const PastIntervals
&pi
) {
1062 ceph_assert(notify_list
);
1063 (*notify_list
)[to
.osd
].push_back(make_pair(info
, pi
));
1068 PGStateHistory pgstate_history
;
1071 const char *state_name
;
1074 const char *get_state_name() { return state_name
; }
1075 NamedState(PG
*pg_
, const char *state_name_
)
1076 : state_name(state_name_
), enter_time(ceph_clock_now()), pg(pg_
) {
1077 pg
->pgstate_history
.enter(pg
, enter_time
, state_name
);
1079 virtual ~NamedState() { pg
->pgstate_history
.exit(state_name
); }
1087 * peer_info -- projected (updates _before_ replicas ack)
1088 * peer_missing -- committed (updates _after_ replicas ack)
1092 set
<pg_shard_t
> stray_set
; // non-acting osds that have PG data.
1093 map
<pg_shard_t
, pg_info_t
> peer_info
; // info from peers (stray or prior)
1094 map
<pg_shard_t
, int64_t> peer_bytes
; // Peer's num_bytes from peer_info
1095 set
<pg_shard_t
> peer_purged
; // peers purged
1096 map
<pg_shard_t
, pg_missing_t
> peer_missing
;
1097 set
<pg_shard_t
> peer_log_requested
; // logs i've requested (and start stamps)
1098 set
<pg_shard_t
> peer_missing_requested
;
1100 // i deleted these strays; ignore racing PGInfo from them
1101 set
<pg_shard_t
> peer_activated
;
1103 // primary-only, recovery-only state
1104 set
<pg_shard_t
> might_have_unfound
; // These osds might have objects on them
1105 // which are unfound on the primary
1106 epoch_t last_peering_reset
;
1108 epoch_t
get_last_peering_reset() const {
1109 return last_peering_reset
;
1112 /* heartbeat peers */
1113 void set_probe_targets(const set
<pg_shard_t
> &probe_set
);
1114 void clear_probe_targets();
1116 Mutex heartbeat_peer_lock
;
1117 set
<int> heartbeat_peers
;
1118 set
<int> probe_targets
;
1124 * Represents the objects in a range [begin, end)
1127 * 1) begin == end == hobject_t() indicates the the interval is unpopulated
1128 * 2) Else, objects contains all objects in [begin, end)
1130 struct BackfillInterval
{
1131 // info about a backfill interval on a peer
1132 eversion_t version
; /// version at which the scan occurred
1133 map
<hobject_t
,eversion_t
> objects
;
1139 *this = BackfillInterval();
1142 /// clear objects list only
1143 void clear_objects() {
1147 /// reinstantiate with a new start+end position and sort order
1148 void reset(hobject_t start
) {
1150 begin
= end
= start
;
1153 /// true if there are no objects in this interval
1154 bool empty() const {
1155 return objects
.empty();
1158 /// true if interval extends to the end of the range
1159 bool extends_to_end() const {
1160 return end
.is_max();
1163 /// removes items <= soid and adjusts begin to the first object
1164 void trim_to(const hobject_t
&soid
) {
1166 while (!objects
.empty() &&
1167 objects
.begin()->first
<= soid
) {
1172 /// Adjusts begin to the first object
1174 if (!objects
.empty())
1175 begin
= objects
.begin()->first
;
1180 /// drop first entry, and adjust @begin accordingly
1182 ceph_assert(!objects
.empty());
1183 objects
.erase(objects
.begin());
1188 void dump(Formatter
*f
) const {
1189 f
->dump_stream("begin") << begin
;
1190 f
->dump_stream("end") << end
;
1191 f
->open_array_section("objects");
1192 for (map
<hobject_t
, eversion_t
>::const_iterator i
=
1196 f
->open_object_section("object");
1197 f
->dump_stream("object") << i
->first
;
1198 f
->dump_stream("version") << i
->second
;
1206 BackfillInterval backfill_info
;
1207 map
<pg_shard_t
, BackfillInterval
> peer_backfill_info
;
1208 bool backfill_reserved
;
1209 bool backfill_reserving
;
1211 set
<pg_shard_t
> backfill_targets
, async_recovery_targets
;
1213 // The primary's num_bytes and local num_bytes for this pg, only valid
1214 // during backfill for non-primary shards.
1215 // Both of these are adjusted for EC to reflect the on-disk bytes
1216 std::atomic
<int64_t> primary_num_bytes
= 0;
1217 std::atomic
<int64_t> local_num_bytes
= 0;
1220 bool is_backfill_targets(pg_shard_t osd
) {
1221 return backfill_targets
.count(osd
);
1224 // Space reserved for backfill is primary_num_bytes - local_num_bytes
1225 // Don't care that difference itself isn't atomic
1226 uint64_t get_reserved_num_bytes() {
1227 int64_t primary
= primary_num_bytes
.load();
1228 int64_t local
= local_num_bytes
.load();
1229 if (primary
> local
)
1230 return primary
- local
;
1235 bool is_remote_backfilling() {
1236 return primary_num_bytes
.load() > 0;
1239 void set_reserved_num_bytes(int64_t primary
, int64_t local
);
1240 void clear_reserved_num_bytes();
1242 // If num_bytes are inconsistent and local_num- goes negative
1243 // it's ok, because it would then be ignored.
1245 // The value of num_bytes could be negative,
1246 // but we don't let local_num_bytes go negative.
1247 void add_local_num_bytes(int64_t num_bytes
) {
1249 int64_t prev_bytes
= local_num_bytes
.load();
1252 new_bytes
= prev_bytes
+ num_bytes
;
1255 } while(!local_num_bytes
.compare_exchange_weak(prev_bytes
, new_bytes
));
1258 void sub_local_num_bytes(int64_t num_bytes
) {
1259 ceph_assert(num_bytes
>= 0);
1261 int64_t prev_bytes
= local_num_bytes
.load();
1264 new_bytes
= prev_bytes
- num_bytes
;
1267 } while(!local_num_bytes
.compare_exchange_weak(prev_bytes
, new_bytes
));
1270 // The value of num_bytes could be negative,
1271 // but we don't let info.stats.stats.sum.num_bytes go negative.
1272 void add_num_bytes(int64_t num_bytes
) {
1273 ceph_assert(_lock
.is_locked_by_me());
1275 info
.stats
.stats
.sum
.num_bytes
+= num_bytes
;
1276 if (info
.stats
.stats
.sum
.num_bytes
< 0) {
1277 info
.stats
.stats
.sum
.num_bytes
= 0;
1281 void sub_num_bytes(int64_t num_bytes
) {
1282 ceph_assert(_lock
.is_locked_by_me());
1283 ceph_assert(num_bytes
>= 0);
1285 info
.stats
.stats
.sum
.num_bytes
-= num_bytes
;
1286 if (info
.stats
.stats
.sum
.num_bytes
< 0) {
1287 info
.stats
.stats
.sum
.num_bytes
= 0;
1292 // Only used in testing so not worried about needing the PG lock here
1293 int64_t get_stats_num_bytes() {
1294 Mutex::Locker
l(_lock
);
1295 int num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
1296 if (pool
.info
.is_erasure()) {
1297 num_bytes
/= (int)get_pgbackend()->get_ec_data_chunk_count();
1298 // Round up each object by a stripe
1299 num_bytes
+= get_pgbackend()->get_ec_stripe_chunk_size() * info
.stats
.stats
.sum
.num_objects
;
1301 int64_t lnb
= local_num_bytes
.load();
1302 if (lnb
&& lnb
!= num_bytes
) {
1303 lgeneric_dout(cct
, 0) << this << " " << info
.pgid
<< " num_bytes mismatch "
1304 << lnb
<< " vs stats "
1305 << info
.stats
.stats
.sum
.num_bytes
<< " / chunk "
1306 << get_pgbackend()->get_ec_data_chunk_count()
1315 * blocked request wait hierarchy
1317 * In order to preserve request ordering we need to be careful about the
1318 * order in which blocked requests get requeued. Generally speaking, we
1319 * push the requests back up to the op_wq in reverse order (most recent
1320 * request first) so that they come back out again in the original order.
1321 * However, because there are multiple wait queues, we need to requeue
1322 * waitlists in order. Generally speaking, we requeue the wait lists
1323 * that are checked first.
1325 * Here are the various wait lists, in the order they are used during
1326 * request processing, with notes:
1329 * - may start or stop blocking at any time (depending on client epoch)
1330 * - waiting_for_peered
1331 * - !is_peered() or flushes_in_progress
1332 * - only starts blocking on interval change; never restarts
1333 * - waiting_for_active
1335 * - only starts blocking on interval change; never restarts
1336 * - waiting_for_flush
1337 * - is_active() and flushes_in_progress
1338 * - waiting for final flush during activate
1339 * - waiting_for_scrub
1340 * - starts and stops blocking for varying intervals during scrub
1341 * - waiting_for_unreadable_object
1342 * - never restarts once object is readable (* except for EIO?)
1343 * - waiting_for_degraded_object
1344 * - never restarts once object is writeable (* except for EIO?)
1345 * - waiting_for_blocked_object
1346 * - starts and stops based on proxied op activity
1348 * - starts and stops based on read/write activity
1352 * 1. During and interval change, we requeue *everything* in the above order.
1354 * 2. When an obc rwlock is released, we check for a scrub block and requeue
1355 * the op there if it applies. We ignore the unreadable/degraded/blocked
1356 * queues because we assume they cannot apply at that time (this is
1357 * probably mostly true).
1359 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
1362 * These three behaviors are generally sufficient to maintain ordering, with
1363 * the possible exception of cases where we make an object degraded or
1364 * unreadable that was previously okay, e.g. when scrub or op processing
1365 * encounter an unexpected error. FIXME.
1369 unsigned flushes_in_progress
;
1371 // ops with newer maps than our (or blocked behind them)
1372 // track these by client, since inter-request ordering doesn't otherwise
1374 unordered_map
<entity_name_t
,list
<OpRequestRef
>> waiting_for_map
;
1376 // ops waiting on peered
1377 list
<OpRequestRef
> waiting_for_peered
;
1379 // ops waiting on active (require peered as well)
1380 list
<OpRequestRef
> waiting_for_active
;
1381 list
<OpRequestRef
> waiting_for_flush
;
1382 list
<OpRequestRef
> waiting_for_scrub
;
1384 list
<OpRequestRef
> waiting_for_cache_not_full
;
1385 list
<OpRequestRef
> waiting_for_clean_to_primary_repair
;
1386 map
<hobject_t
, list
<OpRequestRef
>> waiting_for_unreadable_object
,
1387 waiting_for_degraded_object
,
1388 waiting_for_blocked_object
;
1390 set
<hobject_t
> objects_blocked_on_cache_full
;
1391 map
<hobject_t
,snapid_t
> objects_blocked_on_degraded_snap
;
1392 map
<hobject_t
,ObjectContextRef
> objects_blocked_on_snap_promotion
;
1394 // Callbacks should assume pg (and nothing else) is locked
1395 map
<hobject_t
, list
<Context
*>> callbacks_for_degraded_object
;
1398 list
<tuple
<OpRequestRef
, version_t
, int> > > waiting_for_ondisk
;
1400 void requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
);
1401 void requeue_op(OpRequestRef op
);
1402 void requeue_ops(list
<OpRequestRef
> &l
);
1404 // stats that persist lazily
1405 object_stat_collection_t unstable_stats
;
1408 Mutex pg_stats_publish_lock
;
1409 bool pg_stats_publish_valid
;
1410 pg_stat_t pg_stats_publish
;
1412 void _update_calc_stats();
1413 void _update_blocked_by();
1414 friend class TestOpsSocketHook
;
1415 void publish_stats_to_osd();
1416 void clear_publish_stats();
1418 void clear_primary_state();
1420 bool is_acting_recovery_backfill(pg_shard_t osd
) const {
1421 return acting_recovery_backfill
.count(osd
);
1423 bool is_acting(pg_shard_t osd
) const {
1424 return has_shard(pool
.info
.is_erasure(), acting
, osd
);
1426 bool is_up(pg_shard_t osd
) const {
1427 return has_shard(pool
.info
.is_erasure(), up
, osd
);
1429 static bool has_shard(bool ec
, const vector
<int>& v
, pg_shard_t osd
) {
1431 return v
.size() > (unsigned)osd
.shard
&& v
[osd
.shard
] == osd
.osd
;
1433 return std::find(v
.begin(), v
.end(), osd
.osd
) != v
.end();
1437 bool needs_recovery() const;
1438 bool needs_backfill() const;
1440 /// clip calculated priority to reasonable range
1441 int clamp_recovery_priority(int prio
, int pool_recovery_prio
, int max
);
1442 /// get log recovery reservation priority
1443 unsigned get_recovery_priority();
1444 /// get backfill reservation priority
1445 unsigned get_backfill_priority();
1446 /// get priority for pg deletion
1447 unsigned get_delete_priority();
1449 void try_mark_clean(); ///< mark an active pg clean
1451 /// return [start,end) bounds for required past_intervals
1452 static pair
<epoch_t
, epoch_t
> get_required_past_interval_bounds(
1453 const pg_info_t
&info
,
1454 epoch_t oldest_map
) {
1455 epoch_t start
= std::max(
1456 info
.history
.last_epoch_clean
? info
.history
.last_epoch_clean
:
1457 info
.history
.epoch_pool_created
,
1459 epoch_t end
= std::max(
1460 info
.history
.same_interval_since
,
1461 info
.history
.epoch_pool_created
);
1462 return make_pair(start
, end
);
1464 void check_past_interval_bounds() const;
1465 PastIntervals::PriorSet
build_prior();
1467 void remove_down_peer_info(const OSDMapRef osdmap
);
1469 bool adjust_need_up_thru(const OSDMapRef osdmap
);
1471 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const;
1472 virtual void dump_recovery_info(Formatter
*f
) const = 0;
1474 void calc_min_last_complete_ondisk() {
1475 eversion_t min
= last_complete_ondisk
;
1476 ceph_assert(!acting_recovery_backfill
.empty());
1477 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
1478 i
!= acting_recovery_backfill
.end();
1480 if (*i
== get_primary()) continue;
1481 if (peer_last_complete_ondisk
.count(*i
) == 0)
1482 return; // we don't have complete info
1483 eversion_t a
= peer_last_complete_ondisk
[*i
];
1487 if (min
== min_last_complete_ondisk
)
1489 min_last_complete_ondisk
= min
;
1493 virtual void calc_trim_to() = 0;
1495 virtual void calc_trim_to_aggressive() = 0;
1497 void proc_replica_log(pg_info_t
&oinfo
, const pg_log_t
&olog
,
1498 pg_missing_t
& omissing
, pg_shard_t from
);
1499 void proc_master_log(ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
,
1500 pg_missing_t
& omissing
, pg_shard_t from
);
1501 bool proc_replica_info(
1502 pg_shard_t from
, const pg_info_t
&info
, epoch_t send_epoch
);
1504 struct PGLogEntryHandler
: public PGLog::LogEntryHandler
{
1506 ObjectStore::Transaction
*t
;
1507 PGLogEntryHandler(PG
*pg
, ObjectStore::Transaction
*t
) : pg(pg
), t(t
) {}
1510 void remove(const hobject_t
&hoid
) override
{
1511 pg
->get_pgbackend()->remove(hoid
, t
);
1513 void try_stash(const hobject_t
&hoid
, version_t v
) override
{
1514 pg
->get_pgbackend()->try_stash(hoid
, v
, t
);
1516 void rollback(const pg_log_entry_t
&entry
) override
{
1517 ceph_assert(entry
.can_rollback());
1518 pg
->get_pgbackend()->rollback(entry
, t
);
1520 void rollforward(const pg_log_entry_t
&entry
) override
{
1521 pg
->get_pgbackend()->rollforward(entry
, t
);
1523 void trim(const pg_log_entry_t
&entry
) override
{
1524 pg
->get_pgbackend()->trim(entry
, t
);
1528 void update_object_snap_mapping(
1529 ObjectStore::Transaction
*t
, const hobject_t
&soid
,
1530 const set
<snapid_t
> &snaps
);
1531 void clear_object_snap_mapping(
1532 ObjectStore::Transaction
*t
, const hobject_t
&soid
);
1533 void remove_snap_mapped_object(
1534 ObjectStore::Transaction
& t
, const hobject_t
& soid
);
1536 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
1537 pg_log_t
&olog
, pg_shard_t from
);
1538 void rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
);
1539 bool search_for_missing(
1540 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
1544 void discover_all_missing(std::map
<int, map
<spg_t
,pg_query_t
> > &query_map
);
1546 map
<pg_shard_t
, pg_info_t
>::const_iterator
find_best_info(
1547 const map
<pg_shard_t
, pg_info_t
> &infos
,
1548 bool restrict_to_up_acting
,
1549 bool *history_les_bound
) const;
1550 static void calc_ec_acting(
1551 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1553 const vector
<int> &acting
,
1554 const vector
<int> &up
,
1555 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1556 bool restrict_to_up_acting
,
1558 set
<pg_shard_t
> *backfill
,
1559 set
<pg_shard_t
> *acting_backfill
,
1561 static void calc_replicated_acting(
1562 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1563 uint64_t force_auth_primary_missing_objects
,
1565 const vector
<int> &acting
,
1566 const vector
<int> &up
,
1567 pg_shard_t up_primary
,
1568 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1569 bool restrict_to_up_acting
,
1571 set
<pg_shard_t
> *backfill
,
1572 set
<pg_shard_t
> *acting_backfill
,
1573 const OSDMapRef osdmap
,
1575 void choose_async_recovery_ec(const map
<pg_shard_t
, pg_info_t
> &all_info
,
1576 const pg_info_t
&auth_info
,
1578 set
<pg_shard_t
> *async_recovery
,
1579 const OSDMapRef osdmap
) const;
1580 void choose_async_recovery_replicated(const map
<pg_shard_t
, pg_info_t
> &all_info
,
1581 const pg_info_t
&auth_info
,
1583 set
<pg_shard_t
> *async_recovery
,
1584 const OSDMapRef osdmap
) const;
1586 bool recoverable_and_ge_min_size(const vector
<int> &want
) const;
1587 bool choose_acting(pg_shard_t
&auth_log_shard
,
1588 bool restrict_to_up_acting
,
1589 bool *history_les_bound
);
1590 void build_might_have_unfound();
1592 ObjectStore::Transaction
& t
,
1593 epoch_t activation_epoch
,
1594 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1596 vector
<pair
<pg_notify_t
, PastIntervals
> > > *activator_map
,
1599 struct C_PG_ActivateCommitted
: public Context
{
1602 epoch_t activation_epoch
;
1603 C_PG_ActivateCommitted(PG
*p
, epoch_t e
, epoch_t ae
)
1604 : pg(p
), epoch(e
), activation_epoch(ae
) {}
1605 void finish(int r
) override
{
1606 pg
->_activate_committed(epoch
, activation_epoch
);
1609 void _activate_committed(epoch_t epoch
, epoch_t activation_epoch
);
1610 void all_activated_and_committed();
1612 void proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&info
);
1614 bool have_unfound() const {
1615 return missing_loc
.have_unfound();
1617 uint64_t get_num_unfound() const {
1618 return missing_loc
.num_unfound();
1620 bool all_missing_unfound() const {
1621 const auto& missing
= pg_log
.get_missing();
1622 if (!missing
.have_missing())
1624 for (auto& m
: missing
.get_items()) {
1625 if (!missing_loc
.is_unfound(m
.first
))
1631 virtual void check_local() = 0;
1633 void purge_strays();
1635 void update_heartbeat_peers();
1637 Context
*finish_sync_event
;
1639 Context
*finish_recovery();
1640 void _finish_recovery(Context
*c
);
1641 struct C_PG_FinishRecovery
: public Context
{
1643 explicit C_PG_FinishRecovery(PG
*p
) : pg(p
) {}
1644 void finish(int r
) override
{
1645 pg
->_finish_recovery(this);
1648 void cancel_recovery();
1649 void clear_recovery_state();
1650 virtual void _clear_recovery_state() = 0;
1651 virtual void check_recovery_sources(const OSDMapRef
& newmap
) = 0;
1652 void start_recovery_op(const hobject_t
& soid
);
1653 void finish_recovery_op(const hobject_t
& soid
, bool dequeue
=false);
1655 virtual void _split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
) = 0;
1657 friend class C_OSD_RepModify_Commit
;
1658 friend class C_DeleteMore
;
1661 Mutex backoff_lock
; // orders inside Backoff::lock
1662 map
<hobject_t
,set
<BackoffRef
>> backoffs
;
1664 void add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
);
1665 void release_backoffs(const hobject_t
& begin
, const hobject_t
& end
);
1666 void release_backoffs(const hobject_t
& o
) {
1667 release_backoffs(o
, o
);
1669 void clear_backoffs();
1671 void add_pg_backoff(SessionRef s
) {
1672 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1673 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1674 add_backoff(s
, begin
, end
);
1677 void release_pg_backoffs() {
1678 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1679 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1680 release_backoffs(begin
, end
);
1691 set
<pg_shard_t
> reserved_peers
;
1692 bool local_reserved
, remote_reserved
, reserve_failed
;
1693 epoch_t epoch_start
;
1695 // common to both scrubs
1697 set
<pg_shard_t
> waiting_on_whom
;
1701 ScrubMap primary_scrubmap
;
1702 ScrubMapBuilder primary_scrubmap_pos
;
1703 epoch_t replica_scrub_start
= 0;
1704 ScrubMap replica_scrubmap
;
1705 ScrubMapBuilder replica_scrubmap_pos
;
1706 map
<pg_shard_t
, ScrubMap
> received_maps
;
1707 OpRequestRef active_rep_scrub
;
1708 utime_t scrub_reg_stamp
; // stamp we registered for
1710 static utime_t
scrub_must_stamp() { return utime_t(0,1); }
1712 omap_stat_t omap_stats
= (const struct omap_stat_t
){ 0 };
1715 bool sleeping
= false;
1716 bool needs_sleep
= true;
1717 utime_t sleep_start
;
1719 // flags to indicate explicitly requested scrubs (by admin)
1720 bool must_scrub
, must_deep_scrub
, must_repair
, need_auto
;
1722 // Priority to use for scrub scheduling
1723 unsigned priority
= 0;
1726 // this flag indicates whether we would like to do auto-repair of the PG or not
1728 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1730 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1731 // we should deep scrub in order to auto repair
1732 bool deep_scrub_on_error
;
1734 // Maps from objects with errors to missing/inconsistent peers
1735 map
<hobject_t
, set
<pg_shard_t
>> missing
;
1736 map
<hobject_t
, set
<pg_shard_t
>> inconsistent
;
1738 // Map from object with errors to good peers
1739 map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >> authoritative
;
1741 // Cleaned map pending snap metadata scrub
1742 ScrubMap cleaned_meta_map
;
1744 void clean_meta_map(ScrubMap
&for_meta_scrub
) {
1746 cleaned_meta_map
.objects
.empty()) {
1747 cleaned_meta_map
.swap(for_meta_scrub
);
1749 auto iter
= cleaned_meta_map
.objects
.end();
1750 --iter
; // not empty, see if clause
1751 auto begin
= cleaned_meta_map
.objects
.begin();
1752 if (iter
->first
.has_snapset()) {
1755 while (iter
!= begin
) {
1757 if (next
->first
.get_head() != iter
->first
.get_head()) {
1763 for_meta_scrub
.objects
.insert(begin
, iter
);
1764 cleaned_meta_map
.objects
.erase(begin
, iter
);
1768 // digest updates which we are waiting on
1769 int num_digest_updates_pending
;
1772 hobject_t start
, end
; // [start,end)
1773 hobject_t max_end
; // Largest end that may have been sent to replicas
1774 eversion_t subset_last_update
;
1776 // chunky scrub state
1786 WAIT_DIGEST_UPDATES
,
1791 std::unique_ptr
<Scrub::Store
> store
;
1795 int preempt_divisor
;
1797 list
<Context
*> callbacks
;
1798 void add_callback(Context
*context
) {
1799 callbacks
.push_back(context
);
1801 void run_callbacks() {
1802 list
<Context
*> to_run
;
1803 to_run
.swap(callbacks
);
1804 for (list
<Context
*>::iterator i
= to_run
.begin();
1811 static const char *state_string(const PG::Scrubber::State
& state
) {
1812 const char *ret
= NULL
;
1815 case INACTIVE
: ret
= "INACTIVE"; break;
1816 case NEW_CHUNK
: ret
= "NEW_CHUNK"; break;
1817 case WAIT_PUSHES
: ret
= "WAIT_PUSHES"; break;
1818 case WAIT_LAST_UPDATE
: ret
= "WAIT_LAST_UPDATE"; break;
1819 case BUILD_MAP
: ret
= "BUILD_MAP"; break;
1820 case BUILD_MAP_DONE
: ret
= "BUILD_MAP_DONE"; break;
1821 case WAIT_REPLICAS
: ret
= "WAIT_REPLICAS"; break;
1822 case COMPARE_MAPS
: ret
= "COMPARE_MAPS"; break;
1823 case WAIT_DIGEST_UPDATES
: ret
= "WAIT_DIGEST_UPDATES"; break;
1824 case FINISH
: ret
= "FINISH"; break;
1825 case BUILD_MAP_REPLICA
: ret
= "BUILD_MAP_REPLICA"; break;
1830 bool is_chunky_scrub_active() const { return state
!= INACTIVE
; }
1835 waiting_on_whom
.clear();
1836 if (active_rep_scrub
) {
1837 active_rep_scrub
= OpRequestRef();
1839 received_maps
.clear();
1842 must_deep_scrub
= false;
1843 must_repair
= false;
1845 time_for_deep
= false;
1846 auto_repair
= false;
1847 check_repair
= false;
1848 deep_scrub_on_error
= false;
1850 state
= PG::Scrubber::INACTIVE
;
1851 start
= hobject_t();
1853 max_end
= hobject_t();
1854 subset_last_update
= eversion_t();
1858 omap_stats
= (const struct omap_stat_t
){ 0 };
1861 inconsistent
.clear();
1863 authoritative
.clear();
1864 num_digest_updates_pending
= 0;
1865 primary_scrubmap
= ScrubMap();
1866 primary_scrubmap_pos
.reset();
1867 replica_scrubmap
= ScrubMap();
1868 replica_scrubmap_pos
.reset();
1869 cleaned_meta_map
= ScrubMap();
1872 sleep_start
= utime_t();
1875 void create_results(const hobject_t
& obj
);
1876 void cleanup_store(ObjectStore::Transaction
*t
);
1880 bool scrub_after_recovery
;
1884 bool scrub_can_preempt
= false;
1885 bool scrub_preempted
= false;
1887 // we allow some number of preemptions of the scrub, which mean we do
1888 // not block. then we start to block. once we start blocking, we do
1889 // not stop until the scrub range is completed.
1890 bool write_blocked_by_scrub(const hobject_t
&soid
);
1892 /// true if the given range intersects the scrub interval in any way
1893 bool range_intersects_scrub(const hobject_t
&start
, const hobject_t
& end
);
1896 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
1897 pg_shard_t bad_peer
);
1899 void chunky_scrub(ThreadPool::TPHandle
&handle
);
1900 void scrub_compare_maps();
1902 * return true if any inconsistency/missing is repaired, false otherwise
1904 bool scrub_process_inconsistent();
1905 bool ops_blocked_by_scrub() const;
1906 void scrub_finish();
1907 void scrub_clear_state(bool keep_repair
= false);
1908 void _scan_snaps(ScrubMap
&map
);
1909 void _repair_oinfo_oid(ScrubMap
&map
);
1910 void _scan_rollback_obs(const vector
<ghobject_t
> &rollback_obs
);
1911 void _request_scrub_map(pg_shard_t replica
, eversion_t version
,
1912 hobject_t start
, hobject_t end
, bool deep
,
1913 bool allow_preemption
);
1914 int build_scrub_map_chunk(
1916 ScrubMapBuilder
&pos
,
1917 hobject_t start
, hobject_t end
, bool deep
,
1918 ThreadPool::TPHandle
&handle
);
1920 * returns true if [begin, end) is good to scrub at this time
1921 * a false return value obliges the implementer to requeue scrub when the
1922 * condition preventing scrub clears
1924 virtual bool _range_available_for_scrub(
1925 const hobject_t
&begin
, const hobject_t
&end
) = 0;
1926 virtual void scrub_snapshot_metadata(
1928 const std::map
<hobject_t
,
1929 pair
<boost::optional
<uint32_t>,
1930 boost::optional
<uint32_t>>> &missing_digest
) { }
1931 virtual void _scrub_clear_state() { }
1932 virtual void _scrub_finish() { }
1933 void clear_scrub_reserved();
1934 void scrub_reserve_replicas();
1935 void scrub_unreserve_replicas();
1936 bool scrub_all_replicas_reserved() const;
1940 ThreadPool::TPHandle
&handle
);
1941 void do_replica_scrub_map(OpRequestRef op
);
1943 void handle_scrub_reserve_request(OpRequestRef op
);
1944 void handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
);
1945 void handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
);
1946 void handle_scrub_reserve_release(OpRequestRef op
);
1948 void reject_reservation();
1949 void schedule_backfill_retry(float retry
);
1950 void schedule_recovery_retry(float retry
);
1952 // -- recovery state --
1954 template <class EVT
>
1955 struct QueuePeeringEvt
: Context
{
1959 QueuePeeringEvt(PG
*pg
, epoch_t epoch
, EVT evt
) :
1960 pg(pg
), epoch(epoch
), evt(evt
) {}
1961 void finish(int r
) override
{
1963 pg
->queue_peering_event(PGPeeringEventRef(
1973 struct QueryState
: boost::statechart::event
< QueryState
> {
1975 explicit QueryState(Formatter
*f
) : f(f
) {}
1976 void print(std::ostream
*out
) const {
1982 int pg_stat_adjust(osd_stat_t
*new_stat
);
1985 struct AdvMap
: boost::statechart::event
< AdvMap
> {
1988 vector
<int> newup
, newacting
;
1989 int up_primary
, acting_primary
;
1991 OSDMapRef osdmap
, OSDMapRef lastmap
,
1992 vector
<int>& newup
, int up_primary
,
1993 vector
<int>& newacting
, int acting_primary
):
1994 osdmap(osdmap
), lastmap(lastmap
),
1996 newacting(newacting
),
1997 up_primary(up_primary
),
1998 acting_primary(acting_primary
) {}
1999 void print(std::ostream
*out
) const {
2004 struct ActMap
: boost::statechart::event
< ActMap
> {
2005 ActMap() : boost::statechart::event
< ActMap
>() {}
2006 void print(std::ostream
*out
) const {
2010 struct Activate
: boost::statechart::event
< Activate
> {
2011 epoch_t activation_epoch
;
2012 explicit Activate(epoch_t q
) : boost::statechart::event
< Activate
>(),
2013 activation_epoch(q
) {}
2014 void print(std::ostream
*out
) const {
2015 *out
<< "Activate from " << activation_epoch
;
2019 struct UnfoundBackfill
: boost::statechart::event
<UnfoundBackfill
> {
2020 explicit UnfoundBackfill() {}
2021 void print(std::ostream
*out
) const {
2022 *out
<< "UnfoundBackfill";
2025 struct UnfoundRecovery
: boost::statechart::event
<UnfoundRecovery
> {
2026 explicit UnfoundRecovery() {}
2027 void print(std::ostream
*out
) const {
2028 *out
<< "UnfoundRecovery";
2032 struct RequestScrub
: boost::statechart::event
<RequestScrub
> {
2035 explicit RequestScrub(bool d
, bool r
) : deep(d
), repair(r
) {}
2036 void print(std::ostream
*out
) const {
2037 *out
<< "RequestScrub(" << (deep
? "deep" : "shallow")
2038 << (repair
? " repair" : "");
2043 TrivialEvent(Initialize
)
2044 TrivialEvent(GotInfo
)
2045 TrivialEvent(NeedUpThru
)
2046 TrivialEvent(Backfilled
)
2047 TrivialEvent(LocalBackfillReserved
)
2048 TrivialEvent(RejectTooFullRemoteReservation
)
2050 TrivialEvent(RequestBackfill
)
2052 TrivialEvent(RemoteRecoveryPreempted
)
2053 TrivialEvent(RemoteBackfillPreempted
)
2054 TrivialEvent(BackfillTooFull
)
2055 TrivialEvent(RecoveryTooFull
)
2057 TrivialEvent(MakePrimary
)
2058 TrivialEvent(MakeStray
)
2059 TrivialEvent(NeedActingChange
)
2060 TrivialEvent(IsIncomplete
)
2061 TrivialEvent(IsDown
)
2063 TrivialEvent(AllReplicasRecovered
)
2064 TrivialEvent(DoRecovery
)
2065 TrivialEvent(LocalRecoveryReserved
)
2068 TrivialEvent(AllRemotesReserved
)
2069 TrivialEvent(AllBackfillsReserved
)
2070 TrivialEvent(GoClean
)
2072 TrivialEvent(AllReplicasActivated
)
2074 TrivialEvent(IntervalFlush
)
2077 TrivialEvent(DeleteStart
)
2078 TrivialEvent(DeleteSome
)
2080 TrivialEvent(SetForceRecovery
)
2081 TrivialEvent(UnsetForceRecovery
)
2082 TrivialEvent(SetForceBackfill
)
2083 TrivialEvent(UnsetForceBackfill
)
2086 TrivialEvent(DeleteReserved
)
2087 TrivialEvent(DeleteInterrupted
)
  /* Encapsulates PG recovery process */
  class RecoveryState {
    void start_handle(RecoveryCtx *new_ctx);
    void end_handle();
  public:
    void begin_block_outgoing();
    void end_block_outgoing();
    void clear_blocked_outgoing();
  private:
    struct Initial;
    class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
      RecoveryState *state;
    public:
      PG *pg;

      utime_t event_time;
      uint64_t event_count;

      void clear_event_counters() {
	event_time = utime_t();
	event_count = 0;
      }

      void log_enter(const char *state_name);
      void log_exit(const char *state_name, utime_t duration);

      RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
      /* Accessor functions for state methods */
      ObjectStore::Transaction* get_cur_transaction() {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->transaction);
	return state->rctx->transaction;
      }

      void send_query(pg_shard_t to, const pg_query_t &query) {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->query_map);
	(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
	  query;
      }

      map<int, map<spg_t, pg_query_t> > *get_query_map() {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->query_map);
	return state->rctx->query_map;
      }

      map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->info_map);
	return state->rctx->info_map;
      }

      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }

      void send_notify(pg_shard_t to,
		       const pg_notify_t &info, const PastIntervals &pi) {
	ceph_assert(state->rctx);
	state->rctx->send_notify(to, info, pi);
      }
    };
    friend class RecoveryMachine;
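    // The accessors above are only valid while a RecoveryCtx is installed,
    // i.e. between start_handle() and end_handle(). Illustrative use from a
    // state's react() method (not a verbatim excerpt):
    //
    //   context< RecoveryMachine >().send_query(
    //     peer, pg_query_t(pg_query_t::INFO, ...));
    //
    // Anything queued this way is flushed when the enclosing handler ends.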
    // WaitRemoteBackfillReserved
    // WaitLocalBackfillReserved
    // WaitRemoteRecoveryReserved
    // WaitLocalRecoveryReserved
    // RepWaitBackfillReserved
    // RepWaitRecoveryReserved
    // WaitDeleteReserved
    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };
    struct Reset;

    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::transition< Initialize, Reset >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;

      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::custom_reaction< IntervalFlush >,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Start;

    struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
      explicit Started(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< IntervalFlush >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::custom_reaction<SetForceRecovery>,
	boost::statechart::custom_reaction<UnsetForceRecovery>,
	boost::statechart::custom_reaction<SetForceBackfill>,
	boost::statechart::custom_reaction<UnsetForceBackfill>,
	boost::statechart::custom_reaction<RequestScrub>,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Primary;
    struct Stray;

    struct Start : boost::statechart::state< Start, Started >, NamedState {
      explicit Start(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::transition< MakePrimary, Primary >,
	boost::statechart::transition< MakeStray, Stray >
	> reactions;
    };

    struct Peering;
    struct WaitActingChange;
    struct Incomplete;
    struct Down;
    struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
      explicit Primary(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::transition< NeedActingChange, WaitActingChange >,
	boost::statechart::custom_reaction<SetForceRecovery>,
	boost::statechart::custom_reaction<UnsetForceRecovery>,
	boost::statechart::custom_reaction<SetForceBackfill>,
	boost::statechart::custom_reaction<UnsetForceBackfill>,
	boost::statechart::custom_reaction<RequestScrub>
	> reactions;
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const SetForceRecovery&);
      boost::statechart::result react(const UnsetForceRecovery&);
      boost::statechart::result react(const SetForceBackfill&);
      boost::statechart::result react(const UnsetForceBackfill&);
      boost::statechart::result react(const RequestScrub&);
    };
    struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary >,
			      NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< MNotifyRec >
	> reactions;
      explicit WaitActingChange(my_context ctx);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MNotifyRec&);
    };
    struct GetInfo;
    struct Active;

    struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
      PastIntervals::PriorSet prior_set;
      bool history_les_bound;  //< need osd_find_best_info_ignore_history_les

      explicit Peering(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::transition< Activate, Active >,
	boost::statechart::custom_reaction< AdvMap >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap &advmap);
    };
    struct WaitLocalRecoveryReserved;
    struct Activating;

    struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
      explicit Active(my_context ctx);

      const set<pg_shard_t> remote_shards_to_reserve_recovery;
      const set<pg_shard_t> remote_shards_to_reserve_backfill;
      bool all_replicas_activated;

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MTrim >,
	boost::statechart::custom_reaction< Backfilled >,
	boost::statechart::custom_reaction< AllReplicasActivated >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< DeferBackfill >,
	boost::statechart::custom_reaction< UnfoundRecovery >,
	boost::statechart::custom_reaction< UnfoundBackfill >,
	boost::statechart::custom_reaction< RemoteReservationRevokedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevoked >,
	boost::statechart::custom_reaction< DoRecovery >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MNotifyRec& notevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MTrim& trimevt);
      boost::statechart::result react(const Backfilled&) {
	return discard_event();
      }
      boost::statechart::result react(const AllReplicasActivated&);
      boost::statechart::result react(const DeferRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteReservationRevoked&) {
	return discard_event();
      }
      boost::statechart::result react(const DoRecovery&) {
	return discard_event();
      }
    };
    struct Clean : boost::statechart::state< Clean, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::custom_reaction<SetForceRecovery>,
	boost::statechart::custom_reaction<SetForceBackfill>
	> reactions;
      explicit Clean(my_context ctx);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< GoClean, Clean >,
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::custom_reaction< AllReplicasActivated >
	> reactions;
      explicit Recovered(my_context ctx);
      boost::statechart::result react(const AllReplicasActivated&) {
	post_event(GoClean());
	return forward_event();
      }
    };
    struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< Backfilled >,
	boost::statechart::custom_reaction< DeferBackfill >,
	boost::statechart::custom_reaction< UnfoundBackfill >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevokedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevoked >
	> reactions;
      explicit Backfilling(my_context ctx);
      boost::statechart::result react(const RemoteReservationRejectedTooFull& evt) {
	// for compat with old peers
	post_event(RemoteReservationRevokedTooFull());
	return discard_event();
      }
      void backfill_release_reservations();
      boost::statechart::result react(const Backfilled& evt);
      boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
      boost::statechart::result react(const RemoteReservationRevoked& evt);
      boost::statechart::result react(const DeferBackfill& evt);
      boost::statechart::result react(const UnfoundBackfill& evt);
      void cancel_backfill();
    };
    struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevoked >,
	boost::statechart::transition< AllBackfillsReserved, Backfilling >
	> reactions;
      set<pg_shard_t>::const_iterator backfill_osd_it;
      explicit WaitRemoteBackfillReserved(my_context ctx);
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
      boost::statechart::result react(const RemoteReservationRevoked& evt);
    };
    struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
	> reactions;
      explicit WaitLocalBackfillReserved(my_context ctx);
    };
    struct NotBackfilling : boost::statechart::state< NotBackfilling, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >,
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >
	> reactions;
      explicit NotBackfilling(my_context ctx);
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
    };
    struct NotRecovering : boost::statechart::state< NotRecovering, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< UnfoundRecovery >
	> reactions;
      explicit NotRecovering(my_context ctx);
      boost::statechart::result react(const DeferRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
	return discard_event();
      }
    };
    struct ToDelete;
    struct RepNotRecovering;
    struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
      explicit ReplicaActive(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< MQuery >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MTrim >,
	boost::statechart::custom_reaction< Activate >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< DeferBackfill >,
	boost::statechart::custom_reaction< UnfoundRecovery >,
	boost::statechart::custom_reaction< UnfoundBackfill >,
	boost::statechart::custom_reaction< RemoteBackfillPreempted >,
	boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
	boost::statechart::custom_reaction< RecoveryDone >,
	boost::statechart::transition<DeleteStart, ToDelete>
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MTrim& trimevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MQuery&);
      boost::statechart::result react(const Activate&);
      boost::statechart::result react(const RecoveryDone&) {
	return discard_event();
      }
      boost::statechart::result react(const DeferRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteBackfillPreempted& evt) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
	return discard_event();
      }
    };
    struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< RecoveryDone, RepNotRecovering >,
	// for compat with old peers
	boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
	boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
	boost::statechart::custom_reaction< BackfillTooFull >,
	boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
	boost::statechart::custom_reaction< RemoteBackfillPreempted >
	> reactions;
      explicit RepRecovering(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryPreempted &evt);
      boost::statechart::result react(const BackfillTooFull &evt);
      boost::statechart::result react(const RemoteBackfillPreempted &evt);
    };
    struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationCanceled >
	> reactions;
      explicit RepWaitBackfillReserved(my_context ctx);
      boost::statechart::result react(const RemoteBackfillReserved &evt);
      boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull &evt);
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };
    struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
	// for compat with old peers
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationCanceled >
	> reactions;
      explicit RepWaitRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull &evt) {
	// for compat with old peers
	post_event(RemoteReservationCanceled());
	return discard_event();
      }
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };
    struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RequestRecoveryPrio >,
	boost::statechart::custom_reaction< RequestBackfillPrio >,
	boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
	boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
	boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
	> reactions;
      explicit RepNotRecovering(my_context ctx);
      boost::statechart::result react(const RequestRecoveryPrio &evt);
      boost::statechart::result react(const RequestBackfillPrio &evt);
      boost::statechart::result react(const RemoteBackfillReserved &evt) {
	// my reservation completion raced with a RELEASE from primary
	return discard_event();
      }
      boost::statechart::result react(const RemoteRecoveryReserved &evt) {
	// my reservation completion raced with a RELEASE from primary
	return discard_event();
      }
      boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
    };
    struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< AllReplicasRecovered >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< UnfoundRecovery >,
	boost::statechart::custom_reaction< RequestBackfill >
	> reactions;
      explicit Recovering(my_context ctx);
      void release_reservations(bool cancel = false);
      boost::statechart::result react(const AllReplicasRecovered &evt);
      boost::statechart::result react(const DeferRecovery& evt);
      boost::statechart::result react(const UnfoundRecovery& evt);
      boost::statechart::result react(const RequestBackfill &evt);
    };
    struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
	boost::statechart::transition< AllRemotesReserved, Recovering >
	> reactions;
      set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
      explicit WaitRemoteRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
    };
    struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
	boost::statechart::custom_reaction< RecoveryTooFull >
	> reactions;
      explicit WaitLocalRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RecoveryTooFull &evt);
    };
    struct Activating : boost::statechart::state< Activating, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< AllReplicasRecovered, Recovered >,
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
	> reactions;
      explicit Activating(my_context ctx);
    };
    struct Stray : boost::statechart::state< Stray, Started >,
		   NamedState {
      explicit Stray(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< MQuery >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< RecoveryDone >,
	boost::statechart::transition<DeleteStart, ToDelete>
	> reactions;
      boost::statechart::result react(const MQuery& query);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const RecoveryDone&) {
	return discard_event();
      }
    };
    struct WaitDeleteReserved;
    struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
      unsigned priority = 0;
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< DeleteSome >
	> reactions;
      explicit ToDelete(my_context ctx);
      boost::statechart::result react(const ActMap &evt);
      boost::statechart::result react(const DeleteSome &evt) {
	// happens if we drop out of Deleting due to reprioritization etc.
	return discard_event();
      }
    };
    struct Deleting;
    struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
							 ToDelete>, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition<DeleteReserved, Deleting>
	> reactions;
      explicit WaitDeleteReserved(my_context ctx);
    };
    struct Deleting : boost::statechart::state<Deleting,
					       ToDelete>, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< DeleteSome >,
	boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
	> reactions;
      explicit Deleting(my_context ctx);
      boost::statechart::result react(const DeleteSome &evt);
    };
    struct GetLog;

    struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
      set<pg_shard_t> peer_info_requested;

      explicit GetInfo(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::transition< GotInfo, GetLog >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::transition< IsDown, Down >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };
    struct GotLog : boost::statechart::event< GotLog > {
      GotLog() : boost::statechart::event< GotLog >() {}
    };
    struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
      pg_shard_t auth_log_shard;
      boost::intrusive_ptr<MOSDPGLog> msg;

      explicit GetLog(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< GotLog >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::transition< IsIncomplete, Incomplete >
	> reactions;
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const GotLog&);
    };
    struct WaitUpThru;

    struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
      set<pg_shard_t> peer_missing_requested;

      explicit GetMissing(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::transition< NeedUpThru, WaitUpThru >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
    };
    struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
      explicit WaitUpThru(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< MLogRec >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap& am);
      boost::statechart::result react(const MLogRec& logrec);
    };
    struct Down : boost::statechart::state< Down, Peering >, NamedState {
      explicit Down(my_context ctx);
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< MNotifyRec >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };
    struct Incomplete : boost::statechart::state< Incomplete, Peering >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::custom_reaction< QueryState >
	> reactions;
      explicit Incomplete(my_context ctx);
      boost::statechart::result react(const AdvMap &advmap);
      boost::statechart::result react(const MNotifyRec& infoevt);
      boost::statechart::result react(const QueryState& q);
    };
    RecoveryMachine machine;
    PG *pg;

    /// context passed in by state machine caller
    RecoveryCtx *orig_ctx;

    /// populated if we are buffering messages pending a flush
    boost::optional<BufferedRecoveryMessages> messages_pending_flush;

    /**
     * populated between start_handle() and end_handle(), points into
     * the message lists for messages_pending_flush while blocking messages
     * or into orig_ctx otherwise
     */
    boost::optional<RecoveryCtx> rctx;
  public:
    explicit RecoveryState(PG *pg)
      : machine(this, pg), pg(pg), orig_ctx(0) {
      machine.initiate();
    }

    void handle_event(const boost::statechart::event_base &evt,
		      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt);
      end_handle();
    }

    void handle_event(PGPeeringEventRef evt,
		      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt->get_event());
      end_handle();
    }
  } recovery_state;
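  // Minimal sketch of the event flow (illustrative, not a verbatim excerpt):
  // the OSD wraps a peering message or timer tick in an event, builds a
  // RecoveryCtx to collect the resulting transaction, queries and notifies,
  // and hands both to the machine:
  //
  //   PG::RecoveryCtx rctx = create_context();    // assumed OSD-side helper
  //   pg->recovery_state.handle_event(evt, &rctx);
  //   dispatch_context(rctx, pg, curmap);         // assumed OSD-side helper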
  uint64_t peer_features;
  uint64_t acting_features;
  uint64_t upacting_features;

  /// most recently consumed osdmap's require_osd_version
  unsigned last_require_osd_release = 0;
  bool delete_needs_sleep = false;
  void reset_min_peer_features() {
    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  }
  uint64_t get_min_peer_features() const { return peer_features; }
  void apply_peer_features(uint64_t f) { peer_features &= f; }

  uint64_t get_min_acting_features() const { return acting_features; }
  uint64_t get_min_upacting_features() const { return upacting_features; }

  bool perform_deletes_during_peering() const {
    return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
  }

  bool hard_limit_pglog() const {
    return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
  }
  void init_primary_up_acting(
    const vector<int> &newup,
    const vector<int> &newacting,
    int new_up_primary,
    int new_acting_primary) {
    actingset.clear();
    acting = newacting;
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] != CRUSH_ITEM_NONE)
	actingset.insert(
	  pg_shard_t(
	    acting[i],
	    pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    upset.clear();
    up = newup;
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] != CRUSH_ITEM_NONE)
	upset.insert(
	  pg_shard_t(
	    up[i],
	    pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    if (!pool.info.is_erasure()) {
      up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
      primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
      return;
    }
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
	up_primary = pg_shard_t(up[i], shard_id_t(i));
	break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
	primary = pg_shard_t(acting[i], shard_id_t(i));
	break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
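  // Worked example (illustrative): for an erasure-coded pool with
  // acting = [3, CRUSH_ITEM_NONE, 7] and new_acting_primary = 3, the loops
  // above yield actingset = { 3(0), 7(2) } -- the shard id records each
  // OSD's position in the acting vector -- and primary = 3(0). For a
  // replicated pool every shard is NO_SHARD and primary is simply osd.3.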
  void set_role(int r) {
    role = r;
  }

  bool state_test(uint64_t m) const { return (state & m) != 0; }
  void state_set(uint64_t m) { state |= m; }
  void state_clear(uint64_t m) { state &= ~m; }
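  // The pg state word is a bitmask of PG_STATE_* flags, so several states
  // can be asserted at once; e.g. (illustrative):
  //
  //   state_set(PG_STATE_ACTIVE | PG_STATE_DEGRADED);
  //   state_test(PG_STATE_DEGRADED);   // -> true
  //   state_clear(PG_STATE_DEGRADED);  // the active bit survives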
  bool is_complete() const { return info.last_complete == info.last_update; }
  bool should_send_notify() const { return send_notify; }

  uint64_t get_state() const { return state; }
  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
  bool is_peering() const { return state_test(PG_STATE_PEERING); }
  bool is_down() const { return state_test(PG_STATE_DOWN); }
  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
  bool is_peered() const {
    return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
  }
  bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
  bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
  bool is_repair() const { return state_test(PG_STATE_REPAIR); }

  bool is_empty() const { return info.last_update == eversion_t(0,0); }

  void do_pending_flush();
  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
  static void _init(ObjectStore::Transaction& t,
		    spg_t pgid, const pg_pool_t *pool);

  void prepare_write_info(map<string,bufferlist> *km);

  void update_store_with_options();
  static int _prepare_write_info(
    CephContext* cct,
    map<string,bufferlist> *km,
    epoch_t epoch,
    pg_info_t &info,
    pg_info_t &last_written_info,
    PastIntervals &past_intervals,
    bool dirty_big_info,
    bool dirty_epoch,
    bool try_fast_info,
    PerfCounters *logger = nullptr);
  void write_if_dirty(RecoveryCtx *rctx) {
    write_if_dirty(*rctx->transaction);
  }
  void write_if_dirty(ObjectStore::Transaction& t);

  PGLog::IndexedLog projected_log;
  bool check_in_progress_op(
    const osd_reqid_t &r,
    eversion_t *version,
    version_t *user_version,
    int *return_code) const;
  eversion_t projected_last_update;
  eversion_t get_next_version() const {
    eversion_t at_version(
      get_osdmap_epoch(),
      projected_last_update.version+1);
    ceph_assert(at_version > info.last_update);
    ceph_assert(at_version > pg_log.get_head());
    ceph_assert(at_version > projected_last_update);
    return at_version;
  }
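  // eversion_t orders by (epoch, version), so the asserts above enforce
  // strict monotonicity; e.g. (illustrative) with projected_last_update =
  // (e=40, v=11), the next projected version is (e=40, v=12) while the map
  // epoch is still 40, or (e=41, v=12) after an epoch bump.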
  void add_log_entry(const pg_log_entry_t& e, bool applied);
  void append_log(
    const vector<pg_log_entry_t>& logv,
    eversion_t trim_to,
    eversion_t roll_forward_to,
    ObjectStore::Transaction &t,
    bool transaction_applied = true,
    bool async = false);
  bool check_log_for_corruption(ObjectStore *store);

  std::string get_corrupt_pg_log_name() const;

  void update_snap_map(
    const vector<pg_log_entry_t> &log_entries,
    ObjectStore::Transaction& t);

  void filter_snapc(vector<snapid_t> &snaps);

  void log_weirdness();

  virtual void kick_snap_trim() = 0;
  virtual void snap_trimmer_scrub_complete() = 0;
  bool requeue_scrub(bool high_priority = false);
  void queue_recovery();
  unsigned get_scrub_priority();

  /// share pg info after a pg is active
  void share_pg_info();
  bool append_log_entries_update_missing(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  /**
   * Merge entries updating missing as necessary on all
   * acting_recovery_backfill logs and missings (also missing_loc)
   */
  void merge_new_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);
  void reset_interval_flush();
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t);
  void set_last_peering_reset();

  void update_history(const pg_history_t& history);
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
		    pair<pg_shard_t, pg_info_t> &notify_info);
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
  void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);
  // OpRequest queueing
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
  bool old_peering_evt(PGPeeringEventRef evt) {
    return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
  }
  static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
    return e <= cur_epoch;
  }
  bool have_same_or_newer_map(epoch_t e) {
    return e <= get_osdmap_epoch();
  }

  bool op_has_sufficient_caps(OpRequestRef& op);

  void take_waiters();

  friend class FlushState;
  virtual void on_role_change() = 0;
  virtual void on_pool_change() = 0;
  virtual void on_change(ObjectStore::Transaction *t) = 0;
  virtual void on_activate() = 0;
  virtual void on_flushed() = 0;
  virtual void check_blacklisted_watchers() = 0;

  friend ostream& operator<<(ostream& out, const PG& pg);
};

ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);