1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
39 #include "common/Timer.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
69 typedef OpRequest::Ref OpRequestRef
;
77 void intrusive_ptr_add_ref(PG
*pg
);
78 void intrusive_ptr_release(PG
*pg
);
80 using state_history_entry
= std::tuple
<utime_t
, utime_t
, const char*>;
81 using embedded_state
= std::pair
<utime_t
, const char*>;
83 struct PGStateInstance
{
84 // Time spent in pg states
86 void setepoch(const epoch_t current_epoch
) {
87 this_epoch
= current_epoch
;
90 void enter_state(const utime_t entime
, const char* state
) {
91 embedded_states
.push(std::make_pair(entime
, state
));
94 void exit_state(const utime_t extime
) {
95 embedded_state this_state
= embedded_states
.top();
96 state_history
.push_back(state_history_entry
{
97 this_state
.first
, extime
, this_state
.second
});
98 embedded_states
.pop();
103 std::vector
<state_history_entry
> state_history
;
104 std::stack
<embedded_state
> embedded_states
;
107 class PGStateHistory
{
108 // Member access protected with the PG lock
110 PGStateHistory() : buffer(10) {}
112 void enter(PG
* pg
, const utime_t entime
, const char* state
);
114 void exit(const char* state
);
120 void set_pg_in_destructor() { pg_in_destructor
= true; }
122 void dump(Formatter
* f
) const;
125 bool pg_in_destructor
= false;
126 PG
* thispg
= nullptr;
127 std::unique_ptr
<PGStateInstance
> tmppi
;
128 PGStateInstance
* pi
= nullptr;
129 boost::circular_buffer
<std::unique_ptr
<PGStateInstance
>> buffer
;
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG
*pg
);
136 void put_with_id(PG
*pg
, uint64_t id
);
137 typedef TrackedIntPtr
<PG
> PGRef
;
139 typedef boost::intrusive_ptr
<PG
> PGRef
;
142 class PGRecoveryStats
{
143 struct per_state_info
{
144 uint64_t enter
, exit
; // enter/exit counts
146 utime_t event_time
; // time spent processing events
147 utime_t total_time
; // total time in state
148 utime_t min_time
, max_time
;
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
153 map
<const char *,per_state_info
> info
;
157 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
160 Mutex::Locker
l(lock
);
163 void dump(ostream
& out
) {
164 Mutex::Locker
l(lock
);
165 for (map
<const char *,per_state_info
>::iterator p
= info
.begin(); p
!= info
.end(); ++p
) {
166 per_state_info
& i
= p
->second
;
167 out
<< i
.enter
<< "\t" << i
.exit
<< "\t"
168 << i
.events
<< "\t" << i
.event_time
<< "\t"
169 << i
.total_time
<< "\t"
170 << i
.min_time
<< "\t" << i
.max_time
<< "\t"
175 void dump_formatted(Formatter
*f
) {
176 Mutex::Locker
l(lock
);
177 f
->open_array_section("pg_recovery_stats");
178 for (map
<const char *,per_state_info
>::iterator p
= info
.begin();
179 p
!= info
.end(); ++p
) {
180 per_state_info
& i
= p
->second
;
181 f
->open_object_section("recovery_state");
182 f
->dump_int("enter", i
.enter
);
183 f
->dump_int("exit", i
.exit
);
184 f
->dump_int("events", i
.events
);
185 f
->dump_stream("event_time") << i
.event_time
;
186 f
->dump_stream("total_time") << i
.total_time
;
187 f
->dump_stream("min_time") << i
.min_time
;
188 f
->dump_stream("max_time") << i
.max_time
;
189 vector
<string
> states
;
190 get_str_vec(p
->first
, "/", states
);
191 f
->open_array_section("nested_states");
192 for (vector
<string
>::iterator st
= states
.begin();
193 st
!= states
.end(); ++st
) {
194 f
->dump_string("state", *st
);
202 void log_enter(const char *s
) {
203 Mutex::Locker
l(lock
);
206 void log_exit(const char *s
, utime_t dur
, uint64_t events
, utime_t event_dur
) {
207 Mutex::Locker
l(lock
);
208 per_state_info
&i
= info
[s
];
211 if (dur
> i
.max_time
)
213 if (dur
< i
.min_time
|| i
.min_time
== utime_t())
216 i
.event_time
+= event_dur
;
222 epoch_t cached_epoch
;
228 SnapContext snapc
; // the default pool snapc, ready to go.
230 interval_set
<snapid_t
> cached_removed_snaps
; // current removed_snaps set
231 interval_set
<snapid_t
> newly_removed_snaps
; // newly removed in the last epoch
233 PGPool(CephContext
* cct
, OSDMapRef map
, int64_t i
)
235 cached_epoch(map
->get_epoch()),
237 name(map
->get_pool_name(id
)),
238 auid(map
->get_pg_pool(id
)->auid
) {
239 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
242 snapc
= pi
->get_snap_context();
243 pi
->build_removed_snaps(cached_removed_snaps
);
246 void update(OSDMapRef map
);
/** PG - Replica Placement Group */
253 class PG
: public DoutPrefixProvider
{
258 SnapMapper snap_mapper
;
259 bool eio_errors_to_process
= false;
261 virtual PGBackend
*get_pgbackend() = 0;
263 std::string
gen_prefix() const override
;
264 CephContext
*get_cct() const override
{ return cct
; }
265 unsigned get_subsys() const override
{ return ceph_subsys_osd
; }
268 void update_snap_mapper_bits(uint32_t bits
) {
269 snap_mapper
.update_bits(bits
);
271 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
272 IsPGRecoverablePredicate
*get_is_recoverable_predicate() {
273 return get_pgbackend()->get_is_recoverable_predicate();
276 OSDMapRef osdmap_ref
;
277 OSDMapRef last_persisted_osdmap_ref
;
280 void requeue_map_waiters();
282 void update_osdmap_ref(OSDMapRef newmap
) {
283 assert(_lock
.is_locked_by_me());
284 osdmap_ref
= std::move(newmap
);
288 OSDMapRef
get_osdmap() const {
  /** locking and reference counting.
   * I destroy myself when the reference count hits zero.
   * lock() should be called before doing anything.
   * get() should be called on pointer copy (to another thread, etc.).
   * put() should be called on destruction of some previously copied pointer.
   * unlock() when done with the current pointer (_most common_).
   */
303 std::atomic_uint ref
{0};
307 map
<uint64_t, string
> _live_ids
;
308 map
<string
, uint64_t> _tag_counts
;
313 bool deleting
; // true while in removing or OSD is shutting down
315 ZTracer::Endpoint trace_endpoint
;
317 void lock_suspend_timeout(ThreadPool::TPHandle
&handle
);
318 void lock(bool no_lockdep
= false) const;
319 void unlock() const {
320 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
322 assert(!dirty_big_info
);
326 bool is_locked() const {
327 return _lock
.is_locked();
331 uint64_t get_with_id();
332 void put_with_id(uint64_t);
333 void dump_live_ids();
335 void get(const char* tag
);
336 void put(const char* tag
);
338 bool dirty_info
, dirty_big_info
;
341 bool is_ec_pg() const {
342 return pool
.info
.ec_pool();
345 pg_info_t info
; ///< current pg info
346 pg_info_t last_written_info
; ///< last written info
348 static const __u8 cur_struct_v
= 10;
349 // v10 is the new past_intervals encoding
350 // v9 was fastinfo_key addition
351 // v8 was the move to a per-pg pgmeta object
352 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
353 // (first appeared in cuttlefish).
354 static const __u8 compat_struct_v
= 7;
355 bool must_upgrade() {
356 return info_struct_v
< cur_struct_v
;
359 return info_struct_v
>= compat_struct_v
;
361 void upgrade(ObjectStore
*store
);
364 ObjectStore::CollectionHandle ch
;
366 static string
get_info_key(spg_t pgid
) {
367 return stringify(pgid
) + "_info";
369 static string
get_biginfo_key(spg_t pgid
) {
370 return stringify(pgid
) + "_biginfo";
372 static string
get_epoch_key(spg_t pgid
) {
373 return stringify(pgid
) + "_epoch";
375 ghobject_t pgmeta_oid
;
378 map
<hobject_t
, pg_missing_item
> needs_recovery_map
;
379 map
<hobject_t
, set
<pg_shard_t
> > missing_loc
;
380 set
<pg_shard_t
> missing_loc_sources
;
382 set
<pg_shard_t
> empty_set
;
384 boost::scoped_ptr
<IsPGReadablePredicate
> is_readable
;
385 boost::scoped_ptr
<IsPGRecoverablePredicate
> is_recoverable
;
386 explicit MissingLoc(PG
*pg
)
388 void set_backend_predicates(
389 IsPGReadablePredicate
*_is_readable
,
390 IsPGRecoverablePredicate
*_is_recoverable
) {
391 is_readable
.reset(_is_readable
);
392 is_recoverable
.reset(_is_recoverable
);
394 string
gen_prefix() const { return pg
->gen_prefix(); }
396 const hobject_t
&hoid
,
397 eversion_t
*v
= 0) const {
398 map
<hobject_t
, pg_missing_item
>::const_iterator i
=
399 needs_recovery_map
.find(hoid
);
400 if (i
== needs_recovery_map
.end())
406 bool is_unfound(const hobject_t
&hoid
) const {
407 return needs_recovery(hoid
) && (
408 !missing_loc
.count(hoid
) ||
409 !(*is_recoverable
)(missing_loc
.find(hoid
)->second
));
411 bool readable_with_acting(
412 const hobject_t
&hoid
,
413 const set
<pg_shard_t
> &acting
) const;
414 uint64_t num_unfound() const {
416 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
417 needs_recovery_map
.begin();
418 i
!= needs_recovery_map
.end();
420 if (is_unfound(i
->first
))
426 bool have_unfound() const {
427 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
428 needs_recovery_map
.begin();
429 i
!= needs_recovery_map
.end();
431 if (is_unfound(i
->first
))
437 needs_recovery_map
.clear();
439 missing_loc_sources
.clear();
442 void add_location(const hobject_t
&hoid
, pg_shard_t location
) {
443 missing_loc
[hoid
].insert(location
);
445 void remove_location(const hobject_t
&hoid
, pg_shard_t location
) {
446 missing_loc
[hoid
].erase(location
);
448 void add_active_missing(const pg_missing_t
&missing
) {
449 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
450 missing
.get_items().begin();
451 i
!= missing
.get_items().end();
453 map
<hobject_t
, pg_missing_item
>::const_iterator j
=
454 needs_recovery_map
.find(i
->first
);
455 if (j
== needs_recovery_map
.end()) {
456 needs_recovery_map
.insert(*i
);
458 assert(i
->second
.need
== j
->second
.need
);
463 void add_missing(const hobject_t
&hoid
, eversion_t need
, eversion_t have
) {
464 needs_recovery_map
[hoid
] = pg_missing_item(need
, have
);
466 void revise_need(const hobject_t
&hoid
, eversion_t need
) {
467 assert(needs_recovery(hoid
));
468 needs_recovery_map
[hoid
].need
= need
;
471 /// Adds info about a possible recovery source
472 bool add_source_info(
473 pg_shard_t source
, ///< [in] source
474 const pg_info_t
&oinfo
, ///< [in] info
475 const pg_missing_t
&omissing
, ///< [in] (optional) missing
476 ThreadPool::TPHandle
* handle
///< [in] ThreadPool handle
477 ); ///< @return whether a new object location was discovered
479 /// Adds recovery sources in batch
480 void add_batch_sources_info(
481 const set
<pg_shard_t
> &sources
, ///< [in] a set of resources which can be used for all objects
482 ThreadPool::TPHandle
* handle
///< [in] ThreadPool handle
485 /// Uses osdmap to update structures for now down sources
486 void check_recovery_sources(const OSDMapRef
& osdmap
);
488 /// Call when hoid is no longer missing in acting set
489 void recovered(const hobject_t
&hoid
) {
490 needs_recovery_map
.erase(hoid
);
491 missing_loc
.erase(hoid
);
494 /// Call to update structures for hoid after a change
496 const hobject_t
&hoid
,
498 const set
<pg_shard_t
> to_recover
,
499 const pg_info_t
&info
,
500 const pg_missing_t
&missing
,
501 const map
<pg_shard_t
, pg_missing_t
> &pmissing
,
502 const map
<pg_shard_t
, pg_info_t
> &pinfo
) {
504 boost::optional
<pg_missing_item
> item
;
505 auto miter
= missing
.get_items().find(hoid
);
506 if (miter
!= missing
.get_items().end()) {
507 item
= miter
->second
;
509 for (auto &&i
: to_recover
) {
512 auto pmiter
= pmissing
.find(i
);
513 assert(pmiter
!= pmissing
.end());
514 miter
= pmiter
->second
.get_items().find(hoid
);
515 if (miter
!= pmiter
->second
.get_items().end()) {
516 item
= miter
->second
;
522 return; // recovered!
524 needs_recovery_map
[hoid
] = *item
;
526 missing_loc
.insert(make_pair(hoid
, set
<pg_shard_t
>())).first
;
527 assert(info
.last_backfill
.is_max());
528 assert(info
.last_update
>= item
->need
);
529 if (!missing
.is_missing(hoid
))
530 mliter
->second
.insert(self
);
531 for (auto &&i
: pmissing
) {
532 auto pinfoiter
= pinfo
.find(i
.first
);
533 assert(pinfoiter
!= pinfo
.end());
534 if (item
->need
<= pinfoiter
->second
.last_update
&&
535 hoid
<= pinfoiter
->second
.last_backfill
&&
536 !i
.second
.is_missing(hoid
))
537 mliter
->second
.insert(i
.first
);
541 const set
<pg_shard_t
> &get_locations(const hobject_t
&hoid
) const {
542 return missing_loc
.count(hoid
) ?
543 missing_loc
.find(hoid
)->second
: empty_set
;
545 const map
<hobject_t
, set
<pg_shard_t
>> &get_missing_locs() const {
548 const map
<hobject_t
, pg_missing_item
> &get_needs_recovery() const {
549 return needs_recovery_map
;
553 PastIntervals past_intervals
;
555 interval_set
<snapid_t
> snap_trimq
;
557 /* You should not use these items without taking their respective queue locks
558 * (if they have one) */
559 xlist
<PG
*>::item stat_queue_item
;
561 bool recovery_queued
;
563 int recovery_ops_active
;
564 set
<pg_shard_t
> waiting_on_backfill
;
565 #ifdef DEBUG_RECOVERY_OIDS
566 set
<hobject_t
> recovering_oids
;
570 int role
; // 0 = primary, 1 = replica, -1=none.
571 unsigned state
; // PG_STATE_*
573 bool send_notify
; ///< true if we are non-primary and should notify the primary
576 eversion_t last_update_ondisk
; // last_update that has committed; ONLY DEFINED WHEN is_active()
577 eversion_t last_complete_ondisk
; // last_complete that has committed.
578 eversion_t last_update_applied
;
581 struct C_UpdateLastRollbackInfoTrimmedToApplied
: Context
{
585 C_UpdateLastRollbackInfoTrimmedToApplied(PG
*pg
, epoch_t e
, eversion_t v
)
586 : pg(pg
), e(e
), v(v
) {}
587 void finish(int) override
{
589 if (!pg
->pg_has_reset_since(e
)) {
590 pg
->last_rollback_info_trimmed_to_applied
= v
;
595 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
596 // and the transaction has applied
597 eversion_t last_rollback_info_trimmed_to_applied
;
602 pg_shard_t pg_whoami
;
603 pg_shard_t up_primary
;
604 vector
<int> up
, acting
, want_acting
;
605 set
<pg_shard_t
> actingbackfill
, actingset
, upset
;
606 map
<pg_shard_t
,eversion_t
> peer_last_complete_ondisk
;
607 eversion_t min_last_complete_ondisk
; // up: min over last_complete_ondisk, peer_last_complete_ondisk
608 eversion_t pg_trim_to
;
610 set
<int> blocked_by
; ///< osds we are blocked by (for pg stats)
612 // [primary only] content recovery state
615 struct BufferedRecoveryMessages
{
616 map
<int, map
<spg_t
, pg_query_t
> > query_map
;
617 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > info_map
;
618 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > notify_list
;
623 map
<int, map
<spg_t
, pg_query_t
> > *query_map
;
624 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
;
625 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
;
626 set
<PGRef
> created_pgs
;
627 C_Contexts
*on_applied
;
629 ObjectStore::Transaction
*transaction
;
630 ThreadPool::TPHandle
* handle
;
631 RecoveryCtx(map
<int, map
<spg_t
, pg_query_t
> > *query_map
,
633 vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
,
635 vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
,
636 C_Contexts
*on_applied
,
638 ObjectStore::Transaction
*transaction
)
639 : query_map(query_map
), info_map(info_map
),
640 notify_list(notify_list
),
641 on_applied(on_applied
),
643 transaction(transaction
),
646 RecoveryCtx(BufferedRecoveryMessages
&buf
, RecoveryCtx
&rctx
)
647 : query_map(&(buf
.query_map
)),
648 info_map(&(buf
.info_map
)),
649 notify_list(&(buf
.notify_list
)),
650 on_applied(rctx
.on_applied
),
651 on_safe(rctx
.on_safe
),
652 transaction(rctx
.transaction
),
653 handle(rctx
.handle
) {}
655 void accept_buffered_messages(BufferedRecoveryMessages
&m
) {
659 for (map
<int, map
<spg_t
, pg_query_t
> >::iterator i
= m
.query_map
.begin();
660 i
!= m
.query_map
.end();
662 map
<spg_t
, pg_query_t
> &omap
= (*query_map
)[i
->first
];
663 for (map
<spg_t
, pg_query_t
>::iterator j
= i
->second
.begin();
664 j
!= i
->second
.end();
666 omap
[j
->first
] = j
->second
;
669 for (map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator i
670 = m
.info_map
.begin();
671 i
!= m
.info_map
.end();
673 vector
<pair
<pg_notify_t
, PastIntervals
> > &ovec
=
674 (*info_map
)[i
->first
];
675 ovec
.reserve(ovec
.size() + i
->second
.size());
676 ovec
.insert(ovec
.end(), i
->second
.begin(), i
->second
.end());
678 for (map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator i
679 = m
.notify_list
.begin();
680 i
!= m
.notify_list
.end();
682 vector
<pair
<pg_notify_t
, PastIntervals
> > &ovec
=
683 (*notify_list
)[i
->first
];
684 ovec
.reserve(ovec
.size() + i
->second
.size());
685 ovec
.insert(ovec
.end(), i
->second
.begin(), i
->second
.end());
691 PGStateHistory pgstate_history
;
694 const char *state_name
;
697 const char *get_state_name() { return state_name
; }
698 NamedState(PG
*pg_
, const char *state_name_
)
699 : state_name(state_name_
), enter_time(ceph_clock_now()), pg(pg_
) {
700 pg
->pgstate_history
.enter(pg
, enter_time
, state_name
);
702 virtual ~NamedState() { pg
->pgstate_history
.exit(state_name
); }
710 * peer_info -- projected (updates _before_ replicas ack)
711 * peer_missing -- committed (updates _after_ replicas ack)
715 set
<pg_shard_t
> stray_set
; // non-acting osds that have PG data.
716 eversion_t oldest_update
; // acting: lowest (valid) last_update in active set
717 map
<pg_shard_t
, pg_info_t
> peer_info
; // info from peers (stray or prior)
718 set
<pg_shard_t
> peer_purged
; // peers purged
719 map
<pg_shard_t
, pg_missing_t
> peer_missing
;
720 set
<pg_shard_t
> peer_log_requested
; // logs i've requested (and start stamps)
721 set
<pg_shard_t
> peer_missing_requested
;
723 // i deleted these strays; ignore racing PGInfo from them
724 set
<pg_shard_t
> peer_activated
;
726 // primary-only, recovery-only state
727 set
<pg_shard_t
> might_have_unfound
; // These osds might have objects on them
728 // which are unfound on the primary
729 epoch_t last_peering_reset
;
732 /* heartbeat peers */
733 void set_probe_targets(const set
<pg_shard_t
> &probe_set
);
734 void clear_probe_targets();
736 Mutex heartbeat_peer_lock
;
737 set
<int> heartbeat_peers
;
738 set
<int> probe_targets
;
743 * Represents the objects in a range [begin, end)
746 * 1) begin == end == hobject_t() indicates the the interval is unpopulated
747 * 2) Else, objects contains all objects in [begin, end)
749 struct BackfillInterval
{
750 // info about a backfill interval on a peer
751 eversion_t version
; /// version at which the scan occurred
752 map
<hobject_t
,eversion_t
> objects
;
758 *this = BackfillInterval();
761 /// clear objects list only
762 void clear_objects() {
766 /// reinstantiate with a new start+end position and sort order
767 void reset(hobject_t start
) {
772 /// true if there are no objects in this interval
774 return objects
.empty();
777 /// true if interval extends to the end of the range
778 bool extends_to_end() const {
782 /// removes items <= soid and adjusts begin to the first object
783 void trim_to(const hobject_t
&soid
) {
785 while (!objects
.empty() &&
786 objects
.begin()->first
<= soid
) {
791 /// Adjusts begin to the first object
793 if (!objects
.empty())
794 begin
= objects
.begin()->first
;
799 /// drop first entry, and adjust @begin accordingly
801 assert(!objects
.empty());
802 objects
.erase(objects
.begin());
807 void dump(Formatter
*f
) const {
808 f
->dump_stream("begin") << begin
;
809 f
->dump_stream("end") << end
;
810 f
->open_array_section("objects");
811 for (map
<hobject_t
, eversion_t
>::const_iterator i
=
815 f
->open_object_section("object");
816 f
->dump_stream("object") << i
->first
;
817 f
->dump_stream("version") << i
->second
;
825 BackfillInterval backfill_info
;
826 map
<pg_shard_t
, BackfillInterval
> peer_backfill_info
;
827 bool backfill_reserved
;
828 bool backfill_reserving
;
833 set
<pg_shard_t
> backfill_targets
;
835 bool is_backfill_targets(pg_shard_t osd
) {
836 return backfill_targets
.count(osd
);
842 * blocked request wait hierarchy
844 * In order to preserve request ordering we need to be careful about the
845 * order in which blocked requests get requeued. Generally speaking, we
846 * push the requests back up to the op_wq in reverse order (most recent
847 * request first) so that they come back out again in the original order.
848 * However, because there are multiple wait queues, we need to requeue
849 * waitlists in order. Generally speaking, we requeue the wait lists
850 * that are checked first.
852 * Here are the various wait lists, in the order they are used during
853 * request processing, with notes:
856 * - may start or stop blocking at any time (depending on client epoch)
857 * - waiting_for_peered
858 * - !is_peered() or flushes_in_progress
859 * - only starts blocking on interval change; never restarts
860 * - waiting_for_active
862 * - only starts blocking on interval change; never restarts
863 * - waiting_for_scrub
864 * - starts and stops blocking for varying intervals during scrub
865 * - waiting_for_unreadable_object
866 * - never restarts once object is readable (* except for EIO?)
867 * - waiting_for_degraded_object
868 * - never restarts once object is writeable (* except for EIO?)
869 * - waiting_for_blocked_object
870 * - starts and stops based on proxied op activity
872 * - starts and stops based on read/write activity
876 * 1. During and interval change, we requeue *everything* in the above order.
878 * 2. When an obc rwlock is released, we check for a scrub block and requeue
879 * the op there if it applies. We ignore the unreadable/degraded/blocked
880 * queues because we assume they cannot apply at that time (this is
881 * probably mostly true).
883 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
886 * These three behaviors are generally sufficient to maintain ordering, with
887 * the possible exception of cases where we make an object degraded or
888 * unreadable that was previously okay, e.g. when scrub or op processing
889 * encounter an unexpected error. FIXME.
893 unsigned flushes_in_progress
;
895 // ops with newer maps than our (or blocked behind them)
896 // track these by client, since inter-request ordering doesn't otherwise
898 unordered_map
<entity_name_t
,list
<OpRequestRef
>> waiting_for_map
;
900 // ops waiting on peered
901 list
<OpRequestRef
> waiting_for_peered
;
903 // ops waiting on active (require peered as well)
904 list
<OpRequestRef
> waiting_for_active
;
905 list
<OpRequestRef
> waiting_for_scrub
;
907 list
<OpRequestRef
> waiting_for_cache_not_full
;
908 list
<OpRequestRef
> waiting_for_clean_to_primary_repair
;
909 map
<hobject_t
, list
<OpRequestRef
>> waiting_for_unreadable_object
,
910 waiting_for_degraded_object
,
911 waiting_for_blocked_object
;
913 set
<hobject_t
> objects_blocked_on_cache_full
;
914 map
<hobject_t
,snapid_t
> objects_blocked_on_degraded_snap
;
915 map
<hobject_t
,ObjectContextRef
> objects_blocked_on_snap_promotion
;
917 // Callbacks should assume pg (and nothing else) is locked
918 map
<hobject_t
, list
<Context
*>> callbacks_for_degraded_object
;
921 list
<pair
<OpRequestRef
, version_t
> > > waiting_for_ondisk
;
923 void requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
);
924 void requeue_op(OpRequestRef op
);
925 void requeue_ops(list
<OpRequestRef
> &l
);
927 // stats that persist lazily
928 object_stat_collection_t unstable_stats
;
931 Mutex pg_stats_publish_lock
;
932 bool pg_stats_publish_valid
;
933 pg_stat_t pg_stats_publish
;
935 // for ordering writes
936 ceph::shared_ptr
<ObjectStore::Sequencer
> osr
;
938 void _update_calc_stats();
939 void _update_blocked_by();
940 void publish_stats_to_osd();
941 void clear_publish_stats();
944 void clear_primary_state();
946 bool is_actingbackfill(pg_shard_t osd
) const {
947 return actingbackfill
.count(osd
);
949 bool is_acting(pg_shard_t osd
) const {
950 return has_shard(pool
.info
.ec_pool(), acting
, osd
);
952 bool is_up(pg_shard_t osd
) const {
953 return has_shard(pool
.info
.ec_pool(), up
, osd
);
955 static bool has_shard(bool ec
, const vector
<int>& v
, pg_shard_t osd
) {
957 return v
.size() > (unsigned)osd
.shard
&& v
[osd
.shard
] == osd
.osd
;
959 return std::find(v
.begin(), v
.end(), osd
.osd
) != v
.end();
963 bool needs_recovery() const;
964 bool needs_backfill() const;
966 /// get log recovery reservation priority
967 unsigned get_recovery_priority();
968 /// get backfill reservation priority
969 unsigned get_backfill_priority();
971 void mark_clean(); ///< mark an active pg clean
973 /// return [start,end) bounds for required past_intervals
974 static pair
<epoch_t
, epoch_t
> get_required_past_interval_bounds(
975 const pg_info_t
&info
,
976 epoch_t oldest_map
) {
978 info
.history
.last_epoch_clean
? info
.history
.last_epoch_clean
:
979 info
.history
.epoch_pool_created
,
982 info
.history
.same_interval_since
,
983 info
.history
.epoch_pool_created
);
984 return make_pair(start
, end
);
986 void check_past_interval_bounds() const;
987 PastIntervals::PriorSet
build_prior();
989 void remove_down_peer_info(const OSDMapRef osdmap
);
991 bool adjust_need_up_thru(const OSDMapRef osdmap
);
993 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const;
994 virtual void dump_recovery_info(Formatter
*f
) const = 0;
996 bool calc_min_last_complete_ondisk() {
997 eversion_t min
= last_complete_ondisk
;
998 assert(!actingbackfill
.empty());
999 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1000 i
!= actingbackfill
.end();
1002 if (*i
== get_primary()) continue;
1003 if (peer_last_complete_ondisk
.count(*i
) == 0)
1004 return false; // we don't have complete info
1005 eversion_t a
= peer_last_complete_ondisk
[*i
];
1009 if (min
== min_last_complete_ondisk
)
1011 min_last_complete_ondisk
= min
;
1015 virtual void calc_trim_to() = 0;
1017 void proc_replica_log(pg_info_t
&oinfo
, const pg_log_t
&olog
,
1018 pg_missing_t
& omissing
, pg_shard_t from
);
1019 void proc_master_log(ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
,
1020 pg_missing_t
& omissing
, pg_shard_t from
);
1021 bool proc_replica_info(
1022 pg_shard_t from
, const pg_info_t
&info
, epoch_t send_epoch
);
1024 struct PGLogEntryHandler
: public PGLog::LogEntryHandler
{
1026 ObjectStore::Transaction
*t
;
1027 PGLogEntryHandler(PG
*pg
, ObjectStore::Transaction
*t
) : pg(pg
), t(t
) {}
1030 void remove(const hobject_t
&hoid
) override
{
1031 pg
->get_pgbackend()->remove(hoid
, t
);
1033 void try_stash(const hobject_t
&hoid
, version_t v
) override
{
1034 pg
->get_pgbackend()->try_stash(hoid
, v
, t
);
1036 void rollback(const pg_log_entry_t
&entry
) override
{
1037 assert(entry
.can_rollback());
1038 pg
->get_pgbackend()->rollback(entry
, t
);
1040 void rollforward(const pg_log_entry_t
&entry
) override
{
1041 pg
->get_pgbackend()->rollforward(entry
, t
);
1043 void trim(const pg_log_entry_t
&entry
) override
{
1044 pg
->get_pgbackend()->trim(entry
, t
);
1048 void update_object_snap_mapping(
1049 ObjectStore::Transaction
*t
, const hobject_t
&soid
,
1050 const set
<snapid_t
> &snaps
);
1051 void clear_object_snap_mapping(
1052 ObjectStore::Transaction
*t
, const hobject_t
&soid
);
1053 void remove_snap_mapped_object(
1054 ObjectStore::Transaction
& t
, const hobject_t
& soid
);
1056 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
1057 pg_log_t
&olog
, pg_shard_t from
);
1058 void rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
);
1059 bool search_for_missing(
1060 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
1064 void check_for_lost_objects();
1065 void forget_lost_objects();
1067 void discover_all_missing(std::map
<int, map
<spg_t
,pg_query_t
> > &query_map
);
1069 void trim_write_ahead();
1071 map
<pg_shard_t
, pg_info_t
>::const_iterator
find_best_info(
1072 const map
<pg_shard_t
, pg_info_t
> &infos
,
1073 bool restrict_to_up_acting
,
1074 bool *history_les_bound
) const;
1075 static void calc_ec_acting(
1076 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1078 const vector
<int> &acting
,
1079 pg_shard_t acting_primary
,
1080 const vector
<int> &up
,
1081 pg_shard_t up_primary
,
1082 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1083 bool restrict_to_up_acting
,
1085 set
<pg_shard_t
> *backfill
,
1086 set
<pg_shard_t
> *acting_backfill
,
1087 pg_shard_t
*want_primary
,
1089 static void calc_replicated_acting(
1090 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1092 const vector
<int> &acting
,
1093 pg_shard_t acting_primary
,
1094 const vector
<int> &up
,
1095 pg_shard_t up_primary
,
1096 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1097 bool restrict_to_up_acting
,
1099 set
<pg_shard_t
> *backfill
,
1100 set
<pg_shard_t
> *acting_backfill
,
1101 pg_shard_t
*want_primary
,
1103 bool choose_acting(pg_shard_t
&auth_log_shard
,
1104 bool restrict_to_up_acting
,
1105 bool *history_les_bound
);
1106 void build_might_have_unfound();
1108 ObjectStore::Transaction
& t
,
1109 epoch_t activation_epoch
,
1110 list
<Context
*>& tfin
,
1111 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1113 vector
<pair
<pg_notify_t
, PastIntervals
> > > *activator_map
,
1115 void _activate_committed(epoch_t epoch
, epoch_t activation_epoch
);
1116 void all_activated_and_committed();
1118 void proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&info
);
1120 bool have_unfound() const {
1121 return missing_loc
.have_unfound();
1123 uint64_t get_num_unfound() const {
1124 return missing_loc
.num_unfound();
1127 virtual void check_local() = 0;
1130 * @param ops_begun returns how many recovery ops the function started
1131 * @returns true if any useful work was accomplished; false otherwise
1133 virtual bool start_recovery_ops(
1135 ThreadPool::TPHandle
&handle
,
1136 uint64_t *ops_begun
) = 0;
1138 void purge_strays();
1140 void update_heartbeat_peers();
1142 Context
*finish_sync_event
;
1144 void finish_recovery(list
<Context
*>& tfin
);
1145 void _finish_recovery(Context
*c
);
1146 void cancel_recovery();
1147 void clear_recovery_state();
1148 virtual void _clear_recovery_state() = 0;
1149 virtual void check_recovery_sources(const OSDMapRef
& newmap
) = 0;
1150 void start_recovery_op(const hobject_t
& soid
);
1151 void finish_recovery_op(const hobject_t
& soid
, bool dequeue
=false);
1153 void split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
);
1154 virtual void _split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
) = 0;
1156 friend class C_OSD_RepModify_Commit
;
1159 Mutex backoff_lock
; // orders inside Backoff::lock
1160 map
<hobject_t
,set
<BackoffRef
>> backoffs
;
1162 void add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
);
1163 void release_backoffs(const hobject_t
& begin
, const hobject_t
& end
);
1164 void release_backoffs(const hobject_t
& o
) {
1165 release_backoffs(o
, o
);
1167 void clear_backoffs();
1169 void add_pg_backoff(SessionRef s
) {
1170 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1171 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1172 add_backoff(s
, begin
, end
);
1174 void release_pg_backoffs() {
1175 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1176 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1177 release_backoffs(begin
, end
);
1180 void rm_backoff(BackoffRef b
);
1188 set
<pg_shard_t
> reserved_peers
;
1189 bool reserved
, reserve_failed
;
1190 epoch_t epoch_start
;
1192 // common to both scrubs
1195 set
<pg_shard_t
> waiting_on_whom
;
1199 ScrubMap primary_scrubmap
;
1200 map
<pg_shard_t
, ScrubMap
> received_maps
;
1201 OpRequestRef active_rep_scrub
;
1202 utime_t scrub_reg_stamp
; // stamp we registered for
1205 bool sleeping
= false;
1206 bool needs_sleep
= true;
1207 utime_t sleep_start
;
1209 // flags to indicate explicitly requested scrubs (by admin)
1210 bool must_scrub
, must_deep_scrub
, must_repair
;
1212 // Priority to use for scrub scheduling
1215 // this flag indicates whether we would like to do auto-repair of the PG or not
1218 // Maps from objects with errors to missing/inconsistent peers
1219 map
<hobject_t
, set
<pg_shard_t
>> missing
;
1220 map
<hobject_t
, set
<pg_shard_t
>> inconsistent
;
1222 // Map from object with errors to good peers
1223 map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >> authoritative
;
1225 // Cleaned map pending snap metadata scrub
1226 ScrubMap cleaned_meta_map
;
1228 // digest updates which we are waiting on
1229 int num_digest_updates_pending
;
1232 hobject_t start
, end
;
1233 eversion_t subset_last_update
;
1235 // chunky scrub state
1244 WAIT_DIGEST_UPDATES
,
1248 std::unique_ptr
<Scrub::Store
> store
;
1253 list
<Context
*> callbacks
;
1254 void add_callback(Context
*context
) {
1255 callbacks
.push_back(context
);
1257 void run_callbacks() {
1258 list
<Context
*> to_run
;
1259 to_run
.swap(callbacks
);
1260 for (list
<Context
*>::iterator i
= to_run
.begin();
1267 static const char *state_string(const PG::Scrubber::State
& state
) {
1268 const char *ret
= NULL
;
1271 case INACTIVE
: ret
= "INACTIVE"; break;
1272 case NEW_CHUNK
: ret
= "NEW_CHUNK"; break;
1273 case WAIT_PUSHES
: ret
= "WAIT_PUSHES"; break;
1274 case WAIT_LAST_UPDATE
: ret
= "WAIT_LAST_UPDATE"; break;
1275 case BUILD_MAP
: ret
= "BUILD_MAP"; break;
1276 case WAIT_REPLICAS
: ret
= "WAIT_REPLICAS"; break;
1277 case COMPARE_MAPS
: ret
= "COMPARE_MAPS"; break;
1278 case WAIT_DIGEST_UPDATES
: ret
= "WAIT_DIGEST_UPDATES"; break;
1279 case FINISH
: ret
= "FINISH"; break;
1284 bool is_chunky_scrub_active() const { return state
!= INACTIVE
; }
1286 // classic (non chunk) scrubs block all writes
1287 // chunky scrubs only block writes to a range
1288 bool write_blocked_by_scrub(const hobject_t
&soid
) {
1289 return (soid
>= start
&& soid
< end
);
1296 waiting_on_whom
.clear();
1297 if (active_rep_scrub
) {
1298 active_rep_scrub
= OpRequestRef();
1300 received_maps
.clear();
1303 must_deep_scrub
= false;
1304 must_repair
= false;
1305 auto_repair
= false;
1307 state
= PG::Scrubber::INACTIVE
;
1308 start
= hobject_t();
1310 subset_last_update
= eversion_t();
1317 inconsistent
.clear();
1319 authoritative
.clear();
1320 num_digest_updates_pending
= 0;
1321 cleaned_meta_map
= ScrubMap();
1324 sleep_start
= utime_t();
1327 void create_results(const hobject_t
& obj
);
1328 void cleanup_store(ObjectStore::Transaction
*t
);
1331 bool scrub_after_recovery
;
1336 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
1337 pg_shard_t bad_peer
);
1339 void scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
);
1340 void chunky_scrub(ThreadPool::TPHandle
&handle
);
1341 void scrub_compare_maps();
1343 * return true if any inconsistency/missing is repaired, false otherwise
1345 bool scrub_process_inconsistent();
1346 bool ops_blocked_by_scrub() const;
1347 void scrub_finish();
1348 void scrub_clear_state();
1349 void _scan_snaps(ScrubMap
&map
);
1350 void _repair_oinfo_oid(ScrubMap
&map
);
1351 void _scan_rollback_obs(
1352 const vector
<ghobject_t
> &rollback_obs
,
1353 ThreadPool::TPHandle
&handle
);
1354 void _request_scrub_map(pg_shard_t replica
, eversion_t version
,
1355 hobject_t start
, hobject_t end
, bool deep
,
1357 int build_scrub_map_chunk(
1359 hobject_t start
, hobject_t end
, bool deep
, uint32_t seed
,
1360 ThreadPool::TPHandle
&handle
);
1362 * returns true if [begin, end) is good to scrub at this time
1363 * a false return value obliges the implementer to requeue scrub when the
1364 * condition preventing scrub clears
1366 virtual bool _range_available_for_scrub(
1367 const hobject_t
&begin
, const hobject_t
&end
) = 0;
1368 virtual void scrub_snapshot_metadata(
1370 const std::map
<hobject_t
, pair
<uint32_t, uint32_t>> &missing_digest
) { }
1371 virtual void _scrub_clear_state() { }
1372 virtual void _scrub_finish() { }
1373 virtual void split_colls(
1377 const pg_pool_t
*pool
,
1378 ObjectStore::Transaction
*t
) = 0;
1379 void clear_scrub_reserved();
1380 void scrub_reserve_replicas();
1381 void scrub_unreserve_replicas();
1382 bool scrub_all_replicas_reserved() const;
1384 void reg_next_scrub();
1385 void unreg_next_scrub();
1389 ThreadPool::TPHandle
&handle
);
1390 void do_replica_scrub_map(OpRequestRef op
);
1391 void sub_op_scrub_map(OpRequestRef op
);
1393 void handle_scrub_reserve_request(OpRequestRef op
);
1394 void handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
);
1395 void handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
);
1396 void handle_scrub_reserve_release(OpRequestRef op
);
1398 void reject_reservation();
1399 void schedule_backfill_full_retry();
1400 void schedule_recovery_full_retry();
1402 // -- recovery state --
1404 template <class EVT
>
1405 struct QueuePeeringEvt
: Context
{
1409 QueuePeeringEvt(PG
*pg
, epoch_t epoch
, EVT evt
) :
1410 pg(pg
), epoch(epoch
), evt(evt
) {}
1411 void finish(int r
) override
{
1413 pg
->queue_peering_event(PG::CephPeeringEvtRef(
1414 new PG::CephPeeringEvt(
1422 class CephPeeringEvt
{
1424 epoch_t epoch_requested
;
1425 boost::intrusive_ptr
< const boost::statechart::event_base
> evt
;
1428 MEMPOOL_CLASS_HELPERS();
1430 CephPeeringEvt(epoch_t epoch_sent
,
1431 epoch_t epoch_requested
,
1433 epoch_sent(epoch_sent
), epoch_requested(epoch_requested
),
1434 evt(evt_
.intrusive_from_this()) {
1436 out
<< "epoch_sent: " << epoch_sent
1437 << " epoch_requested: " << epoch_requested
<< " ";
1441 epoch_t
get_epoch_sent() { return epoch_sent
; }
1442 epoch_t
get_epoch_requested() { return epoch_requested
; }
1443 const boost::statechart::event_base
&get_event() { return *evt
; }
1444 string
get_desc() { return desc
; }
1446 typedef ceph::shared_ptr
<CephPeeringEvt
> CephPeeringEvtRef
;
1447 list
<CephPeeringEvtRef
> peering_queue
; // op queue
1448 list
<CephPeeringEvtRef
> peering_waiters
;
1450 struct QueryState
: boost::statechart::event
< QueryState
> {
1452 explicit QueryState(Formatter
*f
) : f(f
) {}
1453 void print(std::ostream
*out
) const {
1458 struct MInfoRec
: boost::statechart::event
< MInfoRec
> {
1462 MInfoRec(pg_shard_t from
, const pg_info_t
&info
, epoch_t msg_epoch
) :
1463 from(from
), info(info
), msg_epoch(msg_epoch
) {}
1464 void print(std::ostream
*out
) const {
1465 *out
<< "MInfoRec from " << from
<< " info: " << info
;
1469 struct MLogRec
: boost::statechart::event
< MLogRec
> {
1471 boost::intrusive_ptr
<MOSDPGLog
> msg
;
1472 MLogRec(pg_shard_t from
, MOSDPGLog
*msg
) :
1473 from(from
), msg(msg
) {}
1474 void print(std::ostream
*out
) const {
1475 *out
<< "MLogRec from " << from
;
1479 struct MNotifyRec
: boost::statechart::event
< MNotifyRec
> {
1483 MNotifyRec(pg_shard_t from
, const pg_notify_t
¬ify
, uint64_t f
) :
1484 from(from
), notify(notify
), features(f
) {}
1485 void print(std::ostream
*out
) const {
1486 *out
<< "MNotifyRec from " << from
<< " notify: " << notify
1487 << " features: 0x" << hex
<< features
<< dec
;
1491 struct MQuery
: boost::statechart::event
< MQuery
> {
1494 epoch_t query_epoch
;
1495 MQuery(pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
):
1496 from(from
), query(query
), query_epoch(query_epoch
) {}
1497 void print(std::ostream
*out
) const {
1498 *out
<< "MQuery from " << from
1499 << " query_epoch " << query_epoch
1500 << " query: " << query
;
1504 struct AdvMap
: boost::statechart::event
< AdvMap
> {
1507 vector
<int> newup
, newacting
;
1508 int up_primary
, acting_primary
;
1510 OSDMapRef osdmap
, OSDMapRef lastmap
,
1511 vector
<int>& newup
, int up_primary
,
1512 vector
<int>& newacting
, int acting_primary
):
1513 osdmap(osdmap
), lastmap(lastmap
),
1515 newacting(newacting
),
1516 up_primary(up_primary
),
1517 acting_primary(acting_primary
) {}
1518 void print(std::ostream
*out
) const {
1523 struct ActMap
: boost::statechart::event
< ActMap
> {
1524 ActMap() : boost::statechart::event
< ActMap
>() {}
1525 void print(std::ostream
*out
) const {
1529 struct Activate
: boost::statechart::event
< Activate
> {
1530 epoch_t activation_epoch
;
1531 explicit Activate(epoch_t q
) : boost::statechart::event
< Activate
>(),
1532 activation_epoch(q
) {}
1533 void print(std::ostream
*out
) const {
1534 *out
<< "Activate from " << activation_epoch
;
1537 struct RequestBackfillPrio
: boost::statechart::event
< RequestBackfillPrio
> {
1539 explicit RequestBackfillPrio(unsigned prio
) :
1540 boost::statechart::event
< RequestBackfillPrio
>(),
1542 void print(std::ostream
*out
) const {
1543 *out
<< "RequestBackfillPrio: priority " << priority
;
1546 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1547 T() : boost::statechart::event< T >() {} \
1548 void print(std::ostream *out) const { \
1552 TrivialEvent(Initialize
)
1554 TrivialEvent(GotInfo
)
1555 TrivialEvent(NeedUpThru
)
1556 TrivialEvent(NullEvt
)
1557 TrivialEvent(FlushedEvt
)
1558 TrivialEvent(Backfilled
)
1559 TrivialEvent(LocalBackfillReserved
)
1560 TrivialEvent(RemoteBackfillReserved
)
1561 TrivialEvent(RemoteReservationRejected
)
1562 TrivialEvent(CancelBackfill
)
1563 TrivialEvent(RequestBackfill
)
1564 TrivialEvent(RequestRecovery
)
1565 TrivialEvent(RecoveryDone
)
1566 TrivialEvent(BackfillTooFull
)
1567 TrivialEvent(RecoveryTooFull
)
1568 TrivialEvent(CancelRecovery
)
1570 TrivialEvent(AllReplicasRecovered
)
1571 TrivialEvent(DoRecovery
)
1572 TrivialEvent(LocalRecoveryReserved
)
1573 TrivialEvent(RemoteRecoveryReserved
)
1574 TrivialEvent(AllRemotesReserved
)
1575 TrivialEvent(AllBackfillsReserved
)
1576 TrivialEvent(GoClean
)
1578 TrivialEvent(AllReplicasActivated
)
1580 TrivialEvent(IntervalFlush
)
1582 /* Encapsulates PG recovery process */
1583 class RecoveryState
{
1584 void start_handle(RecoveryCtx
*new_ctx
);
1587 void begin_block_outgoing();
1588 void end_block_outgoing();
1589 void clear_blocked_outgoing();
1594 class RecoveryMachine
: public boost::statechart::state_machine
< RecoveryMachine
, Initial
> {
1595 RecoveryState
*state
;
1600 uint64_t event_count
;
1602 void clear_event_counters() {
1603 event_time
= utime_t();
1607 void log_enter(const char *state_name
);
1608 void log_exit(const char *state_name
, utime_t duration
);
1610 RecoveryMachine(RecoveryState
*state
, PG
*pg
) : state(state
), pg(pg
), event_count(0) {}
1612 /* Accessor functions for state methods */
1613 ObjectStore::Transaction
* get_cur_transaction() {
1614 assert(state
->rctx
);
1615 assert(state
->rctx
->transaction
);
1616 return state
->rctx
->transaction
;
1619 void send_query(pg_shard_t to
, const pg_query_t
&query
) {
1620 assert(state
->rctx
);
1621 assert(state
->rctx
->query_map
);
1622 (*state
->rctx
->query_map
)[to
.osd
][spg_t(pg
->info
.pgid
.pgid
, to
.shard
)] =
1626 map
<int, map
<spg_t
, pg_query_t
> > *get_query_map() {
1627 assert(state
->rctx
);
1628 assert(state
->rctx
->query_map
);
1629 return state
->rctx
->query_map
;
1632 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *get_info_map() {
1633 assert(state
->rctx
);
1634 assert(state
->rctx
->info_map
);
1635 return state
->rctx
->info_map
;
1638 list
< Context
* > *get_on_safe_context_list() {
1639 assert(state
->rctx
);
1640 assert(state
->rctx
->on_safe
);
1641 return &(state
->rctx
->on_safe
->contexts
);
1644 list
< Context
* > *get_on_applied_context_list() {
1645 assert(state
->rctx
);
1646 assert(state
->rctx
->on_applied
);
1647 return &(state
->rctx
->on_applied
->contexts
);
1650 RecoveryCtx
*get_recovery_ctx() { return &*(state
->rctx
); }
1652 void send_notify(pg_shard_t to
,
1653 const pg_notify_t
&info
, const PastIntervals
&pi
) {
1654 assert(state
->rctx
);
1655 assert(state
->rctx
->notify_list
);
1656 (*state
->rctx
->notify_list
)[to
.osd
].push_back(make_pair(info
, pi
));
1659 friend class RecoveryMachine
;
1663 struct Crashed
: boost::statechart::state
< Crashed
, RecoveryMachine
>, NamedState
{
1664 explicit Crashed(my_context ctx
);
1669 struct Initial
: boost::statechart::state
< Initial
, RecoveryMachine
>, NamedState
{
1670 explicit Initial(my_context ctx
);
1673 typedef boost::mpl::list
<
1674 boost::statechart::transition
< Initialize
, Reset
>,
1675 boost::statechart::custom_reaction
< Load
>,
1676 boost::statechart::custom_reaction
< NullEvt
>,
1677 boost::statechart::transition
< boost::statechart::event_base
, Crashed
>
1680 boost::statechart::result
react(const Load
&);
1681 boost::statechart::result
react(const MNotifyRec
&);
1682 boost::statechart::result
react(const MInfoRec
&);
1683 boost::statechart::result
react(const MLogRec
&);
1684 boost::statechart::result
react(const boost::statechart::event_base
&) {
1685 return discard_event();
1689 struct Reset
: boost::statechart::state
< Reset
, RecoveryMachine
>, NamedState
{
1690 explicit Reset(my_context ctx
);
1693 typedef boost::mpl::list
<
1694 boost::statechart::custom_reaction
< QueryState
>,
1695 boost::statechart::custom_reaction
< AdvMap
>,
1696 boost::statechart::custom_reaction
< ActMap
>,
1697 boost::statechart::custom_reaction
< NullEvt
>,
1698 boost::statechart::custom_reaction
< FlushedEvt
>,
1699 boost::statechart::custom_reaction
< IntervalFlush
>,
1700 boost::statechart::transition
< boost::statechart::event_base
, Crashed
>
1702 boost::statechart::result
react(const QueryState
& q
);
1703 boost::statechart::result
react(const AdvMap
&);
1704 boost::statechart::result
react(const ActMap
&);
1705 boost::statechart::result
react(const FlushedEvt
&);
1706 boost::statechart::result
react(const IntervalFlush
&);
1707 boost::statechart::result
react(const boost::statechart::event_base
&) {
1708 return discard_event();
1714 struct Started
: boost::statechart::state
< Started
, RecoveryMachine
, Start
>, NamedState
{
1715 explicit Started(my_context ctx
);
1718 typedef boost::mpl::list
<
1719 boost::statechart::custom_reaction
< QueryState
>,
1720 boost::statechart::custom_reaction
< AdvMap
>,
1721 boost::statechart::custom_reaction
< NullEvt
>,
1722 boost::statechart::custom_reaction
< FlushedEvt
>,
1723 boost::statechart::custom_reaction
< IntervalFlush
>,
1724 boost::statechart::transition
< boost::statechart::event_base
, Crashed
>
1726 boost::statechart::result
react(const QueryState
& q
);
1727 boost::statechart::result
react(const AdvMap
&);
1728 boost::statechart::result
react(const FlushedEvt
&);
1729 boost::statechart::result
react(const IntervalFlush
&);
1730 boost::statechart::result
react(const boost::statechart::event_base
&) {
1731 return discard_event();
1735 struct MakePrimary
: boost::statechart::event
< MakePrimary
> {
1736 MakePrimary() : boost::statechart::event
< MakePrimary
>() {}
1738 struct MakeStray
: boost::statechart::event
< MakeStray
> {
1739 MakeStray() : boost::statechart::event
< MakeStray
>() {}
1744 struct Start
: boost::statechart::state
< Start
, Started
>, NamedState
{
1745 explicit Start(my_context ctx
);
1748 typedef boost::mpl::list
<
1749 boost::statechart::transition
< MakePrimary
, Primary
>,
1750 boost::statechart::transition
< MakeStray
, Stray
>
1755 struct WaitActingChange
;
1756 struct NeedActingChange
: boost::statechart::event
< NeedActingChange
> {
1757 NeedActingChange() : boost::statechart::event
< NeedActingChange
>() {}
1760 struct IsIncomplete
: boost::statechart::event
< IsIncomplete
> {
1761 IsIncomplete() : boost::statechart::event
< IsIncomplete
>() {}
1764 struct IsDown
: boost::statechart::event
< IsDown
> {
1765 IsDown() : boost::statechart::event
< IsDown
>() {}
1768 struct Primary
: boost::statechart::state
< Primary
, Started
, Peering
>, NamedState
{
1769 explicit Primary(my_context ctx
);
1772 typedef boost::mpl::list
<
1773 boost::statechart::custom_reaction
< ActMap
>,
1774 boost::statechart::custom_reaction
< MNotifyRec
>,
1775 boost::statechart::transition
< NeedActingChange
, WaitActingChange
>
1777 boost::statechart::result
react(const ActMap
&);
1778 boost::statechart::result
react(const MNotifyRec
&);
1781 struct WaitActingChange
: boost::statechart::state
< WaitActingChange
, Primary
>,
1783 typedef boost::mpl::list
<
1784 boost::statechart::custom_reaction
< QueryState
>,
1785 boost::statechart::custom_reaction
< AdvMap
>,
1786 boost::statechart::custom_reaction
< MLogRec
>,
1787 boost::statechart::custom_reaction
< MInfoRec
>,
1788 boost::statechart::custom_reaction
< MNotifyRec
>
1790 explicit WaitActingChange(my_context ctx
);
1791 boost::statechart::result
react(const QueryState
& q
);
1792 boost::statechart::result
react(const AdvMap
&);
1793 boost::statechart::result
react(const MLogRec
&);
1794 boost::statechart::result
react(const MInfoRec
&);
1795 boost::statechart::result
react(const MNotifyRec
&);
1802 struct Peering
: boost::statechart::state
< Peering
, Primary
, GetInfo
>, NamedState
{
1803 PastIntervals::PriorSet prior_set
;
1804 bool history_les_bound
; //< need osd_find_best_info_ignore_history_les
1806 explicit Peering(my_context ctx
);
1809 typedef boost::mpl::list
<
1810 boost::statechart::custom_reaction
< QueryState
>,
1811 boost::statechart::transition
< Activate
, Active
>,
1812 boost::statechart::custom_reaction
< AdvMap
>
1814 boost::statechart::result
react(const QueryState
& q
);
1815 boost::statechart::result
react(const AdvMap
&advmap
);
1818 struct WaitLocalRecoveryReserved
;
1820 struct Active
: boost::statechart::state
< Active
, Primary
, Activating
>, NamedState
{
1821 explicit Active(my_context ctx
);
1824 const set
<pg_shard_t
> remote_shards_to_reserve_recovery
;
1825 const set
<pg_shard_t
> remote_shards_to_reserve_backfill
;
1826 bool all_replicas_activated
;
1828 typedef boost::mpl::list
<
1829 boost::statechart::custom_reaction
< QueryState
>,
1830 boost::statechart::custom_reaction
< ActMap
>,
1831 boost::statechart::custom_reaction
< AdvMap
>,
1832 boost::statechart::custom_reaction
< MInfoRec
>,
1833 boost::statechart::custom_reaction
< MNotifyRec
>,
1834 boost::statechart::custom_reaction
< MLogRec
>,
1835 boost::statechart::custom_reaction
< Backfilled
>,
1836 boost::statechart::custom_reaction
< AllReplicasActivated
>
1838 boost::statechart::result
react(const QueryState
& q
);
1839 boost::statechart::result
react(const ActMap
&);
1840 boost::statechart::result
react(const AdvMap
&);
1841 boost::statechart::result
react(const MInfoRec
& infoevt
);
1842 boost::statechart::result
react(const MNotifyRec
& notevt
);
1843 boost::statechart::result
react(const MLogRec
& logevt
);
1844 boost::statechart::result
react(const Backfilled
&) {
1845 return discard_event();
1847 boost::statechart::result
react(const AllReplicasActivated
&);
1850 struct Clean
: boost::statechart::state
< Clean
, Active
>, NamedState
{
1851 typedef boost::mpl::list
<
1852 boost::statechart::transition
< DoRecovery
, WaitLocalRecoveryReserved
>
1854 explicit Clean(my_context ctx
);
1858 struct Recovered
: boost::statechart::state
< Recovered
, Active
>, NamedState
{
1859 typedef boost::mpl::list
<
1860 boost::statechart::transition
< GoClean
, Clean
>,
1861 boost::statechart::transition
< DoRecovery
, WaitLocalRecoveryReserved
>,
1862 boost::statechart::custom_reaction
< AllReplicasActivated
>
1864 explicit Recovered(my_context ctx
);
1866 boost::statechart::result
react(const AllReplicasActivated
&) {
1867 post_event(GoClean());
1868 return forward_event();
1872 struct Backfilling
: boost::statechart::state
< Backfilling
, Active
>, NamedState
{
1873 typedef boost::mpl::list
<
1874 boost::statechart::transition
< Backfilled
, Recovered
>,
1875 boost::statechart::custom_reaction
< CancelBackfill
>,
1876 boost::statechart::custom_reaction
< RemoteReservationRejected
>
1878 explicit Backfilling(my_context ctx
);
1879 boost::statechart::result
react(const RemoteReservationRejected
& evt
);
1880 boost::statechart::result
react(const CancelBackfill
& evt
);
1884 struct WaitRemoteBackfillReserved
: boost::statechart::state
< WaitRemoteBackfillReserved
, Active
>, NamedState
{
1885 typedef boost::mpl::list
<
1886 boost::statechart::custom_reaction
< RemoteBackfillReserved
>,
1887 boost::statechart::custom_reaction
< RemoteReservationRejected
>,
1888 boost::statechart::transition
< AllBackfillsReserved
, Backfilling
>
1890 set
<pg_shard_t
>::const_iterator backfill_osd_it
;
1891 explicit WaitRemoteBackfillReserved(my_context ctx
);
1893 boost::statechart::result
react(const RemoteBackfillReserved
& evt
);
1894 boost::statechart::result
react(const RemoteReservationRejected
& evt
);
1897 struct WaitLocalBackfillReserved
: boost::statechart::state
< WaitLocalBackfillReserved
, Active
>, NamedState
{
1898 typedef boost::mpl::list
<
1899 boost::statechart::transition
< LocalBackfillReserved
, WaitRemoteBackfillReserved
>
1901 explicit WaitLocalBackfillReserved(my_context ctx
);
1905 struct NotBackfilling
: boost::statechart::state
< NotBackfilling
, Active
>, NamedState
{
1906 typedef boost::mpl::list
<
1907 boost::statechart::transition
< RequestBackfill
, WaitLocalBackfillReserved
>,
1908 boost::statechart::custom_reaction
< RemoteBackfillReserved
>,
1909 boost::statechart::custom_reaction
< RemoteReservationRejected
>
1911 explicit NotBackfilling(my_context ctx
);
1913 boost::statechart::result
react(const RemoteBackfillReserved
& evt
);
1914 boost::statechart::result
react(const RemoteReservationRejected
& evt
);
1917 struct NotRecovering
: boost::statechart::state
< NotRecovering
, Active
>, NamedState
{
1918 typedef boost::mpl::list
<
1919 boost::statechart::transition
< DoRecovery
, WaitLocalRecoveryReserved
>
1921 explicit NotRecovering(my_context ctx
);
1925 struct RepNotRecovering
;
1926 struct ReplicaActive
: boost::statechart::state
< ReplicaActive
, Started
, RepNotRecovering
>, NamedState
{
1927 explicit ReplicaActive(my_context ctx
);
1930 typedef boost::mpl::list
<
1931 boost::statechart::custom_reaction
< QueryState
>,
1932 boost::statechart::custom_reaction
< ActMap
>,
1933 boost::statechart::custom_reaction
< MQuery
>,
1934 boost::statechart::custom_reaction
< MInfoRec
>,
1935 boost::statechart::custom_reaction
< MLogRec
>,
1936 boost::statechart::custom_reaction
< Activate
>
1938 boost::statechart::result
react(const QueryState
& q
);
1939 boost::statechart::result
react(const MInfoRec
& infoevt
);
1940 boost::statechart::result
react(const MLogRec
& logevt
);
1941 boost::statechart::result
react(const ActMap
&);
1942 boost::statechart::result
react(const MQuery
&);
1943 boost::statechart::result
react(const Activate
&);
1946 struct RepRecovering
: boost::statechart::state
< RepRecovering
, ReplicaActive
>, NamedState
{
1947 typedef boost::mpl::list
<
1948 boost::statechart::transition
< RecoveryDone
, RepNotRecovering
>,
1949 boost::statechart::transition
< RemoteReservationRejected
, RepNotRecovering
>,
1950 boost::statechart::custom_reaction
< BackfillTooFull
>
1952 explicit RepRecovering(my_context ctx
);
1953 boost::statechart::result
react(const BackfillTooFull
&evt
);
1957 struct RepWaitBackfillReserved
: boost::statechart::state
< RepWaitBackfillReserved
, ReplicaActive
>, NamedState
{
1958 typedef boost::mpl::list
<
1959 boost::statechart::custom_reaction
< RemoteBackfillReserved
>,
1960 boost::statechart::custom_reaction
< RemoteReservationRejected
>
1962 explicit RepWaitBackfillReserved(my_context ctx
);
1964 boost::statechart::result
react(const RemoteBackfillReserved
&evt
);
1965 boost::statechart::result
react(const RemoteReservationRejected
&evt
);
1968 struct RepWaitRecoveryReserved
: boost::statechart::state
< RepWaitRecoveryReserved
, ReplicaActive
>, NamedState
{
1969 typedef boost::mpl::list
<
1970 boost::statechart::custom_reaction
< RemoteRecoveryReserved
>
1972 explicit RepWaitRecoveryReserved(my_context ctx
);
1974 boost::statechart::result
react(const RemoteRecoveryReserved
&evt
);
1977 struct RepNotRecovering
: boost::statechart::state
< RepNotRecovering
, ReplicaActive
>, NamedState
{
1978 typedef boost::mpl::list
<
1979 boost::statechart::custom_reaction
< RequestBackfillPrio
>,
1980 boost::statechart::transition
< RequestRecovery
, RepWaitRecoveryReserved
>,
1981 boost::statechart::transition
< RecoveryDone
, RepNotRecovering
> // for compat with pre-reservation peers
1983 explicit RepNotRecovering(my_context ctx
);
1984 boost::statechart::result
react(const RequestBackfillPrio
&evt
);
1988 struct Recovering
: boost::statechart::state
< Recovering
, Active
>, NamedState
{
1989 typedef boost::mpl::list
<
1990 boost::statechart::custom_reaction
< AllReplicasRecovered
>,
1991 boost::statechart::custom_reaction
< CancelRecovery
>,
1992 boost::statechart::custom_reaction
< RequestBackfill
>
1994 explicit Recovering(my_context ctx
);
1996 void release_reservations(bool cancel
= false);
1997 boost::statechart::result
react(const AllReplicasRecovered
&evt
);
1998 boost::statechart::result
react(const CancelRecovery
& evt
);
1999 boost::statechart::result
react(const RequestBackfill
&evt
);
2002 struct WaitRemoteRecoveryReserved
: boost::statechart::state
< WaitRemoteRecoveryReserved
, Active
>, NamedState
{
2003 typedef boost::mpl::list
<
2004 boost::statechart::custom_reaction
< RemoteRecoveryReserved
>,
2005 boost::statechart::transition
< AllRemotesReserved
, Recovering
>
2007 set
<pg_shard_t
>::const_iterator remote_recovery_reservation_it
;
2008 explicit WaitRemoteRecoveryReserved(my_context ctx
);
2009 boost::statechart::result
react(const RemoteRecoveryReserved
&evt
);
2013 struct WaitLocalRecoveryReserved
: boost::statechart::state
< WaitLocalRecoveryReserved
, Active
>, NamedState
{
2014 typedef boost::mpl::list
<
2015 boost::statechart::transition
< LocalRecoveryReserved
, WaitRemoteRecoveryReserved
>,
2016 boost::statechart::custom_reaction
< RecoveryTooFull
>
2018 explicit WaitLocalRecoveryReserved(my_context ctx
);
2020 boost::statechart::result
react(const RecoveryTooFull
&evt
);
2023 struct Activating
: boost::statechart::state
< Activating
, Active
>, NamedState
{
2024 typedef boost::mpl::list
<
2025 boost::statechart::transition
< AllReplicasRecovered
, Recovered
>,
2026 boost::statechart::transition
< DoRecovery
, WaitLocalRecoveryReserved
>,
2027 boost::statechart::transition
< RequestBackfill
, WaitLocalBackfillReserved
>
2029 explicit Activating(my_context ctx
);
2033 struct Stray
: boost::statechart::state
< Stray
, Started
>, NamedState
{
2034 map
<int, pair
<pg_query_t
, epoch_t
> > pending_queries
;
2036 explicit Stray(my_context ctx
);
2039 typedef boost::mpl::list
<
2040 boost::statechart::custom_reaction
< MQuery
>,
2041 boost::statechart::custom_reaction
< MLogRec
>,
2042 boost::statechart::custom_reaction
< MInfoRec
>,
2043 boost::statechart::custom_reaction
< ActMap
>,
2044 boost::statechart::custom_reaction
< RecoveryDone
>
2046 boost::statechart::result
react(const MQuery
& query
);
2047 boost::statechart::result
react(const MLogRec
& logevt
);
2048 boost::statechart::result
react(const MInfoRec
& infoevt
);
2049 boost::statechart::result
react(const ActMap
&);
2050 boost::statechart::result
react(const RecoveryDone
&) {
2051 return discard_event();
2057 struct GetInfo
: boost::statechart::state
< GetInfo
, Peering
>, NamedState
{
2058 set
<pg_shard_t
> peer_info_requested
;
2060 explicit GetInfo(my_context ctx
);
2064 typedef boost::mpl::list
<
2065 boost::statechart::custom_reaction
< QueryState
>,
2066 boost::statechart::transition
< GotInfo
, GetLog
>,
2067 boost::statechart::custom_reaction
< MNotifyRec
>,
2068 boost::statechart::transition
< IsDown
, Down
>
2070 boost::statechart::result
react(const QueryState
& q
);
2071 boost::statechart::result
react(const MNotifyRec
& infoevt
);
2074 struct GotLog
: boost::statechart::event
< GotLog
> {
2075 GotLog() : boost::statechart::event
< GotLog
>() {}
2078 struct GetLog
: boost::statechart::state
< GetLog
, Peering
>, NamedState
{
2079 pg_shard_t auth_log_shard
;
2080 boost::intrusive_ptr
<MOSDPGLog
> msg
;
2082 explicit GetLog(my_context ctx
);
2085 typedef boost::mpl::list
<
2086 boost::statechart::custom_reaction
< QueryState
>,
2087 boost::statechart::custom_reaction
< MLogRec
>,
2088 boost::statechart::custom_reaction
< GotLog
>,
2089 boost::statechart::custom_reaction
< AdvMap
>,
2090 boost::statechart::transition
< IsIncomplete
, Incomplete
>
2092 boost::statechart::result
react(const AdvMap
&);
2093 boost::statechart::result
react(const QueryState
& q
);
2094 boost::statechart::result
react(const MLogRec
& logevt
);
2095 boost::statechart::result
react(const GotLog
&);
2100 struct GetMissing
: boost::statechart::state
< GetMissing
, Peering
>, NamedState
{
2101 set
<pg_shard_t
> peer_missing_requested
;
2103 explicit GetMissing(my_context ctx
);
2106 typedef boost::mpl::list
<
2107 boost::statechart::custom_reaction
< QueryState
>,
2108 boost::statechart::custom_reaction
< MLogRec
>,
2109 boost::statechart::transition
< NeedUpThru
, WaitUpThru
>
2111 boost::statechart::result
react(const QueryState
& q
);
2112 boost::statechart::result
react(const MLogRec
& logevt
);
2115 struct WaitUpThru
: boost::statechart::state
< WaitUpThru
, Peering
>, NamedState
{
2116 explicit WaitUpThru(my_context ctx
);
2119 typedef boost::mpl::list
<
2120 boost::statechart::custom_reaction
< QueryState
>,
2121 boost::statechart::custom_reaction
< ActMap
>,
2122 boost::statechart::custom_reaction
< MLogRec
>
2124 boost::statechart::result
react(const QueryState
& q
);
2125 boost::statechart::result
react(const ActMap
& am
);
2126 boost::statechart::result
react(const MLogRec
& logrec
);
2129 struct Down
: boost::statechart::state
< Down
, Peering
>, NamedState
{
2130 explicit Down(my_context ctx
);
2131 typedef boost::mpl::list
<
2132 boost::statechart::custom_reaction
< QueryState
>
2134 boost::statechart::result
react(const QueryState
& infoevt
);
2138 struct Incomplete
: boost::statechart::state
< Incomplete
, Peering
>, NamedState
{
2139 typedef boost::mpl::list
<
2140 boost::statechart::custom_reaction
< AdvMap
>,
2141 boost::statechart::custom_reaction
< MNotifyRec
>,
2142 boost::statechart::custom_reaction
< QueryState
>
2144 explicit Incomplete(my_context ctx
);
2145 boost::statechart::result
react(const AdvMap
&advmap
);
2146 boost::statechart::result
react(const MNotifyRec
& infoevt
);
2147 boost::statechart::result
react(const QueryState
& infoevt
);
// Tail of the RecoveryState wrapper: owns the statechart machine plus the
// RecoveryCtx plumbing used to batch/flush peering side effects, and the
// two handle_event() entry points that feed events into the machine.
// NOTE(review): line-mangled extraction — start_handle()/end_handle() calls
// that presumably bracket process_event() in the real file, and the
// constructor's machine.initiate() body, are not visible here; code kept
// byte-identical.
2152 RecoveryMachine machine
;
2155 /// context passed in by state machine caller
2156 RecoveryCtx
*orig_ctx
;
2158 /// populated if we are buffering messages pending a flush
2159 boost::optional
<BufferedRecoveryMessages
> messages_pending_flush
;
2162 * populated between start_handle() and end_handle(), points into
2163 * the message lists for messages_pending_flush while blocking messages
2164 * or into orig_ctx otherwise
2166 boost::optional
<RecoveryCtx
> rctx
;
// Constructor: binds the machine to its owning PG; orig_ctx starts null.
2169 explicit RecoveryState(PG
*pg
)
2170 : machine(this, pg
), pg(pg
), orig_ctx(0) {
// Feed a raw statechart event into the machine (rctx supplies the output
// context for any messages/transactions the transition generates).
2174 void handle_event(const boost::statechart::event_base
&evt
,
2175 RecoveryCtx
*rctx
) {
2177 machine
.process_event(evt
);
// Same, but unwraps a queued CephPeeringEvt to its underlying event.
2181 void handle_event(CephPeeringEvtRef evt
,
2182 RecoveryCtx
*rctx
) {
2184 machine
.process_event(evt
->get_event());
// PG construction plus the peer-feature bookkeeping.  Copy construction and
// assignment are declared (presumably private/undefined to forbid copying —
// TODO confirm against the access specifier above this chunk, not visible).
// NOTE(review): line-mangled extraction; code kept byte-identical.
2192 PG(OSDService
*o
, OSDMapRef curmap
,
2193 const PGPool
&pool
, spg_t p
);
2198 explicit PG(const PG
& rhs
);
2199 PG
& operator=(const PG
& rhs
);
// Feature-bit intersections across peers / acting set / up+acting sets.
2201 uint64_t peer_features
;
2202 uint64_t acting_features
;
2203 uint64_t upacting_features
;
2208 const spg_t
& get_pgid() const { return pg_id
; }
// Reset to "everything supported", then narrow with apply_peer_features()
// as each peer's feature set is learned (bitwise AND accumulates the
// common subset).
2210 void reset_min_peer_features() {
2211 peer_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
2213 uint64_t get_min_peer_features() const { return peer_features
; }
2214 void apply_peer_features(uint64_t f
) { peer_features
&= f
; }
2216 uint64_t get_min_acting_features() const { return acting_features
; }
2217 uint64_t get_min_upacting_features() const { return upacting_features
; }
// Rebuild the primary/up_primary pg_shard_t designations (and, per the
// loops below, shard sets) from new up/acting vectors.  For EC pools the
// shard id is the vector index; for replicated pools shards are NO_SHARD
// and the primaries can be built directly from the osd ids.
// NOTE(review): line-mangled extraction — the statements that consume the
// pg_shard_t values built at lines "2231"/"2240" (presumably inserts into
// actingset/upset) and several closing braces are missing from view; code
// kept byte-identical.
// NOTE(review): loop indices are uint8_t, which silently assumes up/acting
// never exceed 255 entries — confirm against pool size limits.
2219 void init_primary_up_acting(
2220 const vector
<int> &newup
,
2221 const vector
<int> &newacting
,
2223 int new_acting_primary
) {
// Walk acting: skip holes (CRUSH_ITEM_NONE), shard id = index for EC.
2226 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
2227 if (acting
[i
] != CRUSH_ITEM_NONE
)
2231 pool
.info
.ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
// Same walk over the up set.
2235 for (uint8_t i
= 0; i
< up
.size(); ++i
) {
2236 if (up
[i
] != CRUSH_ITEM_NONE
)
2240 pool
.info
.ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
// Replicated pool: primaries carry no shard id.
2242 if (!pool
.info
.ec_pool()) {
2243 up_primary
= pg_shard_t(new_up_primary
, shard_id_t::NO_SHARD
);
2244 primary
= pg_shard_t(new_acting_primary
, shard_id_t::NO_SHARD
);
// EC pool: locate the primary osd ids in up/acting to recover their
// shard indices; default-constructed pg_shard_t if not found.
2247 up_primary
= pg_shard_t();
2248 primary
= pg_shard_t();
2249 for (uint8_t i
= 0; i
< up
.size(); ++i
) {
2250 if (up
[i
] == new_up_primary
) {
2251 up_primary
= pg_shard_t(up
[i
], shard_id_t(i
));
2255 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
2256 if (acting
[i
] == new_acting_primary
) {
2257 primary
= pg_shard_t(acting
[i
], shard_id_t(i
));
// Sanity: the resolved shard designations must name the requested osds.
2261 assert(up_primary
.osd
== new_up_primary
);
2262 assert(primary
.osd
== new_acting_primary
);
// Inline accessors and predicates over the PG's role and PG_STATE_* bitmask.
// state_test/state_set/state_clear are the single manipulation points for
// the "state" bit field; the is_*() helpers below are thin wrappers on them.
// NOTE(review): line-mangled extraction; code kept byte-identical.
2264 pg_shard_t
get_primary() const { return primary
; }
2266 int get_role() const { return role
; }
2267 void set_role(int r
) { role
= r
; }
// Primary-ness is decided by shard identity, replica-ness by role > 0.
2269 bool is_primary() const { return pg_whoami
== primary
; }
2270 bool is_replica() const { return role
> 0; }
2272 epoch_t
get_last_peering_reset() const { return last_peering_reset
; }
2274 //int get_state() const { return state; }
2275 bool state_test(int m
) const { return (state
& m
) != 0; }
2276 void state_set(int m
) { state
|= m
; }
2277 void state_clear(int m
) { state
&= ~m
; }
// Complete == log fully applied (last_complete caught up to last_update).
2279 bool is_complete() const { return info
.last_complete
== info
.last_update
; }
2280 bool should_send_notify() const { return send_notify
; }
2282 int get_state() const { return state
; }
2283 bool is_active() const { return state_test(PG_STATE_ACTIVE
); }
2284 bool is_activating() const { return state_test(PG_STATE_ACTIVATING
); }
2285 bool is_peering() const { return state_test(PG_STATE_PEERING
); }
2286 bool is_down() const { return state_test(PG_STATE_DOWN
); }
2287 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE
); }
2288 bool is_clean() const { return state_test(PG_STATE_CLEAN
); }
2289 bool is_degraded() const { return state_test(PG_STATE_DEGRADED
); }
2290 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED
); }
2292 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING
); }
// "Peered" is satisfied by either full ACTIVE or the weaker PEERED state.
2293 bool is_peered() const {
2294 return state_test(PG_STATE_ACTIVE
) || state_test(PG_STATE_PEERED
);
2297 bool is_empty() const { return info
.last_update
== eversion_t(0,0); }
// On-disk state helpers: creation/initialization of the PG collection,
// (de)serialization of pg_info/past_intervals via ObjectStore transactions,
// and the projected-log machinery used to hand out the next log version.
// NOTE(review): line-mangled extraction — the declaration these leading
// parameter lines belong to (presumably an init()-style method) begins or
// loses lines outside this view; code kept byte-identical.
2301 const vector
<int>& up
,
2303 const vector
<int>& acting
,
2305 const pg_history_t
& history
,
2306 const PastIntervals
& pim
,
2308 ObjectStore::Transaction
*t
);
2311 void do_pending_flush();
// Static bootstrap of the PG's collection / metadata in a transaction.
2313 static void _create(ObjectStore::Transaction
& t
, spg_t pgid
, int bits
);
2314 static void _init(ObjectStore::Transaction
& t
,
2315 spg_t pgid
, const pg_pool_t
*pool
);
// Serialize dirty info into key/value pairs destined for the store.
2318 void prepare_write_info(map
<string
,bufferlist
> *km
);
2320 void update_store_with_options();
2321 void update_store_on_load();
2324 static int _prepare_write_info(
2326 map
<string
,bufferlist
> *km
,
2329 pg_info_t
&last_written_info
,
2330 PastIntervals
&past_intervals
,
2331 bool dirty_big_info
,
2334 PerfCounters
*logger
= nullptr);
2335 void write_if_dirty(ObjectStore::Transaction
& t
);
// Log of ops submitted but not yet applied/committed.
2337 PGLog::IndexedLog projected_log
;
2338 bool check_in_progress_op(
2339 const osd_reqid_t
&r
,
2340 eversion_t
*version
,
2341 version_t
*user_version
,
2342 int *return_code
) const;
2343 eversion_t projected_last_update
;
// Next version to assign: current epoch + projected version + 1; must be
// strictly ahead of the persisted info, the pg_log head, and the
// projection itself.
2344 eversion_t
get_next_version() const {
2345 eversion_t
at_version(
2346 get_osdmap()->get_epoch(),
2347 projected_last_update
.version
+1);
2348 assert(at_version
> info
.last_update
);
2349 assert(at_version
> pg_log
.get_head());
2350 assert(at_version
> projected_last_update
);
// pg_log maintenance: appending entries, reading persisted info/state back
// from the store, snap-map upkeep, and merging externally-generated log
// entries into this PG's (and, per the comment at "2397", the
// actingbackfill peers') logs and missing sets.
// NOTE(review): line-mangled extraction — the method name owning the
// parameter lines at "2356"-"2360" (presumably an append_log variant) is
// not visible; code kept byte-identical.
2354 void add_log_entry(const pg_log_entry_t
& e
, bool applied
);
2356 const vector
<pg_log_entry_t
>& logv
,
2358 eversion_t roll_forward_to
,
2359 ObjectStore::Transaction
&t
,
2360 bool transaction_applied
= true);
2361 bool check_log_for_corruption(ObjectStore
*store
);
2364 std::string
get_corrupt_pg_log_name() const;
// Static readers used at load time, before the PG object is fully up.
2365 static int read_info(
2366 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
,
2367 bufferlist
&bl
, pg_info_t
&info
, PastIntervals
&past_intervals
,
2369 void read_state(ObjectStore
*store
, bufferlist
&bl
);
2370 static bool _has_removal_flag(ObjectStore
*store
, spg_t pgid
);
2371 static int peek_map_epoch(ObjectStore
*store
, spg_t pgid
,
2372 epoch_t
*pepoch
, bufferlist
*bl
);
// Keep the SnapMapper consistent with the log entries being applied.
2373 void update_snap_map(
2374 const vector
<pg_log_entry_t
> &log_entries
,
2375 ObjectStore::Transaction
& t
);
2377 void filter_snapc(vector
<snapid_t
> &snaps
);
2379 void log_weirdness();
2381 virtual void kick_snap_trim() = 0;
2382 virtual void snap_trimmer_scrub_complete() = 0;
2383 bool requeue_scrub(bool high_priority
= false);
2384 void queue_recovery(bool front
= false);
2386 unsigned get_scrub_priority();
2388 /// share pg info after a pg is active
2389 void share_pg_info();
2392 bool append_log_entries_update_missing(
2393 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
2394 ObjectStore::Transaction
&t
);
2397 * Merge entries updating missing as necessary on all
2398 * actingbackfill logs and missings (also missing_loc)
2400 void merge_new_log_entries(
2401 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
2402 ObjectStore::Transaction
&t
);
// Peering-interval lifecycle: starting a new interval when up/acting
// change, flushing in-flight transactions, answering peer queries
// (fulfill_info/fulfill_log), and deciding whether a map change forces
// peering to restart.
// NOTE(review): line-mangled extraction; code kept byte-identical.
// NOTE(review): "¬ify_info" at "2423" is almost certainly "&notify_info"
// corrupted by an "&not;" HTML-entity decode during extraction — restore
// it when re-syncing against the canonical source.
2404 void reset_interval_flush();
2405 void start_peering_interval(
2406 const OSDMapRef lastmap
,
2407 const vector
<int>& newup
, int up_primary
,
2408 const vector
<int>& newacting
, int acting_primary
,
2409 ObjectStore::Transaction
*t
);
2410 void on_new_interval();
2411 virtual void _on_new_interval() = 0;
2412 void start_flush(ObjectStore::Transaction
*t
,
2413 list
<Context
*> *on_applied
,
2414 list
<Context
*> *on_safe
);
2415 void set_last_peering_reset();
// True if the PG is being deleted or a peering reset happened after
// epoch e (i.e. a message stamped at e is stale).
2416 bool pg_has_reset_since(epoch_t e
) {
2417 assert(is_locked());
2418 return deleting
|| e
< get_last_peering_reset();
2421 void update_history(const pg_history_t
& history
);
2422 void fulfill_info(pg_shard_t from
, const pg_query_t
&query
,
2423 pair
<pg_shard_t
, pg_info_t
> &notify_info
);
2424 void fulfill_log(pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
);
2426 void check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
);
2428 bool should_restart_peering(
2430 int newactingprimary
,
2431 const vector
<int>& newup
,
2432 const vector
<int>& newacting
,
// OpRequest admission: the can_discard_*() family decides whether an
// incoming request is stale/irrelevant and can be dropped, using the
// epoch-comparison helpers below.
// NOTE(review): line-mangled extraction; code kept byte-identical.
2436 // OpRequest queueing
2437 bool can_discard_op(OpRequestRef
& op
);
2438 bool can_discard_scan(OpRequestRef op
);
2439 bool can_discard_backfill(OpRequestRef op
);
2440 bool can_discard_request(OpRequestRef
& op
);
2442 template<typename T
, int MSGTYPE
>
2443 bool can_discard_replica_op(OpRequestRef
& op
);
2445 bool old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
);
// A queued peering event is old if either the epoch it was sent in or the
// epoch it was requested for predates the last peering reset.
2446 bool old_peering_evt(CephPeeringEvtRef evt
) {
2447 return old_peering_msg(evt
->get_epoch_sent(), evt
->get_epoch_requested());
// Static variant compares against an explicit current epoch; the member
// variant reads the PG's current osdmap epoch.
2449 static bool have_same_or_newer_map(epoch_t cur_epoch
, epoch_t e
) {
2450 return e
<= cur_epoch
;
2452 bool have_same_or_newer_map(epoch_t e
) {
2453 return e
<= get_osdmap()->get_epoch();
2456 bool op_has_sufficient_caps(OpRequestRef
& op
);
// Peering event plumbing: queueing events/queries for later processing and
// the handle_*() entry points the OSD calls to drive this PG through map
// advances, activation, creation, and load.
// NOTE(review): line-mangled extraction — handle_advance_map()'s trailing
// parameter(s) after "acting_primary," (presumably the RecoveryCtx*) are
// not visible; code kept byte-identical.
2460 void take_waiters();
2461 void queue_peering_event(CephPeeringEvtRef evt
);
2462 void handle_peering_event(CephPeeringEvtRef evt
, RecoveryCtx
*rctx
);
2463 void queue_query(epoch_t msg_epoch
, epoch_t query_epoch
,
2464 pg_shard_t from
, const pg_query_t
& q
);
2465 void queue_null(epoch_t msg_epoch
, epoch_t query_epoch
);
2466 void queue_flushed(epoch_t started_at
);
2467 void handle_advance_map(
2468 OSDMapRef osdmap
, OSDMapRef lastmap
,
2469 vector
<int>& newup
, int up_primary
,
2470 vector
<int>& newacting
, int acting_primary
,
2472 void handle_activate_map(RecoveryCtx
*rctx
);
2473 void handle_create(RecoveryCtx
*rctx
);
2474 void handle_loaded(RecoveryCtx
*rctx
);
2475 void handle_query_state(Formatter
*f
);
2477 virtual void on_removal(ObjectStore::Transaction
*t
) = 0;
// Pure-virtual interface that concrete PG backends (e.g. PrimaryLogPG)
// must implement: op dispatch, scan/backfill, snap trimming, command
// handling, lifecycle callbacks, watcher queries, and cache-agent hooks.
// NOTE(review): line-mangled extraction — some parameter lines of
// do_request/do_scan/do_command (e.g. the OpRequestRef arguments and the
// "= 0;" terminators at "2483"/"2491") are missing from view; code kept
// byte-identical.
2481 virtual void do_request(
2483 ThreadPool::TPHandle
&handle
2486 virtual void do_op(OpRequestRef
& op
) = 0;
2487 virtual void do_sub_op(OpRequestRef op
) = 0;
2488 virtual void do_sub_op_reply(OpRequestRef op
) = 0;
2489 virtual void do_scan(
2491 ThreadPool::TPHandle
&handle
2493 virtual void do_backfill(OpRequestRef op
) = 0;
2494 virtual void snap_trimmer(epoch_t epoch_queued
) = 0;
2496 virtual int do_command(
2502 ceph_tid_t tid
) = 0;
2504 virtual void on_role_change() = 0;
2505 virtual void on_pool_change() = 0;
2506 virtual void on_change(ObjectStore::Transaction
*t
) = 0;
2507 virtual void on_activate() = 0;
2508 virtual void on_flushed() = 0;
2509 virtual void on_shutdown() = 0;
2510 virtual void check_blacklisted_watchers() = 0;
2511 virtual void get_watchers(std::list
<obj_watch_item_t
>&) = 0;
2513 virtual bool agent_work(int max
) = 0;
2514 virtual bool agent_work(int max
, int agent_flush_quota
) = 0;
2515 virtual void agent_stop() = 0;
2516 virtual void agent_delay() = 0;
2517 virtual void agent_clear() = 0;
2518 virtual void agent_choose_mode_restart() = 0;
// Free-function stream-output declarations for PG and PG::BackfillInterval
// (used by debug/log output).  Defined elsewhere.
// NOTE(review): line-mangled extraction; code kept byte-identical.
2521 ostream
& operator<<(ostream
& out
, const PG
& pg
);
2523 ostream
& operator<<(ostream
& out
, const PG::BackfillInterval
& bi
);