// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

using std::stringstream;

using ceph::Formatter;
using ceph::make_message;

BufferedRecoveryMessages::BufferedRecoveryMessages(PeeringCtx &ctx)
  // steal messages from ctx
  : message_map{std::move(ctx.message_map)}
{}

void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  spg_t pgid(n.info.pgid.pgid, n.to);
  send_osd_message(to, TOPNSPC::make_message<MOSDPGNotify2>(pgid, n));
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  send_osd_message(to, TOPNSPC::make_message<MOSDPGQuery2>(to_spgid, q));
}

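// Usage sketch (illustrative, not part of the original file): messages
// queued through the send_* helpers above are not transmitted immediately;
// they accumulate in message_map, keyed by target osd, and are handed back
// to the parent context in one batch via accept_buffered_messages(). That
// batching is what allows outgoing peering traffic to be blocked during a
// flush and then released as a unit (see begin_block_outgoing() /
// end_block_outgoing() below).
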
void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  send_osd_message(
    to,
    TOPNSPC::make_message<MOSDPGInfo2>(
      to_spgid,
      info,
      cur_epoch,
      min_epoch,
      lease,
      lease_ack));
}

void PGPool::update(OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    last_require_osd_release(curmap->require_osd_release),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush.emplace();
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

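// Lifecycle sketch (illustrative): start_handle() binds rctx either to the
// caller's PeeringCtx or, if a flush is pending, to messages_pending_flush.
// begin_block_outgoing() redirects subsequent messages into
// messages_pending_flush; end_block_outgoing() merges them back via
// accept_buffered_messages(). A typical sequence when a flush is required:
//
//   start_handle(&ctx);      // rctx -> ctx
//   begin_block_outgoing();  // rctx -> messages_pending_flush
//   ...                      // events buffer their outgoing messages
//   end_block_outgoing();    // buffered messages merged back into ctx
//   end_handle();
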
void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = nullptr;
}

void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to pull (or are currently
   * pulling) objects from are dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (auto i = peer_missing_requested.begin();
       i != peer_missing_requested.end();) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      psdout(20) << __func__
                 << " prior_readable_until_ub " << prior_readable_until_ub
                 << " (mnow " << mnow << " + "
                 << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

hobject_t PeeringState::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (const pg_shard_t& bt : get_backfill_targets()) {
    const pg_info_t &pi = get_peer_info(bt);
    e = std::min(pi.last_backfill, e);
  }
  return e;
}

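// Example (illustrative): if one backfill target has last_backfill at some
// object O while another has already reached hobject_t::get_max(), the
// result is O -- the least-advanced target bounds how far backfill has
// safely progressed for the PG as a whole.
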
void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      auto m = TOPNSPC::make_message<MOSDPGRemove>(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, std::move(m), get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}

void PeeringState::query_unfound(Formatter *f, string state)
{
  psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
  {
    f->dump_string("state", state);
    f->dump_bool("available_might_have_unfound", true);
    f->open_array_section("might_have_unfound");
    for (auto p = might_have_unfound.begin();
         p != might_have_unfound.end();
         ++p) {
      if (peer_missing.count(*p)) {
        ; // Ignore already probed OSDs
      } else {
        f->open_object_section("osd");
        f->dump_stream("osd") << *p;
        if (peer_missing_requested.count(*p)) {
          f->dump_string("status", "querying");
        } else if (!get_osdmap()->is_up(p->osd)) {
          f->dump_string("status", "osd is down");
        } else {
          f->dump_string("status", "not queried");
        }
        f->close_section();
      }
    }
    f->close_section();
  }
  psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
}

bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  auto p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info? if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}

void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  auto p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else {
      ++p;
    }
  }

  // Remove any downed osds from peer_purged so we can re-purge if necessary
  auto it = peer_purged.begin();
  while (it != peer_purged.end()) {
    if (!osdmap->is_up(it->osd)) {
      psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
      peer_purged.erase(it++);
    } else {
      ++it;
    }
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval(cct->_conf);
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blocklist_entries()) {
    pl->check_blocklisted_watchers();
  }
}

void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }

  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // that it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
      // queue/dequeue the scrubber
      pl->on_primary_status_change(was_old_primary, is_primary());
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (is_primary() && was_old_primary) {
    pl->reschedule_scrub();
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (auto p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (auto p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}

void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;

  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}

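// Worked example (illustrative): with last_epoch_clean = 100,
// oldest_map = 120 and same_interval_since = 150 (and a pool created well
// before these epochs), start = max(100, 120) = 120 and end = 150, so
// past_intervals is required to cover [120, 150).
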
void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}

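// Worked example (illustrative, assuming the usual constant values
// OSD_POOL_PRIORITY_MIN == -10 and OSD_POOL_PRIORITY_MAX == 10): a pool
// recovery priority of -3 passes the legacy clamp unchanged, is shifted by
// (0 - OSD_POOL_PRIORITY_MIN) = +10 to become 7, and is then added to the
// incoming priority before the final clamp to
// [OSD_RECOVERY_PRIORITY_MIN, max].
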
unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (actingset.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - actingset.size());
    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());
    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

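// Worked example (illustrative): a replicated pool with size 3 and
// min_size 2 whose actingset currently has 2 members is undersized but
// still active, so ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (3 - 2).
// If the acting set shrank below min_size, the inactive branch would apply
// instead and produce a strictly higher priority, since the PG would be
// blocking client IO.
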
unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

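// Renewing at half the interval keeps the lease continuously valid: e.g.
// (illustrative) with readable_interval = 16s the primary re-issues the
// lease every 8s, so a renewal and its acks normally complete well before
// the previously granted readable_until_ub expires.
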
void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      TOPNSPC::make_message<MOSDPGLease>(epoch,
                          spg_t(spgid.pgid, peer.shard),
                          get_lease()),
      epoch);
  }
}

void PeeringState::proc_lease(const pg_lease_t& l)
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}

void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }

  pl->set_probe_targets(prior.probe);
  return prior;
}

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  for (const pg_shard_t& peer : acting_recovery_backfill) {
    if (peer == get_primary()) {
      continue;
    }
    auto pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that only possible osds that need backfill
  // are on the backfill_targets vector nodes.
  for (const pg_shard_t& peer : backfill_targets) {
    auto pi = peer_info.find(peer);
    ceph_assert(pi != peer_info.end());
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  auto peer = might_have_unfound.begin();
  auto mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    auto iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}

void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    TOPNSPC::make_message<MBackfillReserve>(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  epoch_t max_last_epoch_started_found = 0;
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  eversion_t min_last_update_acceptable = eversion_t::max();
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  auto best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (auto p = infos.begin(); p != infos.end(); ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    }
    // both are complete or have missing

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

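// Selection example (illustrative): given two candidates with equal
// history.last_epoch_started, an info at last_update 20'30 beats one at
// 20'25 under rule (1); if both sit at 20'30, the info whose log_tail
// reaches further back wins under rule (2), because the longer log may let
// additional peers recover by log replay instead of backfill.
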
void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (auto i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << std::endl;
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (auto j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
PeeringState::select_replicated_primary(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  const std::vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
        auth_log_shard->second.log_tail) {
    assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
    auto approx_missing_objects =
      primary->second.stats.stats.sum.num_objects_missing;
    auto auth_version = auth_log_shard->second.last_update.version;
    auto primary_version = primary->second.last_update.version;
    if (auth_version > primary_version) {
      approx_missing_objects += auth_version - primary_version;
    } else {
      approx_missing_objects += primary_version - auth_version;
    }
    if ((uint64_t)approx_missing_objects >
        force_auth_primary_missing_objects) {
      primary = auth_log_shard;
      ss << "up_primary " << up_primary << " has approximately "
         << approx_missing_objects
         << " (>" << force_auth_primary_missing_objects << ") "
         << "missing objects, osd." << auth_log_shard_id
         << " selected as primary instead"
         << std::endl;
    } else {
      ss << "up_primary " << up_primary << " selected as primary"
         << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  return std::make_pair(primary, oldest_auth_log_entry);
}

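// Worked example (illustrative): if the up primary reports
// num_objects_missing = 80 and its last_update.version trails the
// authoritative shard's by 30, approx_missing_objects = 110; with
// force_auth_primary_missing_objects = 100 that exceeds the threshold, so
// the auth log shard is selected as primary instead.
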
/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator primary,
  eversion_t oldest_auth_log_entry,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
     << std::endl;

  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    auto up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
                             const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p : candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    auto up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    auto acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p : candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}

// Defines osd preference order: acting set, then larger last_update
using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
using osd_id_t = int;

class bucket_candidates_t {
  std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
  unsigned selected = 0;

public:
  void add_osd(osd_ord_t ord, osd_id_t osd) {
    // osds will be added in smallest to largest order
    assert(osds.empty() || osds.back().first <= ord);
    osds.push_back(std::make_pair(ord, osd));
  }
  osd_id_t pop_osd() {
    ceph_assert(!is_empty());
    auto ret = osds.front();
    osds.pop_front();
    return ret.second;
  }

  void inc_selected() { selected++; }
  unsigned get_num_selected() const { return selected; }

  osd_ord_t get_ord() const {
    return osds.empty() ? std::make_tuple(false, eversion_t())
      : osds.front().first;
  }

  bool is_empty() const { return osds.empty(); }

  bool operator<(const bucket_candidates_t &rhs) const {
    return std::make_tuple(-selected, get_ord()) <
      std::make_tuple(-rhs.selected, rhs.get_ord());
  }

  friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
};

std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
{
  return lhs << "candidates[" << cand.osds << "]";
}

class bucket_heap_t {
  using elem_t = std::reference_wrapper<bucket_candidates_t>;
  std::vector<elem_t> heap;

  // Max heap -- should emit buckets in order of preference
  struct comp {
    bool operator()(const elem_t &lhs, const elem_t &rhs) {
      return lhs.get() < rhs.get();
    }
  };

public:
  void push_if_nonempty(elem_t e) {
    if (!e.get().is_empty()) {
      heap.push_back(e);
      std::push_heap(heap.begin(), heap.end(), comp());
    }
  }
  elem_t pop() {
    std::pop_heap(heap.begin(), heap.end(), comp());
    auto ret = heap.back();
    heap.pop_back();
    return ret;
  }

  bool is_empty() const { return heap.empty(); }
};

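// Ordering note: bucket_candidates_t::operator< compares
// (-selected, next ordering token), and bucket_heap_t is a max-heap over
// that ordering, so pop() always yields the bucket with the fewest
// already-selected OSDs (ties broken by the preference order of its best
// remaining candidate). This is what lets calc_replicated_acting_stretch
// below round-robin across crush buckets.
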
/**
 * calc_replicated_acting_stretch
 *
 * Choose an acting set using as much of the up set as possible; filling
 * in the remaining slots so as to maximize the number of crush buckets at
 * level pool.info.peering_crush_bucket_barrier represented.
 *
 * Stretch clusters are a bit special: while they have a "size" the
 * same way as normal pools, if we happen to lose a data center
 * (we call it a "stretch bucket", but really it'll be a data center or
 * a cloud availability zone), we don't actually want to shove
 * 2 DCs' worth of replication into a single site -- it won't fit!
 * So we locally calculate a bucket_max, based
 * on the targeted number of stretch buckets for the pool and
 * its size. Then we won't pull more than bucket_max from any
 * given ancestor even if it leaves us undersized.
 *
 * There are two distinct phases: (commented below)
 */
void PeeringState::calc_replicated_acting_stretch(
  map<pg_shard_t, pg_info_t>::const_iterator primary,
  eversion_t oldest_auth_log_entry,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  ceph_assert(want);
  ceph_assert(acting_backfill);
  ceph_assert(backfill);
  ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
     << std::endl;

  auto used = [want](int osd) {
    return std::find(want->begin(), want->end(), osd) != want->end();
  };

  auto usable_info = [&](const auto &cur_info) mutable {
    return !(cur_info.is_incomplete() ||
             cur_info.last_update < oldest_auth_log_entry);
  };

  auto osd_info = [&](int osd) mutable -> const pg_info_t & {
    pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
    const pg_info_t &cur_info = all_info.find(cand)->second;
    return cur_info;
  };

  auto usable_osd = [&](int osd) mutable {
    return usable_info(osd_info(osd));
  };

  std::map<int, bucket_candidates_t> ancestors;
  auto get_ancestor = [&](int osd) mutable {
    int ancestor = osdmap->crush->get_parent_of_type(
      osd,
      pool.info.peering_crush_bucket_barrier,
      pool.info.crush_rule);
    return &ancestors[ancestor];
  };

  unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
  if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
    ++bucket_max;
  }

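  // Worked example (illustrative): size = 4 with
  // peering_crush_bucket_target = 2 gives bucket_max = 2, i.e. at most two
  // replicas per stretch bucket; size = 5 with target 2 divides to 2 and is
  // rounded up to 3 here, so this ceiling division never leaves a replica
  // slot that cannot be placed under the per-bucket cap.
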
1986 /* 1) Select all usable osds from the up set as well as the primary
1988 * We also stash any unusable osds from up into backfill.
1990 auto add_required
= [&](int osd
) {
1992 want
->push_back(osd
);
1993 acting_backfill
->insert(
1994 pg_shard_t(osd
, shard_id_t::NO_SHARD
));
1995 get_ancestor(osd
)->inc_selected();
1998 add_required(primary
->first
.osd
);
1999 ss
<< " osd " << primary
->first
.osd
<< " primary accepted "
2000 << osd_info(primary
->first
.osd
) << std::endl
;
2001 for (auto upcand
: up
) {
2002 auto upshard
= pg_shard_t(upcand
, shard_id_t::NO_SHARD
);
2003 auto &curinfo
= osd_info(upcand
);
2004 if (usable_osd(upcand
)) {
2005 ss
<< " osd " << upcand
<< " (up) accepted " << curinfo
<< std::endl
;
2006 add_required(upcand
);
2008 ss
<< " osd " << upcand
<< " (up) backfill " << curinfo
<< std::endl
;
2009 backfill
->insert(upshard
);
2010 acting_backfill
->insert(upshard
);
2014 if (want
->size() >= pool
.info
.size
) { // non-failed CRUSH mappings are valid
2015 ss
<< " up set sufficient" << std::endl
;
2018 ss
<< " up set insufficient, considering remaining osds" << std::endl
;
  /* 2) Fill out remaining slots from usable osds in all_info
   *    while maximizing the number of ancestor nodes at the
   *    barrier_id crush level.
   */
  std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
  /* To do this, we first filter the set of usable osds into an ordered
   * list of usable osds
   */
  auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
    return std::make_tuple(
      !is_acting /* acting should sort first */,
      info.last_update);
  };
  for (auto &cand : acting) {
    auto &cand_info = osd_info(cand);
    if (!used(cand) && usable_info(cand_info)) {
      ss << " acting candidate " << cand << " " << cand_info << std::endl;
      candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
    }
  }
  if (!restrict_to_up_acting) {
    for (auto &[cand, info] : all_info) {
      if (!used(cand.osd) && usable_info(info) &&
          (std::find(acting.begin(), acting.end(), cand.osd)
           == acting.end())) {
        ss << " other candidate " << cand << " " << info << std::endl;
        candidates.push_back(
          std::make_pair(get_osd_ord(false, info), cand.osd));
      }
    }
  }
  std::sort(candidates.begin(), candidates.end());

  // We then filter these candidates by ancestor
  std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
    get_ancestor(cand.second)->add_osd(cand.first, cand.second);
  });
  auto pop_ancestor = [&](auto &ancestor) {
    ceph_assert(!ancestor.is_empty());
    auto osd = ancestor.pop_osd();

    ss << " accepting candidate " << osd << std::endl;

    ceph_assert(!used(osd));
    ceph_assert(usable_osd(osd));

    want->push_back(osd);
    acting_backfill->insert(
      pg_shard_t(osd, shard_id_t::NO_SHARD));
    ancestor.inc_selected();
  };

  /* Next, we use the ancestors map to grab a descendant of the
   * peering_crush_mandatory_member if not already represented.
   *
   * TODO: using 0 here to match other users.  Prior to merge, I
   * expect that this and other users should instead check against
   * CRUSH_ITEM_NONE.
   */
  if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
    auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
    if (aiter != ancestors.end() &&
        !aiter->second.get_num_selected()) {
      ss << " adding required ancestor " << aiter->first << std::endl;
      ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
      pop_ancestor(aiter->second);
    }
  }

  /* We then place the ancestors in a heap ordered by fewest selected
   * and then by the ordering token of the next osd */
  bucket_heap_t aheap;
  std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
    aheap.push_if_nonempty(anc.second);
  });

  /* and pull from this heap until it's empty or we have enough.
   * "We have enough" is a sufficient check here for
   * stretch_set_can_peer() because our heap sorting always
   * pulls from ancestors with the least number of included OSDs,
   * so if it is possible to satisfy the bucket_count constraints we
   * will do so.
   */
  while (!aheap.is_empty() && want->size() < pool.info.size) {
    auto next = aheap.pop();
    pop_ancestor(next.get());
    if (next.get().get_num_selected() < bucket_max) {
      aheap.push_if_nonempty(next);
    }
  }

  /* The end result is that we should have as many buckets covered as
   * possible while respecting up, the primary selection,
   * the pool size (given bucket count constraints),
   * and the mandatory member.
   */
}
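/* A sketch of the contract recoverable() enforces on a proposed want set:
 * it must name at least min_size shards (unless the
 * osd_allow_recovery_below_min_size escape hatch is enabled), and the shards
 * it names must satisfy missing_loc's recoverable predicate (roughly: any
 * live copy for replicated pools, at least k distinct shards for EC pools).
 */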
bool PeeringState::recoverable(const vector<int> &want) const
{
  unsigned num_want_acting = 0;
  set<pg_shard_t> have;
  for (int i = 0; i < (int)want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      ++num_want_acting;
      have.insert(
        pg_shard_t(
          want[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
  }

  if (num_want_acting < pool.info.min_size) {
    if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
      psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
      return false;
    }
  }
  if (missing_loc.get_recoverable_predicate()(have)) {
    return true;
  } else {
    psdout(10) << __func__ << " failed, not recoverable " << dendl;
    return false;
  }
}
void PeeringState::choose_async_recovery_ec(
  const map<pg_shard_t, pg_info_t> &all_info,
  const pg_info_t &auth_info,
  vector<int> *want,
  set<pg_shard_t> *async_recovery,
  const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t> > candidates_by_cost;
  for (uint8_t i = 0; i < want->size(); ++i) {
    if ((*want)[i] == CRUSH_ITEM_NONE)
      continue;

    // Considering log entries to recover is accurate enough for
    // now. We could use minimum_to_decode_with_cost() later if
    // necessary.
    pg_shard_t shard_i((*want)[i], shard_id_t(i));
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // for ec pools we rollback all entries past the authoritative
    // last_update *before* activation. This is relatively inexpensive
    // compared to recovery, since it is purely local, so treat shards
    // past the authoritative last_update the same as those equal to it.
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
    auto approx_missing_objects =
      shard_info.stats.stats.sum.num_objects_missing;
    if (auth_version > candidate_version) {
      approx_missing_objects += auth_version - candidate_version;
    }
    if (static_cast<uint64_t>(approx_missing_objects) >
        cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
      candidates_by_cost.emplace(approx_missing_objects, shard_i);
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;

  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit) {
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
    if (recoverable(candidate_want)) {
      want->swap(candidate_want);
      async_recovery->insert(cur_shard);
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}
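// Note the cost asymmetry versus the replicated variant below: EC shards
// ahead of the authoritative last_update only need a local rollback before
// activation, so only auth_version - candidate_version is charged here,
// while the replicated path also charges candidate_version - auth_version.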
void PeeringState::choose_async_recovery_replicated(
  const map<pg_shard_t, pg_info_t> &all_info,
  const pg_info_t &auth_info,
  vector<int> *want,
  set<pg_shard_t> *async_recovery,
  const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t> > candidates_by_cost;
  for (auto osd_num : *want) {
    pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // use the approximate magnitude of the difference in length of
    // logs plus historical missing objects as the cost of recovery
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
    auto approx_missing_objects =
      shard_info.stats.stats.sum.num_objects_missing;
    if (auth_version > candidate_version) {
      approx_missing_objects += auth_version - candidate_version;
    } else {
      approx_missing_objects += candidate_version - auth_version;
    }
    if (static_cast<uint64_t>(approx_missing_objects) >
        cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
      candidates_by_cost.emplace(approx_missing_objects, shard_i);
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;

  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit) {
    if (want->size() <= pool.info.min_size) {
      break;
    }
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
      if (*it == cur_shard.osd) {
        candidate_want.erase(it);
        if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
          // if we're in stretch mode, we can only remove the osd if it doesn't
          // break peering limits.
          want->swap(candidate_want);
          async_recovery->insert(cur_shard);
        }
        break;
      }
    }
  }

  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}
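// Unlike the EC variant above, this loop also refuses to shrink want below
// pool.info.min_size, and in stretch mode a candidate is only removed when
// stretch_set_can_peer() still holds for the reduced set.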
/**
 * choose acting
 *
 * calculate the desired acting, and request a change with the monitor
 * if it differs from the current acting.
 *
 * if restrict_to_up_acting=true, we filter out anything that's not in
 * up/acting.  in order to lift this restriction, we need to
 *  1) check whether it's worth switching the acting set any time we get
 *     a new pg info (not just here, when recovery finishes)
 *  2) check whether anything in want_acting went down on each new map
 *     (and, if so, calculate a new want_acting)
 *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
 */
bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
                                 bool restrict_to_up_acting,
                                 bool *history_les_bound,
                                 bool request_pg_temp_change_only)
{
  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
  all_info[pg_whoami] = info;

  if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
    for (auto p = all_info.begin(); p != all_info.end(); ++p) {
      psdout(10) << __func__ << " all_info osd." << p->first << " "
                 << p->second << dendl;
    }
  }

  auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
                                       history_les_bound);

  if (auth_log_shard == all_info.end()) {
    if (up != acting) {
      psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
                 << " reverting to up" << dendl;
      want_acting = up;
      vector<int> empty;
      pl->queue_want_pg_temp(empty);
    } else {
      psdout(10) << __func__ << " failed" << dendl;
      ceph_assert(want_acting.empty());
    }
    return false;
  }

  ceph_assert(!auth_log_shard->second.is_incomplete());
  auth_log_shard_id = auth_log_shard->first;

  set<pg_shard_t> want_backfill, want_acting_backfill;
  vector<int> want;
  stringstream ss;
  if (pool.info.is_replicated()) {
    auto [primary_shard, oldest_log] = select_replicated_primary(
      auth_log_shard,
      cct->_conf.get_val<uint64_t>(
        "osd_force_auth_primary_missing_objects"),
      up,
      up_primary,
      all_info,
      get_osdmap(),
      ss);
    if (pool.info.is_stretch_pool()) {
      calc_replicated_acting_stretch(
        primary_shard,
        oldest_log,
        get_osdmap()->get_pg_size(info.pgid.pgid),
        acting,
        up,
        up_primary,
        all_info,
        restrict_to_up_acting,
        &want,
        &want_backfill,
        &want_acting_backfill,
        get_osdmap(),
        pool,
        ss);
    } else {
      calc_replicated_acting(
        primary_shard,
        oldest_log,
        get_osdmap()->get_pg_size(info.pgid.pgid),
        acting,
        up,
        up_primary,
        all_info,
        restrict_to_up_acting,
        &want,
        &want_backfill,
        &want_acting_backfill,
        get_osdmap(),
        pool,
        ss);
    }
  } else {
    calc_ec_acting(
      auth_log_shard,
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      up,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      ss);
  }
  psdout(10) << ss.str() << dendl;

  if (!recoverable(want)) {
    want_acting.clear();
    return false;
  }

  set<pg_shard_t> want_async_recovery;
  if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
    if (pool.info.is_erasure()) {
      choose_async_recovery_ec(
        all_info, auth_log_shard->second, &want, &want_async_recovery,
        get_osdmap());
    } else {
      choose_async_recovery_replicated(
        all_info, auth_log_shard->second, &want, &want_async_recovery,
        get_osdmap());
    }
  }
  while (want.size() > pool.info.size) {
    // async recovery should have taken out as many osds as it can.
    // if not, then always evict the last peer
    // (will get synchronously recovered later)
    psdout(10) << __func__ << " evicting osd." << want.back()
               << " from oversized want " << want << dendl;
    want.pop_back();
  }
  if (want != acting) {
    psdout(10) << __func__ << " want " << want << " != acting " << acting
               << ", requesting pg_temp change" << dendl;
    want_acting = want;

    if (!cct->_conf->osd_debug_no_acting_change) {
      if (want_acting == up) {
        // There can't be any pending backfill if
        // want is the same as crush map up OSDs.
        ceph_assert(want_backfill.empty());
        vector<int> empty;
        pl->queue_want_pg_temp(empty);
      } else {
        pl->queue_want_pg_temp(want);
      }
    }
    return false;
  }

  if (request_pg_temp_change_only)
    return true;
  want_acting.clear();
  acting_recovery_backfill = want_acting_backfill;
  psdout(10) << "acting_recovery_backfill is "
             << acting_recovery_backfill << dendl;
  ceph_assert(
    backfill_targets.empty() ||
    backfill_targets == want_backfill);
  if (backfill_targets.empty()) {
    // Caller is GetInfo
    backfill_targets = want_backfill;
  }
  // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
  ceph_assert(
    async_recovery_targets.empty() ||
    async_recovery_targets == want_async_recovery ||
    !needs_recovery());
  if (async_recovery_targets.empty() || !needs_recovery()) {
    async_recovery_targets = want_async_recovery;
  }
  // Will not change if already set because up would have had to change
  // Verify that nothing in backfill is in stray_set
  for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
    ceph_assert(stray_set.find(*i) == stray_set.end());
  }
  psdout(10) << "choose_acting want=" << want << " backfill_targets="
             << want_backfill << " async_recovery_targets="
             << async_recovery_targets << dendl;
  return true;
}
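// choose_acting() returns false both when no usable acting set exists and
// when a pg_temp change was just requested; in the latter case the caller
// simply waits for the resulting map change before retrying. It returns
// true only once want matches the current acting set.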
void PeeringState::log_weirdness()
{
  if (pg_log.get_tail() != info.log_tail)
    pl->get_clog_error() << info.pgid
                         << " info mismatch, log.tail " << pg_log.get_tail()
                         << " != info.log_tail " << info.log_tail;
  if (pg_log.get_head() != info.last_update)
    pl->get_clog_error() << info.pgid
                         << " info mismatch, log.head " << pg_log.get_head()
                         << " != info.last_update " << info.last_update;

  if (!pg_log.get_log().empty()) {
    // sloppy check
    if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
      pl->get_clog_error() << info.pgid
                           << " log bound mismatch, info (tail,head] ("
                           << pg_log.get_tail() << ","
                           << pg_log.get_head() << "]"
                           << " actual ["
                           << pg_log.get_log().log.begin()->version << ","
                           << pg_log.get_log().log.rbegin()->version << "]";
  }

  if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
    pl->get_clog_error() << info.pgid
                         << " caller_ops.size "
                         << pg_log.get_log().caller_ops.size()
                         << " > log size " << pg_log.get_log().log.size();
  }
}
/*
 * Process information from a replica to determine if it could have any
 * objects that I need.
 *
 * TODO: if the missing set becomes very large, this could get expensive.
 * Instead, we probably want to just iterate over our unfound set.
 */
bool PeeringState::search_for_missing(
  const pg_info_t &oinfo, const pg_missing_t &omissing,
  pg_shard_t from,
  PeeringCtxWrapper &ctx)
{
  uint64_t num_unfound_before = missing_loc.num_unfound();
  bool found_missing = missing_loc.add_source_info(
    from, oinfo, omissing, ctx.handle);
  if (found_missing && num_unfound_before != missing_loc.num_unfound())
    pl->publish_stats_to_osd();
  // avoid doing this if the peer is empty. This is a bit of paranoia
  // to avoid doing something rash if add_source_info() above
  // incorrectly decided we found something new. (if the peer has
  // last_update=0'0 that's impossible.)
  if (found_missing &&
      oinfo.last_update != eversion_t()) {
    pg_info_t tinfo(oinfo);
    tinfo.pgid.shard = pg_whoami.shard;
    ctx.send_info(
      from.osd,
      spg_t(info.pgid.pgid, from.shard),
      get_osdmap_epoch(),  // fixme: use lower epoch?
      get_osdmap_epoch(),
      tinfo);
  }
  return found_missing;
}
bool PeeringState::discover_all_missing(
  BufferedRecoveryMessages &rctx)
{
  auto &missing = pg_log.get_missing();
  uint64_t unfound = get_num_unfound();
  bool any = false;  // did we start any queries

  psdout(10) << __func__ << " "
             << missing.num_missing() << " missing, "
             << unfound << " unfound"
             << dendl;

  auto m = might_have_unfound.begin();
  auto mend = might_have_unfound.end();
  for (; m != mend; ++m) {
    pg_shard_t peer(*m);

    if (!get_osdmap()->is_up(peer.osd)) {
      psdout(20) << __func__ << " skipping down osd." << peer << dendl;
      continue;
    }

    if (peer_purged.count(peer)) {
      psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
      continue;
    }

    auto iter = peer_info.find(peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne())) {
      // ignore empty peers
      continue;
    }

    // If we've requested any of this stuff, the pg_missing_t information
    // should be on its way.
    // TODO: coalesce requested_* into a single data structure
    if (peer_missing.find(peer) != peer_missing.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": we already have pg_missing_t" << dendl;
      continue;
    }
    if (peer_log_requested.find(peer) != peer_log_requested.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": in peer_log_requested" << dendl;
      continue;
    }
    if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": in peer_missing_requested" << dendl;
      continue;
    }

    // Request missing
    psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
               << dendl;
    peer_missing_requested.insert(peer);
    rctx.send_query(
      peer.osd,
      spg_t(info.pgid.pgid, peer.shard),
      pg_query_t(
        pg_query_t::FULLLOG,
        peer.shard, pg_whoami.shard,
        info.history, get_osdmap_epoch()));
    any = true;
  }
  return any;
}
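// The boolean result reports whether any FULLLOG queries were queued above,
// i.e. whether new missing-object sources may still turn up.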
/* Build the might_have_unfound set.
 *
 * This is used by the primary OSD during recovery.
 *
 * This set tracks the OSDs which might have unfound objects that the primary
 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
 * will remove the OSD from the set.
 */
void PeeringState::build_might_have_unfound()
{
  ceph_assert(might_have_unfound.empty());
  ceph_assert(is_primary());

  psdout(10) << __func__ << dendl;

  check_past_interval_bounds();

  might_have_unfound = past_intervals.get_might_have_unfound(
    pg_whoami,
    pool.info.is_erasure());

  // include any (stray) peers
  for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
    might_have_unfound.insert(p->first);

  psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
}
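/* activate() runs as a PG transitions toward active. Roughly: every shard
 * fixes up last_epoch_started/last_interval_started and persists info and
 * log; the primary additionally initializes snap trimming, brings each
 * replica up to date (info-only, incremental log, or full backfill), and
 * seeds missing_loc with recovery sources. */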
void PeeringState::activate(
  ObjectStore::Transaction& t,
  epoch_t activation_epoch,
  PeeringCtxWrapper &ctx)
{
  ceph_assert(!is_peered());

  // twiddle pg state
  state_clear(PG_STATE_DOWN);

  send_notify = false;

  if (is_primary()) {
    // only update primary last_epoch_started if we will go active
    if (acting_set_writeable()) {
      ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
                  info.last_epoch_started <= activation_epoch);
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  } else if (is_acting(pg_whoami)) {
    /* update last_epoch_started on acting replica to whatever the primary sent
     * unless it's smaller (could happen if we are going peered rather than
     * active, see doc/dev/osd_internals/last_epoch_started.rst) */
    if (info.last_epoch_started < activation_epoch) {
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  }

  auto &missing = pg_log.get_missing();

  min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
  if (is_primary()) {
    last_update_ondisk = info.last_update;
  }
  last_update_applied = info.last_update;
  last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();

  need_up_thru = false;

  // write pg info, log
  dirty_info = true;
  dirty_big_info = true; // maybe

  pl->schedule_event_on_commit(
    t,
    std::make_shared<PGPeeringEvent>(
      get_osdmap_epoch(),
      get_osdmap_epoch(),
      ActivateCommitted(
        get_osdmap_epoch(),
        activation_epoch)));

  // init complete pointer
  if (missing.num_missing() == 0) {
    psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
               << " -> " << info.last_update << dendl;
    info.last_complete = info.last_update;
    info.stats.stats.sum.num_objects_missing = 0;
    pg_log.reset_recovery_pointers();
  } else {
    psdout(10) << "activate - not complete, " << missing << dendl;
    info.stats.stats.sum.num_objects_missing = missing.num_missing();
    pg_log.activate_not_complete(info);
  }

  log_weirdness();

  if (is_primary()) {
    // initialize snap_trimq
    interval_set<snapid_t> to_trim;
    auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
    auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
    if (p != removed_snaps_queue.end()) {
      dout(20) << "activate - purged_snaps " << info.purged_snaps
               << " removed_snaps " << p->second
               << dendl;
      for (auto q : p->second) {
        to_trim.insert(q.first, q.second);
      }
    }
    interval_set<snapid_t> purged;
    purged.intersection_of(to_trim, info.purged_snaps);
    to_trim.subtract(purged);

    assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
    renew_lease(pl->get_mnow());
    // do not schedule until we are actually activated

    // adjust purged_snaps: PG may have been inactive while snaps were pruned
    // from the removed_snaps_queue in the osdmap. update local purged_snaps
    // to reflect only those snaps that we thought were pruned and were still
    // in the queue.
    info.purged_snaps.swap(purged);

    // start up replicas
    if (prior_readable_down_osds.empty()) {
      dout(10) << __func__ << " no prior_readable_down_osds to wait on, clearing ub"
               << dendl;
      clear_prior_readable_until_ub();
    }
    info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                                 prior_readable_until_ub);

    ceph_assert(!acting_recovery_backfill.empty());
    for (auto i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      if (*i == pg_whoami) continue;
      pg_shard_t peer = *i;
      ceph_assert(peer_info.count(peer));
      pg_info_t& pi = peer_info[peer];

      psdout(10) << "activate peer osd." << peer << " " << pi << dendl;

#if defined(WITH_SEASTAR)
      MURef<MOSDPGLog> m;
#else
      MRef<MOSDPGLog> m;
#endif
      ceph_assert(peer_missing.count(peer));
      pg_missing_t& pm = peer_missing[peer];

      bool needs_past_intervals = pi.dne();

      // Save num_bytes for backfill reservation request, can't be negative
      peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);

      if (pi.last_update == info.last_update) {
        // empty log
        if (!pi.last_backfill.is_max())
          pl->get_clog_info() << info.pgid << " continuing backfill to osd."
                              << peer
                              << " from (" << pi.log_tail << "," << pi.last_update
                              << "] " << pi.last_backfill
                              << " to " << info.last_update;
        if (!pi.is_empty()) {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, queueing in pending_activators" << dendl;
          ctx.send_info(
            peer.osd,
            spg_t(info.pgid.pgid, peer.shard),
            get_osdmap_epoch(), // fixme: use lower epoch?
            get_osdmap_epoch(),
            info,
            get_lease());
        } else {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, but sending pg_log anyway" << dendl;
          m = TOPNSPC::make_message<MOSDPGLog>(
            i->shard, pg_whoami.shard,
            get_osdmap_epoch(), info,
            last_peering_reset);
        }
      } else if (
        pg_log.get_tail() > pi.last_update ||
        pi.last_backfill == hobject_t() ||
        (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
        /* ^ This last case covers a situation where a replica is not contiguous
         * with the auth_log, but is contiguous with this replica. Reshuffling
         * the active set to handle this would be tricky, so instead we just go
         * ahead and backfill it anyway. This is probably preferable in any
         * case since the replica in question would have to be significantly
         * behind.
         */
        // backfill
        pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
                             << " from (" << pi.log_tail << "," << pi.last_update
                             << "] " << pi.last_backfill
                             << " to " << info.last_update;

        pi.last_update = info.last_update;
        pi.last_complete = info.last_update;
        pi.set_last_backfill(hobject_t());
        pi.last_epoch_started = info.last_epoch_started;
        pi.last_interval_started = info.last_interval_started;
        pi.history = info.history;
        pi.hit_set = info.hit_set;
        pi.stats.stats.clear();
        pi.stats.stats.sum.num_bytes = peer_bytes[peer];

        // initialize peer with our purged_snaps.
        pi.purged_snaps = info.purged_snaps;

        m = TOPNSPC::make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), pi,
          last_peering_reset /* epoch to create pg at */);

        // send some recent log, so that op dup detection works well.
        m->log.copy_up_to(cct, pg_log.get_log(),
                          cct->_conf->osd_max_pg_log_entries);
        m->info.log_tail = m->log.tail;
        pi.log_tail = m->log.tail;  // sigh...

        pm.clear();
      } else {
        // catch up
        ceph_assert(pg_log.get_tail() <= pi.last_update);
        m = TOPNSPC::make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), info,
          last_peering_reset /* epoch to create pg at */);
        // send new stuff to append to replicas log
        m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
      }

      // share past_intervals if we are creating the pg on the replica
      // based on whether our info for that peer was dne() *before*
      // updating pi.history in the backfill block above.
      if (m && needs_past_intervals)
        m->past_intervals = past_intervals;

      // update local version of peer's missing list!
      if (m && pi.last_backfill != hobject_t()) {
        for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
          if (p->soid <= pi.last_backfill &&
              !p->is_error()) {
            if (perform_deletes_during_peering() && p->is_delete()) {
              pm.rm(p->soid, p->version);
            } else {
              pm.add_next_event(*p);
            }
          }
        }
      }

      if (m) {
        dout(10) << "activate peer osd." << peer << " sending " << m->log
                 << dendl;
        m->lease = get_lease();
        pl->send_cluster_message(peer.osd, std::move(m), get_osdmap_epoch());
      }

      // peer now has
      pi.last_update = info.last_update;

      // update our missing
      if (pm.num_missing() == 0) {
        pi.last_complete = pi.last_update;
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " uptodate" << dendl;
      } else {
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " missing " << pm << dendl;
      }
    }

    // Set up missing_loc
    set<pg_shard_t> complete_shards;
    for (auto i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      psdout(20) << __func__ << " setting up missing_loc from shard " << *i
                 << " " << dendl;
      if (*i == get_primary()) {
        missing_loc.add_active_missing(missing);
        if (!missing.have_missing())
          complete_shards.insert(*i);
      } else {
        auto peer_missing_entry = peer_missing.find(*i);
        ceph_assert(peer_missing_entry != peer_missing.end());
        missing_loc.add_active_missing(peer_missing_entry->second);
        if (!peer_missing_entry->second.have_missing() &&
            peer_info[*i].last_backfill.is_max())
          complete_shards.insert(*i);
      }
    }

    // If necessary, create might_have_unfound to help us find our unfound objects.
    // NOTE: It's important that we build might_have_unfound before trimming the
    // past intervals.
    might_have_unfound.clear();
    if (needs_recovery()) {
      // If only one shard has missing, we do a trick to add all others as recovery
      // sources; this is considered safe since the PGLogs have been merged locally,
      // and covers the vast majority of the use cases, like one OSD/host being down
      // for a while for hardware repair
      if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
        missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
      } else {
        missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
                                    ctx.handle);
        for (auto i = acting_recovery_backfill.begin();
             i != acting_recovery_backfill.end();
             ++i) {
          if (*i == pg_whoami) continue;
          psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
          ceph_assert(peer_missing.count(*i));
          ceph_assert(peer_info.count(*i));
          missing_loc.add_source_info(
            *i,
            peer_info[*i],
            peer_missing[*i],
            ctx.handle);
        }
      }
      for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
        if (is_acting_recovery_backfill(i->first))
          continue;
        ceph_assert(peer_info.count(i->first));
        search_for_missing(
          peer_info[i->first],
          i->second,
          i->first,
          ctx);
      }

      build_might_have_unfound();

      // Always call now so update_calc_stats() will be accurate
      discover_all_missing(ctx.msgs);
    }

    // num_objects_degraded if calculated should reflect this too, unless no
    // missing and we are about to go clean.
    if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
      state_set(PG_STATE_UNDERSIZED);
    }

    state_set(PG_STATE_ACTIVATING);
    pl->on_activate(std::move(to_trim));
  }
  if (acting_set_writeable()) {
    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    pg_log.roll_forward(rollbacker.get());
  }
}
void PeeringState::share_pg_info()
{
  psdout(10) << "share_pg_info" << dendl;

  info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                               prior_readable_until_ub);

  // share new pg_info_t with replicas
  ceph_assert(!acting_recovery_backfill.empty());
  for (auto pg_shard : acting_recovery_backfill) {
    if (pg_shard == pg_whoami) continue;
    if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
      peer->second.last_epoch_started = info.last_epoch_started;
      peer->second.last_interval_started = info.last_interval_started;
      peer->second.history.merge(info.history);
    }
    auto m = TOPNSPC::make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
                                                info,
                                                get_osdmap_epoch(),
                                                get_osdmap_epoch(),
                                                std::optional<pg_lease_t>{get_lease()},
                                                std::nullopt);
    pl->send_cluster_message(pg_shard.osd, std::move(m), get_osdmap_epoch());
  }
}
void PeeringState::merge_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &&olog,
  pg_shard_t from)
{
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  pg_log.merge_log(
    oinfo, std::move(olog), from, info, rollbacker.get(),
    dirty_info, dirty_big_info);
}
void PeeringState::rewind_divergent_log(
  ObjectStore::Transaction& t, eversion_t newhead)
{
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  pg_log.rewind_divergent_log(
    newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
}
void PeeringState::proc_primary_info(
  ObjectStore::Transaction &t, const pg_info_t &oinfo)
{
  ceph_assert(!is_primary());

  update_history(oinfo.history);
  if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
    info.stats.stats.sum.num_scrub_errors = 0;
    info.stats.stats.sum.num_shallow_scrub_errors = 0;
    info.stats.stats.sum.num_deep_scrub_errors = 0;
    dirty_info = true;
  }

  if (!(info.purged_snaps == oinfo.purged_snaps)) {
    psdout(10) << __func__ << " updating purged_snaps to "
               << oinfo.purged_snaps
               << dendl;
    info.purged_snaps = oinfo.purged_snaps;
    dirty_info = true;
    dirty_big_info = true;
  }
}
void PeeringState::proc_master_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo,
  pg_log_t &&olog, pg_missing_t &&omissing, pg_shard_t from)
{
  psdout(10) << "proc_master_log for osd." << from << ": "
             << olog << " " << omissing << dendl;
  ceph_assert(!is_peered() && is_primary());

  // merge log into our own log to build master log. no need to
  // make any adjustments to their missing map; we are taking their
  // log to be authoritative (i.e., their entries are by definition
  // non-divergent).
  merge_log(t, oinfo, std::move(olog), from);
  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now " << oinfo
             << " " << omissing << dendl;
  might_have_unfound.insert(from);

  // See doc/dev/osd_internals/last_epoch_started
  if (oinfo.last_epoch_started > info.last_epoch_started) {
    info.last_epoch_started = oinfo.last_epoch_started;
    dirty_info = true;
  }
  if (oinfo.last_interval_started > info.last_interval_started) {
    info.last_interval_started = oinfo.last_interval_started;
    dirty_info = true;
  }
  update_history(oinfo.history);
  ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
              info.last_epoch_started >= info.history.last_epoch_started);

  peer_missing[from].claim(std::move(omissing));
}
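// proc_replica_log() below is the non-authoritative counterpart to
// proc_master_log(): the replica's log is only used to detect divergence
// and to seed peer_info/peer_missing; our own log is left untouched.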
void PeeringState::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t &&omissing,
  pg_shard_t from)
{
  psdout(10) << "proc_replica_log for osd." << from << ": "
             << oinfo << " " << olog << " " << omissing << dendl;

  pg_log.proc_replica_log(oinfo, olog, omissing, from);

  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now "
             << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  for (auto i = omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    psdout(20) << " after missing " << i->first
               << " need " << i->second.need
               << " have " << i->second.have << dendl;
  }
  peer_missing[from].claim(std::move(omissing));
}
void PeeringState::fulfill_info(
  pg_shard_t from, const pg_query_t &query,
  pair<pg_shard_t, pg_info_t> &notify_info)
{
  ceph_assert(from == primary);
  ceph_assert(query.type == pg_query_t::INFO);

  // info
  psdout(10) << "sending info" << dendl;
  notify_info = make_pair(from, info);
}
void PeeringState::fulfill_log(
  pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
{
  psdout(10) << "log request from " << from << dendl;
  ceph_assert(from == primary);
  ceph_assert(query.type != pg_query_t::INFO);

  auto mlog = TOPNSPC::make_message<MOSDPGLog>(
    from.shard, pg_whoami.shard,
    get_osdmap_epoch(),
    info, query_epoch);
  mlog->missing = pg_log.get_missing();

  // primary -> other, when building master log
  if (query.type == pg_query_t::LOG) {
    psdout(10) << " sending info+missing+log since " << query.since
               << dendl;
    if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
      pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
                           << query.since
                           << " when my log.tail is " << pg_log.get_tail()
                           << ", sending full log instead";
      mlog->log = pg_log.get_log();  // primary should not have requested this!!
    } else {
      mlog->log.copy_after(cct, pg_log.get_log(), query.since);
    }
  }
  else if (query.type == pg_query_t::FULLLOG) {
    psdout(10) << " sending info+missing+full log" << dendl;
    mlog->log = pg_log.get_log();
  }

  psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;

  pl->send_cluster_message(from.osd, std::move(mlog), get_osdmap_epoch(), true);
}
void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
{
  if (query.query.type == pg_query_t::INFO) {
    pair<pg_shard_t, pg_info_t> notify_info;
    // note this refreshes our prior_readable_until_ub value
    update_history(query.query.history);
    fulfill_info(query.from, query.query, notify_info);
    rctx.send_notify(
      notify_info.first.osd,
      pg_notify_t(
        notify_info.first.shard, pg_whoami.shard,
        query.query_epoch,
        get_osdmap_epoch(),
        notify_info.second,
        past_intervals));
  } else {
    update_history(query.query.history);
    fulfill_log(query.from, query.query, query.query_epoch);
  }
}
void PeeringState::try_mark_clean()
{
  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
    state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    state_set(PG_STATE_CLEAN);
    info.history.last_epoch_clean = get_osdmap_epoch();
    info.history.last_interval_clean = info.history.same_interval_since;
    past_intervals.clear();
    dirty_big_info = true;
    dirty_info = true;
  }

  if (!is_active() && is_peered()) {
    if (is_clean()) {
      bool target;
      if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
        if (target) {
          psdout(10) << "ready to merge (target)" << dendl;
          pl->set_ready_to_merge_target(
            info.last_update,
            info.history.last_epoch_started,
            info.history.last_epoch_clean);
        } else {
          psdout(10) << "ready to merge (source)" << dendl;
          pl->set_ready_to_merge_source(info.last_update);
        }
      }
    } else {
      psdout(10) << "not clean, not ready to merge" << dendl;
      // we should have notified OSD in Active state entry point
    }
  }

  state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);

  share_pg_info();
  pl->publish_stats_to_osd();
  clear_recovery_state();
}
void PeeringState::split_into(
  pg_t child_pgid, PeeringState *child, unsigned split_bits)
{
  child->update_osdmap_ref(get_osdmap());
  child->pool = pool;

  // Log
  pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
  child->info.last_complete = info.last_complete;

  info.last_update = pg_log.get_head();
  child->info.last_update = child->pg_log.get_head();

  child->info.last_user_version = info.last_user_version;

  info.log_tail = pg_log.get_tail();
  child->info.log_tail = child->pg_log.get_tail();

  // reset last_complete, we might have modified pg_log & missing above
  pg_log.reset_complete_to(&info);
  child->pg_log.reset_complete_to(&child->info);

  // Info
  child->info.history = info.history;
  child->info.history.epoch_created = get_osdmap_epoch();
  child->info.purged_snaps = info.purged_snaps;

  if (info.last_backfill.is_max()) {
    child->info.set_last_backfill(hobject_t::get_max());
  } else {
    // restart backfill on parent and child to be safe. we could
    // probably do better in the bitwise sort case, but it's more
    // fragile (there may be special work to do on backfill completion
    // in the future).
    info.set_last_backfill(hobject_t());
    child->info.set_last_backfill(hobject_t());
    // restarting backfill implies that the missing set is empty,
    // since it is only used for objects prior to last_backfill
    pg_log.reset_backfill();
    child->pg_log.reset_backfill();
  }

  child->info.stats = info.stats;
  child->info.stats.parent_split_bits = split_bits;
  info.stats.stats_invalid = true;
  child->info.stats.stats_invalid = true;
  child->info.last_epoch_started = info.last_epoch_started;
  child->info.last_interval_started = info.last_interval_started;

  // There can't be recovery/backfill going on now
  int primary, up_primary;
  vector<int> newup, newacting;
  get_osdmap()->pg_to_up_acting_osds(
    child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
  child->init_primary_up_acting(
    newup,
    newacting,
    up_primary,
    primary);
  child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);

  // this comparison includes primary rank via pg_shard_t
  if (get_primary() != child->get_primary())
    child->info.history.same_primary_since = get_osdmap_epoch();

  child->info.stats.up = newup;
  child->info.stats.up_primary = up_primary;
  child->info.stats.acting = newacting;
  child->info.stats.acting_primary = primary;
  child->info.stats.mapping_epoch = get_osdmap_epoch();

  // History
  child->past_intervals = past_intervals;

  child->on_new_interval();

  child->send_notify = !child->is_primary();

  child->dirty_info = true;
  child->dirty_big_info = true;
  dirty_info = true;
  dirty_big_info = true;
}
void PeeringState::merge_from(
  map<spg_t,PeeringState *>& sources,
  PeeringCtx &rctx,
  unsigned split_bits,
  const pg_merge_meta_t& last_pg_merge_meta)
{
  bool incomplete = false;
  if (info.last_complete != info.last_update ||
      info.is_incomplete() ||
      info.dne()) {
    psdout(10) << __func__ << " target incomplete" << dendl;
    incomplete = true;
  }
  if (last_pg_merge_meta.source_pgid != pg_t()) {
    if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
      psdout(10) << __func__ << " target doesn't match expected parent "
                 << last_pg_merge_meta.source_pgid.get_parent()
                 << " of source_pgid " << last_pg_merge_meta.source_pgid
                 << dendl;
      incomplete = true;
    }
    if (info.last_update != last_pg_merge_meta.target_version) {
      psdout(10) << __func__ << " target version doesn't match expected "
                 << last_pg_merge_meta.target_version << dendl;
      incomplete = true;
    }
  }

  PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
  pg_log.roll_forward(handler.get());

  info.last_complete = info.last_update;  // to fake out trim()
  pg_log.reset_recovery_pointers();
  pg_log.trim(info.last_update, info);

  vector<PGLog*> log_from;
  for (auto& i : sources) {
    auto& source = i.second;
    if (!source) {
      psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
      incomplete = true;
      continue;
    }
    if (source->info.last_complete != source->info.last_update ||
        source->info.is_incomplete() ||
        source->info.dne()) {
      psdout(10) << __func__ << " source " << source->pg_whoami
                 << " incomplete" << dendl;
      incomplete = true;
    }
    if (last_pg_merge_meta.source_pgid != pg_t()) {
      if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
        dout(10) << __func__ << " source " << source->info.pgid.pgid
                 << " doesn't match expected source pgid "
                 << last_pg_merge_meta.source_pgid << dendl;
        incomplete = true;
      }
      if (source->info.last_update != last_pg_merge_meta.source_version) {
        dout(10) << __func__ << " source version doesn't match expected "
                 << last_pg_merge_meta.target_version << dendl;
        incomplete = true;
      }
    }

    // prepare to merge logs
    PGLog::LogEntryHandlerRef handler{
      source->pl->get_log_handler(rctx.transaction)};
    source->pg_log.roll_forward(handler.get());
    source->info.last_complete = source->info.last_update;  // to fake out trim()
    source->pg_log.reset_recovery_pointers();
    source->pg_log.trim(source->info.last_update, source->info);
    log_from.push_back(&source->pg_log);

    // combine stats
    info.stats.add(source->info.stats);

    // pull up last_update
    info.last_update = std::max(info.last_update, source->info.last_update);

    // adopt source's PastIntervals if target has none. we can do this since
    // pgp_num has been reduced prior to the merge, so the OSD mappings for
    // the PGs are identical.
    if (past_intervals.empty() && !source->past_intervals.empty()) {
      psdout(10) << __func__ << " taking source's past_intervals" << dendl;
      past_intervals = source->past_intervals;
    }
  }

  info.last_complete = info.last_update;
  info.log_tail = info.last_update;
  if (incomplete) {
    info.last_backfill = hobject_t();
  }

  // merge logs
  pg_log.merge_from(log_from, info.last_update);

  // make sure we have a meaningful last_epoch_started/clean (if we were a
  // placeholder)
  if (info.history.epoch_created == 0) {
    // start with (a) source's history, since these PGs *should* have been
    // remapped in concert with each other...
    info.history = sources.begin()->second->info.history;

    // we use the last_epoch_{started,clean} we got from the caller, which
    // are the epochs that were reported when the PGs were found to be ready
    // for merge.
    info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
    info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
    info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
    psdout(10) << __func__
               << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
               << last_pg_merge_meta.last_epoch_clean
               << " from pool last_dec_*, source pg history was "
               << sources.begin()->second->info.history
               << dendl;

    // above we have pulled down source's history and we need to check
    // history.epoch_created again to confirm that source is not a placeholder
    // too. (peering requires a sane history.same_interval_since value for any
    // non-newly created pg and below here we know we are basically iterating
    // back a series of past maps to fake a merge process, hence we need to
    // fix history.same_interval_since first so that start_peering_interval()
    // will not complain)
    if (info.history.epoch_created == 0) {
      dout(10) << __func__ << " both merge target and source are placeholders,"
               << " set sis to lec " << info.history.last_epoch_clean
               << dendl;
      info.history.same_interval_since = info.history.last_epoch_clean;
    }

    // if the past_intervals start is later than last_epoch_clean, it
    // implies the source repeered again but the target didn't, or
    // that the source became clean in a later epoch than the target.
    // avoid the discrepancy by adjusting the interval start
    // backwards to match so that check_past_interval_bounds() will
    // not complain.
    auto pib = past_intervals.get_bounds();
    if (info.history.last_epoch_clean < pib.first) {
      psdout(10) << __func__ << " last_epoch_clean "
                 << info.history.last_epoch_clean << " < past_interval start "
                 << pib.first << ", adjusting start backwards" << dendl;
      past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
    }

    // Similarly, if the same_interval_since value is later than
    // last_epoch_clean, the next interval change will result in a
    // past_interval start that is later than last_epoch_clean. This
    // can happen if we use the pg_history values from the merge
    // source. Adjust the same_interval_since value backwards if that
    // happens. (We trust the les and lec values more because they came from
    // the real target, whereas the history value we stole from the source.)
    if (info.history.last_epoch_started < info.history.same_interval_since) {
      psdout(10) << __func__ << " last_epoch_started "
                 << info.history.last_epoch_started << " < same_interval_since "
                 << info.history.same_interval_since
                 << ", adjusting pg_history backwards" << dendl;
      info.history.same_interval_since = info.history.last_epoch_clean;
      // make sure same_{up,primary}_since are <= same_interval_since
      info.history.same_up_since = std::min(
        info.history.same_up_since, info.history.same_interval_since);
      info.history.same_primary_since = std::min(
        info.history.same_primary_since, info.history.same_interval_since);
    }
  }

  dirty_info = true;
  dirty_big_info = true;
}
void PeeringState::start_split_stats(
  const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
{
  out->resize(childpgs.size() + 1);
  info.stats.stats.sum.split(*out);
}
void PeeringState::finish_split_stats(
  const object_stat_sum_t& stats, ObjectStore::Transaction &t)
{
  info.stats.stats.sum = stats;
  write_if_dirty(t);
}
void PeeringState::update_blocked_by()
{
  // set a max on the number of blocking peers we report. if we go
  // over, report a random subset. keep the result sorted.
  unsigned keep = std::min<unsigned>(
    blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
  unsigned skip = blocked_by.size() - keep;
  info.stats.blocked_by.clear();
  info.stats.blocked_by.resize(keep);
  unsigned pos = 0;
  for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
    if (skip > 0 && (rand() % (skip + keep) < skip)) {
      --skip;
    } else {
      info.stats.blocked_by[pos++] = *p;
      --keep;
    }
  }
}
static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
{
  for (auto& p : pgs) {
    if (p.shard == shard)
      return true;
  }
  return false;
}

static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
{
  for (auto& p : pgs) {
    if (p == skip)
      continue;
    if (p.shard == shard)
      return p;
  }
  return pg_shard_t();
}
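/* update_calc_stats() recomputes the derived values published in pg_stats:
 * log sizes, copy counts, and the degraded/misplaced/unfound object sums.
 * As the comments below put it, a degraded object has fewer replicas or EC
 * shards than the pool specifies, while a misplaced object is intact but
 * not stored on the correct OSD. */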
3503 void PeeringState::update_calc_stats()
3505 info
.stats
.version
= info
.last_update
;
3506 info
.stats
.created
= info
.history
.epoch_created
;
3507 info
.stats
.last_scrub
= info
.history
.last_scrub
;
3508 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
3509 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
3510 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
3511 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
3512 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
3514 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
3515 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
3516 info
.stats
.log_start
= pg_log
.get_tail();
3517 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
3518 info
.stats
.snaptrimq_len
= pl
->get_snap_trimq_size();
3520 unsigned num_shards
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
3522 // In rare case that upset is too large (usually transient), use as target
3523 // for calculations below.
3524 unsigned target
= std::max(num_shards
, (unsigned)upset
.size());
3525 // For undersized actingset may be larger with OSDs out
3526 unsigned nrep
= std::max(actingset
.size(), upset
.size());
3527 // calc num_object_copies
3528 info
.stats
.stats
.calc_copies(std::max(target
, nrep
));
3529 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
3530 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
3531 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
3532 info
.stats
.avail_no_missing
.clear();
3533 info
.stats
.object_location_counts
.clear();
3535 // We should never hit this condition, but if end up hitting it,
3536 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3537 if (info
.stats
.stats
.sum
.num_objects
< 0) {
3538 psdout(0) << __func__
<< " negative num_objects = "
3539 << info
.stats
.stats
.sum
.num_objects
<< " setting it to 0 "
3541 info
.stats
.stats
.sum
.num_objects
= 0;
3542 state_set(PG_STATE_INCONSISTENT
);
3545 if ((is_remapped() || is_undersized() || !is_clean()) &&
3546 (is_peered()|| is_activating())) {
3547 psdout(20) << __func__
<< " actingset " << actingset
<< " upset "
3548 << upset
<< " acting_recovery_backfill " << acting_recovery_backfill
<< dendl
;
3550 ceph_assert(!acting_recovery_backfill
.empty());
3552 bool estimate
= false;
3554 // NOTE: we only generate degraded, misplaced and unfound
3555 // values for the summation, not individual stat categories.
3556 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
3558 // Objects missing from up nodes, sorted by # objects.
3559 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> missing_target_objects
;
3560 // Objects missing from nodes not in up, sort by # objects
3561 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> acting_source_objects
;
3563 // Fill missing_target_objects/acting_source_objects
3569 missing
= pg_log
.get_missing().num_missing();
3570 ceph_assert(acting_recovery_backfill
.count(pg_whoami
));
3571 if (upset
.count(pg_whoami
)) {
3572 missing_target_objects
.emplace(missing
, pg_whoami
);
3574 acting_source_objects
.emplace(missing
, pg_whoami
);
3576 info
.stats
.stats
.sum
.num_objects_missing_on_primary
= missing
;
3578 info
.stats
.avail_no_missing
.push_back(pg_whoami
);
3579 psdout(20) << __func__
<< " shard " << pg_whoami
3580 << " primary objects " << num_objects
3581 << " missing " << missing
3586 for (auto& peer
: peer_info
) {
3587 // Primary should not be in the peer_info, skip if it is.
3588 if (peer
.first
== pg_whoami
) continue;
3589 int64_t missing
= 0;
3590 int64_t peer_num_objects
=
3591 std::max((int64_t)0, peer
.second
.stats
.stats
.sum
.num_objects
);
3592 // Backfill targets always track num_objects accurately
3593 // all other peers track missing accurately.
3594 if (is_backfill_target(peer
.first
)) {
3595 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3597 if (peer_missing
.count(peer
.first
)) {
3598 missing
= peer_missing
[peer
.first
].num_missing();
3600 psdout(20) << __func__
<< " no peer_missing found for "
3601 << peer
.first
<< dendl
;
3602 if (is_recovering()) {
3605 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3608 if (upset
.count(peer
.first
)) {
3609 missing_target_objects
.emplace(missing
, peer
.first
);
3610 } else if (actingset
.count(peer
.first
)) {
3611 acting_source_objects
.emplace(missing
, peer
.first
);
3613 peer
.second
.stats
.stats
.sum
.num_objects_missing
= missing
;
3615 info
.stats
.avail_no_missing
.push_back(peer
.first
);
3616 psdout(20) << __func__
<< " shard " << peer
.first
3617 << " objects " << peer_num_objects
3618 << " missing " << missing
3622 // Compute object_location_counts
3623 for (auto& ml
: missing_loc
.get_missing_locs()) {
3624 info
.stats
.object_location_counts
[ml
.second
]++;
3625 psdout(30) << __func__
<< " " << ml
.first
<< " object_location_counts["
3626 << ml
.second
<< "]=" << info
.stats
.object_location_counts
[ml
.second
]
3629 int64_t not_missing
= num_objects
- missing_loc
.get_missing_locs().size();
3631 // During recovery we know upset == actingset and is being populated
3632 // During backfill we know that all non-missing objects are in the actingset
3633 info
.stats
.object_location_counts
[actingset
] = not_missing
;
3635 psdout(30) << __func__
<< " object_location_counts["
3636 << upset
<< "]=" << info
.stats
.object_location_counts
[upset
]
3638 psdout(20) << __func__
<< " object_location_counts "
3639 << info
.stats
.object_location_counts
<< dendl
;
3641 // A misplaced object is not stored on the correct OSD
3642 int64_t misplaced
= 0;
3643 // a degraded objects has fewer replicas or EC shards than the pool specifies.
3644 int64_t degraded
= 0;
3646 if (is_recovering()) {
3647 for (auto& sml
: missing_loc
.get_missing_by_count()) {
3648 for (auto& ml
: sml
.second
) {
3650 if (sml
.first
== shard_id_t::NO_SHARD
) {
3651 psdout(20) << __func__
<< " ml " << ml
.second
3652 << " upset size " << upset
.size()
3653 << " up " << ml
.first
.up
<< dendl
;
3654 missing_shards
= (int)upset
.size() - ml
.first
.up
;
3656 // Handle shards not even in upset below
3657 if (!find_shard(upset
, sml
.first
))
3659 missing_shards
= std::max(0, 1 - ml
.first
.up
);
3660 psdout(20) << __func__
3661 << " shard " << sml
.first
3662 << " ml " << ml
.second
3663 << " missing shards " << missing_shards
<< dendl
;
3665 int odegraded
= ml
.second
* missing_shards
;
3666 // Copies on other osds but limited to the possible degraded
3667 int more_osds
= std::min(missing_shards
, ml
.first
.other
);
3668 int omisplaced
= ml
.second
* more_osds
;
3669 ceph_assert(omisplaced
<= odegraded
	  );
          odegraded -= omisplaced;

          misplaced += omisplaced;
          degraded += odegraded;
        }
      }

      psdout(20) << __func__ << " missing based degraded "
                 << degraded << dendl;
      psdout(20) << __func__ << " missing based misplaced "
                 << misplaced << dendl;

      // Handle undersized case
      if (pool.info.is_replicated()) {
        // Add degraded for missing targets (num_objects missing)
        ceph_assert(target >= upset.size());
        unsigned needed = target - upset.size();
        degraded += num_objects * needed;
      } else {
        for (unsigned i = 0 ; i < num_shards; ++i) {
          shard_id_t shard(i);

          if (!find_shard(upset, shard)) {
            pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);

            if (pgs != pg_shard_t()) {
              int64_t missing;

              if (pgs == pg_whoami)
                missing = info.stats.stats.sum.num_objects_missing_on_primary;
              else
                missing = peer_info[pgs].stats.stats.sum.num_objects_missing;

              degraded += missing;
              misplaced += std::max((int64_t)0, num_objects - missing);
            } else {
              // No shard anywhere
              degraded += num_objects;
            }
          }
        }
      }
      goto out;
    }

    // Handle undersized case
    if (pool.info.is_replicated()) {
      // Add to missing_target_objects
      ceph_assert(target >= missing_target_objects.size());
      unsigned needed = target - missing_target_objects.size();
      if (needed)
        missing_target_objects.emplace(num_objects * needed,
                                       pg_shard_t(pg_shard_t::NO_OSD));
    } else {
      for (unsigned i = 0 ; i < num_shards; ++i) {
        shard_id_t shard(i);
        bool found = false;
        for (const auto& t : missing_target_objects) {
          if (std::get<1>(t).shard == shard) {
            found = true;
            break;
          }
        }
        if (!found)
          missing_target_objects.emplace(num_objects,
                                         pg_shard_t(pg_shard_t::NO_OSD, shard));
      }
    }

    for (const auto& item : missing_target_objects)
      psdout(20) << __func__ << " missing shard " << std::get<1>(item)
                 << " missing= " << std::get<0>(item) << dendl;
    for (const auto& item : acting_source_objects)
      psdout(20) << __func__ << " acting shard " << std::get<1>(item)
                 << " missing= " << std::get<0>(item) << dendl;

    // Handle all objects not in missing for remapped
    // or backfill
    for (auto m = missing_target_objects.rbegin();
         m != missing_target_objects.rend(); ++m) {

      int64_t extra_missing = -1;

      if (pool.info.is_replicated()) {
        if (!acting_source_objects.empty()) {
          auto extra_copy = acting_source_objects.begin();
          extra_missing = std::get<0>(*extra_copy);
          acting_source_objects.erase(extra_copy);
        }
      } else { // Erasure coded
        // Use corresponding shard
        for (const auto& a : acting_source_objects) {
          if (std::get<1>(a).shard == std::get<1>(*m).shard) {
            extra_missing = std::get<0>(a);
            acting_source_objects.erase(a);
            break;
          }
        }
      }

      if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
        // We don't know which of the objects on the target
        // are part of extra_missing so assume are all degraded.
        misplaced += std::get<0>(*m) - extra_missing;
        degraded += extra_missing;
      } else {
        // 1. extra_missing == -1, more targets than sources so degraded
        // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
        //    previously degraded are now present on the target.
        degraded += std::get<0>(*m);
      }
    }
    // If there are still acting that haven't been accounted for
    // then they are misplaced
    for (const auto& a : acting_source_objects) {
      int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
      psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
                 << dendl;
      misplaced += extra_misplaced;
    }
out:
    // NOTE: Tests use these messages to verify this code
    psdout(20) << __func__ << " degraded " << degraded
               << (estimate ? " (est)": "") << dendl;
    psdout(20) << __func__ << " misplaced " << misplaced
               << (estimate ? " (est)": "") << dendl;

    info.stats.stats.sum.num_objects_degraded = degraded;
    info.stats.stats.sum.num_objects_unfound = get_num_unfound();
    info.stats.stats.sum.num_objects_misplaced = misplaced;
  }
}
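
/* prepare_stats_for_publish() throttles stat publication: if nothing has
 * changed since the last published copy and the last report is newer than
 * osd_pg_stat_report_interval_max, it returns std::nullopt so the caller
 * skips the send.  It builds and returns a pre_publish copy rather than
 * mutating the published stats directly, which keeps the not-yet-durable
 * unstable_stats out of info.stats itself.
 */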
std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
  const std::optional<pg_stat_t> &pg_stats_publish,
  const object_stat_collection_t &unstable_stats)
{
  if (info.stats.stats.sum.num_scrub_errors) {
    psdout(10) << __func__ << " inconsistent due to " <<
      info.stats.stats.sum.num_scrub_errors << " scrub errors" << dendl;
    state_set(PG_STATE_INCONSISTENT);
  } else {
    state_clear(PG_STATE_INCONSISTENT);
    state_clear(PG_STATE_FAILED_REPAIR);
  }

  utime_t now = ceph_clock_now();
  if (info.stats.state != state) {
    info.stats.last_change = now;
    // Optimistic estimation, if we just find out an inactive PG,
    // assume it is active till now.
    if (!(state & PG_STATE_ACTIVE) &&
        (info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_active = now;

    if ((state & PG_STATE_ACTIVE) &&
        !(info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_became_active = now;
    if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
        !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
      info.stats.last_became_peered = now;
    info.stats.state = state;
  }

  update_calc_stats();
  if (info.stats.stats.sum.num_objects_degraded) {
    state_set(PG_STATE_DEGRADED);
  } else {
    state_clear(PG_STATE_DEGRADED);
  }
  update_blocked_by();

  pg_stat_t pre_publish = info.stats;
  pre_publish.stats.add(unstable_stats);
  utime_t cutoff = now;
  cutoff -= cct->_conf->osd_pg_stat_report_interval_max;

  // share (some of) our purged_snaps via the pg_stats. limit # of intervals
  // because we don't want to make the pg_stat_t structures too expensive.
  unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
  unsigned num = 0;
  auto i = info.purged_snaps.begin();
  while (num < max && i != info.purged_snaps.end()) {
    pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
    ++num;
    ++i;
  }
  psdout(20) << __func__ << " reporting purged_snaps "
             << pre_publish.purged_snaps << dendl;

  if (pg_stats_publish && pre_publish == *pg_stats_publish &&
      info.stats.last_fresh > cutoff) {
    psdout(15) << "publish_stats_to_osd " << pg_stats_publish->reported_epoch
               << ": no change since " << info.stats.last_fresh << dendl;
    return std::nullopt;
  } else {
    // update our stat summary and timestamps
    info.stats.reported_epoch = get_osdmap_epoch();
    ++info.stats.reported_seq;

    info.stats.last_fresh = now;

    if (info.stats.state & PG_STATE_CLEAN)
      info.stats.last_clean = now;
    if (info.stats.state & PG_STATE_ACTIVE)
      info.stats.last_active = now;
    if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
      info.stats.last_peered = now;
    info.stats.last_unstale = now;
    if ((info.stats.state & PG_STATE_DEGRADED) == 0)
      info.stats.last_undegraded = now;
    if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
      info.stats.last_fullsized = now;

    psdout(15) << "publish_stats_to_osd " << pre_publish.reported_epoch
               << ":" << pre_publish.reported_seq << dendl;
    return std::make_optional(std::move(pre_publish));
  }
}
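
// init() seeds a freshly instantiated PG: role, up/acting mapping, history
// and past_intervals all come from the caller (create or split), and the
// result is persisted through the supplied transaction via write_if_dirty().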
void PeeringState::init(
  int role,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  const pg_history_t& history,
  const PastIntervals& pi,
  ObjectStore::Transaction &t)
{
  psdout(10) << "init role " << role << " up "
             << newup << " acting " << newacting
             << " history " << history
             << " past_intervals " << pi
             << dendl;

  set_role(role);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  info.history = history;
  past_intervals = pi;

  info.stats.up = up;
  info.stats.up_primary = new_up_primary;
  info.stats.acting = acting;
  info.stats.acting_primary = new_acting_primary;
  info.stats.mapping_epoch = info.history.same_interval_since;

  if (!perform_deletes_during_peering()) {
    pg_log.set_missing_may_contain_deletes();
  }

  on_new_interval();

  dirty_info = true;
  dirty_big_info = true;
  write_if_dirty(t);
}
void PeeringState::dump_peering_state(Formatter *f)
{
  f->dump_string("state", get_pg_state_string());
  f->dump_unsigned("epoch", get_osdmap_epoch());
  f->open_array_section("up");
  for (auto p = up.begin(); p != up.end(); ++p)
    f->dump_unsigned("osd", *p);
  f->close_section();
  f->open_array_section("acting");
  for (auto p = acting.begin(); p != acting.end(); ++p)
    f->dump_unsigned("osd", *p);
  f->close_section();
  if (!backfill_targets.empty()) {
    f->open_array_section("backfill_targets");
    for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  if (!async_recovery_targets.empty()) {
    f->open_array_section("async_recovery_targets");
    for (auto p = async_recovery_targets.begin();
         p != async_recovery_targets.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  if (!acting_recovery_backfill.empty()) {
    f->open_array_section("acting_recovery_backfill");
    for (auto p = acting_recovery_backfill.begin();
         p != acting_recovery_backfill.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  f->open_object_section("info");
  update_calc_stats();
  info.dump(f);
  f->close_section();

  f->open_array_section("peer_info");
  for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
    f->open_object_section("info");
    f->dump_stream("peer") << p->first;
    p->second.dump(f);
    f->close_section();
  }
  f->close_section();
}
void PeeringState::update_stats(
  std::function<bool(pg_history_t &, pg_stat_t &)> f,
  ObjectStore::Transaction *t) {
  if (f(info.history, info.stats)) {
    pl->publish_stats_to_osd();
  }
  pl->reschedule_scrub();

  if (t) {
    dirty_info = true;
    write_if_dirty(*t);
  }
}

void PeeringState::update_stats_wo_resched(
  std::function<void(pg_history_t &, pg_stat_t &)> f)
{
  f(info.history, info.stats);
}
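
/* The next two helpers are the two halves of log-driven missing-set updates:
 * append_log_entries_update_missing() applies a batch of log entries to the
 * local log and missing set, while merge_new_log_entries() (primary only)
 * additionally folds the same entries into each peer's peer_missing and
 * peer_info so that missing_loc stays authoritative for the whole acting set.
 */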
bool PeeringState::append_log_entries_update_missing(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
  std::optional<eversion_t> roll_forward_to)
{
  ceph_assert(!entries.empty());
  ceph_assert(entries.begin()->version > info.last_update);

  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  bool invalidate_stats =
    pg_log.append_new_log_entries(
      info.last_backfill,
      entries,
      rollbacker.get());

  if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
    pg_log.roll_forward(rollbacker.get());
  }
  if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
    last_rollback_info_trimmed_to_applied = *roll_forward_to;
  }

  info.last_update = pg_log.get_head();

  if (pg_log.get_missing().num_missing() == 0) {
    // advance last_complete since nothing else is missing!
    info.last_complete = info.last_update;
  }
  info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;

  psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
             << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
  if (trim_to)
    pg_log.trim(*trim_to, info);
  dirty_info = true;
  write_if_dirty(t);
  return invalidate_stats;
}
void PeeringState::merge_new_log_entries(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObjectStore::Transaction &t,
  std::optional<eversion_t> trim_to,
  std::optional<eversion_t> roll_forward_to)
{
  psdout(10) << __func__ << " " << entries << dendl;
  ceph_assert(is_primary());

  bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
  for (auto i = acting_recovery_backfill.begin();
       i != acting_recovery_backfill.end();
       ++i) {
    pg_shard_t peer(*i);
    if (peer == pg_whoami) continue;
    ceph_assert(peer_missing.count(peer));
    ceph_assert(peer_info.count(peer));
    pg_missing_t& pmissing(peer_missing[peer]);
    psdout(20) << __func__ << " peer_missing for " << peer
               << " = " << pmissing << dendl;
    pg_info_t& pinfo(peer_info[peer]);
    bool invalidate_stats = PGLog::append_log_entries_update_missing(
      pinfo.last_backfill,
      entries,
      true,
      NULL,
      pmissing,
      NULL,
      dpp);
    pinfo.last_update = info.last_update;
    pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
    rebuild_missing = rebuild_missing || invalidate_stats;
  }

  if (!rebuild_missing) {
    return;
  }

  for (auto &&i : entries) {
    missing_loc.rebuild(
      i.soid,
      pg_whoami,
      acting_recovery_backfill,
      info,
      pg_log.get_missing(),
      peer_missing,
      peer_info);
  }
}
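
// add_log_entry() is the per-entry building block used by append_log():
// it maintains the last_update/last_complete/last_user_version invariants
// before handing the entry to pg_log.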
void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
{
  // raise last_complete only if we were previously up to date
  if (info.last_complete == info.last_update)
    info.last_complete = e.version;

  // raise last_update.
  ceph_assert(e.version > info.last_update);
  info.last_update = e.version;

  // raise user_version, if it increased (it may have not get bumped
  // by all logged updates)
  if (e.user_version > info.last_user_version)
    info.last_user_version = e.user_version;

  // log mutation
  pg_log.add(e, applied);
  psdout(10) << "add_log_entry " << e << dendl;
}
void PeeringState::append_log(
  vector<pg_log_entry_t>&& logv,
  eversion_t trim_to,
  eversion_t roll_forward_to,
  eversion_t mlcod,
  ObjectStore::Transaction &t,
  bool transaction_applied,
  bool async)
{
  /* The primary has sent an info updating the history, but it may not
   * have arrived yet.  We want to make sure that we cannot remember this
   * write without remembering that it happened in an interval which went
   * active in epoch history.last_epoch_started.
   */
  if (info.last_epoch_started != info.history.last_epoch_started) {
    info.history.last_epoch_started = info.last_epoch_started;
  }
  if (info.last_interval_started != info.history.last_interval_started) {
    info.history.last_interval_started = info.last_interval_started;
  }
  psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;

  PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
  if (!transaction_applied) {
    /* We must be a backfill or async recovery peer, so it's ok if we apply
     * out-of-turn since we won't be considered when
     * determining a min possible last_update.
     *
     * We skip_rollforward() here, which advances the crt, without
     * doing an actual rollforward. This avoids cleaning up entries
     * from the backend and we do not end up in a situation, where the
     * object is deleted before we can _merge_object_divergent_entries().
     */
    pg_log.skip_rollforward();
  }

  for (auto p = logv.begin(); p != logv.end(); ++p) {
    add_log_entry(*p, transaction_applied);

    /* We don't want to leave the rollforward artifacts around
     * here past last_backfill.  It's ok for the same reason as
     * above */
    if (transaction_applied &&
        p->soid > info.last_backfill) {
      pg_log.roll_forward(handler.get());
    }
  }

  if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(
      roll_forward_to,
      handler.get());
    last_rollback_info_trimmed_to_applied = roll_forward_to;
  }

  psdout(10) << __func__ << " approx pg log length = "
             << pg_log.get_log().approx_size() << dendl;
  psdout(10) << __func__ << " transaction_applied = "
             << transaction_applied << dendl;
  if (!transaction_applied || async)
    psdout(10) << __func__ << " " << pg_whoami
               << " is async_recovery or backfill target" << dendl;
  pg_log.trim(trim_to, info, transaction_applied, async);

  // update the local pg, pg log
  dirty_info = true;
  write_if_dirty(t);

  if (!is_primary())
    min_last_complete_ondisk = mlcod;
}
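
// recover_got() runs once a missing object has been made whole locally.
// The roll_forward_to() below covers a narrow repair-time race where the
// recovered version is newer than can_rollback_to; see the comment inside.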
void PeeringState::recover_got(
  const hobject_t &oid, eversion_t v,
  bool is_delete,
  ObjectStore::Transaction &t)
{
  if (v > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
    pg_log.roll_forward_to(v, handler.get());
  }

  psdout(10) << "got missing " << oid << " v " << v << dendl;
  pg_log.recover_got(oid, v, info);
  if (pg_log.get_log().log.empty()) {
    psdout(10) << "last_complete now " << info.last_complete
               << " while log is empty" << dendl;
  } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
    psdout(10) << "last_complete now " << info.last_complete
               << " log.complete_to " << pg_log.get_log().complete_to->version
               << dendl;
  } else {
    psdout(10) << "last_complete now " << info.last_complete
               << " log.complete_to at end" << dendl;
    //below is not true in the repair case.
    //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
    ceph_assert(info.last_complete == info.last_update);
  }

  if (!is_delete) {
    ceph_assert(missing_loc.needs_recovery(oid));

    missing_loc.add_location(oid, pg_whoami);
  }
}
void PeeringState::update_backfill_progress(
  const hobject_t &updated_backfill,
  const pg_stat_t &updated_stats,
  bool preserve_local_num_bytes,
  ObjectStore::Transaction &t) {
  info.set_last_backfill(updated_backfill);
  if (preserve_local_num_bytes) {
    psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
               << " local " << info.stats.stats.sum.num_bytes << dendl;
    int64_t bytes = info.stats.stats.sum.num_bytes;
    info.stats = updated_stats;
    info.stats.stats.sum.num_bytes = bytes;
  } else {
    psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
               << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
    info.stats = updated_stats;
  }

  dirty_info = true;
  write_if_dirty(t);
}
void PeeringState::adjust_purged_snaps(
  std::function<void(interval_set<snapid_t> &snaps)> f) {
  f(info.purged_snaps);
  dirty_info = true;
  dirty_big_info = true;
}
void PeeringState::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const eversion_t &version)
{
  pl->publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, version);
  missing_loc.add_location(soid, peer);
}

void PeeringState::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}
void PeeringState::force_object_missing(
  const set<pg_shard_t> &peers,
  const hobject_t &soid,
  eversion_t version)
{
  for (auto &&peer : peers) {
    if (peer != primary) {
      peer_missing[peer].add(soid, version, eversion_t(), false);
    } else {
      pg_log.missing_add(soid, version, eversion_t());
      pg_log.reset_complete_to(&info);
      pg_log.set_last_requested(0);
    }
  }

  missing_loc.rebuild(
    soid,
    pg_whoami,
    acting_recovery_backfill,
    info,
    pg_log.get_missing(),
    peer_missing,
    peer_info);
}
void PeeringState::pre_submit_op(
  const hobject_t &hoid,
  const vector<pg_log_entry_t>& logv,
  eversion_t at_version)
{
  if (at_version > eversion_t()) {
    for (auto &&i : get_acting_recovery_backfill()) {
      if (i == primary) continue;
      pg_info_t &pinfo = peer_info[i];
      // keep peer_info up to date
      if (pinfo.last_complete == pinfo.last_update)
        pinfo.last_complete = at_version;
      pinfo.last_update = at_version;
    }
  }

  bool requires_missing_loc = false;
  for (auto &&i : get_async_recovery_targets()) {
    if (i == primary || !get_peer_missing(i).is_missing(hoid))
      continue;
    requires_missing_loc = true;
    for (auto &&entry : logv) {
      peer_missing[i].add_next_event(entry);
    }
  }

  if (requires_missing_loc) {
    for (auto &&entry : logv) {
      psdout(30) << __func__ << " missing_loc before: "
                 << missing_loc.get_locations(entry.soid) << dendl;
      missing_loc.add_missing(entry.soid, entry.version,
                              eversion_t(), entry.is_delete());
      // clear out missing_loc
      missing_loc.clear_location(entry.soid);
      for (auto &i : get_actingset()) {
        if (!get_peer_missing(i).is_missing(entry.soid))
          missing_loc.add_location(entry.soid, i);
      }
      psdout(30) << __func__ << " missing_loc after: "
                 << missing_loc.get_locations(entry.soid) << dendl;
    }
  }
}
void PeeringState::recovery_committed_to(eversion_t version)
{
  psdout(10) << __func__ << " version " << version
             << " now ondisk" << dendl;
  last_complete_ondisk = version;

  if (last_complete_ondisk == info.last_update) {
    if (!is_primary()) {
      // Either we are a replica or backfill target.
      // we are fully up to date.  tell the primary!
      pl->send_cluster_message(
        get_primary().osd,
        TOPNSPC::make_message<MOSDPGTrim>(
          get_osdmap_epoch(),
          spg_t(info.pgid.pgid, primary.shard),
          last_complete_ondisk),
        get_osdmap_epoch());
    } else {
      calc_min_last_complete_ondisk();
    }
  }
}

void PeeringState::complete_write(eversion_t v, eversion_t lc)
{
  last_update_ondisk = v;
  last_complete_ondisk = lc;
  calc_min_last_complete_ondisk();
}
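
/* Log trimming below: pg_trim_to may never pass min_last_complete_ondisk
 * (every replica must already have the entries on disk) nor the log's
 * can_rollback_to.  As a worked example with the default trim limits:
 * approx_size()==3500 and target==3000 yields num_to_trim==500, and the
 * walk stops early at 'limit' if the 500th-oldest entry is newer than it.
 */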
void PeeringState::calc_trim_to()
{
  size_t target = pl->get_target_pg_log_entries();

  eversion_t limit = std::min(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
                                  cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    auto it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
void PeeringState::calc_trim_to_aggressive()
{
  size_t target = pl->get_target_pg_log_entries();

  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min({
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to(),
    last_update_ondisk});
  psdout(10) << __func__ << " limit = " << limit << dendl;

  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    psdout(10) << __func__ << " approx pg log length = "
               << pg_log.get_log().approx_size() << dendl;
    uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
                                              cct->_conf->osd_pg_log_trim_max);
    psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    auto it = pg_log.get_log().log.begin(); // oldest log entry
    auto rit = pg_log.get_log().log.rbegin();
    eversion_t by_n_to_keep; // start from tail
    eversion_t by_n_to_trim = eversion_t::max(); // start from head
    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
      i++;
      if (i > target && by_n_to_keep == eversion_t()) {
        by_n_to_keep = rit->version;
      }
      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
        by_n_to_trim = it->version;
      }
      if (by_n_to_keep != eversion_t() &&
          by_n_to_trim != eversion_t::max()) {
        break;
      }
    }

    if (by_n_to_keep == eversion_t()) {
      return;
    }

    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
    psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
    ceph_assert(pg_trim_to <= pg_log.get_head());
  }
}
void PeeringState::apply_op_stats(
  const hobject_t &soid,
  const object_stat_sum_t &delta_stats)
{
  info.stats.stats.add(delta_stats);
  info.stats.stats.floor(0);

  for (auto i = get_backfill_targets().begin();
       i != get_backfill_targets().end();
       ++i) {
    pg_shard_t bt = *i;
    pg_info_t& pinfo = peer_info[bt];
    if (soid <= pinfo.last_backfill)
      pinfo.stats.stats.add(delta_stats);
  }
}

void PeeringState::update_complete_backfill_object_stats(
  const hobject_t &hoid,
  const pg_stat_t &stats)
{
  for (auto &&bt : get_backfill_targets()) {
    pg_info_t& pinfo = peer_info[bt];
    //Add stats to all peers that were missing object
    if (hoid > pinfo.last_backfill)
      pinfo.stats.add(stats);
  }
}

void PeeringState::update_peer_last_backfill(
  pg_shard_t peer,
  const hobject_t &new_last_backfill)
{
  pg_info_t &pinfo = peer_info[peer];
  pinfo.last_backfill = new_last_backfill;
  if (new_last_backfill.is_max()) {
    /* pinfo.stats might be wrong if we did log-based recovery on the
     * backfilled portion in addition to continuing backfill.
     */
    pinfo.stats = info.stats;
  }
}

void PeeringState::set_revert_with_targets(
  const hobject_t &soid,
  const set<pg_shard_t> &good_peers)
{
  for (auto &&peer : good_peers) {
    missing_loc.add_location(soid, peer);
  }
}

void PeeringState::prepare_backfill_for_missing(
  const hobject_t &soid,
  const eversion_t &version,
  const vector<pg_shard_t> &targets) {
  for (auto &&peer : targets) {
    peer_missing[peer].add(soid, version, eversion_t(), false);
  }
}

void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
{
  info.hit_set = hset_history;
}
/*------------ Peering State Machine----------------*/

#undef dout_prefix
#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
                     << "state<" << get_state_name() << ">: ")
#undef psdout
#define psdout(x) ldout(context< PeeringMachine >().cct, x)

#define DECLARE_LOCALS                                           \
  PeeringState *ps = context< PeeringMachine >().state;          \
  std::ignore = ps;                                              \
  PeeringListener *pl = context< PeeringMachine >().pl;          \
  std::ignore = pl
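
// Note: DECLARE_LOCALS is expanded at the top of nearly every state
// constructor and react() handler below to provide 'ps' (the PeeringState)
// and 'pl' (its PeeringListener, i.e. the embedding PG/OSD glue).  The
// std::ignore assignments just silence unused-variable warnings in handlers
// that only need one of the two.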
/*------Crashed-------*/
PeeringState::Crashed::Crashed(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Crashed")
{
  context< PeeringMachine >().log_enter(state_name);
  ceph_abort_msg("we got a bad state machine event");
}
/*------Initial-------*/
PeeringState::Initial::Initial(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Initial")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
{
  DECLARE_LOCALS;
  ps->proc_replica_info(
    notify.from, notify.notify.info, notify.notify.epoch_sent);
  ps->set_last_peering_reset();
  return transit< Primary >();
}

boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  post_event(i);
  return transit< Stray >();
}

boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  post_event(i);
  return transit< Stray >();
}

void PeeringState::Initial::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_initial_latency, dur);
}
/*------Started-------*/
PeeringState::Started::Started(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::Started::react(const IntervalFlush&)
{
  psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
  context< PeeringMachine >().state->end_block_outgoing();
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Started advmap" << dendl;
  ps->check_full_transition(advmap.lastmap, advmap.osdmap);
  if (ps->should_restart_peering(
        advmap.up_primary,
        advmap.acting_primary,
        advmap.newup,
        advmap.newacting,
        advmap.lastmap,
        advmap.osdmap)) {
    psdout(10) << "should_restart_peering, transitioning to Reset"
               << dendl;
    post_event(advmap);
    return transit< Reset >();
  }
  ps->remove_down_peer_info(advmap.osdmap);
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "Started");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::Started::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_started_latency, dur);
  ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
}
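
// Reset is the rendezvous state for interval changes: any state that sees
// should_restart_peering() succeed transits here, and the next ActMap event
// notifies the new primary (if we are not it) and re-enters Started.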
/*--------Reset---------*/
PeeringState::Reset::Reset(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Reset")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->flushes_in_progress = 0;
  ps->set_last_peering_reset();
  ps->log_weirdness();
}

boost::statechart::result
PeeringState::Reset::react(const IntervalFlush&)
{
  psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
  context< PeeringMachine >().state->end_block_outgoing();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Reset advmap" << dendl;

  ps->check_full_transition(advmap.lastmap, advmap.osdmap);

  if (ps->should_restart_peering(
        advmap.up_primary,
        advmap.acting_primary,
        advmap.newup,
        advmap.newacting,
        advmap.lastmap,
        advmap.osdmap)) {
    psdout(10) << "should restart peering, calling start_peering_interval again"
               << dendl;
    ps->start_peering_interval(
      advmap.lastmap,
      advmap.newup, advmap.up_primary,
      advmap.newacting, advmap.acting_primary,
      context< PeeringMachine >().get_cur_transaction());
  }
  ps->remove_down_peer_info(advmap.osdmap);
  ps->check_past_interval_bounds();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(),
      ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }

  ps->update_heartbeat_peers();

  return transit< Started >();
}

boost::statechart::result PeeringState::Reset::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "Reset");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::Reset::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_reset_latency, dur);
}
/*-------Start---------*/
PeeringState::Start::Start(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Start")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  if (ps->is_primary()) {
    psdout(1) << "transitioning to Primary" << dendl;
    post_event(MakePrimary());
  } else { //is_stray
    psdout(1) << "transitioning to Stray" << dendl;
    post_event(MakeStray());
  }
}

void PeeringState::Start::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_start_latency, dur);
}
/*---------Primary--------*/
PeeringState::Primary::Primary(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ceph_assert(ps->want_acting.empty());

  // set CREATING bit until we have peered for the first time.
  if (ps->info.history.last_epoch_started == 0) {
    ps->state_set(PG_STATE_CREATING);
    // use the history timestamp, which ultimately comes from the
    // monitor in the create case.
    utime_t t = ps->info.history.last_scrub_stamp;
    ps->info.stats.last_fresh = t;
    ps->info.stats.last_active = t;
    ps->info.stats.last_change = t;
    ps->info.stats.last_peered = t;
    ps->info.stats.last_clean = t;
    ps->info.stats.last_unstale = t;
    ps->info.stats.last_undegraded = t;
    ps->info.stats.last_fullsized = t;
    ps->info.stats.last_scrub_stamp = t;
    ps->info.stats.last_deep_scrub_stamp = t;
    ps->info.stats.last_clean_scrub_stamp = t;
  }
}

boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
{
  DECLARE_LOCALS;
  psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
  ps->proc_replica_info(
    notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(const ActMap&)
{
  DECLARE_LOCALS;
  psdout(7) << "handle ActMap primary" << dendl;
  pl->publish_stats_to_osd();
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const SetForceRecovery&)
{
  DECLARE_LOCALS;
  ps->set_force_recovery(true);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const UnsetForceRecovery&)
{
  DECLARE_LOCALS;
  ps->set_force_recovery(false);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const RequestScrub& evt)
{
  DECLARE_LOCALS;
  if (ps->is_primary()) {
    pl->scrub_requested(evt.deep, evt.repair);
    psdout(10) << "marking for scrub" << dendl;
  }
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const SetForceBackfill&)
{
  DECLARE_LOCALS;
  ps->set_force_backfill(true);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const UnsetForceBackfill&)
{
  DECLARE_LOCALS;
  ps->set_force_backfill(false);
  return discard_event();
}

void PeeringState::Primary::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->want_acting.clear();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_primary_latency, dur);
  pl->clear_primary_state();
  ps->state_clear(PG_STATE_CREATING);
}
/*---------Peering--------*/
PeeringState::Peering::Peering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
    history_les_bound(false)
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ceph_assert(!ps->is_peered());
  ceph_assert(!ps->is_peering());
  ceph_assert(ps->is_primary());
  ps->state_set(PG_STATE_PEERING);
}

boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Peering advmap" << dendl;
  if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
    psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  ps->adjust_need_up_thru(advmap.osdmap);
  ps->check_prior_readable_down_osds(advmap.osdmap);

  return forward_event();
}

boost::statechart::result PeeringState::Peering::react(const QueryState& q)
{
  DECLARE_LOCALS;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("past_intervals");
  ps->past_intervals.dump(q.f);
  q.f->close_section();

  q.f->open_array_section("probing_osds");
  for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
    q.f->dump_stream("osd") << *p;
  q.f->close_section();

  if (prior_set.pg_down)
    q.f->dump_string("blocked", "peering is blocked due to down osds");

  q.f->open_array_section("down_osds_we_would_probe");
  for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
    q.f->dump_int("osd", *p);
  q.f->close_section();

  q.f->open_array_section("peering_blocked_by");
  for (auto p = prior_set.blocked_by.begin();
       p != prior_set.blocked_by.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_int("osd", p->first);
    q.f->dump_int("current_lost_at", p->second);
    q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
    q.f->close_section();
  }
  q.f->close_section();

  if (history_les_bound) {
    q.f->open_array_section("peering_blocked_by_detail");
    q.f->open_object_section("item");
    q.f->dump_string("detail","peering_blocked_by_history_les_bound");
    q.f->close_section();
    q.f->close_section();
  }

  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "Peering");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::Peering::exit()
{
  DECLARE_LOCALS;
  psdout(10) << "Leaving Peering" << dendl;
  context< PeeringMachine >().log_exit(state_name, enter_time);
  ps->state_clear(PG_STATE_PEERING);
  pl->clear_probe_targets();

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_peering_latency, dur);
}
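
/* Backfill reservation protocol, implemented by the states below: the
 * primary first takes a local reservation (WaitLocalBackfillReserved), then
 * reserves each backfill target in turn (WaitRemoteBackfillReserved), and
 * only enters Backfilling once every remote shard has granted.  A revocation
 * or TOOFULL rejection releases everything and drops back to NotBackfilling.
 */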
/*------Backfilling-------*/
PeeringState::Backfilling::Backfilling(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->backfill_reserved = true;
  pl->on_backfill_reserved();
  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  ps->state_set(PG_STATE_BACKFILLING);
  pl->publish_stats_to_osd();
}

void PeeringState::Backfilling::backfill_release_reservations()
{
  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();
  for (auto it = ps->backfill_targets.begin();
       it != ps->backfill_targets.end();
       ++it) {
    ceph_assert(*it != ps->pg_whoami);
    pl->send_cluster_message(
      it->osd,
      TOPNSPC::make_message<MBackfillReserve>(
        MBackfillReserve::RELEASE,
        spg_t(ps->info.pgid.pgid, it->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }
}

void PeeringState::Backfilling::cancel_backfill()
{
  DECLARE_LOCALS;
  backfill_release_reservations();
  pl->on_backfill_canceled();
}

boost::statechart::result
PeeringState::Backfilling::react(const Backfilled &c)
{
  backfill_release_reservations();
  return transit<Recovered>();
}

boost::statechart::result
PeeringState::Backfilling::react(const DeferBackfill &c)
{
  DECLARE_LOCALS;

  psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
  ps->state_set(PG_STATE_BACKFILL_WAIT);
  ps->state_clear(PG_STATE_BACKFILLING);
  cancel_backfill();

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    c.delay);
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const UnfoundBackfill &c)
{
  DECLARE_LOCALS;
  psdout(10) << "backfill has unfound, can't continue" << dendl;
  ps->state_set(PG_STATE_BACKFILL_UNFOUND);
  ps->state_clear(PG_STATE_BACKFILLING);
  cancel_backfill();
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
{
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILLING);
  cancel_backfill();

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    ps->cct->_conf->osd_backfill_retry_interval);

  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const RemoteReservationRevoked &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_WAIT);
  cancel_backfill();
  if (ps->needs_backfill()) {
    return transit<WaitLocalBackfillReserved>();
  } else {
    // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
    return discard_event();
  }
}

void PeeringState::Backfilling::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->backfill_reserved = false;
  ps->state_clear(PG_STATE_BACKFILLING);
  ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
}
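
// Remote reservations are requested one target at a time: each grant
// re-posts RemoteBackfillReserved, which advances backfill_osd_it and sends
// the next REQUEST, until the iterator hits end() and AllBackfillsReserved
// fires.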
/*--WaitRemoteBackfillReserved--*/

PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
    backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_WAIT);
  pl->publish_stats_to_osd();
  post_event(RemoteBackfillReserved());
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
{
  DECLARE_LOCALS;

  int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
  psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
  if (backfill_osd_it !=
      context< Active >().remote_shards_to_reserve_backfill.end()) {
    // The primary never backfills itself
    ceph_assert(*backfill_osd_it != ps->pg_whoami);
    pl->send_cluster_message(
      backfill_osd_it->osd,
      TOPNSPC::make_message<MBackfillReserve>(
        MBackfillReserve::REQUEST,
        spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
        ps->get_osdmap_epoch(),
        ps->get_backfill_priority(),
        num_bytes,
        ps->peer_bytes[*backfill_osd_it]),
      ps->get_osdmap_epoch());
    ++backfill_osd_it;
  } else {
    ps->peer_bytes.clear();
    post_event(AllBackfillsReserved());
  }
  return discard_event();
}

void PeeringState::WaitRemoteBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
}

void PeeringState::WaitRemoteBackfillReserved::retry()
{
  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();

  // Send CANCEL to all previously acquired reservations
  set<pg_shard_t>::const_iterator it, begin, end;
  begin = context< Active >().remote_shards_to_reserve_backfill.begin();
  end = context< Active >().remote_shards_to_reserve_backfill.end();
  ceph_assert(begin != end);
  for (it = begin; it != backfill_osd_it; ++it) {
    // The primary never backfills itself
    ceph_assert(*it != ps->pg_whoami);
    pl->send_cluster_message(
      it->osd,
      TOPNSPC::make_message<MBackfillReserve>(
        MBackfillReserve::RELEASE,
        spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }

  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  pl->publish_stats_to_osd();

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    ps->cct->_conf->osd_backfill_retry_interval);
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
  retry();
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
{
  retry();
  return transit<NotBackfilling>();
}
/*--WaitLocalBackfillReserved--*/
PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_WAIT);
  pl->request_local_background_io_reservation(
    ps->get_backfill_priority(),
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      LocalBackfillReserved()),
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeferBackfill(0.0)));
  pl->publish_stats_to_osd();
}

void PeeringState::WaitLocalBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
}
/*----NotBackfilling------*/
PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_REPAIR);
  pl->publish_stats_to_osd();
}

boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
{
  DECLARE_LOCALS;

  ps->query_unfound(q.f, "NotBackfilling");
  return discard_event();
}

boost::statechart::result
PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
{
  return discard_event();
}

boost::statechart::result
PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
{
  return discard_event();
}

void PeeringState::NotBackfilling::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
}
/*----NotRecovering------*/
PeeringState::NotRecovering::NotRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_REPAIR);
  pl->publish_stats_to_osd();
}

boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
{
  DECLARE_LOCALS;

  ps->query_unfound(q.f, "NotRecovering");
  return discard_event();
}

void PeeringState::NotRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
}
/*---RepNotRecovering----*/
PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
{
  DECLARE_LOCALS;
  ps->reject_reservation();
  post_event(RemoteReservationRejectedTooFull());
  return discard_event();
}

void PeeringState::RepNotRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
}
/*---RepWaitRecoveryReserved--*/
PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
{
  DECLARE_LOCALS;
  pl->send_cluster_message(
    ps->primary.osd,
    TOPNSPC::make_message<MRecoveryReserve>(
      MRecoveryReserve::GRANT,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return transit<RepRecovering>();
}

boost::statechart::result
PeeringState::RepWaitRecoveryReserved::react(
  const RemoteReservationCanceled &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

void PeeringState::RepWaitRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
}
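
// Replica-side mirror of the reservation dance: RepNotRecovering fields the
// primary's RequestBackfillPrio/RequestRecoveryPrio, queues a local
// reservation request (backed by an AsyncReserver on the OSD side), and
// parks in RepWait*Reserved until the grant, a preemption, or a cancel.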
/*-RepWaitBackfillReserved*/
PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
{
  DECLARE_LOCALS;

  if (!pl->try_reserve_recovery_space(
        evt.primary_num_bytes, evt.local_num_bytes)) {
    post_event(RejectTooFullRemoteReservation());
  } else {
    PGPeeringEventURef preempt;
    if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
      // older peers will interpret preemption as TOOFULL
      preempt = std::make_unique<PGPeeringEvent>(
        pl->get_osdmap_epoch(),
        pl->get_osdmap_epoch(),
        RemoteBackfillPreempted());
    }
    pl->request_remote_recovery_reservation(
      evt.priority,
      std::make_unique<PGPeeringEvent>(
        pl->get_osdmap_epoch(),
        pl->get_osdmap_epoch(),
        RemoteBackfillReserved()),
      std::move(preempt));
  }
  return transit<RepWaitBackfillReserved>();
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
{
  DECLARE_LOCALS;

  // fall back to a local reckoning of priority if the primary doesn't pass one
  // (pre-mimic compat)
  int prio = evt.priority ? evt.priority : ps->get_recovery_priority();

  PGPeeringEventURef preempt;
  if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
    // older peers can't handle this
    preempt = std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RemoteRecoveryPreempted());
  }

  pl->request_remote_recovery_reservation(
    prio,
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RemoteRecoveryReserved()),
    std::move(preempt));
  return transit<RepWaitRecoveryReserved>();
}

void PeeringState::RepWaitBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
{
  DECLARE_LOCALS;

  pl->send_cluster_message(
    ps->primary.osd,
    TOPNSPC::make_message<MBackfillReserve>(
      MBackfillReserve::GRANT,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return transit<RepRecovering>();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RejectTooFullRemoteReservation &evt)
{
  DECLARE_LOCALS;
  ps->reject_reservation();
  post_event(RemoteReservationRejectedTooFull());
  return discard_event();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RemoteReservationRejectedTooFull &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RemoteReservationCanceled &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}
/*---RepRecovering-------*/
PeeringState::RepRecovering::RepRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    TOPNSPC::make_message<MRecoveryReserve>(
      MRecoveryReserve::REVOKE,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

boost::statechart::result
PeeringState::RepRecovering::react(const BackfillTooFull &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    TOPNSPC::make_message<MBackfillReserve>(
      MBackfillReserve::REVOKE_TOOFULL,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

boost::statechart::result
PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    TOPNSPC::make_message<MBackfillReserve>(
      MBackfillReserve::REVOKE,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

void PeeringState::RepRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
}
/*------Activating--------*/
PeeringState::Activating::Activating(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
{
  context< PeeringMachine >().log_enter(state_name);
}

void PeeringState::Activating::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_activating_latency, dur);
}
PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  // Make sure all nodes that are part of the recovery aren't full
  if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
      ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
    post_event(RecoveryTooFull());
    return;
  }

  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  ps->state_set(PG_STATE_RECOVERY_WAIT);
  pl->request_local_background_io_reservation(
    ps->get_recovery_priority(),
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      LocalRecoveryReserved()),
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeferRecovery(0.0)));
  pl->publish_stats_to_osd();
}

boost::statechart::result
PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_RECOVERY_TOOFULL);
  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DoRecovery()),
    ps->cct->_conf->osd_recovery_retry_interval);
  return transit<NotRecovering>();
}

void PeeringState::WaitLocalRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
}
PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
    remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
{
  context< PeeringMachine >().log_enter(state_name);
  post_event(RemoteRecoveryReserved());
}

boost::statechart::result
PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
  DECLARE_LOCALS;

  if (remote_recovery_reservation_it !=
      context< Active >().remote_shards_to_reserve_recovery.end()) {
    ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
    pl->send_cluster_message(
      remote_recovery_reservation_it->osd,
      TOPNSPC::make_message<MRecoveryReserve>(
        MRecoveryReserve::REQUEST,
        spg_t(context< PeeringMachine >().spgid.pgid,
              remote_recovery_reservation_it->shard),
        ps->get_osdmap_epoch(),
        ps->get_recovery_priority()),
      ps->get_osdmap_epoch());
    ++remote_recovery_reservation_it;
  } else {
    post_event(AllRemotesReserved());
  }
  return discard_event();
}

void PeeringState::WaitRemoteRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
}
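
// Recovering holds both the local and all remote reservations; every exit
// path below (AllReplicasRecovered, RequestBackfill, DeferRecovery,
// UnfoundRecovery) must release them, passing cancel=true when recovery is
// being abandoned rather than completed.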
PeeringState::Recovering::Recovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_RECOVERY_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  ps->state_set(PG_STATE_RECOVERING);
  pl->on_recovery_reserved();
  ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
  pl->publish_stats_to_osd();
}

void PeeringState::Recovering::release_reservations(bool cancel)
{
  DECLARE_LOCALS;
  ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());

  // release remote reservations
  for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
       i != context< Active >().remote_shards_to_reserve_recovery.end();
       ++i) {
    if (*i == ps->pg_whoami) // skip myself
      continue;
    pl->send_cluster_message(
      i->osd,
      TOPNSPC::make_message<MRecoveryReserve>(
        MRecoveryReserve::RELEASE,
        spg_t(ps->info.pgid.pgid, i->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }
}

boost::statechart::result
PeeringState::Recovering::react(const AllReplicasRecovered &evt)
{
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_FORCED_RECOVERY);
  release_reservations();
  pl->cancel_local_background_io_reservation();
  return transit<Recovered>();
}

boost::statechart::result
PeeringState::Recovering::react(const RequestBackfill &evt)
{
  DECLARE_LOCALS;

  release_reservations();

  ps->state_clear(PG_STATE_FORCED_RECOVERY);
  pl->cancel_local_background_io_reservation();
  pl->publish_stats_to_osd();
  // transit any async_recovery_targets back into acting
  // so pg won't have to stay undersized for long
  // as backfill might take a long time to complete.
  if (!ps->async_recovery_targets.empty()) {
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
    ps->choose_acting(auth_log_shard, true, &history_les_bound);
  }
  return transit<WaitLocalBackfillReserved>();
}

boost::statechart::result
PeeringState::Recovering::react(const DeferRecovery &evt)
{
  DECLARE_LOCALS;
  if (!ps->state_test(PG_STATE_RECOVERING)) {
    // we may have finished recovery and have an AllReplicasRecovered
    // event queued to move us to the next state.
    psdout(10) << "got defer recovery but not recovering" << dendl;
    return discard_event();
  }
  psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
  ps->state_set(PG_STATE_RECOVERY_WAIT);
  pl->cancel_local_background_io_reservation();
  release_reservations(true);
  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DoRecovery()),
    evt.delay);
  return transit<NotRecovering>();
}

boost::statechart::result
PeeringState::Recovering::react(const UnfoundRecovery &evt)
{
  DECLARE_LOCALS;
  psdout(10) << "recovery has unfound, can't continue" << dendl;
  ps->state_set(PG_STATE_RECOVERY_UNFOUND);
  pl->cancel_local_background_io_reservation();
  release_reservations(true);
  return transit<NotRecovering>();
}

void PeeringState::Recovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  ps->state_clear(PG_STATE_RECOVERING);
  pl->get_peering_perf().tinc(rs_recovering_latency, dur);
}
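
/*--------Recovered---------*/
// Recovery finished with no missing objects left; re-evaluate the acting
// set (backfill completion may allow async recovery targets back in) and,
// once all replicas have activated, post GoClean to head for Clean.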
PeeringState::Recovered::Recovered(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
{
  pg_shard_t auth_log_shard;

  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ceph_assert(!ps->needs_recovery());

  // if we finished backfill, all acting are active; recheck if
  // DEGRADED | UNDERSIZED is appropriate.
  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
      ps->acting_recovery_backfill.size()) {
    ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
  }

  // adjust acting set? (e.g. because backfill completed...)
  bool history_les_bound = false;
  if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
                                                 true, &history_les_bound)) {
    ceph_assert(ps->want_acting.size());
  } else if (!ps->async_recovery_targets.empty()) {
    // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
    ps->choose_acting(auth_log_shard, true, &history_les_bound);
  }

  if (context< Active >().all_replicas_activated &&
      ps->async_recovery_targets.empty())
    post_event(GoClean());
}

void PeeringState::Recovered::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_recovered_latency, dur);
}
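
/*---------Clean------------*/
// Terminal happy state for an interval: last_complete has caught up with
// last_update, so the PG can be marked clean, with the result persisted via
// the on-commit callback registered in the constructor below.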
PeeringState::Clean::Clean(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  if (ps->info.last_complete != ps->info.last_update) {
    ceph_abort();
  }

  ps->try_mark_clean();

  context< PeeringMachine >().get_cur_transaction().register_on_commit(
    pl->on_clean());
}

void PeeringState::Clean::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_CLEAN);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_clean_latency, dur);
}
template <typename T>
set<pg_shard_t> unique_osd_shard_set(const pg_shard_t &skip, const T &in)
{
  set<int> osds_found;
  set<pg_shard_t> out;
  for (auto i = in.begin(); i != in.end(); ++i) {
    if (*i != skip && !osds_found.count(i->osd)) {
      osds_found.insert(i->osd);
      out.insert(*i);
    }
  }
  return out;
}
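
// Illustrative example: with pg_whoami = 0(0) and acting_recovery_backfill
// = { 0(0), 1(1), 1(2) }, unique_osd_shard_set(0(0), ...) yields { 1(1) },
// i.e. one representative shard per distinct remote OSD.  Active below uses
// this so that a reservation is requested from each remote OSD only once,
// even when that OSD hosts several shards of the PG.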
/*---------Active---------*/
PeeringState::Active::Active(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
    remote_shards_to_reserve_recovery(
      unique_osd_shard_set(
        context< PeeringMachine >().state->pg_whoami,
        context< PeeringMachine >().state->acting_recovery_backfill)),
    remote_shards_to_reserve_backfill(
      unique_osd_shard_set(
        context< PeeringMachine >().state->pg_whoami,
        context< PeeringMachine >().state->backfill_targets)),
    all_replicas_activated(false)
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ceph_assert(!ps->backfill_reserved);
  ceph_assert(ps->is_primary());
  psdout(10) << "In Active, about to call activate" << dendl;
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  ps->activate(context< PeeringMachine >().get_cur_transaction(),
               ps->get_osdmap_epoch(),
               context< PeeringMachine >().get_recovery_ctx());

  // everyone has to commit/ack before we are truly active
  ps->blocked_by.clear();
  for (auto p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p) {
    if (p->shard != ps->pg_whoami.shard) {
      ps->blocked_by.insert(p->shard);
    }
  }
  pl->publish_stats_to_osd();
  psdout(10) << "Activate Finished" << dendl;
}
boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;

  if (ps->should_restart_peering(
        advmap.up_primary,
        advmap.acting_primary,
        advmap.newup,
        advmap.newacting,
        advmap.lastmap,
        advmap.osdmap)) {
    psdout(10) << "Active advmap interval change, fast return" << dendl;
    return forward_event();
  }

  psdout(10) << "Active advmap" << dendl;
  bool need_publish = false;

  pl->on_active_advmap(advmap.osdmap);
  if (ps->dirty_big_info) {
    // share updated purged_snaps to mgr/mon so that we (a) stop reporting
    // purged snaps and (b) perhaps share more snaps that we have purged
    // but didn't fit in pg_stat_t.
    need_publish = true;
    ps->share_pg_info();
  }

  bool need_acting_change = false;
  for (size_t i = 0; i < ps->want_acting.size(); i++) {
    int osd = ps->want_acting[i];
    if (!advmap.osdmap->is_up(osd)) {
      pg_shard_t osd_with_shard(osd, shard_id_t(i));
      if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
        psdout(10) << "Active stray osd." << osd << " in want_acting is down"
                   << dendl;
        need_acting_change = true;
      }
    }
  }
  if (need_acting_change) {
    psdout(10) << "Active need acting change, call choose_acting again"
               << dendl;
    // possibly because we re-add some strays into the acting set and
    // some of them then go down in a subsequent map before we could see
    // the map changing the pg temp.
    // call choose_acting again to clear them out.
    // note that we leave restrict_to_up_acting to false in order to
    // not overkill any chosen stray that is still alive.
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    ps->remove_down_peer_info(advmap.osdmap);
    ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
  }

  /* Check for changes in pool size (if the acting set changed as a result,
   * this does not matter) */
  if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
      ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
    if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
        ps->actingset.size()) {
      ps->state_clear(PG_STATE_UNDERSIZED);
    } else {
      ps->state_set(PG_STATE_UNDERSIZED);
    }
    // degraded changes will be detected by call from publish_stats_to_osd()
    need_publish = true;
  }

  // if we haven't reported our PG stats in a long time, do so now.
  if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
    psdout(20) << "reporting stats to osd after "
               << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
               << " epochs" << dendl;
    need_publish = true;
  }

  if (need_publish)
    pl->publish_stats_to_osd();

  if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
    pl->recheck_readable();
  }

  return forward_event();
}
boost::statechart::result PeeringState::Active::react(const ActMap&)
{
  DECLARE_LOCALS;
  psdout(10) << "Active: handling ActMap" << dendl;
  ceph_assert(ps->is_primary());

  pl->on_active_actmap();

  if (ps->have_unfound()) {
    // object may have become unfound
    ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
  }

  uint64_t unfound = ps->missing_loc.num_unfound();
  if (unfound > 0 &&
      ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
    if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
      pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
                           << " objects unfound and apparently lost, would automatically "
                           << "mark these objects lost but this feature is not yet implemented "
                           << "(osd_auto_mark_unfound_lost)";
    } else
      pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
                           << unfound << " objects unfound and apparently lost";
  }

  return forward_event();
}
boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());
  if (ps->peer_info.count(notevt.from)) {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", already have info from that osd, ignoring"
               << dendl;
  } else if (ps->peer_purged.count(notevt.from)) {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", already purged that peer, ignoring"
               << dendl;
  } else {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", calling proc_replica_info and discover_all_missing"
               << dendl;
    ps->proc_replica_info(
      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
    if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
      ps->discover_all_missing(
        context<PeeringMachine>().get_recovery_ctx().msgs);
    }
    // check if it is a previous down acting member that's coming back.
    // if so, request pg_temp change to trigger a new interval transition
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
    ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
    if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
      psdout(10) << "Active: got notify from previous acting member "
                 << notevt.from << ", requesting pg_temp change"
                 << dendl;
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const MTrim& trim)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());

  // peer is informing us of their last_complete_ondisk
  ldout(ps->cct, 10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
  ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
                                       trim.trim_to);
  // trim log when the pg is recovered
  ps->calc_min_last_complete_ondisk();
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());

  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (infoevt.lease_ack) {
    ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
  }
  // don't update history (yet) if we are active and primary; the replica
  // may be telling us they have activated (and committed) but we can't
  // share that until _everyone_ does the same.
  if (ps->is_acting_recovery_backfill(infoevt.from) &&
      ps->peer_activated.count(infoevt.from) == 0) {
    psdout(10) << " peer osd." << infoevt.from
               << " activated and committed" << dendl;
    ps->peer_activated.insert(infoevt.from);
    ps->blocked_by.erase(infoevt.from.shard);
    pl->publish_stats_to_osd();
    if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
      all_activated_and_committed();
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "searching osd." << logevt.from
             << " log for unfound items" << dendl;
  ps->proc_replica_log(
    logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
  bool got_missing = ps->search_for_missing(
    ps->peer_info[logevt.from],
    ps->peer_missing[logevt.from],
    logevt.from,
    context< PeeringMachine >().get_recovery_ctx());
  // If there are missing AND we are "fully" active then start recovery now
  if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
    post_event(DoRecovery());
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const QueryState& q)
{
  DECLARE_LOCALS;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  {
    q.f->open_array_section("might_have_unfound");
    for (auto p = ps->might_have_unfound.begin();
         p != ps->might_have_unfound.end();
         ++p) {
      q.f->open_object_section("osd");
      q.f->dump_stream("osd") << *p;
      if (ps->peer_missing.count(*p)) {
        q.f->dump_string("status", "already probed");
      } else if (ps->peer_missing_requested.count(*p)) {
        q.f->dump_string("status", "querying");
      } else if (!ps->get_osdmap()->is_up(p->osd)) {
        q.f->dump_string("status", "osd is down");
      } else {
        q.f->dump_string("status", "not queried");
      }
      q.f->close_section();
    }
    q.f->close_section();
  }
  {
    q.f->open_object_section("recovery_progress");
    q.f->open_array_section("backfill_targets");
    for (auto p = ps->backfill_targets.begin();
         p != ps->backfill_targets.end(); ++p)
      q.f->dump_stream("replica") << *p;
    q.f->close_section();
    pl->dump_recovery_info(q.f);
    q.f->close_section();
  }

  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
{
  DECLARE_LOCALS;

  ps->query_unfound(q.f, "Active");
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(
  const ActivateCommitted &evt)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
  ps->peer_activated.insert(ps->pg_whoami);
  psdout(10) << "_activate_committed " << evt.epoch
             << " peer_activated now " << ps->peer_activated
             << " last_interval_started "
             << ps->info.history.last_interval_started
             << " last_epoch_started "
             << ps->info.history.last_epoch_started
             << " same_interval_since "
             << ps->info.history.same_interval_since
             << dendl;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
    all_activated_and_committed();
  return discard_event();
}
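
// Activation barrier: the primary inserts itself into peer_activated when
// its own activate transaction commits (ActivateCommitted above), and each
// replica is added as its MInfoRec arrives.  Only once peer_activated
// covers all of acting_recovery_backfill does all_activated_and_committed()
// run, which in turn posts AllReplicasActivated, handled below.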
boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
{
  DECLARE_LOCALS;

  pg_t pgid = context< PeeringMachine >().spgid.pgid;

  all_replicas_activated = true;

  ps->state_clear(PG_STATE_ACTIVATING);
  ps->state_clear(PG_STATE_CREATING);
  ps->state_clear(PG_STATE_PREMERGE);

  bool merge_target;
  if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
    ps->state_set(PG_STATE_PEERED);
    ps->state_set(PG_STATE_PREMERGE);

    if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
      if (merge_target) {
        pg_t src = pgid;
        src.set_ps(ps->pool.info.get_pg_num_pending());
        assert(src.get_parent() == pgid);
        pl->set_not_ready_to_merge_target(pgid, src);
      } else {
        pl->set_not_ready_to_merge_source(pgid);
      }
    }
  } else if (!ps->acting_set_writeable()) {
    ps->state_set(PG_STATE_PEERED);
  } else {
    ps->state_set(PG_STATE_ACTIVE);
  }

  auto mnow = pl->get_mnow();
  if (ps->prior_readable_until_ub > mnow) {
    psdout(10) << " waiting for prior_readable_until_ub "
               << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
    ps->state_set(PG_STATE_WAIT);
    pl->queue_check_readable(
      ps->last_peering_reset,
      ps->prior_readable_until_ub - mnow);
  } else {
    psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
               << ps->prior_readable_until_ub << dendl;
  }

  if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
    pl->send_pg_created(pgid);
  }

  ps->info.history.last_epoch_started = ps->info.last_epoch_started;
  ps->info.history.last_interval_started = ps->info.last_interval_started;
  ps->dirty_info = true;

  ps->share_pg_info();
  pl->publish_stats_to_osd();

  pl->on_activate_complete();

  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
{
  DECLARE_LOCALS;
  ps->proc_renew_lease();
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
{
  DECLARE_LOCALS;
  ps->proc_lease_ack(la.from, la.lease_ack);
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
{
  DECLARE_LOCALS;
  pl->recheck_readable();
  return discard_event();
}
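
// Read-lease plumbing while Active: the primary renews its lease on
// RenewLease, processes replica acks on MLeaseAck (or piggybacked on
// MInfoRec above), and re-evaluates readability on CheckReadable.  The
// replica side of the exchange (answering MOSDPGLease with
// MOSDPGLeaseAck) lives in ReplicaActive::react(const MLease&) below.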
/*
 * update info.history.last_epoch_started ONLY after we and all
 * replicas have activated AND committed the activate transaction
 * (i.e. the peering results are stable on disk).
 */
void PeeringState::Active::all_activated_and_committed()
{
  DECLARE_LOCALS;
  psdout(10) << "all_activated_and_committed" << dendl;
  ceph_assert(ps->is_primary());
  ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
  ceph_assert(!ps->acting_recovery_backfill.empty());
  ceph_assert(ps->blocked_by.empty());

  assert(HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS));
  // this is overkill when the activation is quick, but when it is slow it
  // is important, because the lease was renewed by the activate itself but we
  // don't know how long ago that was, and simply scheduling now may leave
  // a gap in lease coverage.  keep it simple and aggressively renew.
  ps->renew_lease(pl->get_mnow());
  ps->send_lease();
  ps->schedule_renew_lease();

  // Degraded?
  ps->update_calc_stats();
  if (ps->info.stats.stats.sum.num_objects_degraded) {
    ps->state_set(PG_STATE_DEGRADED);
  } else {
    ps->state_clear(PG_STATE_DEGRADED);
  }

  post_event(PeeringState::AllReplicasActivated());
}
void PeeringState::Active::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  pl->cancel_local_background_io_reservation();

  ps->blocked_by.clear();
  ps->backfill_reserved = false;
  ps->state_clear(PG_STATE_ACTIVATING);
  ps->state_clear(PG_STATE_DEGRADED);
  ps->state_clear(PG_STATE_UNDERSIZED);
  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_active_latency, dur);
  pl->on_active_exit();
}
/*------ReplicaActive-----*/
PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
}

boost::statechart::result PeeringState::ReplicaActive::react(
  const Activate& actevt) {
  DECLARE_LOCALS;
  psdout(10) << "In ReplicaActive, about to call activate" << dendl;
  ps->activate(
    context< PeeringMachine >().get_cur_transaction(),
    actevt.activation_epoch,
    context< PeeringMachine >().get_recovery_ctx());
  psdout(10) << "Activate Finished" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(
  const ActivateCommitted &evt)
{
  DECLARE_LOCALS;
  psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;

  auto &rctx = context<PeeringMachine>().get_recovery_ctx();
  auto epoch = ps->get_osdmap_epoch();
  pg_info_t i = ps->info;
  i.history.last_epoch_started = evt.activation_epoch;
  i.history.last_interval_started = i.history.same_interval_since;
  rctx.send_info(
    ps->get_primary().osd,
    spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
    epoch,
    epoch,
    i,
    {}, /* lease */
    ps->get_lease_ack());

  if (ps->acting_set_writeable()) {
    ps->state_set(PG_STATE_ACTIVE);
  } else {
    ps->state_set(PG_STATE_PEERED);
  }
  pl->on_activate_committed();

  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
{
  DECLARE_LOCALS;
  spg_t spgid = context< PeeringMachine >().spgid;
  epoch_t epoch = pl->get_osdmap_epoch();

  ps->proc_lease(l.lease);
  pl->send_cluster_message(
    ps->get_primary().osd,
    TOPNSPC::make_message<MOSDPGLeaseAck>(epoch,
                                          spg_t(spgid.pgid, ps->get_primary().shard),
                                          ps->get_lease_ack()),
    epoch);
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
                        infoevt.info);
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "received log from " << logevt.from << dendl;
  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
  ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
  if (logevt.msg->lease) {
    ps->proc_lease(*logevt.msg->lease);
  }

  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
{
  DECLARE_LOCALS;
  // primary is instructing us to trim
  ps->pg_log.trim(trim.trim_to, ps->info);
  ps->dirty_info = true;
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(), ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(
  const MQuery& query)
{
  DECLARE_LOCALS;
  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "ReplicaActive");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::ReplicaActive::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);

  ps->min_last_complete_ondisk = eversion_t();
}
/*-------Stray---*/
PeeringState::Stray::Stray(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Stray")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ceph_assert(!ps->is_peered());
  ceph_assert(!ps->is_peering());
  ceph_assert(!ps->is_primary());

  if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
    ldout(ps->cct, 10) << __func__ << " pool is deleted" << dendl;
    post_event(DeleteStart());
  } else {
    ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  }
}

boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  MOSDPGLog *msg = logevt.msg.get();
  psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;

  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
  if (msg->info.last_backfill == hobject_t()) {
    // restart backfill
    ps->info = msg->info;
    pl->on_info_history_change();
    ps->dirty_info = true;
    ps->dirty_big_info = true;  // maybe.

    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());

    ps->pg_log.reset_backfill();
  } else {
    ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
  }
  if (logevt.msg->lease) {
    ps->proc_lease(*logevt.msg->lease);
  }

  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);

  post_event(Activate(logevt.msg->info.last_epoch_started));
  return transit<ReplicaActive>();
}
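
// A stray reaches ReplicaActive by one of two paths: MLogRec (above) when
// the primary has log entries for it to merge, or a backfill restart (the
// last_backfill == hobject_t() branch), and MInfoRec (below) when its log
// is already adequate and at most needs divergent entries rewound.  Both
// post Activate carrying the primary's last_epoch_started.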
boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;

  if (ps->info.last_update > infoevt.info.last_update) {
    // rewind divergent log entries
    ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
    ps->rewind_divergent_log(t, infoevt.info.last_update);
    ps->info.stats = infoevt.info.stats;
    ps->info.hit_set = infoevt.info.hit_set;
  }

  if (infoevt.lease) {
    ps->proc_lease(*infoevt.lease);
  }

  ceph_assert(infoevt.info.last_update == ps->info.last_update);
  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);

  post_event(Activate(infoevt.info.last_epoch_started));
  return transit<ReplicaActive>();
}

boost::statechart::result PeeringState::Stray::react(const MQuery& query)
{
  DECLARE_LOCALS;
  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
  return discard_event();
}

boost::statechart::result PeeringState::Stray::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(), ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }
  return discard_event();
}

void PeeringState::Stray::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_stray_latency, dur);
}
/*--------ToDelete----------*/
PeeringState::ToDelete::ToDelete(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  pl->get_perf_logger().inc(l_osd_pg_removing);
}

void PeeringState::ToDelete::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  // note: on a successful removal, this path doesn't execute. see
  // do_delete_work().
  pl->get_perf_logger().dec(l_osd_pg_removing);

  pl->cancel_local_background_io_reservation();
}

/*----WaitDeleteReserved----*/
PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history,
               "Started/ToDelete/WaitDeleteReseved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  context< ToDelete >().priority = ps->get_delete_priority();

  pl->cancel_local_background_io_reservation();
  pl->request_local_background_io_reservation(
    context<ToDelete>().priority,
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeleteReserved()),
    std::make_unique<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeleteInterrupted()));
}

boost::statechart::result PeeringState::ToDelete::react(
  const ActMap& evt)
{
  DECLARE_LOCALS;
  if (ps->get_delete_priority() != priority) {
    psdout(10) << __func__ << " delete priority changed, resetting"
               << dendl;
    return transit<ToDelete>();
  }
  return discard_event();
}

void PeeringState::WaitDeleteReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
}
/*----Deleting-----*/
PeeringState::Deleting::Deleting(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->deleting = true;
  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();

  // clear log
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  ps->pg_log.roll_forward(rollbacker.get());

  // adjust info to backfill
  ps->info.set_last_backfill(hobject_t());
  ps->pg_log.reset_backfill();
  ps->dirty_info = true;

  pl->on_removal(t);
}

boost::statechart::result PeeringState::Deleting::react(
  const DeleteSome& evt)
{
  DECLARE_LOCALS;
  std::pair<ghobject_t, bool> p;
  p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
                         next);
  next = p.first;
  return p.second ? discard_event() : terminate();
}

void PeeringState::Deleting::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->deleting = false;
  pl->cancel_local_background_io_reservation();
}
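
// Deletion proceeds as a sequence of DeleteSome events: each react above
// asks the listener to delete one batch and gets back the next cursor plus
// a keep-going flag.  While the flag is set the event is discarded and the
// state persists (more DeleteSome events will follow); once it clears,
// terminate() shuts the state machine down for the now-removed PG.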
/*--------GetInfo---------*/
PeeringState::GetInfo::GetInfo(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->check_past_interval_bounds();
  ps->log_weirdness();
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  ceph_assert(ps->blocked_by.empty());

  prior_set = ps->build_prior();
  ps->prior_readable_down_osds = prior_set.down;

  if (ps->prior_readable_down_osds.empty()) {
    psdout(10) << " no prior_set down osds, will clear prior_readable_until_ub before activating"
               << dendl;
  }

  ps->reset_min_peer_features();
  get_infos();
  if (prior_set.pg_down) {
    post_event(IsDown());
  } else if (peer_info_requested.empty()) {
    post_event(GotInfo());
  }
}

void PeeringState::GetInfo::get_infos()
{
  DECLARE_LOCALS;
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  ps->blocked_by.clear();
  for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
    pg_shard_t peer = *it;
    if (peer == ps->pg_whoami) {
      continue;
    }
    if (ps->peer_info.count(peer)) {
      psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
      continue;
    }
    if (peer_info_requested.count(peer)) {
      psdout(10) << " already requested info from osd." << peer << dendl;
      ps->blocked_by.insert(peer.osd);
    } else if (!ps->get_osdmap()->is_up(peer.osd)) {
      psdout(10) << " not querying info from down osd." << peer << dendl;
    } else {
      psdout(10) << " querying info from osd." << peer << dendl;
      context< PeeringMachine >().send_query(
        peer.osd,
        pg_query_t(pg_query_t::INFO,
                   it->shard, ps->pg_whoami.shard,
                   ps->info.history,
                   ps->get_osdmap_epoch()));
      peer_info_requested.insert(peer);
      ps->blocked_by.insert(peer.osd);
    }
  }

  ps->check_prior_readable_down_osds(ps->get_osdmap());

  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
{
  DECLARE_LOCALS;

  auto p = peer_info_requested.find(infoevt.from);
  if (p != peer_info_requested.end()) {
    peer_info_requested.erase(p);
    ps->blocked_by.erase(infoevt.from.osd);
  }

  epoch_t old_start = ps->info.history.last_epoch_started;
  if (ps->proc_replica_info(
        infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
    // we got something new ...
    PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
    if (old_start < ps->info.history.last_epoch_started) {
      psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
      prior_set = ps->build_prior();
      ps->prior_readable_down_osds = prior_set.down;

      // filter out any osds that got dropped from the probe set from
      // peer_info_requested.  this is less expensive than restarting
      // peering (which would re-probe everyone).
      auto p = peer_info_requested.begin();
      while (p != peer_info_requested.end()) {
        if (prior_set.probe.count(*p) == 0) {
          psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
          peer_info_requested.erase(p++);
        } else {
          ++p;
        }
      }
      get_infos();
    }
    psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
               << hex << infoevt.features << dec << dendl;
    ps->apply_peer_features(infoevt.features);

    // are we done getting everything?
    if (peer_info_requested.empty() && !prior_set.pg_down) {
      psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
      psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
      psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
      post_event(GotInfo());
    }
  }
  return discard_event();
}

boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
{
  DECLARE_LOCALS;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("requested_info_from");
  for (auto p = peer_info_requested.begin();
       p != peer_info_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_info.count(*p)) {
      q.f->open_object_section("got_info");
      ps->peer_info[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "GetInfo");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::GetInfo::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
  ps->blocked_by.clear();
}
/*------GetLog------------*/
PeeringState::GetLog::GetLog(my_context ctx)
  : my_base(ctx),
    NamedState(
      context< PeeringMachine >().state_history,
      "Started/Primary/Peering/GetLog"),
    msg(0)
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ps->log_weirdness();

  // adjust acting?
  if (!ps->choose_acting(auth_log_shard, false,
                         &context< Peering >().history_les_bound)) {
    if (!ps->want_acting.empty()) {
      post_event(NeedActingChange());
    } else {
      post_event(IsIncomplete());
    }
    return;
  }

  // am i the best?
  if (auth_log_shard == ps->pg_whoami) {
    post_event(GotLog());
    return;
  }

  const pg_info_t& best = ps->peer_info[auth_log_shard];

  // am i broken?
  if (ps->info.last_update < best.log_tail) {
    psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
    post_event(IsIncomplete());
    return;
  }

  // how much log to request?
  eversion_t request_log_from = ps->info.last_update;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  for (auto p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p) {
    if (*p == ps->pg_whoami) continue;
    pg_info_t& ri = ps->peer_info[*p];
    if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
        ri.last_update < request_log_from)
      request_log_from = ri.last_update;
  }

  psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
  context<PeeringMachine>().send_query(
    auth_log_shard.osd,
    pg_query_t(
      pg_query_t::LOG,
      auth_log_shard.shard, ps->pg_whoami.shard,
      request_log_from, ps->info.history,
      ps->get_osdmap_epoch()));

  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(auth_log_shard.osd);
  pl->publish_stats_to_osd();
}
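
// Illustrative example of the request_log_from computation above: if our
// last_update is 120'9 but an acting peer sits at 80'2, below our own
// log_tail yet within the authoritative log (>= best.log_tail), we lower
// request_log_from from 120'9 to 80'2, so the single log fetched from the
// authoritative shard also covers the entries that peer will need examined
// later during GetMissing.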
boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
{
  // make sure our log source didn't go down.  we need to check
  // explicitly because it may not be part of the prior set, which
  // means the Peering state check won't catch it going down.
  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
    psdout(10) << "GetLog: auth_log_shard osd."
               << auth_log_shard.osd << " went down" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  // let the Peering state do its checks.
  return forward_event();
}

boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
{
  ceph_assert(!msg);
  if (logevt.from != auth_log_shard) {
    psdout(10) << "GetLog: discarding log from "
               << "non-auth_log_shard osd." << logevt.from << dendl;
    return discard_event();
  }
  psdout(10) << "GetLog: received master log from osd."
             << logevt.from << dendl;
  msg = logevt.msg;
  post_event(GotLog());
  return discard_event();
}

boost::statechart::result PeeringState::GetLog::react(const GotLog&)
{
  DECLARE_LOCALS;

  psdout(10) << "leaving GetLog" << dendl;
  if (msg) {
    psdout(10) << "processing master log" << dendl;
    ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
                        msg->info, std::move(msg->log), std::move(msg->missing),
                        auth_log_shard);
  }
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  return transit< GetMissing >();
}

boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_stream("auth_log_shard") << auth_log_shard;
  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "GetLog");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::GetLog::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getlog_latency, dur);
  ps->blocked_by.clear();
}
/*------WaitActingChange--------*/
PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  OSDMapRef osdmap = advmap.osdmap;

  psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
  for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
    if (!osdmap->is_up(*p)) {
      psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
      post_event(advmap);
      return transit< Reset >();
    }
  }
  return forward_event();
}
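
// While waiting for the monitor to publish the requested pg_temp/acting
// change, incoming log, info and notify messages are deliberately dropped
// (reacts below); peering restarts from scratch in the new interval anyway.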
boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
{
  psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
  return discard_event();
}
boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
{
  psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
{
  psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for pg acting set to change");
  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "WaitActingChange");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::WaitActingChange::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
}
/*------Down--------*/
PeeringState::Down::Down(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_PEERING);
  ps->state_set(PG_STATE_DOWN);

  auto &prior_set = context< Peering >().prior_set;
  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pl->publish_stats_to_osd();
}

void PeeringState::Down::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_DOWN);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_down_latency, dur);

  ps->blocked_by.clear();
}

boost::statechart::result PeeringState::Down::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment",
                   "not enough up instances of this PG to go active");
  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "Down");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
{
  DECLARE_LOCALS;

  ceph_assert(ps->is_primary());
  epoch_t old_start = ps->info.history.last_epoch_started;
  if (!ps->peer_info.count(infoevt.from) &&
      ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
    ps->update_history(infoevt.notify.info.history);
  }
  // if we got something new to make pg escape down state
  if (ps->info.history.last_epoch_started > old_start) {
    psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
    ps->state_clear(PG_STATE_DOWN);
    ps->state_set(PG_STATE_PEERING);
    return transit< GetInfo >();
  }

  return discard_event();
}
/*------Incomplete--------*/
PeeringState::Incomplete::Incomplete(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_PEERING);
  ps->state_set(PG_STATE_INCOMPLETE);

  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pl->publish_stats_to_osd();
}

boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
  DECLARE_LOCALS;
  int64_t poolnum = ps->info.pgid.pool();

  // Reset if min_size turns smaller than the previous value; the pg might
  // now be able to go active.
  if (!advmap.osdmap->have_pg_pool(poolnum) ||
      advmap.lastmap->get_pools().find(poolnum)->second.min_size >
      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
    post_event(advmap);
    return transit< Reset >();
  }

  return forward_event();
}

boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
  DECLARE_LOCALS;
  psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
  if (ps->proc_replica_info(
        notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
    // We got something new, try again!
    return transit< GetLog >();
  } else {
    return discard_event();
  }
}

boost::statechart::result PeeringState::Incomplete::react(
  const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "not enough complete instances of this PG");
  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "Incomplete");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::Incomplete::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_INCOMPLETE);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_incomplete_latency, dur);

  ps->blocked_by.clear();
}
/*------GetMissing--------*/
PeeringState::GetMissing::GetMissing(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->log_weirdness();
  ceph_assert(!ps->acting_recovery_backfill.empty());
  eversion_t since;
  for (auto i = ps->acting_recovery_backfill.begin();
       i != ps->acting_recovery_backfill.end();
       ++i) {
    if (*i == ps->get_primary()) continue;
    const pg_info_t& pi = ps->peer_info[*i];
    // reset this so to make sure the pg_missing_t is initialized and
    // has the correct semantics even if we don't need to get a
    // missing set from a shard. This way later additions due to
    // lost+unfound delete work properly.
    ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();

    if (pi.is_empty())
      continue;                                // no pg data, nothing divergent

    if (pi.last_update < ps->pg_log.get_tail()) {
      psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }
    if (pi.last_backfill == hobject_t()) {
      psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }

    if (pi.last_update == pi.last_complete &&  // peer has no missing
        pi.last_update == ps->info.last_update) {  // peer is up to date
      // replica has no missing and identical log as us.  no need to
      // pull anything.
      // FIXME: we can do better here.  if last_update==last_complete we
      //        can infer the rest!
      psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }

    // We pull the log from the peer's last_epoch_started to ensure we
    // get enough log to detect divergent updates.
    since.epoch = pi.last_epoch_started;
    ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
    if (pi.log_tail <= since) {
      psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
      context< PeeringMachine >().send_query(
        i->osd,
        pg_query_t(
          pg_query_t::LOG,
          i->shard, ps->pg_whoami.shard,
          since, ps->info.history,
          ps->get_osdmap_epoch()));
    } else {
      psdout(10) << " requesting fulllog+missing from osd." << *i
                 << " (want since " << since << " < log.tail "
                 << pi.log_tail << ")" << dendl;
      context< PeeringMachine >().send_query(
        i->osd,
        pg_query_t(
          pg_query_t::FULLLOG,
          i->shard, ps->pg_whoami.shard,
          ps->info.history, ps->get_osdmap_epoch()));
    }
    peer_missing_requested.insert(*i);
    ps->blocked_by.insert(i->osd);
  }

  if (peer_missing_requested.empty()) {
    if (ps->need_up_thru) {
      psdout(10) << " still need up_thru update before going active"
                 << dendl;
      post_event(NeedUpThru());
    } else {
      // all good!
      post_event(Activate(ps->get_osdmap_epoch()));
    }
  }
  pl->publish_stats_to_osd();
}
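
// Illustrative example of the since calculation above: for a peer whose
// last_epoch_started is 20, since becomes eversion_t(20, 0); if the peer's
// log_tail reaches back to or before that point, a log+missing query since
// 20'0 (pg_query_t::LOG) suffices, otherwise we must fall back to
// requesting its full log (pg_query_t::FULLLOG).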
boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;

  peer_missing_requested.erase(logevt.from);
  ps->proc_replica_log(logevt.msg->info,
                       logevt.msg->log,
                       std::move(logevt.msg->missing),
                       logevt.from);

  if (peer_missing_requested.empty()) {
    if (ps->need_up_thru) {
      psdout(10) << " still need up_thru update before going active"
                 << dendl;
      post_event(NeedUpThru());
    } else {
      psdout(10) << "Got last missing, don't need missing "
                 << "posting Activate" << dendl;
      post_event(Activate(ps->get_osdmap_epoch()));
    }
  }
  return discard_event();
}

boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
{
  DECLARE_LOCALS;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("peer_missing_requested");
  for (auto p = peer_missing_requested.begin();
       p != peer_missing_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_missing.count(*p)) {
      q.f->open_object_section("got_missing");
      ps->peer_missing[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}
boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "GetMissing");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::GetMissing::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
  ps->blocked_by.clear();
}
/*------WaitUpThru--------*/
PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
{
  DECLARE_LOCALS;
  if (!ps->need_up_thru) {
    post_event(Activate(ps->get_osdmap_epoch()));
  }
  return forward_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "Noting missing from osd." << logevt.from << dendl;
  ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
  ps->peer_info[logevt.from] = logevt.msg->info;
  return discard_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
  q.f->close_section();
  return forward_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
{
  q.f->dump_string("state", "WaitUpThru");
  q.f->dump_bool("available_might_have_unfound", false);
  return discard_event();
}

void PeeringState::WaitUpThru::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
}
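
// WaitUpThru exists because the monitor must record an up_thru for this
// OSD at least as new as the current interval before the PG activates;
// without it, a later interval could not prove whether this interval ever
// went active.  The ActMap react above re-checks need_up_thru and posts
// Activate once the OSDMap reflects the update.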
/*----PeeringState::PeeringMachine Methods-----*/
#undef dout_prefix
#define dout_prefix dpp->gen_prefix(*_dout)

void PeeringState::PeeringMachine::log_enter(const char *state_name)
{
  psdout(5) << "enter " << state_name << dendl;
  pl->log_state_enter(state_name);
}

void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
{
  utime_t dur = ceph_clock_now() - enter_time;
  psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
  pl->log_state_exit(state_name, enter_time, event_count, event_time);
  event_count = 0;
  event_time = utime_t();
}
ostream &operator<<(ostream &out, const PeeringState &ps) {
  out << "pg[" << ps.info
      << " " << pg_vector_string(ps.up);
  if (ps.acting != ps.up)
    out << "/" << pg_vector_string(ps.acting);
  if (ps.is_ec_pg())
    out << "p" << ps.get_primary();
  if (!ps.async_recovery_targets.empty())
    out << " async=[" << ps.async_recovery_targets << "]";
  if (!ps.backfill_targets.empty())
    out << " backfill=[" << ps.backfill_targets << "]";
  out << " r=" << ps.get_role();
  out << " lpr=" << ps.get_last_peering_reset();

  if (ps.deleting)
    out << " DELETING";

  if (!ps.past_intervals.empty()) {
    out << " pi=[" << ps.past_intervals.get_bounds()
        << ")/" << ps.past_intervals.size();
  }

  if (ps.is_peered()) {
    if (ps.last_update_ondisk != ps.info.last_update)
      out << " luod=" << ps.last_update_ondisk;
    if (ps.last_update_applied != ps.info.last_update)
      out << " lua=" << ps.last_update_applied;
  }

  if (ps.pg_log.get_tail() != ps.info.log_tail ||
      ps.pg_log.get_head() != ps.info.last_update)
    out << " (info mismatch, " << ps.pg_log.get_log() << ")";

  if (!ps.pg_log.get_log().empty()) {
    if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
      out << " (log bound mismatch, actual=["
          << ps.pg_log.get_log().log.begin()->version << ","
          << ps.pg_log.get_log().log.rbegin()->version << "]";
      out << ")";
    }
  }

  out << " crt=" << ps.pg_log.get_can_rollback_to();

  if (ps.last_complete_ondisk != ps.info.last_complete)
    out << " lcod " << ps.last_complete_ondisk;

  out << " mlcod " << ps.min_last_complete_ondisk;

  out << " " << pg_state_string(ps.get_state());
  if (ps.should_send_notify())
    out << " NOTIFY";

  if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
    out << " pruub " << ps.prior_readable_until_ub
        << "@" << ps.get_prior_readable_down_osds();
  }
  out << "]";
  return out;
}
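
// Abbreviations in the status string above: lpr = last_peering_reset,
// luod/lua = last_update_ondisk/applied, crt = can_rollback_to, lcod/mlcod
// = (min_)last_complete_ondisk, pruub = prior_readable_until_ub; pi=[...)
// gives the past_intervals bounds and count.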
std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
{
  std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
    async_by_num_missing;
  replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
  for (auto &p : get_acting_recovery_backfill()) {
    if (p == get_primary()) {
      continue;
    }
    auto pm = get_peer_missing().find(p);
    assert(pm != get_peer_missing().end());
    auto nm = pm->second.num_missing();
    if (nm != 0) {
      if (is_async_recovery_target(p)) {
        async_by_num_missing.push_back(make_pair(nm, p));
      } else {
        replicas_by_num_missing.push_back(make_pair(nm, p));
      }
    }
  }
  // sort by number of missing objects, in ascending order.
  auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
                 const std::pair<unsigned int, pg_shard_t> &rhs) {
    return lhs.first < rhs.first;
  };
  // acting goes first
  std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
  // then async_recovery_targets
  std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
  replicas_by_num_missing.insert(replicas_by_num_missing.end(),
    async_by_num_missing.begin(), async_by_num_missing.end());

  std::vector<pg_shard_t> ret;
  ret.reserve(replicas_by_num_missing.size());
  for (auto p : replicas_by_num_missing) {
    ret.push_back(p.second);
  }
  return ret;
}
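
// Illustrative example: acting replicas with {osd.1: 5 missing, osd.2: 2
// missing} and async recovery target {osd.3: 1 missing} yield the order
// [osd.2, osd.1, osd.3]: fewest missing first within each group, with all
// acting replicas ahead of async recovery targets; peers with nothing
// missing are skipped entirely.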