1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "PGPeeringEvent.h"
5 #include "common/ceph_releases.h"
6 #include "common/dout.h"
7 #include "PeeringState.h"
9 #include "messages/MOSDPGRemove.h"
10 #include "messages/MBackfillReserve.h"
11 #include "messages/MRecoveryReserve.h"
12 #include "messages/MOSDScrubReserve.h"
13 #include "messages/MOSDPGInfo.h"
14 #include "messages/MOSDPGInfo2.h"
15 #include "messages/MOSDPGTrim.h"
16 #include "messages/MOSDPGLog.h"
17 #include "messages/MOSDPGNotify.h"
18 #include "messages/MOSDPGNotify2.h"
19 #include "messages/MOSDPGQuery.h"
20 #include "messages/MOSDPGQuery2.h"
21 #include "messages/MOSDPGLease.h"
22 #include "messages/MOSDPGLeaseAck.h"
24 #define dout_context cct
25 #define dout_subsys ceph_subsys_osd
// Construct a buffered-message holder from a PeeringCtx, stealing (swapping
// out) the ctx's queued per-OSD messages so they can be flushed later.
// NOTE(review): garbled extraction — the parameter list (orig lines 28-29) and
// closing brace are missing here; restore from upstream before editing.
27 BufferedRecoveryMessages::BufferedRecoveryMessages(
30 : require_osd_release(r
) {
31 // steal messages from ctx
32 message_map
.swap(ctx
.message_map
);
// Buffer a pg_notify for osd `to`: MOSDPGNotify2 on octopus+ clusters,
// legacy MOSDPGNotify otherwise.
// NOTE(review): the `} else {` separating the two branches (orig line 40) and
// the closing braces are missing from this extraction — verify upstream.
35 void BufferedRecoveryMessages::send_notify(int to
, const pg_notify_t
&n
)
37 if (require_osd_release
>= ceph_release_t::octopus
) {
38 spg_t
pgid(n
.info
.pgid
.pgid
, n
.to
);
39 send_osd_message(to
, make_message
<MOSDPGNotify2
>(pgid
, n
));
41 send_osd_message(to
, make_message
<MOSDPGNotify
>(n
.epoch_sent
, vector
{n
}));
// Buffer a pg_query for osd `to`: MOSDPGQuery2 on octopus+, legacy
// MOSDPGQuery (keyed by spg_t list) otherwise.
// NOTE(review): parameter list and several statements (orig lines 46-51, 53,
// 55) are missing from this extraction — verify upstream before editing.
45 void BufferedRecoveryMessages::send_query(
50 if (require_osd_release
>= ceph_release_t::octopus
) {
52 make_message
<MOSDPGQuery2
>(to_spgid
, q
));
54 auto m
= make_message
<MOSDPGQuery
>(
56 MOSDPGQuery::pg_list_t
{{to_spgid
, q
}});
57 send_osd_message(to
, m
);
// Buffer a pg_info (with optional lease / lease-ack, octopus+ only) for osd
// `to`: MOSDPGInfo2 on octopus+, legacy MOSDPGInfo wrapping a pg_notify_t
// otherwise.
// NOTE(review): large gaps in this extraction (orig lines 62-65, 69, 71-72,
// 74-83, 85, 87-88, 90+) — the message construction arguments are mostly
// absent; restore from upstream before editing.
61 void BufferedRecoveryMessages::send_info(
66 const pg_info_t
&info
,
67 std::optional
<pg_lease_t
> lease
,
68 std::optional
<pg_lease_ack_t
> lease_ack
)
70 if (require_osd_release
>= ceph_release_t::octopus
) {
73 make_message
<MOSDPGInfo2
>(
84 make_message
<MOSDPGInfo
>(
86 vector
{pg_notify_t
{to_spgid
.shard
,
89 info
, PastIntervals
{}}})
// Refresh cached pool metadata (name, snap context, cached epoch) from a new
// OSDMap; returns early if the pool was deleted from the map.
// NOTE(review): extraction gaps (orig lines 95, 97, 99-100, 102, 106-108,
// 111) — the body of the epoch/snap-epoch condition and how `updated` is set
// are not visible here; verify upstream before editing.
94 void PGPool::update(CephContext
*cct
, OSDMapRef map
)
96 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
98 return; // pool has been deleted
101 name
= map
->get_pool_name(id
);
103 bool updated
= false;
104 if ((map
->get_epoch() != cached_epoch
+ 1) ||
105 (pi
->get_snap_epoch() == map
->get_epoch())) {
109 if (info
.is_pool_snaps_mode() && updated
) {
110 snapc
= pi
->get_snap_context();
112 cached_epoch
= map
->get_epoch();
115 /*-------------Peering State Helpers----------------*/
117 #define dout_prefix (dpp->gen_prefix(*_dout))
119 #define psdout(x) ldout(cct, x)
// PeeringState constructor: wires the state-machine, missing_loc tracker and
// listener (pl) together.
// NOTE(review): most of the parameter list and member-initializer list (orig
// lines 122, 124-126, 128, 130-136, 138-139) plus the constructor body are
// missing from this extraction — restore from upstream before editing.
121 PeeringState::PeeringState(
123 pg_shard_t pg_whoami
,
127 DoutPrefixProvider
*dpp
,
129 : state_history(*pl
),
137 pg_whoami(pg_whoami
),
140 missing_loc(spgid
, this, dpp
, cct
),
141 machine(this, cct
, spgid
, dpp
, pl
, &state_history
)
// Begin handling a peering event: bind rctx to new_ctx (routing buffered
// messages through messages_pending_flush when outgoing is blocked) and
// stamp the start time for event accounting in end_handle().
146 void PeeringState::start_handle(PeeringCtx
*new_ctx
) {
148 ceph_assert(!orig_ctx
);
151 if (messages_pending_flush
) {
152 rctx
.emplace(*messages_pending_flush
, *new_ctx
);
154 rctx
.emplace(*new_ctx
);
156 rctx
->start_time
= ceph_clock_now();
// Start buffering outgoing recovery messages: create messages_pending_flush
// (tagged with the ctx's require_osd_release) and point rctx at it so
// subsequent sends are queued instead of going out.
160 void PeeringState::begin_block_outgoing() {
161 ceph_assert(!messages_pending_flush
);
162 ceph_assert(orig_ctx
);
164 messages_pending_flush
= BufferedRecoveryMessages(
165 orig_ctx
->require_osd_release
);
166 rctx
.emplace(*messages_pending_flush
, *orig_ctx
);
// Drop any buffered (not-yet-flushed) outgoing messages without sending them.
169 void PeeringState::clear_blocked_outgoing() {
170 ceph_assert(orig_ctx
);
172 messages_pending_flush
= std::optional
<BufferedRecoveryMessages
>();
// Stop buffering: hand the accumulated messages back to the original ctx for
// delivery, rebind rctx to it, and reset the buffer.
175 void PeeringState::end_block_outgoing() {
176 ceph_assert(messages_pending_flush
);
177 ceph_assert(orig_ctx
);
180 orig_ctx
->accept_buffered_messages(*messages_pending_flush
);
181 rctx
.emplace(*orig_ctx
);
182 messages_pending_flush
= std::optional
<BufferedRecoveryMessages
>();
// Finish handling a peering event: accumulate elapsed time and event count
// into the state machine's stats (rctx teardown lines elided in extraction).
185 void PeeringState::end_handle() {
187 utime_t dur
= ceph_clock_now() - rctx
->start_time
;
188 machine
.event_time
+= dur
;
191 machine
.event_count
++;
// Drop recovery/log/missing requests that target OSDs which are no longer up
// in the given map; delegates source pruning to missing_loc and the listener.
196 void PeeringState::check_recovery_sources(const OSDMapRef
& osdmap
)
199 * check that any peers we are planning to (or currently) pulling
200 * objects from are dealt with.
202 missing_loc
.check_recovery_sources(osdmap
);
203 pl
->check_recovery_sources(osdmap
);
// Prune peer_log_requested entries whose OSD went down (post-increment erase
// keeps the iterator valid).
205 for (set
<pg_shard_t
>::iterator i
= peer_log_requested
.begin();
206 i
!= peer_log_requested
.end();
208 if (!osdmap
->is_up(i
->osd
)) {
209 psdout(10) << "peer_log_requested removing " << *i
<< dendl
;
210 peer_log_requested
.erase(i
++);
// Same pruning for peer_missing_requested.
216 for (set
<pg_shard_t
>::iterator i
= peer_missing_requested
.begin();
217 i
!= peer_missing_requested
.end();
219 if (!osdmap
->is_up(i
->osd
)) {
220 psdout(10) << "peer_missing_requested removing " << *i
<< dendl
;
221 peer_missing_requested
.erase(i
++);
// Merge a peer's pg_history_t into ours; if the merge advanced anything,
// refresh the prior-readable-until upper bound and, when last_epoch_clean
// covers the current interval, drop now-useless past_intervals.
228 void PeeringState::update_history(const pg_history_t
& new_history
)
230 auto mnow
= pl
->get_mnow();
231 info
.history
.refresh_prior_readable_until_ub(mnow
, prior_readable_until_ub
);
232 if (info
.history
.merge(new_history
)) {
233 psdout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
// past_intervals before last_epoch_clean are no longer needed.
235 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
236 psdout(20) << __func__
<< " clearing past_intervals" << dendl
;
237 past_intervals
.clear();
238 dirty_big_info
= true;
240 prior_readable_until_ub
= info
.history
.get_prior_readable_until_ub(mnow
);
241 if (prior_readable_until_ub
!= ceph::signedspan::zero()) {
243 << " prior_readable_until_ub " << prior_readable_until_ub
244 << " (mnow " << mnow
<< " + "
245 << info
.history
.prior_readable_until_ub
<< ")" << dendl
;
248 pl
->on_info_history_change();
// Tell stray OSDs (holding stale copies of this PG) to remove them: send
// MOSDPGRemove to up strays, forget down ones, record them in peer_purged,
// and clear per-peer request state.  No-ops during premerge or when the
// osd_debug_no_purge_strays debug option is set.
251 void PeeringState::purge_strays()
254 psdout(10) << "purge_strays " << stray_set
<< " but premerge, doing nothing"
258 if (cct
->_conf
.get_val
<bool>("osd_debug_no_purge_strays")) {
261 psdout(10) << "purge_strays " << stray_set
<< dendl
;
263 bool removed
= false;
264 for (set
<pg_shard_t
>::iterator p
= stray_set
.begin();
265 p
!= stray_set
.end();
267 ceph_assert(!is_acting_recovery_backfill(*p
));
268 if (get_osdmap()->is_up(p
->osd
)) {
269 psdout(10) << "sending PGRemove to osd." << *p
<< dendl
;
270 vector
<spg_t
> to_remove
;
271 to_remove
.push_back(spg_t(info
.pgid
.pgid
, p
->shard
));
// NOTE(review): MOSDPGRemove ctor args (orig lines 273-274) elided here.
272 MOSDPGRemove
*m
= new MOSDPGRemove(
275 pl
->send_cluster_message(p
->osd
, m
, get_osdmap_epoch());
277 psdout(10) << "not sending PGRemove to down osd." << *p
<< dendl
;
279 peer_missing
.erase(*p
);
281 missing_loc
.remove_stray_recovery_sources(*p
);
282 peer_purged
.insert(*p
);
286 // if we removed anyone, update peers (which include peer_info)
288 update_heartbeat_peers();
292 // clear _requested maps; we may have to peer() again if we discover
293 // (more) stray content
294 peer_log_requested
.clear();
295 peer_missing_requested
.clear();
// Primary-side handling of a replica's pg_info_t.  Discards duplicates and
// info from OSDs that weren't continuously up since send_epoch; otherwise
// records it in peer_info / might_have_unfound, merges history, and marks
// the sender stray if it's no longer in up/acting.  Return value (bool)
// indicates whether the info was new — the return statements themselves are
// elided in this extraction.
299 bool PeeringState::proc_replica_info(
300 pg_shard_t from
, const pg_info_t
&oinfo
, epoch_t send_epoch
)
302 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.find(from
);
303 if (p
!= peer_info
.end() && p
->second
.last_update
== oinfo
.last_update
) {
304 psdout(10) << " got dup osd." << from
<< " info "
305 << oinfo
<< ", identical to ours" << dendl
;
// Guard against stale info from an OSD that restarted since sending.
309 if (!get_osdmap()->has_been_up_since(from
.osd
, send_epoch
)) {
310 psdout(10) << " got info " << oinfo
<< " from down osd." << from
311 << " discarding" << dendl
;
315 psdout(10) << " got osd." << from
<< " " << oinfo
<< dendl
;
316 ceph_assert(is_primary());
317 peer_info
[from
] = oinfo
;
318 might_have_unfound
.insert(from
);
320 update_history(oinfo
.history
);
323 if (!is_up(from
) && !is_acting(from
)) {
324 psdout(10) << " osd." << from
<< " has stray content: " << oinfo
<< dendl
;
325 stray_set
.insert(from
);
331 // was this a new info? if so, update peers!
332 if (p
== peer_info
.end())
333 update_heartbeat_peers();
// Forget per-peer state (info, missing, pending requests, purged markers)
// for OSDs that are down in the given map, then refresh heartbeat peers and
// recheck recovery sources.
339 void PeeringState::remove_down_peer_info(const OSDMapRef
&osdmap
)
341 // Remove any downed osds from peer_info
342 bool removed
= false;
343 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
344 while (p
!= peer_info
.end()) {
345 if (!osdmap
->is_up(p
->first
.osd
)) {
346 psdout(10) << " dropping down osd." << p
->first
<< " info " << p
->second
<< dendl
;
347 peer_missing
.erase(p
->first
);
348 peer_log_requested
.erase(p
->first
);
349 peer_missing_requested
.erase(p
->first
);
350 peer_info
.erase(p
++);
356 // Remove any downed osds from peer_purged so we can re-purge if necessary
357 auto it
= peer_purged
.begin();
358 while (it
!= peer_purged
.end()) {
359 if (!osdmap
->is_up(it
->osd
)) {
360 psdout(10) << " dropping down osd." << *it
<< " from peer_purged" << dendl
;
361 peer_purged
.erase(it
++);
367 // if we removed anyone, update peers (which include peer_info)
369 update_heartbeat_peers();
371 check_recovery_sources(osdmap
);
// Rebuild the heartbeat peer set from acting + up (skipping CRUSH_ITEM_NONE
// holes) plus every OSD we hold peer_info for, and hand it to the listener.
374 void PeeringState::update_heartbeat_peers()
380 for (unsigned i
=0; i
<acting
.size(); i
++) {
381 if (acting
[i
] != CRUSH_ITEM_NONE
)
382 new_peers
.insert(acting
[i
]);
384 for (unsigned i
=0; i
<up
.size(); i
++) {
385 if (up
[i
] != CRUSH_ITEM_NONE
)
386 new_peers
.insert(up
[i
]);
388 for (map
<pg_shard_t
,pg_info_t
>::iterator p
= peer_info
.begin();
389 p
!= peer_info
.end();
391 new_peers
.insert(p
->first
.osd
);
393 pl
->update_heartbeat_peers(std::move(new_peers
));
// Persist PG info/log into transaction t when dirty flags are set, and record
// the osdmap epoch / info snapshot that was written.
// NOTE(review): extraction gaps (orig lines 397-404, 406, 410, 412+) — the
// actual prepare/write calls are missing here; verify upstream.
396 void PeeringState::write_if_dirty(ObjectStore::Transaction
& t
)
405 last_persisted_osdmap
< get_osdmap_epoch(),
407 if (dirty_info
|| dirty_big_info
) {
408 last_persisted_osdmap
= get_osdmap_epoch();
409 last_written_info
= info
;
411 dirty_big_info
= false;
// Advance this PG to a new osdmap: update the cached map ref and pool info,
// feed an AdvMap event into the state machine, notify on pool changes, and
// refresh readable_interval / last_require_osd_release.
415 void PeeringState::advance_map(
416 OSDMapRef osdmap
, OSDMapRef lastmap
,
417 vector
<int>& newup
, int up_primary
,
418 vector
<int>& newacting
, int acting_primary
,
// Maps must be consumed in order: lastmap is the one we currently hold.
421 ceph_assert(lastmap
== osdmap_ref
);
422 psdout(10) << "handle_advance_map "
423 << newup
<< "/" << newacting
424 << " -- " << up_primary
<< "/" << acting_primary
427 update_osdmap_ref(osdmap
);
428 pool
.update(cct
, osdmap
);
// NOTE(review): the AdvMap event construction (orig lines 429-430) is elided.
431 osdmap
, lastmap
, newup
, up_primary
,
432 newacting
, acting_primary
);
433 handle_event(evt
, &rctx
);
434 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
435 pl
->on_pool_change();
437 readable_interval
= pool
.get_readable_interval();
438 last_require_osd_release
= osdmap
->require_osd_release
;
// Activate the new osdmap: deliver an ActMap event, dirty the info if the
// persisted osdmap epoch is too stale (osd_pg_epoch_persisted_max_stale),
// write dirty state into rctx.transaction, and recheck blacklisted watchers
// when the map introduced new blacklist entries.
441 void PeeringState::activate_map(PeeringCtx
&rctx
)
443 psdout(10) << __func__
<< dendl
;
445 handle_event(evt
, &rctx
);
446 if (osdmap_ref
->get_epoch() - last_persisted_osdmap
>
447 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
448 psdout(20) << __func__
<< ": Dirtying info: last_persisted is "
449 << last_persisted_osdmap
450 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
453 psdout(20) << __func__
<< ": Not dirtying info: last_persisted is "
454 << last_persisted_osdmap
455 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
457 write_if_dirty(rctx
.transaction
);
459 if (get_osdmap()->check_new_blacklist_entries()) {
460 pl
->check_blacklisted_watchers();
// Record a peering reset at the current osdmap epoch; on an actual epoch
// change, discard buffered outgoing messages and, unless an async flush is
// already scheduled, begin blocking outgoing recovery messages until flushed.
464 void PeeringState::set_last_peering_reset()
466 psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl
;
467 if (last_peering_reset
!= get_osdmap_epoch()) {
468 last_peering_reset
= get_osdmap_epoch();
469 psdout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
470 clear_blocked_outgoing();
471 if (!pl
->try_flush_or_schedule_async()) {
472 psdout(10) << "Beginning to block outgoing recovery messages" << dendl
;
473 begin_block_outgoing();
475 psdout(10) << "Not blocking outgoing recovery messages" << dendl
;
// One in-flight flush finished; body of the "all flushes done" branch (orig
// lines 484+) is elided in this extraction.
480 void PeeringState::complete_flush()
482 flushes_in_progress
--;
483 if (flushes_in_progress
== 0) {
// Detect the pool transitioning into the FULL state between two maps and
// record last_epoch_marked_full in the PG history.  Returns early if the
// pool no longer exists in the new map.
488 void PeeringState::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
490 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
492 return; // pool deleted
494 bool changed
= false;
495 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
496 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
// FULL is newly set if the pool was absent or not-FULL in the previous map.
497 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
498 psdout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
503 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
// Decide whether the map change starts a new interval (via
// PastIntervals::is_new_interval) or this OSD came back up — either forces
// peering to restart.
// NOTE(review): heavy extraction gaps (orig lines 509, 513-515, 517-527,
// 530-531, 534+) — the is_new_interval argument list and return statements
// are missing; verify upstream before editing.
508 bool PeeringState::should_restart_peering(
510 int newactingprimary
,
511 const vector
<int>& newup
,
512 const vector
<int>& newacting
,
516 if (PastIntervals::is_new_interval(
528 psdout(20) << "new interval newup " << newup
529 << " newacting " << newacting
<< dendl
;
532 if (!lastmap
->is_up(pg_whoami
.osd
) && osdmap
->is_up(pg_whoami
.osd
)) {
533 psdout(10) << __func__
<< " osd transitioned from down -> up"
// Begin a new peering interval: reset peering state, swap in the new
// up/acting sets, update pg stats and history epochs (same_interval_since,
// same_up_since, same_primary_since, last_epoch_split), record the interval
// in past_intervals, clear transient PG_STATE_* flags, and react to role /
// primary changes.  Interface unchanged from upstream.
// NOTE(review): this extraction is missing many original lines throughout
// (e.g. 546, 548, 550, 553-554, 556-558, 563, 565, 567-571, 576, 581-582,
// 584, 587, 589, 591, 593-594, 596, 598, 601, 606, 608-610, 613-615,
// 617-618, 621, 628, 630-631, 637, 639-642, 644, 646, 650-652, 654,
// 663-665, 673-674, 676, 682, 684-686, 688, 691, 697-698, 700-701,
// 707-708, 710, 713, 716-719) — do not edit logic here; restore from
// upstream PeeringState.cc first.
540 /* Called before initializing peering during advance_map */
541 void PeeringState::start_peering_interval(
542 const OSDMapRef lastmap
,
543 const vector
<int>& newup
, int new_up_primary
,
544 const vector
<int>& newacting
, int new_acting_primary
,
545 ObjectStore::Transaction
&t
)
547 const OSDMapRef osdmap
= get_osdmap();
549 set_last_peering_reset();
551 vector
<int> oldacting
, oldup
;
552 int oldrole
= get_role();
555 pl
->clear_ready_to_merge();
// Snapshot the outgoing interval's primaries/roles before swapping sets.
559 pg_shard_t old_acting_primary
= get_primary();
560 pg_shard_t old_up_primary
= up_primary
;
561 bool was_old_primary
= is_primary();
562 bool was_old_nonprimary
= is_nonprimary();
564 acting
.swap(oldacting
);
566 init_primary_up_acting(
// Refresh the stats copies of up/acting/primaries when any of them changed.
572 if (info
.stats
.up
!= up
||
573 info
.stats
.acting
!= acting
||
574 info
.stats
.up_primary
!= new_up_primary
||
575 info
.stats
.acting_primary
!= new_acting_primary
) {
577 info
.stats
.up_primary
= new_up_primary
;
578 info
.stats
.acting
= acting
;
579 info
.stats
.acting_primary
= new_acting_primary
;
580 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
583 pl
->clear_publish_stats();
585 // This will now be remapped during a backfill in cases
586 // that it would not have been before.
588 state_set(PG_STATE_REMAPPED
);
590 state_clear(PG_STATE_REMAPPED
);
592 int role
= osdmap
->calc_pg_role(pg_whoami
, acting
);
595 // did acting, up, primary|acker change?
597 psdout(10) << " no lastmap" << dendl
;
599 dirty_big_info
= true;
600 info
.history
.same_interval_since
= osdmap
->get_epoch();
602 std::stringstream debug
;
603 ceph_assert(info
.history
.same_interval_since
!= 0);
// NOTE(review): most check_new_interval() arguments are elided here.
604 bool new_interval
= PastIntervals::check_new_interval(
605 old_acting_primary
.osd
,
607 oldacting
, newacting
,
611 info
.history
.same_interval_since
,
612 info
.history
.last_epoch_clean
,
616 missing_loc
.get_recoverable_predicate(),
619 psdout(10) << __func__
<< ": check_new_interval output: "
620 << debug
.str() << dendl
;
622 if (osdmap
->get_epoch() == pl
->oldest_stored_osdmap() &&
623 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
624 psdout(10) << " map gap, clearing past_intervals and faking" << dendl
;
625 // our information is incomplete and useless; someone else was clean
626 // after everything we know if osdmaps were trimmed.
627 past_intervals
.clear();
629 psdout(10) << " noting past " << past_intervals
<< dendl
;
632 dirty_big_info
= true;
633 info
.history
.same_interval_since
= osdmap
->get_epoch();
// Record a split epoch when the pg_num change between maps splits this pgid.
634 if (osdmap
->have_pg_pool(info
.pgid
.pgid
.pool()) &&
635 info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
636 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
638 info
.history
.last_epoch_split
= osdmap
->get_epoch();
643 if (old_up_primary
!= up_primary
||
645 info
.history
.same_up_since
= osdmap
->get_epoch();
647 // this comparison includes primary rank via pg_shard_t
648 if (old_acting_primary
!= get_primary()) {
649 info
.history
.same_primary_since
= osdmap
->get_epoch();
653 pl
->on_info_history_change();
655 psdout(1) << __func__
<< " up " << oldup
<< " -> " << up
656 << ", acting " << oldacting
<< " -> " << acting
657 << ", acting_primary " << old_acting_primary
<< " -> "
658 << new_acting_primary
659 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
660 << ", role " << oldrole
<< " -> " << role
661 << ", features acting " << acting_features
662 << " upacting " << upacting_features
// Entering a new interval: drop all transient state flags.
666 state_clear(PG_STATE_ACTIVE
);
667 state_clear(PG_STATE_PEERED
);
668 state_clear(PG_STATE_PREMERGE
);
669 state_clear(PG_STATE_DOWN
);
670 state_clear(PG_STATE_RECOVERY_WAIT
);
671 state_clear(PG_STATE_RECOVERY_TOOFULL
);
672 state_clear(PG_STATE_RECOVERING
);
675 acting_recovery_backfill
.clear();
677 // reset primary/replica state?
678 if (was_old_primary
|| is_primary()) {
679 pl
->clear_want_pg_temp();
680 } else if (was_old_nonprimary
|| is_nonprimary()) {
681 pl
->clear_want_pg_temp();
683 clear_primary_state();
687 ceph_assert(!deleting
);
689 // should we tell the primary we are here?
690 send_notify
= !is_primary();
692 if (role
!= oldrole
||
693 was_old_primary
!= is_primary()) {
694 // did primary change?
695 if (was_old_primary
!= is_primary()) {
696 state_clear(PG_STATE_CLEAN
);
699 pl
->on_role_change();
702 // did primary change?
703 if (get_primary() != old_acting_primary
) {
704 psdout(10) << oldacting
<< " -> " << acting
705 << ", acting primary "
706 << old_acting_primary
<< " -> " << get_primary()
709 // primary is the same.
711 // i am (still) primary. but my replica set changed.
712 state_clear(PG_STATE_CLEAN
);
714 psdout(10) << oldacting
<< " -> " << acting
715 << ", replicas changed" << dendl
;
// If CRUSH left acting empty but we lead the up set, ask to clear pg_temp.
720 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
721 psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
722 pl
->queue_want_pg_temp(acting
);
// Per-interval (re)initialization: intersect peer feature bits across
// acting and up, rebuild the missing set if deletes semantics changed,
// and reset lease bookkeeping (prior_readable_until_ub, per-peer
// acting_readable_until_ub) before notifying the listener.
726 void PeeringState::on_new_interval()
728 dout(20) << __func__
<< dendl
;
729 const OSDMapRef osdmap
= get_osdmap();
731 // initialize features
732 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
733 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
// AND together the feature bits of every real member of acting.
734 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
735 if (*p
== CRUSH_ITEM_NONE
)
737 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
738 acting_features
&= f
;
739 upacting_features
&= f
;
// upacting additionally intersects the up set's features.
741 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
742 if (*p
== CRUSH_ITEM_NONE
)
744 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
746 psdout(20) << __func__
<< " upacting_features 0x" << std::hex
747 << upacting_features
<< std::dec
748 << " from " << acting
<< "+" << up
<< dendl
;
750 psdout(20) << __func__
<< " checking missing set deletes flag. missing = "
751 << get_pg_log().get_missing() << dendl
;
753 if (!pg_log
.get_missing().may_include_deletes
&&
754 !perform_deletes_during_peering()) {
755 pl
->rebuild_missing_set_with_deletes(pg_log
);
758 pg_log
.get_missing().may_include_deletes
==
759 !perform_deletes_during_peering());
763 // update lease bounds for a new interval
764 auto mnow
= pl
->get_mnow();
765 prior_readable_until_ub
= std::max(prior_readable_until_ub
,
767 prior_readable_until_ub
= info
.history
.refresh_prior_readable_until_ub(
768 mnow
, prior_readable_until_ub
);
769 psdout(10) << __func__
<< " prior_readable_until_ub "
770 << prior_readable_until_ub
<< " (mnow " << mnow
<< " + "
771 << info
.history
.prior_readable_until_ub
<< ")" << dendl
;
772 prior_readable_down_osds
.clear(); // we populate this when we build the priorset
776 readable_until_ub_sent
=
777 readable_until_ub_from_primary
= ceph::signedspan::zero();
779 acting_readable_until_ub
.clear();
781 acting_readable_until_ub
.resize(acting
.size(), ceph::signedspan::zero());
784 pl
->on_new_interval();
// Rebuild up/acting shard sets and the up/acting primaries from raw OSD id
// vectors.  Replicated pools use NO_SHARD; erasure-coded pools derive the
// shard id from the position in the vector and locate the primaries by
// scanning for the matching OSD id.
// NOTE(review): set-insert lines (orig 797-799, 806-808) and some braces are
// elided in this extraction.
787 void PeeringState::init_primary_up_acting(
788 const vector
<int> &newup
,
789 const vector
<int> &newacting
,
791 int new_acting_primary
)
795 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
796 if (acting
[i
] != CRUSH_ITEM_NONE
)
800 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
804 for (uint8_t i
= 0; i
< up
.size(); ++i
) {
805 if (up
[i
] != CRUSH_ITEM_NONE
)
809 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
// Replicated pool: primaries carry NO_SHARD.
811 if (!pool
.info
.is_erasure()) {
813 up_primary
= pg_shard_t(new_up_primary
, shard_id_t::NO_SHARD
);
814 primary
= pg_shard_t(new_acting_primary
, shard_id_t::NO_SHARD
);
// Erasure-coded pool: find the primary's shard by position.
817 up_primary
= pg_shard_t();
818 primary
= pg_shard_t();
819 for (uint8_t i
= 0; i
< up
.size(); ++i
) {
820 if (up
[i
] == new_up_primary
) {
821 up_primary
= pg_shard_t(up
[i
], shard_id_t(i
));
825 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
826 if (acting
[i
] == new_acting_primary
) {
827 primary
= pg_shard_t(acting
[i
], shard_id_t(i
));
831 ceph_assert(up_primary
.osd
== new_up_primary
);
832 ceph_assert(primary
.osd
== new_acting_primary
);
// Populate hb_stamps: as primary, one stamp per other acting member; as
// non-primary, just the primary's stamp.
// NOTE(review): the surrounding is_primary() branch, index init and
// resize-to-empty paths (orig lines 837-838, 841, 844-845, 847-848, 851,
// 853-855) are elided in this extraction — verify upstream.
836 void PeeringState::init_hb_stamps()
839 // we care about all other osds in the acting set
840 hb_stamps
.resize(acting
.size() - 1);
842 for (auto p
: acting
) {
843 if (p
== CRUSH_ITEM_NONE
|| p
== get_primary().osd
) {
846 hb_stamps
[i
++] = pl
->get_hb_stamps(p
);
849 } else if (is_nonprimary()) {
850 // we care about just the primary
852 hb_stamps
[0] = pl
->get_hb_stamps(get_primary().osd
);
856 dout(10) << __func__
<< " now " << hb_stamps
<< dendl
;
// Reset per-interval recovery targets (async recovery and backfill sets).
860 void PeeringState::clear_recovery_state()
862 async_recovery_targets
.clear();
863 backfill_targets
.clear();
// Drop all primary-only bookkeeping (per-peer request/missing/activation
// maps, trim/complete pointers, unfound candidates), reset pg_log recovery
// pointers and recovery state, then let the listener clear its own state.
866 void PeeringState::clear_primary_state()
868 psdout(10) << "clear_primary_state" << dendl
;
870 // clear peering state
872 peer_log_requested
.clear();
873 peer_missing_requested
.clear();
876 peer_missing
.clear();
877 peer_last_complete_ondisk
.clear();
878 peer_activated
.clear();
879 min_last_complete_ondisk
= eversion_t();
880 pg_trim_to
= eversion_t();
881 might_have_unfound
.clear();
882 need_up_thru
= false;
884 pg_log
.reset_recovery_pointers();
886 clear_recovery_state();
888 last_update_ondisk
= eversion_t();
890 pl
->clear_primary_state();
// File-local helper computing the [start, end) epoch range that
// past_intervals must cover for a PG, derived from its history
// (last_epoch_clean / epoch_pool_created / same_interval_since).
// NOTE(review): the second std::max operand using oldest_map (orig line 900)
// is elided in this extraction.
893 /// return [start,end) bounds for required past_intervals
894 static pair
<epoch_t
, epoch_t
> get_required_past_interval_bounds(
895 const pg_info_t
&info
,
896 epoch_t oldest_map
) {
897 epoch_t start
= std::max(
898 info
.history
.last_epoch_clean
? info
.history
.last_epoch_clean
:
899 info
.history
.epoch_pool_created
,
901 epoch_t end
= std::max(
902 info
.history
.same_interval_since
,
903 info
.history
.epoch_pool_created
);
904 return make_pair(start
, end
);
// Sanity-check the stored past_intervals against the required bounds; logs
// to the cluster log and derr on mismatch, asserting/aborting on fatal
// inconsistencies (empty-when-required, start not covered, end mismatch).
908 void PeeringState::check_past_interval_bounds() const
910 auto oldest_epoch
= pl
->oldest_stored_osdmap();
911 auto rpib
= get_required_past_interval_bounds(
914 if (rpib
.first
>= rpib
.second
) {
915 // do not warn if the start bound is dictated by oldest_map; the
916 // past intervals are presumably appropriate given the pg info.
917 if (!past_intervals
.empty() &&
918 rpib
.first
> oldest_epoch
) {
919 pl
->get_clog_error() << info
.pgid
<< " required past_interval bounds are"
920 << " empty [" << rpib
<< ") but past_intervals is not: "
922 derr
<< info
.pgid
<< " required past_interval bounds are"
923 << " empty [" << rpib
<< ") but past_intervals is not: "
924 << past_intervals
<< dendl
;
// Non-empty required range: past_intervals must exist and cover it.
927 if (past_intervals
.empty()) {
928 pl
->get_clog_error() << info
.pgid
<< " required past_interval bounds are"
929 << " not empty [" << rpib
<< ") but past_intervals "
930 << past_intervals
<< " is empty";
931 derr
<< info
.pgid
<< " required past_interval bounds are"
932 << " not empty [" << rpib
<< ") but past_intervals "
933 << past_intervals
<< " is empty" << dendl
;
934 ceph_assert(!past_intervals
.empty());
937 auto apib
= past_intervals
.get_bounds();
938 if (apib
.first
> rpib
.first
) {
939 pl
->get_clog_error() << info
.pgid
<< " past_intervals [" << apib
940 << ") start interval does not contain the required"
941 << " bound [" << rpib
<< ") start";
942 derr
<< info
.pgid
<< " past_intervals [" << apib
943 << ") start interval does not contain the required"
944 << " bound [" << rpib
<< ") start" << dendl
;
945 ceph_abort_msg("past_interval start interval mismatch");
947 if (apib
.second
!= rpib
.second
) {
948 pl
->get_clog_error() << info
.pgid
<< " past_interal bound [" << apib
949 << ") end does not match required [" << rpib
951 derr
<< info
.pgid
<< " past_interal bound [" << apib
952 << ") end does not match required [" << rpib
954 ceph_abort_msg("past_interval end mismatch");
// Combine a base priority with the pool's configured recovery priority
// (clamped to [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] and shifted to
// a non-negative offset), then clamp the sum into
// [OSD_RECOVERY_PRIORITY_MIN, max].  The "priority > max" return (orig lines
// 979, 983+) is elided in this extraction.
959 int PeeringState::clamp_recovery_priority(int priority
, int pool_recovery_priority
, int max
)
961 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
962 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
964 ceph_assert(max
<= OSD_RECOVERY_PRIORITY_MAX
);
966 // User can't set this too high anymore, but might be a legacy value
967 if (pool_recovery_priority
> OSD_POOL_PRIORITY_MAX
)
968 pool_recovery_priority
= OSD_POOL_PRIORITY_MAX
;
969 if (pool_recovery_priority
< OSD_POOL_PRIORITY_MIN
)
970 pool_recovery_priority
= OSD_POOL_PRIORITY_MIN
;
971 // Shift range from min to max to 0 to max - min
972 pool_recovery_priority
+= (0 - OSD_POOL_PRIORITY_MIN
);
973 ceph_assert(pool_recovery_priority
>= 0 && pool_recovery_priority
<= (OSD_POOL_PRIORITY_MAX
- OSD_POOL_PRIORITY_MIN
));
975 priority
+= pool_recovery_priority
;
977 // Clamp to valid range
978 if (priority
> max
) {
980 } else if (priority
< OSD_RECOVERY_PRIORITY_MIN
) {
981 return OSD_RECOVERY_PRIORITY_MIN
;
// Compute this PG's recovery scheduling priority: forced-recovery wins,
// below-min_size ("inactive", blocks IO) ranks above the base, and the
// result is adjusted by the pool's RECOVERY_PRIORITY option and clamped.
987 unsigned PeeringState::get_recovery_priority()
989 // a higher value -> a higher priority
990 int ret
= OSD_RECOVERY_PRIORITY_BASE
;
993 if (state
& PG_STATE_FORCED_RECOVERY
) {
994 ret
= OSD_RECOVERY_PRIORITY_FORCED
;
996 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
997 if (is_degraded() && info
.stats
.avail_no_missing
.size() < pool
.info
.min_size
) {
998 base
= OSD_RECOVERY_INACTIVE_PRIORITY_BASE
;
999 // inactive: no. of replicas < min_size, highest priority since it blocks IO
1000 ret
= base
+ (pool
.info
.min_size
- info
.stats
.avail_no_missing
.size());
1003 int64_t pool_recovery_priority
= 0;
1004 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
1006 ret
= clamp_recovery_priority(ret
, pool_recovery_priority
, max_prio_map
[base
]);
1008 psdout(20) << __func__
<< " recovery priority is " << ret
<< dendl
;
1009 return static_cast<unsigned>(ret
);
// Compute this PG's backfill scheduling priority: forced-backfill wins, then
// tiers for inactive (actingset below min_size), undersized, and degraded;
// adjusted by the pool's RECOVERY_PRIORITY option and clamped.
1012 unsigned PeeringState::get_backfill_priority()
1014 // a higher value -> a higher priority
1015 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
1018 if (state
& PG_STATE_FORCED_BACKFILL
) {
1019 ret
= OSD_BACKFILL_PRIORITY_FORCED
;
1021 if (actingset
.size() < pool
.info
.min_size
) {
1022 base
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
;
1023 // inactive: no. of replicas < min_size, highest priority since it blocks IO
1024 ret
= base
+ (pool
.info
.min_size
- actingset
.size());
1026 } else if (is_undersized()) {
1027 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
1028 ceph_assert(pool
.info
.size
> actingset
.size());
1029 base
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
1030 ret
= base
+ (pool
.info
.size
- actingset
.size());
1032 } else if (is_degraded()) {
1033 // degraded: baseline degraded
1034 base
= ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
1037 // Adjust with pool's recovery priority
1038 int64_t pool_recovery_priority
= 0;
1039 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
1041 ret
= clamp_recovery_priority(ret
, pool_recovery_priority
, max_prio_map
[base
]);
1044 psdout(20) << __func__
<< " backfill priority is " << ret
<< dendl
;
1045 return static_cast<unsigned>(ret
);
// PG-deletion priority based on this OSD's fullness state in the osdmap:
// backfillfull (or fuller — second flag elided in extraction) -> FULL,
// nearfull -> FULLISH, otherwise NORMAL.
1048 unsigned PeeringState::get_delete_priority()
1050 auto state
= get_osdmap()->get_state(pg_whoami
.osd
);
1051 if (state
& (CEPH_OSD_BACKFILLFULL
|
1053 return OSD_DELETE_PRIORITY_FULL
;
1054 } else if (state
& CEPH_OSD_NEARFULL
) {
1055 return OSD_DELETE_PRIORITY_FULLISH
;
1057 return OSD_DELETE_PRIORITY_NORMAL
;
// Set or clear PG_STATE_FORCED_RECOVERY (only set while degraded/recovering,
// only cleared when currently set), publish stats, and push the updated
// recovery priority to the listener.  Returns whether the flag changed
// (return statements elided in this extraction).
1061 bool PeeringState::set_force_recovery(bool b
)
1065 if (!(state
& PG_STATE_FORCED_RECOVERY
) &&
1066 (state
& (PG_STATE_DEGRADED
|
1067 PG_STATE_RECOVERY_WAIT
|
1068 PG_STATE_RECOVERING
))) {
1069 psdout(20) << __func__
<< " set" << dendl
;
1070 state_set(PG_STATE_FORCED_RECOVERY
);
1071 pl
->publish_stats_to_osd();
1074 } else if (state
& PG_STATE_FORCED_RECOVERY
) {
1075 psdout(20) << __func__
<< " clear" << dendl
;
1076 state_clear(PG_STATE_FORCED_RECOVERY
);
1077 pl
->publish_stats_to_osd();
1081 psdout(20) << __func__
<< " state " << get_current_state()
1083 pl
->update_local_background_io_priority(get_recovery_priority());
// Backfill analogue of set_force_recovery: toggle PG_STATE_FORCED_BACKFILL,
// publish stats, and push the updated backfill priority to the listener.
// Returns whether the flag changed (return statements elided here).
1088 bool PeeringState::set_force_backfill(bool b
)
1092 if (!(state
& PG_STATE_FORCED_BACKFILL
) &&
1093 (state
& (PG_STATE_DEGRADED
|
1094 PG_STATE_BACKFILL_WAIT
|
1095 PG_STATE_BACKFILLING
))) {
1096 psdout(10) << __func__
<< " set" << dendl
;
1097 state_set(PG_STATE_FORCED_BACKFILL
);
1098 pl
->publish_stats_to_osd();
1101 } else if (state
& PG_STATE_FORCED_BACKFILL
) {
1102 psdout(10) << __func__
<< " clear" << dendl
;
1103 state_clear(PG_STATE_FORCED_BACKFILL
);
1104 pl
->publish_stats_to_osd();
1108 psdout(20) << __func__
<< " state " << get_current_state()
1110 pl
->update_local_background_io_priority(get_backfill_priority());
// Ask the listener to schedule the next lease renewal at half the readable
// interval (first argument to the callback elided in this extraction).
1115 void PeeringState::schedule_renew_lease()
1117 pl
->schedule_renew_lease(
1119 readable_interval
/ 2);
// Send an MOSDPGLease to every acting-set member except ourselves.
// NOTE(review): lease payload / destination arguments and closing braces
// (orig lines 1123, 1127-1128, 1130, 1133-1137) are elided in this
// extraction — verify upstream before editing.
1122 void PeeringState::send_lease()
1124 epoch_t epoch
= pl
->get_osdmap_epoch();
1125 for (auto peer
: actingset
) {
1126 if (peer
== pg_whoami
) {
1129 pl
->send_cluster_message(
1131 new MOSDPGLease(epoch
,
1132 spg_t(spgid
.pgid
, peer
.shard
),
// Replica-side lease processing (octopus+ only, non-primary only): advance
// readable_until using the primary's lease translated through the heartbeat
// clock-delta bounds (upper bound delta for readable_until, lower bound —
// or mnow + interval as a fallback — for readable_until_ub).
1138 void PeeringState::proc_lease(const pg_lease_t
& l
)
1140 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1141 psdout(20) << __func__
<< " no-op, upacting_features 0x" << std::hex
1142 << upacting_features
<< std::dec
1143 << " does not include SERVER_OCTOPUS" << dendl
;
1146 if (!is_nonprimary()) {
1147 psdout(20) << __func__
<< " no-op, !nonprimary" << dendl
;
1150 psdout(10) << __func__
<< " " << l
<< dendl
;
1151 if (l
.readable_until_ub
> readable_until_ub_from_primary
) {
1152 readable_until_ub_from_primary
= l
.readable_until_ub
;
// Conservative lower bound on when we stop being readable.
1155 ceph::signedspan ru
= ceph::signedspan::zero();
1156 if (l
.readable_until
!= ceph::signedspan::zero() &&
1157 hb_stamps
[0]->peer_clock_delta_ub
) {
1158 ru
= l
.readable_until
- *hb_stamps
[0]->peer_clock_delta_ub
;
1159 psdout(20) << " peer_clock_delta_ub " << *hb_stamps
[0]->peer_clock_delta_ub
1160 << " -> ru " << ru
<< dendl
;
1162 if (ru
> readable_until
) {
1163 readable_until
= ru
;
1164 psdout(20) << __func__
<< " readable_until now " << readable_until
<< dendl
;
1165 // NOTE: if we ever decide to block/queue ops on the replica,
1166 // we'll need to wake them up here.
// Upper bound: prefer the clock-delta lower bound; otherwise fall back to
// local mnow plus the lease interval.
1169 ceph::signedspan ruub
;
1170 if (hb_stamps
[0]->peer_clock_delta_lb
) {
1171 ruub
= l
.readable_until_ub
- *hb_stamps
[0]->peer_clock_delta_lb
;
1172 psdout(20) << " peer_clock_delta_lb " << *hb_stamps
[0]->peer_clock_delta_lb
1173 << " -> ruub " << ruub
<< dendl
;
1175 ruub
= pl
->get_mnow() + l
.interval
;
1176 psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub
<< dendl
;
1178 if (ruub
> readable_until_ub
) {
1179 readable_until_ub
= ruub
;
1180 psdout(20) << __func__
<< " readable_until_ub now " << readable_until_ub
// Primary-side lease-ack processing (octopus+ only): record the acked
// readable_until_ub for the acking acting-set member, and if it previously
// bounded our readable_until, recalculate and recheck readability.
1185 void PeeringState::proc_lease_ack(int from
, const pg_lease_ack_t
& a
)
1187 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1190 auto now
= pl
->get_mnow();
1191 bool was_min
= false;
1192 for (unsigned i
= 0; i
< acting
.size(); ++i
) {
1193 if (from
== acting
[i
]) {
1194 // the lease_ack value is based on the primary's clock
1195 if (a
.readable_until_ub
> acting_readable_until_ub
[i
]) {
1196 if (acting_readable_until_ub
[i
] == readable_until
) {
1199 acting_readable_until_ub
[i
] = a
.readable_until_ub
;
1205 auto old_ru
= readable_until
;
1206 recalc_readable_until();
1208 pl
->recheck_readable();
// Renew our lease at the current monotonic time and schedule the next
// renewal (octopus+ only).
1213 void PeeringState::proc_renew_lease()
1215 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1218 renew_lease(pl
->get_mnow());
1220 schedule_renew_lease();
// Primary-only: readable_until[_ub] is the minimum of readable_until_ub_sent
// and every peer's acked acting_readable_until_ub (skipping ourselves and
// CRUSH_ITEM_NONE holes).
1223 void PeeringState::recalc_readable_until()
1225 assert(is_primary());
1226 ceph::signedspan min
= readable_until_ub_sent
;
1227 for (unsigned i
= 0; i
< acting
.size(); ++i
) {
1228 if (acting
[i
] == pg_whoami
.osd
|| acting
[i
] == CRUSH_ITEM_NONE
) {
1231 dout(20) << __func__
<< " peer osd." << acting
[i
]
1232 << " ruub " << acting_readable_until_ub
[i
] << dendl
;
1233 if (acting_readable_until_ub
[i
] < min
) {
1234 min
= acting_readable_until_ub
[i
];
1237 readable_until
= min
;
1238 readable_until_ub
= min
;
1239 dout(20) << __func__
<< " readable_until[_ub] " << readable_until
1240 << " (sent " << readable_until_ub_sent
<< ")" << dendl
;
1243 bool PeeringState::check_prior_readable_down_osds(const OSDMapRef
& map
)
1245 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1248 bool changed
= false;
1249 auto p
= prior_readable_down_osds
.begin();
1250 while (p
!= prior_readable_down_osds
.end()) {
1251 if (map
->is_dead(*p
)) {
1252 dout(10) << __func__
<< " prior_readable_down_osds osd." << *p
1253 << " is dead as of epoch " << map
->get_epoch()
1255 p
= prior_readable_down_osds
.erase(p
);
1261 if (changed
&& prior_readable_down_osds
.empty()) {
1262 psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl
;
1263 clear_prior_readable_until_ub();
1269 bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap
)
1271 epoch_t up_thru
= osdmap
->get_up_thru(pg_whoami
.osd
);
1273 up_thru
>= info
.history
.same_interval_since
) {
1274 psdout(10) << "adjust_need_up_thru now "
1275 << up_thru
<< ", need_up_thru now false" << dendl
;
1276 need_up_thru
= false;
1282 PastIntervals::PriorSet
PeeringState::build_prior()
1286 for (map
<pg_shard_t
,pg_info_t
>::iterator it
= peer_info
.begin();
1287 it
!= peer_info
.end();
1289 ceph_assert(info
.history
.last_epoch_started
>=
1290 it
->second
.history
.last_epoch_started
);
1294 const OSDMap
&osdmap
= *get_osdmap();
1295 PastIntervals::PriorSet prior
= past_intervals
.get_prior_set(
1296 pool
.info
.is_erasure(),
1297 info
.history
.last_epoch_started
,
1298 &missing_loc
.get_recoverable_predicate(),
1299 [&](epoch_t start
, int osd
, epoch_t
*lost_at
) {
1300 const osd_info_t
*pinfo
= 0;
1301 if (osdmap
.exists(osd
)) {
1302 pinfo
= &osdmap
.get_info(osd
);
1304 *lost_at
= pinfo
->lost_at
;
1307 if (osdmap
.is_up(osd
)) {
1308 return PastIntervals::UP
;
1309 } else if (!pinfo
) {
1310 return PastIntervals::DNE
;
1311 } else if (pinfo
->lost_at
> start
) {
1312 return PastIntervals::LOST
;
1314 return PastIntervals::DOWN
;
1321 if (prior
.pg_down
) {
1322 state_set(PG_STATE_DOWN
);
1325 if (get_osdmap()->get_up_thru(pg_whoami
.osd
) <
1326 info
.history
.same_interval_since
) {
1327 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami
.osd
)
1328 << " < same_since " << info
.history
.same_interval_since
1329 << ", must notify monitor" << dendl
;
1330 need_up_thru
= true;
1332 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami
.osd
)
1333 << " >= same_since " << info
.history
.same_interval_since
1334 << ", all is well" << dendl
;
1335 need_up_thru
= false;
1337 pl
->set_probe_targets(prior
.probe
);
1341 bool PeeringState::needs_recovery() const
1343 ceph_assert(is_primary());
1345 auto &missing
= pg_log
.get_missing();
1347 if (missing
.num_missing()) {
1348 psdout(10) << __func__
<< " primary has " << missing
.num_missing()
1349 << " missing" << dendl
;
1353 ceph_assert(!acting_recovery_backfill
.empty());
1354 set
<pg_shard_t
>::const_iterator end
= acting_recovery_backfill
.end();
1355 set
<pg_shard_t
>::const_iterator a
= acting_recovery_backfill
.begin();
1356 for (; a
!= end
; ++a
) {
1357 if (*a
== get_primary()) continue;
1358 pg_shard_t peer
= *a
;
1359 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
1360 if (pm
== peer_missing
.end()) {
1361 psdout(10) << __func__
<< " osd." << peer
<< " doesn't have missing set"
1365 if (pm
->second
.num_missing()) {
1366 psdout(10) << __func__
<< " osd." << peer
<< " has "
1367 << pm
->second
.num_missing() << " missing" << dendl
;
1372 psdout(10) << __func__
<< " is recovered" << dendl
;
1376 bool PeeringState::needs_backfill() const
1378 ceph_assert(is_primary());
1380 // We can assume that only possible osds that need backfill
1381 // are on the backfill_targets vector nodes.
1382 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
1383 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
1384 for (; a
!= end
; ++a
) {
1385 pg_shard_t peer
= *a
;
1386 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
1387 if (!pi
->second
.last_backfill
.is_max()) {
1388 psdout(10) << __func__
<< " osd." << peer
1389 << " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
1394 psdout(10) << __func__
<< " does not need backfill" << dendl
;
1399 * Returns true unless there is a non-lost OSD in might_have_unfound.
1401 bool PeeringState::all_unfound_are_queried_or_lost(
1402 const OSDMapRef osdmap
) const
1404 ceph_assert(is_primary());
1406 set
<pg_shard_t
>::const_iterator peer
= might_have_unfound
.begin();
1407 set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
1408 for (; peer
!= mend
; ++peer
) {
1409 if (peer_missing
.count(*peer
))
1411 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(*peer
);
1412 if (iter
!= peer_info
.end() &&
1413 (iter
->second
.is_empty() || iter
->second
.dne()))
1415 if (!osdmap
->exists(peer
->osd
))
1417 const osd_info_t
&osd_info(osdmap
->get_info(peer
->osd
));
1418 if (osd_info
.lost_at
<= osd_info
.up_from
) {
1419 // If there is even one OSD in might_have_unfound that isn't lost, we
1420 // still might retrieve our unfound.
1424 psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
1425 << might_have_unfound
1426 << " have been queried or are marked lost" << dendl
;
1431 void PeeringState::reject_reservation()
1433 pl
->unreserve_recovery_space();
1434 pl
->send_cluster_message(
1436 new MBackfillReserve(
1437 MBackfillReserve::REJECT_TOOFULL
,
1438 spg_t(info
.pgid
.pgid
, primary
.shard
),
1439 get_osdmap_epoch()),
1440 get_osdmap_epoch());
1446 * Returns an iterator to the best info in infos sorted by:
1447 * 1) Prefer newer last_update
1448 * 2) Prefer longer tail if it brings another info into contiguity
1449 * 3) Prefer current primary
1451 map
<pg_shard_t
, pg_info_t
>::const_iterator
PeeringState::find_best_info(
1452 const map
<pg_shard_t
, pg_info_t
> &infos
,
1453 bool restrict_to_up_acting
,
1454 bool *history_les_bound
) const
1456 ceph_assert(history_les_bound
);
1457 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1458 * to make changes to this process. Also, make sure to update it
1459 * when you find bugs! */
1460 eversion_t min_last_update_acceptable
= eversion_t::max();
1461 epoch_t max_last_epoch_started_found
= 0;
1462 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1465 if (!cct
->_conf
->osd_find_best_info_ignore_history_les
&&
1466 max_last_epoch_started_found
< i
->second
.history
.last_epoch_started
) {
1467 *history_les_bound
= true;
1468 max_last_epoch_started_found
= i
->second
.history
.last_epoch_started
;
1470 if (!i
->second
.is_incomplete() &&
1471 max_last_epoch_started_found
< i
->second
.last_epoch_started
) {
1472 *history_les_bound
= false;
1473 max_last_epoch_started_found
= i
->second
.last_epoch_started
;
1476 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1479 if (max_last_epoch_started_found
<= i
->second
.last_epoch_started
) {
1480 if (min_last_update_acceptable
> i
->second
.last_update
)
1481 min_last_update_acceptable
= i
->second
.last_update
;
1484 if (min_last_update_acceptable
== eversion_t::max())
1487 map
<pg_shard_t
, pg_info_t
>::const_iterator best
= infos
.end();
1488 // find osd with newest last_update (oldest for ec_pool).
1489 // if there are multiples, prefer
1490 // - a longer tail, if it brings another peer into log contiguity
1491 // - the current primary
1492 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= infos
.begin();
1495 if (restrict_to_up_acting
&& !is_up(p
->first
) &&
1496 !is_acting(p
->first
))
1498 // Only consider peers with last_update >= min_last_update_acceptable
1499 if (p
->second
.last_update
< min_last_update_acceptable
)
1501 // Disqualify anyone with a too old last_epoch_started
1502 if (p
->second
.last_epoch_started
< max_last_epoch_started_found
)
1504 // Disqualify anyone who is incomplete (not fully backfilled)
1505 if (p
->second
.is_incomplete())
1507 if (best
== infos
.end()) {
1511 // Prefer newer last_update
1512 if (pool
.info
.require_rollback()) {
1513 if (p
->second
.last_update
> best
->second
.last_update
)
1515 if (p
->second
.last_update
< best
->second
.last_update
) {
1520 if (p
->second
.last_update
< best
->second
.last_update
)
1522 if (p
->second
.last_update
> best
->second
.last_update
) {
1528 // Prefer longer tail
1529 if (p
->second
.log_tail
> best
->second
.log_tail
) {
1531 } else if (p
->second
.log_tail
< best
->second
.log_tail
) {
1536 if (!p
->second
.has_missing() && best
->second
.has_missing()) {
1537 psdout(10) << __func__
<< " prefer osd." << p
->first
1538 << " because it is complete while best has missing"
1542 } else if (p
->second
.has_missing() && !best
->second
.has_missing()) {
1543 psdout(10) << __func__
<< " skipping osd." << p
->first
1544 << " because it has missing while best is complete"
1548 // both are complete or have missing
1552 // prefer current primary (usually the caller), all things being equal
1553 if (p
->first
== pg_whoami
) {
1554 psdout(10) << "calc_acting prefer osd." << p
->first
1555 << " because it is current primary" << dendl
;
1563 void PeeringState::calc_ec_acting(
1564 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1566 const vector
<int> &acting
,
1567 const vector
<int> &up
,
1568 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1569 bool restrict_to_up_acting
,
1571 set
<pg_shard_t
> *backfill
,
1572 set
<pg_shard_t
> *acting_backfill
,
1575 vector
<int> want(size
, CRUSH_ITEM_NONE
);
1576 map
<shard_id_t
, set
<pg_shard_t
> > all_info_by_shard
;
1577 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= all_info
.begin();
1578 i
!= all_info
.end();
1580 all_info_by_shard
[i
->first
.shard
].insert(i
->first
);
1582 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1583 ss
<< "For position " << (unsigned)i
<< ": ";
1584 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
&&
1585 !all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1586 all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.last_update
>=
1587 auth_log_shard
->second
.log_tail
) {
1588 ss
<< " selecting up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
)) << std::endl
;
1592 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
) {
1593 ss
<< " backfilling up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
))
1595 backfill
->insert(pg_shard_t(up
[i
], shard_id_t(i
)));
1598 if (acting
.size() > (unsigned)i
&& acting
[i
] != CRUSH_ITEM_NONE
&&
1599 !all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1600 all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.last_update
>=
1601 auth_log_shard
->second
.log_tail
) {
1602 ss
<< " selecting acting[i]: " << pg_shard_t(acting
[i
], shard_id_t(i
)) << std::endl
;
1603 want
[i
] = acting
[i
];
1604 } else if (!restrict_to_up_acting
) {
1605 for (set
<pg_shard_t
>::iterator j
= all_info_by_shard
[shard_id_t(i
)].begin();
1606 j
!= all_info_by_shard
[shard_id_t(i
)].end();
1608 ceph_assert(j
->shard
== i
);
1609 if (!all_info
.find(*j
)->second
.is_incomplete() &&
1610 all_info
.find(*j
)->second
.last_update
>=
1611 auth_log_shard
->second
.log_tail
) {
1612 ss
<< " selecting stray: " << *j
<< std::endl
;
1617 if (want
[i
] == CRUSH_ITEM_NONE
)
1618 ss
<< " failed to fill position " << (int)i
<< std::endl
;
1622 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1623 if (want
[i
] != CRUSH_ITEM_NONE
) {
1624 acting_backfill
->insert(pg_shard_t(want
[i
], shard_id_t(i
)));
1627 acting_backfill
->insert(backfill
->begin(), backfill
->end());
1632 * calculate the desired acting set.
1634 * Choose an appropriate acting set. Prefer up[0], unless it is
1635 * incomplete, or another osd has a longer tail that allows us to
1636 * bring other up nodes up to date.
1638 void PeeringState::calc_replicated_acting(
1639 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1640 uint64_t force_auth_primary_missing_objects
,
1642 const vector
<int> &acting
,
1643 const vector
<int> &up
,
1644 pg_shard_t up_primary
,
1645 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1646 bool restrict_to_up_acting
,
1648 set
<pg_shard_t
> *backfill
,
1649 set
<pg_shard_t
> *acting_backfill
,
1650 const OSDMapRef osdmap
,
1653 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1655 ss
<< __func__
<< " newest update on osd." << auth_log_shard_id
1656 << " with " << auth_log_shard
->second
1657 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
1660 auto primary
= all_info
.find(up_primary
);
1662 !primary
->second
.is_incomplete() &&
1663 primary
->second
.last_update
>=
1664 auth_log_shard
->second
.log_tail
) {
1665 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1666 auto approx_missing_objects
=
1667 primary
->second
.stats
.stats
.sum
.num_objects_missing
;
1668 auto auth_version
= auth_log_shard
->second
.last_update
.version
;
1669 auto primary_version
= primary
->second
.last_update
.version
;
1670 if (auth_version
> primary_version
) {
1671 approx_missing_objects
+= auth_version
- primary_version
;
1673 approx_missing_objects
+= primary_version
- auth_version
;
1675 if ((uint64_t)approx_missing_objects
>
1676 force_auth_primary_missing_objects
) {
1677 primary
= auth_log_shard
;
1678 ss
<< "up_primary: " << up_primary
<< ") has approximate "
1679 << approx_missing_objects
1680 << "(>" << force_auth_primary_missing_objects
<<") "
1681 << "missing objects, osd." << auth_log_shard_id
1682 << " selected as primary instead"
1685 ss
<< "up_primary: " << up_primary
<< ") selected as primary"
1689 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
1692 ceph_assert(!auth_log_shard
->second
.is_incomplete());
1693 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1694 << " selected as primary instead" << std::endl
;
1695 primary
= auth_log_shard
;
1698 ss
<< __func__
<< " primary is osd." << primary
->first
1699 << " with " << primary
->second
<< std::endl
;
1700 want
->push_back(primary
->first
.osd
);
1701 acting_backfill
->insert(primary
->first
);
1703 /* We include auth_log_shard->second.log_tail because in GetLog,
1704 * we will request logs back to the min last_update over our
1705 * acting_backfill set, which will result in our log being extended
1706 * as far backwards as necessary to pick up any peers which can
1707 * be log recovered by auth_log_shard's log */
1708 eversion_t oldest_auth_log_entry
=
1709 std::min(primary
->second
.log_tail
, auth_log_shard
->second
.log_tail
);
1711 // select replicas that have log contiguity with primary.
1712 // prefer up, then acting, then any peer_info osds
1714 pg_shard_t up_cand
= pg_shard_t(i
, shard_id_t::NO_SHARD
);
1715 if (up_cand
== primary
->first
)
1717 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1718 if (cur_info
.is_incomplete() ||
1719 cur_info
.last_update
< oldest_auth_log_entry
) {
1720 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1721 backfill
->insert(up_cand
);
1722 acting_backfill
->insert(up_cand
);
1725 acting_backfill
->insert(up_cand
);
1726 ss
<< " osd." << i
<< " (up) accepted " << cur_info
<< std::endl
;
1730 if (want
->size() >= size
) {
1734 std::vector
<std::pair
<eversion_t
, int>> candidate_by_last_update
;
1735 candidate_by_last_update
.reserve(acting
.size());
1736 // This no longer has backfill OSDs, but they are covered above.
1737 for (auto i
: acting
) {
1738 pg_shard_t
acting_cand(i
, shard_id_t::NO_SHARD
);
1739 // skip up osds we already considered above
1740 if (acting_cand
== primary
->first
)
1742 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
);
1743 if (up_it
!= up
.end())
1746 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1747 if (cur_info
.is_incomplete() ||
1748 cur_info
.last_update
< oldest_auth_log_entry
) {
1749 ss
<< " shard " << acting_cand
<< " (acting) REJECTED "
1750 << cur_info
<< std::endl
;
1752 candidate_by_last_update
.emplace_back(cur_info
.last_update
, i
);
1756 auto sort_by_eversion
=[](const std::pair
<eversion_t
, int> &lhs
,
1757 const std::pair
<eversion_t
, int> &rhs
) {
1758 return lhs
.first
> rhs
.first
;
1760 // sort by last_update, in descending order.
1761 std::sort(candidate_by_last_update
.begin(),
1762 candidate_by_last_update
.end(), sort_by_eversion
);
1763 for (auto &p
: candidate_by_last_update
) {
1764 ceph_assert(want
->size() < size
);
1765 want
->push_back(p
.second
);
1766 pg_shard_t s
= pg_shard_t(p
.second
, shard_id_t::NO_SHARD
);
1767 acting_backfill
->insert(s
);
1768 ss
<< " shard " << s
<< " (acting) accepted "
1769 << all_info
.find(s
)->second
<< std::endl
;
1770 if (want
->size() >= size
) {
1775 if (restrict_to_up_acting
) {
1778 candidate_by_last_update
.clear();
1779 candidate_by_last_update
.reserve(all_info
.size()); // overestimate but fine
1780 // continue to search stray to find more suitable peers
1781 for (auto &i
: all_info
) {
1782 // skip up osds we already considered above
1783 if (i
.first
== primary
->first
)
1785 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
.first
.osd
);
1786 if (up_it
!= up
.end())
1788 vector
<int>::const_iterator acting_it
= find(
1789 acting
.begin(), acting
.end(), i
.first
.osd
);
1790 if (acting_it
!= acting
.end())
1793 if (i
.second
.is_incomplete() ||
1794 i
.second
.last_update
< oldest_auth_log_entry
) {
1795 ss
<< " shard " << i
.first
<< " (stray) REJECTED " << i
.second
1798 candidate_by_last_update
.emplace_back(
1799 i
.second
.last_update
, i
.first
.osd
);
1803 if (candidate_by_last_update
.empty()) {
1804 // save us some effort
1808 // sort by last_update, in descending order.
1809 std::sort(candidate_by_last_update
.begin(),
1810 candidate_by_last_update
.end(), sort_by_eversion
);
1812 for (auto &p
: candidate_by_last_update
) {
1813 ceph_assert(want
->size() < size
);
1814 want
->push_back(p
.second
);
1815 pg_shard_t s
= pg_shard_t(p
.second
, shard_id_t::NO_SHARD
);
1816 acting_backfill
->insert(s
);
1817 ss
<< " shard " << s
<< " (stray) accepted "
1818 << all_info
.find(s
)->second
<< std::endl
;
1819 if (want
->size() >= size
) {
1825 bool PeeringState::recoverable(const vector
<int> &want
) const
1827 unsigned num_want_acting
= 0;
1828 set
<pg_shard_t
> have
;
1829 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1830 if (want
[i
] != CRUSH_ITEM_NONE
) {
1835 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1839 if (num_want_acting
< pool
.info
.min_size
) {
1840 const bool recovery_ec_pool_below_min_size
=
1841 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS
);
1843 if (pool
.info
.is_erasure() && !recovery_ec_pool_below_min_size
) {
1844 psdout(10) << __func__
<< " failed, ec recovery below min size not supported by pre-octopus" << dendl
;
1846 } else if (!cct
->_conf
.get_val
<bool>("osd_allow_recovery_below_min_size")) {
1847 psdout(10) << __func__
<< " failed, recovery below min size not enabled" << dendl
;
1851 if (missing_loc
.get_recoverable_predicate()(have
)) {
1854 psdout(10) << __func__
<< " failed, not recoverable " << dendl
;
1859 void PeeringState::choose_async_recovery_ec(
1860 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1861 const pg_info_t
&auth_info
,
1863 set
<pg_shard_t
> *async_recovery
,
1864 const OSDMapRef osdmap
) const
1866 set
<pair
<int, pg_shard_t
> > candidates_by_cost
;
1867 for (uint8_t i
= 0; i
< want
->size(); ++i
) {
1868 if ((*want
)[i
] == CRUSH_ITEM_NONE
)
1871 // Considering log entries to recover is accurate enough for
1872 // now. We could use minimum_to_decode_with_cost() later if
1874 pg_shard_t
shard_i((*want
)[i
], shard_id_t(i
));
1875 // do not include strays
1876 if (stray_set
.find(shard_i
) != stray_set
.end())
1878 // Do not include an osd that is not up, since choosing it as
1879 // an async_recovery_target will move it out of the acting set.
1880 // This results in it being identified as a stray during peering,
1881 // because it is no longer in the up or acting set.
1882 if (!is_up(shard_i
))
1884 auto shard_info
= all_info
.find(shard_i
)->second
;
1885 // for ec pools we rollback all entries past the authoritative
1886 // last_update *before* activation. This is relatively inexpensive
1887 // compared to recovery, since it is purely local, so treat shards
1888 // past the authoritative last_update the same as those equal to it.
1889 version_t auth_version
= auth_info
.last_update
.version
;
1890 version_t candidate_version
= shard_info
.last_update
.version
;
1891 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1892 auto approx_missing_objects
=
1893 shard_info
.stats
.stats
.sum
.num_objects_missing
;
1894 if (auth_version
> candidate_version
) {
1895 approx_missing_objects
+= auth_version
- candidate_version
;
1897 if (static_cast<uint64_t>(approx_missing_objects
) >
1898 cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1899 candidates_by_cost
.emplace(approx_missing_objects
, shard_i
);
1902 if (auth_version
> candidate_version
&&
1903 (auth_version
- candidate_version
) > cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1904 candidates_by_cost
.insert(make_pair(auth_version
- candidate_version
, shard_i
));
1909 psdout(20) << __func__
<< " candidates by cost are: " << candidates_by_cost
1912 // take out as many osds as we can for async recovery, in order of cost
1913 for (auto rit
= candidates_by_cost
.rbegin();
1914 rit
!= candidates_by_cost
.rend(); ++rit
) {
1915 pg_shard_t cur_shard
= rit
->second
;
1916 vector
<int> candidate_want(*want
);
1917 candidate_want
[cur_shard
.shard
.id
] = CRUSH_ITEM_NONE
;
1918 if (recoverable(candidate_want
)) {
1919 want
->swap(candidate_want
);
1920 async_recovery
->insert(cur_shard
);
1923 psdout(20) << __func__
<< " result want=" << *want
1924 << " async_recovery=" << *async_recovery
<< dendl
;
1927 void PeeringState::choose_async_recovery_replicated(
1928 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1929 const pg_info_t
&auth_info
,
1931 set
<pg_shard_t
> *async_recovery
,
1932 const OSDMapRef osdmap
) const
1934 set
<pair
<int, pg_shard_t
> > candidates_by_cost
;
1935 for (auto osd_num
: *want
) {
1936 pg_shard_t
shard_i(osd_num
, shard_id_t::NO_SHARD
);
1937 // do not include strays
1938 if (stray_set
.find(shard_i
) != stray_set
.end())
1940 // Do not include an osd that is not up, since choosing it as
1941 // an async_recovery_target will move it out of the acting set.
1942 // This results in it being identified as a stray during peering,
1943 // because it is no longer in the up or acting set.
1944 if (!is_up(shard_i
))
1946 auto shard_info
= all_info
.find(shard_i
)->second
;
1947 // use the approximate magnitude of the difference in length of
1948 // logs plus historical missing objects as the cost of recovery
1949 version_t auth_version
= auth_info
.last_update
.version
;
1950 version_t candidate_version
= shard_info
.last_update
.version
;
1951 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1952 auto approx_missing_objects
=
1953 shard_info
.stats
.stats
.sum
.num_objects_missing
;
1954 if (auth_version
> candidate_version
) {
1955 approx_missing_objects
+= auth_version
- candidate_version
;
1957 approx_missing_objects
+= candidate_version
- auth_version
;
1959 if (static_cast<uint64_t>(approx_missing_objects
) >
1960 cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost
.emplace(approx_missing_objects
, shard_i
);
1964 size_t approx_entries
;
1965 if (auth_version
> candidate_version
) {
1966 approx_entries
= auth_version
- candidate_version
;
1968 approx_entries
= candidate_version
- auth_version
;
1970 if (approx_entries
> cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1971 candidates_by_cost
.insert(make_pair(approx_entries
, shard_i
));
1976 psdout(20) << __func__
<< " candidates by cost are: " << candidates_by_cost
1978 // take out as many osds as we can for async recovery, in order of cost
1979 for (auto rit
= candidates_by_cost
.rbegin();
1980 rit
!= candidates_by_cost
.rend(); ++rit
) {
1981 if (want
->size() <= pool
.info
.min_size
) {
1984 pg_shard_t cur_shard
= rit
->second
;
1985 vector
<int> candidate_want(*want
);
1986 for (auto it
= candidate_want
.begin(); it
!= candidate_want
.end(); ++it
) {
1987 if (*it
== cur_shard
.osd
) {
1988 candidate_want
.erase(it
);
1989 want
->swap(candidate_want
);
1990 async_recovery
->insert(cur_shard
);
1995 psdout(20) << __func__
<< " result want=" << *want
1996 << " async_recovery=" << *async_recovery
<< dendl
;
2004 * calculate the desired acting, and request a change with the monitor
2005 * if it differs from the current acting.
2007 * if restrict_to_up_acting=true, we filter out anything that's not in
2008 * up/acting. in order to lift this restriction, we need to
2009 * 1) check whether it's worth switching the acting set any time we get
2010 * a new pg info (not just here, when recovery finishes)
2011 * 2) check whether anything in want_acting went down on each new map
2012 * (and, if so, calculate a new want_acting)
2013 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2016 bool PeeringState::choose_acting(pg_shard_t
&auth_log_shard_id
,
2017 bool restrict_to_up_acting
,
2018 bool *history_les_bound
,
2019 bool request_pg_temp_change_only
)
2021 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
2022 all_info
[pg_whoami
] = info
;
2024 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 10>()) {
2025 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
2026 p
!= all_info
.end();
2028 psdout(10) << __func__
<< " all_info osd." << p
->first
<< " "
2029 << p
->second
<< dendl
;
2033 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
2034 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
2036 if (auth_log_shard
== all_info
.end()) {
2038 psdout(10) << __func__
<< " no suitable info found (incomplete backfills?),"
2039 << " reverting to up" << dendl
;
2042 pl
->queue_want_pg_temp(empty
);
2044 psdout(10) << __func__
<< " failed" << dendl
;
2045 ceph_assert(want_acting
.empty());
2050 ceph_assert(!auth_log_shard
->second
.is_incomplete());
2051 auth_log_shard_id
= auth_log_shard
->first
;
2053 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
2056 if (pool
.info
.is_replicated())
2057 calc_replicated_acting(
2059 cct
->_conf
.get_val
<uint64_t>(
2060 "osd_force_auth_primary_missing_objects"),
2061 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
2066 restrict_to_up_acting
,
2069 &want_acting_backfill
,
2075 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
2079 restrict_to_up_acting
,
2082 &want_acting_backfill
,
2084 psdout(10) << ss
.str() << dendl
;
2086 if (!recoverable(want
)) {
2087 want_acting
.clear();
2091 set
<pg_shard_t
> want_async_recovery
;
2092 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC
)) {
2093 if (pool
.info
.is_erasure()) {
2094 choose_async_recovery_ec(
2095 all_info
, auth_log_shard
->second
, &want
, &want_async_recovery
,
2098 choose_async_recovery_replicated(
2099 all_info
, auth_log_shard
->second
, &want
, &want_async_recovery
,
2103 while (want
.size() > pool
.info
.size
) {
2104 // async recovery should have taken out as many osds as it can.
2105 // if not, then always evict the last peer
2106 // (will get synchronously recovered later)
2107 psdout(10) << __func__
<< " evicting osd." << want
.back()
2108 << " from oversized want " << want
<< dendl
;
2111 if (want
!= acting
) {
2112 psdout(10) << __func__
<< " want " << want
<< " != acting " << acting
2113 << ", requesting pg_temp change" << dendl
;
2116 if (!cct
->_conf
->osd_debug_no_acting_change
) {
2117 if (want_acting
== up
) {
2118 // There can't be any pending backfill if
2119 // want is the same as crush map up OSDs.
2120 ceph_assert(want_backfill
.empty());
2122 pl
->queue_want_pg_temp(empty
);
2124 pl
->queue_want_pg_temp(want
);
2128 if (request_pg_temp_change_only
)
2130 want_acting
.clear();
2131 acting_recovery_backfill
= want_acting_backfill
;
2132 psdout(10) << "acting_recovery_backfill is "
2133 << acting_recovery_backfill
<< dendl
;
2135 backfill_targets
.empty() ||
2136 backfill_targets
== want_backfill
);
2137 if (backfill_targets
.empty()) {
2138 // Caller is GetInfo
2139 backfill_targets
= want_backfill
;
2141 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2143 async_recovery_targets
.empty() ||
2144 async_recovery_targets
== want_async_recovery
||
2146 if (async_recovery_targets
.empty() || !needs_recovery()) {
2147 async_recovery_targets
= want_async_recovery
;
2149 // Will not change if already set because up would have had to change
2150 // Verify that nothing in backfill is in stray_set
2151 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
2152 i
!= want_backfill
.end();
2154 ceph_assert(stray_set
.find(*i
) == stray_set
.end());
2156 psdout(10) << "choose_acting want=" << want
<< " backfill_targets="
2157 << want_backfill
<< " async_recovery_targets="
2158 << async_recovery_targets
<< dendl
;
2162 void PeeringState::log_weirdness()
2164 if (pg_log
.get_tail() != info
.log_tail
)
2165 pl
->get_clog_error() << info
.pgid
2166 << " info mismatch, log.tail " << pg_log
.get_tail()
2167 << " != info.log_tail " << info
.log_tail
;
2168 if (pg_log
.get_head() != info
.last_update
)
2169 pl
->get_clog_error() << info
.pgid
2170 << " info mismatch, log.head " << pg_log
.get_head()
2171 << " != info.last_update " << info
.last_update
;
2173 if (!pg_log
.get_log().empty()) {
2175 if ((pg_log
.get_log().log
.begin()->version
<= pg_log
.get_tail()))
2176 pl
->get_clog_error() << info
.pgid
2177 << " log bound mismatch, info (tail,head] ("
2178 << pg_log
.get_tail() << ","
2179 << pg_log
.get_head() << "]"
2181 << pg_log
.get_log().log
.begin()->version
<< ","
2182 << pg_log
.get_log().log
.rbegin()->version
<< "]";
2185 if (pg_log
.get_log().caller_ops
.size() > pg_log
.get_log().log
.size()) {
2186 pl
->get_clog_error() << info
.pgid
2187 << " caller_ops.size "
2188 << pg_log
.get_log().caller_ops
.size()
2189 << " > log size " << pg_log
.get_log().log
.size();
2194 * Process information from a replica to determine if it could have any
2195 * objects that i need.
2197 * TODO: if the missing set becomes very large, this could get expensive.
2198 * Instead, we probably want to just iterate over our unfound set.
2200 bool PeeringState::search_for_missing(
2201 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
2203 PeeringCtxWrapper
&ctx
)
2205 uint64_t num_unfound_before
= missing_loc
.num_unfound();
2206 bool found_missing
= missing_loc
.add_source_info(
2207 from
, oinfo
, omissing
, ctx
.handle
);
2208 if (found_missing
&& num_unfound_before
!= missing_loc
.num_unfound())
2209 pl
->publish_stats_to_osd();
2210 // avoid doing this if the peer is empty. This is abit of paranoia
2211 // to avoid doing something rash if add_source_info() above
2212 // incorrectly decided we found something new. (if the peer has
2213 // last_update=0'0 that's impossible.)
2214 if (found_missing
&&
2215 oinfo
.last_update
!= eversion_t()) {
2216 pg_info_t
tinfo(oinfo
);
2217 tinfo
.pgid
.shard
= pg_whoami
.shard
;
2220 spg_t(info
.pgid
.pgid
, from
.shard
),
2221 get_osdmap_epoch(), // fixme: use lower epoch?
2225 return found_missing
;
2228 bool PeeringState::discover_all_missing(
2229 BufferedRecoveryMessages
&rctx
)
2231 auto &missing
= pg_log
.get_missing();
2232 uint64_t unfound
= get_num_unfound();
2233 bool any
= false; // did we start any queries
2235 psdout(10) << __func__
<< " "
2236 << missing
.num_missing() << " missing, "
2237 << unfound
<< " unfound"
2240 std::set
<pg_shard_t
>::const_iterator m
= might_have_unfound
.begin();
2241 std::set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
2242 for (; m
!= mend
; ++m
) {
2243 pg_shard_t
peer(*m
);
2245 if (!get_osdmap()->is_up(peer
.osd
)) {
2246 psdout(20) << __func__
<< " skipping down osd." << peer
<< dendl
;
2250 if (peer_purged
.count(peer
)) {
2251 psdout(20) << __func__
<< " skipping purged osd." << peer
<< dendl
;
2255 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(peer
);
2256 if (iter
!= peer_info
.end() &&
2257 (iter
->second
.is_empty() || iter
->second
.dne())) {
2258 // ignore empty peers
2262 // If we've requested any of this stuff, the pg_missing_t information
2263 // should be on its way.
2264 // TODO: coalsce requested_* into a single data structure
2265 if (peer_missing
.find(peer
) != peer_missing
.end()) {
2266 psdout(20) << __func__
<< ": osd." << peer
2267 << ": we already have pg_missing_t" << dendl
;
2270 if (peer_log_requested
.find(peer
) != peer_log_requested
.end()) {
2271 psdout(20) << __func__
<< ": osd." << peer
2272 << ": in peer_log_requested" << dendl
;
2275 if (peer_missing_requested
.find(peer
) != peer_missing_requested
.end()) {
2276 psdout(20) << __func__
<< ": osd." << peer
2277 << ": in peer_missing_requested" << dendl
;
2282 psdout(10) << __func__
<< ": osd." << peer
<< ": requesting pg_missing_t"
2284 peer_missing_requested
.insert(peer
);
2287 spg_t(info
.pgid
.pgid
, peer
.shard
),
2289 pg_query_t::FULLLOG
,
2290 peer
.shard
, pg_whoami
.shard
,
2291 info
.history
, get_osdmap_epoch()));
/* Build the might_have_unfound set.
 *
 * This is used by the primary OSD during recovery.
 *
 * This set tracks the OSDs which might have unfound objects that the primary
 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound,
 * we will remove that OSD from the set.
 */
2305 void PeeringState::build_might_have_unfound()
2307 ceph_assert(might_have_unfound
.empty());
2308 ceph_assert(is_primary());
2310 psdout(10) << __func__
<< dendl
;
2312 check_past_interval_bounds();
2314 might_have_unfound
= past_intervals
.get_might_have_unfound(
2316 pool
.info
.is_erasure());
2318 // include any (stray) peers
2319 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
2320 p
!= peer_info
.end();
2322 might_have_unfound
.insert(p
->first
);
2324 psdout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
2327 void PeeringState::activate(
2328 ObjectStore::Transaction
& t
,
2329 epoch_t activation_epoch
,
2330 PeeringCtxWrapper
&ctx
)
2332 ceph_assert(!is_peered());
2335 state_clear(PG_STATE_DOWN
);
2337 send_notify
= false;
2340 // only update primary last_epoch_started if we will go active
2341 if (actingset
.size() >= pool
.info
.min_size
) {
2342 ceph_assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
2343 info
.last_epoch_started
<= activation_epoch
);
2344 info
.last_epoch_started
= activation_epoch
;
2345 info
.last_interval_started
= info
.history
.same_interval_since
;
2347 } else if (is_acting(pg_whoami
)) {
2348 /* update last_epoch_started on acting replica to whatever the primary sent
2349 * unless it's smaller (could happen if we are going peered rather than
2350 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2351 if (info
.last_epoch_started
< activation_epoch
) {
2352 info
.last_epoch_started
= activation_epoch
;
2353 info
.last_interval_started
= info
.history
.same_interval_since
;
2357 auto &missing
= pg_log
.get_missing();
2359 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
2361 last_update_ondisk
= info
.last_update
;
2363 last_update_applied
= info
.last_update
;
2364 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
2366 need_up_thru
= false;
2368 // write pg info, log
2370 dirty_big_info
= true; // maybe
2372 pl
->schedule_event_on_commit(
2374 std::make_shared
<PGPeeringEvent
>(
2379 activation_epoch
)));
2381 // init complete pointer
2382 if (missing
.num_missing() == 0) {
2383 psdout(10) << "activate - no missing, moving last_complete " << info
.last_complete
2384 << " -> " << info
.last_update
<< dendl
;
2385 info
.last_complete
= info
.last_update
;
2386 info
.stats
.stats
.sum
.num_objects_missing
= 0;
2387 pg_log
.reset_recovery_pointers();
2389 psdout(10) << "activate - not complete, " << missing
<< dendl
;
2390 info
.stats
.stats
.sum
.num_objects_missing
= missing
.num_missing();
2391 pg_log
.activate_not_complete(info
);
2397 // initialize snap_trimq
2398 interval_set
<snapid_t
> to_trim
;
2399 auto& removed_snaps_queue
= get_osdmap()->get_removed_snaps_queue();
2400 auto p
= removed_snaps_queue
.find(info
.pgid
.pgid
.pool());
2401 if (p
!= removed_snaps_queue
.end()) {
2402 dout(20) << "activate - purged_snaps " << info
.purged_snaps
2403 << " removed_snaps " << p
->second
2405 for (auto q
: p
->second
) {
2406 to_trim
.insert(q
.first
, q
.second
);
2409 interval_set
<snapid_t
> purged
;
2410 purged
.intersection_of(to_trim
, info
.purged_snaps
);
2411 to_trim
.subtract(purged
);
2413 if (HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
2414 renew_lease(pl
->get_mnow());
2415 // do not schedule until we are actually activated
2418 // adjust purged_snaps: PG may have been inactive while snaps were pruned
2419 // from the removed_snaps_queue in the osdmap. update local purged_snaps
2420 // reflect only those snaps that we thought were pruned and were still in
2422 info
.purged_snaps
.swap(purged
);
2424 // start up replicas
2425 info
.history
.refresh_prior_readable_until_ub(pl
->get_mnow(),
2426 prior_readable_until_ub
);
2428 ceph_assert(!acting_recovery_backfill
.empty());
2429 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2430 i
!= acting_recovery_backfill
.end();
2432 if (*i
== pg_whoami
) continue;
2433 pg_shard_t peer
= *i
;
2434 ceph_assert(peer_info
.count(peer
));
2435 pg_info_t
& pi
= peer_info
[peer
];
2437 psdout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
2440 ceph_assert(peer_missing
.count(peer
));
2441 pg_missing_t
& pm
= peer_missing
[peer
];
2443 bool needs_past_intervals
= pi
.dne();
2445 if (pi
.last_update
== info
.last_update
) {
2447 if (!pi
.last_backfill
.is_max())
2448 pl
->get_clog_info() << info
.pgid
<< " continuing backfill to osd."
2450 << " from (" << pi
.log_tail
<< "," << pi
.last_update
2451 << "] " << pi
.last_backfill
2452 << " to " << info
.last_update
;
2453 if (!pi
.is_empty()) {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, queueing in pending_activators" << dendl
;
2458 spg_t(info
.pgid
.pgid
, peer
.shard
),
2459 get_osdmap_epoch(), // fixme: use lower epoch?
2464 psdout(10) << "activate peer osd." << peer
2465 << " is up to date, but sending pg_log anyway" << dendl
;
2467 i
->shard
, pg_whoami
.shard
,
2468 get_osdmap_epoch(), info
,
2469 last_peering_reset
);
2472 pg_log
.get_tail() > pi
.last_update
||
2473 pi
.last_backfill
== hobject_t() ||
2474 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
2475 /* ^ This last case covers a situation where a replica is not contiguous
2476 * with the auth_log, but is contiguous with this replica. Reshuffling
2477 * the active set to handle this would be tricky, so instead we just go
2478 * ahead and backfill it anyway. This is probably preferrable in any
2479 * case since the replica in question would have to be significantly
2483 pl
->get_clog_debug() << info
.pgid
<< " starting backfill to osd." << peer
2484 << " from (" << pi
.log_tail
<< "," << pi
.last_update
2485 << "] " << pi
.last_backfill
2486 << " to " << info
.last_update
;
2488 pi
.last_update
= info
.last_update
;
2489 pi
.last_complete
= info
.last_update
;
2490 pi
.set_last_backfill(hobject_t());
2491 pi
.last_epoch_started
= info
.last_epoch_started
;
2492 pi
.last_interval_started
= info
.last_interval_started
;
2493 pi
.history
= info
.history
;
2494 pi
.hit_set
= info
.hit_set
;
2495 // Save num_bytes for reservation request, can't be negative
2496 peer_bytes
[peer
] = std::max
<int64_t>(0, pi
.stats
.stats
.sum
.num_bytes
);
2497 pi
.stats
.stats
.clear();
2498 pi
.stats
.stats
.sum
.num_bytes
= peer_bytes
[peer
];
2500 // initialize peer with our purged_snaps.
2501 pi
.purged_snaps
= info
.purged_snaps
;
2504 i
->shard
, pg_whoami
.shard
,
2505 get_osdmap_epoch(), pi
,
2506 last_peering_reset
/* epoch to create pg at */);
2508 // send some recent log, so that op dup detection works well.
2509 m
->log
.copy_up_to(cct
, pg_log
.get_log(),
2510 cct
->_conf
->osd_max_pg_log_entries
);
2511 m
->info
.log_tail
= m
->log
.tail
;
2512 pi
.log_tail
= m
->log
.tail
; // sigh...
2517 ceph_assert(pg_log
.get_tail() <= pi
.last_update
);
2519 i
->shard
, pg_whoami
.shard
,
2520 get_osdmap_epoch(), info
,
2521 last_peering_reset
/* epoch to create pg at */);
2522 // send new stuff to append to replicas log
2523 m
->log
.copy_after(cct
, pg_log
.get_log(), pi
.last_update
);
2526 // share past_intervals if we are creating the pg on the replica
2527 // based on whether our info for that peer was dne() *before*
2528 // updating pi.history in the backfill block above.
2529 if (m
&& needs_past_intervals
)
2530 m
->past_intervals
= past_intervals
;
2532 // update local version of peer's missing list!
2533 if (m
&& pi
.last_backfill
!= hobject_t()) {
2534 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
2535 p
!= m
->log
.log
.end();
2537 if (p
->soid
<= pi
.last_backfill
&&
2539 if (perform_deletes_during_peering() && p
->is_delete()) {
2540 pm
.rm(p
->soid
, p
->version
);
2542 pm
.add_next_event(*p
);
2549 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
2551 m
->lease
= get_lease();
2552 pl
->send_cluster_message(peer
.osd
, m
, get_osdmap_epoch());
2556 pi
.last_update
= info
.last_update
;
2558 // update our missing
2559 if (pm
.num_missing() == 0) {
2560 pi
.last_complete
= pi
.last_update
;
2561 psdout(10) << "activate peer osd." << peer
<< " " << pi
2562 << " uptodate" << dendl
;
2564 psdout(10) << "activate peer osd." << peer
<< " " << pi
2565 << " missing " << pm
<< dendl
;
2569 // Set up missing_loc
2570 set
<pg_shard_t
> complete_shards
;
2571 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2572 i
!= acting_recovery_backfill
.end();
2574 psdout(20) << __func__
<< " setting up missing_loc from shard " << *i
2576 if (*i
== get_primary()) {
2577 missing_loc
.add_active_missing(missing
);
2578 if (!missing
.have_missing())
2579 complete_shards
.insert(*i
);
2581 auto peer_missing_entry
= peer_missing
.find(*i
);
2582 ceph_assert(peer_missing_entry
!= peer_missing
.end());
2583 missing_loc
.add_active_missing(peer_missing_entry
->second
);
2584 if (!peer_missing_entry
->second
.have_missing() &&
2585 peer_info
[*i
].last_backfill
.is_max())
2586 complete_shards
.insert(*i
);
2590 // If necessary, create might_have_unfound to help us find our unfound objects.
2591 // NOTE: It's important that we build might_have_unfound before trimming the
2593 might_have_unfound
.clear();
2594 if (needs_recovery()) {
2595 // If only one shard has missing, we do a trick to add all others as recovery
2596 // source, this is considered safe since the PGLogs have been merged locally,
2597 // and covers vast majority of the use cases, like one OSD/host is down for
2598 // a while for hardware repairing
2599 if (complete_shards
.size() + 1 == acting_recovery_backfill
.size()) {
2600 missing_loc
.add_batch_sources_info(complete_shards
, ctx
.handle
);
2602 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
2604 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2605 i
!= acting_recovery_backfill
.end();
2607 if (*i
== pg_whoami
) continue;
2608 psdout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
2609 ceph_assert(peer_missing
.count(*i
));
2610 ceph_assert(peer_info
.count(*i
));
2611 missing_loc
.add_source_info(
2618 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
2619 i
!= peer_missing
.end();
2621 if (is_acting_recovery_backfill(i
->first
))
2623 ceph_assert(peer_info
.count(i
->first
));
2625 peer_info
[i
->first
],
2631 build_might_have_unfound();
2633 // Always call now so update_calc_stats() will be accurate
2634 discover_all_missing(ctx
.msgs
);
2638 // num_objects_degraded if calculated should reflect this too, unless no
2639 // missing and we are about to go clean.
2640 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
2641 state_set(PG_STATE_UNDERSIZED
);
2644 state_set(PG_STATE_ACTIVATING
);
2645 pl
->on_activate(std::move(to_trim
));
2647 if (actingset
.size() >= pool
.info
.min_size
) {
2648 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
2649 pg_log
.roll_forward(rollbacker
.get());
2653 void PeeringState::share_pg_info()
2655 psdout(10) << "share_pg_info" << dendl
;
2657 info
.history
.refresh_prior_readable_until_ub(pl
->get_mnow(),
2658 prior_readable_until_ub
);
2660 // share new pg_info_t with replicas
2661 ceph_assert(!acting_recovery_backfill
.empty());
2662 for (auto pg_shard
: acting_recovery_backfill
) {
2663 if (pg_shard
== pg_whoami
) continue;
2664 if (auto peer
= peer_info
.find(pg_shard
); peer
!= peer_info
.end()) {
2665 peer
->second
.last_epoch_started
= info
.last_epoch_started
;
2666 peer
->second
.last_interval_started
= info
.last_interval_started
;
2667 peer
->second
.history
.merge(info
.history
);
2669 Message
* m
= nullptr;
2670 if (last_require_osd_release
>= ceph_release_t::octopus
) {
2671 m
= new MOSDPGInfo2
{spg_t
{info
.pgid
.pgid
, pg_shard
.shard
},
2677 m
= new MOSDPGInfo
{get_osdmap_epoch(),
2678 {pg_notify_t
{pg_shard
.shard
,
2685 pl
->send_cluster_message(pg_shard
.osd
, m
, get_osdmap_epoch());
2689 void PeeringState::merge_log(
2690 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
,
2693 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
2695 oinfo
, olog
, from
, info
, rollbacker
.get(), dirty_info
, dirty_big_info
);
2698 void PeeringState::rewind_divergent_log(
2699 ObjectStore::Transaction
& t
, eversion_t newhead
)
2701 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
2702 pg_log
.rewind_divergent_log(
2703 newhead
, info
, rollbacker
.get(), dirty_info
, dirty_big_info
);
2707 void PeeringState::proc_primary_info(
2708 ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
2710 ceph_assert(!is_primary());
2712 update_history(oinfo
.history
);
2713 if (!info
.stats
.stats_invalid
&& info
.stats
.stats
.sum
.num_scrub_errors
) {
2714 info
.stats
.stats
.sum
.num_scrub_errors
= 0;
2715 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= 0;
2716 info
.stats
.stats
.sum
.num_deep_scrub_errors
= 0;
2720 if (!(info
.purged_snaps
== oinfo
.purged_snaps
)) {
2721 psdout(10) << __func__
<< " updating purged_snaps to "
2722 << oinfo
.purged_snaps
2724 info
.purged_snaps
= oinfo
.purged_snaps
;
2726 dirty_big_info
= true;
2730 void PeeringState::proc_master_log(
2731 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
2732 pg_log_t
&olog
, pg_missing_t
& omissing
, pg_shard_t from
)
2734 psdout(10) << "proc_master_log for osd." << from
<< ": "
2735 << olog
<< " " << omissing
<< dendl
;
2736 ceph_assert(!is_peered() && is_primary());
2738 // merge log into our own log to build master log. no need to
2739 // make any adjustments to their missing map; we are taking their
2740 // log to be authoritative (i.e., their entries are by definitely
2742 merge_log(t
, oinfo
, olog
, from
);
2743 peer_info
[from
] = oinfo
;
2744 psdout(10) << " peer osd." << from
<< " now " << oinfo
2745 << " " << omissing
<< dendl
;
2746 might_have_unfound
.insert(from
);
2748 // See doc/dev/osd_internals/last_epoch_started
2749 if (oinfo
.last_epoch_started
> info
.last_epoch_started
) {
2750 info
.last_epoch_started
= oinfo
.last_epoch_started
;
2753 if (oinfo
.last_interval_started
> info
.last_interval_started
) {
2754 info
.last_interval_started
= oinfo
.last_interval_started
;
2757 update_history(oinfo
.history
);
2758 ceph_assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
2759 info
.last_epoch_started
>= info
.history
.last_epoch_started
);
2761 peer_missing
[from
].claim(omissing
);
2764 void PeeringState::proc_replica_log(
2766 const pg_log_t
&olog
,
2767 pg_missing_t
& omissing
,
2770 psdout(10) << "proc_replica_log for osd." << from
<< ": "
2771 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
2773 pg_log
.proc_replica_log(oinfo
, olog
, omissing
, from
);
2775 peer_info
[from
] = oinfo
;
2776 psdout(10) << " peer osd." << from
<< " now "
2777 << oinfo
<< " " << omissing
<< dendl
;
2778 might_have_unfound
.insert(from
);
2780 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
2781 omissing
.get_items().begin();
2782 i
!= omissing
.get_items().end();
2784 psdout(20) << " after missing " << i
->first
2785 << " need " << i
->second
.need
2786 << " have " << i
->second
.have
<< dendl
;
2788 peer_missing
[from
].claim(omissing
);
2791 void PeeringState::fulfill_info(
2792 pg_shard_t from
, const pg_query_t
&query
,
2793 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
2795 ceph_assert(from
== primary
);
2796 ceph_assert(query
.type
== pg_query_t::INFO
);
2799 psdout(10) << "sending info" << dendl
;
2800 notify_info
= make_pair(from
, info
);
2803 void PeeringState::fulfill_log(
2804 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
2806 psdout(10) << "log request from " << from
<< dendl
;
2807 ceph_assert(from
== primary
);
2808 ceph_assert(query
.type
!= pg_query_t::INFO
);
2810 MOSDPGLog
*mlog
= new MOSDPGLog(
2811 from
.shard
, pg_whoami
.shard
,
2814 mlog
->missing
= pg_log
.get_missing();
2816 // primary -> other, when building master log
2817 if (query
.type
== pg_query_t::LOG
) {
2818 psdout(10) << " sending info+missing+log since " << query
.since
2820 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
2821 pl
->get_clog_error() << info
.pgid
<< " got broken pg_query_t::LOG since "
2823 << " when my log.tail is " << pg_log
.get_tail()
2824 << ", sending full log instead";
2825 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
2827 mlog
->log
.copy_after(cct
, pg_log
.get_log(), query
.since
);
2829 else if (query
.type
== pg_query_t::FULLLOG
) {
2830 psdout(10) << " sending info+missing+full log" << dendl
;
2831 mlog
->log
= pg_log
.get_log();
2834 psdout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
2836 pl
->send_cluster_message(from
.osd
, mlog
, get_osdmap_epoch(), true);
2839 void PeeringState::fulfill_query(const MQuery
& query
, PeeringCtxWrapper
&rctx
)
2841 if (query
.query
.type
== pg_query_t::INFO
) {
2842 pair
<pg_shard_t
, pg_info_t
> notify_info
;
2843 // note this refreshes our prior_readable_until_ub value
2844 update_history(query
.query
.history
);
2845 fulfill_info(query
.from
, query
.query
, notify_info
);
2847 notify_info
.first
.osd
,
2849 notify_info
.first
.shard
, pg_whoami
.shard
,
2855 update_history(query
.query
.history
);
2856 fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
2860 void PeeringState::try_mark_clean()
2862 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2863 state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
2864 state_set(PG_STATE_CLEAN
);
2865 info
.history
.last_epoch_clean
= get_osdmap_epoch();
2866 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2867 past_intervals
.clear();
2868 dirty_big_info
= true;
2872 if (!is_active() && is_peered()) {
2875 if (pool
.info
.is_pending_merge(info
.pgid
.pgid
, &target
)) {
2877 psdout(10) << "ready to merge (target)" << dendl
;
2878 pl
->set_ready_to_merge_target(
2880 info
.history
.last_epoch_started
,
2881 info
.history
.last_epoch_clean
);
2883 psdout(10) << "ready to merge (source)" << dendl
;
2884 pl
->set_ready_to_merge_source(info
.last_update
);
2888 psdout(10) << "not clean, not ready to merge" << dendl
;
2889 // we should have notified OSD in Active state entry point
2893 state_clear(PG_STATE_FORCED_RECOVERY
| PG_STATE_FORCED_BACKFILL
);
2896 pl
->publish_stats_to_osd();
2897 clear_recovery_state();
2900 void PeeringState::split_into(
2901 pg_t child_pgid
, PeeringState
*child
, unsigned split_bits
)
2903 child
->update_osdmap_ref(get_osdmap());
2907 pg_log
.split_into(child_pgid
, split_bits
, &(child
->pg_log
));
2908 child
->info
.last_complete
= info
.last_complete
;
2910 info
.last_update
= pg_log
.get_head();
2911 child
->info
.last_update
= child
->pg_log
.get_head();
2913 child
->info
.last_user_version
= info
.last_user_version
;
2915 info
.log_tail
= pg_log
.get_tail();
2916 child
->info
.log_tail
= child
->pg_log
.get_tail();
2918 // reset last_complete, we might have modified pg_log & missing above
2919 pg_log
.reset_complete_to(&info
);
2920 child
->pg_log
.reset_complete_to(&child
->info
);
2923 child
->info
.history
= info
.history
;
2924 child
->info
.history
.epoch_created
= get_osdmap_epoch();
2925 child
->info
.purged_snaps
= info
.purged_snaps
;
2927 if (info
.last_backfill
.is_max()) {
2928 child
->info
.set_last_backfill(hobject_t::get_max());
2930 // restart backfill on parent and child to be safe. we could
2931 // probably do better in the bitwise sort case, but it's more
2932 // fragile (there may be special work to do on backfill completion
2934 info
.set_last_backfill(hobject_t());
2935 child
->info
.set_last_backfill(hobject_t());
2936 // restarting backfill implies that the missing set is empty,
2937 // since it is only used for objects prior to last_backfill
2938 pg_log
.reset_backfill();
2939 child
->pg_log
.reset_backfill();
2942 child
->info
.stats
= info
.stats
;
2943 child
->info
.stats
.parent_split_bits
= split_bits
;
2944 info
.stats
.stats_invalid
= true;
2945 child
->info
.stats
.stats_invalid
= true;
2946 child
->info
.last_epoch_started
= info
.last_epoch_started
;
2947 child
->info
.last_interval_started
= info
.last_interval_started
;
2949 // There can't be recovery/backfill going on now
2950 int primary
, up_primary
;
2951 vector
<int> newup
, newacting
;
2952 get_osdmap()->pg_to_up_acting_osds(
2953 child
->info
.pgid
.pgid
, &newup
, &up_primary
, &newacting
, &primary
);
2954 child
->init_primary_up_acting(
2959 child
->role
= OSDMap::calc_pg_role(pg_whoami
, child
->acting
);
2961 // this comparison includes primary rank via pg_shard_t
2962 if (get_primary() != child
->get_primary())
2963 child
->info
.history
.same_primary_since
= get_osdmap_epoch();
2965 child
->info
.stats
.up
= up
;
2966 child
->info
.stats
.up_primary
= up_primary
;
2967 child
->info
.stats
.acting
= acting
;
2968 child
->info
.stats
.acting_primary
= primary
;
2969 child
->info
.stats
.mapping_epoch
= get_osdmap_epoch();
2972 child
->past_intervals
= past_intervals
;
2974 child
->on_new_interval();
2976 child
->send_notify
= !child
->is_primary();
2978 child
->dirty_info
= true;
2979 child
->dirty_big_info
= true;
2981 dirty_big_info
= true;
2984 void PeeringState::merge_from(
2985 map
<spg_t
,PeeringState
*>& sources
,
2987 unsigned split_bits
,
2988 const pg_merge_meta_t
& last_pg_merge_meta
)
2990 bool incomplete
= false;
2991 if (info
.last_complete
!= info
.last_update
||
2992 info
.is_incomplete() ||
2994 psdout(10) << __func__
<< " target incomplete" << dendl
;
2997 if (last_pg_merge_meta
.source_pgid
!= pg_t()) {
2998 if (info
.pgid
.pgid
!= last_pg_merge_meta
.source_pgid
.get_parent()) {
2999 psdout(10) << __func__
<< " target doesn't match expected parent "
3000 << last_pg_merge_meta
.source_pgid
.get_parent()
3001 << " of source_pgid " << last_pg_merge_meta
.source_pgid
3005 if (info
.last_update
!= last_pg_merge_meta
.target_version
) {
3006 psdout(10) << __func__
<< " target version doesn't match expected "
3007 << last_pg_merge_meta
.target_version
<< dendl
;
3012 PGLog::LogEntryHandlerRef handler
{pl
->get_log_handler(rctx
.transaction
)};
3013 pg_log
.roll_forward(handler
.get());
3015 info
.last_complete
= info
.last_update
; // to fake out trim()
3016 pg_log
.reset_recovery_pointers();
3017 pg_log
.trim(info
.last_update
, info
);
3019 vector
<PGLog
*> log_from
;
3020 for (auto& i
: sources
) {
3021 auto& source
= i
.second
;
3023 psdout(10) << __func__
<< " source " << i
.first
<< " missing" << dendl
;
3027 if (source
->info
.last_complete
!= source
->info
.last_update
||
3028 source
->info
.is_incomplete() ||
3029 source
->info
.dne()) {
3030 psdout(10) << __func__
<< " source " << source
->pg_whoami
3035 if (last_pg_merge_meta
.source_pgid
!= pg_t()) {
3036 if (source
->info
.pgid
.pgid
!= last_pg_merge_meta
.source_pgid
) {
3037 dout(10) << __func__
<< " source " << source
->info
.pgid
.pgid
3038 << " doesn't match expected source pgid "
3039 << last_pg_merge_meta
.source_pgid
<< dendl
;
3042 if (source
->info
.last_update
!= last_pg_merge_meta
.source_version
) {
3043 dout(10) << __func__
<< " source version doesn't match expected "
3044 << last_pg_merge_meta
.target_version
<< dendl
;
3050 PGLog::LogEntryHandlerRef handler
{
3051 source
->pl
->get_log_handler(rctx
.transaction
)};
3052 source
->pg_log
.roll_forward(handler
.get());
3053 source
->info
.last_complete
= source
->info
.last_update
; // to fake out trim()
3054 source
->pg_log
.reset_recovery_pointers();
3055 source
->pg_log
.trim(source
->info
.last_update
, source
->info
);
3056 log_from
.push_back(&source
->pg_log
);
3059 info
.stats
.add(source
->info
.stats
);
3061 // pull up last_update
3062 info
.last_update
= std::max(info
.last_update
, source
->info
.last_update
);
3064 // adopt source's PastIntervals if target has none. we can do this since
3065 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3066 // the PGs are identical.
3067 if (past_intervals
.empty() && !source
->past_intervals
.empty()) {
3068 psdout(10) << __func__
<< " taking source's past_intervals" << dendl
;
3069 past_intervals
= source
->past_intervals
;
3073 info
.last_complete
= info
.last_update
;
3074 info
.log_tail
= info
.last_update
;
3076 info
.last_backfill
= hobject_t();
3080 pg_log
.merge_from(log_from
, info
.last_update
);
3082 // make sure we have a meaningful last_epoch_started/clean (if we were a
3084 if (info
.history
.epoch_created
== 0) {
3085 // start with (a) source's history, since these PGs *should* have been
3086 // remapped in concert with each other...
3087 info
.history
= sources
.begin()->second
->info
.history
;
3089 // we use the last_epoch_{started,clean} we got from
3090 // the caller, which are the epochs that were reported by the PGs were
3091 // found to be ready for merge.
3092 info
.history
.last_epoch_clean
= last_pg_merge_meta
.last_epoch_clean
;
3093 info
.history
.last_epoch_started
= last_pg_merge_meta
.last_epoch_started
;
3094 info
.last_epoch_started
= last_pg_merge_meta
.last_epoch_started
;
3095 psdout(10) << __func__
3096 << " set les/c to " << last_pg_merge_meta
.last_epoch_started
<< "/"
3097 << last_pg_merge_meta
.last_epoch_clean
3098 << " from pool last_dec_*, source pg history was "
3099 << sources
.begin()->second
->info
.history
3102 // above we have pulled down source's history and we need to check
3103 // history.epoch_created again to confirm that source is not a placeholder
3104 // too. (peering requires a sane history.same_interval_since value for any
3105 // non-newly created pg and below here we know we are basically iterating
3106 // back a series of past maps to fake a merge process, hence we need to
3107 // fix history.same_interval_since first so that start_peering_interval()
3108 // will not complain)
3109 if (info
.history
.epoch_created
== 0) {
3110 dout(10) << __func__
<< " both merge target and source are placeholders,"
3111 << " set sis to lec " << info
.history
.last_epoch_clean
3113 info
.history
.same_interval_since
= info
.history
.last_epoch_clean
;
3116 // if the past_intervals start is later than last_epoch_clean, it
3117 // implies the source repeered again but the target didn't, or
3118 // that the source became clean in a later epoch than the target.
3119 // avoid the discrepancy but adjusting the interval start
3120 // backwards to match so that check_past_interval_bounds() will
3122 auto pib
= past_intervals
.get_bounds();
3123 if (info
.history
.last_epoch_clean
< pib
.first
) {
3124 psdout(10) << __func__
<< " last_epoch_clean "
3125 << info
.history
.last_epoch_clean
<< " < past_interval start "
3126 << pib
.first
<< ", adjusting start backwards" << dendl
;
3127 past_intervals
.adjust_start_backwards(info
.history
.last_epoch_clean
);
3130 // Similarly, if the same_interval_since value is later than
3131 // last_epoch_clean, the next interval change will result in a
3132 // past_interval start that is later than last_epoch_clean. This
3133 // can happen if we use the pg_history values from the merge
3134 // source. Adjust the same_interval_since value backwards if that
3135 // happens. (We trust the les and lec values more because they came from
3136 // the real target, whereas the history value we stole from the source.)
3137 if (info
.history
.last_epoch_started
< info
.history
.same_interval_since
) {
3138 psdout(10) << __func__
<< " last_epoch_started "
3139 << info
.history
.last_epoch_started
<< " < same_interval_since "
3140 << info
.history
.same_interval_since
3141 << ", adjusting pg_history backwards" << dendl
;
3142 info
.history
.same_interval_since
= info
.history
.last_epoch_clean
;
3143 // make sure same_{up,primary}_since are <= same_interval_since
3144 info
.history
.same_up_since
= std::min(
3145 info
.history
.same_up_since
, info
.history
.same_interval_since
);
3146 info
.history
.same_primary_since
= std::min(
3147 info
.history
.same_primary_since
, info
.history
.same_interval_since
);
3152 dirty_big_info
= true;
3155 void PeeringState::start_split_stats(
3156 const set
<spg_t
>& childpgs
, vector
<object_stat_sum_t
> *out
)
3158 out
->resize(childpgs
.size() + 1);
3159 info
.stats
.stats
.sum
.split(*out
);
3162 void PeeringState::finish_split_stats(
3163 const object_stat_sum_t
& stats
, ObjectStore::Transaction
&t
)
3165 info
.stats
.stats
.sum
= stats
;
3169 void PeeringState::update_blocked_by()
3171 // set a max on the number of blocking peers we report. if we go
3172 // over, report a random subset. keep the result sorted.
3173 unsigned keep
= std::min
<unsigned>(
3174 blocked_by
.size(), cct
->_conf
->osd_max_pg_blocked_by
);
3175 unsigned skip
= blocked_by
.size() - keep
;
3176 info
.stats
.blocked_by
.clear();
3177 info
.stats
.blocked_by
.resize(keep
);
3179 for (set
<int>::iterator p
= blocked_by
.begin();
3180 p
!= blocked_by
.end() && keep
> 0;
3182 if (skip
> 0 && (rand() % (skip
+ keep
) < skip
)) {
3185 info
.stats
.blocked_by
[pos
++] = *p
;
3191 static bool find_shard(const set
<pg_shard_t
> & pgs
, shard_id_t shard
)
3194 if (p
.shard
== shard
)
3199 static pg_shard_t
get_another_shard(const set
<pg_shard_t
> & pgs
, pg_shard_t skip
, shard_id_t shard
)
3201 for (auto&p
: pgs
) {
3204 if (p
.shard
== shard
)
3207 return pg_shard_t();
3210 void PeeringState::update_calc_stats()
3212 info
.stats
.version
= info
.last_update
;
3213 info
.stats
.created
= info
.history
.epoch_created
;
3214 info
.stats
.last_scrub
= info
.history
.last_scrub
;
3215 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
3216 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
3217 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
3218 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
3219 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
3221 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
3222 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
3223 info
.stats
.log_start
= pg_log
.get_tail();
3224 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
3225 info
.stats
.snaptrimq_len
= pl
->get_snap_trimq_size();
3227 unsigned num_shards
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
3229 // In rare case that upset is too large (usually transient), use as target
3230 // for calculations below.
3231 unsigned target
= std::max(num_shards
, (unsigned)upset
.size());
3232 // For undersized actingset may be larger with OSDs out
3233 unsigned nrep
= std::max(actingset
.size(), upset
.size());
3234 // calc num_object_copies
3235 info
.stats
.stats
.calc_copies(std::max(target
, nrep
));
3236 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
3237 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
3238 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
3239 info
.stats
.avail_no_missing
.clear();
3240 info
.stats
.object_location_counts
.clear();
3242 // We should never hit this condition, but if end up hitting it,
3243 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3244 if (info
.stats
.stats
.sum
.num_objects
< 0) {
3245 psdout(0) << __func__
<< " negative num_objects = "
3246 << info
.stats
.stats
.sum
.num_objects
<< " setting it to 0 "
3248 info
.stats
.stats
.sum
.num_objects
= 0;
3249 state_set(PG_STATE_INCONSISTENT
);
3252 if ((is_remapped() || is_undersized() || !is_clean()) &&
3253 (is_peered()|| is_activating())) {
3254 psdout(20) << __func__
<< " actingset " << actingset
<< " upset "
3255 << upset
<< " acting_recovery_backfill " << acting_recovery_backfill
<< dendl
;
3257 ceph_assert(!acting_recovery_backfill
.empty());
3259 bool estimate
= false;
3261 // NOTE: we only generate degraded, misplaced and unfound
3262 // values for the summation, not individual stat categories.
3263 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
3265 // Objects missing from up nodes, sorted by # objects.
3266 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> missing_target_objects
;
3267 // Objects missing from nodes not in up, sort by # objects
3268 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> acting_source_objects
;
3270 // Fill missing_target_objects/acting_source_objects
3276 missing
= pg_log
.get_missing().num_missing();
3277 ceph_assert(acting_recovery_backfill
.count(pg_whoami
));
3278 if (upset
.count(pg_whoami
)) {
3279 missing_target_objects
.emplace(missing
, pg_whoami
);
3281 acting_source_objects
.emplace(missing
, pg_whoami
);
3283 info
.stats
.stats
.sum
.num_objects_missing_on_primary
= missing
;
3285 info
.stats
.avail_no_missing
.push_back(pg_whoami
);
3286 psdout(20) << __func__
<< " shard " << pg_whoami
3287 << " primary objects " << num_objects
3288 << " missing " << missing
3293 for (auto& peer
: peer_info
) {
3294 // Primary should not be in the peer_info, skip if it is.
3295 if (peer
.first
== pg_whoami
) continue;
3296 int64_t missing
= 0;
3297 int64_t peer_num_objects
=
3298 std::max((int64_t)0, peer
.second
.stats
.stats
.sum
.num_objects
);
3299 // Backfill targets always track num_objects accurately
3300 // all other peers track missing accurately.
3301 if (is_backfill_target(peer
.first
)) {
3302 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3304 if (peer_missing
.count(peer
.first
)) {
3305 missing
= peer_missing
[peer
.first
].num_missing();
3307 psdout(20) << __func__
<< " no peer_missing found for "
3308 << peer
.first
<< dendl
;
3309 if (is_recovering()) {
3312 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3315 if (upset
.count(peer
.first
)) {
3316 missing_target_objects
.emplace(missing
, peer
.first
);
3317 } else if (actingset
.count(peer
.first
)) {
3318 acting_source_objects
.emplace(missing
, peer
.first
);
3320 peer
.second
.stats
.stats
.sum
.num_objects_missing
= missing
;
3322 info
.stats
.avail_no_missing
.push_back(peer
.first
);
3323 psdout(20) << __func__
<< " shard " << peer
.first
3324 << " objects " << peer_num_objects
3325 << " missing " << missing
3329 // Compute object_location_counts
3330 for (auto& ml
: missing_loc
.get_missing_locs()) {
3331 info
.stats
.object_location_counts
[ml
.second
]++;
3332 psdout(30) << __func__
<< " " << ml
.first
<< " object_location_counts["
3333 << ml
.second
<< "]=" << info
.stats
.object_location_counts
[ml
.second
]
3336 int64_t not_missing
= num_objects
- missing_loc
.get_missing_locs().size();
3338 // During recovery we know upset == actingset and is being populated
3339 // During backfill we know that all non-missing objects are in the actingset
3340 info
.stats
.object_location_counts
[actingset
] = not_missing
;
3342 psdout(30) << __func__
<< " object_location_counts["
3343 << upset
<< "]=" << info
.stats
.object_location_counts
[upset
]
3345 psdout(20) << __func__
<< " object_location_counts "
3346 << info
.stats
.object_location_counts
<< dendl
;
3348 // A misplaced object is not stored on the correct OSD
3349 int64_t misplaced
= 0;
3350 // a degraded objects has fewer replicas or EC shards than the pool specifies.
3351 int64_t degraded
= 0;
3353 if (is_recovering()) {
3354 for (auto& sml
: missing_loc
.get_missing_by_count()) {
3355 for (auto& ml
: sml
.second
) {
3357 if (sml
.first
== shard_id_t::NO_SHARD
) {
3358 psdout(20) << __func__
<< " ml " << ml
.second
3359 << " upset size " << upset
.size()
3360 << " up " << ml
.first
.up
<< dendl
;
3361 missing_shards
= (int)upset
.size() - ml
.first
.up
;
3363 // Handle shards not even in upset below
3364 if (!find_shard(upset
, sml
.first
))
3366 missing_shards
= std::max(0, 1 - ml
.first
.up
);
3367 psdout(20) << __func__
3368 << " shard " << sml
.first
3369 << " ml " << ml
.second
3370 << " missing shards " << missing_shards
<< dendl
;
3372 int odegraded
= ml
.second
* missing_shards
;
3373 // Copies on other osds but limited to the possible degraded
3374 int more_osds
= std::min(missing_shards
, ml
.first
.other
);
3375 int omisplaced
= ml
.second
* more_osds
;
3376 ceph_assert(omisplaced
<= odegraded
);
3377 odegraded
-= omisplaced
;
3379 misplaced
+= omisplaced
;
3380 degraded
+= odegraded
;
3384 psdout(20) << __func__
<< " missing based degraded "
3385 << degraded
<< dendl
;
3386 psdout(20) << __func__
<< " missing based misplaced "
3387 << misplaced
<< dendl
;
3389 // Handle undersized case
3390 if (pool
.info
.is_replicated()) {
3391 // Add degraded for missing targets (num_objects missing)
3392 ceph_assert(target
>= upset
.size());
3393 unsigned needed
= target
- upset
.size();
3394 degraded
+= num_objects
* needed
;
3396 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
3397 shard_id_t
shard(i
);
3399 if (!find_shard(upset
, shard
)) {
3400 pg_shard_t pgs
= get_another_shard(actingset
, pg_shard_t(), shard
);
3402 if (pgs
!= pg_shard_t()) {
3405 if (pgs
== pg_whoami
)
3406 missing
= info
.stats
.stats
.sum
.num_objects_missing_on_primary
;
3408 missing
= peer_info
[pgs
].stats
.stats
.sum
.num_objects_missing
;
3410 degraded
+= missing
;
3411 misplaced
+= std::max((int64_t)0, num_objects
- missing
);
3413 // No shard anywhere
3414 degraded
+= num_objects
;
3422 // Handle undersized case
3423 if (pool
.info
.is_replicated()) {
3424 // Add to missing_target_objects
3425 ceph_assert(target
>= missing_target_objects
.size());
3426 unsigned needed
= target
- missing_target_objects
.size();
3428 missing_target_objects
.emplace(num_objects
* needed
, pg_shard_t(pg_shard_t::NO_OSD
));
3430 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
3431 shard_id_t
shard(i
);
3433 for (const auto& t
: missing_target_objects
) {
3434 if (std::get
<1>(t
).shard
== shard
) {
3440 missing_target_objects
.emplace(num_objects
, pg_shard_t(pg_shard_t::NO_OSD
,shard
));
3444 for (const auto& item
: missing_target_objects
)
3445 psdout(20) << __func__
<< " missing shard " << std::get
<1>(item
)
3446 << " missing= " << std::get
<0>(item
) << dendl
;
3447 for (const auto& item
: acting_source_objects
)
3448 psdout(20) << __func__
<< " acting shard " << std::get
<1>(item
)
3449 << " missing= " << std::get
<0>(item
) << dendl
;
3451 // Handle all objects not in missing for remapped
3453 for (auto m
= missing_target_objects
.rbegin();
3454 m
!= missing_target_objects
.rend(); ++m
) {
3456 int64_t extra_missing
= -1;
3458 if (pool
.info
.is_replicated()) {
3459 if (!acting_source_objects
.empty()) {
3460 auto extra_copy
= acting_source_objects
.begin();
3461 extra_missing
= std::get
<0>(*extra_copy
);
3462 acting_source_objects
.erase(extra_copy
);
3464 } else { // Erasure coded
3465 // Use corresponding shard
3466 for (const auto& a
: acting_source_objects
) {
3467 if (std::get
<1>(a
).shard
== std::get
<1>(*m
).shard
) {
3468 extra_missing
= std::get
<0>(a
);
3469 acting_source_objects
.erase(a
);
3475 if (extra_missing
>= 0 && std::get
<0>(*m
) >= extra_missing
) {
3476 // We don't know which of the objects on the target
3477 // are part of extra_missing so assume are all degraded.
3478 misplaced
+= std::get
<0>(*m
) - extra_missing
;
3479 degraded
+= extra_missing
;
3481 // 1. extra_missing == -1, more targets than sources so degraded
3482 // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
3483 // previously degraded are now present on the target.
3484 degraded
+= std::get
<0>(*m
);
3487 // If there are still acting that haven't been accounted for
3488 // then they are misplaced
3489 for (const auto& a
: acting_source_objects
) {
3490 int64_t extra_misplaced
= std::max((int64_t)0, num_objects
- std::get
<0>(a
));
3491 psdout(20) << __func__
<< " extra acting misplaced " << extra_misplaced
3493 misplaced
+= extra_misplaced
;
3496 // NOTE: Tests use these messages to verify this code
3497 psdout(20) << __func__
<< " degraded " << degraded
3498 << (estimate
? " (est)": "") << dendl
;
3499 psdout(20) << __func__
<< " misplaced " << misplaced
3500 << (estimate
? " (est)": "")<< dendl
;
3502 info
.stats
.stats
.sum
.num_objects_degraded
= degraded
;
3503 info
.stats
.stats
.sum
.num_objects_unfound
= get_num_unfound();
3504 info
.stats
.stats
.sum
.num_objects_misplaced
= misplaced
;
3508 std::optional
<pg_stat_t
> PeeringState::prepare_stats_for_publish(
3509 bool pg_stats_publish_valid
,
3510 const pg_stat_t
&pg_stats_publish
,
3511 const object_stat_collection_t
&unstable_stats
)
3513 if (info
.stats
.stats
.sum
.num_scrub_errors
) {
3514 state_set(PG_STATE_INCONSISTENT
);
3516 state_clear(PG_STATE_INCONSISTENT
);
3517 state_clear(PG_STATE_FAILED_REPAIR
);
3520 utime_t now
= ceph_clock_now();
3521 if (info
.stats
.state
!= state
) {
3522 info
.stats
.last_change
= now
;
3523 // Optimistic estimation, if we just find out an inactive PG,
3524 // assumt it is active till now.
3525 if (!(state
& PG_STATE_ACTIVE
) &&
3526 (info
.stats
.state
& PG_STATE_ACTIVE
))
3527 info
.stats
.last_active
= now
;
3529 if ((state
& PG_STATE_ACTIVE
) &&
3530 !(info
.stats
.state
& PG_STATE_ACTIVE
))
3531 info
.stats
.last_became_active
= now
;
3532 if ((state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)) &&
3533 !(info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)))
3534 info
.stats
.last_became_peered
= now
;
3535 info
.stats
.state
= state
;
3538 update_calc_stats();
3539 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
3540 state_set(PG_STATE_DEGRADED
);
3542 state_clear(PG_STATE_DEGRADED
);
3544 update_blocked_by();
3546 pg_stat_t pre_publish
= info
.stats
;
3547 pre_publish
.stats
.add(unstable_stats
);
3548 utime_t cutoff
= now
;
3549 cutoff
-= cct
->_conf
->osd_pg_stat_report_interval_max
;
3551 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3552 // because we don't want to make the pg_stat_t structures too expensive.
3553 unsigned max
= cct
->_conf
->osd_max_snap_prune_intervals_per_epoch
;
3555 auto i
= info
.purged_snaps
.begin();
3556 while (num
< max
&& i
!= info
.purged_snaps
.end()) {
3557 pre_publish
.purged_snaps
.insert(i
.get_start(), i
.get_len());
3561 psdout(20) << __func__
<< " reporting purged_snaps "
3562 << pre_publish
.purged_snaps
<< dendl
;
3564 if (pg_stats_publish_valid
&& pre_publish
== pg_stats_publish
&&
3565 info
.stats
.last_fresh
> cutoff
) {
3566 psdout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3567 << ": no change since " << info
.stats
.last_fresh
<< dendl
;
3568 return std::nullopt
;
3570 // update our stat summary and timestamps
3571 info
.stats
.reported_epoch
= get_osdmap_epoch();
3572 ++info
.stats
.reported_seq
;
3574 info
.stats
.last_fresh
= now
;
3576 if (info
.stats
.state
& PG_STATE_CLEAN
)
3577 info
.stats
.last_clean
= now
;
3578 if (info
.stats
.state
& PG_STATE_ACTIVE
)
3579 info
.stats
.last_active
= now
;
3580 if (info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
))
3581 info
.stats
.last_peered
= now
;
3582 info
.stats
.last_unstale
= now
;
3583 if ((info
.stats
.state
& PG_STATE_DEGRADED
) == 0)
3584 info
.stats
.last_undegraded
= now
;
3585 if ((info
.stats
.state
& PG_STATE_UNDERSIZED
) == 0)
3586 info
.stats
.last_fullsized
= now
;
3588 psdout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3589 << ":" << pg_stats_publish
.reported_seq
<< dendl
;
3590 return std::make_optional(std::move(pre_publish
));
3594 void PeeringState::init(
3596 const vector
<int>& newup
, int new_up_primary
,
3597 const vector
<int>& newacting
, int new_acting_primary
,
3598 const pg_history_t
& history
,
3599 const PastIntervals
& pi
,
3601 ObjectStore::Transaction
&t
)
3603 psdout(10) << "init role " << role
<< " up "
3604 << newup
<< " acting " << newacting
3605 << " history " << history
3606 << " past_intervals " << pi
3610 init_primary_up_acting(
3614 new_acting_primary
);
3616 info
.history
= history
;
3617 past_intervals
= pi
;
3620 info
.stats
.up_primary
= new_up_primary
;
3621 info
.stats
.acting
= acting
;
3622 info
.stats
.acting_primary
= new_acting_primary
;
3623 info
.stats
.mapping_epoch
= info
.history
.same_interval_since
;
3625 if (!perform_deletes_during_peering()) {
3626 pg_log
.set_missing_may_contain_deletes();
3630 psdout(10) << __func__
<< ": Setting backfill" << dendl
;
3631 info
.set_last_backfill(hobject_t());
3632 info
.last_complete
= info
.last_update
;
3633 pg_log
.mark_log_for_rewrite();
3639 dirty_big_info
= true;
3643 void PeeringState::dump_peering_state(Formatter
*f
)
3645 f
->dump_string("state", get_pg_state_string());
3646 f
->dump_unsigned("epoch", get_osdmap_epoch());
3647 f
->open_array_section("up");
3648 for (vector
<int>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
3649 f
->dump_unsigned("osd", *p
);
3651 f
->open_array_section("acting");
3652 for (vector
<int>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
3653 f
->dump_unsigned("osd", *p
);
3655 if (!backfill_targets
.empty()) {
3656 f
->open_array_section("backfill_targets");
3657 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
3658 p
!= backfill_targets
.end();
3660 f
->dump_stream("shard") << *p
;
3663 if (!async_recovery_targets
.empty()) {
3664 f
->open_array_section("async_recovery_targets");
3665 for (set
<pg_shard_t
>::iterator p
= async_recovery_targets
.begin();
3666 p
!= async_recovery_targets
.end();
3668 f
->dump_stream("shard") << *p
;
3671 if (!acting_recovery_backfill
.empty()) {
3672 f
->open_array_section("acting_recovery_backfill");
3673 for (set
<pg_shard_t
>::iterator p
= acting_recovery_backfill
.begin();
3674 p
!= acting_recovery_backfill
.end();
3676 f
->dump_stream("shard") << *p
;
3679 f
->open_object_section("info");
3680 update_calc_stats();
3684 f
->open_array_section("peer_info");
3685 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= peer_info
.begin();
3686 p
!= peer_info
.end();
3688 f
->open_object_section("info");
3689 f
->dump_stream("peer") << p
->first
;
3695 void PeeringState::update_stats(
3696 std::function
<bool(pg_history_t
&, pg_stat_t
&)> f
,
3697 ObjectStore::Transaction
*t
) {
3698 if (f(info
.history
, info
.stats
)) {
3699 pl
->publish_stats_to_osd();
3701 pl
->on_info_history_change();
3709 bool PeeringState::append_log_entries_update_missing(
3710 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
3711 ObjectStore::Transaction
&t
, std::optional
<eversion_t
> trim_to
,
3712 std::optional
<eversion_t
> roll_forward_to
)
3714 ceph_assert(!entries
.empty());
3715 ceph_assert(entries
.begin()->version
> info
.last_update
);
3717 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
3718 bool invalidate_stats
=
3719 pg_log
.append_new_log_entries(
3724 if (roll_forward_to
&& entries
.rbegin()->soid
> info
.last_backfill
) {
3725 pg_log
.roll_forward(rollbacker
.get());
3727 if (roll_forward_to
&& *roll_forward_to
> pg_log
.get_can_rollback_to()) {
3728 pg_log
.roll_forward_to(*roll_forward_to
, rollbacker
.get());
3729 last_rollback_info_trimmed_to_applied
= *roll_forward_to
;
3732 info
.last_update
= pg_log
.get_head();
3734 if (pg_log
.get_missing().num_missing() == 0) {
3735 // advance last_complete since nothing else is missing!
3736 info
.last_complete
= info
.last_update
;
3738 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
3740 psdout(20) << __func__
<< " trim_to bool = " << bool(trim_to
)
3741 << " trim_to = " << (trim_to
? *trim_to
: eversion_t()) << dendl
;
3743 pg_log
.trim(*trim_to
, info
);
3746 return invalidate_stats
;
3749 void PeeringState::merge_new_log_entries(
3750 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
3751 ObjectStore::Transaction
&t
,
3752 std::optional
<eversion_t
> trim_to
,
3753 std::optional
<eversion_t
> roll_forward_to
)
3755 psdout(10) << __func__
<< " " << entries
<< dendl
;
3756 ceph_assert(is_primary());
3758 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
, trim_to
, roll_forward_to
);
3759 for (set
<pg_shard_t
>::const_iterator i
= acting_recovery_backfill
.begin();
3760 i
!= acting_recovery_backfill
.end();
3762 pg_shard_t
peer(*i
);
3763 if (peer
== pg_whoami
) continue;
3764 ceph_assert(peer_missing
.count(peer
));
3765 ceph_assert(peer_info
.count(peer
));
3766 pg_missing_t
& pmissing(peer_missing
[peer
]);
3767 psdout(20) << __func__
<< " peer_missing for " << peer
3768 << " = " << pmissing
<< dendl
;
3769 pg_info_t
& pinfo(peer_info
[peer
]);
3770 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
3771 pinfo
.last_backfill
,
3778 pinfo
.last_update
= info
.last_update
;
3779 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
3780 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
3783 if (!rebuild_missing
) {
3787 for (auto &&i
: entries
) {
3788 missing_loc
.rebuild(
3791 acting_recovery_backfill
,
3793 pg_log
.get_missing(),
3799 void PeeringState::add_log_entry(const pg_log_entry_t
& e
, bool applied
)
3801 // raise last_complete only if we were previously up to date
3802 if (info
.last_complete
== info
.last_update
)
3803 info
.last_complete
= e
.version
;
3805 // raise last_update.
3806 ceph_assert(e
.version
> info
.last_update
);
3807 info
.last_update
= e
.version
;
3809 // raise user_version, if it increased (it may have not get bumped
3810 // by all logged updates)
3811 if (e
.user_version
> info
.last_user_version
)
3812 info
.last_user_version
= e
.user_version
;
3815 pg_log
.add(e
, applied
);
3816 psdout(10) << "add_log_entry " << e
<< dendl
;
3820 void PeeringState::append_log(
3821 const vector
<pg_log_entry_t
>& logv
,
3823 eversion_t roll_forward_to
,
3825 ObjectStore::Transaction
&t
,
3826 bool transaction_applied
,
3829 /* The primary has sent an info updating the history, but it may not
3830 * have arrived yet. We want to make sure that we cannot remember this
3831 * write without remembering that it happened in an interval which went
3832 * active in epoch history.last_epoch_started.
3834 if (info
.last_epoch_started
!= info
.history
.last_epoch_started
) {
3835 info
.history
.last_epoch_started
= info
.last_epoch_started
;
3837 if (info
.last_interval_started
!= info
.history
.last_interval_started
) {
3838 info
.history
.last_interval_started
= info
.last_interval_started
;
3840 psdout(10) << "append_log " << pg_log
.get_log() << " " << logv
<< dendl
;
3842 PGLog::LogEntryHandlerRef handler
{pl
->get_log_handler(t
)};
3843 if (!transaction_applied
) {
3844 /* We must be a backfill or async recovery peer, so it's ok if we apply
3845 * out-of-turn since we won't be considered when
3846 * determining a min possible last_update.
3848 * We skip_rollforward() here, which advances the crt, without
3849 * doing an actual rollforward. This avoids cleaning up entries
3850 * from the backend and we do not end up in a situation, where the
3851 * object is deleted before we can _merge_object_divergent_entries().
3853 pg_log
.skip_rollforward();
3856 for (vector
<pg_log_entry_t
>::const_iterator p
= logv
.begin();
3859 add_log_entry(*p
, transaction_applied
);
3861 /* We don't want to leave the rollforward artifacts around
3862 * here past last_backfill. It's ok for the same reason as
3864 if (transaction_applied
&&
3865 p
->soid
> info
.last_backfill
) {
3866 pg_log
.roll_forward(handler
.get());
3869 if (transaction_applied
&& roll_forward_to
> pg_log
.get_can_rollback_to()) {
3870 pg_log
.roll_forward_to(
3873 last_rollback_info_trimmed_to_applied
= roll_forward_to
;
3876 psdout(10) << __func__
<< " approx pg log length = "
3877 << pg_log
.get_log().approx_size() << dendl
;
3878 psdout(10) << __func__
<< " transaction_applied = "
3879 << transaction_applied
<< dendl
;
3880 if (!transaction_applied
|| async
)
3881 psdout(10) << __func__
<< " " << pg_whoami
3882 << " is async_recovery or backfill target" << dendl
;
3883 pg_log
.trim(trim_to
, info
, transaction_applied
, async
);
3885 // update the local pg, pg log
3890 min_last_complete_ondisk
= mlcod
;
3893 void PeeringState::recover_got(
3894 const hobject_t
&oid
, eversion_t v
,
3896 ObjectStore::Transaction
&t
)
3898 if (v
> pg_log
.get_can_rollback_to()) {
3899 /* This can only happen during a repair, and even then, it would
3900 * be one heck of a race. If we are repairing the object, the
3901 * write in question must be fully committed, so it's not valid
3902 * to roll it back anyway (and we'll be rolled forward shortly
3904 PGLog::LogEntryHandlerRef handler
{pl
->get_log_handler(t
)};
3905 pg_log
.roll_forward_to(v
, handler
.get());
3908 psdout(10) << "got missing " << oid
<< " v " << v
<< dendl
;
3909 pg_log
.recover_got(oid
, v
, info
);
3910 if (pg_log
.get_log().log
.empty()) {
3911 psdout(10) << "last_complete now " << info
.last_complete
3912 << " while log is empty" << dendl
;
3913 } else if (pg_log
.get_log().complete_to
!= pg_log
.get_log().log
.end()) {
3914 psdout(10) << "last_complete now " << info
.last_complete
3915 << " log.complete_to " << pg_log
.get_log().complete_to
->version
3918 psdout(10) << "last_complete now " << info
.last_complete
3919 << " log.complete_to at end" << dendl
;
3920 //below is not true in the repair case.
3921 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3922 ceph_assert(info
.last_complete
== info
.last_update
);
3926 ceph_assert(missing_loc
.needs_recovery(oid
));
3928 missing_loc
.add_location(oid
, pg_whoami
);
3936 void PeeringState::update_backfill_progress(
3937 const hobject_t
&updated_backfill
,
3938 const pg_stat_t
&updated_stats
,
3939 bool preserve_local_num_bytes
,
3940 ObjectStore::Transaction
&t
) {
3941 info
.set_last_backfill(updated_backfill
);
3942 if (preserve_local_num_bytes
) {
3943 psdout(25) << __func__
<< " primary " << updated_stats
.stats
.sum
.num_bytes
3944 << " local " << info
.stats
.stats
.sum
.num_bytes
<< dendl
;
3945 int64_t bytes
= info
.stats
.stats
.sum
.num_bytes
;
3946 info
.stats
= updated_stats
;
3947 info
.stats
.stats
.sum
.num_bytes
= bytes
;
3949 psdout(20) << __func__
<< " final " << updated_stats
.stats
.sum
.num_bytes
3950 << " replaces local " << info
.stats
.stats
.sum
.num_bytes
<< dendl
;
3951 info
.stats
= updated_stats
;
3958 void PeeringState::adjust_purged_snaps(
3959 std::function
<void(interval_set
<snapid_t
> &snaps
)> f
) {
3960 f(info
.purged_snaps
);
3962 dirty_big_info
= true;
3965 void PeeringState::on_peer_recover(
3967 const hobject_t
&soid
,
3968 const eversion_t
&version
)
3970 pl
->publish_stats_to_osd();
3972 peer_missing
[peer
].got(soid
, version
);
3973 missing_loc
.add_location(soid
, peer
);
3976 void PeeringState::begin_peer_recover(
3978 const hobject_t soid
)
3980 peer_missing
[peer
].revise_have(soid
, eversion_t());
3983 void PeeringState::force_object_missing(
3984 const set
<pg_shard_t
> &peers
,
3985 const hobject_t
&soid
,
3988 for (auto &&peer
: peers
) {
3989 if (peer
!= primary
) {
3990 peer_missing
[peer
].add(soid
, version
, eversion_t(), false);
3992 pg_log
.missing_add(soid
, version
, eversion_t());
3993 pg_log
.reset_complete_to(&info
);
3994 pg_log
.set_last_requested(0);
3998 missing_loc
.rebuild(
4001 acting_recovery_backfill
,
4003 pg_log
.get_missing(),
4008 void PeeringState::pre_submit_op(
4009 const hobject_t
&hoid
,
4010 const vector
<pg_log_entry_t
>& logv
,
4011 eversion_t at_version
)
4013 if (at_version
> eversion_t()) {
4014 for (auto &&i
: get_acting_recovery_backfill()) {
4015 if (i
== primary
) continue;
4016 pg_info_t
&pinfo
= peer_info
[i
];
4017 // keep peer_info up to date
4018 if (pinfo
.last_complete
== pinfo
.last_update
)
4019 pinfo
.last_complete
= at_version
;
4020 pinfo
.last_update
= at_version
;
4024 bool requires_missing_loc
= false;
4025 for (auto &&i
: get_async_recovery_targets()) {
4026 if (i
== primary
|| !get_peer_missing(i
).is_missing(hoid
))
4028 requires_missing_loc
= true;
4029 for (auto &&entry
: logv
) {
4030 peer_missing
[i
].add_next_event(entry
);
4034 if (requires_missing_loc
) {
4035 for (auto &&entry
: logv
) {
4036 psdout(30) << __func__
<< " missing_loc before: "
4037 << missing_loc
.get_locations(entry
.soid
) << dendl
;
4038 missing_loc
.add_missing(entry
.soid
, entry
.version
,
4039 eversion_t(), entry
.is_delete());
4040 // clear out missing_loc
4041 missing_loc
.clear_location(entry
.soid
);
4042 for (auto &i
: get_actingset()) {
4043 if (!get_peer_missing(i
).is_missing(entry
.soid
))
4044 missing_loc
.add_location(entry
.soid
, i
);
4046 psdout(30) << __func__
<< " missing_loc after: "
4047 << missing_loc
.get_locations(entry
.soid
) << dendl
;
4052 void PeeringState::recovery_committed_to(eversion_t version
)
4054 psdout(10) << __func__
<< " version " << version
4055 << " now ondisk" << dendl
;
4056 last_complete_ondisk
= version
;
4058 if (last_complete_ondisk
== info
.last_update
) {
4059 if (!is_primary()) {
4060 // Either we are a replica or backfill target.
4061 // we are fully up to date. tell the primary!
4062 pl
->send_cluster_message(
4066 spg_t(info
.pgid
.pgid
, primary
.shard
),
4067 last_complete_ondisk
),
4068 get_osdmap_epoch());
4070 calc_min_last_complete_ondisk();
4075 void PeeringState::complete_write(eversion_t v
, eversion_t lc
)
4077 last_update_ondisk
= v
;
4078 last_complete_ondisk
= lc
;
4079 calc_min_last_complete_ondisk();
4082 void PeeringState::calc_trim_to()
4084 size_t target
= pl
->get_target_pg_log_entries();
4086 eversion_t limit
= std::min(
4087 min_last_complete_ondisk
,
4088 pg_log
.get_can_rollback_to());
4089 if (limit
!= eversion_t() &&
4090 limit
!= pg_trim_to
&&
4091 pg_log
.get_log().approx_size() > target
) {
4092 size_t num_to_trim
= std::min(pg_log
.get_log().approx_size() - target
,
4093 cct
->_conf
->osd_pg_log_trim_max
);
4094 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
&&
4095 cct
->_conf
->osd_pg_log_trim_max
>= cct
->_conf
->osd_pg_log_trim_min
) {
4098 list
<pg_log_entry_t
>::const_iterator it
= pg_log
.get_log().log
.begin();
4099 eversion_t new_trim_to
;
4100 for (size_t i
= 0; i
< num_to_trim
; ++i
) {
4101 new_trim_to
= it
->version
;
4103 if (new_trim_to
> limit
) {
4104 new_trim_to
= limit
;
4105 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl
;
4109 psdout(10) << "calc_trim_to " << pg_trim_to
<< " -> " << new_trim_to
<< dendl
;
4110 pg_trim_to
= new_trim_to
;
4111 assert(pg_trim_to
<= pg_log
.get_head());
4112 assert(pg_trim_to
<= min_last_complete_ondisk
);
4116 void PeeringState::calc_trim_to_aggressive()
4118 size_t target
= pl
->get_target_pg_log_entries();
4120 // limit pg log trimming up to the can_rollback_to value
4121 eversion_t limit
= std::min({
4123 pg_log
.get_can_rollback_to(),
4124 last_update_ondisk
});
4125 psdout(10) << __func__
<< " limit = " << limit
<< dendl
;
4127 if (limit
!= eversion_t() &&
4128 limit
!= pg_trim_to
&&
4129 pg_log
.get_log().approx_size() > target
) {
4130 psdout(10) << __func__
<< " approx pg log length = "
4131 << pg_log
.get_log().approx_size() << dendl
;
4132 uint64_t num_to_trim
= std::min
<uint64_t>(pg_log
.get_log().approx_size() - target
,
4133 cct
->_conf
->osd_pg_log_trim_max
);
4134 psdout(10) << __func__
<< " num_to_trim = " << num_to_trim
<< dendl
;
4135 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
&&
4136 cct
->_conf
->osd_pg_log_trim_max
>= cct
->_conf
->osd_pg_log_trim_min
) {
4139 auto it
= pg_log
.get_log().log
.begin(); // oldest log entry
4140 auto rit
= pg_log
.get_log().log
.rbegin();
4141 eversion_t by_n_to_keep
; // start from tail
4142 eversion_t by_n_to_trim
= eversion_t::max(); // start from head
4143 for (size_t i
= 0; it
!= pg_log
.get_log().log
.end(); ++it
, ++rit
) {
4145 if (i
> target
&& by_n_to_keep
== eversion_t()) {
4146 by_n_to_keep
= rit
->version
;
4148 if (i
>= num_to_trim
&& by_n_to_trim
== eversion_t::max()) {
4149 by_n_to_trim
= it
->version
;
4151 if (by_n_to_keep
!= eversion_t() &&
4152 by_n_to_trim
!= eversion_t::max()) {
4157 if (by_n_to_keep
== eversion_t()) {
4161 pg_trim_to
= std::min({by_n_to_keep
, by_n_to_trim
, limit
});
4162 psdout(10) << __func__
<< " pg_trim_to now " << pg_trim_to
<< dendl
;
4163 ceph_assert(pg_trim_to
<= pg_log
.get_head());
4167 void PeeringState::apply_op_stats(
4168 const hobject_t
&soid
,
4169 const object_stat_sum_t
&delta_stats
)
4171 info
.stats
.stats
.add(delta_stats
);
4172 info
.stats
.stats
.floor(0);
4174 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
4175 i
!= get_backfill_targets().end();
4178 pg_info_t
& pinfo
= peer_info
[bt
];
4179 if (soid
<= pinfo
.last_backfill
)
4180 pinfo
.stats
.stats
.add(delta_stats
);
4184 void PeeringState::update_complete_backfill_object_stats(
4185 const hobject_t
&hoid
,
4186 const pg_stat_t
&stats
)
4188 for (auto &&bt
: get_backfill_targets()) {
4189 pg_info_t
& pinfo
= peer_info
[bt
];
4190 //Add stats to all peers that were missing object
4191 if (hoid
> pinfo
.last_backfill
)
4192 pinfo
.stats
.add(stats
);
4196 void PeeringState::update_peer_last_backfill(
4198 const hobject_t
&new_last_backfill
)
4200 pg_info_t
&pinfo
= peer_info
[peer
];
4201 pinfo
.last_backfill
= new_last_backfill
;
4202 if (new_last_backfill
.is_max()) {
4203 /* pinfo.stats might be wrong if we did log-based recovery on the
4204 * backfilled portion in addition to continuing backfill.
4206 pinfo
.stats
= info
.stats
;
4210 void PeeringState::set_revert_with_targets(
4211 const hobject_t
&soid
,
4212 const set
<pg_shard_t
> &good_peers
)
4214 for (auto &&peer
: good_peers
) {
4215 missing_loc
.add_location(soid
, peer
);
4219 void PeeringState::prepare_backfill_for_missing(
4220 const hobject_t
&soid
,
4221 const eversion_t
&version
,
4222 const vector
<pg_shard_t
> &targets
) {
4223 for (auto &&peer
: targets
) {
4224 peer_missing
[peer
].add(soid
, version
, eversion_t(), false);
4228 void PeeringState::update_hset(const pg_hit_set_history_t
&hset_history
)
4230 info
.hit_set
= hset_history
;
4233 /*------------ Peering State Machine----------------*/
4235 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4236 << "state<" << get_state_name() << ">: ")
4238 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4240 #define DECLARE_LOCALS \
4241 PeeringState *ps = context< PeeringMachine >().state; \
4243 PeeringListener *pl = context< PeeringMachine >().pl; \
4247 /*------Crashed-------*/
4248 PeeringState::Crashed::Crashed(my_context ctx
)
4250 NamedState(context
< PeeringMachine
>().state_history
, "Crashed")
4252 context
< PeeringMachine
>().log_enter(state_name
);
4253 ceph_abort_msg("we got a bad state machine event");
4257 /*------Initial-------*/
4258 PeeringState::Initial::Initial(my_context ctx
)
4260 NamedState(context
< PeeringMachine
>().state_history
, "Initial")
4262 context
< PeeringMachine
>().log_enter(state_name
);
4265 boost::statechart::result
PeeringState::Initial::react(const MNotifyRec
& notify
)
4268 ps
->proc_replica_info(
4269 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
4270 ps
->set_last_peering_reset();
4271 return transit
< Primary
>();
4274 boost::statechart::result
PeeringState::Initial::react(const MInfoRec
& i
)
4277 ceph_assert(!ps
->is_primary());
4279 return transit
< Stray
>();
4282 boost::statechart::result
PeeringState::Initial::react(const MLogRec
& i
)
4285 ceph_assert(!ps
->is_primary());
4287 return transit
< Stray
>();
4290 void PeeringState::Initial::exit()
4292 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4294 utime_t dur
= ceph_clock_now() - enter_time
;
4295 pl
->get_peering_perf().tinc(rs_initial_latency
, dur
);
4298 /*------Started-------*/
4299 PeeringState::Started::Started(my_context ctx
)
4301 NamedState(context
< PeeringMachine
>().state_history
, "Started")
4303 context
< PeeringMachine
>().log_enter(state_name
);
4306 boost::statechart::result
4307 PeeringState::Started::react(const IntervalFlush
&)
4309 psdout(10) << "Ending blocked outgoing recovery messages" << dendl
;
4310 context
< PeeringMachine
>().state
->end_block_outgoing();
4311 return discard_event();
4314 boost::statechart::result
PeeringState::Started::react(const AdvMap
& advmap
)
4317 psdout(10) << "Started advmap" << dendl
;
4318 ps
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
4319 if (ps
->should_restart_peering(
4321 advmap
.acting_primary
,
4326 psdout(10) << "should_restart_peering, transitioning to Reset"
4329 return transit
< Reset
>();
4331 ps
->remove_down_peer_info(advmap
.osdmap
);
4332 return discard_event();
4335 boost::statechart::result
PeeringState::Started::react(const QueryState
& q
)
4337 q
.f
->open_object_section("state");
4338 q
.f
->dump_string("name", state_name
);
4339 q
.f
->dump_stream("enter_time") << enter_time
;
4340 q
.f
->close_section();
4341 return discard_event();
4344 void PeeringState::Started::exit()
4346 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4348 utime_t dur
= ceph_clock_now() - enter_time
;
4349 pl
->get_peering_perf().tinc(rs_started_latency
, dur
);
4350 ps
->state_clear(PG_STATE_WAIT
| PG_STATE_LAGGY
);
4353 /*--------Reset---------*/
4354 PeeringState::Reset::Reset(my_context ctx
)
4356 NamedState(context
< PeeringMachine
>().state_history
, "Reset")
4358 context
< PeeringMachine
>().log_enter(state_name
);
4361 ps
->flushes_in_progress
= 0;
4362 ps
->set_last_peering_reset();
4363 ps
->log_weirdness();
4366 boost::statechart::result
4367 PeeringState::Reset::react(const IntervalFlush
&)
4369 psdout(10) << "Ending blocked outgoing recovery messages" << dendl
;
4370 context
< PeeringMachine
>().state
->end_block_outgoing();
4371 return discard_event();
4374 boost::statechart::result
PeeringState::Reset::react(const AdvMap
& advmap
)
4377 psdout(10) << "Reset advmap" << dendl
;
4379 ps
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
4381 if (ps
->should_restart_peering(
4383 advmap
.acting_primary
,
4388 psdout(10) << "should restart peering, calling start_peering_interval again"
4390 ps
->start_peering_interval(
4392 advmap
.newup
, advmap
.up_primary
,
4393 advmap
.newacting
, advmap
.acting_primary
,
4394 context
< PeeringMachine
>().get_cur_transaction());
4396 ps
->remove_down_peer_info(advmap
.osdmap
);
4397 ps
->check_past_interval_bounds();
4398 return discard_event();
4401 boost::statechart::result
PeeringState::Reset::react(const ActMap
&)
4404 if (ps
->should_send_notify() && ps
->get_primary().osd
>= 0) {
4405 ps
->info
.history
.refresh_prior_readable_until_ub(
4407 ps
->prior_readable_until_ub
);
4408 context
< PeeringMachine
>().send_notify(
4409 ps
->get_primary().osd
,
4411 ps
->get_primary().shard
, ps
->pg_whoami
.shard
,
4412 ps
->get_osdmap_epoch(),
4413 ps
->get_osdmap_epoch(),
4415 ps
->past_intervals
));
4418 ps
->update_heartbeat_peers();
4420 return transit
< Started
>();
4423 boost::statechart::result
PeeringState::Reset::react(const QueryState
& q
)
4425 q
.f
->open_object_section("state");
4426 q
.f
->dump_string("name", state_name
);
4427 q
.f
->dump_stream("enter_time") << enter_time
;
4428 q
.f
->close_section();
4429 return discard_event();
4432 void PeeringState::Reset::exit()
4434 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4436 utime_t dur
= ceph_clock_now() - enter_time
;
4437 pl
->get_peering_perf().tinc(rs_reset_latency
, dur
);
4440 /*-------Start---------*/
4441 PeeringState::Start::Start(my_context ctx
)
4443 NamedState(context
< PeeringMachine
>().state_history
, "Start")
4445 context
< PeeringMachine
>().log_enter(state_name
);
4448 if (ps
->is_primary()) {
4449 psdout(1) << "transitioning to Primary" << dendl
;
4450 post_event(MakePrimary());
4452 psdout(1) << "transitioning to Stray" << dendl
;
4453 post_event(MakeStray());
4457 void PeeringState::Start::exit()
4459 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4461 utime_t dur
= ceph_clock_now() - enter_time
;
4462 pl
->get_peering_perf().tinc(rs_start_latency
, dur
);
4465 /*---------Primary--------*/
4466 PeeringState::Primary::Primary(my_context ctx
)
4468 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary")
4470 context
< PeeringMachine
>().log_enter(state_name
);
4472 ceph_assert(ps
->want_acting
.empty());
4474 // set CREATING bit until we have peered for the first time.
4475 if (ps
->info
.history
.last_epoch_started
== 0) {
4476 ps
->state_set(PG_STATE_CREATING
);
4477 // use the history timestamp, which ultimately comes from the
4478 // monitor in the create case.
4479 utime_t t
= ps
->info
.history
.last_scrub_stamp
;
4480 ps
->info
.stats
.last_fresh
= t
;
4481 ps
->info
.stats
.last_active
= t
;
4482 ps
->info
.stats
.last_change
= t
;
4483 ps
->info
.stats
.last_peered
= t
;
4484 ps
->info
.stats
.last_clean
= t
;
4485 ps
->info
.stats
.last_unstale
= t
;
4486 ps
->info
.stats
.last_undegraded
= t
;
4487 ps
->info
.stats
.last_fullsized
= t
;
4488 ps
->info
.stats
.last_scrub_stamp
= t
;
4489 ps
->info
.stats
.last_deep_scrub_stamp
= t
;
4490 ps
->info
.stats
.last_clean_scrub_stamp
= t
;
4494 boost::statechart::result
PeeringState::Primary::react(const MNotifyRec
& notevt
)
4497 psdout(7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
4498 ps
->proc_replica_info(
4499 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
4500 return discard_event();
4503 boost::statechart::result
PeeringState::Primary::react(const ActMap
&)
4506 psdout(7) << "handle ActMap primary" << dendl
;
4507 pl
->publish_stats_to_osd();
4508 return discard_event();
4511 boost::statechart::result
PeeringState::Primary::react(
4512 const SetForceRecovery
&)
4515 ps
->set_force_recovery(true);
4516 return discard_event();
4519 boost::statechart::result
PeeringState::Primary::react(
4520 const UnsetForceRecovery
&)
4523 ps
->set_force_recovery(false);
4524 return discard_event();
4527 boost::statechart::result
PeeringState::Primary::react(
4528 const RequestScrub
& evt
)
4531 if (ps
->is_primary()) {
4532 pl
->scrub_requested(evt
.deep
, evt
.repair
);
4533 psdout(10) << "marking for scrub" << dendl
;
4535 return discard_event();
4538 boost::statechart::result
PeeringState::Primary::react(
4539 const SetForceBackfill
&)
4542 ps
->set_force_backfill(true);
4543 return discard_event();
4546 boost::statechart::result
PeeringState::Primary::react(
4547 const UnsetForceBackfill
&)
4550 ps
->set_force_backfill(false);
4551 return discard_event();
4554 void PeeringState::Primary::exit()
4556 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4558 ps
->want_acting
.clear();
4559 utime_t dur
= ceph_clock_now() - enter_time
;
4560 pl
->get_peering_perf().tinc(rs_primary_latency
, dur
);
4561 pl
->clear_primary_state();
4562 ps
->state_clear(PG_STATE_CREATING
);
4565 /*---------Peering--------*/
4566 PeeringState::Peering::Peering(my_context ctx
)
4568 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering"),
4569 history_les_bound(false)
4571 context
< PeeringMachine
>().log_enter(state_name
);
4574 ceph_assert(!ps
->is_peered());
4575 ceph_assert(!ps
->is_peering());
4576 ceph_assert(ps
->is_primary());
4577 ps
->state_set(PG_STATE_PEERING
);
4580 boost::statechart::result
PeeringState::Peering::react(const AdvMap
& advmap
)
4583 psdout(10) << "Peering advmap" << dendl
;
4584 if (prior_set
.affected_by_map(*(advmap
.osdmap
), ps
->dpp
)) {
4585 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl
;
4587 return transit
< Reset
>();
4590 ps
->adjust_need_up_thru(advmap
.osdmap
);
4591 ps
->check_prior_readable_down_osds(advmap
.osdmap
);
4593 return forward_event();
4596 boost::statechart::result
PeeringState::Peering::react(const QueryState
& q
)
4600 q
.f
->open_object_section("state");
4601 q
.f
->dump_string("name", state_name
);
4602 q
.f
->dump_stream("enter_time") << enter_time
;
4604 q
.f
->open_array_section("past_intervals");
4605 ps
->past_intervals
.dump(q
.f
);
4606 q
.f
->close_section();
4608 q
.f
->open_array_section("probing_osds");
4609 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
4610 p
!= prior_set
.probe
.end();
4612 q
.f
->dump_stream("osd") << *p
;
4613 q
.f
->close_section();
4615 if (prior_set
.pg_down
)
4616 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
4618 q
.f
->open_array_section("down_osds_we_would_probe");
4619 for (set
<int>::iterator p
= prior_set
.down
.begin();
4620 p
!= prior_set
.down
.end();
4622 q
.f
->dump_int("osd", *p
);
4623 q
.f
->close_section();
4625 q
.f
->open_array_section("peering_blocked_by");
4626 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
4627 p
!= prior_set
.blocked_by
.end();
4629 q
.f
->open_object_section("osd");
4630 q
.f
->dump_int("osd", p
->first
);
4631 q
.f
->dump_int("current_lost_at", p
->second
);
4632 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
4633 q
.f
->close_section();
4635 q
.f
->close_section();
4637 if (history_les_bound
) {
4638 q
.f
->open_array_section("peering_blocked_by_detail");
4639 q
.f
->open_object_section("item");
4640 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
4641 q
.f
->close_section();
4642 q
.f
->close_section();
4645 q
.f
->close_section();
4646 return forward_event();
4649 void PeeringState::Peering::exit()
4653 psdout(10) << "Leaving Peering" << dendl
;
4654 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4655 ps
->state_clear(PG_STATE_PEERING
);
4656 pl
->clear_probe_targets();
4658 utime_t dur
= ceph_clock_now() - enter_time
;
4659 pl
->get_peering_perf().tinc(rs_peering_latency
, dur
);
4663 /*------Backfilling-------*/
4664 PeeringState::Backfilling::Backfilling(my_context ctx
)
4666 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Backfilling")
4668 context
< PeeringMachine
>().log_enter(state_name
);
4672 ps
->backfill_reserved
= true;
4673 pl
->on_backfill_reserved();
4674 ps
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
4675 ps
->state_clear(PG_STATE_BACKFILL_WAIT
);
4676 ps
->state_set(PG_STATE_BACKFILLING
);
4677 pl
->publish_stats_to_osd();
4680 void PeeringState::Backfilling::backfill_release_reservations()
4683 pl
->cancel_local_background_io_reservation();
4684 for (set
<pg_shard_t
>::iterator it
= ps
->backfill_targets
.begin();
4685 it
!= ps
->backfill_targets
.end();
4687 ceph_assert(*it
!= ps
->pg_whoami
);
4688 pl
->send_cluster_message(
4690 new MBackfillReserve(
4691 MBackfillReserve::RELEASE
,
4692 spg_t(ps
->info
.pgid
.pgid
, it
->shard
),
4693 ps
->get_osdmap_epoch()),
4694 ps
->get_osdmap_epoch());
4698 void PeeringState::Backfilling::cancel_backfill()
4701 backfill_release_reservations();
4702 pl
->on_backfill_canceled();
4705 boost::statechart::result
4706 PeeringState::Backfilling::react(const Backfilled
&c
)
4708 backfill_release_reservations();
4709 return transit
<Recovered
>();
4712 boost::statechart::result
4713 PeeringState::Backfilling::react(const DeferBackfill
&c
)
4717 psdout(10) << "defer backfill, retry delay " << c
.delay
<< dendl
;
4718 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4719 ps
->state_clear(PG_STATE_BACKFILLING
);
4722 pl
->schedule_event_after(
4723 std::make_shared
<PGPeeringEvent
>(
4724 ps
->get_osdmap_epoch(),
4725 ps
->get_osdmap_epoch(),
4728 return transit
<NotBackfilling
>();
4731 boost::statechart::result
4732 PeeringState::Backfilling::react(const UnfoundBackfill
&c
)
4735 psdout(10) << "backfill has unfound, can't continue" << dendl
;
4736 ps
->state_set(PG_STATE_BACKFILL_UNFOUND
);
4737 ps
->state_clear(PG_STATE_BACKFILLING
);
4739 return transit
<NotBackfilling
>();
4742 boost::statechart::result
4743 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull
&)
4747 ps
->state_set(PG_STATE_BACKFILL_TOOFULL
);
4748 ps
->state_clear(PG_STATE_BACKFILLING
);
4751 pl
->schedule_event_after(
4752 std::make_shared
<PGPeeringEvent
>(
4753 ps
->get_osdmap_epoch(),
4754 ps
->get_osdmap_epoch(),
4756 ps
->cct
->_conf
->osd_backfill_retry_interval
);
4758 return transit
<NotBackfilling
>();
4761 boost::statechart::result
4762 PeeringState::Backfilling::react(const RemoteReservationRevoked
&)
4765 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4767 if (ps
->needs_backfill()) {
4768 return transit
<WaitLocalBackfillReserved
>();
4770 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4771 return discard_event();
4775 void PeeringState::Backfilling::exit()
4777 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4779 ps
->backfill_reserved
= false;
4780 ps
->state_clear(PG_STATE_BACKFILLING
);
4781 ps
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
4782 utime_t dur
= ceph_clock_now() - enter_time
;
4783 pl
->get_peering_perf().tinc(rs_backfilling_latency
, dur
);
4786 /*--WaitRemoteBackfillReserved--*/
4788 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
4790 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4791 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
4793 context
< PeeringMachine
>().log_enter(state_name
);
4796 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4797 pl
->publish_stats_to_osd();
4798 post_event(RemoteBackfillReserved());
4801 boost::statechart::result
4802 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
4806 int64_t num_bytes
= ps
->info
.stats
.stats
.sum
.num_bytes
;
4807 psdout(10) << __func__
<< " num_bytes " << num_bytes
<< dendl
;
4808 if (backfill_osd_it
!=
4809 context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
4810 // The primary never backfills itself
4811 ceph_assert(*backfill_osd_it
!= ps
->pg_whoami
);
4812 pl
->send_cluster_message(
4813 backfill_osd_it
->osd
,
4814 new MBackfillReserve(
4815 MBackfillReserve::REQUEST
,
4816 spg_t(context
< PeeringMachine
>().spgid
.pgid
, backfill_osd_it
->shard
),
4817 ps
->get_osdmap_epoch(),
4818 ps
->get_backfill_priority(),
4820 ps
->peer_bytes
[*backfill_osd_it
]),
4821 ps
->get_osdmap_epoch());
4824 ps
->peer_bytes
.clear();
4825 post_event(AllBackfillsReserved());
4827 return discard_event();
4830 void PeeringState::WaitRemoteBackfillReserved::exit()
4832 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4835 utime_t dur
= ceph_clock_now() - enter_time
;
4836 pl
->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency
, dur
);
4839 void PeeringState::WaitRemoteBackfillReserved::retry()
4842 pl
->cancel_local_background_io_reservation();
4844 // Send CANCEL to all previously acquired reservations
4845 set
<pg_shard_t
>::const_iterator it
, begin
, end
;
4846 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
4847 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
4848 ceph_assert(begin
!= end
);
4849 for (it
= begin
; it
!= backfill_osd_it
; ++it
) {
4850 // The primary never backfills itself
4851 ceph_assert(*it
!= ps
->pg_whoami
);
4852 pl
->send_cluster_message(
4854 new MBackfillReserve(
4855 MBackfillReserve::RELEASE
,
4856 spg_t(context
< PeeringMachine
>().spgid
.pgid
, it
->shard
),
4857 ps
->get_osdmap_epoch()),
4858 ps
->get_osdmap_epoch());
4861 ps
->state_clear(PG_STATE_BACKFILL_WAIT
);
4862 pl
->publish_stats_to_osd();
4864 pl
->schedule_event_after(
4865 std::make_shared
<PGPeeringEvent
>(
4866 ps
->get_osdmap_epoch(),
4867 ps
->get_osdmap_epoch(),
4869 ps
->cct
->_conf
->osd_backfill_retry_interval
);
4872 boost::statechart::result
4873 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull
&evt
)
4876 ps
->state_set(PG_STATE_BACKFILL_TOOFULL
);
4878 return transit
<NotBackfilling
>();
4881 boost::statechart::result
4882 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked
&evt
)
4885 return transit
<NotBackfilling
>();
4888 /*--WaitLocalBackfillReserved--*/
4889 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
4891 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitLocalBackfillReserved")
4893 context
< PeeringMachine
>().log_enter(state_name
);
4896 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4897 pl
->request_local_background_io_reservation(
4898 ps
->get_backfill_priority(),
4899 std::make_shared
<PGPeeringEvent
>(
4900 ps
->get_osdmap_epoch(),
4901 ps
->get_osdmap_epoch(),
4902 LocalBackfillReserved()),
4903 std::make_shared
<PGPeeringEvent
>(
4904 ps
->get_osdmap_epoch(),
4905 ps
->get_osdmap_epoch(),
4906 DeferBackfill(0.0)));
4907 pl
->publish_stats_to_osd();
4910 void PeeringState::WaitLocalBackfillReserved::exit()
4912 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4914 utime_t dur
= ceph_clock_now() - enter_time
;
4915 pl
->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency
, dur
);
4918 /*----NotBackfilling------*/
4919 PeeringState::NotBackfilling::NotBackfilling(my_context ctx
)
4921 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/NotBackfilling")
4923 context
< PeeringMachine
>().log_enter(state_name
);
4925 ps
->state_clear(PG_STATE_REPAIR
);
4926 pl
->publish_stats_to_osd();
4929 boost::statechart::result
4930 PeeringState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
4932 return discard_event();
4935 boost::statechart::result
4936 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull
&evt
)
4938 return discard_event();
4941 void PeeringState::NotBackfilling::exit()
4943 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4946 ps
->state_clear(PG_STATE_BACKFILL_UNFOUND
);
4947 utime_t dur
= ceph_clock_now() - enter_time
;
4948 pl
->get_peering_perf().tinc(rs_notbackfilling_latency
, dur
);
4951 /*----NotRecovering------*/
4952 PeeringState::NotRecovering::NotRecovering(my_context ctx
)
4954 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/NotRecovering")
4956 context
< PeeringMachine
>().log_enter(state_name
);
4958 ps
->state_clear(PG_STATE_REPAIR
);
4959 pl
->publish_stats_to_osd();
4962 void PeeringState::NotRecovering::exit()
4964 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4967 ps
->state_clear(PG_STATE_RECOVERY_UNFOUND
);
4968 utime_t dur
= ceph_clock_now() - enter_time
;
4969 pl
->get_peering_perf().tinc(rs_notrecovering_latency
, dur
);
4972 /*---RepNotRecovering----*/
4973 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx
)
4975 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepNotRecovering")
4977 context
< PeeringMachine
>().log_enter(state_name
);
4980 boost::statechart::result
4981 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation
&evt
)
4984 ps
->reject_reservation();
4985 post_event(RemoteReservationRejectedTooFull());
4986 return discard_event();
4989 void PeeringState::RepNotRecovering::exit()
4991 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4993 utime_t dur
= ceph_clock_now() - enter_time
;
4994 pl
->get_peering_perf().tinc(rs_repnotrecovering_latency
, dur
);
4997 /*---RepWaitRecoveryReserved--*/
4998 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
5000 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepWaitRecoveryReserved")
5002 context
< PeeringMachine
>().log_enter(state_name
);
5005 boost::statechart::result
5006 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
5009 pl
->send_cluster_message(
5011 new MRecoveryReserve(
5012 MRecoveryReserve::GRANT
,
5013 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5014 ps
->get_osdmap_epoch()),
5015 ps
->get_osdmap_epoch());
5016 return transit
<RepRecovering
>();
5019 boost::statechart::result
5020 PeeringState::RepWaitRecoveryReserved::react(
5021 const RemoteReservationCanceled
&evt
)
5024 pl
->unreserve_recovery_space();
5026 pl
->cancel_remote_recovery_reservation();
5027 return transit
<RepNotRecovering
>();
5030 void PeeringState::RepWaitRecoveryReserved::exit()
5032 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5034 utime_t dur
= ceph_clock_now() - enter_time
;
5035 pl
->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency
, dur
);
5038 /*-RepWaitBackfillReserved*/
5039 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
5041 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepWaitBackfillReserved")
5043 context
< PeeringMachine
>().log_enter(state_name
);
5046 boost::statechart::result
5047 PeeringState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
5052 if (!pl
->try_reserve_recovery_space(
5053 evt
.primary_num_bytes
, evt
.local_num_bytes
)) {
5054 post_event(RejectTooFullRemoteReservation());
5056 PGPeeringEventRef preempt
;
5057 if (HAVE_FEATURE(ps
->upacting_features
, RECOVERY_RESERVATION_2
)) {
5058 // older peers will interpret preemption as TOOFULL
5059 preempt
= std::make_shared
<PGPeeringEvent
>(
5060 pl
->get_osdmap_epoch(),
5061 pl
->get_osdmap_epoch(),
5062 RemoteBackfillPreempted());
5064 pl
->request_remote_recovery_reservation(
5066 std::make_shared
<PGPeeringEvent
>(
5067 pl
->get_osdmap_epoch(),
5068 pl
->get_osdmap_epoch(),
5069 RemoteBackfillReserved()),
5072 return transit
<RepWaitBackfillReserved
>();
5075 boost::statechart::result
5076 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio
&evt
)
5080 // fall back to a local reckoning of priority of primary doesn't pass one
5081 // (pre-mimic compat)
5082 int prio
= evt
.priority
? evt
.priority
: ps
->get_recovery_priority();
5084 PGPeeringEventRef preempt
;
5085 if (HAVE_FEATURE(ps
->upacting_features
, RECOVERY_RESERVATION_2
)) {
5086 // older peers can't handle this
5087 preempt
= std::make_shared
<PGPeeringEvent
>(
5088 ps
->get_osdmap_epoch(),
5089 ps
->get_osdmap_epoch(),
5090 RemoteRecoveryPreempted());
5093 pl
->request_remote_recovery_reservation(
5095 std::make_shared
<PGPeeringEvent
>(
5096 ps
->get_osdmap_epoch(),
5097 ps
->get_osdmap_epoch(),
5098 RemoteRecoveryReserved()),
5100 return transit
<RepWaitRecoveryReserved
>();
5103 void PeeringState::RepWaitBackfillReserved::exit()
5105 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5107 utime_t dur
= ceph_clock_now() - enter_time
;
5108 pl
->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency
, dur
);
5111 boost::statechart::result
5112 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
5117 pl
->send_cluster_message(
5119 new MBackfillReserve(
5120 MBackfillReserve::GRANT
,
5121 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5122 ps
->get_osdmap_epoch()),
5123 ps
->get_osdmap_epoch());
5124 return transit
<RepRecovering
>();
5127 boost::statechart::result
5128 PeeringState::RepWaitBackfillReserved::react(
5129 const RejectTooFullRemoteReservation
&evt
)
5132 ps
->reject_reservation();
5133 post_event(RemoteReservationRejectedTooFull());
5134 return discard_event();
5137 boost::statechart::result
5138 PeeringState::RepWaitBackfillReserved::react(
5139 const RemoteReservationRejectedTooFull
&evt
)
5142 pl
->unreserve_recovery_space();
5144 pl
->cancel_remote_recovery_reservation();
5145 return transit
<RepNotRecovering
>();
5148 boost::statechart::result
5149 PeeringState::RepWaitBackfillReserved::react(
5150 const RemoteReservationCanceled
&evt
)
5153 pl
->unreserve_recovery_space();
5155 pl
->cancel_remote_recovery_reservation();
5156 return transit
<RepNotRecovering
>();
5159 /*---RepRecovering-------*/
5160 PeeringState::RepRecovering::RepRecovering(my_context ctx
)
5162 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepRecovering")
5164 context
< PeeringMachine
>().log_enter(state_name
);
5167 boost::statechart::result
5168 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted
&)
5173 pl
->unreserve_recovery_space();
5174 pl
->send_cluster_message(
5176 new MRecoveryReserve(
5177 MRecoveryReserve::REVOKE
,
5178 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5179 ps
->get_osdmap_epoch()),
5180 ps
->get_osdmap_epoch());
5181 return discard_event();
5184 boost::statechart::result
5185 PeeringState::RepRecovering::react(const BackfillTooFull
&)
5190 pl
->unreserve_recovery_space();
5191 pl
->send_cluster_message(
5193 new MBackfillReserve(
5194 MBackfillReserve::REVOKE_TOOFULL
,
5195 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5196 ps
->get_osdmap_epoch()),
5197 ps
->get_osdmap_epoch());
5198 return discard_event();
5201 boost::statechart::result
5202 PeeringState::RepRecovering::react(const RemoteBackfillPreempted
&)
5207 pl
->unreserve_recovery_space();
5208 pl
->send_cluster_message(
5210 new MBackfillReserve(
5211 MBackfillReserve::REVOKE
,
5212 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5213 ps
->get_osdmap_epoch()),
5214 ps
->get_osdmap_epoch());
5215 return discard_event();
5218 void PeeringState::RepRecovering::exit()
5220 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5222 pl
->unreserve_recovery_space();
5224 pl
->cancel_remote_recovery_reservation();
5225 utime_t dur
= ceph_clock_now() - enter_time
;
5226 pl
->get_peering_perf().tinc(rs_reprecovering_latency
, dur
);
5229 /*------Activating--------*/
5230 PeeringState::Activating::Activating(my_context ctx
)
5232 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Activating")
5234 context
< PeeringMachine
>().log_enter(state_name
);
5237 void PeeringState::Activating::exit()
5239 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5241 utime_t dur
= ceph_clock_now() - enter_time
;
5242 pl
->get_peering_perf().tinc(rs_activating_latency
, dur
);
5245 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
5247 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitLocalRecoveryReserved")
5249 context
< PeeringMachine
>().log_enter(state_name
);
5252 // Make sure all nodes that part of the recovery aren't full
5253 if (!ps
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
5254 ps
->get_osdmap()->check_full(ps
->acting_recovery_backfill
)) {
5255 post_event(RecoveryTooFull());
5259 ps
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
5260 ps
->state_set(PG_STATE_RECOVERY_WAIT
);
5261 pl
->request_local_background_io_reservation(
5262 ps
->get_recovery_priority(),
5263 std::make_shared
<PGPeeringEvent
>(
5264 ps
->get_osdmap_epoch(),
5265 ps
->get_osdmap_epoch(),
5266 LocalRecoveryReserved()),
5267 std::make_shared
<PGPeeringEvent
>(
5268 ps
->get_osdmap_epoch(),
5269 ps
->get_osdmap_epoch(),
5270 DeferRecovery(0.0)));
5271 pl
->publish_stats_to_osd();
5274 boost::statechart::result
5275 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
5278 ps
->state_set(PG_STATE_RECOVERY_TOOFULL
);
5279 pl
->schedule_event_after(
5280 std::make_shared
<PGPeeringEvent
>(
5281 ps
->get_osdmap_epoch(),
5282 ps
->get_osdmap_epoch(),
5284 ps
->cct
->_conf
->osd_recovery_retry_interval
);
5285 return transit
<NotRecovering
>();
5288 void PeeringState::WaitLocalRecoveryReserved::exit()
5290 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5292 utime_t dur
= ceph_clock_now() - enter_time
;
5293 pl
->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
5296 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
5298 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5299 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
5301 context
< PeeringMachine
>().log_enter(state_name
);
5302 post_event(RemoteRecoveryReserved());
5305 boost::statechart::result
5306 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
5309 if (remote_recovery_reservation_it
!=
5310 context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
5311 ceph_assert(*remote_recovery_reservation_it
!= ps
->pg_whoami
);
5312 pl
->send_cluster_message(
5313 remote_recovery_reservation_it
->osd
,
5314 new MRecoveryReserve(
5315 MRecoveryReserve::REQUEST
,
5316 spg_t(context
< PeeringMachine
>().spgid
.pgid
,
5317 remote_recovery_reservation_it
->shard
),
5318 ps
->get_osdmap_epoch(),
5319 ps
->get_recovery_priority()),
5320 ps
->get_osdmap_epoch());
5321 ++remote_recovery_reservation_it
;
5323 post_event(AllRemotesReserved());
5325 return discard_event();
5328 void PeeringState::WaitRemoteRecoveryReserved::exit()
5330 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5332 utime_t dur
= ceph_clock_now() - enter_time
;
5333 pl
->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency
, dur
);
5336 PeeringState::Recovering::Recovering(my_context ctx
)
5338 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Recovering")
5340 context
< PeeringMachine
>().log_enter(state_name
);
5343 ps
->state_clear(PG_STATE_RECOVERY_WAIT
);
5344 ps
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
5345 ps
->state_set(PG_STATE_RECOVERING
);
5346 pl
->on_recovery_reserved();
5347 ceph_assert(!ps
->state_test(PG_STATE_ACTIVATING
));
5348 pl
->publish_stats_to_osd();
5351 void PeeringState::Recovering::release_reservations(bool cancel
)
5354 ceph_assert(cancel
|| !ps
->pg_log
.get_missing().have_missing());
5356 // release remote reservations
5357 for (set
<pg_shard_t
>::const_iterator i
=
5358 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
5359 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
5361 if (*i
== ps
->pg_whoami
) // skip myself
5363 pl
->send_cluster_message(
5365 new MRecoveryReserve(
5366 MRecoveryReserve::RELEASE
,
5367 spg_t(ps
->info
.pgid
.pgid
, i
->shard
),
5368 ps
->get_osdmap_epoch()),
5369 ps
->get_osdmap_epoch());
5373 boost::statechart::result
5374 PeeringState::Recovering::react(const AllReplicasRecovered
&evt
)
5377 ps
->state_clear(PG_STATE_FORCED_RECOVERY
);
5378 release_reservations();
5379 pl
->cancel_local_background_io_reservation();
5380 return transit
<Recovered
>();
5383 boost::statechart::result
5384 PeeringState::Recovering::react(const RequestBackfill
&evt
)
5388 release_reservations();
5390 ps
->state_clear(PG_STATE_FORCED_RECOVERY
);
5391 pl
->cancel_local_background_io_reservation();
5392 pl
->publish_stats_to_osd();
5393 // transit any async_recovery_targets back into acting
5394 // so pg won't have to stay undersized for long
5395 // as backfill might take a long time to complete..
5396 if (!ps
->async_recovery_targets
.empty()) {
5397 pg_shard_t auth_log_shard
;
5398 bool history_les_bound
= false;
5399 ps
->choose_acting(auth_log_shard
, true, &history_les_bound
);
5401 return transit
<WaitLocalBackfillReserved
>();
5404 boost::statechart::result
5405 PeeringState::Recovering::react(const DeferRecovery
&evt
)
5408 if (!ps
->state_test(PG_STATE_RECOVERING
)) {
5409 // we may have finished recovery and have an AllReplicasRecovered
5410 // event queued to move us to the next state.
5411 psdout(10) << "got defer recovery but not recovering" << dendl
;
5412 return discard_event();
5414 psdout(10) << "defer recovery, retry delay " << evt
.delay
<< dendl
;
5415 ps
->state_set(PG_STATE_RECOVERY_WAIT
);
5416 pl
->cancel_local_background_io_reservation();
5417 release_reservations(true);
5418 pl
->schedule_event_after(
5419 std::make_shared
<PGPeeringEvent
>(
5420 ps
->get_osdmap_epoch(),
5421 ps
->get_osdmap_epoch(),
5424 return transit
<NotRecovering
>();
5427 boost::statechart::result
5428 PeeringState::Recovering::react(const UnfoundRecovery
&evt
)
5431 psdout(10) << "recovery has unfound, can't continue" << dendl
;
5432 ps
->state_set(PG_STATE_RECOVERY_UNFOUND
);
5433 pl
->cancel_local_background_io_reservation();
5434 release_reservations(true);
5435 return transit
<NotRecovering
>();
5438 void PeeringState::Recovering::exit()
5440 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5443 utime_t dur
= ceph_clock_now() - enter_time
;
5444 ps
->state_clear(PG_STATE_RECOVERING
);
5445 pl
->get_peering_perf().tinc(rs_recovering_latency
, dur
);
5448 PeeringState::Recovered::Recovered(my_context ctx
)
5450 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Recovered")
5452 pg_shard_t auth_log_shard
;
5454 context
< PeeringMachine
>().log_enter(state_name
);
5458 ceph_assert(!ps
->needs_recovery());
5460 // if we finished backfill, all acting are active; recheck if
5461 // DEGRADED | UNDERSIZED is appropriate.
5462 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5463 if (ps
->get_osdmap()->get_pg_size(context
< PeeringMachine
>().spgid
.pgid
) <=
5464 ps
->acting_recovery_backfill
.size()) {
5465 ps
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
5466 pl
->publish_stats_to_osd();
5469 // adjust acting set? (e.g. because backfill completed...)
5470 bool history_les_bound
= false;
5471 if (ps
->acting
!= ps
->up
&& !ps
->choose_acting(auth_log_shard
,
5472 true, &history_les_bound
)) {
5473 ceph_assert(ps
->want_acting
.size());
5474 } else if (!ps
->async_recovery_targets
.empty()) {
5475 ps
->choose_acting(auth_log_shard
, true, &history_les_bound
);
5478 if (context
< Active
>().all_replicas_activated
&&
5479 ps
->async_recovery_targets
.empty())
5480 post_event(GoClean());
5483 void PeeringState::Recovered::exit()
5485 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5488 utime_t dur
= ceph_clock_now() - enter_time
;
5489 pl
->get_peering_perf().tinc(rs_recovered_latency
, dur
);
5492 PeeringState::Clean::Clean(my_context ctx
)
5494 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Clean")
5496 context
< PeeringMachine
>().log_enter(state_name
);
5500 if (ps
->info
.last_complete
!= ps
->info
.last_update
) {
5505 ps
->try_mark_clean();
5507 context
< PeeringMachine
>().get_cur_transaction().register_on_commit(
5511 void PeeringState::Clean::exit()
5513 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5516 ps
->state_clear(PG_STATE_CLEAN
);
5517 utime_t dur
= ceph_clock_now() - enter_time
;
5518 pl
->get_peering_perf().tinc(rs_clean_latency
, dur
);
5521 template <typename T
>
5522 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
5524 set
<int> osds_found
;
5525 set
<pg_shard_t
> out
;
5526 for (typename
T::const_iterator i
= in
.begin();
5529 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
5530 osds_found
.insert(i
->osd
);
5537 /*---------Active---------*/
5538 PeeringState::Active::Active(my_context ctx
)
5540 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active"),
5541 remote_shards_to_reserve_recovery(
5542 unique_osd_shard_set(
5543 context
< PeeringMachine
>().state
->pg_whoami
,
5544 context
< PeeringMachine
>().state
->acting_recovery_backfill
)),
5545 remote_shards_to_reserve_backfill(
5546 unique_osd_shard_set(
5547 context
< PeeringMachine
>().state
->pg_whoami
,
5548 context
< PeeringMachine
>().state
->backfill_targets
)),
5549 all_replicas_activated(false)
5551 context
< PeeringMachine
>().log_enter(state_name
);
5556 ceph_assert(!ps
->backfill_reserved
);
5557 ceph_assert(ps
->is_primary());
5558 psdout(10) << "In Active, about to call activate" << dendl
;
5559 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
5560 ps
->activate(context
< PeeringMachine
>().get_cur_transaction(),
5561 ps
->get_osdmap_epoch(),
5562 context
< PeeringMachine
>().get_recovery_ctx());
5564 // everyone has to commit/ack before we are truly active
5565 ps
->blocked_by
.clear();
5566 for (set
<pg_shard_t
>::iterator p
= ps
->acting_recovery_backfill
.begin();
5567 p
!= ps
->acting_recovery_backfill
.end();
5569 if (p
->shard
!= ps
->pg_whoami
.shard
) {
5570 ps
->blocked_by
.insert(p
->shard
);
5573 pl
->publish_stats_to_osd();
5574 psdout(10) << "Activate Finished" << dendl
;
5577 boost::statechart::result
PeeringState::Active::react(const AdvMap
& advmap
)
5581 if (ps
->should_restart_peering(
5583 advmap
.acting_primary
,
5588 psdout(10) << "Active advmap interval change, fast return" << dendl
;
5589 return forward_event();
5591 psdout(10) << "Active advmap" << dendl
;
5592 bool need_publish
= false;
5594 pl
->on_active_advmap(advmap
.osdmap
);
5595 if (ps
->dirty_big_info
) {
5596 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5597 // purged snaps and (b) perhaps share more snaps that we have purged
5598 // but didn't fit in pg_stat_t.
5599 need_publish
= true;
5600 ps
->share_pg_info();
5603 bool need_acting_change
= false;
5604 for (size_t i
= 0; i
< ps
->want_acting
.size(); i
++) {
5605 int osd
= ps
->want_acting
[i
];
5606 if (!advmap
.osdmap
->is_up(osd
)) {
5607 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
5608 if (!ps
->is_acting(osd_with_shard
) && !ps
->is_up(osd_with_shard
)) {
5609 psdout(10) << "Active stray osd." << osd
<< " in want_acting is down"
5611 need_acting_change
= true;
5615 if (need_acting_change
) {
5616 psdout(10) << "Active need acting change, call choose_acting again"
5618 // possibly because we re-add some strays into the acting set and
5619 // some of them then go down in a subsequent map before we could see
5620 // the map changing the pg temp.
5621 // call choose_acting again to clear them out.
5622 // note that we leave restrict_to_up_acting to false in order to
5623 // not overkill any chosen stray that is still alive.
5624 pg_shard_t auth_log_shard
;
5625 bool history_les_bound
= false;
5626 ps
->remove_down_peer_info(advmap
.osdmap
);
5627 ps
->choose_acting(auth_log_shard
, false, &history_les_bound
, true);
5630 /* Check for changes in pool size (if the acting set changed as a result,
5631 * this does not matter) */
5632 if (advmap
.lastmap
->get_pg_size(ps
->info
.pgid
.pgid
) !=
5633 ps
->get_osdmap()->get_pg_size(ps
->info
.pgid
.pgid
)) {
5634 if (ps
->get_osdmap()->get_pg_size(ps
->info
.pgid
.pgid
) <=
5635 ps
->actingset
.size()) {
5636 ps
->state_clear(PG_STATE_UNDERSIZED
);
5638 ps
->state_set(PG_STATE_UNDERSIZED
);
5640 // degraded changes will be detected by call from publish_stats_to_osd()
5641 need_publish
= true;
5644 // if we haven't reported our PG stats in a long time, do so now.
5645 if (ps
->info
.stats
.reported_epoch
+ ps
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
5646 psdout(20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - ps
->info
.stats
.reported_epoch
)
5647 << " epochs" << dendl
;
5648 need_publish
= true;
5652 pl
->publish_stats_to_osd();
5654 if (ps
->check_prior_readable_down_osds(advmap
.osdmap
)) {
5655 pl
->recheck_readable();
5658 return forward_event();
5661 boost::statechart::result
PeeringState::Active::react(const ActMap
&)
5664 psdout(10) << "Active: handling ActMap" << dendl
;
5665 ceph_assert(ps
->is_primary());
5667 pl
->on_active_actmap();
5669 if (ps
->have_unfound()) {
5670 // object may have become unfound
5671 ps
->discover_all_missing(context
<PeeringMachine
>().get_recovery_ctx().msgs
);
5674 uint64_t unfound
= ps
->missing_loc
.num_unfound();
5676 ps
->all_unfound_are_queried_or_lost(ps
->get_osdmap())) {
5677 if (ps
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
5678 pl
->get_clog_error() << context
< PeeringMachine
>().spgid
.pgid
<< " has " << unfound
5679 << " objects unfound and apparently lost, would automatically "
5680 << "mark these objects lost but this feature is not yet implemented "
5681 << "(osd_auto_mark_unfound_lost)";
5683 pl
->get_clog_error() << context
< PeeringMachine
>().spgid
.pgid
<< " has "
5684 << unfound
<< " objects unfound and apparently lost";
5687 return forward_event();
5690 boost::statechart::result
PeeringState::Active::react(const MNotifyRec
& notevt
)
5694 ceph_assert(ps
->is_primary());
5695 if (ps
->peer_info
.count(notevt
.from
)) {
5696 psdout(10) << "Active: got notify from " << notevt
.from
5697 << ", already have info from that osd, ignoring"
5699 } else if (ps
->peer_purged
.count(notevt
.from
)) {
5700 psdout(10) << "Active: got notify from " << notevt
.from
5701 << ", already purged that peer, ignoring"
5704 psdout(10) << "Active: got notify from " << notevt
.from
5705 << ", calling proc_replica_info and discover_all_missing"
5707 ps
->proc_replica_info(
5708 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
5709 if (ps
->have_unfound() || (ps
->is_degraded() && ps
->might_have_unfound
.count(notevt
.from
))) {
5710 ps
->discover_all_missing(
5711 context
<PeeringMachine
>().get_recovery_ctx().msgs
);
5713 // check if it is a previous down acting member that's coming back.
5714 // if so, request pg_temp change to trigger a new interval transition
5715 pg_shard_t auth_log_shard
;
5716 bool history_les_bound
= false;
5717 ps
->choose_acting(auth_log_shard
, false, &history_les_bound
, true);
5718 if (!ps
->want_acting
.empty() && ps
->want_acting
!= ps
->acting
) {
5719 psdout(10) << "Active: got notify from previous acting member "
5720 << notevt
.from
<< ", requesting pg_temp change"
5724 return discard_event();
5727 boost::statechart::result
PeeringState::Active::react(const MTrim
& trim
)
5730 ceph_assert(ps
->is_primary());
5732 // peer is informing us of their last_complete_ondisk
5733 ldout(ps
->cct
,10) << " replica osd." << trim
.from
<< " lcod " << trim
.trim_to
<< dendl
;
5734 ps
->update_peer_last_complete_ondisk(pg_shard_t
{trim
.from
, trim
.shard
},
5736 // trim log when the pg is recovered
5737 ps
->calc_min_last_complete_ondisk();
5738 return discard_event();
5741 boost::statechart::result
PeeringState::Active::react(const MInfoRec
& infoevt
)
5744 ceph_assert(ps
->is_primary());
5746 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5747 if (infoevt
.lease_ack
) {
5748 ps
->proc_lease_ack(infoevt
.from
.osd
, *infoevt
.lease_ack
);
5750 // don't update history (yet) if we are active and primary; the replica
5751 // may be telling us they have activated (and committed) but we can't
5752 // share that until _everyone_ does the same.
5753 if (ps
->is_acting_recovery_backfill(infoevt
.from
) &&
5754 ps
->peer_activated
.count(infoevt
.from
) == 0) {
5755 psdout(10) << " peer osd." << infoevt
.from
5756 << " activated and committed" << dendl
;
5757 ps
->peer_activated
.insert(infoevt
.from
);
5758 ps
->blocked_by
.erase(infoevt
.from
.shard
);
5759 pl
->publish_stats_to_osd();
5760 if (ps
->peer_activated
.size() == ps
->acting_recovery_backfill
.size()) {
5761 all_activated_and_committed();
5764 return discard_event();
5767 boost::statechart::result
PeeringState::Active::react(const MLogRec
& logevt
)
5770 psdout(10) << "searching osd." << logevt
.from
5771 << " log for unfound items" << dendl
;
5772 ps
->proc_replica_log(
5773 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
5774 bool got_missing
= ps
->search_for_missing(
5775 ps
->peer_info
[logevt
.from
],
5776 ps
->peer_missing
[logevt
.from
],
5778 context
< PeeringMachine
>().get_recovery_ctx());
5779 // If there are missing AND we are "fully" active then start recovery now
5780 if (got_missing
&& ps
->state_test(PG_STATE_ACTIVE
)) {
5781 post_event(DoRecovery());
5783 return discard_event();
5786 boost::statechart::result
PeeringState::Active::react(const QueryState
& q
)
5790 q
.f
->open_object_section("state");
5791 q
.f
->dump_string("name", state_name
);
5792 q
.f
->dump_stream("enter_time") << enter_time
;
5795 q
.f
->open_array_section("might_have_unfound");
5796 for (set
<pg_shard_t
>::iterator p
= ps
->might_have_unfound
.begin();
5797 p
!= ps
->might_have_unfound
.end();
5799 q
.f
->open_object_section("osd");
5800 q
.f
->dump_stream("osd") << *p
;
5801 if (ps
->peer_missing
.count(*p
)) {
5802 q
.f
->dump_string("status", "already probed");
5803 } else if (ps
->peer_missing_requested
.count(*p
)) {
5804 q
.f
->dump_string("status", "querying");
5805 } else if (!ps
->get_osdmap()->is_up(p
->osd
)) {
5806 q
.f
->dump_string("status", "osd is down");
5808 q
.f
->dump_string("status", "not queried");
5810 q
.f
->close_section();
5812 q
.f
->close_section();
5815 q
.f
->open_object_section("recovery_progress");
5816 q
.f
->open_array_section("backfill_targets");
5817 for (set
<pg_shard_t
>::const_iterator p
= ps
->backfill_targets
.begin();
5818 p
!= ps
->backfill_targets
.end(); ++p
)
5819 q
.f
->dump_stream("replica") << *p
;
5820 q
.f
->close_section();
5821 pl
->dump_recovery_info(q
.f
);
5822 q
.f
->close_section();
5825 q
.f
->close_section();
5826 return forward_event();
5829 boost::statechart::result
PeeringState::Active::react(
5830 const ActivateCommitted
&evt
)
5833 ceph_assert(!ps
->peer_activated
.count(ps
->pg_whoami
));
5834 ps
->peer_activated
.insert(ps
->pg_whoami
);
5835 psdout(10) << "_activate_committed " << evt
.epoch
5836 << " peer_activated now " << ps
->peer_activated
5837 << " last_interval_started "
5838 << ps
->info
.history
.last_interval_started
5839 << " last_epoch_started "
5840 << ps
->info
.history
.last_epoch_started
5841 << " same_interval_since "
5842 << ps
->info
.history
.same_interval_since
5844 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5845 if (ps
->peer_activated
.size() == ps
->acting_recovery_backfill
.size())
5846 all_activated_and_committed();
5847 return discard_event();
5850 boost::statechart::result
PeeringState::Active::react(const AllReplicasActivated
&evt
)
5854 pg_t pgid
= context
< PeeringMachine
>().spgid
.pgid
;
5856 all_replicas_activated
= true;
5858 ps
->state_clear(PG_STATE_ACTIVATING
);
5859 ps
->state_clear(PG_STATE_CREATING
);
5860 ps
->state_clear(PG_STATE_PREMERGE
);
5863 if (ps
->pool
.info
.is_pending_merge(pgid
, &merge_target
)) {
5864 ps
->state_set(PG_STATE_PEERED
);
5865 ps
->state_set(PG_STATE_PREMERGE
);
5867 if (ps
->actingset
.size() != ps
->get_osdmap()->get_pg_size(pgid
)) {
5870 src
.set_ps(ps
->pool
.info
.get_pg_num_pending());
5871 assert(src
.get_parent() == pgid
);
5872 pl
->set_not_ready_to_merge_target(pgid
, src
);
5874 pl
->set_not_ready_to_merge_source(pgid
);
5877 } else if (ps
->actingset
.size() < ps
->pool
.info
.min_size
) {
5878 ps
->state_set(PG_STATE_PEERED
);
5880 ps
->state_set(PG_STATE_ACTIVE
);
5883 auto mnow
= pl
->get_mnow();
5884 if (ps
->prior_readable_until_ub
> mnow
) {
5885 psdout(10) << " waiting for prior_readable_until_ub "
5886 << ps
->prior_readable_until_ub
<< " > mnow " << mnow
<< dendl
;
5887 ps
->state_set(PG_STATE_WAIT
);
5888 pl
->queue_check_readable(
5889 ps
->last_peering_reset
,
5890 ps
->prior_readable_until_ub
- mnow
);
5892 psdout(10) << " mnow " << mnow
<< " >= prior_readable_until_ub "
5893 << ps
->prior_readable_until_ub
<< dendl
;
5896 if (ps
->pool
.info
.has_flag(pg_pool_t::FLAG_CREATING
)) {
5897 pl
->send_pg_created(pgid
);
5900 ps
->info
.history
.last_epoch_started
= ps
->info
.last_epoch_started
;
5901 ps
->info
.history
.last_interval_started
= ps
->info
.last_interval_started
;
5902 ps
->dirty_info
= true;
5904 ps
->share_pg_info();
5905 pl
->publish_stats_to_osd();
5907 pl
->on_activate_complete();
5909 return discard_event();
5912 boost::statechart::result
PeeringState::Active::react(const RenewLease
& rl
)
5915 ps
->proc_renew_lease();
5916 return discard_event();
5919 boost::statechart::result
PeeringState::Active::react(const MLeaseAck
& la
)
5922 ps
->proc_lease_ack(la
.from
, la
.lease_ack
);
5923 return discard_event();
5927 boost::statechart::result
PeeringState::Active::react(const CheckReadable
&evt
)
5930 pl
->recheck_readable();
5931 return discard_event();
5935 * update info.history.last_epoch_started ONLY after we and all
5936 * replicas have activated AND committed the activate transaction
5937 * (i.e. the peering results are stable on disk).
5939 void PeeringState::Active::all_activated_and_committed()
5942 psdout(10) << "all_activated_and_committed" << dendl
;
5943 ceph_assert(ps
->is_primary());
5944 ceph_assert(ps
->peer_activated
.size() == ps
->acting_recovery_backfill
.size());
5945 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5946 ceph_assert(ps
->blocked_by
.empty());
5948 if (HAVE_FEATURE(ps
->upacting_features
, SERVER_OCTOPUS
)) {
5949 // this is overkill when the activation is quick, but when it is slow it
5950 // is important, because the lease was renewed by the activate itself but we
5951 // don't know how long ago that was, and simply scheduling now may leave
5952 // a gap in lease coverage. keep it simple and aggressively renew.
5953 ps
->renew_lease(pl
->get_mnow());
5955 ps
->schedule_renew_lease();
5959 ps
->update_calc_stats();
5960 if (ps
->info
.stats
.stats
.sum
.num_objects_degraded
) {
5961 ps
->state_set(PG_STATE_DEGRADED
);
5963 ps
->state_clear(PG_STATE_DEGRADED
);
5966 post_event(PeeringState::AllReplicasActivated());
5970 void PeeringState::Active::exit()
5972 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5976 pl
->cancel_local_background_io_reservation();
5978 ps
->blocked_by
.clear();
5979 ps
->backfill_reserved
= false;
5980 ps
->state_clear(PG_STATE_ACTIVATING
);
5981 ps
->state_clear(PG_STATE_DEGRADED
);
5982 ps
->state_clear(PG_STATE_UNDERSIZED
);
5983 ps
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
5984 ps
->state_clear(PG_STATE_BACKFILL_WAIT
);
5985 ps
->state_clear(PG_STATE_RECOVERY_WAIT
);
5986 ps
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
5987 utime_t dur
= ceph_clock_now() - enter_time
;
5988 pl
->get_peering_perf().tinc(rs_active_latency
, dur
);
5989 pl
->on_active_exit();
5992 /*------ReplicaActive-----*/
5993 PeeringState::ReplicaActive::ReplicaActive(my_context ctx
)
5995 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive")
5997 context
< PeeringMachine
>().log_enter(state_name
);
6000 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
6004 boost::statechart::result
PeeringState::ReplicaActive::react(
6005 const Activate
& actevt
) {
6007 psdout(10) << "In ReplicaActive, about to call activate" << dendl
;
6009 context
< PeeringMachine
>().get_cur_transaction(),
6010 actevt
.activation_epoch
,
6011 context
< PeeringMachine
>().get_recovery_ctx());
6012 psdout(10) << "Activate Finished" << dendl
;
6013 return discard_event();
6016 boost::statechart::result
PeeringState::ReplicaActive::react(
6017 const ActivateCommitted
&evt
)
6020 psdout(10) << __func__
<< " " << evt
.epoch
<< " telling primary" << dendl
;
6022 auto &rctx
= context
<PeeringMachine
>().get_recovery_ctx();
6023 auto epoch
= ps
->get_osdmap_epoch();
6024 pg_info_t i
= ps
->info
;
6025 i
.history
.last_epoch_started
= evt
.activation_epoch
;
6026 i
.history
.last_interval_started
= i
.history
.same_interval_since
;
6028 ps
->get_primary().osd
,
6029 spg_t(ps
->info
.pgid
.pgid
, ps
->get_primary().shard
),
6034 ps
->get_lease_ack());
6036 if (ps
->actingset
.size() >= ps
->pool
.info
.min_size
) {
6037 ps
->state_set(PG_STATE_ACTIVE
);
6039 ps
->state_set(PG_STATE_PEERED
);
6041 pl
->on_activate_committed();
6043 return discard_event();
6046 boost::statechart::result
PeeringState::ReplicaActive::react(const MLease
& l
)
6049 spg_t spgid
= context
< PeeringMachine
>().spgid
;
6050 epoch_t epoch
= pl
->get_osdmap_epoch();
6052 ps
->proc_lease(l
.lease
);
6053 pl
->send_cluster_message(
6054 ps
->get_primary().osd
,
6055 new MOSDPGLeaseAck(epoch
,
6056 spg_t(spgid
.pgid
, ps
->get_primary().shard
),
6057 ps
->get_lease_ack()),
6059 return discard_event();
6062 boost::statechart::result
PeeringState::ReplicaActive::react(const MInfoRec
& infoevt
)
6065 ps
->proc_primary_info(context
<PeeringMachine
>().get_cur_transaction(),
6067 return discard_event();
6070 boost::statechart::result
PeeringState::ReplicaActive::react(const MLogRec
& logevt
)
6073 psdout(10) << "received log from " << logevt
.from
<< dendl
;
6074 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6075 ps
->merge_log(t
, logevt
.msg
->info
, logevt
.msg
->log
, logevt
.from
);
6076 ceph_assert(ps
->pg_log
.get_head() == ps
->info
.last_update
);
6077 if (logevt
.msg
->lease
) {
6078 ps
->proc_lease(*logevt
.msg
->lease
);
6081 return discard_event();
6084 boost::statechart::result
PeeringState::ReplicaActive::react(const MTrim
& trim
)
6087 // primary is instructing us to trim
6088 ps
->pg_log
.trim(trim
.trim_to
, ps
->info
);
6089 ps
->dirty_info
= true;
6090 return discard_event();
6093 boost::statechart::result
PeeringState::ReplicaActive::react(const ActMap
&)
6096 if (ps
->should_send_notify() && ps
->get_primary().osd
>= 0) {
6097 ps
->info
.history
.refresh_prior_readable_until_ub(
6098 pl
->get_mnow(), ps
->prior_readable_until_ub
);
6099 context
< PeeringMachine
>().send_notify(
6100 ps
->get_primary().osd
,
6102 ps
->get_primary().shard
, ps
->pg_whoami
.shard
,
6103 ps
->get_osdmap_epoch(),
6104 ps
->get_osdmap_epoch(),
6106 ps
->past_intervals
));
6108 return discard_event();
6111 boost::statechart::result
PeeringState::ReplicaActive::react(
6112 const MQuery
& query
)
6115 ps
->fulfill_query(query
, context
<PeeringMachine
>().get_recovery_ctx());
6116 return discard_event();
6119 boost::statechart::result
PeeringState::ReplicaActive::react(const QueryState
& q
)
6121 q
.f
->open_object_section("state");
6122 q
.f
->dump_string("name", state_name
);
6123 q
.f
->dump_stream("enter_time") << enter_time
;
6124 q
.f
->close_section();
6125 return forward_event();
6128 void PeeringState::ReplicaActive::exit()
6130 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6132 pl
->unreserve_recovery_space();
6134 pl
->cancel_remote_recovery_reservation();
6135 utime_t dur
= ceph_clock_now() - enter_time
;
6136 pl
->get_peering_perf().tinc(rs_replicaactive_latency
, dur
);
6138 ps
->min_last_complete_ondisk
= eversion_t();
6142 PeeringState::Stray::Stray(my_context ctx
)
6144 NamedState(context
< PeeringMachine
>().state_history
, "Started/Stray")
6146 context
< PeeringMachine
>().log_enter(state_name
);
6150 ceph_assert(!ps
->is_peered());
6151 ceph_assert(!ps
->is_peering());
6152 ceph_assert(!ps
->is_primary());
6154 if (!ps
->get_osdmap()->have_pg_pool(ps
->info
.pgid
.pgid
.pool())) {
6155 ldout(ps
->cct
,10) << __func__
<< " pool is deleted" << dendl
;
6156 post_event(DeleteStart());
6158 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
6162 boost::statechart::result
PeeringState::Stray::react(const MLogRec
& logevt
)
6165 MOSDPGLog
*msg
= logevt
.msg
.get();
6166 psdout(10) << "got info+log from osd." << logevt
.from
<< " " << msg
->info
<< " " << msg
->log
<< dendl
;
6168 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6169 if (msg
->info
.last_backfill
== hobject_t()) {
6171 ps
->info
= msg
->info
;
6172 pl
->on_info_history_change();
6173 ps
->dirty_info
= true;
6174 ps
->dirty_big_info
= true; // maybe.
6176 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
6177 ps
->pg_log
.reset_backfill_claim_log(msg
->log
, rollbacker
.get());
6179 ps
->pg_log
.reset_backfill();
6181 ps
->merge_log(t
, msg
->info
, msg
->log
, logevt
.from
);
6183 if (logevt
.msg
->lease
) {
6184 ps
->proc_lease(*logevt
.msg
->lease
);
6187 ceph_assert(ps
->pg_log
.get_head() == ps
->info
.last_update
);
6189 post_event(Activate(logevt
.msg
->info
.last_epoch_started
));
6190 return transit
<ReplicaActive
>();
6193 boost::statechart::result
PeeringState::Stray::react(const MInfoRec
& infoevt
)
6196 psdout(10) << "got info from osd." << infoevt
.from
<< " " << infoevt
.info
<< dendl
;
6198 if (ps
->info
.last_update
> infoevt
.info
.last_update
) {
6199 // rewind divergent log entries
6200 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6201 ps
->rewind_divergent_log(t
, infoevt
.info
.last_update
);
6202 ps
->info
.stats
= infoevt
.info
.stats
;
6203 ps
->info
.hit_set
= infoevt
.info
.hit_set
;
6206 if (infoevt
.lease
) {
6207 ps
->proc_lease(*infoevt
.lease
);
6210 ceph_assert(infoevt
.info
.last_update
== ps
->info
.last_update
);
6211 ceph_assert(ps
->pg_log
.get_head() == ps
->info
.last_update
);
6213 post_event(Activate(infoevt
.info
.last_epoch_started
));
6214 return transit
<ReplicaActive
>();
6217 boost::statechart::result
PeeringState::Stray::react(const MQuery
& query
)
6220 ps
->fulfill_query(query
, context
<PeeringMachine
>().get_recovery_ctx());
6221 return discard_event();
6224 boost::statechart::result
PeeringState::Stray::react(const ActMap
&)
6227 if (ps
->should_send_notify() && ps
->get_primary().osd
>= 0) {
6228 ps
->info
.history
.refresh_prior_readable_until_ub(
6229 pl
->get_mnow(), ps
->prior_readable_until_ub
);
6230 context
< PeeringMachine
>().send_notify(
6231 ps
->get_primary().osd
,
6233 ps
->get_primary().shard
, ps
->pg_whoami
.shard
,
6234 ps
->get_osdmap_epoch(),
6235 ps
->get_osdmap_epoch(),
6237 ps
->past_intervals
));
6239 return discard_event();
6242 void PeeringState::Stray::exit()
6244 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6246 utime_t dur
= ceph_clock_now() - enter_time
;
6247 pl
->get_peering_perf().tinc(rs_stray_latency
, dur
);
6251 /*--------ToDelete----------*/
6252 PeeringState::ToDelete::ToDelete(my_context ctx
)
6254 NamedState(context
< PeeringMachine
>().state_history
, "Started/ToDelete")
6256 context
< PeeringMachine
>().log_enter(state_name
);
6258 pl
->get_perf_logger().inc(l_osd_pg_removing
);
6261 void PeeringState::ToDelete::exit()
6263 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6265 // note: on a successful removal, this path doesn't execute. see
6267 pl
->get_perf_logger().dec(l_osd_pg_removing
);
6269 pl
->cancel_local_background_io_reservation();
6272 /*----WaitDeleteReserved----*/
6273 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx
)
6275 NamedState(context
< PeeringMachine
>().state_history
,
6276 "Started/ToDelete/WaitDeleteReseved")
6278 context
< PeeringMachine
>().log_enter(state_name
);
6280 context
< ToDelete
>().priority
= ps
->get_delete_priority();
6282 pl
->cancel_local_background_io_reservation();
6283 pl
->request_local_background_io_reservation(
6284 context
<ToDelete
>().priority
,
6285 std::make_shared
<PGPeeringEvent
>(
6286 ps
->get_osdmap_epoch(),
6287 ps
->get_osdmap_epoch(),
6289 std::make_shared
<PGPeeringEvent
>(
6290 ps
->get_osdmap_epoch(),
6291 ps
->get_osdmap_epoch(),
6292 DeleteInterrupted()));
6295 boost::statechart::result
PeeringState::ToDelete::react(
6299 if (ps
->get_delete_priority() != priority
) {
6300 psdout(10) << __func__
<< " delete priority changed, resetting"
6302 return transit
<ToDelete
>();
6304 return discard_event();
6307 void PeeringState::WaitDeleteReserved::exit()
6309 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6312 /*----Deleting-----*/
6313 PeeringState::Deleting::Deleting(my_context ctx
)
6315 NamedState(context
< PeeringMachine
>().state_history
, "Started/ToDelete/Deleting")
6317 start
= ceph::mono_clock::now();
6319 context
< PeeringMachine
>().log_enter(state_name
);
6322 ps
->deleting
= true;
6323 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6326 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
6327 ps
->pg_log
.roll_forward(rollbacker
.get());
6329 // adjust info to backfill
6330 ps
->info
.set_last_backfill(hobject_t());
6331 ps
->pg_log
.reset_backfill();
6332 ps
->dirty_info
= true;
6337 boost::statechart::result
PeeringState::Deleting::react(
6338 const DeleteSome
& evt
)
6341 next
= pl
->do_delete_work(context
<PeeringMachine
>().get_cur_transaction(),
6343 return discard_event();
6346 void PeeringState::Deleting::exit()
6348 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6350 ps
->deleting
= false;
6351 pl
->cancel_local_background_io_reservation();
6352 psdout(20) << "Deleting::" << __func__
<< this <<" finished in "
6353 << ceph::mono_clock::now() - start
6357 /*--------GetInfo---------*/
6358 PeeringState::GetInfo::GetInfo(my_context ctx
)
6360 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/GetInfo")
6362 context
< PeeringMachine
>().log_enter(state_name
);
6366 ps
->check_past_interval_bounds();
6367 ps
->log_weirdness();
6368 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6370 ceph_assert(ps
->blocked_by
.empty());
6372 prior_set
= ps
->build_prior();
6373 ps
->prior_readable_down_osds
= prior_set
.down
;
6374 if (ps
->prior_readable_down_osds
.empty()) {
6375 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6377 ps
->clear_prior_readable_until_ub();
6380 ps
->reset_min_peer_features();
6382 if (prior_set
.pg_down
) {
6383 post_event(IsDown());
6384 } else if (peer_info_requested
.empty()) {
6385 post_event(GotInfo());
6389 void PeeringState::GetInfo::get_infos()
6392 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6394 ps
->blocked_by
.clear();
6395 for (set
<pg_shard_t
>::const_iterator it
= prior_set
.probe
.begin();
6396 it
!= prior_set
.probe
.end();
6398 pg_shard_t peer
= *it
;
6399 if (peer
== ps
->pg_whoami
) {
6402 if (ps
->peer_info
.count(peer
)) {
6403 psdout(10) << " have osd." << peer
<< " info " << ps
->peer_info
[peer
] << dendl
;
6406 if (peer_info_requested
.count(peer
)) {
6407 psdout(10) << " already requested info from osd." << peer
<< dendl
;
6408 ps
->blocked_by
.insert(peer
.osd
);
6409 } else if (!ps
->get_osdmap()->is_up(peer
.osd
)) {
6410 psdout(10) << " not querying info from down osd." << peer
<< dendl
;
6412 psdout(10) << " querying info from osd." << peer
<< dendl
;
6413 context
< PeeringMachine
>().send_query(
6415 pg_query_t(pg_query_t::INFO
,
6416 it
->shard
, ps
->pg_whoami
.shard
,
6418 ps
->get_osdmap_epoch()));
6419 peer_info_requested
.insert(peer
);
6420 ps
->blocked_by
.insert(peer
.osd
);
6424 ps
->check_prior_readable_down_osds(ps
->get_osdmap());
6426 pl
->publish_stats_to_osd();
6429 boost::statechart::result
PeeringState::GetInfo::react(const MNotifyRec
& infoevt
)
6434 set
<pg_shard_t
>::iterator p
= peer_info_requested
.find(infoevt
.from
);
6435 if (p
!= peer_info_requested
.end()) {
6436 peer_info_requested
.erase(p
);
6437 ps
->blocked_by
.erase(infoevt
.from
.osd
);
6440 epoch_t old_start
= ps
->info
.history
.last_epoch_started
;
6441 if (ps
->proc_replica_info(
6442 infoevt
.from
, infoevt
.notify
.info
, infoevt
.notify
.epoch_sent
)) {
6443 // we got something new ...
6444 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6445 if (old_start
< ps
->info
.history
.last_epoch_started
) {
6446 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl
;
6447 prior_set
= ps
->build_prior();
6448 ps
->prior_readable_down_osds
= prior_set
.down
;
6450 // filter out any osds that got dropped from the probe set from
6451 // peer_info_requested. this is less expensive than restarting
6452 // peering (which would re-probe everyone).
6453 set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
6454 while (p
!= peer_info_requested
.end()) {
6455 if (prior_set
.probe
.count(*p
) == 0) {
6456 psdout(20) << " dropping osd." << *p
<< " from info_requested, no longer in probe set" << dendl
;
6457 peer_info_requested
.erase(p
++);
6464 psdout(20) << "Adding osd: " << infoevt
.from
.osd
<< " peer features: "
6465 << hex
<< infoevt
.features
<< dec
<< dendl
;
6466 ps
->apply_peer_features(infoevt
.features
);
6468 // are we done getting everything?
6469 if (peer_info_requested
.empty() && !prior_set
.pg_down
) {
6470 psdout(20) << "Common peer features: " << hex
<< ps
->get_min_peer_features() << dec
<< dendl
;
6471 psdout(20) << "Common acting features: " << hex
<< ps
->get_min_acting_features() << dec
<< dendl
;
6472 psdout(20) << "Common upacting features: " << hex
<< ps
->get_min_upacting_features() << dec
<< dendl
;
6473 post_event(GotInfo());
6476 return discard_event();
6479 boost::statechart::result
PeeringState::GetInfo::react(const QueryState
& q
)
6482 q
.f
->open_object_section("state");
6483 q
.f
->dump_string("name", state_name
);
6484 q
.f
->dump_stream("enter_time") << enter_time
;
6486 q
.f
->open_array_section("requested_info_from");
6487 for (set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
6488 p
!= peer_info_requested
.end();
6490 q
.f
->open_object_section("osd");
6491 q
.f
->dump_stream("osd") << *p
;
6492 if (ps
->peer_info
.count(*p
)) {
6493 q
.f
->open_object_section("got_info");
6494 ps
->peer_info
[*p
].dump(q
.f
);
6495 q
.f
->close_section();
6497 q
.f
->close_section();
6499 q
.f
->close_section();
6501 q
.f
->close_section();
6502 return forward_event();
6505 void PeeringState::GetInfo::exit()
6507 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6510 utime_t dur
= ceph_clock_now() - enter_time
;
6511 pl
->get_peering_perf().tinc(rs_getinfo_latency
, dur
);
6512 ps
->blocked_by
.clear();
6515 /*------GetLog------------*/
6516 PeeringState::GetLog::GetLog(my_context ctx
)
6519 context
< PeeringMachine
>().state_history
,
6520 "Started/Primary/Peering/GetLog"),
6523 context
< PeeringMachine
>().log_enter(state_name
);
6527 ps
->log_weirdness();
6530 if (!ps
->choose_acting(auth_log_shard
, false,
6531 &context
< Peering
>().history_les_bound
)) {
6532 if (!ps
->want_acting
.empty()) {
6533 post_event(NeedActingChange());
6535 post_event(IsIncomplete());
6541 if (auth_log_shard
== ps
->pg_whoami
) {
6542 post_event(GotLog());
6546 const pg_info_t
& best
= ps
->peer_info
[auth_log_shard
];
6549 if (ps
->info
.last_update
< best
.log_tail
) {
6550 psdout(10) << " not contiguous with osd." << auth_log_shard
<< ", down" << dendl
;
6551 post_event(IsIncomplete());
6555 // how much log to request?
6556 eversion_t request_log_from
= ps
->info
.last_update
;
6557 ceph_assert(!ps
->acting_recovery_backfill
.empty());
6558 for (set
<pg_shard_t
>::iterator p
= ps
->acting_recovery_backfill
.begin();
6559 p
!= ps
->acting_recovery_backfill
.end();
6561 if (*p
== ps
->pg_whoami
) continue;
6562 pg_info_t
& ri
= ps
->peer_info
[*p
];
6563 if (ri
.last_update
< ps
->info
.log_tail
&& ri
.last_update
>= best
.log_tail
&&
6564 ri
.last_update
< request_log_from
)
6565 request_log_from
= ri
.last_update
;
6569 psdout(10) << " requesting log from osd." << auth_log_shard
<< dendl
;
6570 context
<PeeringMachine
>().send_query(
6574 auth_log_shard
.shard
, ps
->pg_whoami
.shard
,
6575 request_log_from
, ps
->info
.history
,
6576 ps
->get_osdmap_epoch()));
6578 ceph_assert(ps
->blocked_by
.empty());
6579 ps
->blocked_by
.insert(auth_log_shard
.osd
);
6580 pl
->publish_stats_to_osd();
6583 boost::statechart::result
PeeringState::GetLog::react(const AdvMap
& advmap
)
6585 // make sure our log source didn't go down. we need to check
6586 // explicitly because it may not be part of the prior set, which
6587 // means the Peering state check won't catch it going down.
6588 if (!advmap
.osdmap
->is_up(auth_log_shard
.osd
)) {
6589 psdout(10) << "GetLog: auth_log_shard osd."
6590 << auth_log_shard
.osd
<< " went down" << dendl
;
6592 return transit
< Reset
>();
6595 // let the Peering state do its checks.
6596 return forward_event();
6599 boost::statechart::result
PeeringState::GetLog::react(const MLogRec
& logevt
)
6602 if (logevt
.from
!= auth_log_shard
) {
6603 psdout(10) << "GetLog: discarding log from "
6604 << "non-auth_log_shard osd." << logevt
.from
<< dendl
;
6605 return discard_event();
6607 psdout(10) << "GetLog: received master log from osd."
6608 << logevt
.from
<< dendl
;
6610 post_event(GotLog());
6611 return discard_event();
6614 boost::statechart::result
PeeringState::GetLog::react(const GotLog
&)
6618 psdout(10) << "leaving GetLog" << dendl
;
6620 psdout(10) << "processing master log" << dendl
;
6621 ps
->proc_master_log(context
<PeeringMachine
>().get_cur_transaction(),
6622 msg
->info
, msg
->log
, msg
->missing
,
6625 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
6626 return transit
< GetMissing
>();
6629 boost::statechart::result
PeeringState::GetLog::react(const QueryState
& q
)
6631 q
.f
->open_object_section("state");
6632 q
.f
->dump_string("name", state_name
);
6633 q
.f
->dump_stream("enter_time") << enter_time
;
6634 q
.f
->dump_stream("auth_log_shard") << auth_log_shard
;
6635 q
.f
->close_section();
6636 return forward_event();
6639 void PeeringState::GetLog::exit()
6641 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6644 utime_t dur
= ceph_clock_now() - enter_time
;
6645 pl
->get_peering_perf().tinc(rs_getlog_latency
, dur
);
6646 ps
->blocked_by
.clear();
6649 /*------WaitActingChange--------*/
6650 PeeringState::WaitActingChange::WaitActingChange(my_context ctx
)
6652 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/WaitActingChange")
6654 context
< PeeringMachine
>().log_enter(state_name
);
6657 boost::statechart::result
PeeringState::WaitActingChange::react(const AdvMap
& advmap
)
6660 OSDMapRef osdmap
= advmap
.osdmap
;
6662 psdout(10) << "verifying no want_acting " << ps
->want_acting
<< " targets didn't go down" << dendl
;
6663 for (vector
<int>::iterator p
= ps
->want_acting
.begin(); p
!= ps
->want_acting
.end(); ++p
) {
6664 if (!osdmap
->is_up(*p
)) {
6665 psdout(10) << " want_acting target osd." << *p
<< " went down, resetting" << dendl
;
6667 return transit
< Reset
>();
6670 return forward_event();
6673 boost::statechart::result
PeeringState::WaitActingChange::react(const MLogRec
& logevt
)
6675 psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl
;
6676 return discard_event();
6679 boost::statechart::result
PeeringState::WaitActingChange::react(const MInfoRec
& evt
)
6681 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl
;
6682 return discard_event();
6685 boost::statechart::result
PeeringState::WaitActingChange::react(const MNotifyRec
& evt
)
6687 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl
;
6688 return discard_event();
6691 boost::statechart::result
PeeringState::WaitActingChange::react(const QueryState
& q
)
6693 q
.f
->open_object_section("state");
6694 q
.f
->dump_string("name", state_name
);
6695 q
.f
->dump_stream("enter_time") << enter_time
;
6696 q
.f
->dump_string("comment", "waiting for pg acting set to change");
6697 q
.f
->close_section();
6698 return forward_event();
6701 void PeeringState::WaitActingChange::exit()
6703 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6705 utime_t dur
= ceph_clock_now() - enter_time
;
6706 pl
->get_peering_perf().tinc(rs_waitactingchange_latency
, dur
);
6709 /*------Down--------*/
6710 PeeringState::Down::Down(my_context ctx
)
6712 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/Down")
6714 context
< PeeringMachine
>().log_enter(state_name
);
6717 ps
->state_clear(PG_STATE_PEERING
);
6718 ps
->state_set(PG_STATE_DOWN
);
6720 auto &prior_set
= context
< Peering
>().prior_set
;
6721 ceph_assert(ps
->blocked_by
.empty());
6722 ps
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
6723 pl
->publish_stats_to_osd();
6726 void PeeringState::Down::exit()
6728 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6732 ps
->state_clear(PG_STATE_DOWN
);
6733 utime_t dur
= ceph_clock_now() - enter_time
;
6734 pl
->get_peering_perf().tinc(rs_down_latency
, dur
);
6736 ps
->blocked_by
.clear();
6739 boost::statechart::result
PeeringState::Down::react(const QueryState
& q
)
6741 q
.f
->open_object_section("state");
6742 q
.f
->dump_string("name", state_name
);
6743 q
.f
->dump_stream("enter_time") << enter_time
;
6744 q
.f
->dump_string("comment",
6745 "not enough up instances of this PG to go active");
6746 q
.f
->close_section();
6747 return forward_event();
6750 boost::statechart::result
PeeringState::Down::react(const MNotifyRec
& infoevt
)
6754 ceph_assert(ps
->is_primary());
6755 epoch_t old_start
= ps
->info
.history
.last_epoch_started
;
6756 if (!ps
->peer_info
.count(infoevt
.from
) &&
6757 ps
->get_osdmap()->has_been_up_since(infoevt
.from
.osd
, infoevt
.notify
.epoch_sent
)) {
6758 ps
->update_history(infoevt
.notify
.info
.history
);
6760 // if we got something new to make pg escape down state
6761 if (ps
->info
.history
.last_epoch_started
> old_start
) {
6762 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl
;
6763 ps
->state_clear(PG_STATE_DOWN
);
6764 ps
->state_set(PG_STATE_PEERING
);
6765 return transit
< GetInfo
>();
6768 return discard_event();
6772 /*------Incomplete--------*/
6773 PeeringState::Incomplete::Incomplete(my_context ctx
)
6775 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/Incomplete")
6777 context
< PeeringMachine
>().log_enter(state_name
);
6780 ps
->state_clear(PG_STATE_PEERING
);
6781 ps
->state_set(PG_STATE_INCOMPLETE
);
6783 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6784 ceph_assert(ps
->blocked_by
.empty());
6785 ps
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
6786 pl
->publish_stats_to_osd();
6789 boost::statechart::result
PeeringState::Incomplete::react(const AdvMap
&advmap
) {
6791 int64_t poolnum
= ps
->info
.pgid
.pool();
6793 // Reset if min_size turn smaller than previous value, pg might now be able to go active
6794 if (!advmap
.osdmap
->have_pg_pool(poolnum
) ||
6795 advmap
.lastmap
->get_pools().find(poolnum
)->second
.min_size
>
6796 advmap
.osdmap
->get_pools().find(poolnum
)->second
.min_size
) {
6798 return transit
< Reset
>();
6801 return forward_event();
6804 boost::statechart::result
PeeringState::Incomplete::react(const MNotifyRec
& notevt
) {
6806 psdout(7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
6807 if (ps
->proc_replica_info(
6808 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
)) {
6809 // We got something new, try again!
6810 return transit
< GetLog
>();
6812 return discard_event();
6816 boost::statechart::result
PeeringState::Incomplete::react(
6817 const QueryState
& q
)
6819 q
.f
->open_object_section("state");
6820 q
.f
->dump_string("name", state_name
);
6821 q
.f
->dump_stream("enter_time") << enter_time
;
6822 q
.f
->dump_string("comment", "not enough complete instances of this PG");
6823 q
.f
->close_section();
6824 return forward_event();
6827 void PeeringState::Incomplete::exit()
6829 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6833 ps
->state_clear(PG_STATE_INCOMPLETE
);
6834 utime_t dur
= ceph_clock_now() - enter_time
;
6835 pl
->get_peering_perf().tinc(rs_incomplete_latency
, dur
);
6837 ps
->blocked_by
.clear();
6840 /*------GetMissing--------*/
6841 PeeringState::GetMissing::GetMissing(my_context ctx
)
6843 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/GetMissing")
6845 context
< PeeringMachine
>().log_enter(state_name
);
6848 ps
->log_weirdness();
6849 ceph_assert(!ps
->acting_recovery_backfill
.empty());
6851 for (set
<pg_shard_t
>::iterator i
= ps
->acting_recovery_backfill
.begin();
6852 i
!= ps
->acting_recovery_backfill
.end();
6854 if (*i
== ps
->get_primary()) continue;
6855 const pg_info_t
& pi
= ps
->peer_info
[*i
];
6856 // reset this so to make sure the pg_missing_t is initialized and
6857 // has the correct semantics even if we don't need to get a
6858 // missing set from a shard. This way later additions due to
6859 // lost+unfound delete work properly.
6860 ps
->peer_missing
[*i
].may_include_deletes
= !ps
->perform_deletes_during_peering();
6863 continue; // no pg data, nothing divergent
6865 if (pi
.last_update
< ps
->pg_log
.get_tail()) {
6866 psdout(10) << " osd." << *i
<< " is not contiguous, will restart backfill" << dendl
;
6867 ps
->peer_missing
[*i
].clear();
6870 if (pi
.last_backfill
== hobject_t()) {
6871 psdout(10) << " osd." << *i
<< " will fully backfill; can infer empty missing set" << dendl
;
6872 ps
->peer_missing
[*i
].clear();
6876 if (pi
.last_update
== pi
.last_complete
&& // peer has no missing
6877 pi
.last_update
== ps
->info
.last_update
) { // peer is up to date
6878 // replica has no missing and identical log as us. no need to
6880 // FIXME: we can do better here. if last_update==last_complete we
6881 // can infer the rest!
6882 psdout(10) << " osd." << *i
<< " has no missing, identical log" << dendl
;
6883 ps
->peer_missing
[*i
].clear();
6887 // We pull the log from the peer's last_epoch_started to ensure we
6888 // get enough log to detect divergent updates.
6889 since
.epoch
= pi
.last_epoch_started
;
6890 ceph_assert(pi
.last_update
>= ps
->info
.log_tail
); // or else choose_acting() did a bad thing
6891 if (pi
.log_tail
<= since
) {
6892 psdout(10) << " requesting log+missing since " << since
<< " from osd." << *i
<< dendl
;
6893 context
< PeeringMachine
>().send_query(
6897 i
->shard
, ps
->pg_whoami
.shard
,
6898 since
, ps
->info
.history
,
6899 ps
->get_osdmap_epoch()));
6901 psdout(10) << " requesting fulllog+missing from osd." << *i
6902 << " (want since " << since
<< " < log.tail "
6903 << pi
.log_tail
<< ")" << dendl
;
6904 context
< PeeringMachine
>().send_query(
6906 pg_query_t::FULLLOG
,
6907 i
->shard
, ps
->pg_whoami
.shard
,
6908 ps
->info
.history
, ps
->get_osdmap_epoch()));
6910 peer_missing_requested
.insert(*i
);
6911 ps
->blocked_by
.insert(i
->osd
);
6914 if (peer_missing_requested
.empty()) {
6915 if (ps
->need_up_thru
) {
6916 psdout(10) << " still need up_thru update before going active"
6918 post_event(NeedUpThru());
6923 post_event(Activate(ps
->get_osdmap_epoch()));
6925 pl
->publish_stats_to_osd();
6929 boost::statechart::result
PeeringState::GetMissing::react(const MLogRec
& logevt
)
6933 peer_missing_requested
.erase(logevt
.from
);
6934 ps
->proc_replica_log(logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
6936 if (peer_missing_requested
.empty()) {
6937 if (ps
->need_up_thru
) {
6938 psdout(10) << " still need up_thru update before going active"
6940 post_event(NeedUpThru());
6942 psdout(10) << "Got last missing, don't need missing "
6943 << "posting Activate" << dendl
;
6944 post_event(Activate(ps
->get_osdmap_epoch()));
6947 return discard_event();
6950 boost::statechart::result
PeeringState::GetMissing::react(const QueryState
& q
)
6953 q
.f
->open_object_section("state");
6954 q
.f
->dump_string("name", state_name
);
6955 q
.f
->dump_stream("enter_time") << enter_time
;
6957 q
.f
->open_array_section("peer_missing_requested");
6958 for (set
<pg_shard_t
>::iterator p
= peer_missing_requested
.begin();
6959 p
!= peer_missing_requested
.end();
6961 q
.f
->open_object_section("osd");
6962 q
.f
->dump_stream("osd") << *p
;
6963 if (ps
->peer_missing
.count(*p
)) {
6964 q
.f
->open_object_section("got_missing");
6965 ps
->peer_missing
[*p
].dump(q
.f
);
6966 q
.f
->close_section();
6968 q
.f
->close_section();
6970 q
.f
->close_section();
6972 q
.f
->close_section();
6973 return forward_event();
6976 void PeeringState::GetMissing::exit()
6978 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6981 utime_t dur
= ceph_clock_now() - enter_time
;
6982 pl
->get_peering_perf().tinc(rs_getmissing_latency
, dur
);
6983 ps
->blocked_by
.clear();
6986 /*------WaitUpThru--------*/
6987 PeeringState::WaitUpThru::WaitUpThru(my_context ctx
)
6989 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/WaitUpThru")
6991 context
< PeeringMachine
>().log_enter(state_name
);
6994 boost::statechart::result
PeeringState::WaitUpThru::react(const ActMap
& am
)
6997 if (!ps
->need_up_thru
) {
6998 post_event(Activate(ps
->get_osdmap_epoch()));
7000 return forward_event();
7003 boost::statechart::result
PeeringState::WaitUpThru::react(const MLogRec
& logevt
)
7006 psdout(10) << "Noting missing from osd." << logevt
.from
<< dendl
;
7007 ps
->peer_missing
[logevt
.from
].claim(logevt
.msg
->missing
);
7008 ps
->peer_info
[logevt
.from
] = logevt
.msg
->info
;
7009 return discard_event();
7012 boost::statechart::result
PeeringState::WaitUpThru::react(const QueryState
& q
)
7014 q
.f
->open_object_section("state");
7015 q
.f
->dump_string("name", state_name
);
7016 q
.f
->dump_stream("enter_time") << enter_time
;
7017 q
.f
->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7018 q
.f
->close_section();
7019 return forward_event();
7022 void PeeringState::WaitUpThru::exit()
7024 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
7026 utime_t dur
= ceph_clock_now() - enter_time
;
7027 pl
->get_peering_perf().tinc(rs_waitupthru_latency
, dur
);
7030 /*----PeeringState::PeeringMachine Methods-----*/
7032 #define dout_prefix dpp->gen_prefix(*_dout)
7034 void PeeringState::PeeringMachine::log_enter(const char *state_name
)
7037 psdout(5) << "enter " << state_name
<< dendl
;
7038 pl
->log_state_enter(state_name
);
7041 void PeeringState::PeeringMachine::log_exit(const char *state_name
, utime_t enter_time
)
7044 utime_t dur
= ceph_clock_now() - enter_time
;
7045 psdout(5) << "exit " << state_name
<< " " << dur
<< " " << event_count
<< " " << event_time
<< dendl
;
7046 pl
->log_state_exit(state_name
, enter_time
, event_count
, event_time
);
7048 event_time
= utime_t();
7051 ostream
&operator<<(ostream
&out
, const PeeringState
&ps
) {
7052 out
<< "pg[" << ps
.info
7053 << " " << pg_vector_string(ps
.up
);
7054 if (ps
.acting
!= ps
.up
)
7055 out
<< "/" << pg_vector_string(ps
.acting
);
7057 out
<< "p" << ps
.get_primary();
7058 if (!ps
.async_recovery_targets
.empty())
7059 out
<< " async=[" << ps
.async_recovery_targets
<< "]";
7060 if (!ps
.backfill_targets
.empty())
7061 out
<< " backfill=[" << ps
.backfill_targets
<< "]";
7062 out
<< " r=" << ps
.get_role();
7063 out
<< " lpr=" << ps
.get_last_peering_reset();
7068 if (!ps
.past_intervals
.empty()) {
7069 out
<< " pi=[" << ps
.past_intervals
.get_bounds()
7070 << ")/" << ps
.past_intervals
.size();
7073 if (ps
.is_peered()) {
7074 if (ps
.last_update_ondisk
!= ps
.info
.last_update
)
7075 out
<< " luod=" << ps
.last_update_ondisk
;
7076 if (ps
.last_update_applied
!= ps
.info
.last_update
)
7077 out
<< " lua=" << ps
.last_update_applied
;
7080 if (ps
.pg_log
.get_tail() != ps
.info
.log_tail
||
7081 ps
.pg_log
.get_head() != ps
.info
.last_update
)
7082 out
<< " (info mismatch, " << ps
.pg_log
.get_log() << ")";
7084 if (!ps
.pg_log
.get_log().empty()) {
7085 if ((ps
.pg_log
.get_log().log
.begin()->version
<= ps
.pg_log
.get_tail())) {
7086 out
<< " (log bound mismatch, actual=["
7087 << ps
.pg_log
.get_log().log
.begin()->version
<< ","
7088 << ps
.pg_log
.get_log().log
.rbegin()->version
<< "]";
7093 out
<< " crt=" << ps
.pg_log
.get_can_rollback_to();
7095 if (ps
.last_complete_ondisk
!= ps
.info
.last_complete
)
7096 out
<< " lcod " << ps
.last_complete_ondisk
;
7098 out
<< " mlcod " << ps
.min_last_complete_ondisk
;
7100 out
<< " " << pg_state_string(ps
.get_state());
7101 if (ps
.should_send_notify())
7104 if (ps
.prior_readable_until_ub
!= ceph::signedspan::zero()) {
7105 out
<< " pruub " << ps
.prior_readable_until_ub
7106 << "@" << ps
.get_prior_readable_down_osds();