// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd
BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}
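// Note: the send_* helpers below choose a wire format based on the
// cluster's minimum OSD release: octopus and later peers get the v2
// message types (MOSDPGNotify2/MOSDPGQuery2/MOSDPGInfo2), while older
// peers get the legacy per-epoch message containers.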
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}
void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}
void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack));
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}}));
  }
}
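// PGPool::update refreshes the cached pool metadata from a newly published
// OSDMap; it returns early when the pool no longer exists, and only
// re-reads the snap context when the epoch sequence or snap epoch
// indicates it may have changed.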
void PGPool::update(CephContext *cct, OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  assert(map->require_osd_release >= ceph_release_t::mimic);
  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}
/*-------------Peering State Helpers----------------*/

#define dout_prefix (dpp->gen_prefix(*_dout))
#define psdout(x) ldout(cct, x)
PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}
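// start_handle/end_handle bracket the delivery of one peering event.
// While outgoing messages are blocked (begin_block_outgoing), they
// accumulate in messages_pending_flush rather than in the caller's
// PeeringCtx, and are handed back via accept_buffered_messages once
// end_block_outgoing runs.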
void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}
void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}
void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}
void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}
void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}
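// Drop recovery sources, and pending log/missing requests, that point at
// OSDs the new map reports as down.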
void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to (or currently) pulling
   * objects from are dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
       i != peer_log_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
       i != peer_missing_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}
void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}
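// purge_strays tells replicas that are no longer in up/acting to remove
// their copy of the PG, and clears the local bookkeeping about them.  It
// is skipped while a merge is pending and under the
// osd_debug_no_purge_strays debug option.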
void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
       ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}
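// Record a replica's pg_info_t received during peering.  Duplicates and
// infos from OSDs that have not stayed up since sending are discarded;
// otherwise the info is stored, history is merged, and the sender may be
// classified as a stray.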
bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}
void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_purged.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else {
      ++p;
    }
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}
void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}
void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    dirty_big_info,
    dirty_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}
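// advance_map/activate_map drive the PG across consecutive OSDMap epochs:
// advance_map feeds an AdvMap event into the state machine for each new
// map, and activate_map follows with an ActMap event once the PG has
// caught up, persisting dirty info if too many epochs have elapsed.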
void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(cct, osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval();
  last_require_osd_release = osdmap->require_osd_release;
}
void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blacklist_entries()) {
    pl->check_blacklisted_watchers();
  }
}
void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}
void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}
void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}
/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary())
    pl->clear_ready_to_merge();

  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // that it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}
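// on_new_interval resets per-interval state: it recomputes the feature
// masks of the new up/acting sets, rebuilds the missing set's deletes
// flag if needed, and re-derives the lease bounds
// (prior_readable_until_ub) for the interval that is starting.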
void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}
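// Rebuild up/acting, upset/actingset and the two primary shards from the
// raw OSD id vectors.  For erasure-coded pools the shard id is the
// position in the vector; for replicated pools it is NO_SHARD and the
// primaries come directly from the supplied ids.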
void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}
void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}
void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}
void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;

  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}
/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}
void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}
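// Worked example (the constant values here are illustrative, not quoted
// from the headers): with priority = 200, pool_recovery_priority = 3 and
// OSD_POOL_PRIORITY_MIN = -10, the pool value shifts to 13, yielding 213,
// which is then clamped into [OSD_RECOVERY_PRIORITY_MIN, max].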
unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}
unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (acting.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - acting.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}
unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}
bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}
bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}
void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}
void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      new MOSDPGLease(epoch,
                      spg_t(spgid.pgid, peer.shard),
                      get_lease()),
      epoch);
  }
}
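// proc_lease applies a lease from the primary on a replica.  The
// primary's clock values are mapped into the local clock domain using the
// heartbeat-derived peer clock delta bounds: subtracting the delta upper
// bound gives a conservative readable_until, subtracting the lower bound
// gives readable_until_ub.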
void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}
void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}
void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}
void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}
bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}
bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}
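// build_prior computes the set of OSDs to probe for authoritative PG
// info from past_intervals, classifying each interval member as UP, DNE,
// LOST or DOWN, and decides whether the monitor must record our up_thru
// before peering can proceed.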
PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }

  pl->set_probe_targets(prior.probe);
  return prior;
}
bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
  set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}
bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that only possible osds that need backfill
  // are on the backfill_targets vector nodes.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}
/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}
void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}
/**
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}
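// For EC pools the acting set is positional: slot i can only be served by
// an OSD holding shard i.  Each slot prefers the up OSD, then the acting
// OSD, then (unless restricted) any stray whose log is contiguous with
// the authoritative log; an up OSD that cannot serve its slot becomes a
// backfill target.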
void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << std::endl;
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}
/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second
     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
        auth_log_shard->second.log_tail) {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version) {
        approx_missing_objects += auth_version - primary_version;
      } else {
        approx_missing_objects += primary_version - auth_version;
      }
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects) {
        primary = auth_log_shard;
        ss << "up_primary: " << up_primary << ") has approximate "
           << approx_missing_objects
           << "(>" << force_auth_primary_missing_objects <<") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      } else {
        ss << "up_primary: " << up_primary << ") selected as primary"
           << std::endl;
      }
    } else {
      ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;
  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
                            const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p : candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    vector<int>::const_iterator acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p : candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}
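// A proposed want set is usable only if the recoverable predicate is
// satisfied by the shards present.  Below min_size, EC pools additionally
// require octopus-capable OSDs, and all pools require the
// osd_allow_recovery_below_min_size option.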
bool PeeringState::recoverable(const vector<int> &want) const
{
  unsigned num_want_acting = 0;
  set<pg_shard_t> have;
  for (int i = 0; i < (int)want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      ++num_want_acting;
      have.insert(
        pg_shard_t(
          want[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
  }

  if (num_want_acting < pool.info.min_size) {
    const bool recovery_ec_pool_below_min_size =
      HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);

    if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
      psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
      return false;
    } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
      psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
      return false;
    }
  }
  if (missing_loc.get_recoverable_predicate()(have)) {
    return true;
  } else {
    psdout(10) << __func__ << " failed, not recoverable " << dendl;
    return false;
  }
}
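// The choose_async_recovery_* helpers pull shards whose estimated
// recovery cost (approximate missing-object count, or log version delta
// on pre-nautilus maps) exceeds osd_async_recovery_min_cost out of the
// acting set, as long as the remainder stays recoverable.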
void PeeringState::choose_async_recovery_ec(
  const map<pg_shard_t, pg_info_t> &all_info,
  const pg_info_t &auth_info,
  vector<int> *want,
  set<pg_shard_t> *async_recovery,
  const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t> > candidates_by_cost;
  for (uint8_t i = 0; i < want->size(); ++i) {
    if ((*want)[i] == CRUSH_ITEM_NONE)
      continue;

    // Considering log entries to recover is accurate enough for
    // now. We could use minimum_to_decode_with_cost() later if
    // necessary.
    pg_shard_t shard_i((*want)[i], shard_id_t(i));
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // for ec pools we rollback all entries past the authoritative
    // last_update *before* activation. This is relatively inexpensive
    // compared to recovery, since it is purely local, so treat shards
    // past the authoritative last_update the same as those equal to it.
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        shard_info.stats.stats.sum.num_objects_missing;
      if (auth_version > candidate_version) {
        approx_missing_objects += auth_version - candidate_version;
      }
      if (static_cast<uint64_t>(approx_missing_objects) >
          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.emplace(approx_missing_objects, shard_i);
      }
    } else {
      if (auth_version > candidate_version &&
          (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
      }
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;

  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit) {
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
    if (recoverable(candidate_want)) {
      want->swap(candidate_want);
      async_recovery->insert(cur_shard);
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}
void PeeringState::choose_async_recovery_replicated(
  const map<pg_shard_t, pg_info_t> &all_info,
  const pg_info_t &auth_info,
  vector<int> *want,
  set<pg_shard_t> *async_recovery,
  const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t> > candidates_by_cost;
  for (auto osd_num : *want) {
    pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // use the approximate magnitude of the difference in length of
    // logs plus historical missing objects as the cost of recovery
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        shard_info.stats.stats.sum.num_objects_missing;
      if (auth_version > candidate_version) {
        approx_missing_objects += auth_version - candidate_version;
      } else {
        approx_missing_objects += candidate_version - auth_version;
      }
      if (static_cast<uint64_t>(approx_missing_objects) >
          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.emplace(approx_missing_objects, shard_i);
      }
    } else {
      size_t approx_entries;
      if (auth_version > candidate_version) {
        approx_entries = auth_version - candidate_version;
      } else {
        approx_entries = candidate_version - auth_version;
      }
      if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.insert(make_pair(approx_entries, shard_i));
      }
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;
  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit) {
    if (want->size() <= pool.info.min_size) {
      break;
    }
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
      if (*it == cur_shard.osd) {
        candidate_want.erase(it);
        want->swap(candidate_want);
        async_recovery->insert(cur_shard);
        break;
      }
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}
/*
 * calculate the desired acting, and request a change with the monitor
 * if it differs from the current acting.
 *
 * if restrict_to_up_acting=true, we filter out anything that's not in
 * up/acting.  in order to lift this restriction, we need to
 *  1) check whether it's worth switching the acting set any time we get
 *     a new pg info (not just here, when recovery finishes)
 *  2) check whether anything in want_acting went down on each new map
 *     (and, if so, calculate a new want_acting)
 *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
 */
bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
                                 bool restrict_to_up_acting,
                                 bool *history_les_bound,
                                 bool request_pg_temp_change_only)
{
  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
  all_info[pg_whoami] = info;

  if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
    for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
         p != all_info.end();
         ++p) {
      psdout(10) << __func__ << " all_info osd." << p->first << " "
                 << p->second << dendl;
    }
  }

  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
    find_best_info(all_info, restrict_to_up_acting, history_les_bound);

  if (auth_log_shard == all_info.end()) {
    if (up != acting) {
      psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
                 << " reverting to up" << dendl;
      want_acting = up;
      vector<int> empty;
      pl->queue_want_pg_temp(empty);
    } else {
      psdout(10) << __func__ << " failed" << dendl;
      ceph_assert(want_acting.empty());
    }
    return false;
  }

  ceph_assert(!auth_log_shard->second.is_incomplete());
  auth_log_shard_id = auth_log_shard->first;

  set<pg_shard_t> want_backfill, want_acting_backfill;
  vector<int> want;
  stringstream ss;
  if (pool.info.is_replicated())
    calc_replicated_acting(
      auth_log_shard,
      cct->_conf.get_val<uint64_t>(
        "osd_force_auth_primary_missing_objects"),
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      up,
      up_primary,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      get_osdmap(),
      ss);
  else
    calc_ec_acting(
      auth_log_shard,
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      up,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      ss);
  psdout(10) << ss.str() << dendl;

  if (!recoverable(want)) {
    want_acting.clear();
    return false;
  }

  set<pg_shard_t> want_async_recovery;
  if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
    if (pool.info.is_erasure()) {
      choose_async_recovery_ec(
        all_info, auth_log_shard->second, &want, &want_async_recovery,
        get_osdmap());
    } else {
      choose_async_recovery_replicated(
        all_info, auth_log_shard->second, &want, &want_async_recovery,
        get_osdmap());
    }
  }
  while (want.size() > pool.info.size) {
    // async recovery should have taken out as many osds as it can.
    // if not, then always evict the last peer
    // (will get synchronously recovered later)
    psdout(10) << __func__ << " evicting osd." << want.back()
               << " from oversized want " << want << dendl;
    want.pop_back();
  }
  if (want != acting) {
    psdout(10) << __func__ << " want " << want << " != acting " << acting
               << ", requesting pg_temp change" << dendl;
    want_acting = want;

    if (!cct->_conf->osd_debug_no_acting_change) {
      if (want_acting == up) {
        // There can't be any pending backfill if
        // want is the same as crush map up OSDs.
        ceph_assert(want_backfill.empty());
        vector<int> empty;
        pl->queue_want_pg_temp(empty);
      } else
        pl->queue_want_pg_temp(want);
    }
    return false;
  }

  if (request_pg_temp_change_only)
    return false;
  want_acting.clear();
  acting_recovery_backfill = want_acting_backfill;
  psdout(10) << "acting_recovery_backfill is "
             << acting_recovery_backfill << dendl;
  ceph_assert(
    backfill_targets.empty() ||
    backfill_targets == want_backfill);
  if (backfill_targets.empty()) {
    // Caller is GetInfo
    backfill_targets = want_backfill;
  }
  // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
  ceph_assert(
    async_recovery_targets.empty() ||
    async_recovery_targets == want_async_recovery ||
    !needs_recovery());
  if (async_recovery_targets.empty() || !needs_recovery()) {
    async_recovery_targets = want_async_recovery;
  }
  // Will not change if already set because up would have had to change
  // Verify that nothing in backfill is in stray_set
  for (set<pg_shard_t>::iterator i = want_backfill.begin();
       i != want_backfill.end();
       ++i) {
    ceph_assert(stray_set.find(*i) == stray_set.end());
  }
  psdout(10) << "choose_acting want=" << want << " backfill_targets="
             << want_backfill << " async_recovery_targets="
             << async_recovery_targets << dendl;
  return true;
}
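// Illustrative flow (hypothetical OSD ids): if the computed want is [3,1,2]
// while acting is [0,1,2], choose_acting() records want_acting, queues a
// pg_temp request (or an empty pg_temp when want matches the raw up set) and
// returns false; peering retries once the map reflects the change.  Only when
// want == acting are acting_recovery_backfill, backfill_targets and
// async_recovery_targets committed and true returned.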
void PeeringState::log_weirdness()
{
  if (pg_log.get_tail() != info.log_tail)
    pl->get_clog_error() << info.pgid
                         << " info mismatch, log.tail " << pg_log.get_tail()
                         << " != info.log_tail " << info.log_tail;
  if (pg_log.get_head() != info.last_update)
    pl->get_clog_error() << info.pgid
                         << " info mismatch, log.head " << pg_log.get_head()
                         << " != info.last_update " << info.last_update;

  if (!pg_log.get_log().empty()) {
    // sloppy check
    if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
      pl->get_clog_error() << info.pgid
                           << " log bound mismatch, info (tail,head] ("
                           << pg_log.get_tail() << ","
                           << pg_log.get_head() << "]"
                           << " actual ["
                           << pg_log.get_log().log.begin()->version << ","
                           << pg_log.get_log().log.rbegin()->version << "]";
  }

  if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
    pl->get_clog_error() << info.pgid
                         << " caller_ops.size "
                         << pg_log.get_log().caller_ops.size()
                         << " > log size " << pg_log.get_log().log.size();
  }
}
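// Reminder on the invariant being checked (hypothetical versions): the log
// covers (tail, head], so with tail 120'40 and head 120'55 the oldest
// in-memory entry must be strictly newer than 120'40; an entry at or before
// the tail, or more caller_ops than log entries, is reported to the cluster
// log above.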
/*
 * Process information from a replica to determine if it could have any
 * objects that i need.
 *
 * TODO: if the missing set becomes very large, this could get expensive.
 * Instead, we probably want to just iterate over our unfound set.
 */
bool PeeringState::search_for_missing(
  const pg_info_t &oinfo, const pg_missing_t &omissing,
  pg_shard_t from,
  PeeringCtxWrapper &ctx)
{
  uint64_t num_unfound_before = missing_loc.num_unfound();
  bool found_missing = missing_loc.add_source_info(
    from, oinfo, omissing, ctx.handle);
  if (found_missing && num_unfound_before != missing_loc.num_unfound())
    pl->publish_stats_to_osd();
  // avoid doing this if the peer is empty.  This is a bit of paranoia
  // to avoid doing something rash if add_source_info() above
  // incorrectly decided we found something new.  (if the peer has
  // last_update=0'0 that's impossible.)
  if (found_missing &&
      oinfo.last_update != eversion_t()) {
    pg_info_t tinfo(oinfo);
    tinfo.pgid.shard = pg_whoami.shard;
    ctx.send_info(
      from.osd,
      spg_t(info.pgid.pgid, from.shard),
      get_osdmap_epoch(),  // fixme: use lower epoch?
      get_osdmap_epoch(),
      tinfo);
  }
  return found_missing;
}
bool PeeringState::discover_all_missing(
  BufferedRecoveryMessages &rctx)
{
  auto &missing = pg_log.get_missing();
  uint64_t unfound = get_num_unfound();
  bool any = false;  // did we start any queries

  psdout(10) << __func__ << " "
             << missing.num_missing() << " missing, "
             << unfound << " unfound"
             << dendl;

  std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
  std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; m != mend; ++m) {
    pg_shard_t peer(*m);

    if (!get_osdmap()->is_up(peer.osd)) {
      psdout(20) << __func__ << " skipping down osd." << peer << dendl;
      continue;
    }

    if (peer_purged.count(peer)) {
      psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
      continue;
    }

    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne())) {
      // ignore empty peers
      continue;
    }

    // If we've requested any of this stuff, the pg_missing_t information
    // should be on its way.
    // TODO: coalesce requested_* into a single data structure
    if (peer_missing.find(peer) != peer_missing.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": we already have pg_missing_t" << dendl;
      continue;
    }
    if (peer_log_requested.find(peer) != peer_log_requested.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": in peer_log_requested" << dendl;
      continue;
    }
    if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": in peer_missing_requested" << dendl;
      continue;
    }

    // Request missing
    psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
               << dendl;
    peer_missing_requested.insert(peer);
    rctx.send_query(
      peer.osd,
      spg_t(info.pgid.pgid, peer.shard),
      pg_query_t(
        pg_query_t::FULLLOG,
        peer.shard, pg_whoami.shard,
        info.history, get_osdmap_epoch()));
    any = true;
  }
  return any;
}
/* Build the might_have_unfound set.
 *
 * This is used by the primary OSD during recovery.
 *
 * This set tracks the OSDs which might have unfound objects that the primary
 * OSD needs.  As we receive pg_missing_t from each OSD in might_have_unfound,
 * we will remove the OSD from the set.
 */
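/* Example (hypothetical OSD ids): if past intervals were served by acting
 * sets {0,1,2} and {0,3,2}, and a stray osd.5 has also sent us its info, the
 * resulting set is {1,2,3,5} minus ourselves: every peer that could plausibly
 * hold a copy of an object we cannot find locally.
 */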
void PeeringState::build_might_have_unfound()
{
  ceph_assert(might_have_unfound.empty());
  ceph_assert(is_primary());

  psdout(10) << __func__ << dendl;

  check_past_interval_bounds();

  might_have_unfound = past_intervals.get_might_have_unfound(
    pg_whoami,
    pool.info.is_erasure());

  // include any (stray) peers
  for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p)
    might_have_unfound.insert(p->first);

  psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
}
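// The activate() path below handles each acting/backfill peer in one of three
// ways: a peer already at our last_update gets a fresh info (or an empty log
// message), a peer whose log no longer overlaps ours is reset and marked for
// backfill, and everyone else is caught up by appending the log segment past
// its own last_update via MOSDPGLog.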
void PeeringState::activate(
  ObjectStore::Transaction& t,
  epoch_t activation_epoch,
  PeeringCtxWrapper &ctx)
{
  ceph_assert(!is_peered());

  // twiddle pg state
  state_clear(PG_STATE_DOWN);

  send_notify = false;

  if (is_primary()) {
    // only update primary last_epoch_started if we will go active
    if (acting.size() >= pool.info.min_size) {
      ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
                  info.last_epoch_started <= activation_epoch);
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  } else if (is_acting(pg_whoami)) {
    /* update last_epoch_started on acting replica to whatever the primary sent
     * unless it's smaller (could happen if we are going peered rather than
     * active, see doc/dev/osd_internals/last_epoch_started.rst) */
    if (info.last_epoch_started < activation_epoch) {
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  }

  auto &missing = pg_log.get_missing();

  min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
  if (is_primary()) {
    last_update_ondisk = info.last_update;
  }
  last_update_applied = info.last_update;
  last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();

  need_up_thru = false;

  // write pg info, log
  dirty_info = true;
  dirty_big_info = true; // maybe

  pl->schedule_event_on_commit(
    t,
    std::make_shared<PGPeeringEvent>(
      get_osdmap_epoch(),
      get_osdmap_epoch(),
      ActivateCommitted(
        get_osdmap_epoch(),
        activation_epoch)));

  // init complete pointer
  if (missing.num_missing() == 0) {
    psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
               << " -> " << info.last_update << dendl;
    info.last_complete = info.last_update;
    info.stats.stats.sum.num_objects_missing = 0;
    pg_log.reset_recovery_pointers();
  } else {
    psdout(10) << "activate - not complete, " << missing << dendl;
    info.stats.stats.sum.num_objects_missing = missing.num_missing();
    pg_log.activate_not_complete(info);
  }

  log_weirdness();

  if (is_primary()) {
    // initialize snap_trimq
    interval_set<snapid_t> to_trim;
    auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
    auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
    if (p != removed_snaps_queue.end()) {
      dout(20) << "activate - purged_snaps " << info.purged_snaps
               << " removed_snaps " << p->second
               << dendl;
      for (auto q : p->second) {
        to_trim.insert(q.first, q.second);
      }
    }
    interval_set<snapid_t> purged;
    purged.intersection_of(to_trim, info.purged_snaps);
    to_trim.subtract(purged);

    if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
      renew_lease(pl->get_mnow());
      // do not schedule until we are actually activated
    }

    // adjust purged_snaps: PG may have been inactive while snaps were pruned
    // from the removed_snaps_queue in the osdmap.  update local purged_snaps to
    // reflect only those snaps that we thought were pruned and were still in
    // the queue.
    info.purged_snaps.swap(purged);

    // start up replicas
    info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                                 prior_readable_until_ub);

    ceph_assert(!acting_recovery_backfill.empty());
    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      if (*i == pg_whoami) continue;
      pg_shard_t peer = *i;
      ceph_assert(peer_info.count(peer));
      pg_info_t& pi = peer_info[peer];

      psdout(10) << "activate peer osd." << peer << " " << pi << dendl;

      MRef<MOSDPGLog> m;
      ceph_assert(peer_missing.count(peer));
      pg_missing_t& pm = peer_missing[peer];

      bool needs_past_intervals = pi.dne();

      if (pi.last_update == info.last_update) {
        // empty log
        if (!pi.last_backfill.is_max())
          pl->get_clog_info() << info.pgid << " continuing backfill to osd."
                              << peer
                              << " from (" << pi.log_tail << "," << pi.last_update
                              << "] " << pi.last_backfill
                              << " to " << info.last_update;
        if (!pi.is_empty()) {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, queueing in pending_activators" << dendl;
          ctx.send_info(
            peer.osd,
            spg_t(info.pgid.pgid, peer.shard),
            get_osdmap_epoch(), // fixme: use lower epoch?
            get_osdmap_epoch(),
            info,
            get_lease());
        } else {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, but sending pg_log anyway" << dendl;
          m = make_message<MOSDPGLog>(
            i->shard, pg_whoami.shard,
            get_osdmap_epoch(), info,
            last_peering_reset);
        }
      } else if (
        pg_log.get_tail() > pi.last_update ||
        pi.last_backfill == hobject_t() ||
        (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
        /* ^ This last case covers a situation where a replica is not contiguous
         * with the auth_log, but is contiguous with this replica.  Reshuffling
         * the active set to handle this would be tricky, so instead we just go
         * ahead and backfill it anyway.  This is probably preferable in any
         * case since the replica in question would have to be significantly
         * behind.
         */
        // backfill
        pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
                             << " from (" << pi.log_tail << "," << pi.last_update
                             << "] " << pi.last_backfill
                             << " to " << info.last_update;

        pi.last_update = info.last_update;
        pi.last_complete = info.last_update;
        pi.set_last_backfill(hobject_t());
        pi.last_epoch_started = info.last_epoch_started;
        pi.last_interval_started = info.last_interval_started;
        pi.history = info.history;
        pi.hit_set = info.hit_set;
        // Save num_bytes for reservation request, can't be negative
        peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
        pi.stats.stats.clear();
        pi.stats.stats.sum.num_bytes = peer_bytes[peer];

        // initialize peer with our purged_snaps.
        pi.purged_snaps = info.purged_snaps;

        m = make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), pi,
          last_peering_reset /* epoch to create pg at */);

        // send some recent log, so that op dup detection works well.
        m->log.copy_up_to(cct, pg_log.get_log(),
                          cct->_conf->osd_max_pg_log_entries);
        m->info.log_tail = m->log.tail;
        pi.log_tail = m->log.tail;  // sigh...

        pm.clear();
      } else {
        // catch up
        ceph_assert(pg_log.get_tail() <= pi.last_update);
        m = make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), info,
          last_peering_reset /* epoch to create pg at */);
        // send new stuff to append to replicas log
        m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
      }

      // share past_intervals if we are creating the pg on the replica
      // based on whether our info for that peer was dne() *before*
      // updating pi.history in the backfill block above.
      if (m && needs_past_intervals)
        m->past_intervals = past_intervals;

      // update local version of peer's missing list!
      if (m && pi.last_backfill != hobject_t()) {
        for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
             p != m->log.log.end();
             ++p) {
          if (p->soid <= pi.last_backfill &&
              !p->is_error()) {
            if (perform_deletes_during_peering() && p->is_delete()) {
              pm.rm(p->soid, p->version);
            } else {
              pm.add_next_event(*p);
            }
          }
        }
      }

      if (m) {
        dout(10) << "activate peer osd." << peer << " sending " << m->log
                 << dendl;
        m->lease = get_lease();
        pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
      }

      // peer now has
      pi.last_update = info.last_update;

      // update our missing
      if (pm.num_missing() == 0) {
        pi.last_complete = pi.last_update;
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " uptodate" << dendl;
      } else {
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " missing " << pm << dendl;
      }
    }

    // Set up missing_loc
    set<pg_shard_t> complete_shards;
    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      psdout(20) << __func__ << " setting up missing_loc from shard " << *i
                 << dendl;
      if (*i == get_primary()) {
        missing_loc.add_active_missing(missing);
        if (!missing.have_missing())
          complete_shards.insert(*i);
      } else {
        auto peer_missing_entry = peer_missing.find(*i);
        ceph_assert(peer_missing_entry != peer_missing.end());
        missing_loc.add_active_missing(peer_missing_entry->second);
        if (!peer_missing_entry->second.have_missing() &&
            peer_info[*i].last_backfill.is_max())
          complete_shards.insert(*i);
      }
    }

    // If necessary, create might_have_unfound to help us find our unfound objects.
    // NOTE: It's important that we build might_have_unfound before trimming the
    // past intervals.
    might_have_unfound.clear();
    if (needs_recovery()) {
      // If only one shard has missing, we do a trick to add all the others as
      // recovery sources; this is considered safe since the PGLogs have been
      // merged locally, and covers the vast majority of use cases, like one
      // OSD/host being down for a while for hardware repair
      if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
        missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
      } else {
        missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
                                    ctx.handle);
        for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
             i != acting_recovery_backfill.end();
             ++i) {
          if (*i == pg_whoami) continue;
          psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
          ceph_assert(peer_missing.count(*i));
          ceph_assert(peer_info.count(*i));
          missing_loc.add_source_info(
            *i,
            peer_info[*i],
            peer_missing[*i],
            ctx.handle);
        }
      }
      for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
           i != peer_missing.end();
           ++i) {
        if (is_acting_recovery_backfill(i->first))
          continue;
        ceph_assert(peer_info.count(i->first));
        search_for_missing(
          peer_info[i->first],
          i->second,
          i->first,
          ctx);
      }

      build_might_have_unfound();

      // Always call now so update_calc_stats() will be accurate
      discover_all_missing(ctx.msgs);
    }

    // num_objects_degraded if calculated should reflect this too, unless no
    // missing and we are about to go clean.
    if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
      state_set(PG_STATE_UNDERSIZED);
    }

    state_set(PG_STATE_ACTIVATING);
    pl->on_activate(std::move(to_trim));
  }
  if (acting.size() >= pool.info.min_size) {
    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    pg_log.roll_forward(rollbacker.get());
  }
}
void PeeringState::share_pg_info()
{
  psdout(10) << "share_pg_info" << dendl;

  info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                               prior_readable_until_ub);

  // share new pg_info_t with replicas
  ceph_assert(!acting_recovery_backfill.empty());
  for (auto pg_shard : acting_recovery_backfill) {
    if (pg_shard == pg_whoami) continue;
    if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
      peer->second.last_epoch_started = info.last_epoch_started;
      peer->second.last_interval_started = info.last_interval_started;
      peer->second.history.merge(info.history);
    }
    Message* m = nullptr;
    if (last_require_osd_release >= ceph_release_t::octopus) {
      m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
                          info,
                          get_osdmap_epoch(),
                          get_osdmap_epoch(),
                          std::optional<pg_lease_t>{get_lease()},
                          std::nullopt};
    } else {
      m = new MOSDPGInfo{get_osdmap_epoch(),
                         {pg_notify_t{pg_shard.shard,
                                      pg_whoami.shard,
                                      get_osdmap_epoch(),
                                      get_osdmap_epoch(),
                                      info,
                                      past_intervals}}};
    }
    pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
  }
}
void PeeringState::merge_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
  pg_shard_t from)
{
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  pg_log.merge_log(
    oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
}
void PeeringState::rewind_divergent_log(
  ObjectStore::Transaction& t, eversion_t newhead)
{
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  pg_log.rewind_divergent_log(
    newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
}
void PeeringState::proc_primary_info(
  ObjectStore::Transaction &t, const pg_info_t &oinfo)
{
  ceph_assert(!is_primary());

  update_history(oinfo.history);
  if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
    info.stats.stats.sum.num_scrub_errors = 0;
    info.stats.stats.sum.num_shallow_scrub_errors = 0;
    info.stats.stats.sum.num_deep_scrub_errors = 0;
    dirty_info = true;
  }

  if (!(info.purged_snaps == oinfo.purged_snaps)) {
    psdout(10) << __func__ << " updating purged_snaps to "
               << oinfo.purged_snaps
               << dendl;
    info.purged_snaps = oinfo.purged_snaps;
    dirty_info = true;
    dirty_big_info = true;
  }
}
void PeeringState::proc_master_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo,
  pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
{
  psdout(10) << "proc_master_log for osd." << from << ": "
             << olog << " " << omissing << dendl;
  ceph_assert(!is_peered() && is_primary());

  // merge log into our own log to build master log.  no need to
  // make any adjustments to their missing map; we are taking their
  // log to be authoritative (i.e., their entries are definitely
  // non-divergent).
  merge_log(t, oinfo, olog, from);
  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now " << oinfo
             << " " << omissing << dendl;
  might_have_unfound.insert(from);

  // See doc/dev/osd_internals/last_epoch_started
  if (oinfo.last_epoch_started > info.last_epoch_started) {
    info.last_epoch_started = oinfo.last_epoch_started;
    dirty_info = true;
  }
  if (oinfo.last_interval_started > info.last_interval_started) {
    info.last_interval_started = oinfo.last_interval_started;
    dirty_info = true;
  }

  update_history(oinfo.history);
  ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
              info.last_epoch_started >= info.history.last_epoch_started);

  peer_missing[from].claim(omissing);
}
void PeeringState::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from)
{
  psdout(10) << "proc_replica_log for osd." << from << ": "
             << oinfo << " " << olog << " " << omissing << dendl;

  pg_log.proc_replica_log(oinfo, olog, omissing, from);

  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now "
             << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  for (map<hobject_t, pg_missing_item>::const_iterator i =
         omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    psdout(20) << " after missing " << i->first
               << " need " << i->second.need
               << " have " << i->second.have << dendl;
  }
  peer_missing[from].claim(omissing);
}
void PeeringState::fulfill_info(
  pg_shard_t from, const pg_query_t &query,
  pair<pg_shard_t, pg_info_t> &notify_info)
{
  ceph_assert(from == primary);
  ceph_assert(query.type == pg_query_t::INFO);

  // info
  psdout(10) << "sending info" << dendl;
  notify_info = make_pair(from, info);
}
void PeeringState::fulfill_log(
  pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
{
  psdout(10) << "log request from " << from << dendl;
  ceph_assert(from == primary);
  ceph_assert(query.type != pg_query_t::INFO);

  MOSDPGLog *mlog = new MOSDPGLog(
    from.shard, pg_whoami.shard,
    get_osdmap_epoch(),
    info, query_epoch);
  mlog->missing = pg_log.get_missing();

  // primary -> other, when building master log
  if (query.type == pg_query_t::LOG) {
    psdout(10) << " sending info+missing+log since " << query.since
               << dendl;
    if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
      pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
                           << query.since
                           << " when my log.tail is " << pg_log.get_tail()
                           << ", sending full log instead";
      mlog->log = pg_log.get_log();  // primary should not have requested this!!
    } else
      mlog->log.copy_after(cct, pg_log.get_log(), query.since);
  }
  else if (query.type == pg_query_t::FULLLOG) {
    psdout(10) << " sending info+missing+full log" << dendl;
    mlog->log = pg_log.get_log();
  }

  psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;

  pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
}
void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
{
  if (query.query.type == pg_query_t::INFO) {
    pair<pg_shard_t, pg_info_t> notify_info;
    // note this refreshes our prior_readable_until_ub value
    update_history(query.query.history);
    fulfill_info(query.from, query.query, notify_info);
    rctx.send_notify(
      notify_info.first.osd,
      pg_notify_t(
        notify_info.first.shard, pg_whoami.shard,
        query.query_epoch,
        get_osdmap_epoch(),
        notify_info.second,
        past_intervals));
  } else {
    update_history(query.query.history);
    fulfill_log(query.from, query.query, query.query_epoch);
  }
}
void PeeringState::try_mark_clean()
{
  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
    state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    state_set(PG_STATE_CLEAN);
    info.history.last_epoch_clean = get_osdmap_epoch();
    info.history.last_interval_clean = info.history.same_interval_since;
    past_intervals.clear();
    dirty_big_info = true;
    dirty_info = true;
  }

  if (!is_active() && is_peered()) {
    if (is_clean()) {
      bool target;
      if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
        if (target) {
          psdout(10) << "ready to merge (target)" << dendl;
          pl->set_ready_to_merge_target(
            info.last_update,
            info.history.last_epoch_started,
            info.history.last_epoch_clean);
        } else {
          psdout(10) << "ready to merge (source)" << dendl;
          pl->set_ready_to_merge_source(info.last_update);
        }
      }
    } else {
      psdout(10) << "not clean, not ready to merge" << dendl;
      // we should have notified OSD in Active state entry point
    }
  }

  state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);

  share_pg_info();
  pl->publish_stats_to_osd();
  clear_recovery_state();
}
void PeeringState::split_into(
  pg_t child_pgid, PeeringState *child, unsigned split_bits)
{
  child->update_osdmap_ref(get_osdmap());
  child->pool = pool;

  // Log
  pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
  child->info.last_complete = info.last_complete;

  info.last_update = pg_log.get_head();
  child->info.last_update = child->pg_log.get_head();

  child->info.last_user_version = info.last_user_version;

  info.log_tail = pg_log.get_tail();
  child->info.log_tail = child->pg_log.get_tail();

  // reset last_complete, we might have modified pg_log & missing above
  pg_log.reset_complete_to(&info);
  child->pg_log.reset_complete_to(&child->info);

  // Info
  child->info.history = info.history;
  child->info.history.epoch_created = get_osdmap_epoch();
  child->info.purged_snaps = info.purged_snaps;

  if (info.last_backfill.is_max()) {
    child->info.set_last_backfill(hobject_t::get_max());
  } else {
    // restart backfill on parent and child to be safe.  we could
    // probably do better in the bitwise sort case, but it's more
    // fragile (there may be special work to do on backfill completion
    // in the future).
    info.set_last_backfill(hobject_t());
    child->info.set_last_backfill(hobject_t());
    // restarting backfill implies that the missing set is empty,
    // since it is only used for objects prior to last_backfill
    pg_log.reset_backfill();
    child->pg_log.reset_backfill();
  }

  child->info.stats = info.stats;
  child->info.stats.parent_split_bits = split_bits;
  info.stats.stats_invalid = true;
  child->info.stats.stats_invalid = true;
  child->info.last_epoch_started = info.last_epoch_started;
  child->info.last_interval_started = info.last_interval_started;

  // There can't be recovery/backfill going on now
  int primary, up_primary;
  vector<int> newup, newacting;
  get_osdmap()->pg_to_up_acting_osds(
    child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
  child->init_primary_up_acting(
    newup,
    newacting,
    up_primary,
    primary);
  child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);

  // this comparison includes primary rank via pg_shard_t
  if (get_primary() != child->get_primary())
    child->info.history.same_primary_since = get_osdmap_epoch();

  child->info.stats.up = up;
  child->info.stats.up_primary = up_primary;
  child->info.stats.acting = acting;
  child->info.stats.acting_primary = primary;
  child->info.stats.mapping_epoch = get_osdmap_epoch();

  // History
  child->past_intervals = past_intervals;

  child->on_new_interval();

  child->send_notify = !child->is_primary();

  child->dirty_info = true;
  child->dirty_big_info = true;
  dirty_info = true;
  dirty_big_info = true;
}
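// Split example (hypothetical pg): when a parent splits into a child with
// split_bits=3, the child inherits the parent's history (with epoch_created
// reset), purged_snaps and stats, both halves mark their stats invalid, and
// backfill progress is kept only if the parent had already finished it
// (last_backfill at max); otherwise both sides restart backfill from scratch.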
void PeeringState::merge_from(
  map<spg_t,PeeringState *>& sources,
  PeeringCtx &rctx,
  unsigned split_bits,
  const pg_merge_meta_t& last_pg_merge_meta)
{
  bool incomplete = false;
  if (info.last_complete != info.last_update ||
      info.is_incomplete() ||
      info.dne()) {
    psdout(10) << __func__ << " target incomplete" << dendl;
    incomplete = true;
  }
  if (last_pg_merge_meta.source_pgid != pg_t()) {
    if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
      psdout(10) << __func__ << " target doesn't match expected parent "
                 << last_pg_merge_meta.source_pgid.get_parent()
                 << " of source_pgid " << last_pg_merge_meta.source_pgid
                 << dendl;
      incomplete = true;
    }
    if (info.last_update != last_pg_merge_meta.target_version) {
      psdout(10) << __func__ << " target version doesn't match expected "
                 << last_pg_merge_meta.target_version << dendl;
      incomplete = true;
    }
  }

  PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
  pg_log.roll_forward(handler.get());

  info.last_complete = info.last_update;  // to fake out trim()
  pg_log.reset_recovery_pointers();
  pg_log.trim(info.last_update, info);

  vector<PGLog*> log_from;
  for (auto& i : sources) {
    auto& source = i.second;
    if (!source) {
      psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
      incomplete = true;
      continue;
    }
    if (source->info.last_complete != source->info.last_update ||
        source->info.is_incomplete() ||
        source->info.dne()) {
      psdout(10) << __func__ << " source " << source->pg_whoami
                 << " incomplete"
                 << dendl;
      incomplete = true;
    }
    if (last_pg_merge_meta.source_pgid != pg_t()) {
      if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
        dout(10) << __func__ << " source " << source->info.pgid.pgid
                 << " doesn't match expected source pgid "
                 << last_pg_merge_meta.source_pgid << dendl;
        incomplete = true;
      }
      if (source->info.last_update != last_pg_merge_meta.source_version) {
        dout(10) << __func__ << " source version doesn't match expected "
                 << last_pg_merge_meta.target_version << dendl;
        incomplete = true;
      }
    }

    // prepare log
    PGLog::LogEntryHandlerRef handler{
      source->pl->get_log_handler(rctx.transaction)};
    source->pg_log.roll_forward(handler.get());
    source->info.last_complete = source->info.last_update;  // to fake out trim()
    source->pg_log.reset_recovery_pointers();
    source->pg_log.trim(source->info.last_update, source->info);
    log_from.push_back(&source->pg_log);

    // combine stats
    info.stats.add(source->info.stats);

    // pull up last_update
    info.last_update = std::max(info.last_update, source->info.last_update);

    // adopt source's PastIntervals if target has none.  we can do this since
    // pgp_num has been reduced prior to the merge, so the OSD mappings for
    // the PGs are identical.
    if (past_intervals.empty() && !source->past_intervals.empty()) {
      psdout(10) << __func__ << " taking source's past_intervals" << dendl;
      past_intervals = source->past_intervals;
    }
  }

  info.last_complete = info.last_update;
  info.log_tail = info.last_update;
  if (incomplete) {
    info.last_backfill = hobject_t();
  }

  // merge logs
  pg_log.merge_from(log_from, info.last_update);

  // make sure we have a meaningful last_epoch_started/clean (if we were a
  // placeholder)
  if (info.history.epoch_created == 0) {
    // start with (a) source's history, since these PGs *should* have been
    // remapped in concert with each other...
    info.history = sources.begin()->second->info.history;

    // we use the last_epoch_{started,clean} we got from
    // the caller, which are the epochs that were reported by the PGs when
    // they were found to be ready for merge.
    info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
    info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
    info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
    psdout(10) << __func__
               << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
               << last_pg_merge_meta.last_epoch_clean
               << " from pool last_dec_*, source pg history was "
               << sources.begin()->second->info.history
               << dendl;

    // if the past_intervals start is later than last_epoch_clean, it
    // implies the source repeered again but the target didn't, or
    // that the source became clean in a later epoch than the target.
    // avoid the discrepancy by adjusting the interval start
    // backwards to match so that check_past_interval_bounds() will
    // not complain.
    auto pib = past_intervals.get_bounds();
    if (info.history.last_epoch_clean < pib.first) {
      psdout(10) << __func__ << " last_epoch_clean "
                 << info.history.last_epoch_clean << " < past_interval start "
                 << pib.first << ", adjusting start backwards" << dendl;
      past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
    }

    // Similarly, if the same_interval_since value is later than
    // last_epoch_clean, the next interval change will result in a
    // past_interval start that is later than last_epoch_clean.  This
    // can happen if we use the pg_history values from the merge
    // source.  Adjust the same_interval_since value backwards if that
    // happens.  (We trust the les and lec values more because they came from
    // the real target, whereas the history value we stole from the source.)
    if (info.history.last_epoch_started < info.history.same_interval_since) {
      psdout(10) << __func__ << " last_epoch_started "
                 << info.history.last_epoch_started << " < same_interval_since "
                 << info.history.same_interval_since
                 << ", adjusting pg_history backwards" << dendl;
      info.history.same_interval_since = info.history.last_epoch_clean;
      // make sure same_{up,primary}_since are <= same_interval_since
      info.history.same_up_since = std::min(
        info.history.same_up_since, info.history.same_interval_since);
      info.history.same_primary_since = std::min(
        info.history.same_primary_since, info.history.same_interval_since);
    }
  }

  dirty_info = true;
  dirty_big_info = true;
}
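// Merge-target bookkeeping note: when the target was only a placeholder
// (epoch_created == 0) it adopts a source's pg_history, while the
// last_epoch_started/clean values come from last_pg_merge_meta; the two
// backward adjustments above only exist to keep the past_intervals start and
// same_interval_since from ending up later than last_epoch_clean, which
// check_past_interval_bounds() would otherwise flag.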
void PeeringState::start_split_stats(
  const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
{
  out->resize(childpgs.size() + 1);
  info.stats.stats.sum.split(*out);
}
void PeeringState::finish_split_stats(
  const object_stat_sum_t& stats, ObjectStore::Transaction &t)
{
  info.stats.stats.sum = stats;
  write_if_dirty(t);
}
void PeeringState::update_blocked_by()
{
  // set a max on the number of blocking peers we report.  if we go
  // over, report a random subset.  keep the result sorted.
  unsigned keep = std::min<unsigned>(
    blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
  unsigned skip = blocked_by.size() - keep;
  info.stats.blocked_by.clear();
  info.stats.blocked_by.resize(keep);
  unsigned pos = 0;
  for (set<int>::iterator p = blocked_by.begin();
       p != blocked_by.end() && keep > 0;
       ++p) {
    if (skip > 0 && (rand() % (skip + keep) < skip)) {
      --skip;
    } else {
      info.stats.blocked_by[pos++] = *p;
      --keep;
    }
  }
}
static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
{
  for (auto& p : pgs)
    if (p.shard == shard)
      return true;
  return false;
}

static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
{
  for (auto& p : pgs) {
    if (p == skip)
      continue;
    if (p.shard == shard)
      return p;
  }
  return pg_shard_t();
}
void PeeringState::update_calc_stats()
{
  info.stats.version = info.last_update;
  info.stats.created = info.history.epoch_created;
  info.stats.last_scrub = info.history.last_scrub;
  info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
  info.stats.last_deep_scrub = info.history.last_deep_scrub;
  info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
  info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
  info.stats.last_epoch_clean = info.history.last_epoch_clean;

  info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
  info.stats.ondisk_log_size = info.stats.log_size;
  info.stats.log_start = pg_log.get_tail();
  info.stats.ondisk_log_start = pg_log.get_tail();
  info.stats.snaptrimq_len = pl->get_snap_trimq_size();

  unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);

  // In rare case that upset is too large (usually transient), use as target
  // for calculations below.
  unsigned target = std::max(num_shards, (unsigned)upset.size());
  // For undersized actingset may be larger with OSDs out
  unsigned nrep = std::max(actingset.size(), upset.size());
  // calc num_object_copies
  info.stats.stats.calc_copies(std::max(target, nrep));
  info.stats.stats.sum.num_objects_degraded = 0;
  info.stats.stats.sum.num_objects_unfound = 0;
  info.stats.stats.sum.num_objects_misplaced = 0;
  info.stats.avail_no_missing.clear();
  info.stats.object_location_counts.clear();

  // We should never hit this condition, but if we end up hitting it,
  // make sure to update num_objects and set PG_STATE_INCONSISTENT.
  if (info.stats.stats.sum.num_objects < 0) {
    psdout(0) << __func__ << " negative num_objects = "
              << info.stats.stats.sum.num_objects << " setting it to 0 "
              << dendl;
    info.stats.stats.sum.num_objects = 0;
    state_set(PG_STATE_INCONSISTENT);
  }

  if ((is_remapped() || is_undersized() || !is_clean()) &&
      (is_peered() || is_activating())) {
    psdout(20) << __func__ << " actingset " << actingset << " upset "
               << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;

    ceph_assert(!acting_recovery_backfill.empty());

    bool estimate = false;

    // NOTE: we only generate degraded, misplaced and unfound
    // values for the summation, not individual stat categories.
    int64_t num_objects = info.stats.stats.sum.num_objects;

    // Objects missing from up nodes, sorted by # objects.
    boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
    // Objects missing from nodes not in up, sorted by # objects
    boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;

    // Fill missing_target_objects/acting_source_objects

    {
      int64_t missing;

      // Primary first
      missing = pg_log.get_missing().num_missing();
      ceph_assert(acting_recovery_backfill.count(pg_whoami));
      if (upset.count(pg_whoami)) {
        missing_target_objects.emplace(missing, pg_whoami);
      } else {
        acting_source_objects.emplace(missing, pg_whoami);
      }
      info.stats.stats.sum.num_objects_missing_on_primary = missing;
      if (missing == 0)
        info.stats.avail_no_missing.push_back(pg_whoami);
      psdout(20) << __func__ << " shard " << pg_whoami
                 << " primary objects " << num_objects
                 << " missing " << missing
                 << dendl;
    }

    // All other peers
    for (auto& peer : peer_info) {
      // Primary should not be in the peer_info, skip if it is.
      if (peer.first == pg_whoami) continue;
      int64_t missing = 0;
      int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
      // Backfill targets always track num_objects accurately
      // all other peers track missing accurately.
      if (is_backfill_target(peer.first)) {
        missing = std::max((int64_t)0, num_objects - peer_num_objects);
      } else {
        if (peer_missing.count(peer.first)) {
          missing = peer_missing[peer.first].num_missing();
        } else {
          psdout(20) << __func__ << " no peer_missing found for "
                     << peer.first << dendl;
          if (is_recovering()) {
            estimate = true;
          }
          missing = std::max((int64_t)0, num_objects - peer_num_objects);
        }
      }
      if (upset.count(peer.first)) {
        missing_target_objects.emplace(missing, peer.first);
      } else if (actingset.count(peer.first)) {
        acting_source_objects.emplace(missing, peer.first);
      }
      peer.second.stats.stats.sum.num_objects_missing = missing;
      if (missing == 0)
        info.stats.avail_no_missing.push_back(peer.first);
      psdout(20) << __func__ << " shard " << peer.first
                 << " objects " << peer_num_objects
                 << " missing " << missing
                 << dendl;
    }

    // Compute object_location_counts
    for (auto& ml : missing_loc.get_missing_locs()) {
      info.stats.object_location_counts[ml.second]++;
      psdout(30) << __func__ << " " << ml.first << " object_location_counts["
                 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
                 << dendl;
    }
    int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
    if (not_missing) {
      // During recovery we know upset == actingset and is being populated
      // During backfill we know that all non-missing objects are in the actingset
      info.stats.object_location_counts[actingset] = not_missing;
    }
    psdout(30) << __func__ << " object_location_counts["
               << upset << "]=" << info.stats.object_location_counts[upset]
               << dendl;
    psdout(20) << __func__ << " object_location_counts "
               << info.stats.object_location_counts << dendl;

    // A misplaced object is not stored on the correct OSD
    int64_t misplaced = 0;
    // a degraded object has fewer replicas or EC shards than the pool specifies.
    int64_t degraded = 0;

    if (is_recovering()) {
      for (auto& sml : missing_loc.get_missing_by_count()) {
        for (auto& ml : sml.second) {
          int missing_shards;
          if (sml.first == shard_id_t::NO_SHARD) {
            psdout(20) << __func__ << " ml " << ml.second
                       << " upset size " << upset.size()
                       << " up " << ml.first.up << dendl;
            missing_shards = (int)upset.size() - ml.first.up;
          } else {
            // Handle shards not even in upset below
            if (!find_shard(upset, sml.first))
              continue;
            missing_shards = std::max(0, 1 - ml.first.up);
            psdout(20) << __func__
                       << " shard " << sml.first
                       << " ml " << ml.second
                       << " missing shards " << missing_shards << dendl;
          }
          int odegraded = ml.second * missing_shards;
          // Copies on other osds but limited to the possible degraded
          int more_osds = std::min(missing_shards, ml.first.other);
          int omisplaced = ml.second * more_osds;
          ceph_assert(omisplaced <= odegraded);
          odegraded -= omisplaced;

          misplaced += omisplaced;
          degraded += odegraded;
        }
      }

      psdout(20) << __func__ << " missing based degraded "
                 << degraded << dendl;
      psdout(20) << __func__ << " missing based misplaced "
                 << misplaced << dendl;

      // Handle undersized case
      if (pool.info.is_replicated()) {
        // Add degraded for missing targets (num_objects missing)
        ceph_assert(target >= upset.size());
        unsigned needed = target - upset.size();
        degraded += num_objects * needed;
      } else {
        for (unsigned i = 0 ; i < num_shards; ++i) {
          shard_id_t shard(i);

          if (!find_shard(upset, shard)) {
            pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);

            if (pgs != pg_shard_t()) {
              int64_t missing;

              if (pgs == pg_whoami)
                missing = info.stats.stats.sum.num_objects_missing_on_primary;
              else
                missing = peer_info[pgs].stats.stats.sum.num_objects_missing;

              degraded += missing;
              misplaced += std::max((int64_t)0, num_objects - missing);
            } else {
              // No shard anywhere
              degraded += num_objects;
            }
          }
        }
      }
      goto out;
    }

    // Handle undersized case
    if (pool.info.is_replicated()) {
      // Add to missing_target_objects
      ceph_assert(target >= missing_target_objects.size());
      unsigned needed = target - missing_target_objects.size();
      if (needed)
        missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
    } else {
      for (unsigned i = 0 ; i < num_shards; ++i) {
        shard_id_t shard(i);
        bool found = false;
        for (const auto& t : missing_target_objects) {
          if (std::get<1>(t).shard == shard) {
            found = true;
            break;
          }
        }
        if (!found)
          missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD, shard));
      }
    }

    for (const auto& item : missing_target_objects)
      psdout(20) << __func__ << " missing shard " << std::get<1>(item)
                 << " missing= " << std::get<0>(item) << dendl;
    for (const auto& item : acting_source_objects)
      psdout(20) << __func__ << " acting shard " << std::get<1>(item)
                 << " missing= " << std::get<0>(item) << dendl;

    // Handle all objects not in missing for remapped
    // or backfill
    for (auto m = missing_target_objects.rbegin();
         m != missing_target_objects.rend(); ++m) {

      int64_t extra_missing = -1;

      if (pool.info.is_replicated()) {
        if (!acting_source_objects.empty()) {
          auto extra_copy = acting_source_objects.begin();
          extra_missing = std::get<0>(*extra_copy);
          acting_source_objects.erase(extra_copy);
        }
      } else {  // Erasure coded
        // Use corresponding shard
        for (const auto& a : acting_source_objects) {
          if (std::get<1>(a).shard == std::get<1>(*m).shard) {
            extra_missing = std::get<0>(a);
            acting_source_objects.erase(a);
            break;
          }
        }
      }

      if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
        // We don't know which of the objects on the target
        // are part of extra_missing so assume they are all degraded.
        misplaced += std::get<0>(*m) - extra_missing;
        degraded += extra_missing;
      } else {
        // 1. extra_missing == -1, more targets than sources so degraded
        // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
        //    previously degraded are now present on the target.
        degraded += std::get<0>(*m);
      }
    }
    // If there are still acting that haven't been accounted for
    // then they are misplaced
    for (const auto& a : acting_source_objects) {
      int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
      psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
                 << dendl;
      misplaced += extra_misplaced;
    }
 out:
    // NOTE: Tests use these messages to verify this code
    psdout(20) << __func__ << " degraded " << degraded
               << (estimate ? " (est)": "") << dendl;
    psdout(20) << __func__ << " misplaced " << misplaced
               << (estimate ? " (est)": "") << dendl;

    info.stats.stats.sum.num_objects_degraded = degraded;
    info.stats.stats.sum.num_objects_unfound = get_num_unfound();
    info.stats.stats.sum.num_objects_misplaced = misplaced;
  }
}
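// Worked example (hypothetical counts): a size-3 replicated PG with
// num_objects=100 and only two up shards, one of which is missing 20 objects,
// accumulates roughly 100 degraded object copies for the absent shard plus 20
// for the lagging one; copies that exist on an OSD outside the up set are
// counted as misplaced rather than degraded.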
std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
  bool pg_stats_publish_valid,
  const pg_stat_t &pg_stats_publish,
  const object_stat_collection_t &unstable_stats)
{
  if (info.stats.stats.sum.num_scrub_errors) {
    state_set(PG_STATE_INCONSISTENT);
  } else {
    state_clear(PG_STATE_INCONSISTENT);
    state_clear(PG_STATE_FAILED_REPAIR);
  }

  utime_t now = ceph_clock_now();
  if (info.stats.state != state) {
    info.stats.last_change = now;
    // Optimistic estimation: if we just found out a PG went inactive,
    // assume it was active until now.
    if (!(state & PG_STATE_ACTIVE) &&
        (info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_active = now;

    if ((state & PG_STATE_ACTIVE) &&
        !(info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_became_active = now;
    if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
        !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
      info.stats.last_became_peered = now;
    info.stats.state = state;
  }

  update_calc_stats();
  if (info.stats.stats.sum.num_objects_degraded) {
    state_set(PG_STATE_DEGRADED);
  } else {
    state_clear(PG_STATE_DEGRADED);
  }
  update_blocked_by();

  pg_stat_t pre_publish = info.stats;
  pre_publish.stats.add(unstable_stats);
  utime_t cutoff = now;
  cutoff -= cct->_conf->osd_pg_stat_report_interval_max;

  // share (some of) our purged_snaps via the pg_stats.  limit # of intervals
  // because we don't want to make the pg_stat_t structures too expensive.
  unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
  unsigned num = 0;
  auto i = info.purged_snaps.begin();
  while (num < max && i != info.purged_snaps.end()) {
    pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
    ++num;
    ++i;
  }
  psdout(20) << __func__ << " reporting purged_snaps "
             << pre_publish.purged_snaps << dendl;

  if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
      info.stats.last_fresh > cutoff) {
    psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
               << ": no change since " << info.stats.last_fresh << dendl;
    return std::nullopt;
  } else {
    // update our stat summary and timestamps
    info.stats.reported_epoch = get_osdmap_epoch();
    ++info.stats.reported_seq;

    info.stats.last_fresh = now;

    if (info.stats.state & PG_STATE_CLEAN)
      info.stats.last_clean = now;
    if (info.stats.state & PG_STATE_ACTIVE)
      info.stats.last_active = now;
    if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
      info.stats.last_peered = now;
    info.stats.last_unstale = now;
    if ((info.stats.state & PG_STATE_DEGRADED) == 0)
      info.stats.last_undegraded = now;
    if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
      info.stats.last_fullsized = now;

    psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
               << ":" << pg_stats_publish.reported_seq << dendl;
    return std::make_optional(std::move(pre_publish));
  }
}
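// Publication throttle: stats are republished when anything in pre_publish
// changed or when last_fresh has fallen behind now minus
// osd_pg_stat_report_interval_max; otherwise std::nullopt tells the caller to
// skip this reporting round.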
void PeeringState::init(
  int role,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  const pg_history_t& history,
  const PastIntervals& pi,
  bool backfill,
  ObjectStore::Transaction &t)
{
  psdout(10) << "init role " << role << " up "
             << newup << " acting " << newacting
             << " history " << history
             << " past_intervals " << pi
             << dendl;

  set_role(role);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  info.history = history;
  past_intervals = pi;

  info.stats.up = up;
  info.stats.up_primary = new_up_primary;
  info.stats.acting = acting;
  info.stats.acting_primary = new_acting_primary;
  info.stats.mapping_epoch = info.history.same_interval_since;

  if (!perform_deletes_during_peering()) {
    pg_log.set_missing_may_contain_deletes();
  }

  if (backfill) {
    psdout(10) << __func__ << ": Setting backfill" << dendl;
    info.set_last_backfill(hobject_t());
    info.last_complete = info.last_update;
    pg_log.mark_log_for_rewrite();
  }

  on_new_interval();

  dirty_info = true;
  dirty_big_info = true;
  write_if_dirty(t);
}
void PeeringState::dump_peering_state(Formatter *f)
{
  f->dump_string("state", get_pg_state_string());
  f->dump_unsigned("epoch", get_osdmap_epoch());
  f->open_array_section("up");
  for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
    f->dump_unsigned("osd", *p);
  f->close_section();
  f->open_array_section("acting");
  for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
    f->dump_unsigned("osd", *p);
  f->close_section();
  if (!backfill_targets.empty()) {
    f->open_array_section("backfill_targets");
    for (set<pg_shard_t>::iterator p = backfill_targets.begin();
         p != backfill_targets.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  if (!async_recovery_targets.empty()) {
    f->open_array_section("async_recovery_targets");
    for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
         p != async_recovery_targets.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  if (!acting_recovery_backfill.empty()) {
    f->open_array_section("acting_recovery_backfill");
    for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
         p != acting_recovery_backfill.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  f->open_object_section("info");
  update_calc_stats();
  info.dump(f);
  f->close_section();

  f->open_array_section("peer_info");
  for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    f->open_object_section("info");
    f->dump_stream("peer") << p->first;
    p->second.dump(f);
    f->close_section();
  }
  f->close_section();
}
void PeeringState::update_stats(
  std::function<bool(pg_history_t &, pg_stat_t &)> f,
  ObjectStore::Transaction *t) {
  if (f(info.history, info.stats)) {
    pl->publish_stats_to_osd();
  }
  pl->on_info_history_change();

  if (t) {
    dirty_info = true;
    write_if_dirty(*t);
  }
}
bool PeeringState::append_log_entries_update_missing(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
  std::optional<eversion_t> roll_forward_to)
{
  ceph_assert(!entries.empty());
  ceph_assert(entries.begin()->version > info.last_update);

  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  bool invalidate_stats =
    pg_log.append_new_log_entries(
      info.last_backfill,
      entries,
      rollbacker.get());

  if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
    pg_log.roll_forward(rollbacker.get());
  }
  if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
    last_rollback_info_trimmed_to_applied = *roll_forward_to;
  }

  info.last_update = pg_log.get_head();

  if (pg_log.get_missing().num_missing() == 0) {
    // advance last_complete since nothing else is missing!
    info.last_complete = info.last_update;
  }
  info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;

  psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
             << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
  if (trim_to)
    pg_log.trim(*trim_to, info);
  dirty_info = true;
  write_if_dirty(t);
  return invalidate_stats;
}
void PeeringState::merge_new_log_entries(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObjectStore::Transaction &t,
  std::optional<eversion_t> trim_to,
  std::optional<eversion_t> roll_forward_to)
{
  psdout(10) << __func__ << " " << entries << dendl;
  ceph_assert(is_primary());

  bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
  for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
       i != acting_recovery_backfill.end();
       ++i) {
    pg_shard_t peer(*i);
    if (peer == pg_whoami) continue;
    ceph_assert(peer_missing.count(peer));
    ceph_assert(peer_info.count(peer));
    pg_missing_t& pmissing(peer_missing[peer]);
    psdout(20) << __func__ << " peer_missing for " << peer
               << " = " << pmissing << dendl;
    pg_info_t& pinfo(peer_info[peer]);
    bool invalidate_stats = PGLog::append_log_entries_update_missing(
      pinfo.last_backfill,
      entries,
      true,
      NULL,
      pmissing,
      NULL,
      dpp);
    pinfo.last_update = info.last_update;
    pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
    rebuild_missing = rebuild_missing || invalidate_stats;
  }

  if (!rebuild_missing) {
    return;
  }

  for (auto &&i : entries) {
    missing_loc.rebuild(
      i.soid,
      pg_whoami,
      acting_recovery_backfill,
      info,
      pg_log.get_missing(),
      peer_missing,
      peer_info);
  }
}
void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
{
  // raise last_complete only if we were previously up to date
  if (info.last_complete == info.last_update)
    info.last_complete = e.version;

  // raise last_update.
  ceph_assert(e.version > info.last_update);
  info.last_update = e.version;

  // raise user_version, if it increased (it may not have been bumped
  // by all logged updates)
  if (e.user_version > info.last_user_version)
    info.last_user_version = e.user_version;

  // log mutation
  pg_log.add(e, applied);
  psdout(10) << "add_log_entry " << e << dendl;
}
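// append_log() is the write-side entry point: it first folds last_epoch_started
// and last_interval_started into info.history so this write cannot be
// remembered without the interval that made it possible, then appends each
// entry and trims the log.  Backfill / async-recovery peers
// (transaction_applied == false) only advance the crt via skip_rollforward()
// instead of doing a real rollforward.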
void PeeringState::append_log(
  const vector<pg_log_entry_t>& logv,
  eversion_t trim_to,
  eversion_t roll_forward_to,
  eversion_t mlcod,
  ObjectStore::Transaction &t,
  bool transaction_applied,
  bool async)
{
  /* The primary has sent an info updating the history, but it may not
   * have arrived yet.  We want to make sure that we cannot remember this
   * write without remembering that it happened in an interval which went
   * active in epoch history.last_epoch_started.
   */
  if (info.last_epoch_started != info.history.last_epoch_started) {
    info.history.last_epoch_started = info.last_epoch_started;
  }
  if (info.last_interval_started != info.history.last_interval_started) {
    info.history.last_interval_started = info.last_interval_started;
  }
  psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;

  PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
  if (!transaction_applied) {
    /* We must be a backfill or async recovery peer, so it's ok if we apply
     * out-of-turn since we won't be considered when
     * determining a min possible last_update.
     *
     * We skip_rollforward() here, which advances the crt, without
     * doing an actual rollforward. This avoids cleaning up entries
     * from the backend and we do not end up in a situation, where the
     * object is deleted before we can _merge_object_divergent_entries().
     */
    pg_log.skip_rollforward();
  }

  for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
       p != logv.end();
       ++p) {
    add_log_entry(*p, transaction_applied);

    /* We don't want to leave the rollforward artifacts around
     * here past last_backfill.  It's ok for the same reason as
     * above */
    if (transaction_applied &&
        p->soid > info.last_backfill) {
      pg_log.roll_forward(handler.get());
    }
  }

  if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(
      roll_forward_to,
      handler.get());
    last_rollback_info_trimmed_to_applied = roll_forward_to;
  }

  psdout(10) << __func__ << " approx pg log length = "
             << pg_log.get_log().approx_size() << dendl;
  psdout(10) << __func__ << " transaction_applied = "
             << transaction_applied << dendl;
  if (!transaction_applied || async)
    psdout(10) << __func__ << " " << pg_whoami
               << " is async_recovery or backfill target" << dendl;
  pg_log.trim(trim_to, info, transaction_applied, async);

  // update the local pg, pg log
  // ...
  min_last_complete_ondisk = mlcod;
}
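// recover_got() records that a missing object has been recovered locally:
// can_rollback_to is advanced past the recovered version if needed (a repair
// corner case), last_complete is recomputed via pg_log.recover_got(), and the
// object is registered as locally present in missing_loc.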
void PeeringState::recover_got(
  const hobject_t &oid, eversion_t v,
  bool is_delete,
  ObjectStore::Transaction &t)
{
  if (v > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
    pg_log.roll_forward_to(v, handler.get());
  }

  psdout(10) << "got missing " << oid << " v " << v << dendl;
  pg_log.recover_got(oid, v, info);
  if (pg_log.get_log().log.empty()) {
    psdout(10) << "last_complete now " << info.last_complete
               << " while log is empty" << dendl;
  } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
    psdout(10) << "last_complete now " << info.last_complete
               << " log.complete_to " << pg_log.get_log().complete_to->version
               << dendl;
  } else {
    psdout(10) << "last_complete now " << info.last_complete
               << " log.complete_to at end" << dendl;
    //below is not true in the repair case.
    //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
    ceph_assert(info.last_complete == info.last_update);
  }

  if (is_delete)
    return;

  ceph_assert(missing_loc.needs_recovery(oid));
  missing_loc.add_location(oid, pg_whoami);
}
void PeeringState::update_backfill_progress(
  const hobject_t &updated_backfill,
  const pg_stat_t &updated_stats,
  bool preserve_local_num_bytes,
  ObjectStore::Transaction &t) {
  info.set_last_backfill(updated_backfill);
  if (preserve_local_num_bytes) {
    psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
               << " local " << info.stats.stats.sum.num_bytes << dendl;
    int64_t bytes = info.stats.stats.sum.num_bytes;
    info.stats = updated_stats;
    info.stats.stats.sum.num_bytes = bytes;
  } else {
    psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
               << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
    info.stats = updated_stats;
  }
  // ...
}

void PeeringState::adjust_purged_snaps(
  std::function<void(interval_set<snapid_t> &snaps)> f) {
  f(info.purged_snaps);
  dirty_info = true;
  dirty_big_info = true;
}
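// The next three helpers keep per-peer missing sets in step with recovery
// traffic: on_peer_recover() marks an object recovered on a peer and records
// the new location, begin_peer_recover() temporarily revises the peer's "have"
// to zero while the push is in flight, and force_object_missing() marks an
// object missing on a set of shards (the primary's own copy goes through
// pg_log.missing_add) before rebuilding missing_loc.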
void PeeringState::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const eversion_t &version)
{
  pl->publish_stats_to_osd();
  peer_missing[peer].got(soid, version);
  missing_loc.add_location(soid, peer);
}

void PeeringState::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PeeringState::force_object_missing(
  const set<pg_shard_t> &peers,
  const hobject_t &soid,
  eversion_t version)
{
  for (auto &&peer : peers) {
    if (peer != primary) {
      peer_missing[peer].add(soid, version, eversion_t(), false);
    } else {
      pg_log.missing_add(soid, version, eversion_t());
      pg_log.reset_complete_to(&info);
      pg_log.set_last_requested(0);
    }
  }

  missing_loc.rebuild(
    soid,
    pg_whoami,
    acting_recovery_backfill,
    /* ... */
    pg_log.get_missing(),
    /* ... */);
}
void PeeringState::pre_submit_op(
  const hobject_t &hoid,
  const vector<pg_log_entry_t>& logv,
  eversion_t at_version)
{
  if (at_version > eversion_t()) {
    for (auto &&i : get_acting_recovery_backfill()) {
      if (i == primary) continue;
      pg_info_t &pinfo = peer_info[i];
      // keep peer_info up to date
      if (pinfo.last_complete == pinfo.last_update)
        pinfo.last_complete = at_version;
      pinfo.last_update = at_version;
    }
  }

  bool requires_missing_loc = false;
  for (auto &&i : get_async_recovery_targets()) {
    if (i == primary || !get_peer_missing(i).is_missing(hoid))
      continue;
    requires_missing_loc = true;
    for (auto &&entry : logv) {
      peer_missing[i].add_next_event(entry);
    }
  }

  if (requires_missing_loc) {
    for (auto &&entry : logv) {
      psdout(30) << __func__ << " missing_loc before: "
                 << missing_loc.get_locations(entry.soid) << dendl;
      missing_loc.add_missing(entry.soid, entry.version,
                              eversion_t(), entry.is_delete());
      // clear out missing_loc
      missing_loc.clear_location(entry.soid);
      for (auto &i : get_actingset()) {
        if (!get_peer_missing(i).is_missing(entry.soid))
          missing_loc.add_location(entry.soid, i);
      }
      psdout(30) << __func__ << " missing_loc after: "
                 << missing_loc.get_locations(entry.soid) << dendl;
    }
  }
}
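// recovery_committed_to()/complete_write() track what is durable on disk.
// When a replica's last_complete_ondisk catches up to last_update it reports
// that back to the primary (the cluster message below; the MOSDPGTrim
// constructor arguments are reconstructed and should be treated as an
// assumption), and the primary recomputes min_last_complete_ondisk, the lower
// bound used by the trim calculations that follow.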
void PeeringState::recovery_committed_to(eversion_t version)
{
  psdout(10) << __func__ << " version " << version
             << " now ondisk" << dendl;
  last_complete_ondisk = version;

  if (last_complete_ondisk == info.last_update) {
    if (!is_primary()) {
      // Either we are a replica or backfill target.
      // we are fully up to date.  tell the primary!
      pl->send_cluster_message(
        get_primary().osd,
        new MOSDPGTrim(
          get_osdmap_epoch(),
          spg_t(info.pgid.pgid, primary.shard),
          last_complete_ondisk),
        get_osdmap_epoch());
    } else {
      calc_min_last_complete_ondisk();
    }
  }
}

void PeeringState::complete_write(eversion_t v, eversion_t lc)
{
  last_update_ondisk = v;
  last_complete_ondisk = lc;
  calc_min_last_complete_ondisk();
}
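// calc_trim_to(): pick a new pg_trim_to.  Roughly: never trim past
// min(min_last_complete_ondisk, can_rollback_to), only trim when the log
// exceeds the target size, and skip trimming entirely when fewer than
// osd_pg_log_trim_min entries would be removed.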
void PeeringState::calc_trim_to()
{
  size_t target = pl->get_target_pg_log_entries();

  eversion_t limit = std::min(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
                                  cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
void PeeringState::calc_trim_to_aggressive()
{
  size_t target = pl->get_target_pg_log_entries();

  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min(
    /* ... */
    pg_log.get_can_rollback_to());
  psdout(10) << __func__ << " limit = " << limit << dendl;

  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    psdout(10) << __func__ << " approx pg log length = "
               << pg_log.get_log().approx_size() << dendl;
    uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
                                              cct->_conf->osd_pg_log_trim_max);
    psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    auto it = pg_log.get_log().log.begin(); // oldest log entry
    auto rit = pg_log.get_log().log.rbegin();
    eversion_t by_n_to_keep; // start from tail
    eversion_t by_n_to_trim = eversion_t::max(); // start from head
    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
      ++i;
      if (i > target && by_n_to_keep == eversion_t()) {
        by_n_to_keep = rit->version;
      }
      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
        by_n_to_trim = it->version;
      }
      if (by_n_to_keep != eversion_t() &&
          by_n_to_trim != eversion_t::max()) {
        break;
      }
    }

    if (by_n_to_keep == eversion_t()) {
      return;
    }

    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
    psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
    ceph_assert(pg_trim_to <= pg_log.get_head());
  }
}
void PeeringState::apply_op_stats(
  const hobject_t &soid,
  const object_stat_sum_t &delta_stats)
{
  info.stats.stats.add(delta_stats);
  info.stats.stats.floor(0);

  for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
       i != get_backfill_targets().end();
       ++i) {
    pg_shard_t bt = *i;
    pg_info_t& pinfo = peer_info[bt];
    if (soid <= pinfo.last_backfill)
      pinfo.stats.stats.add(delta_stats);
  }
}

void PeeringState::update_complete_backfill_object_stats(
  const hobject_t &hoid,
  const pg_stat_t &stats)
{
  for (auto &&bt : get_backfill_targets()) {
    pg_info_t& pinfo = peer_info[bt];
    //Add stats to all peers that were missing object
    if (hoid > pinfo.last_backfill)
      pinfo.stats.add(stats);
  }
}

void PeeringState::update_peer_last_backfill(
  pg_shard_t peer,
  const hobject_t &new_last_backfill)
{
  pg_info_t &pinfo = peer_info[peer];
  pinfo.last_backfill = new_last_backfill;
  if (new_last_backfill.is_max()) {
    /* pinfo.stats might be wrong if we did log-based recovery on the
     * backfilled portion in addition to continuing backfill.
     */
    pinfo.stats = info.stats;
  }
}

void PeeringState::set_revert_with_targets(
  const hobject_t &soid,
  const set<pg_shard_t> &good_peers)
{
  for (auto &&peer : good_peers) {
    missing_loc.add_location(soid, peer);
  }
}

void PeeringState::prepare_backfill_for_missing(
  const hobject_t &soid,
  const eversion_t &version,
  const vector<pg_shard_t> &targets) {
  for (auto &&peer : targets) {
    peer_missing[peer].add(soid, version, eversion_t(), false);
  }
}

void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
{
  info.hit_set = hset_history;
}
/*------------ Peering State Machine----------------*/

#undef dout_prefix
#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
                     << "state<" << get_state_name() << ">: ")
#undef psdout
#define psdout(x) ldout(context< PeeringMachine >().cct, x)

#define DECLARE_LOCALS                                    \
  PeeringState *ps = context< PeeringMachine >().state;   \
  PeeringListener *pl = context< PeeringMachine >().pl
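// Convention for the state implementations below: every state constructor
// calls log_enter() and every exit() calls log_exit() plus a per-state latency
// perf counter; DECLARE_LOCALS brings the owning PeeringState (ps) and
// PeeringListener (pl) into scope from the enclosing PeeringMachine context.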
/*------Crashed-------*/
PeeringState::Crashed::Crashed(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Crashed")
{
  context< PeeringMachine >().log_enter(state_name);
  ceph_abort_msg("we got a bad state machine event");
}

/*------Initial-------*/
PeeringState::Initial::Initial(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Initial")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
{
  DECLARE_LOCALS;
  ps->proc_replica_info(
    notify.from, notify.notify.info, notify.notify.epoch_sent);
  ps->set_last_peering_reset();
  return transit< Primary >();
}

boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  // ...
  return transit< Stray >();
}

boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  // ...
  return transit< Stray >();
}

void PeeringState::Initial::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_initial_latency, dur);
}

/*------Started-------*/
PeeringState::Started::Started(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::Started::react(const IntervalFlush&)
{
  psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
  context< PeeringMachine >().state->end_block_outgoing();
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Started advmap" << dendl;
  ps->check_full_transition(advmap.lastmap, advmap.osdmap);
  if (ps->should_restart_peering(
        /* ... */
        advmap.acting_primary,
        /* ... */)) {
    psdout(10) << "should_restart_peering, transitioning to Reset"
               << dendl;
    post_event(advmap);
    return transit< Reset >();
  }
  ps->remove_down_peer_info(advmap.osdmap);
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return discard_event();
}

void PeeringState::Started::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_started_latency, dur);
  ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
}
/*--------Reset---------*/
PeeringState::Reset::Reset(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Reset")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->flushes_in_progress = 0;
  ps->set_last_peering_reset();
  ps->log_weirdness();
}

boost::statechart::result
PeeringState::Reset::react(const IntervalFlush&)
{
  psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
  context< PeeringMachine >().state->end_block_outgoing();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Reset advmap" << dendl;

  ps->check_full_transition(advmap.lastmap, advmap.osdmap);

  if (ps->should_restart_peering(
        /* ... */
        advmap.acting_primary,
        /* ... */)) {
    psdout(10) << "should restart peering, calling start_peering_interval again"
               << dendl;
    ps->start_peering_interval(
      advmap.lastmap,
      advmap.newup, advmap.up_primary,
      advmap.newacting, advmap.acting_primary,
      context< PeeringMachine >().get_cur_transaction());
  }
  ps->remove_down_peer_info(advmap.osdmap);
  ps->check_past_interval_bounds();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(),
      ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }

  ps->update_heartbeat_peers();

  return transit< Started >();
}

boost::statechart::result PeeringState::Reset::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return discard_event();
}

void PeeringState::Reset::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_reset_latency, dur);
}

/*-------Start---------*/
PeeringState::Start::Start(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Start")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  if (ps->is_primary()) {
    psdout(1) << "transitioning to Primary" << dendl;
    post_event(MakePrimary());
  } else {
    psdout(1) << "transitioning to Stray" << dendl;
    post_event(MakeStray());
  }
}

void PeeringState::Start::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_start_latency, dur);
}
/*---------Primary--------*/
PeeringState::Primary::Primary(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ceph_assert(ps->want_acting.empty());

  // set CREATING bit until we have peered for the first time.
  if (ps->info.history.last_epoch_started == 0) {
    ps->state_set(PG_STATE_CREATING);
    // use the history timestamp, which ultimately comes from the
    // monitor in the create case.
    utime_t t = ps->info.history.last_scrub_stamp;
    ps->info.stats.last_fresh = t;
    ps->info.stats.last_active = t;
    ps->info.stats.last_change = t;
    ps->info.stats.last_peered = t;
    ps->info.stats.last_clean = t;
    ps->info.stats.last_unstale = t;
    ps->info.stats.last_undegraded = t;
    ps->info.stats.last_fullsized = t;
    ps->info.stats.last_scrub_stamp = t;
    ps->info.stats.last_deep_scrub_stamp = t;
    ps->info.stats.last_clean_scrub_stamp = t;
  }
}

boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
{
  DECLARE_LOCALS;
  psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
  ps->proc_replica_info(
    notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(const ActMap&)
{
  DECLARE_LOCALS;
  psdout(7) << "handle ActMap primary" << dendl;
  pl->publish_stats_to_osd();
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const SetForceRecovery&)
{
  DECLARE_LOCALS;
  ps->set_force_recovery(true);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const UnsetForceRecovery&)
{
  DECLARE_LOCALS;
  ps->set_force_recovery(false);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const RequestScrub& evt)
{
  DECLARE_LOCALS;
  if (ps->is_primary()) {
    pl->scrub_requested(evt.deep, evt.repair);
    psdout(10) << "marking for scrub" << dendl;
  }
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const SetForceBackfill&)
{
  DECLARE_LOCALS;
  ps->set_force_backfill(true);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const UnsetForceBackfill&)
{
  DECLARE_LOCALS;
  ps->set_force_backfill(false);
  return discard_event();
}

void PeeringState::Primary::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->want_acting.clear();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_primary_latency, dur);
  pl->clear_primary_state();
  ps->state_clear(PG_STATE_CREATING);
}
/*---------Peering--------*/
PeeringState::Peering::Peering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
    history_les_bound(false)
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ceph_assert(!ps->is_peered());
  ceph_assert(!ps->is_peering());
  ceph_assert(ps->is_primary());
  ps->state_set(PG_STATE_PEERING);
}

boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Peering advmap" << dendl;
  if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
    psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  ps->adjust_need_up_thru(advmap.osdmap);
  ps->check_prior_readable_down_osds(advmap.osdmap);

  return forward_event();
}

boost::statechart::result PeeringState::Peering::react(const QueryState& q)
{
  DECLARE_LOCALS;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("past_intervals");
  ps->past_intervals.dump(q.f);
  q.f->close_section();

  q.f->open_array_section("probing_osds");
  for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
       p != prior_set.probe.end();
       ++p)
    q.f->dump_stream("osd") << *p;
  q.f->close_section();

  if (prior_set.pg_down)
    q.f->dump_string("blocked", "peering is blocked due to down osds");

  q.f->open_array_section("down_osds_we_would_probe");
  for (set<int>::iterator p = prior_set.down.begin();
       p != prior_set.down.end();
       ++p)
    q.f->dump_int("osd", *p);
  q.f->close_section();

  q.f->open_array_section("peering_blocked_by");
  for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
       p != prior_set.blocked_by.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_int("osd", p->first);
    q.f->dump_int("current_lost_at", p->second);
    q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
    q.f->close_section();
  }
  q.f->close_section();

  if (history_les_bound) {
    q.f->open_array_section("peering_blocked_by_detail");
    q.f->open_object_section("item");
    q.f->dump_string("detail","peering_blocked_by_history_les_bound");
    q.f->close_section();
    q.f->close_section();
  }

  q.f->close_section();
  return forward_event();
}

void PeeringState::Peering::exit()
{
  DECLARE_LOCALS;
  psdout(10) << "Leaving Peering" << dendl;
  context< PeeringMachine >().log_exit(state_name, enter_time);
  ps->state_clear(PG_STATE_PEERING);
  pl->clear_probe_targets();

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_peering_latency, dur);
}

/*------Backfilling-------*/
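// Backfilling is entered once local and remote backfill reservations are held.
// Paths that leave backfill early (defer, unfound, revoked reservation) drop
// back to NotBackfilling; backfill_release_reservations() sends
// MBackfillReserve RELEASE to every backfill target, and cancel_backfill()
// additionally notifies the listener.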
PeeringState::Backfilling::Backfilling(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->backfill_reserved = true;
  pl->on_backfill_reserved();
  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  ps->state_set(PG_STATE_BACKFILLING);
  pl->publish_stats_to_osd();
}

void PeeringState::Backfilling::backfill_release_reservations()
{
  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();
  for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
       it != ps->backfill_targets.end();
       ++it) {
    ceph_assert(*it != ps->pg_whoami);
    pl->send_cluster_message(
      it->osd,
      new MBackfillReserve(
        MBackfillReserve::RELEASE,
        spg_t(ps->info.pgid.pgid, it->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }
}

void PeeringState::Backfilling::cancel_backfill()
{
  DECLARE_LOCALS;
  backfill_release_reservations();
  pl->on_backfill_canceled();
}

boost::statechart::result
PeeringState::Backfilling::react(const Backfilled &c)
{
  backfill_release_reservations();
  return transit<Recovered>();
}

boost::statechart::result
PeeringState::Backfilling::react(const DeferBackfill &c)
{
  DECLARE_LOCALS;
  psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
  ps->state_set(PG_STATE_BACKFILL_WAIT);
  ps->state_clear(PG_STATE_BACKFILLING);
  // ...

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    c.delay);
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const UnfoundBackfill &c)
{
  DECLARE_LOCALS;
  psdout(10) << "backfill has unfound, can't continue" << dendl;
  ps->state_set(PG_STATE_BACKFILL_UNFOUND);
  ps->state_clear(PG_STATE_BACKFILLING);
  // ...
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILLING);
  // ...

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    ps->cct->_conf->osd_backfill_retry_interval);

  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const RemoteReservationRevoked &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_WAIT);
  // ...
  if (ps->needs_backfill()) {
    return transit<WaitLocalBackfillReserved>();
  } else {
    // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
    return discard_event();
  }
}

void PeeringState::Backfilling::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->backfill_reserved = false;
  ps->state_clear(PG_STATE_BACKFILLING);
  ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
}
/*--WaitRemoteBackfillReserved--*/

PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
    backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_WAIT);
  pl->publish_stats_to_osd();
  post_event(RemoteBackfillReserved());
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
{
  DECLARE_LOCALS;

  int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
  psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
  if (backfill_osd_it !=
      context< Active >().remote_shards_to_reserve_backfill.end()) {
    // The primary never backfills itself
    ceph_assert(*backfill_osd_it != ps->pg_whoami);
    pl->send_cluster_message(
      backfill_osd_it->osd,
      new MBackfillReserve(
        MBackfillReserve::REQUEST,
        spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
        ps->get_osdmap_epoch(),
        ps->get_backfill_priority(),
        num_bytes,
        ps->peer_bytes[*backfill_osd_it]),
      ps->get_osdmap_epoch());
    ++backfill_osd_it;
  } else {
    ps->peer_bytes.clear();
    post_event(AllBackfillsReserved());
  }
  return discard_event();
}

void PeeringState::WaitRemoteBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
}

void PeeringState::WaitRemoteBackfillReserved::retry()
{
  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();

  // Send CANCEL to all previously acquired reservations
  set<pg_shard_t>::const_iterator it, begin, end;
  begin = context< Active >().remote_shards_to_reserve_backfill.begin();
  end = context< Active >().remote_shards_to_reserve_backfill.end();
  ceph_assert(begin != end);
  for (it = begin; it != backfill_osd_it; ++it) {
    // The primary never backfills itself
    ceph_assert(*it != ps->pg_whoami);
    pl->send_cluster_message(
      it->osd,
      new MBackfillReserve(
        MBackfillReserve::RELEASE,
        spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }

  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  pl->publish_stats_to_osd();

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    ps->cct->_conf->osd_backfill_retry_interval);
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
  retry();
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
{
  DECLARE_LOCALS;
  retry();
  return transit<NotBackfilling>();
}

/*--WaitLocalBackfillReserved--*/
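// Reservation ordering: the local background-io reservation is taken first
// (WaitLocalBackfillReserved), then each remote shard is reserved one at a
// time (WaitRemoteBackfillReserved above) before Backfilling can begin.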
PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_WAIT);
  pl->request_local_background_io_reservation(
    ps->get_backfill_priority(),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      LocalBackfillReserved()),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeferBackfill(0.0)));
  pl->publish_stats_to_osd();
}

void PeeringState::WaitLocalBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
}
/*----NotBackfilling------*/
PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_REPAIR);
  pl->publish_stats_to_osd();
}

boost::statechart::result
PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
{
  return discard_event();
}

boost::statechart::result
PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
{
  return discard_event();
}

void PeeringState::NotBackfilling::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
}

/*----NotRecovering------*/
PeeringState::NotRecovering::NotRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_REPAIR);
  pl->publish_stats_to_osd();
}

void PeeringState::NotRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
}
/*---RepNotRecovering----*/
PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
{
  DECLARE_LOCALS;
  ps->reject_reservation();
  post_event(RemoteReservationRejectedTooFull());
  return discard_event();
}

void PeeringState::RepNotRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
}

/*---RepWaitRecoveryReserved--*/
PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
{
  DECLARE_LOCALS;
  pl->send_cluster_message(
    ps->primary.osd,
    new MRecoveryReserve(
      MRecoveryReserve::GRANT,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return transit<RepRecovering>();
}

boost::statechart::result
PeeringState::RepWaitRecoveryReserved::react(
  const RemoteReservationCanceled &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

void PeeringState::RepWaitRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
}

/*-RepWaitBackfillReserved*/
PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
{
  context< PeeringMachine >().log_enter(state_name);
}
boost::statechart::result
PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
{
  DECLARE_LOCALS;

  if (!pl->try_reserve_recovery_space(
        evt.primary_num_bytes, evt.local_num_bytes)) {
    post_event(RejectTooFullRemoteReservation());
  } else {
    PGPeeringEventRef preempt;
    if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
      // older peers will interpret preemption as TOOFULL
      preempt = std::make_shared<PGPeeringEvent>(
        pl->get_osdmap_epoch(),
        pl->get_osdmap_epoch(),
        RemoteBackfillPreempted());
    }
    pl->request_remote_recovery_reservation(
      evt.priority,
      std::make_shared<PGPeeringEvent>(
        pl->get_osdmap_epoch(),
        pl->get_osdmap_epoch(),
        RemoteBackfillReserved()),
      preempt);
  }
  return transit<RepWaitBackfillReserved>();
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
{
  DECLARE_LOCALS;

  // fall back to a local reckoning of priority if the primary doesn't pass one
  // (pre-mimic compat)
  int prio = evt.priority ? evt.priority : ps->get_recovery_priority();

  PGPeeringEventRef preempt;
  if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
    // older peers can't handle this
    preempt = std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RemoteRecoveryPreempted());
  }

  pl->request_remote_recovery_reservation(
    prio,
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RemoteRecoveryReserved()),
    preempt);
  return transit<RepWaitRecoveryReserved>();
}
void PeeringState::RepWaitBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
{
  DECLARE_LOCALS;

  pl->send_cluster_message(
    ps->primary.osd,
    new MBackfillReserve(
      MBackfillReserve::GRANT,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return transit<RepRecovering>();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RejectTooFullRemoteReservation &evt)
{
  DECLARE_LOCALS;
  ps->reject_reservation();
  post_event(RemoteReservationRejectedTooFull());
  return discard_event();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RemoteReservationRejectedTooFull &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RemoteReservationCanceled &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

/*---RepRecovering-------*/
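// RepRecovering: replica-side state while the primary holds our reservation.
// Preemption or a TOOFULL condition hands the grant back to the primary with
// a REVOKE / REVOKE_TOOFULL reservation message, and exit() always returns
// the reserved recovery space and the remote reservation slot.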
PeeringState::RepRecovering::RepRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    new MRecoveryReserve(
      MRecoveryReserve::REVOKE,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

boost::statechart::result
PeeringState::RepRecovering::react(const BackfillTooFull &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REVOKE_TOOFULL,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

boost::statechart::result
PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REVOKE,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

void PeeringState::RepRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
}
/*------Activating--------*/
PeeringState::Activating::Activating(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
{
  context< PeeringMachine >().log_enter(state_name);
}

void PeeringState::Activating::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_activating_latency, dur);
}
PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  // Make sure all nodes that are part of the recovery aren't full
  if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
      ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
    post_event(RecoveryTooFull());
    return;
  }

  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  ps->state_set(PG_STATE_RECOVERY_WAIT);
  pl->request_local_background_io_reservation(
    ps->get_recovery_priority(),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      LocalRecoveryReserved()),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeferRecovery(0.0)));
  pl->publish_stats_to_osd();
}

boost::statechart::result
PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_RECOVERY_TOOFULL);
  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DoRecovery()),
    ps->cct->_conf->osd_recovery_retry_interval);
  return transit<NotRecovering>();
}

void PeeringState::WaitLocalRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
}
PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
    remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
{
  context< PeeringMachine >().log_enter(state_name);
  post_event(RemoteRecoveryReserved());
}

boost::statechart::result
PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
  DECLARE_LOCALS;

  if (remote_recovery_reservation_it !=
      context< Active >().remote_shards_to_reserve_recovery.end()) {
    ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
    pl->send_cluster_message(
      remote_recovery_reservation_it->osd,
      new MRecoveryReserve(
        MRecoveryReserve::REQUEST,
        spg_t(context< PeeringMachine >().spgid.pgid,
              remote_recovery_reservation_it->shard),
        ps->get_osdmap_epoch(),
        ps->get_recovery_priority()),
      ps->get_osdmap_epoch());
    ++remote_recovery_reservation_it;
  } else {
    post_event(AllRemotesReserved());
  }
  return discard_event();
}

void PeeringState::WaitRemoteRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
}
PeeringState::Recovering::Recovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_RECOVERY_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  ps->state_set(PG_STATE_RECOVERING);
  pl->on_recovery_reserved();
  ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
  pl->publish_stats_to_osd();
}

void PeeringState::Recovering::release_reservations(bool cancel)
{
  DECLARE_LOCALS;
  ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());

  // release remote reservations
  for (set<pg_shard_t>::const_iterator i =
         context< Active >().remote_shards_to_reserve_recovery.begin();
       i != context< Active >().remote_shards_to_reserve_recovery.end();
       ++i) {
    if (*i == ps->pg_whoami) // skip myself
      continue;
    pl->send_cluster_message(
      i->osd,
      new MRecoveryReserve(
        MRecoveryReserve::RELEASE,
        spg_t(ps->info.pgid.pgid, i->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }
}

boost::statechart::result
PeeringState::Recovering::react(const AllReplicasRecovered &evt)
{
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_FORCED_RECOVERY);
  release_reservations();
  pl->cancel_local_background_io_reservation();
  return transit<Recovered>();
}

boost::statechart::result
PeeringState::Recovering::react(const RequestBackfill &evt)
{
  DECLARE_LOCALS;

  release_reservations();

  ps->state_clear(PG_STATE_FORCED_RECOVERY);
  pl->cancel_local_background_io_reservation();
  pl->publish_stats_to_osd();
  // transit any async_recovery_targets back into acting
  // so pg won't have to stay undersized for long
  // as backfill might take a long time to complete..
  if (!ps->async_recovery_targets.empty()) {
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    ps->choose_acting(auth_log_shard, true, &history_les_bound);
  }
  return transit<WaitLocalBackfillReserved>();
}

boost::statechart::result
PeeringState::Recovering::react(const DeferRecovery &evt)
{
  DECLARE_LOCALS;
  if (!ps->state_test(PG_STATE_RECOVERING)) {
    // we may have finished recovery and have an AllReplicasRecovered
    // event queued to move us to the next state.
    psdout(10) << "got defer recovery but not recovering" << dendl;
    return discard_event();
  }
  psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
  ps->state_set(PG_STATE_RECOVERY_WAIT);
  pl->cancel_local_background_io_reservation();
  release_reservations(true);
  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DoRecovery()),
    evt.delay);
  return transit<NotRecovering>();
}

boost::statechart::result
PeeringState::Recovering::react(const UnfoundRecovery &evt)
{
  DECLARE_LOCALS;
  psdout(10) << "recovery has unfound, can't continue" << dendl;
  ps->state_set(PG_STATE_RECOVERY_UNFOUND);
  pl->cancel_local_background_io_reservation();
  release_reservations(true);
  return transit<NotRecovering>();
}

void PeeringState::Recovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  ps->state_clear(PG_STATE_RECOVERING);
  pl->get_peering_perf().tinc(rs_recovering_latency, dur);
}
PeeringState::Recovered::Recovered(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
{
  pg_shard_t auth_log_shard;

  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ceph_assert(!ps->needs_recovery());

  // if we finished backfill, all acting are active; recheck if
  // DEGRADED | UNDERSIZED is appropriate.
  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
      ps->acting_recovery_backfill.size()) {
    ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
  }

  // adjust acting set?  (e.g. because backfill completed...)
  bool history_les_bound = false;
  if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
                                                 true, &history_les_bound)) {
    ceph_assert(ps->want_acting.size());
  } else if (!ps->async_recovery_targets.empty()) {
    ps->choose_acting(auth_log_shard, true, &history_les_bound);
  }

  if (context< Active >().all_replicas_activated &&
      ps->async_recovery_targets.empty())
    post_event(GoClean());
}

void PeeringState::Recovered::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_recovered_latency, dur);
}
PeeringState::Clean::Clean(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  if (ps->info.last_complete != ps->info.last_update) {
    ceph_abort();
  }

  ps->try_mark_clean();

  context< PeeringMachine >().get_cur_transaction().register_on_commit(
    pl->on_clean());
}

void PeeringState::Clean::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_CLEAN);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_clean_latency, dur);
}

template <typename T>
set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
{
  set<int> osds_found;
  set<pg_shard_t> out;
  for (typename T::const_iterator i = in.begin();
       i != in.end();
       ++i) {
    if (*i != skip && !osds_found.count(i->osd)) {
      osds_found.insert(i->osd);
      out.insert(*i);
    }
  }
  return out;
}

/*---------Active---------*/
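// Active is the primary's main state.  The remote_shards_to_reserve_recovery /
// _backfill sets are deduplicated per OSD with unique_osd_shard_set() above,
// and blocked_by holds every acting_recovery_backfill shard until that peer
// acks activation (see the ActivateCommitted / MInfoRec handling further down).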
PeeringState::Active::Active(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
    remote_shards_to_reserve_recovery(
      unique_osd_shard_set(
        context< PeeringMachine >().state->pg_whoami,
        context< PeeringMachine >().state->acting_recovery_backfill)),
    remote_shards_to_reserve_backfill(
      unique_osd_shard_set(
        context< PeeringMachine >().state->pg_whoami,
        context< PeeringMachine >().state->backfill_targets)),
    all_replicas_activated(false)
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ceph_assert(!ps->backfill_reserved);
  ceph_assert(ps->is_primary());
  psdout(10) << "In Active, about to call activate" << dendl;
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  ps->activate(context< PeeringMachine >().get_cur_transaction(),
               ps->get_osdmap_epoch(),
               context< PeeringMachine >().get_recovery_ctx());

  // everyone has to commit/ack before we are truly active
  ps->blocked_by.clear();
  for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p) {
    if (p->shard != ps->pg_whoami.shard) {
      ps->blocked_by.insert(p->shard);
    }
  }
  pl->publish_stats_to_osd();
  psdout(10) << "Activate Finished" << dendl;
}
boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;

  if (ps->should_restart_peering(
        /* ... */
        advmap.acting_primary,
        /* ... */)) {
    psdout(10) << "Active advmap interval change, fast return" << dendl;
    return forward_event();
  }
  psdout(10) << "Active advmap" << dendl;
  bool need_publish = false;

  pl->on_active_advmap(advmap.osdmap);
  if (ps->dirty_big_info) {
    // share updated purged_snaps to mgr/mon so that we (a) stop reporting
    // purged snaps and (b) perhaps share more snaps that we have purged
    // but didn't fit in pg_stat_t.
    need_publish = true;
    ps->share_pg_info();
  }

  for (size_t i = 0; i < ps->want_acting.size(); i++) {
    int osd = ps->want_acting[i];
    if (!advmap.osdmap->is_up(osd)) {
      pg_shard_t osd_with_shard(osd, shard_id_t(i));
      ceph_assert(ps->is_acting(osd_with_shard) || ps->is_up(osd_with_shard));
    }
  }

  /* Check for changes in pool size (if the acting set changed as a result,
   * this does not matter) */
  if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
      ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
    if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
        ps->actingset.size()) {
      ps->state_clear(PG_STATE_UNDERSIZED);
    } else {
      ps->state_set(PG_STATE_UNDERSIZED);
    }
    // degraded changes will be detected by call from publish_stats_to_osd()
    need_publish = true;
  }

  // if we haven't reported our PG stats in a long time, do so now.
  if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
    psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
               << " epochs" << dendl;
    need_publish = true;
  }

  if (need_publish)
    pl->publish_stats_to_osd();

  if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
    pl->recheck_readable();
  }

  return forward_event();
}
boost::statechart::result PeeringState::Active::react(const ActMap&)
{
  DECLARE_LOCALS;
  psdout(10) << "Active: handling ActMap" << dendl;
  ceph_assert(ps->is_primary());

  pl->on_active_actmap();

  if (ps->have_unfound()) {
    // object may have become unfound
    ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
  }

  uint64_t unfound = ps->missing_loc.num_unfound();
  if (unfound > 0 &&
      ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
    if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
      pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
                           << " objects unfound and apparently lost, would automatically "
                           << "mark these objects lost but this feature is not yet implemented "
                           << "(osd_auto_mark_unfound_lost)";
    } else
      pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
                           << unfound << " objects unfound and apparently lost";
  }

  return forward_event();
}

boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());
  if (ps->peer_info.count(notevt.from)) {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", already have info from that osd, ignoring"
               << dendl;
  } else if (ps->peer_purged.count(notevt.from)) {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", already purged that peer, ignoring"
               << dendl;
  } else {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", calling proc_replica_info and discover_all_missing"
               << dendl;
    ps->proc_replica_info(
      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
    if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
      ps->discover_all_missing(
        context<PeeringMachine>().get_recovery_ctx().msgs);
    }
    // check if it is a previous down acting member that's coming back.
    // if so, request pg_temp change to trigger a new interval transition
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
    if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
      psdout(10) << "Active: got notify from previous acting member "
                 << notevt.from << ", requesting pg_temp change"
                 << dendl;
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const MTrim& trim)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());

  // peer is informing us of their last_complete_ondisk
  ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
  ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
                                       trim.trim_to);
  // trim log when the pg is recovered
  ps->calc_min_last_complete_ondisk();
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());

  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (infoevt.lease_ack) {
    ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
  }
  // don't update history (yet) if we are active and primary; the replica
  // may be telling us they have activated (and committed) but we can't
  // share that until _everyone_ does the same.
  if (ps->is_acting_recovery_backfill(infoevt.from) &&
      ps->peer_activated.count(infoevt.from) == 0) {
    psdout(10) << " peer osd." << infoevt.from
               << " activated and committed" << dendl;
    ps->peer_activated.insert(infoevt.from);
    ps->blocked_by.erase(infoevt.from.shard);
    pl->publish_stats_to_osd();
    if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
      all_activated_and_committed();
    }
  }
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "searching osd." << logevt.from
             << " log for unfound items" << dendl;
  ps->proc_replica_log(
    logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
  bool got_missing = ps->search_for_missing(
    ps->peer_info[logevt.from],
    ps->peer_missing[logevt.from],
    logevt.from,
    context< PeeringMachine >().get_recovery_ctx());
  // If there are missing AND we are "fully" active then start recovery now
  if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
    post_event(DoRecovery());
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const QueryState& q)
{
  DECLARE_LOCALS;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("might_have_unfound");
  for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
       p != ps->might_have_unfound.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_missing.count(*p)) {
      q.f->dump_string("status", "already probed");
    } else if (ps->peer_missing_requested.count(*p)) {
      q.f->dump_string("status", "querying");
    } else if (!ps->get_osdmap()->is_up(p->osd)) {
      q.f->dump_string("status", "osd is down");
    } else {
      q.f->dump_string("status", "not queried");
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->open_object_section("recovery_progress");
  q.f->open_array_section("backfill_targets");
  for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
       p != ps->backfill_targets.end(); ++p)
    q.f->dump_stream("replica") << *p;
  q.f->close_section();
  pl->dump_recovery_info(q.f);
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}
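// ActivateCommitted on the primary: our own activate transaction has
// committed, so count ourselves in peer_activated alongside the
// replicas that report in via MInfoRec.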
boost::statechart::result PeeringState::Active::react(
  const ActivateCommitted &evt)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
  ps->peer_activated.insert(ps->pg_whoami);
  psdout(10) << "_activate_committed " << evt.epoch
             << " peer_activated now " << ps->peer_activated
             << " last_interval_started "
             << ps->info.history.last_interval_started
             << " last_epoch_started "
             << ps->info.history.last_epoch_started
             << " same_interval_since "
             << ps->info.history.same_interval_since
             << dendl;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
    all_activated_and_committed();
  return discard_event();
}
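// AllReplicasActivated: every member of acting_recovery_backfill has
// activated and committed.  The PG becomes ACTIVE (or only PEERED when
// below min_size or pending a merge) and may sit in WAIT until the
// prior interval's readable_until upper bound has passed.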
boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
{
  DECLARE_LOCALS;
  pg_t pgid = context< PeeringMachine >().spgid.pgid;

  all_replicas_activated = true;

  ps->state_clear(PG_STATE_ACTIVATING);
  ps->state_clear(PG_STATE_CREATING);
  ps->state_clear(PG_STATE_PREMERGE);

  bool merge_target;
  if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
    ps->state_set(PG_STATE_PEERED);
    ps->state_set(PG_STATE_PREMERGE);

    if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
      if (merge_target) {
        pg_t src = pgid;
        src.set_ps(ps->pool.info.get_pg_num_pending());
        assert(src.get_parent() == pgid);
        pl->set_not_ready_to_merge_target(pgid, src);
      } else {
        pl->set_not_ready_to_merge_source(pgid);
      }
    }
  } else if (ps->acting.size() < ps->pool.info.min_size) {
    ps->state_set(PG_STATE_PEERED);
  } else {
    ps->state_set(PG_STATE_ACTIVE);
  }

  auto mnow = pl->get_mnow();
  if (ps->prior_readable_until_ub > mnow) {
    psdout(10) << " waiting for prior_readable_until_ub "
               << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
    ps->state_set(PG_STATE_WAIT);
    pl->queue_check_readable(
      ps->last_peering_reset,
      ps->prior_readable_until_ub - mnow);
  } else {
    psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
               << ps->prior_readable_until_ub << dendl;
  }

  if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
    pl->send_pg_created(pgid);
  }

  ps->info.history.last_epoch_started = ps->info.last_epoch_started;
  ps->info.history.last_interval_started = ps->info.last_interval_started;
  ps->dirty_info = true;

  ps->share_pg_info();
  pl->publish_stats_to_osd();

  pl->on_activate_complete();

  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
{
  DECLARE_LOCALS;
  ps->proc_renew_lease();
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
{
  DECLARE_LOCALS;
  ps->proc_lease_ack(la.from, la.lease_ack);
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
{
  DECLARE_LOCALS;
  pl->recheck_readable();
  return discard_event();
}
/*
 * update info.history.last_epoch_started ONLY after we and all
 * replicas have activated AND committed the activate transaction
 * (i.e. the peering results are stable on disk).
 */
void PeeringState::Active::all_activated_and_committed()
{
  DECLARE_LOCALS;
  psdout(10) << "all_activated_and_committed" << dendl;
  ceph_assert(ps->is_primary());
  ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
  ceph_assert(!ps->acting_recovery_backfill.empty());
  ceph_assert(ps->blocked_by.empty());

  if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
    // this is overkill when the activation is quick, but when it is slow it
    // is important, because the lease was renewed by the activate itself but we
    // don't know how long ago that was, and simply scheduling now may leave
    // a gap in lease coverage.  keep it simple and aggressively renew.
    ps->renew_lease(pl->get_mnow());
    ps->schedule_renew_lease();
  }

  ps->update_calc_stats();
  if (ps->info.stats.stats.sum.num_objects_degraded) {
    ps->state_set(PG_STATE_DEGRADED);
  } else {
    ps->state_clear(PG_STATE_DEGRADED);
  }

  post_event(PeeringState::AllReplicasActivated());
}
void PeeringState::Active::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();

  ps->blocked_by.clear();
  ps->backfill_reserved = false;
  ps->state_clear(PG_STATE_ACTIVATING);
  ps->state_clear(PG_STATE_DEGRADED);
  ps->state_clear(PG_STATE_UNDERSIZED);
  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_active_latency, dur);
  pl->on_active_exit();
}
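// The states below mirror Active for non-primary shards: a replica
// activates on the primary's request, acknowledges leases, merges and
// trims its log as instructed, and re-notifies the primary on ActMap.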
/*------ReplicaActive-----*/
PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
}

boost::statechart::result PeeringState::ReplicaActive::react(
  const Activate& actevt) {
  DECLARE_LOCALS;
  psdout(10) << "In ReplicaActive, about to call activate" << dendl;
  ps->activate(
    context< PeeringMachine >().get_cur_transaction(),
    actevt.activation_epoch,
    context< PeeringMachine >().get_recovery_ctx());
  psdout(10) << "Activate Finished" << dendl;
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(
  const ActivateCommitted &evt)
{
  DECLARE_LOCALS;
  psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;

  auto &rctx = context<PeeringMachine>().get_recovery_ctx();
  auto epoch = ps->get_osdmap_epoch();
  pg_info_t i = ps->info;
  i.history.last_epoch_started = evt.activation_epoch;
  i.history.last_interval_started = i.history.same_interval_since;
  rctx.send_info(
    ps->get_primary().osd,
    spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
    epoch,
    epoch,
    i,
    {}, /* lease */
    ps->get_lease_ack());

  if (ps->acting.size() >= ps->pool.info.min_size) {
    ps->state_set(PG_STATE_ACTIVE);
  } else {
    ps->state_set(PG_STATE_PEERED);
  }
  pl->on_activate_committed();
  return discard_event();
}
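// Lease exchange on the replica: proc_lease() extends the local
// readable interval and an MOSDPGLeaseAck is sent back so the primary
// can account for this replica when computing readable_until.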
boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
{
  DECLARE_LOCALS;
  spg_t spgid = context< PeeringMachine >().spgid;
  epoch_t epoch = pl->get_osdmap_epoch();

  ps->proc_lease(l.lease);
  pl->send_cluster_message(
    ps->get_primary().osd,
    new MOSDPGLeaseAck(epoch,
                       spg_t(spgid.pgid, ps->get_primary().shard),
                       ps->get_lease_ack()),
    epoch);
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
                        infoevt.info);
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "received log from " << logevt.from << dendl;
  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
  ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
  if (logevt.msg->lease) {
    ps->proc_lease(*logevt.msg->lease);
  }

  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
{
  DECLARE_LOCALS;
  // primary is instructing us to trim
  ps->pg_log.trim(trim.trim_to, ps->info);
  ps->dirty_info = true;
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(), ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(
  const MQuery& query)
{
  DECLARE_LOCALS;
  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return forward_event();
}
void PeeringState::ReplicaActive::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);

  ps->min_last_complete_ondisk = eversion_t();
}
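/*------Stray------*/
// A Stray peer holds data for this PG but is not part of the acting
// set: it answers queries and waits for the primary either to activate
// it (via MLogRec/MInfoRec, transitioning to ReplicaActive) or to
// remove it; if the pool is gone it starts deleting immediately.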
PeeringState::Stray::Stray(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Stray")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ceph_assert(!ps->is_peered());
  ceph_assert(!ps->is_peering());
  ceph_assert(!ps->is_primary());

  if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
    ldout(ps->cct, 10) << __func__ << " pool is deleted" << dendl;
    post_event(DeleteStart());
  } else {
    ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  }
}
boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  MOSDPGLog *msg = logevt.msg.get();
  psdout(10) << "got info+log from osd." << logevt.from
             << " " << msg->info << " " << msg->log << dendl;

  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
  if (msg->info.last_backfill == hobject_t()) {
    ps->info = msg->info;
    pl->on_info_history_change();
    ps->dirty_info = true;
    ps->dirty_big_info = true;  // maybe.

    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());

    ps->pg_log.reset_backfill();
  } else {
    ps->merge_log(t, msg->info, msg->log, logevt.from);
  }
  if (logevt.msg->lease) {
    ps->proc_lease(*logevt.msg->lease);
  }

  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);

  post_event(Activate(logevt.msg->info.last_epoch_started));
  return transit<ReplicaActive>();
}
boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  psdout(10) << "got info from osd." << infoevt.from
             << " " << infoevt.info << dendl;

  if (ps->info.last_update > infoevt.info.last_update) {
    // rewind divergent log entries
    ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
    ps->rewind_divergent_log(t, infoevt.info.last_update);
    ps->info.stats = infoevt.info.stats;
    ps->info.hit_set = infoevt.info.hit_set;
  }

  if (infoevt.lease) {
    ps->proc_lease(*infoevt.lease);
  }

  ceph_assert(infoevt.info.last_update == ps->info.last_update);
  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);

  post_event(Activate(infoevt.info.last_epoch_started));
  return transit<ReplicaActive>();
}
boost::statechart::result PeeringState::Stray::react(const MQuery& query)
{
  DECLARE_LOCALS;
  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
  return discard_event();
}

boost::statechart::result PeeringState::Stray::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(), ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }
  return discard_event();
}
void PeeringState::Stray::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_stray_latency, dur);
}
/*--------ToDelete----------*/
PeeringState::ToDelete::ToDelete(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  pl->get_perf_logger().inc(l_osd_pg_removing);
}

void PeeringState::ToDelete::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  // note: on a successful removal, this path doesn't execute. see
  pl->get_perf_logger().dec(l_osd_pg_removing);

  pl->cancel_local_background_io_reservation();
}
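// PG deletion is throttled through the local background io reservation:
// WaitDeleteReserved queues at the current delete priority and, once
// granted, Deleting removes objects in DeleteSome-sized chunks.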
/*----WaitDeleteReserved----*/
PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history,
               "Started/ToDelete/WaitDeleteReseved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  context< ToDelete >().priority = ps->get_delete_priority();

  pl->cancel_local_background_io_reservation();
  pl->request_local_background_io_reservation(
    context<ToDelete>().priority,
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeleteReserved()),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeleteInterrupted()));
}
boost::statechart::result PeeringState::ToDelete::react(
  const ActMap& evt)
{
  DECLARE_LOCALS;
  if (ps->get_delete_priority() != priority) {
    psdout(10) << __func__ << " delete priority changed, resetting"
               << dendl;
    return transit<ToDelete>();
  }
  return discard_event();
}

void PeeringState::WaitDeleteReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
}
/*----Deleting-----*/
PeeringState::Deleting::Deleting(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->deleting = true;
  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();

  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  ps->pg_log.roll_forward(rollbacker.get());

  // adjust info to backfill
  ps->info.set_last_backfill(hobject_t());
  ps->pg_log.reset_backfill();
  ps->dirty_info = true;
}

boost::statechart::result PeeringState::Deleting::react(
  const DeleteSome& evt)
{
  DECLARE_LOCALS;
  pl->do_delete_work(context<PeeringMachine>().get_cur_transaction());
  return discard_event();
}

void PeeringState::Deleting::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->deleting = false;
  pl->cancel_local_background_io_reservation();
}
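// Primary peering proceeds GetInfo -> GetLog -> GetMissing: gather
// pg_info_t from every prior-set OSD, fetch the authoritative log, then
// collect missing sets from the remaining acting/backfill peers.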
/*--------GetInfo---------*/
PeeringState::GetInfo::GetInfo(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->check_past_interval_bounds();
  ps->log_weirdness();
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  ceph_assert(ps->blocked_by.empty());

  prior_set = ps->build_prior();
  ps->prior_readable_down_osds = prior_set.down;
  if (ps->prior_readable_down_osds.empty()) {
    psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
               << dendl;
    ps->clear_prior_readable_until_ub();
  }

  ps->reset_min_peer_features();
  get_infos();
  if (prior_set.pg_down) {
    post_event(IsDown());
  } else if (peer_info_requested.empty()) {
    post_event(GotInfo());
  }
}
void PeeringState::GetInfo::get_infos()
{
  DECLARE_LOCALS;
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  ps->blocked_by.clear();
  for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
       it != prior_set.probe.end();
       ++it) {
    pg_shard_t peer = *it;
    if (peer == ps->pg_whoami) {
      continue;
    }
    if (ps->peer_info.count(peer)) {
      psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
      continue;
    }
    if (peer_info_requested.count(peer)) {
      psdout(10) << " already requested info from osd." << peer << dendl;
      ps->blocked_by.insert(peer.osd);
    } else if (!ps->get_osdmap()->is_up(peer.osd)) {
      psdout(10) << " not querying info from down osd." << peer << dendl;
    } else {
      psdout(10) << " querying info from osd." << peer << dendl;
      context< PeeringMachine >().send_query(
        peer.osd,
        pg_query_t(pg_query_t::INFO,
                   it->shard, ps->pg_whoami.shard,
                   ps->info.history,
                   ps->get_osdmap_epoch()));
      peer_info_requested.insert(peer);
      ps->blocked_by.insert(peer.osd);
    }
  }

  ps->check_prior_readable_down_osds(ps->get_osdmap());

  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
{
  DECLARE_LOCALS;

  set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
  if (p != peer_info_requested.end()) {
    peer_info_requested.erase(p);
    ps->blocked_by.erase(infoevt.from.osd);
  }

  epoch_t old_start = ps->info.history.last_epoch_started;
  if (ps->proc_replica_info(
        infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
    // we got something new ...
    PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
    if (old_start < ps->info.history.last_epoch_started) {
      psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
      prior_set = ps->build_prior();
      ps->prior_readable_down_osds = prior_set.down;

      // filter out any osds that got dropped from the probe set from
      // peer_info_requested.  this is less expensive than restarting
      // peering (which would re-probe everyone).
      set<pg_shard_t>::iterator p = peer_info_requested.begin();
      while (p != peer_info_requested.end()) {
        if (prior_set.probe.count(*p) == 0) {
          psdout(20) << " dropping osd." << *p
                     << " from info_requested, no longer in probe set" << dendl;
          peer_info_requested.erase(p++);
        } else {
          ++p;
        }
      }
      get_infos();
    }
    psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
               << hex << infoevt.features << dec << dendl;
    ps->apply_peer_features(infoevt.features);

    // are we done getting everything?
    if (peer_info_requested.empty() && !prior_set.pg_down) {
      psdout(20) << "Common peer features: "
                 << hex << ps->get_min_peer_features() << dec << dendl;
      psdout(20) << "Common acting features: "
                 << hex << ps->get_min_acting_features() << dec << dendl;
      psdout(20) << "Common upacting features: "
                 << hex << ps->get_min_upacting_features() << dec << dendl;
      post_event(GotInfo());
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
{
  DECLARE_LOCALS;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("requested_info_from");
  for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
       p != peer_info_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_info.count(*p)) {
      q.f->open_object_section("got_info");
      ps->peer_info[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PeeringState::GetInfo::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
  ps->blocked_by.clear();
}
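// GetLog: choose_acting() selects auth_log_shard as the owner of the
// authoritative log; if that is not this OSD, request enough of its log
// (back to the oldest last_update we must reconcile) before proceeding.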
/*------GetLog------------*/
PeeringState::GetLog::GetLog(my_context ctx)
  : my_base(ctx),
    NamedState(
      context< PeeringMachine >().state_history,
      "Started/Primary/Peering/GetLog"),
    msg(0)
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->log_weirdness();

  // adjust acting?
  if (!ps->choose_acting(auth_log_shard, false,
                         &context< Peering >().history_les_bound)) {
    if (!ps->want_acting.empty()) {
      post_event(NeedActingChange());
    } else {
      post_event(IsIncomplete());
    }
    return;
  }

  // am i the best?
  if (auth_log_shard == ps->pg_whoami) {
    post_event(GotLog());
    return;
  }

  const pg_info_t& best = ps->peer_info[auth_log_shard];

  // am i broken?
  if (ps->info.last_update < best.log_tail) {
    psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
    post_event(IsIncomplete());
    return;
  }

  // how much log to request?
  eversion_t request_log_from = ps->info.last_update;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p) {
    if (*p == ps->pg_whoami) continue;
    pg_info_t& ri = ps->peer_info[*p];
    if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
        ri.last_update < request_log_from)
      request_log_from = ri.last_update;
  }

  psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
  context<PeeringMachine>().send_query(
    auth_log_shard.osd,
    pg_query_t(
      pg_query_t::LOG,
      auth_log_shard.shard, ps->pg_whoami.shard,
      request_log_from, ps->info.history,
      ps->get_osdmap_epoch()));

  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(auth_log_shard.osd);
  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  // make sure our log source didn't go down.  we need to check
  // explicitly because it may not be part of the prior set, which
  // means the Peering state check won't catch it going down.
  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
    psdout(10) << "GetLog: auth_log_shard osd."
               << auth_log_shard.osd << " went down" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  // let the Peering state do its checks.
  return forward_event();
}
boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
{
  if (logevt.from != auth_log_shard) {
    psdout(10) << "GetLog: discarding log from "
               << "non-auth_log_shard osd." << logevt.from << dendl;
    return discard_event();
  }
  psdout(10) << "GetLog: received master log from osd."
             << logevt.from << dendl;
  msg = logevt.msg;
  post_event(GotLog());
  return discard_event();
}

boost::statechart::result PeeringState::GetLog::react(const GotLog&)
{
  DECLARE_LOCALS;
  psdout(10) << "leaving GetLog" << dendl;
  if (msg) {
    psdout(10) << "processing master log" << dendl;
    ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
                        msg->info, msg->log, msg->missing,
                        auth_log_shard);
  }
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  return transit< GetMissing >();
}
boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_stream("auth_log_shard") << auth_log_shard;
  q.f->close_section();
  return forward_event();
}

void PeeringState::GetLog::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getlog_latency, dur);
  ps->blocked_by.clear();
}
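// WaitActingChange: a pg_temp change has been requested, so wait for an
// osdmap that reflects it and ignore stray log/info/notify messages in
// the meantime; if a wanted target goes down, reset and re-peer.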
/*------WaitActingChange--------*/
PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
{
  context< PeeringMachine >().log_enter(state_name);
}
boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  OSDMapRef osdmap = advmap.osdmap;

  psdout(10) << "verifying no want_acting " << ps->want_acting
             << " targets didn't go down" << dendl;
  for (vector<int>::iterator p = ps->want_acting.begin();
       p != ps->want_acting.end(); ++p) {
    if (!osdmap->is_up(*p)) {
      psdout(10) << " want_acting target osd." << *p
                 << " went down, resetting" << dendl;
      post_event(advmap);
      return transit< Reset >();
    }
  }
  return forward_event();
}
boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
{
  psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
{
  psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
{
  psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
  return discard_event();
}
boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for pg acting set to change");
  q.f->close_section();
  return forward_event();
}

void PeeringState::WaitActingChange::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
}
/*------Down--------*/
PeeringState::Down::Down(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_PEERING);
  ps->state_set(PG_STATE_DOWN);

  auto &prior_set = context< Peering >().prior_set;
  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pl->publish_stats_to_osd();
}

void PeeringState::Down::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_DOWN);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_down_latency, dur);

  ps->blocked_by.clear();
}
boost::statechart::result PeeringState::Down::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment",
                   "not enough up instances of this PG to go active");
  q.f->close_section();
  return forward_event();
}
boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
{
  DECLARE_LOCALS;

  ceph_assert(ps->is_primary());
  epoch_t old_start = ps->info.history.last_epoch_started;
  if (!ps->peer_info.count(infoevt.from) &&
      ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
    ps->update_history(infoevt.notify.info.history);
  }
  // if we got something new to make pg escape down state
  if (ps->info.history.last_epoch_started > old_start) {
    psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
    ps->state_clear(PG_STATE_DOWN);
    ps->state_set(PG_STATE_PEERING);
    return transit< GetInfo >();
  }

  return discard_event();
}
/*------Incomplete--------*/
PeeringState::Incomplete::Incomplete(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_PEERING);
  ps->state_set(PG_STATE_INCOMPLETE);

  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
  DECLARE_LOCALS;
  int64_t poolnum = ps->info.pgid.pool();

  // Reset if min_size has become smaller than it was in the previous map;
  // the pg might now be able to go active.
  if (!advmap.osdmap->have_pg_pool(poolnum) ||
      advmap.lastmap->get_pools().find(poolnum)->second.min_size >
      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
    post_event(advmap);
    return transit< Reset >();
  }

  return forward_event();
}

boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
  DECLARE_LOCALS;
  psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
  if (ps->proc_replica_info(
        notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
    // We got something new, try again!
    return transit< GetLog >();
  } else {
    return discard_event();
  }
}
boost::statechart::result PeeringState::Incomplete::react(
  const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "not enough complete instances of this PG");
  q.f->close_section();
  return forward_event();
}

void PeeringState::Incomplete::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_INCOMPLETE);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_incomplete_latency, dur);

  ps->blocked_by.clear();
}
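// GetMissing: with the authoritative log in place, ask each remaining
// acting/backfill peer whose log may diverge for its log+missing (or
// full log), so recovery knows exactly what every shard lacks.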
/*------GetMissing--------*/
PeeringState::GetMissing::GetMissing(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->log_weirdness();
  ceph_assert(!ps->acting_recovery_backfill.empty());
  eversion_t since;
  for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
       i != ps->acting_recovery_backfill.end();
       ++i) {
    if (*i == ps->get_primary()) continue;
    const pg_info_t& pi = ps->peer_info[*i];
    // reset this so to make sure the pg_missing_t is initialized and
    // has the correct semantics even if we don't need to get a
    // missing set from a shard. This way later additions due to
    // lost+unfound delete work properly.
    ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();

    if (pi.is_empty())
      continue;                                // no pg data, nothing divergent

    if (pi.last_update < ps->pg_log.get_tail()) {
      psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }
    if (pi.last_backfill == hobject_t()) {
      psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }

    if (pi.last_update == pi.last_complete &&  // peer has no missing
        pi.last_update == ps->info.last_update) {  // peer is up to date
      // replica has no missing and identical log as us.  no need to
      // pull anything.
      // FIXME: we can do better here.  if last_update==last_complete we
      //        can infer the rest!
      psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }

    // We pull the log from the peer's last_epoch_started to ensure we
    // get enough log to detect divergent updates.
    since.epoch = pi.last_epoch_started;
    ceph_assert(pi.last_update >= ps->info.log_tail);  // or else choose_acting() did a bad thing
    if (pi.log_tail <= since) {
      psdout(10) << " requesting log+missing since " << since
                 << " from osd." << *i << dendl;
      context< PeeringMachine >().send_query(
        i->osd,
        pg_query_t(
          pg_query_t::LOG,
          i->shard, ps->pg_whoami.shard,
          since, ps->info.history,
          ps->get_osdmap_epoch()));
    } else {
      psdout(10) << " requesting fulllog+missing from osd." << *i
                 << " (want since " << since << " < log.tail "
                 << pi.log_tail << ")" << dendl;
      context< PeeringMachine >().send_query(
        i->osd,
        pg_query_t(
          pg_query_t::FULLLOG,
          i->shard, ps->pg_whoami.shard,
          ps->info.history, ps->get_osdmap_epoch()));
    }
    peer_missing_requested.insert(*i);
    ps->blocked_by.insert(i->osd);
  }

  if (peer_missing_requested.empty()) {
    if (ps->need_up_thru) {
      psdout(10) << " still need up_thru update before going active"
                 << dendl;
      post_event(NeedUpThru());
      return;
    }

    post_event(Activate(ps->get_osdmap_epoch()));
  } else {
    pl->publish_stats_to_osd();
  }
}
boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;

  peer_missing_requested.erase(logevt.from);
  ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);

  if (peer_missing_requested.empty()) {
    if (ps->need_up_thru) {
      psdout(10) << " still need up_thru update before going active"
                 << dendl;
      post_event(NeedUpThru());
    } else {
      psdout(10) << "Got last missing, don't need missing "
                 << "posting Activate" << dendl;
      post_event(Activate(ps->get_osdmap_epoch()));
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
{
  DECLARE_LOCALS;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("peer_missing_requested");
  for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
       p != peer_missing_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_missing.count(*p)) {
      q.f->open_object_section("got_missing");
      ps->peer_missing[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PeeringState::GetMissing::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
  ps->blocked_by.clear();
}
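// WaitUpThru: activation is deferred until the osdmap records an
// up_thru for this OSD that is at least as new as this interval, so a
// later peering attempt cannot mistakenly ignore writes made here.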
/*------WaitUpThru--------*/
PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
{
  DECLARE_LOCALS;
  if (!ps->need_up_thru) {
    post_event(Activate(ps->get_osdmap_epoch()));
  }
  return forward_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "Noting missing from osd." << logevt.from << dendl;
  ps->peer_missing[logevt.from].claim(logevt.msg->missing);
  ps->peer_info[logevt.from] = logevt.msg->info;
  return discard_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
  q.f->close_section();
  return forward_event();
}

void PeeringState::WaitUpThru::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
}
/*----PeeringState::PeeringMachine Methods-----*/
#undef dout_prefix
#define dout_prefix dpp->gen_prefix(*_dout)

void PeeringState::PeeringMachine::log_enter(const char *state_name)
{
  psdout(5) << "enter " << state_name << dendl;
  pl->log_state_enter(state_name);
}

void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
{
  utime_t dur = ceph_clock_now() - enter_time;
  psdout(5) << "exit " << state_name << " " << dur << " " << event_count
            << " " << event_time << dendl;
  pl->log_state_exit(state_name, enter_time, event_count, event_time);
  event_count = 0;
  event_time = utime_t();
}
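// Debug helper: prints a compact one-line summary of the PeeringState
// (up/acting sets, role, past intervals, log bounds, last-complete
// markers and the current PG state flags) for dout logging.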
ostream &operator<<(ostream &out, const PeeringState &ps) {
  out << "pg[" << ps.info
      << " " << pg_vector_string(ps.up);
  if (ps.acting != ps.up)
    out << "/" << pg_vector_string(ps.acting);
  out << "p" << ps.get_primary();
  if (!ps.async_recovery_targets.empty())
    out << " async=[" << ps.async_recovery_targets << "]";
  if (!ps.backfill_targets.empty())
    out << " backfill=[" << ps.backfill_targets << "]";
  out << " r=" << ps.get_role();
  out << " lpr=" << ps.get_last_peering_reset();

  if (!ps.past_intervals.empty()) {
    out << " pi=[" << ps.past_intervals.get_bounds()
        << ")/" << ps.past_intervals.size();
  }

  if (ps.is_peered()) {
    if (ps.last_update_ondisk != ps.info.last_update)
      out << " luod=" << ps.last_update_ondisk;
    if (ps.last_update_applied != ps.info.last_update)
      out << " lua=" << ps.last_update_applied;
  }

  if (ps.pg_log.get_tail() != ps.info.log_tail ||
      ps.pg_log.get_head() != ps.info.last_update)
    out << " (info mismatch, " << ps.pg_log.get_log() << ")";

  if (!ps.pg_log.get_log().empty()) {
    if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
      out << " (log bound mismatch, actual=["
          << ps.pg_log.get_log().log.begin()->version << ","
          << ps.pg_log.get_log().log.rbegin()->version << "]";
    }
  }

  out << " crt=" << ps.pg_log.get_can_rollback_to();

  if (ps.last_complete_ondisk != ps.info.last_complete)
    out << " lcod " << ps.last_complete_ondisk;

  out << " mlcod " << ps.min_last_complete_ondisk;

  out << " " << pg_state_string(ps.get_state());
  if (ps.should_send_notify())
    out << " NOTIFY";

  if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
    out << " pruub " << ps.prior_readable_until_ub
        << "@" << ps.get_prior_readable_down_osds();