1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
21 #include "common/errno.h"
22 #include "common/config.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
71 #define tracepoint(...)
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
79 #define dout_prefix _prefix(_dout, this)
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
// Omap key names under which PG metadata is persisted in the pgmeta object.
// Each is prefixed with '_' (see the comment above) so that
// PGLog::read_log_and_missing() can distinguish them from log-entry keys.
// NOTE(review): the stray numeric tokens (83, 84, ...) in this chunk are
// original line numbers fused in by a broken extraction; many interior
// lines are missing -- confirm against the pristine file.
83 const string
infover_key("_infover");
84 const string
info_key("_info");
85 const string
biginfo_key("_biginfo");
86 const string
epoch_key("_epoch");
87 const string
fastinfo_key("_fastinfo");
// Logging prefix helper used by the dout_prefix macro above: delegates to the
// object's gen_prefix().  NOTE(review): the `template <typename T>` header and
// the function braces are not visible in this extraction -- confirm upstream.
90 static ostream
& _prefix(std::ostream
*_dout
, T
*t
)
92 return *_dout
<< t
->gen_prefix();
// Registers PG::CephPeeringEvt with the 'osd' mempool so its allocations are
// accounted under the pg_peering_evt tag.
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt
, pg_peering_evt
, osd
);
// Record entry into a peering state machine state at time `entime`.
// Trimming states are ignored; if a current PGStateInstance (pi) exists the
// state is appended to it, otherwise a temporary instance (tmppi) is
// (lazily) created and used, since the PG lock cannot reliably be taken here.
// NOTE(review): several interior lines (braces, else branches) are missing
// from this extraction -- verify control flow against the pristine file.
97 void PGStateHistory::enter(PG
* pg
, const utime_t entime
, const char* state
)
99 // Ignore trimming state machine for now
100 if (::strstr(state
, "Trimming") != NULL
) {
102 } else if (pi
!= nullptr) {
103 pi
->enter_state(entime
, state
);
105 // Store current state since we can't reliably take the PG lock here
106 if ( tmppi
== nullptr) {
107 tmppi
= std::unique_ptr
<PGStateInstance
>(new PGStateInstance
);
111 tmppi
->enter_state(entime
, state
);
// Record exit from a state: moves the pending tmppi instance into `buffer`,
// stamps it with the current osdmap epoch and exit time.  No-op for Trimming
// states or while the PG is being destroyed.  A "Reset" exit appears to get
// special handling (body not visible here).
// NOTE(review): lines handling `ilocked` locking/unlocking and the Reset
// branch body are missing from this extraction -- confirm upstream.
115 void PGStateHistory::exit(const char* state
) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state
, "Trimming") != NULL
|| pg_in_destructor
) {
121 bool ilocked
= false;
122 if(!thispg
->is_locked()) {
127 buffer
.push_back(std::unique_ptr
<PGStateInstance
>(tmppi
.release()));
128 pi
= buffer
.back().get();
129 pi
->setepoch(thispg
->get_osdmap()->get_epoch());
132 pi
->exit_state(ceph_clock_now());
133 if (::strcmp(state
, "Reset") == 0) {
// Dump the recorded state history to a Formatter as an array of
// {epoch, [state, enter-time, exit-time]...} objects.  The state_history
// tuple layout visible here is <enter, exit, name> (std::get<0/1/2>).
142 void PGStateHistory::dump(Formatter
* f
) const {
143 f
->open_array_section("history");
144 for (auto pi
= buffer
.begin(); pi
!= buffer
.end(); ++pi
) {
145 f
->open_object_section("states");
146 f
->dump_stream("epoch") << (*pi
)->this_epoch
;
147 for (auto she
: (*pi
)->state_history
) {
148 f
->dump_string("state", std::get
<2>(she
));
149 f
->dump_stream("enter") << std::get
<0>(she
);
150 f
->dump_stream("exit") << std::get
<1>(she
);
// Tagged reference counting (debug builds): get() presumably bumps the ref
// and the per-tag count under _ref_id_lock (the increment lines are missing
// from this extraction); put() decrements the tag's count and erases the tag
// entry when it reaches zero.
157 void PG::get(const char* tag
)
161 Mutex::Locker
l(_ref_id_lock
);
166 void PG::put(const char* tag
)
170 Mutex::Locker
l(_ref_id_lock
);
171 auto tag_counts_entry
= _tag_counts
.find(tag
);
// A put() without a matching get() is a bug.
172 assert(tag_counts_entry
!= _tag_counts
.end());
173 --tag_counts_entry
->second
;
174 if (tag_counts_entry
->second
== 0) {
175 _tag_counts
.erase(tag_counts_entry
);
// Id-tracked reference counting (debug builds): get_with_id() allocates a
// fresh id under _ref_id_lock and records it (apparently with a backtrace
// string `ss` -- the lines building ss are missing here); put_with_id()
// asserts the id is live before releasing it.
184 uint64_t PG::get_with_id()
187 Mutex::Locker
l(_ref_id_lock
);
188 uint64_t id
= ++_ref_id
;
192 dout(20) << __func__
<< ": " << info
.pgid
<< " got id " << id
<< " (new) ref==" << ref
<< dendl
;
193 assert(!_live_ids
.count(id
));
194 _live_ids
.insert(make_pair(id
, ss
.str()));
198 void PG::put_with_id(uint64_t id
)
200 dout(20) << __func__
<< ": " << info
.pgid
<< " put id " << id
<< " (current) ref==" << ref
<< dendl
;
202 Mutex::Locker
l(_ref_id_lock
);
203 assert(_live_ids
.count(id
));
// Debug aid: log every outstanding ref id (with its recorded string) and
// every outstanding tag count at level 0, under _ref_id_lock.
210 void PG::dump_live_ids()
212 Mutex::Locker
l(_ref_id_lock
);
213 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live ids:" << dendl
;
214 for (map
<uint64_t, string
>::iterator i
= _live_ids
.begin();
215 i
!= _live_ids
.end();
217 dout(0) << "\t\tid: " << *i
<< dendl
;
219 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live tags:" << dendl
;
220 for (map
<string
, uint64_t>::iterator i
= _tag_counts
.begin();
221 i
!= _tag_counts
.end();
223 dout(0) << "\t\tid: " << *i
<< dendl
;
// Refresh this PGPool's cached view (name, snap context, removed-snaps sets)
// from a new OSDMap.  If we skipped a map epoch, or this map changed the
// pool's snaps, rebuild newly_removed_snaps from scratch and diff it against
// cached_removed_snaps; otherwise the long comment below justifies simply
// clearing newly_removed_snaps.
// NOTE(review): several interior lines (braces, else keywords, the `updated`
// assignment) are missing from this extraction -- verify upstream.
229 void PGPool::update(OSDMapRef map
)
231 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
235 name
= map
->get_pool_name(id
);
236 bool updated
= false;
237 if ((map
->get_epoch() != cached_epoch
+ 1) ||
238 (pi
->get_snap_epoch() == map
->get_epoch())) {
240 if (pi
->maybe_updated_removed_snaps(cached_removed_snaps
)) {
241 pi
->build_removed_snaps(newly_removed_snaps
);
242 if (cached_removed_snaps
.subset_of(newly_removed_snaps
)) {
// Normal case: removed snaps only grew; keep just the delta in
// newly_removed_snaps and fold the full set into the cache.
243 interval_set
<snapid_t
> removed_snaps
= newly_removed_snaps
;
244 newly_removed_snaps
.subtract(cached_removed_snaps
);
245 cached_removed_snaps
.swap(removed_snaps
);
// Unexpected: the removed-snaps set shrank; log loudly and resync.
247 lgeneric_subdout(cct
, osd
, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps
<< dendl
;
250 cached_removed_snaps
.swap(newly_removed_snaps
);
251 newly_removed_snaps
.clear();
254 newly_removed_snaps
.clear();
255 snapc
= pi
->get_snap_context();
257 /* 1) map->get_epoch() == cached_epoch + 1 &&
258 * 2) pi->get_snap_epoch() != map->get_epoch()
260 * From the if branch, 1 && 2 must be true. From 2, we know that
261 * this map didn't change the set of removed snaps. From 1, we
262 * know that our cached_removed_snaps matches the previous map.
263 * Thus, from 1 && 2, cached_removed snaps matches the current
264 * set of removed snaps and all we have to do is clear
265 * newly_removed_snaps.
267 newly_removed_snaps
.clear();
269 cached_epoch
= map
->get_epoch();
270 lgeneric_subdout(cct
, osd
, 20)
271 << "PGPool::update cached_removed_snaps "
272 << cached_removed_snaps
273 << " newly_removed_snaps "
274 << newly_removed_snaps
275 << " snapc " << snapc
276 << (updated
? " (updated)":" (no change)")
// PG constructor: wires the PG to its OSDService, osdmap, pool and spg_t,
// initializing locks, flags, the per-PG sequencer (osr) and feature masks in
// the member-init list, then registers the pgid with the OSD and names the
// trace endpoint "PG <pgid>".
// NOTE(review): a large number of initializer-list entries and the
// constructor body braces are missing from this extraction; the trailing
// fragment below (set_pg_in_destructor / remove_pgid) belongs to the
// destructor, whose signature line is also missing -- verify upstream.
280 PG::PG(OSDService
*o
, OSDMapRef curmap
,
281 const PGPool
&_pool
, spg_t p
) :
284 osdriver(osd
->store
, coll_t(), OSD::make_snapmapper_oid()),
289 p
.get_split_bits(curmap
->get_pg_num(_pool
.id
)),
292 osdmap_ref(curmap
), last_persisted_osdmap_ref(curmap
), pool(_pool
),
295 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
298 trace_endpoint("0.0.0.0", 0, "PG"),
299 dirty_info(false), dirty_big_info(false),
304 pgmeta_oid(p
.make_pgmeta_oid()),
307 curmap
->get_pools().at(p
.pgid
.pool()).ec_pool(),
309 stat_queue_item(this),
311 recovery_queued(false),
312 recovery_ops_active(0),
316 pg_whoami(osd
->whoami
, p
.shard
),
318 last_peering_reset(0),
319 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
320 backfill_reserved(false),
321 backfill_reserving(false),
322 flushes_in_progress(0),
323 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
324 pg_stats_publish_valid(false),
325 osr(osd
->osr_registry
.lookup_or_create(p
, (stringify(p
)))),
326 finish_sync_event(NULL
),
327 backoff_lock("PG::backoff_lock"),
328 scrub_after_recovery(false),
330 recovery_state(this),
332 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
333 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
334 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
338 osd
->add_pgid(p
, this);
341 std::stringstream ss
;
342 ss
<< "PG " << info
.pgid
;
343 trace_endpoint
.copy_name(ss
.str());
// --- destructor body fragment (signature line not visible) ---
// Tells the state history not to record further transitions, then
// deregisters the pgid from the OSD.
350 pgstate_history
.set_pg_in_destructor();
352 osd
->remove_pgid(info
.pgid
, this);
// lock_suspend_timeout: suspend the thread-pool heartbeat timeout while
// (presumably) taking the PG lock, then re-arm it; prevents a long lock wait
// from tripping the TP watchdog.  The lock() call itself is not visible here.
356 void PG::lock_suspend_timeout(ThreadPool::TPHandle
&handle
)
358 handle
.suspend_tp_timeout();
360 handle
.reset_tp_timeout();
// lock(): take _lock (optionally bypassing lockdep) and assert that no dirty
// info flags were left set while the lock was dropped.
363 void PG::lock(bool no_lockdep
) const
365 _lock
.Lock(no_lockdep
);
366 // if we have unrecorded dirty state with the lock dropped, there is a bug
368 assert(!dirty_big_info
);
370 dout(30) << "lock" << dendl
;
// Build the per-PG log prefix "osd.N pg_epoch: E ...".  If the caller holds
// the PG lock the full PG operator<< form is used; otherwise only the pgid is
// printed with an "(unlocked)" marker, since other state can't be read safely.
373 std::string
PG::gen_prefix() const
376 OSDMapRef mapref
= osdmap_ref
;
377 if (_lock
.is_locked_by_me()) {
378 out
<< "osd." << osd
->whoami
379 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
380 << " " << *this << " ";
382 out
<< "osd." << osd
->whoami
383 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
384 << " pg[" << info
.pgid
<< "(unlocked)] ";
389 /********* PG **********/
// Absorb the authoritative log from peer `from` while building the master
// log during peering: merge its log, record its info/missing, note it as a
// possible source of unfound objects, and advance our last_epoch_started /
// last_interval_started from its info (see
// doc/dev/osd_internals/last_epoch_started).
391 void PG::proc_master_log(
392 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
393 pg_log_t
&olog
, pg_missing_t
& omissing
, pg_shard_t from
)
395 dout(10) << "proc_master_log for osd." << from
<< ": "
396 << olog
<< " " << omissing
<< dendl
;
// Only valid while the primary is still peering.
397 assert(!is_peered() && is_primary());
399 // merge log into our own log to build master log. no need to
400 // make any adjustments to their missing map; we are taking their
401 // log to be authoritative (i.e., their entries are by definitely
403 merge_log(t
, oinfo
, olog
, from
);
404 peer_info
[from
] = oinfo
;
405 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
406 might_have_unfound
.insert(from
);
408 // See doc/dev/osd_internals/last_epoch_started
409 if (oinfo
.last_epoch_started
> info
.last_epoch_started
) {
410 info
.last_epoch_started
= oinfo
.last_epoch_started
;
413 if (oinfo
.last_interval_started
> info
.last_interval_started
) {
414 info
.last_interval_started
= oinfo
.last_interval_started
;
417 update_history(oinfo
.history
);
418 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
419 info
.last_epoch_started
>= info
.history
.last_epoch_started
);
// Take ownership of the peer's missing set.
421 peer_missing
[from
].claim(omissing
);
// Process a replica's log/info/missing: delegate divergence handling to
// pg_log.proc_replica_log(), then record the peer's info and (adjusted)
// missing set and mark it as a potential unfound-object source.
// NOTE(review): parts of the signature (transaction/info/from parameters) are
// missing from this extraction -- confirm upstream.
424 void PG::proc_replica_log(
426 const pg_log_t
&olog
,
427 pg_missing_t
& omissing
,
430 dout(10) << "proc_replica_log for osd." << from
<< ": "
431 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
433 pg_log
.proc_replica_log(oinfo
, olog
, omissing
, from
);
435 peer_info
[from
] = oinfo
;
436 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
437 might_have_unfound
.insert(from
);
// Debug dump of the (possibly adjusted) missing items.
439 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
440 omissing
.get_items().begin();
441 i
!= omissing
.get_items().end();
443 dout(20) << " after missing " << i
->first
<< " need " << i
->second
.need
444 << " have " << i
->second
.have
<< dendl
;
446 peer_missing
[from
].claim(omissing
);
// Record a pg_info_t received from peer `from`.  Duplicate infos (identical
// last_update) and infos from OSDs not continuously up since `send_epoch`
// are discarded.  Otherwise store the info, track the peer as a possible
// unfound source, fold its history in, flag stray peers, and refresh
// heartbeat peers if the info was new.  Return values are not visible in
// this extraction, but the early-out branches clearly reject the info.
449 bool PG::proc_replica_info(
450 pg_shard_t from
, const pg_info_t
&oinfo
, epoch_t send_epoch
)
452 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.find(from
);
453 if (p
!= peer_info
.end() && p
->second
.last_update
== oinfo
.last_update
) {
454 dout(10) << " got dup osd." << from
<< " info " << oinfo
<< ", identical to ours" << dendl
;
// Stale message from an OSD that has restarted/gone down: ignore.
458 if (!get_osdmap()->has_been_up_since(from
.osd
, send_epoch
)) {
459 dout(10) << " got info " << oinfo
<< " from down osd." << from
460 << " discarding" << dendl
;
464 dout(10) << " got osd." << from
<< " " << oinfo
<< dendl
;
465 assert(is_primary());
466 peer_info
[from
] = oinfo
;
467 might_have_unfound
.insert(from
);
469 update_history(oinfo
.history
);
// A peer that is neither up nor acting holds stray data to clean up later.
472 if (!is_up(from
) && !is_acting(from
)) {
473 dout(10) << " osd." << from
<< " has stray content: " << oinfo
<< dendl
;
474 stray_set
.insert(from
);
480 // was this a new info? if so, update peers!
481 if (p
== peer_info
.end())
482 update_heartbeat_peers();
// remove_snap_mapped_object: remove an object (presumably t.remove() on the
// ghobject -- the call itself is not visible) and clear its snap mapping.
487 void PG::remove_snap_mapped_object(
488 ObjectStore::Transaction
&t
, const hobject_t
&soid
)
492 ghobject_t(soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
));
493 clear_object_snap_mapping(&t
, soid
);
// clear_object_snap_mapping: drop the SnapMapper entry for a snapped object.
// ENOENT is tolerated (object may have no mapping); other errors are logged.
496 void PG::clear_object_snap_mapping(
497 ObjectStore::Transaction
*t
, const hobject_t
&soid
)
499 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
// Only clones (snap < CEPH_MAXSNAP) have snap mappings.
500 if (soid
.snap
< CEPH_MAXSNAP
) {
501 int r
= snap_mapper
.remove_oid(
504 if (!(r
== 0 || r
== -ENOENT
)) {
505 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
// update_object_snap_mapping: replace the mapping with `snaps` -- remove the
// old entry first (the add step is not visible in this extraction).
511 void PG::update_object_snap_mapping(
512 ObjectStore::Transaction
*t
, const hobject_t
&soid
, const set
<snapid_t
> &snaps
)
514 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
515 assert(soid
.snap
< CEPH_MAXSNAP
);
516 int r
= snap_mapper
.remove_oid(
519 if (!(r
== 0 || r
== -ENOENT
)) {
520 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
// --- merge_log fragment: the signature line is missing from this extraction;
// from the parameter fragment and the pg_log call pattern this is the wrapper
// that feeds a PGLogEntryHandler rollbacker into pg_log's merge -- confirm
// against the pristine file. ---
530 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
, pg_shard_t from
)
532 PGLogEntryHandler rollbacker
{this, &t
};
534 oinfo
, olog
, from
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
// rewind_divergent_log: roll the local log back to `newhead`, using the
// rollbacker to undo divergent entries; updates dirty flags as needed.
537 void PG::rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
)
539 PGLogEntryHandler rollbacker
{this, &t
};
540 pg_log
.rewind_divergent_log(
541 newhead
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
// Doc comment for search_for_missing (opening /** line not visible):
545 * Process information from a replica to determine if it could have any
546 * objects that i need.
548 * TODO: if the missing set becomes very large, this could get expensive.
549 * Instead, we probably want to just iterate over our unfound set.
551 bool PG::search_for_missing(
552 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
// If this peer supplied a new location for something unfound, republish
// stats so the unfound count visible to clients/mon is updated.
556 uint64_t num_unfound_before
= missing_loc
.num_unfound();
557 bool found_missing
= missing_loc
.add_source_info(
558 from
, oinfo
, omissing
, ctx
->handle
);
559 if (found_missing
&& num_unfound_before
!= missing_loc
.num_unfound())
560 publish_stats_to_osd();
561 // avoid doing this if the peer is empty. This is abit of paranoia
562 // to avoid doing something rash if add_source_info() above
563 // incorrectly decided we found something new. (if the peer has
564 // last_update=0'0 that's impossible.)
// EC-aware notify back to the peer (shard fixed up to our own) -- parts of
// this condition and the pushed message are missing from this extraction.
566 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
567 CEPH_FEATURE_OSD_ERASURE_CODES
) &&
568 oinfo
.last_update
!= eversion_t()) {
569 pg_info_t
tinfo(oinfo
);
570 tinfo
.pgid
.shard
= pg_whoami
.shard
;
571 (*(ctx
->info_map
))[from
.osd
].push_back(
574 from
.shard
, pg_whoami
.shard
,
575 get_osdmap()->get_epoch(),
576 get_osdmap()->get_epoch(),
580 return found_missing
;
// Whether object `hoid` is readable given the current acting set: objects
// that need no recovery (or are deleted) short-circuit; otherwise intersect
// the known locations with `acting` and ask the backend's is_readable
// predicate whether that subset suffices (e.g. enough EC shards).
586 bool PG::MissingLoc::readable_with_acting(
587 const hobject_t
&hoid
,
588 const set
<pg_shard_t
> &acting
) const {
589 if (!needs_recovery(hoid
))
591 if (is_deleted(hoid
))
593 auto missing_loc_entry
= missing_loc
.find(hoid
);
594 if (missing_loc_entry
== missing_loc
.end())
596 const set
<pg_shard_t
> &locs
= missing_loc_entry
->second
;
597 ldout(pg
->cct
, 10) << __func__
<< ": locs:" << locs
<< dendl
;
598 set
<pg_shard_t
> have_acting
;
599 for (set
<pg_shard_t
>::const_iterator i
= locs
.begin();
602 if (acting
.count(*i
))
603 have_acting
.insert(*i
);
605 return (*is_readable
)(have_acting
);
// Register a whole set of shards as holding every object in
// needs_recovery_map at once (used when peers are known complete), keeping
// the per-location counts (_dec_count/_inc_count) consistent and
// periodically resetting the thread-pool timeout on large maps.
608 void PG::MissingLoc::add_batch_sources_info(
609 const set
<pg_shard_t
> &sources
, ThreadPool::TPHandle
* handle
)
611 ldout(pg
->cct
, 10) << __func__
<< ": adding sources in batch "
612 << sources
.size() << dendl
;
614 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= needs_recovery_map
.begin();
615 i
!= needs_recovery_map
.end();
// Heartbeat: don't trip the TP watchdog on huge missing maps.
617 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
618 handle
->reset_tp_timeout();
// Deletes need no source.
621 if (i
->second
.is_delete())
624 auto p
= missing_loc
.find(i
->first
);
625 if (p
== missing_loc
.end()) {
626 p
= missing_loc
.emplace(i
->first
, set
<pg_shard_t
>()).first
;
628 _dec_count(p
->second
);
630 missing_loc
[i
->first
].insert(sources
.begin(), sources
.end());
631 missing_loc_sources
.insert(sources
.begin(), sources
.end());
632 _inc_count(p
->second
);
// Walk needs_recovery_map and record `fromosd` as a location for every
// object this peer demonstrably has: skipping deletes, objects the peer's
// log doesn't reach (last_update < need), objects past/incompatible with its
// last_backfill, and objects it is itself missing.  Returns whether any new
// location was found.  NOTE(review): the continue statements and loop
// increments are among the lines missing from this extraction.
637 bool PG::MissingLoc::add_source_info(
639 const pg_info_t
&oinfo
,
640 const pg_missing_t
&omissing
,
641 ThreadPool::TPHandle
* handle
)
643 bool found_missing
= false;
646 for (map
<hobject_t
,pg_missing_item
>::const_iterator p
= needs_recovery_map
.begin();
647 p
!= needs_recovery_map
.end();
649 const hobject_t
&soid(p
->first
);
650 eversion_t need
= p
->second
.need
;
// Heartbeat: avoid TP watchdog timeouts on big maps.
651 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
652 handle
->reset_tp_timeout();
655 if (p
->second
.is_delete()) {
656 ldout(pg
->cct
, 10) << __func__
<< " " << soid
657 << " delete, ignoring source" << dendl
;
// Peer's log doesn't include the needed version.
660 if (oinfo
.last_update
< need
) {
661 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
662 << " also missing on osd." << fromosd
663 << " (last_update " << oinfo
.last_update
664 << " < needed " << need
<< ")" << dendl
;
// Peer backfilled with the legacy (non-bitwise) sort order: its
// last_backfill bound can't be compared, so don't trust it.
667 if (!oinfo
.last_backfill
.is_max() &&
668 !oinfo
.last_backfill_bitwise
) {
669 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
670 << " also missing on osd." << fromosd
671 << " (last_backfill " << oinfo
.last_backfill
672 << " but with wrong sort order)"
676 if (p
->first
>= oinfo
.last_backfill
) {
677 // FIXME: this is _probably_ true, although it could conceivably
678 // be in the undefined region! Hmm!
679 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
680 << " also missing on osd." << fromosd
681 << " (past last_backfill " << oinfo
.last_backfill
// Peer is missing it too.
685 if (omissing
.is_missing(soid
)) {
686 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
687 << " also missing on osd." << fromosd
<< dendl
;
// Peer has it: record the location and bump counts.
691 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
692 << " is on osd." << fromosd
<< dendl
;
694 missing_loc_sources
.insert(fromosd
);
696 auto p
= missing_loc
.find(soid
);
697 if (p
== missing_loc
.end()) {
698 p
= missing_loc
.emplace(soid
, set
<pg_shard_t
>()).first
;
700 _dec_count(p
->second
);
702 p
->second
.insert(fromosd
);
703 _inc_count(p
->second
);
706 found_missing
= true;
709 ldout(pg
->cct
, 20) << "needs_recovery_map missing " << needs_recovery_map
711 return found_missing
;
// Prune recovery sources that went down in the new osdmap: collect them in
// now_down, drop them from missing_loc_sources, then sweep missing_loc
// removing the downed shards from every object's location set (erasing
// entries left empty) while keeping _dec_count/_inc_count consistent.
714 void PG::MissingLoc::check_recovery_sources(const OSDMapRef
& osdmap
)
716 set
<pg_shard_t
> now_down
;
717 for (set
<pg_shard_t
>::iterator p
= missing_loc_sources
.begin();
718 p
!= missing_loc_sources
.end();
// Still up: keep as a source (loop advance lines not visible here).
720 if (osdmap
->is_up(p
->osd
)) {
724 ldout(pg
->cct
, 10) << __func__
<< " source osd." << *p
<< " now down" << dendl
;
726 missing_loc_sources
.erase(p
++);
729 if (now_down
.empty()) {
730 ldout(pg
->cct
, 10) << __func__
<< " no source osds (" << missing_loc_sources
<< ") went down" << dendl
;
732 ldout(pg
->cct
, 10) << __func__
<< " sources osds " << now_down
<< " now down, remaining sources are "
733 << missing_loc_sources
<< dendl
;
735 // filter missing_loc
736 map
<hobject_t
, set
<pg_shard_t
>>::iterator p
= missing_loc
.begin();
737 while (p
!= missing_loc
.end()) {
738 set
<pg_shard_t
>::iterator q
= p
->second
.begin();
739 bool changed
= false;
740 while (q
!= p
->second
.end()) {
741 if (now_down
.count(*q
)) {
// First removal for this object: retract its old count bucket.
744 _dec_count(p
->second
);
746 p
->second
.erase(q
++);
// Object lost its last known location: drop the entry entirely.
751 if (p
->second
.empty()) {
752 missing_loc
.erase(p
++);
755 _inc_count(p
->second
);
// Queue pg_query_t FULLLOG-style queries (exact query type not visible in
// this extraction) to every peer in might_have_unfound that is up, non-empty
// and not already queried, so their missing sets can locate unfound objects.
763 void PG::discover_all_missing(map
<int, map
<spg_t
,pg_query_t
> > &query_map
)
765 auto &missing
= pg_log
.get_missing();
766 uint64_t unfound
= get_num_unfound();
768 dout(10) << __func__
<< " "
769 << missing
.num_missing() << " missing, "
770 << unfound
<< " unfound"
773 std::set
<pg_shard_t
>::const_iterator m
= might_have_unfound
.begin();
774 std::set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
775 for (; m
!= mend
; ++m
) {
// Skip peers we can't reach.
778 if (!get_osdmap()->is_up(peer
.osd
)) {
779 dout(20) << __func__
<< " skipping down osd." << peer
<< dendl
;
783 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(peer
);
784 if (iter
!= peer_info
.end() &&
785 (iter
->second
.is_empty() || iter
->second
.dne())) {
786 // ignore empty peers
790 // If we've requested any of this stuff, the pg_missing_t information
791 // should be on its way.
792 // TODO: coalsce requested_* into a single data structure
793 if (peer_missing
.find(peer
) != peer_missing
.end()) {
794 dout(20) << __func__
<< ": osd." << peer
795 << ": we already have pg_missing_t" << dendl
;
798 if (peer_log_requested
.find(peer
) != peer_log_requested
.end()) {
799 dout(20) << __func__
<< ": osd." << peer
800 << ": in peer_log_requested" << dendl
;
803 if (peer_missing_requested
.find(peer
) != peer_missing_requested
.end()) {
804 dout(20) << __func__
<< ": osd." << peer
805 << ": in peer_missing_requested" << dendl
;
// Not yet asked: record the request and queue the query.
810 dout(10) << __func__
<< ": osd." << peer
<< ": requesting pg_missing_t"
812 peer_missing_requested
.insert(peer
);
813 query_map
[peer
.osd
][spg_t(info
.pgid
.pgid
, peer
.shard
)] =
816 peer
.shard
, pg_whoami
.shard
,
817 info
.history
, get_osdmap()->get_epoch());
821 /******* PG ***********/
// Primary-only check: true if the primary itself, or any non-primary member
// of actingbackfill, still has missing objects (a peer with no recorded
// missing set also counts -- we can't prove it is recovered).
822 bool PG::needs_recovery() const
824 assert(is_primary());
826 auto &missing
= pg_log
.get_missing();
828 if (missing
.num_missing()) {
829 dout(10) << __func__
<< " primary has " << missing
.num_missing()
830 << " missing" << dendl
;
834 assert(!actingbackfill
.empty());
835 set
<pg_shard_t
>::const_iterator end
= actingbackfill
.end();
836 set
<pg_shard_t
>::const_iterator a
= actingbackfill
.begin();
837 for (; a
!= end
; ++a
) {
838 if (*a
== get_primary()) continue;
839 pg_shard_t peer
= *a
;
840 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
841 if (pm
== peer_missing
.end()) {
842 dout(10) << __func__
<< " osd." << peer
<< " doesn't have missing set"
846 if (pm
->second
.num_missing()) {
847 dout(10) << __func__
<< " osd." << peer
<< " has "
848 << pm
->second
.num_missing() << " missing" << dendl
;
853 dout(10) << __func__
<< " is recovered" << dendl
;
// Primary-only check: true if any backfill target's last_backfill has not
// yet reached max (i.e. its copy of the PG is still incomplete).
857 bool PG::needs_backfill() const
859 assert(is_primary());
861 // We can assume that only possible osds that need backfill
862 // are on the backfill_targets vector nodes.
863 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
864 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
865 for (; a
!= end
; ++a
) {
866 pg_shard_t peer
= *a
;
// NOTE(review): pi is used without checking find() succeeded -- relies on
// the invariant that every backfill target has a peer_info entry.
867 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
868 if (!pi
->second
.last_backfill
.is_max()) {
869 dout(10) << __func__
<< " osd." << peer
<< " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
874 dout(10) << __func__
<< " does not need backfill" << dendl
;
// Sanity-check past_intervals against the required bounds derived from the
// superblock's oldest map: empty-required must mean empty-actual (and vice
// versa), the actual start must contain the required start, and the ends
// must match exactly.  Violations are logged to the cluster log and derr,
// then hit an assert (crash) -- these indicate corrupted peering metadata.
879 void PG::check_past_interval_bounds() const
881 auto rpib
= get_required_past_interval_bounds(
883 osd
->get_superblock().oldest_map
);
884 if (rpib
.first
>= rpib
.second
) {
885 if (!past_intervals
.empty()) {
886 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
887 << " empty [" << rpib
<< ") but past_intervals is not: "
889 derr
<< info
.pgid
<< " required past_interval bounds are"
890 << " empty [" << rpib
<< ") but past_intervals is not: "
891 << past_intervals
<< dendl
;
894 if (past_intervals
.empty()) {
895 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
896 << " not empty [" << rpib
<< ") but past_intervals "
897 << past_intervals
<< " is empty";
898 derr
<< info
.pgid
<< " required past_interval bounds are"
899 << " not empty [" << rpib
<< ") but past_intervals "
900 << past_intervals
<< " is empty" << dendl
;
901 assert(!past_intervals
.empty());
904 auto apib
= past_intervals
.get_bounds();
905 if (apib
.first
> rpib
.first
) {
906 osd
->clog
->error() << info
.pgid
<< " past_intervals [" << apib
907 << ") start interval does not contain the required"
908 << " bound [" << rpib
<< ") start";
909 derr
<< info
.pgid
<< " past_intervals [" << apib
910 << ") start interval does not contain the required"
911 << " bound [" << rpib
<< ") start" << dendl
;
912 assert(0 == "past_interval start interval mismatch");
914 if (apib
.second
!= rpib
.second
) {
915 osd
->clog
->error() << info
.pgid
<< " past_interal bound [" << apib
916 << ") end does not match required [" << rpib
918 derr
<< info
.pgid
<< " past_interal bound [" << apib
919 << ") end does not match required [" << rpib
921 assert(0 == "past_interval end mismatch");
// Clear need_up_thru once the osdmap shows our up_thru has reached this
// interval (same_interval_since), i.e. the monitor has recorded us as up.
// NOTE(review): part of the guarding condition (the line before `up_thru >=`)
// is missing from this extraction -- confirm upstream.
926 bool PG::adjust_need_up_thru(const OSDMapRef osdmap
)
928 epoch_t up_thru
= osdmap
->get_up_thru(osd
->whoami
);
930 up_thru
>= info
.history
.same_interval_since
) {
931 dout(10) << "adjust_need_up_thru now " << up_thru
<< ", need_up_thru now false" << dendl
;
932 need_up_thru
= false;
// Drop all peering bookkeeping (peer_info, peer_missing, request trackers)
// for OSDs that the new osdmap reports as down; then refresh heartbeat peers
// (when anything was removed) and re-validate recovery sources.
938 void PG::remove_down_peer_info(const OSDMapRef osdmap
)
940 // Remove any downed osds from peer_info
941 bool removed
= false;
942 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
943 while (p
!= peer_info
.end()) {
944 if (!osdmap
->is_up(p
->first
.osd
)) {
945 dout(10) << " dropping down osd." << p
->first
<< " info " << p
->second
<< dendl
;
946 peer_missing
.erase(p
->first
);
947 peer_log_requested
.erase(p
->first
);
948 peer_missing_requested
.erase(p
->first
);
949 peer_info
.erase(p
++);
955 // if we removed anyone, update peers (which include peer_info)
957 update_heartbeat_peers();
958 check_recovery_sources(osdmap
);
// Doc comment (opening /* line not visible):
962 * Returns true unless there is a non-lost OSD in might_have_unfound.
// Primary-only: scan might_have_unfound; peers already queried
// (peer_missing), empty/dne peers, and nonexistent OSDs are discounted.
// An existing OSD whose lost_at does not exceed its up_from is not marked
// lost, so unfound data might still be retrievable from it.
964 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const
966 assert(is_primary());
968 set
<pg_shard_t
>::const_iterator peer
= might_have_unfound
.begin();
969 set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
970 for (; peer
!= mend
; ++peer
) {
971 if (peer_missing
.count(*peer
))
973 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(*peer
);
974 if (iter
!= peer_info
.end() &&
975 (iter
->second
.is_empty() || iter
->second
.dne()))
977 if (!osdmap
->exists(peer
->osd
))
979 const osd_info_t
&osd_info(osdmap
->get_info(peer
->osd
));
980 if (osd_info
.lost_at
<= osd_info
.up_from
) {
981 // If there is even one OSD in might_have_unfound that isn't lost, we
982 // still might retrieve our unfound.
986 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
987 << " have been queried or are marked lost" << dendl
;
// Build the prior set for peering from past_intervals: asserts no peer
// claims a newer last_epoch_started than our history, then asks
// past_intervals for the PriorSet using a lambda that classifies each OSD in
// the current map as UP / DNE / LOST / DOWN.  Sets PG_STATE_DOWN when the PG
// cannot proceed, decides whether the monitor must record our up_thru
// (need_up_thru), and registers the probe targets.
991 PastIntervals::PriorSet
PG::build_prior()
995 for (map
<pg_shard_t
,pg_info_t
>::iterator it
= peer_info
.begin();
996 it
!= peer_info
.end();
998 assert(info
.history
.last_epoch_started
>= it
->second
.history
.last_epoch_started
);
1002 const OSDMap
&osdmap
= *get_osdmap();
1003 PastIntervals::PriorSet prior
= past_intervals
.get_prior_set(
1004 pool
.info
.ec_pool(),
1005 info
.history
.last_epoch_started
,
1006 get_pgbackend()->get_is_recoverable_predicate(),
// Classifier lambda: reports each osd's status in the *current* map and,
// when it exists, its lost_at epoch via the out-parameter.
1007 [&](epoch_t start
, int osd
, epoch_t
*lost_at
) {
1008 const osd_info_t
*pinfo
= 0;
1009 if (osdmap
.exists(osd
)) {
1010 pinfo
= &osdmap
.get_info(osd
);
1012 *lost_at
= pinfo
->lost_at
;
1015 if (osdmap
.is_up(osd
)) {
1016 return PastIntervals::UP
;
1017 } else if (!pinfo
) {
1018 return PastIntervals::DNE
;
1019 } else if (pinfo
->lost_at
> start
) {
1020 return PastIntervals::LOST
;
1022 return PastIntervals::DOWN
;
1029 if (prior
.pg_down
) {
1030 state_set(PG_STATE_DOWN
);
// If the monitor hasn't recorded us up through this interval yet, peering
// must wait for that (need_up_thru) before going active.
1033 if (get_osdmap()->get_up_thru(osd
->whoami
) < info
.history
.same_interval_since
) {
1034 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
1035 << " < same_since " << info
.history
.same_interval_since
1036 << ", must notify monitor" << dendl
;
1037 need_up_thru
= true;
1039 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
1040 << " >= same_since " << info
.history
.same_interval_since
1041 << ", all is well" << dendl
;
1042 need_up_thru
= false;
1044 set_probe_targets(prior
.probe
);
// Reset all primary-only peering/recovery bookkeeping when this OSD stops
// being (or restarts being) the primary for this PG.
1048 void PG::clear_primary_state()
1050 dout(10) << "clear_primary_state" << dendl
;
1052 // clear peering state
1054 peer_log_requested
.clear();
1055 peer_missing_requested
.clear();
1057 peer_missing
.clear();
1058 need_up_thru
= false;
1059 peer_last_complete_ondisk
.clear();
1060 peer_activated
.clear();
1061 min_last_complete_ondisk
= eversion_t();
1062 pg_trim_to
= eversion_t();
1063 might_have_unfound
.clear();
1064 projected_log
= PGLog::IndexedLog();
1066 last_update_ondisk
= eversion_t();
1070 finish_sync_event
= 0; // so that _finish_recovery doesn't go off in another thread
1072 missing_loc
.clear();
1074 release_pg_backoffs();
1076 pg_log
.reset_recovery_pointers();
1078 scrubber
.reserved_peers
.clear();
1079 scrub_after_recovery
= false;
// Scrubber: default-construct all flags/counters to the idle state.
// NOTE(review): several initializer entries are missing from this extraction.
1084 PG::Scrubber::Scrubber()
1085 : reserved(false), reserve_failed(false),
1088 shallow_errors(0), deep_errors(0), fixed(0),
1089 must_scrub(false), must_deep_scrub(false), must_repair(false),
1091 num_digest_updates_pending(0),
1096 PG::Scrubber::~Scrubber() {}
// Doc comment (opening /** line not visible):
1101 * Returns an iterator to the best info in infos sorted by:
1102 * 1) Prefer newer last_update
1103 * 2) Prefer longer tail if it brings another info into contiguity
1104 * 3) Prefer current primary
// Selects the authoritative log during peering.  Pass 1 computes the max
// last_epoch_started (optionally honouring history les as a bound, reported
// via *history_les_bound); pass 2 derives the minimum acceptable
// last_update; pass 3 picks the best candidate, disqualifying stale or
// incomplete peers and tie-breaking by log tail then current primary.
1106 map
<pg_shard_t
, pg_info_t
>::const_iterator
PG::find_best_info(
1107 const map
<pg_shard_t
, pg_info_t
> &infos
,
1108 bool restrict_to_up_acting
,
1109 bool *history_les_bound
) const
1111 assert(history_les_bound
);
1112 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1113 * to make changes to this process. Also, make sure to update it
1114 * when you find bugs! */
1115 eversion_t min_last_update_acceptable
= eversion_t::max();
1116 epoch_t max_last_epoch_started_found
= 0;
1117 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1120 if (!cct
->_conf
->osd_find_best_info_ignore_history_les
&&
1121 max_last_epoch_started_found
< i
->second
.history
.last_epoch_started
) {
1122 *history_les_bound
= true;
1123 max_last_epoch_started_found
= i
->second
.history
.last_epoch_started
;
1125 if (!i
->second
.is_incomplete() &&
1126 max_last_epoch_started_found
< i
->second
.last_epoch_started
) {
1127 max_last_epoch_started_found
= i
->second
.last_epoch_started
;
1130 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1133 if (max_last_epoch_started_found
<= i
->second
.last_epoch_started
) {
1134 if (min_last_update_acceptable
> i
->second
.last_update
)
1135 min_last_update_acceptable
= i
->second
.last_update
;
// No peer reached the max last_epoch_started: nothing usable.
1138 if (min_last_update_acceptable
== eversion_t::max())
1141 map
<pg_shard_t
, pg_info_t
>::const_iterator best
= infos
.end();
1142 // find osd with newest last_update (oldest for ec_pool).
1143 // if there are multiples, prefer
1144 // - a longer tail, if it brings another peer into log contiguity
1145 // - the current primary
1146 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= infos
.begin();
1149 if (restrict_to_up_acting
&& !is_up(p
->first
) &&
1150 !is_acting(p
->first
))
1152 // Only consider peers with last_update >= min_last_update_acceptable
1153 if (p
->second
.last_update
< min_last_update_acceptable
)
1155 // Disqualify anyone with a too old last_epoch_started
1156 if (p
->second
.last_epoch_started
< max_last_epoch_started_found
)
1158 // Disqualify anyone who is incomplete (not fully backfilled)
1159 if (p
->second
.is_incomplete())
1161 if (best
== infos
.end()) {
1165 // Prefer newer last_update
// For rollback-capable (EC) pools the comparison direction flips: the
// OLDEST last_update wins, since newer divergent entries can be rolled back.
1166 if (pool
.info
.require_rollback()) {
1167 if (p
->second
.last_update
> best
->second
.last_update
)
1169 if (p
->second
.last_update
< best
->second
.last_update
) {
1174 if (p
->second
.last_update
< best
->second
.last_update
)
1176 if (p
->second
.last_update
> best
->second
.last_update
) {
1182 // Prefer longer tail
1183 if (p
->second
.log_tail
> best
->second
.log_tail
) {
1185 } else if (p
->second
.log_tail
< best
->second
.log_tail
) {
1190 // prefer current primary (usually the caller), all things being equal
1191 if (p
->first
== pg_whoami
) {
1192 dout(10) << "calc_acting prefer osd." << p
->first
1193 << " because it is current primary" << dendl
;
1201 void PG::calc_ec_acting(
1202 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1204 const vector
<int> &acting
,
1205 pg_shard_t acting_primary
,
1206 const vector
<int> &up
,
1207 pg_shard_t up_primary
,
1208 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1209 bool restrict_to_up_acting
,
1211 set
<pg_shard_t
> *backfill
,
1212 set
<pg_shard_t
> *acting_backfill
,
1213 pg_shard_t
*want_primary
,
1216 vector
<int> want(size
, CRUSH_ITEM_NONE
);
1217 map
<shard_id_t
, set
<pg_shard_t
> > all_info_by_shard
;
1218 unsigned usable
= 0;
1219 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= all_info
.begin();
1220 i
!= all_info
.end();
1222 all_info_by_shard
[i
->first
.shard
].insert(i
->first
);
1224 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1225 ss
<< "For position " << (unsigned)i
<< ": ";
1226 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
&&
1227 !all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1228 all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.last_update
>=
1229 auth_log_shard
->second
.log_tail
) {
1230 ss
<< " selecting up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
)) << std::endl
;
1235 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
) {
1236 ss
<< " backfilling up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
))
1238 backfill
->insert(pg_shard_t(up
[i
], shard_id_t(i
)));
1241 if (acting
.size() > (unsigned)i
&& acting
[i
] != CRUSH_ITEM_NONE
&&
1242 !all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1243 all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.last_update
>=
1244 auth_log_shard
->second
.log_tail
) {
1245 ss
<< " selecting acting[i]: " << pg_shard_t(acting
[i
], shard_id_t(i
)) << std::endl
;
1246 want
[i
] = acting
[i
];
1248 } else if (!restrict_to_up_acting
) {
1249 for (set
<pg_shard_t
>::iterator j
= all_info_by_shard
[shard_id_t(i
)].begin();
1250 j
!= all_info_by_shard
[shard_id_t(i
)].end();
1252 assert(j
->shard
== i
);
1253 if (!all_info
.find(*j
)->second
.is_incomplete() &&
1254 all_info
.find(*j
)->second
.last_update
>=
1255 auth_log_shard
->second
.log_tail
) {
1256 ss
<< " selecting stray: " << *j
<< std::endl
;
1262 if (want
[i
] == CRUSH_ITEM_NONE
)
1263 ss
<< " failed to fill position " << (int)i
<< std::endl
;
1267 bool found_primary
= false;
1268 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1269 if (want
[i
] != CRUSH_ITEM_NONE
) {
1270 acting_backfill
->insert(pg_shard_t(want
[i
], shard_id_t(i
)));
1271 if (!found_primary
) {
1272 *want_primary
= pg_shard_t(want
[i
], shard_id_t(i
));
1273 found_primary
= true;
1277 acting_backfill
->insert(backfill
->begin(), backfill
->end());
1282 * calculate the desired acting set.
1284 * Choose an appropriate acting set. Prefer up[0], unless it is
1285 * incomplete, or another osd has a longer tail that allows us to
1286 * bring other up nodes up to date.
1288 void PG::calc_replicated_acting(
1289 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1291 const vector
<int> &acting
,
1292 pg_shard_t acting_primary
,
1293 const vector
<int> &up
,
1294 pg_shard_t up_primary
,
1295 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1296 bool restrict_to_up_acting
,
1298 set
<pg_shard_t
> *backfill
,
1299 set
<pg_shard_t
> *acting_backfill
,
1300 pg_shard_t
*want_primary
,
1303 ss
<< "calc_acting newest update on osd." << auth_log_shard
->first
1304 << " with " << auth_log_shard
->second
1305 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
1306 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1309 map
<pg_shard_t
,pg_info_t
>::const_iterator primary
;
1311 !all_info
.find(up_primary
)->second
.is_incomplete() &&
1312 all_info
.find(up_primary
)->second
.last_update
>=
1313 auth_log_shard
->second
.log_tail
) {
1314 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
1315 primary
= all_info
.find(up_primary
); // prefer up[0], all thing being equal
1317 assert(!auth_log_shard
->second
.is_incomplete());
1318 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1319 << " selected as primary instead" << std::endl
;
1320 primary
= auth_log_shard
;
1323 ss
<< "calc_acting primary is osd." << primary
->first
1324 << " with " << primary
->second
<< std::endl
;
1325 *want_primary
= primary
->first
;
1326 want
->push_back(primary
->first
.osd
);
1327 acting_backfill
->insert(primary
->first
);
1328 unsigned usable
= 1;
1330 // select replicas that have log contiguity with primary.
1331 // prefer up, then acting, then any peer_info osds
1332 for (vector
<int>::const_iterator i
= up
.begin();
1335 pg_shard_t up_cand
= pg_shard_t(*i
, shard_id_t::NO_SHARD
);
1336 if (up_cand
== primary
->first
)
1338 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1339 if (cur_info
.is_incomplete() ||
1340 cur_info
.last_update
< MIN(
1341 primary
->second
.log_tail
,
1342 auth_log_shard
->second
.log_tail
)) {
1343 /* We include auth_log_shard->second.log_tail because in GetLog,
1344 * we will request logs back to the min last_update over our
1345 * acting_backfill set, which will result in our log being extended
1346 * as far backwards as necessary to pick up any peers which can
1347 * be log recovered by auth_log_shard's log */
1348 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1349 backfill
->insert(up_cand
);
1350 acting_backfill
->insert(up_cand
);
1352 want
->push_back(*i
);
1353 acting_backfill
->insert(up_cand
);
1355 ss
<< " osd." << *i
<< " (up) accepted " << cur_info
<< std::endl
;
1357 if (want
->size() >= size
) {
1362 // This no longer has backfill OSDs, but they are covered above.
1363 for (vector
<int>::const_iterator i
= acting
.begin();
1366 pg_shard_t
acting_cand(*i
, shard_id_t::NO_SHARD
);
1370 // skip up osds we already considered above
1371 if (acting_cand
== primary
->first
)
1373 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), acting_cand
.osd
);
1374 if (up_it
!= up
.end())
1377 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1378 if (cur_info
.is_incomplete() ||
1379 cur_info
.last_update
< primary
->second
.log_tail
) {
1380 ss
<< " shard " << acting_cand
<< " (stray) REJECTED "
1381 << cur_info
<< std::endl
;
1383 want
->push_back(*i
);
1384 acting_backfill
->insert(acting_cand
);
1385 ss
<< " shard " << acting_cand
<< " (stray) accepted "
1386 << cur_info
<< std::endl
;
1391 if (restrict_to_up_acting
) {
1394 for (map
<pg_shard_t
,pg_info_t
>::const_iterator i
= all_info
.begin();
1395 i
!= all_info
.end();
1400 // skip up osds we already considered above
1401 if (i
->first
== primary
->first
)
1403 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
->first
.osd
);
1404 if (up_it
!= up
.end())
1406 vector
<int>::const_iterator acting_it
= find(
1407 acting
.begin(), acting
.end(), i
->first
.osd
);
1408 if (acting_it
!= acting
.end())
1411 if (i
->second
.is_incomplete() ||
1412 i
->second
.last_update
< primary
->second
.log_tail
) {
1413 ss
<< " shard " << i
->first
<< " (stray) REJECTED "
1414 << i
->second
<< std::endl
;
1416 want
->push_back(i
->first
.osd
);
1417 acting_backfill
->insert(i
->first
);
1418 ss
<< " shard " << i
->first
<< " (stray) accepted "
1419 << i
->second
<< std::endl
;
1428 * calculate the desired acting, and request a change with the monitor
1429 * if it differs from the current acting.
1431 * if restrict_to_up_acting=true, we filter out anything that's not in
1432 * up/acting. in order to lift this restriction, we need to
1433 * 1) check whether it's worth switching the acting set any time we get
1434 * a new pg info (not just here, when recovery finishes)
1435 * 2) check whether anything in want_acting went down on each new map
1436 * (and, if so, calculate a new want_acting)
1437 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1440 bool PG::choose_acting(pg_shard_t
&auth_log_shard_id
,
1441 bool restrict_to_up_acting
,
1442 bool *history_les_bound
)
1444 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
1445 all_info
[pg_whoami
] = info
;
1447 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
1448 p
!= all_info
.end();
1450 dout(10) << __func__
<< " all_info osd." << p
->first
<< " " << p
->second
<< dendl
;
1453 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
1454 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
1456 if (auth_log_shard
== all_info
.end()) {
1458 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1459 << " reverting to up" << dendl
;
1462 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1464 dout(10) << "choose_acting failed" << dendl
;
1465 assert(want_acting
.empty());
1470 assert(!auth_log_shard
->second
.is_incomplete());
1471 auth_log_shard_id
= auth_log_shard
->first
;
1473 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
1475 pg_shard_t want_primary
;
1477 if (!pool
.info
.ec_pool())
1478 calc_replicated_acting(
1480 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1486 restrict_to_up_acting
,
1489 &want_acting_backfill
,
1495 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1501 restrict_to_up_acting
,
1504 &want_acting_backfill
,
1507 dout(10) << ss
.str() << dendl
;
1509 unsigned num_want_acting
= 0;
1510 set
<pg_shard_t
> have
;
1511 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1512 if (want
[i
] != CRUSH_ITEM_NONE
) {
1517 pool
.info
.ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1521 // We go incomplete if below min_size for ec_pools since backfill
1522 // does not currently maintain rollbackability
1523 // Otherwise, we will go "peered", but not "active"
1524 if (num_want_acting
< pool
.info
.min_size
&&
1525 (pool
.info
.ec_pool() ||
1526 !cct
->_conf
->osd_allow_recovery_below_min_size
)) {
1527 want_acting
.clear();
1528 dout(10) << "choose_acting failed, below min size" << dendl
;
1532 /* Check whether we have enough acting shards to later perform recovery */
1533 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable_predicate(
1534 get_pgbackend()->get_is_recoverable_predicate());
1535 if (!(*recoverable_predicate
)(have
)) {
1536 want_acting
.clear();
1537 dout(10) << "choose_acting failed, not recoverable" << dendl
;
1541 if (want
!= acting
) {
1542 dout(10) << "choose_acting want " << want
<< " != acting " << acting
1543 << ", requesting pg_temp change" << dendl
;
1546 if (want_acting
== up
) {
1547 // There can't be any pending backfill if
1548 // want is the same as crush map up OSDs.
1549 assert(want_backfill
.empty());
1551 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1553 osd
->queue_want_pg_temp(info
.pgid
.pgid
, want
);
1556 want_acting
.clear();
1557 actingbackfill
= want_acting_backfill
;
1558 dout(10) << "actingbackfill is " << actingbackfill
<< dendl
;
1559 assert(backfill_targets
.empty() || backfill_targets
== want_backfill
);
1560 if (backfill_targets
.empty()) {
1561 // Caller is GetInfo
1562 backfill_targets
= want_backfill
;
1564 // Will not change if already set because up would have had to change
1565 // Verify that nothing in backfill is in stray_set
1566 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
1567 i
!= want_backfill
.end();
1569 assert(stray_set
.find(*i
) == stray_set
.end());
1571 dout(10) << "choose_acting want " << want
<< " (== acting) backfill_targets "
1572 << want_backfill
<< dendl
;
1576 /* Build the might_have_unfound set.
1578 * This is used by the primary OSD during recovery.
1580 * This set tracks the OSDs which might have unfound objects that the primary
1581 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1582 * will remove the OSD from the set.
1584 void PG::build_might_have_unfound()
1586 assert(might_have_unfound
.empty());
1587 assert(is_primary());
1589 dout(10) << __func__
<< dendl
;
1591 check_past_interval_bounds();
1593 might_have_unfound
= past_intervals
.get_might_have_unfound(
1595 pool
.info
.ec_pool());
1597 // include any (stray) peers
1598 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
1599 p
!= peer_info
.end();
1601 might_have_unfound
.insert(p
->first
);
1603 dout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
1606 struct C_PG_ActivateCommitted
: public Context
{
1609 epoch_t activation_epoch
;
1610 C_PG_ActivateCommitted(PG
*p
, epoch_t e
, epoch_t ae
)
1611 : pg(p
), epoch(e
), activation_epoch(ae
) {}
1612 void finish(int r
) override
{
1613 pg
->_activate_committed(epoch
, activation_epoch
);
1617 void PG::activate(ObjectStore::Transaction
& t
,
1618 epoch_t activation_epoch
,
1619 list
<Context
*>& tfin
,
1620 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1624 PastIntervals
> > > *activator_map
,
1627 assert(!is_peered());
1628 assert(scrubber
.callbacks
.empty());
1629 assert(callbacks_for_degraded_object
.empty());
1632 state_clear(PG_STATE_DOWN
);
1634 send_notify
= false;
1637 // only update primary last_epoch_started if we will go active
1638 if (acting
.size() >= pool
.info
.min_size
) {
1639 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
1640 info
.last_epoch_started
<= activation_epoch
);
1641 info
.last_epoch_started
= activation_epoch
;
1642 info
.last_interval_started
= info
.history
.same_interval_since
;
1644 } else if (is_acting(pg_whoami
)) {
1645 /* update last_epoch_started on acting replica to whatever the primary sent
1646 * unless it's smaller (could happen if we are going peered rather than
1647 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1648 if (info
.last_epoch_started
< activation_epoch
) {
1649 info
.last_epoch_started
= activation_epoch
;
1650 info
.last_interval_started
= info
.history
.same_interval_since
;
1654 auto &missing
= pg_log
.get_missing();
1657 last_update_ondisk
= info
.last_update
;
1658 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
1660 last_update_applied
= info
.last_update
;
1661 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
1663 need_up_thru
= false;
1665 // write pg info, log
1667 dirty_big_info
= true; // maybe
1669 // find out when we commit
1670 t
.register_on_complete(
1671 new C_PG_ActivateCommitted(
1673 get_osdmap()->get_epoch(),
1676 // initialize snap_trimq
1678 dout(20) << "activate - purged_snaps " << info
.purged_snaps
1679 << " cached_removed_snaps " << pool
.cached_removed_snaps
<< dendl
;
1680 snap_trimq
= pool
.cached_removed_snaps
;
1681 interval_set
<snapid_t
> intersection
;
1682 intersection
.intersection_of(snap_trimq
, info
.purged_snaps
);
1683 if (intersection
== info
.purged_snaps
) {
1684 snap_trimq
.subtract(info
.purged_snaps
);
1686 dout(0) << "warning: info.purged_snaps (" << info
.purged_snaps
1687 << ") is not a subset of pool.cached_removed_snaps ("
1688 << pool
.cached_removed_snaps
<< ")" << dendl
;
1689 snap_trimq
.subtract(intersection
);
1693 // init complete pointer
1694 if (missing
.num_missing() == 0) {
1695 dout(10) << "activate - no missing, moving last_complete " << info
.last_complete
1696 << " -> " << info
.last_update
<< dendl
;
1697 info
.last_complete
= info
.last_update
;
1698 pg_log
.reset_recovery_pointers();
1700 dout(10) << "activate - not complete, " << missing
<< dendl
;
1701 pg_log
.activate_not_complete(info
);
1709 // start up replicas
1711 assert(!actingbackfill
.empty());
1712 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1713 i
!= actingbackfill
.end();
1715 if (*i
== pg_whoami
) continue;
1716 pg_shard_t peer
= *i
;
1717 assert(peer_info
.count(peer
));
1718 pg_info_t
& pi
= peer_info
[peer
];
1720 dout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
1723 assert(peer_missing
.count(peer
));
1724 pg_missing_t
& pm
= peer_missing
[peer
];
1726 bool needs_past_intervals
= pi
.dne();
1729 * cover case where peer sort order was different and
1730 * last_backfill cannot be interpreted
1732 bool force_restart_backfill
=
1733 !pi
.last_backfill
.is_max() &&
1734 !pi
.last_backfill_bitwise
;
1736 if (pi
.last_update
== info
.last_update
&& !force_restart_backfill
) {
1738 if (!pi
.last_backfill
.is_max())
1739 osd
->clog
->info() << info
.pgid
<< " continuing backfill to osd."
1741 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1742 << "] " << pi
.last_backfill
1743 << " to " << info
.last_update
;
1744 if (!pi
.is_empty() && activator_map
) {
1745 dout(10) << "activate peer osd." << peer
<< " is up to date, queueing in pending_activators" << dendl
;
1746 (*activator_map
)[peer
.osd
].push_back(
1749 peer
.shard
, pg_whoami
.shard
,
1750 get_osdmap()->get_epoch(),
1751 get_osdmap()->get_epoch(),
1755 dout(10) << "activate peer osd." << peer
<< " is up to date, but sending pg_log anyway" << dendl
;
1757 i
->shard
, pg_whoami
.shard
,
1758 get_osdmap()->get_epoch(), info
);
1761 pg_log
.get_tail() > pi
.last_update
||
1762 pi
.last_backfill
== hobject_t() ||
1763 force_restart_backfill
||
1764 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
1765 /* ^ This last case covers a situation where a replica is not contiguous
1766 * with the auth_log, but is contiguous with this replica. Reshuffling
1767 * the active set to handle this would be tricky, so instead we just go
1768 * ahead and backfill it anyway. This is probably preferrable in any
1769 * case since the replica in question would have to be significantly
1773 osd
->clog
->debug() << info
.pgid
<< " starting backfill to osd." << peer
1774 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1775 << "] " << pi
.last_backfill
1776 << " to " << info
.last_update
;
1778 pi
.last_update
= info
.last_update
;
1779 pi
.last_complete
= info
.last_update
;
1780 pi
.set_last_backfill(hobject_t());
1781 pi
.last_epoch_started
= info
.last_epoch_started
;
1782 pi
.last_interval_started
= info
.last_interval_started
;
1783 pi
.history
= info
.history
;
1784 pi
.hit_set
= info
.hit_set
;
1785 pi
.stats
.stats
.clear();
1787 // initialize peer with our purged_snaps.
1788 pi
.purged_snaps
= info
.purged_snaps
;
1791 i
->shard
, pg_whoami
.shard
,
1792 get_osdmap()->get_epoch(), pi
);
1794 // send some recent log, so that op dup detection works well.
1795 m
->log
.copy_up_to(pg_log
.get_log(), cct
->_conf
->osd_min_pg_log_entries
);
1796 m
->info
.log_tail
= m
->log
.tail
;
1797 pi
.log_tail
= m
->log
.tail
; // sigh...
1802 assert(pg_log
.get_tail() <= pi
.last_update
);
1804 i
->shard
, pg_whoami
.shard
,
1805 get_osdmap()->get_epoch(), info
);
1806 // send new stuff to append to replicas log
1807 m
->log
.copy_after(pg_log
.get_log(), pi
.last_update
);
1810 // share past_intervals if we are creating the pg on the replica
1811 // based on whether our info for that peer was dne() *before*
1812 // updating pi.history in the backfill block above.
1813 if (m
&& needs_past_intervals
)
1814 m
->past_intervals
= past_intervals
;
1816 // update local version of peer's missing list!
1817 if (m
&& pi
.last_backfill
!= hobject_t()) {
1818 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
1819 p
!= m
->log
.log
.end();
1821 if (p
->soid
<= pi
.last_backfill
&&
1823 if (perform_deletes_during_peering() && p
->is_delete()) {
1824 pm
.rm(p
->soid
, p
->version
);
1826 pm
.add_next_event(*p
);
1833 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
<< dendl
;
1834 //m->log.print(cout);
1835 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
1839 pi
.last_update
= info
.last_update
;
1841 // update our missing
1842 if (pm
.num_missing() == 0) {
1843 pi
.last_complete
= pi
.last_update
;
1844 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " uptodate" << dendl
;
1846 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " missing " << pm
<< dendl
;
1850 // Set up missing_loc
1851 set
<pg_shard_t
> complete_shards
;
1852 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1853 i
!= actingbackfill
.end();
1855 dout(20) << __func__
<< " setting up missing_loc from shard " << *i
<< " " << dendl
;
1856 if (*i
== get_primary()) {
1857 missing_loc
.add_active_missing(missing
);
1858 if (!missing
.have_missing())
1859 complete_shards
.insert(*i
);
1861 auto peer_missing_entry
= peer_missing
.find(*i
);
1862 assert(peer_missing_entry
!= peer_missing
.end());
1863 missing_loc
.add_active_missing(peer_missing_entry
->second
);
1864 if (!peer_missing_entry
->second
.have_missing() &&
1865 peer_info
[*i
].last_backfill
.is_max())
1866 complete_shards
.insert(*i
);
1870 // If necessary, create might_have_unfound to help us find our unfound objects.
1871 // NOTE: It's important that we build might_have_unfound before trimming the
1873 might_have_unfound
.clear();
1874 if (needs_recovery()) {
1875 // If only one shard has missing, we do a trick to add all others as recovery
1876 // source, this is considered safe since the PGLogs have been merged locally,
1877 // and covers vast majority of the use cases, like one OSD/host is down for
1878 // a while for hardware repairing
1879 if (complete_shards
.size() + 1 == actingbackfill
.size()) {
1880 missing_loc
.add_batch_sources_info(complete_shards
, ctx
->handle
);
1882 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
1884 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1885 i
!= actingbackfill
.end();
1887 if (*i
== pg_whoami
) continue;
1888 dout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
1889 assert(peer_missing
.count(*i
));
1890 assert(peer_info
.count(*i
));
1891 missing_loc
.add_source_info(
1898 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
1899 i
!= peer_missing
.end();
1901 if (is_actingbackfill(i
->first
))
1903 assert(peer_info
.count(i
->first
));
1905 peer_info
[i
->first
],
1911 build_might_have_unfound();
1913 // Always call now so _update_calc_stats() will be accurate
1914 discover_all_missing(query_map
);
1917 // num_objects_degraded if calculated should reflect this too, unless no
1918 // missing and we are about to go clean.
1919 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
1920 state_set(PG_STATE_UNDERSIZED
);
1923 state_set(PG_STATE_ACTIVATING
);
1924 release_pg_backoffs();
1925 projected_last_update
= info
.last_update
;
1927 if (acting
.size() >= pool
.info
.min_size
) {
1928 PGLogEntryHandler handler
{this, &t
};
1929 pg_log
.roll_forward(&handler
);
1933 bool PG::op_has_sufficient_caps(OpRequestRef
& op
)
1935 // only check MOSDOp
1936 if (op
->get_req()->get_type() != CEPH_MSG_OSD_OP
)
1939 const MOSDOp
*req
= static_cast<const MOSDOp
*>(op
->get_req());
1941 Session
*session
= static_cast<Session
*>(req
->get_connection()->get_priv());
1943 dout(0) << "op_has_sufficient_caps: no session for op " << *req
<< dendl
;
1946 OSDCap
& caps
= session
->caps
;
1949 const string
&key
= req
->get_hobj().get_key().empty() ?
1950 req
->get_oid().name
:
1951 req
->get_hobj().get_key();
1953 bool cap
= caps
.is_capable(pool
.name
, req
->get_hobj().nspace
,
1955 op
->need_read_cap(),
1956 op
->need_write_cap(),
1959 dout(20) << "op_has_sufficient_caps "
1960 << "session=" << session
1961 << " pool=" << pool
.id
<< " (" << pool
.name
1962 << " " << req
->get_hobj().nspace
1963 << ") owner=" << pool
.auid
1964 << " need_read_cap=" << op
->need_read_cap()
1965 << " need_write_cap=" << op
->need_write_cap()
1966 << " classes=" << op
->classes()
1967 << " -> " << (cap
? "yes" : "NO")
1972 void PG::_activate_committed(epoch_t epoch
, epoch_t activation_epoch
)
1975 if (pg_has_reset_since(epoch
)) {
1976 dout(10) << "_activate_committed " << epoch
1977 << ", that was an old interval" << dendl
;
1978 } else if (is_primary()) {
1979 peer_activated
.insert(pg_whoami
);
1980 dout(10) << "_activate_committed " << epoch
1981 << " peer_activated now " << peer_activated
1982 << " last_interval_started " << info
.history
.last_interval_started
1983 << " last_epoch_started " << info
.history
.last_epoch_started
1984 << " same_interval_since " << info
.history
.same_interval_since
<< dendl
;
1985 assert(!actingbackfill
.empty());
1986 if (peer_activated
.size() == actingbackfill
.size())
1987 all_activated_and_committed();
1989 dout(10) << "_activate_committed " << epoch
<< " telling primary" << dendl
;
1990 MOSDPGInfo
*m
= new MOSDPGInfo(epoch
);
1991 pg_notify_t i
= pg_notify_t(
1992 get_primary().shard
, pg_whoami
.shard
,
1993 get_osdmap()->get_epoch(),
1994 get_osdmap()->get_epoch(),
1997 i
.info
.history
.last_epoch_started
= activation_epoch
;
1998 i
.info
.history
.last_interval_started
= i
.info
.history
.same_interval_since
;
1999 if (acting
.size() >= pool
.info
.min_size
) {
2000 state_set(PG_STATE_ACTIVE
);
2002 state_set(PG_STATE_PEERED
);
2005 m
->pg_list
.push_back(make_pair(i
, PastIntervals()));
2006 osd
->send_message_osd_cluster(get_primary().osd
, m
, get_osdmap()->get_epoch());
2009 if (flushes_in_progress
== 0) {
2010 requeue_ops(waiting_for_peered
);
2011 } else if (!waiting_for_peered
.empty()) {
2012 dout(10) << __func__
<< " flushes in progress, moving "
2013 << waiting_for_peered
.size() << " items to waiting_for_flush"
2015 assert(waiting_for_flush
.empty());
2016 waiting_for_flush
.swap(waiting_for_peered
);
2020 assert(!dirty_info
);
2026 * update info.history.last_epoch_started ONLY after we and all
2027 * replicas have activated AND committed the activate transaction
2028 * (i.e. the peering results are stable on disk).
2030 void PG::all_activated_and_committed()
2032 dout(10) << "all_activated_and_committed" << dendl
;
2033 assert(is_primary());
2034 assert(peer_activated
.size() == actingbackfill
.size());
2035 assert(!actingbackfill
.empty());
2036 assert(blocked_by
.empty());
2039 _update_calc_stats();
2040 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
2041 state_set(PG_STATE_DEGRADED
);
2043 state_clear(PG_STATE_DEGRADED
);
2046 queue_peering_event(
2048 std::make_shared
<CephPeeringEvt
>(
2049 get_osdmap()->get_epoch(),
2050 get_osdmap()->get_epoch(),
2051 AllReplicasActivated())));
2054 bool PG::requeue_scrub(bool high_priority
)
2056 assert(is_locked());
2058 dout(10) << __func__
<< ": already queued" << dendl
;
2061 dout(10) << __func__
<< ": queueing" << dendl
;
2062 scrub_queued
= true;
2063 osd
->queue_for_scrub(this, high_priority
);
2068 void PG::queue_recovery()
2070 if (!is_primary() || !is_peered()) {
2071 dout(10) << "queue_recovery -- not primary or not peered " << dendl
;
2072 assert(!recovery_queued
);
2073 } else if (recovery_queued
) {
2074 dout(10) << "queue_recovery -- already queued" << dendl
;
2076 dout(10) << "queue_recovery -- queuing" << dendl
;
2077 recovery_queued
= true;
2078 osd
->queue_for_recovery(this);
2082 bool PG::queue_scrub()
2084 assert(is_locked());
2085 if (is_scrubbing()) {
2088 scrubber
.priority
= scrubber
.must_scrub
?
2089 cct
->_conf
->osd_requested_scrub_priority
: get_scrub_priority();
2090 scrubber
.must_scrub
= false;
2091 state_set(PG_STATE_SCRUBBING
);
2092 if (scrubber
.must_deep_scrub
) {
2093 state_set(PG_STATE_DEEP_SCRUB
);
2094 scrubber
.must_deep_scrub
= false;
2096 if (scrubber
.must_repair
|| scrubber
.auto_repair
) {
2097 state_set(PG_STATE_REPAIR
);
2098 scrubber
.must_repair
= false;
2104 unsigned PG::get_scrub_priority()
2106 // a higher value -> a higher priority
2107 int pool_scrub_priority
= 0;
2108 pool
.info
.opts
.get(pool_opts_t::SCRUB_PRIORITY
, &pool_scrub_priority
);
2109 return pool_scrub_priority
> 0 ? pool_scrub_priority
: cct
->_conf
->osd_scrub_priority
;
2112 struct C_PG_FinishRecovery
: public Context
{
2114 explicit C_PG_FinishRecovery(PG
*p
) : pg(p
) {}
2115 void finish(int r
) override
{
2116 pg
->_finish_recovery(this);
2120 void PG::mark_clean()
2122 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2123 state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
2124 state_set(PG_STATE_CLEAN
);
2125 info
.history
.last_epoch_clean
= get_osdmap()->get_epoch();
2126 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2127 past_intervals
.clear();
2128 dirty_big_info
= true;
2135 bool PG::set_force_recovery(bool b
)
2141 if (!(state
& PG_STATE_FORCED_RECOVERY
) &&
2142 (state
& (PG_STATE_DEGRADED
|
2143 PG_STATE_RECOVERY_WAIT
|
2144 PG_STATE_RECOVERING
))) {
2145 dout(20) << __func__
<< " set" << dendl
;
2146 state_set(PG_STATE_FORCED_RECOVERY
);
2147 publish_stats_to_osd();
2150 } else if (state
& PG_STATE_FORCED_RECOVERY
) {
2151 dout(20) << __func__
<< " clear" << dendl
;
2152 state_clear(PG_STATE_FORCED_RECOVERY
);
2153 publish_stats_to_osd();
2159 dout(20) << __func__
<< " state " << pgstate_history
.get_current_state() << dendl
;
2160 osd
->local_reserver
.update_priority(info
.pgid
, get_recovery_priority());
2165 bool PG::set_force_backfill(bool b
)
2171 if (!(state
& PG_STATE_FORCED_BACKFILL
) &&
2172 (state
& (PG_STATE_DEGRADED
|
2173 PG_STATE_BACKFILL_WAIT
|
2174 PG_STATE_BACKFILLING
))) {
2175 dout(10) << __func__
<< " set" << dendl
;
2176 state_set(PG_STATE_FORCED_BACKFILL
);
2177 publish_stats_to_osd();
2180 } else if (state
& PG_STATE_FORCED_BACKFILL
) {
2181 dout(10) << __func__
<< " clear" << dendl
;
2182 state_clear(PG_STATE_FORCED_BACKFILL
);
2183 publish_stats_to_osd();
2189 dout(20) << __func__
<< " state " << pgstate_history
.get_current_state() << dendl
;
2190 osd
->local_reserver
.update_priority(info
.pgid
, get_backfill_priority());
2195 inline int PG::clamp_recovery_priority(int priority
)
2197 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
2198 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
2200 // Clamp to valid range
2201 if (priority
> OSD_RECOVERY_PRIORITY_MAX
) {
2202 return OSD_RECOVERY_PRIORITY_MAX
;
2203 } else if (priority
< OSD_RECOVERY_PRIORITY_MIN
) {
2204 return OSD_RECOVERY_PRIORITY_MIN
;
2210 unsigned PG::get_recovery_priority()
2212 // a higher value -> a higher priority
2215 if (state
& PG_STATE_FORCED_RECOVERY
) {
2216 ret
= OSD_RECOVERY_PRIORITY_FORCED
;
2218 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &ret
);
2219 ret
= clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE
+ ret
);
2221 dout(20) << __func__
<< " recovery priority for " << *this << " is " << ret
<< ", state is " << state
<< dendl
;
2222 return static_cast<unsigned>(ret
);
2225 unsigned PG::get_backfill_priority()
2227 // a higher value -> a higher priority
2228 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
2229 if (state
& PG_STATE_FORCED_BACKFILL
) {
2230 ret
= OSD_BACKFILL_PRIORITY_FORCED
;
2232 if (acting
.size() < pool
.info
.min_size
) {
2233 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2234 ret
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
+ (pool
.info
.min_size
- acting
.size());
2236 } else if (is_undersized()) {
2237 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2238 assert(pool
.info
.size
> actingset
.size());
2239 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
+ (pool
.info
.size
- actingset
.size());
2241 } else if (is_degraded()) {
2242 // degraded: baseline degraded
2243 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
2246 // Adjust with pool's recovery priority
2247 int pool_recovery_priority
= 0;
2248 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2250 ret
= clamp_recovery_priority(pool_recovery_priority
+ ret
);
2253 return static_cast<unsigned>(ret
);
2256 void PG::finish_recovery(list
<Context
*>& tfin
)
2258 dout(10) << "finish_recovery" << dendl
;
2259 assert(info
.last_complete
== info
.last_update
);
2261 clear_recovery_state();
2264 * sync all this before purging strays. but don't block!
2266 finish_sync_event
= new C_PG_FinishRecovery(this);
2267 tfin
.push_back(finish_sync_event
);
2270 void PG::_finish_recovery(Context
*c
)
2277 if (c
== finish_sync_event
) {
2278 dout(10) << "_finish_recovery" << dendl
;
2279 finish_sync_event
= 0;
2282 publish_stats_to_osd();
2284 if (scrub_after_recovery
) {
2285 dout(10) << "_finish_recovery requeueing for scrub" << dendl
;
2286 scrub_after_recovery
= false;
2287 scrubber
.must_deep_scrub
= true;
2291 dout(10) << "_finish_recovery -- stale" << dendl
;
2296 void PG::start_recovery_op(const hobject_t
& soid
)
2298 dout(10) << "start_recovery_op " << soid
2299 #ifdef DEBUG_RECOVERY_OIDS
2300 << " (" << recovering_oids
<< ")"
2303 assert(recovery_ops_active
>= 0);
2304 recovery_ops_active
++;
2305 #ifdef DEBUG_RECOVERY_OIDS
2306 assert(recovering_oids
.count(soid
) == 0);
2307 recovering_oids
.insert(soid
);
2309 osd
->start_recovery_op(this, soid
);
2312 void PG::finish_recovery_op(const hobject_t
& soid
, bool dequeue
)
2314 dout(10) << "finish_recovery_op " << soid
2315 #ifdef DEBUG_RECOVERY_OIDS
2316 << " (" << recovering_oids
<< ")"
2319 assert(recovery_ops_active
> 0);
2320 recovery_ops_active
--;
2321 #ifdef DEBUG_RECOVERY_OIDS
2322 assert(recovering_oids
.count(soid
));
2323 recovering_oids
.erase(soid
);
2325 osd
->finish_recovery_op(this, soid
, dequeue
);
2332 void PG::split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
2334 child
->update_snap_mapper_bits(split_bits
);
2335 child
->update_osdmap_ref(get_osdmap());
2340 pg_log
.split_into(child_pgid
, split_bits
, &(child
->pg_log
));
2341 child
->info
.last_complete
= info
.last_complete
;
2343 info
.last_update
= pg_log
.get_head();
2344 child
->info
.last_update
= child
->pg_log
.get_head();
2346 child
->info
.last_user_version
= info
.last_user_version
;
2348 info
.log_tail
= pg_log
.get_tail();
2349 child
->info
.log_tail
= child
->pg_log
.get_tail();
2351 if (info
.last_complete
< pg_log
.get_tail())
2352 info
.last_complete
= pg_log
.get_tail();
2353 if (child
->info
.last_complete
< child
->pg_log
.get_tail())
2354 child
->info
.last_complete
= child
->pg_log
.get_tail();
2357 child
->info
.history
= info
.history
;
2358 child
->info
.history
.epoch_created
= get_osdmap()->get_epoch();
2359 child
->info
.purged_snaps
= info
.purged_snaps
;
2361 if (info
.last_backfill
.is_max()) {
2362 child
->info
.set_last_backfill(hobject_t::get_max());
2364 // restart backfill on parent and child to be safe. we could
2365 // probably do better in the bitwise sort case, but it's more
2366 // fragile (there may be special work to do on backfill completion
2368 info
.set_last_backfill(hobject_t());
2369 child
->info
.set_last_backfill(hobject_t());
2370 // restarting backfill implies that the missing set is empty,
2371 // since it is only used for objects prior to last_backfill
2372 pg_log
.reset_backfill();
2373 child
->pg_log
.reset_backfill();
2376 child
->info
.stats
= info
.stats
;
2377 child
->info
.stats
.parent_split_bits
= split_bits
;
2378 info
.stats
.stats_invalid
= true;
2379 child
->info
.stats
.stats_invalid
= true;
2380 child
->info
.last_epoch_started
= info
.last_epoch_started
;
2381 child
->info
.last_interval_started
= info
.last_interval_started
;
2383 child
->snap_trimq
= snap_trimq
;
2385 // There can't be recovery/backfill going on now
2386 int primary
, up_primary
;
2387 vector
<int> newup
, newacting
;
2388 get_osdmap()->pg_to_up_acting_osds(
2389 child
->info
.pgid
.pgid
, &newup
, &up_primary
, &newacting
, &primary
);
2390 child
->init_primary_up_acting(
2395 child
->role
= OSDMap::calc_pg_role(osd
->whoami
, child
->acting
);
2397 // this comparison includes primary rank via pg_shard_t
2398 if (get_primary() != child
->get_primary())
2399 child
->info
.history
.same_primary_since
= get_osdmap()->get_epoch();
2401 child
->info
.stats
.up
= up
;
2402 child
->info
.stats
.up_primary
= up_primary
;
2403 child
->info
.stats
.acting
= acting
;
2404 child
->info
.stats
.acting_primary
= primary
;
2405 child
->info
.stats
.mapping_epoch
= get_osdmap()->get_epoch();
2408 child
->past_intervals
= past_intervals
;
2410 _split_into(child_pgid
, child
, split_bits
);
2412 // release all backoffs for simplicity
2413 release_backoffs(hobject_t(), hobject_t::get_max());
2415 child
->on_new_interval();
2417 child
->dirty_info
= true;
2418 child
->dirty_big_info
= true;
2420 dirty_big_info
= true;
2423 void PG::add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
)
2425 ConnectionRef con
= s
->con
;
2426 if (!con
) // OSD::ms_handle_reset clears s->con without a lock
2428 BackoffRef
b(s
->have_backoff(info
.pgid
, begin
));
2430 derr
<< __func__
<< " already have backoff for " << s
<< " begin " << begin
2431 << " " << *b
<< dendl
;
2434 Mutex::Locker
l(backoff_lock
);
2436 b
= new Backoff(info
.pgid
, this, s
, ++s
->backoff_seq
, begin
, end
);
2437 backoffs
[begin
].insert(b
);
2439 dout(10) << __func__
<< " session " << s
<< " added " << *b
<< dendl
;
2444 get_osdmap()->get_epoch(),
2445 CEPH_OSD_BACKOFF_OP_BLOCK
,
2451 void PG::release_backoffs(const hobject_t
& begin
, const hobject_t
& end
)
2453 dout(10) << __func__
<< " [" << begin
<< "," << end
<< ")" << dendl
;
2454 vector
<BackoffRef
> bv
;
2456 Mutex::Locker
l(backoff_lock
);
2457 auto p
= backoffs
.lower_bound(begin
);
2458 while (p
!= backoffs
.end()) {
2459 int r
= cmp(p
->first
, end
);
2460 dout(20) << __func__
<< " ? " << r
<< " " << p
->first
2461 << " " << p
->second
<< dendl
;
2462 // note: must still examine begin=end=p->first case
2463 if (r
> 0 || (r
== 0 && begin
< end
)) {
2466 dout(20) << __func__
<< " checking " << p
->first
2467 << " " << p
->second
<< dendl
;
2468 auto q
= p
->second
.begin();
2469 while (q
!= p
->second
.end()) {
2470 dout(20) << __func__
<< " checking " << *q
<< dendl
;
2471 int r
= cmp((*q
)->begin
, begin
);
2472 if (r
== 0 || (r
> 0 && (*q
)->end
< end
)) {
2474 q
= p
->second
.erase(q
);
2479 if (p
->second
.empty()) {
2480 p
= backoffs
.erase(p
);
2487 Mutex::Locker
l(b
->lock
);
2488 dout(10) << __func__
<< " " << *b
<< dendl
;
2490 assert(b
->pg
== this);
2491 ConnectionRef con
= b
->session
->con
;
2492 if (con
) { // OSD::ms_handle_reset clears s->con without a lock
2496 get_osdmap()->get_epoch(),
2497 CEPH_OSD_BACKOFF_OP_UNBLOCK
,
2503 b
->state
= Backoff::STATE_DELETING
;
2505 b
->session
->rm_backoff(b
);
2513 void PG::clear_backoffs()
2515 dout(10) << __func__
<< " " << dendl
;
2516 map
<hobject_t
,set
<BackoffRef
>> ls
;
2518 Mutex::Locker
l(backoff_lock
);
2521 for (auto& p
: ls
) {
2522 for (auto& b
: p
.second
) {
2523 Mutex::Locker
l(b
->lock
);
2524 dout(10) << __func__
<< " " << *b
<< dendl
;
2526 assert(b
->pg
== this);
2528 b
->state
= Backoff::STATE_DELETING
;
2530 b
->session
->rm_backoff(b
);
2539 // called by Session::clear_backoffs()
2540 void PG::rm_backoff(BackoffRef b
)
2542 dout(10) << __func__
<< " " << *b
<< dendl
;
2543 Mutex::Locker
l(backoff_lock
);
2544 assert(b
->lock
.is_locked_by_me());
2545 assert(b
->pg
== this);
2546 auto p
= backoffs
.find(b
->begin
);
2547 // may race with release_backoffs()
2548 if (p
!= backoffs
.end()) {
2549 auto q
= p
->second
.find(b
);
2550 if (q
!= p
->second
.end()) {
2552 if (p
->second
.empty()) {
2559 void PG::clear_recovery_state()
2561 dout(10) << "clear_recovery_state" << dendl
;
2563 pg_log
.reset_recovery_pointers();
2564 finish_sync_event
= 0;
2567 while (recovery_ops_active
> 0) {
2568 #ifdef DEBUG_RECOVERY_OIDS
2569 soid
= *recovering_oids
.begin();
2571 finish_recovery_op(soid
, true);
2574 backfill_targets
.clear();
2575 backfill_info
.clear();
2576 peer_backfill_info
.clear();
2577 waiting_on_backfill
.clear();
2578 _clear_recovery_state(); // pg impl specific hook
2581 void PG::cancel_recovery()
2583 dout(10) << "cancel_recovery" << dendl
;
2584 clear_recovery_state();
2588 void PG::purge_strays()
2590 dout(10) << "purge_strays " << stray_set
<< dendl
;
2592 bool removed
= false;
2593 for (set
<pg_shard_t
>::iterator p
= stray_set
.begin();
2594 p
!= stray_set
.end();
2596 assert(!is_actingbackfill(*p
));
2597 if (get_osdmap()->is_up(p
->osd
)) {
2598 dout(10) << "sending PGRemove to osd." << *p
<< dendl
;
2599 vector
<spg_t
> to_remove
;
2600 to_remove
.push_back(spg_t(info
.pgid
.pgid
, p
->shard
));
2601 MOSDPGRemove
*m
= new MOSDPGRemove(
2602 get_osdmap()->get_epoch(),
2604 osd
->send_message_osd_cluster(p
->osd
, m
, get_osdmap()->get_epoch());
2606 dout(10) << "not sending PGRemove to down osd." << *p
<< dendl
;
2608 peer_missing
.erase(*p
);
2609 peer_info
.erase(*p
);
2610 peer_purged
.insert(*p
);
2614 // if we removed anyone, update peers (which include peer_info)
2616 update_heartbeat_peers();
2620 // clear _requested maps; we may have to peer() again if we discover
2621 // (more) stray content
2622 peer_log_requested
.clear();
2623 peer_missing_requested
.clear();
2626 void PG::set_probe_targets(const set
<pg_shard_t
> &probe_set
)
2628 Mutex::Locker
l(heartbeat_peer_lock
);
2629 probe_targets
.clear();
2630 for (set
<pg_shard_t
>::iterator i
= probe_set
.begin();
2631 i
!= probe_set
.end();
2633 probe_targets
.insert(i
->osd
);
2637 void PG::clear_probe_targets()
2639 Mutex::Locker
l(heartbeat_peer_lock
);
2640 probe_targets
.clear();
2643 void PG::update_heartbeat_peers()
2645 assert(is_locked());
2651 for (unsigned i
=0; i
<acting
.size(); i
++) {
2652 if (acting
[i
] != CRUSH_ITEM_NONE
)
2653 new_peers
.insert(acting
[i
]);
2655 for (unsigned i
=0; i
<up
.size(); i
++) {
2656 if (up
[i
] != CRUSH_ITEM_NONE
)
2657 new_peers
.insert(up
[i
]);
2659 for (map
<pg_shard_t
,pg_info_t
>::iterator p
= peer_info
.begin();
2660 p
!= peer_info
.end();
2662 new_peers
.insert(p
->first
.osd
);
2664 bool need_update
= false;
2665 heartbeat_peer_lock
.Lock();
2666 if (new_peers
== heartbeat_peers
) {
2667 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " unchanged" << dendl
;
2669 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " -> " << new_peers
<< dendl
;
2670 heartbeat_peers
.swap(new_peers
);
2673 heartbeat_peer_lock
.Unlock();
2676 osd
->need_heartbeat_peer_update();
2680 bool PG::check_in_progress_op(
2681 const osd_reqid_t
&r
,
2682 eversion_t
*version
,
2683 version_t
*user_version
,
2684 int *return_code
) const
2687 projected_log
.get_request(r
, version
, user_version
, return_code
) ||
2688 pg_log
.get_log().get_request(r
, version
, user_version
, return_code
));
2691 static bool find_shard(const set
<pg_shard_t
> & pgs
, shard_id_t shard
)
2694 if (p
.shard
== shard
)
2699 static pg_shard_t
get_another_shard(const set
<pg_shard_t
> & pgs
, pg_shard_t skip
, shard_id_t shard
)
2701 for (auto&p
: pgs
) {
2704 if (p
.shard
== shard
)
2707 return pg_shard_t();
2710 void PG::_update_calc_stats()
2712 info
.stats
.version
= info
.last_update
;
2713 info
.stats
.created
= info
.history
.epoch_created
;
2714 info
.stats
.last_scrub
= info
.history
.last_scrub
;
2715 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
2716 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
2717 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
2718 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
2719 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
2721 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
2722 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
2723 info
.stats
.log_start
= pg_log
.get_tail();
2724 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
2725 info
.stats
.snaptrimq_len
= snap_trimq
.size();
2727 unsigned num_shards
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
2729 // In rare case that upset is too large (usually transient), use as target
2730 // for calculations below.
2731 unsigned target
= std::max(num_shards
, (unsigned)upset
.size());
2732 // For undersized actingset may be larger with OSDs out
2733 unsigned nrep
= std::max(actingset
.size(), upset
.size());
2734 // calc num_object_copies
2735 info
.stats
.stats
.calc_copies(MAX(target
, nrep
));
2736 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
2737 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
2738 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
2740 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2741 dout(20) << __func__
<< " actingset " << actingset
<< " upset "
2742 << upset
<< " actingbackfill " << actingbackfill
<< dendl
;
2743 dout(20) << __func__
<< " acting " << acting
<< " up " << up
<< dendl
;
2745 assert(!actingbackfill
.empty());
2747 bool estimate
= false;
2749 // NOTE: we only generate degraded, misplaced and unfound
2750 // values for the summation, not individual stat categories.
2751 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
2753 // Objects missing from up nodes, sorted by # objects.
2754 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> missing_target_objects
;
2755 // Objects missing from nodes not in up, sort by # objects
2756 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> acting_source_objects
;
2758 // Fill missing_target_objects/acting_source_objects
2764 missing
= pg_log
.get_missing().num_missing();
2765 assert(actingbackfill
.count(pg_whoami
));
2766 if (upset
.count(pg_whoami
)) {
2767 missing_target_objects
.insert(make_pair(missing
, pg_whoami
));
2769 acting_source_objects
.insert(make_pair(missing
, pg_whoami
));
2771 info
.stats
.stats
.sum
.num_objects_missing_on_primary
= missing
;
2772 dout(20) << __func__
<< " shard " << pg_whoami
2773 << " primary objects " << num_objects
2774 << " missing " << missing
2780 for (auto& peer
: peer_info
) {
2781 // Primary should not be in the peer_info, skip if it is.
2782 if (peer
.first
== pg_whoami
) continue;
2783 int64_t missing
= 0;
2784 int64_t peer_num_objects
= peer
.second
.stats
.stats
.sum
.num_objects
;
2785 // Backfill targets always track num_objects accurately
2786 // all other peers track missing accurately.
2787 if (is_backfill_targets(peer
.first
)) {
2788 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
2790 if (peer_missing
.count(peer
.first
)) {
2791 missing
= peer_missing
[peer
.first
].num_missing();
2793 dout(20) << __func__
<< " no peer_missing found for " << peer
.first
<< dendl
;
2794 if (is_recovering()) {
2797 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
2800 if (upset
.count(peer
.first
)) {
2801 missing_target_objects
.insert(make_pair(missing
, peer
.first
));
2802 } else if (actingset
.count(peer
.first
)) {
2803 acting_source_objects
.insert(make_pair(missing
, peer
.first
));
2805 peer
.second
.stats
.stats
.sum
.num_objects_missing
= missing
;
2806 dout(20) << __func__
<< " shard " << peer
.first
2807 << " objects " << peer_num_objects
2808 << " missing " << missing
2812 // A misplaced object is not stored on the correct OSD
2813 int64_t misplaced
= 0;
2814 // a degraded objects has fewer replicas or EC shards than the pool specifies.
2815 int64_t degraded
= 0;
2817 if (is_recovering()) {
2818 for (auto& sml
: missing_loc
.get_missing_by_count()) {
2819 for (auto& ml
: sml
.second
) {
2821 if (sml
.first
== shard_id_t::NO_SHARD
) {
2822 dout(20) << __func__
<< " ml " << ml
.second
<< " upset size " << upset
.size() << " up " << ml
.first
.up
<< dendl
;
2823 missing_shards
= (int)upset
.size() - ml
.first
.up
;
2825 // Handle shards not even in upset below
2826 if (!find_shard(upset
, sml
.first
))
2828 missing_shards
= std::max(0, 1 - ml
.first
.up
);
2829 dout(20) << __func__
<< " shard " << sml
.first
<< " ml " << ml
.second
<< " missing shards " << missing_shards
<< dendl
;
2831 int odegraded
= ml
.second
* missing_shards
;
2832 // Copies on other osds but limited to the possible degraded
2833 int more_osds
= std::min(missing_shards
, ml
.first
.other
);
2834 int omisplaced
= ml
.second
* more_osds
;
2835 assert(omisplaced
<= odegraded
);
2836 odegraded
-= omisplaced
;
2838 misplaced
+= omisplaced
;
2839 degraded
+= odegraded
;
2843 dout(20) << __func__
<< " missing based degraded " << degraded
<< dendl
;
2844 dout(20) << __func__
<< " missing based misplaced " << misplaced
<< dendl
;
2846 // Handle undersized case
2847 if (pool
.info
.is_replicated()) {
2848 // Add degraded for missing targets (num_objects missing)
2849 assert(target
>= upset
.size());
2850 unsigned needed
= target
- upset
.size();
2851 degraded
+= num_objects
* needed
;
2853 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
2854 shard_id_t
shard(i
);
2856 if (!find_shard(upset
, shard
)) {
2857 pg_shard_t pgs
= get_another_shard(actingset
, pg_shard_t(), shard
);
2859 if (pgs
!= pg_shard_t()) {
2862 if (pgs
== pg_whoami
)
2863 missing
= info
.stats
.stats
.sum
.num_objects_missing_on_primary
;
2865 missing
= peer_info
[pgs
].stats
.stats
.sum
.num_objects_missing
;
2867 degraded
+= missing
;
2868 misplaced
+= std::max((int64_t)0, num_objects
- missing
);
2870 // No shard anywhere
2871 degraded
+= num_objects
;
2879 // Handle undersized case
2880 if (pool
.info
.is_replicated()) {
2881 // Add to missing_target_objects
2882 assert(target
>= missing_target_objects
.size());
2883 unsigned needed
= target
- missing_target_objects
.size();
2885 missing_target_objects
.insert(make_pair(num_objects
* needed
, pg_shard_t(pg_shard_t::NO_OSD
)));
2887 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
2888 shard_id_t
shard(i
);
2890 for (const auto& t
: missing_target_objects
) {
2891 if (std::get
<1>(t
).shard
== shard
) {
2897 missing_target_objects
.insert(make_pair(num_objects
, pg_shard_t(pg_shard_t::NO_OSD
,shard
)));
2901 for (const auto& item
: missing_target_objects
)
2902 dout(20) << __func__
<< " missing shard " << std::get
<1>(item
) << " missing= " << std::get
<0>(item
) << dendl
;
2903 for (const auto& item
: acting_source_objects
)
2904 dout(20) << __func__
<< " acting shard " << std::get
<1>(item
) << " missing= " << std::get
<0>(item
) << dendl
;
2906 // Handle all objects not in missing for remapped
2908 for (auto m
= missing_target_objects
.rbegin();
2909 m
!= missing_target_objects
.rend(); ++m
) {
2911 int64_t extra_missing
= -1;
2913 if (pool
.info
.is_replicated()) {
2914 if (!acting_source_objects
.empty()) {
2915 auto extra_copy
= acting_source_objects
.begin();
2916 extra_missing
= std::get
<0>(*extra_copy
);
2917 acting_source_objects
.erase(extra_copy
);
2919 } else { // Erasure coded
2920 // Use corresponding shard
2921 for (const auto& a
: acting_source_objects
) {
2922 if (std::get
<1>(a
).shard
== std::get
<1>(*m
).shard
) {
2923 extra_missing
= std::get
<0>(a
);
2924 acting_source_objects
.erase(a
);
2930 if (extra_missing
>= 0 && std::get
<0>(*m
) >= extra_missing
) {
2931 // We don't know which of the objects on the target
2932 // are part of extra_missing so assume are all degraded.
2933 misplaced
+= std::get
<0>(*m
) - extra_missing
;
2934 degraded
+= extra_missing
;
2936 // 1. extra_missing == -1, more targets than sources so degraded
2937 // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
2938 // previously degraded are now present on the target.
2939 degraded
+= std::get
<0>(*m
);
2942 // If there are still acting that haven't been accounted for
2943 // then they are misplaced
2944 for (const auto& a
: acting_source_objects
) {
2945 int64_t extra_misplaced
= std::max((int64_t)0, num_objects
- std::get
<0>(a
));
2946 dout(20) << __func__
<< " extra acting misplaced " << extra_misplaced
<< dendl
;
2947 misplaced
+= extra_misplaced
;
2950 // NOTE: Tests use these messages to verify this code
2951 dout(20) << __func__
<< " degraded " << degraded
<< (estimate
? " (est)": "") << dendl
;
2952 dout(20) << __func__
<< " misplaced " << misplaced
<< (estimate
? " (est)": "")<< dendl
;
2954 info
.stats
.stats
.sum
.num_objects_degraded
= degraded
;
2955 info
.stats
.stats
.sum
.num_objects_unfound
= get_num_unfound();
2956 info
.stats
.stats
.sum
.num_objects_misplaced
= misplaced
;
2960 void PG::_update_blocked_by()
2962 // set a max on the number of blocking peers we report. if we go
2963 // over, report a random subset. keep the result sorted.
2964 unsigned keep
= MIN(blocked_by
.size(), cct
->_conf
->osd_max_pg_blocked_by
);
2965 unsigned skip
= blocked_by
.size() - keep
;
2966 info
.stats
.blocked_by
.clear();
2967 info
.stats
.blocked_by
.resize(keep
);
2969 for (set
<int>::iterator p
= blocked_by
.begin();
2970 p
!= blocked_by
.end() && keep
> 0;
2972 if (skip
> 0 && (rand() % (skip
+ keep
) < skip
)) {
2975 info
.stats
.blocked_by
[pos
++] = *p
;
2981 void PG::publish_stats_to_osd()
2986 pg_stats_publish_lock
.Lock();
2988 if (info
.stats
.stats
.sum
.num_scrub_errors
)
2989 state_set(PG_STATE_INCONSISTENT
);
2991 state_clear(PG_STATE_INCONSISTENT
);
2993 utime_t now
= ceph_clock_now();
2994 if (info
.stats
.state
!= state
) {
2995 info
.stats
.last_change
= now
;
2996 // Optimistic estimation, if we just find out an inactive PG,
2997 // assumt it is active till now.
2998 if (!(state
& PG_STATE_ACTIVE
) &&
2999 (info
.stats
.state
& PG_STATE_ACTIVE
))
3000 info
.stats
.last_active
= now
;
3002 if ((state
& PG_STATE_ACTIVE
) &&
3003 !(info
.stats
.state
& PG_STATE_ACTIVE
))
3004 info
.stats
.last_became_active
= now
;
3005 if ((state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)) &&
3006 !(info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)))
3007 info
.stats
.last_became_peered
= now
;
3008 if (!(state
& PG_STATE_CREATING
) &&
3009 (info
.stats
.state
& PG_STATE_CREATING
)) {
3010 osd
->send_pg_created(get_pgid().pgid
);
3012 info
.stats
.state
= state
;
3015 _update_calc_stats();
3016 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
3017 state_set(PG_STATE_DEGRADED
);
3019 state_clear(PG_STATE_DEGRADED
);
3021 _update_blocked_by();
3023 bool publish
= false;
3024 pg_stat_t pre_publish
= info
.stats
;
3025 pre_publish
.stats
.add(unstable_stats
);
3026 utime_t cutoff
= now
;
3027 cutoff
-= cct
->_conf
->osd_pg_stat_report_interval_max
;
3028 if (pg_stats_publish_valid
&& pre_publish
== pg_stats_publish
&&
3029 info
.stats
.last_fresh
> cutoff
) {
3030 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3031 << ": no change since " << info
.stats
.last_fresh
<< dendl
;
3033 // update our stat summary and timestamps
3034 info
.stats
.reported_epoch
= get_osdmap()->get_epoch();
3035 ++info
.stats
.reported_seq
;
3037 info
.stats
.last_fresh
= now
;
3039 if (info
.stats
.state
& PG_STATE_CLEAN
)
3040 info
.stats
.last_clean
= now
;
3041 if (info
.stats
.state
& PG_STATE_ACTIVE
)
3042 info
.stats
.last_active
= now
;
3043 if (info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
))
3044 info
.stats
.last_peered
= now
;
3045 info
.stats
.last_unstale
= now
;
3046 if ((info
.stats
.state
& PG_STATE_DEGRADED
) == 0)
3047 info
.stats
.last_undegraded
= now
;
3048 if ((info
.stats
.state
& PG_STATE_UNDERSIZED
) == 0)
3049 info
.stats
.last_fullsized
= now
;
3051 // do not send pgstat to mon anymore once we are luminous, since mgr takes
3052 // care of this by sending MMonMgrReport to mon.
3054 osd
->osd
->get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
3055 pg_stats_publish_valid
= true;
3056 pg_stats_publish
= pre_publish
;
3058 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3059 << ":" << pg_stats_publish
.reported_seq
<< dendl
;
3061 pg_stats_publish_lock
.Unlock();
3064 osd
->pg_stat_queue_enqueue(this);
3067 void PG::clear_publish_stats()
3069 dout(15) << "clear_stats" << dendl
;
3070 pg_stats_publish_lock
.Lock();
3071 pg_stats_publish_valid
= false;
3072 pg_stats_publish_lock
.Unlock();
3074 osd
->pg_stat_queue_dequeue(this);
3078 * initialize a newly instantiated pg
3080 * Initialize PG state, as when a PG is initially created, or when it
3081 * is first instantiated on the current node.
3083 * @param role our role/rank
3084 * @param newup up set
3085 * @param newacting acting set
3086 * @param history pg history
3087 * @param pi past_intervals
3088 * @param backfill true if info should be marked as backfill
3089 * @param t transaction to write out our new state in
3093 const vector
<int>& newup
, int new_up_primary
,
3094 const vector
<int>& newacting
, int new_acting_primary
,
3095 const pg_history_t
& history
,
3096 const PastIntervals
& pi
,
3098 ObjectStore::Transaction
*t
)
3100 dout(10) << "init role " << role
<< " up " << newup
<< " acting " << newacting
3101 << " history " << history
3102 << " past_intervals " << pi
3108 init_primary_up_acting(
3112 new_acting_primary
);
3114 info
.history
= history
;
3115 past_intervals
= pi
;
3118 info
.stats
.up_primary
= new_up_primary
;
3119 info
.stats
.acting
= acting
;
3120 info
.stats
.acting_primary
= new_acting_primary
;
3121 info
.stats
.mapping_epoch
= info
.history
.same_interval_since
;
3124 dout(10) << __func__
<< ": Setting backfill" << dendl
;
3125 info
.set_last_backfill(hobject_t());
3126 info
.last_complete
= info
.last_update
;
3127 pg_log
.mark_log_for_rewrite();
3133 dirty_big_info
= true;
3137 #pragma GCC diagnostic ignored "-Wpragmas"
3138 #pragma GCC diagnostic push
3139 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3141 void PG::upgrade(ObjectStore
*store
)
3143 assert(info_struct_v
<= 10);
3144 ObjectStore::Transaction t
;
3146 assert(info_struct_v
>= 7);
3149 if (info_struct_v
<= 7) {
3150 pg_log
.mark_log_for_rewrite();
3151 ghobject_t
log_oid(OSD::make_pg_log_oid(pg_id
));
3152 ghobject_t
biginfo_oid(OSD::make_pg_biginfo_oid(pg_id
));
3153 t
.remove(coll_t::meta(), log_oid
);
3154 t
.remove(coll_t::meta(), biginfo_oid
);
3155 t
.touch(coll
, pgmeta_oid
);
3159 if (info_struct_v
<= 8) {
3160 // no special action needed.
3164 if (info_struct_v
<= 9) {
3165 // previous versions weren't (as) aggressively clearing past_intervals
3166 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
3167 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
3168 past_intervals
.clear();
3172 // update infover_key
3173 if (info_struct_v
< cur_struct_v
) {
3174 map
<string
,bufferlist
> v
;
3175 __u8 ver
= cur_struct_v
;
3176 ::encode(ver
, v
[infover_key
]);
3177 t
.omap_setkeys(coll
, pgmeta_oid
, v
);
3181 dirty_big_info
= true;
3184 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
3185 ObjectStore::Sequencer
>("upgrade"));
3186 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
3188 derr
<< __func__
<< ": apply_transaction returned "
3189 << cpp_strerror(r
) << dendl
;
3195 if (!osr
->flush_commit(&waiter
)) {
3200 #pragma GCC diagnostic pop
3201 #pragma GCC diagnostic warning "-Wpragmas"
3203 int PG::_prepare_write_info(CephContext
* cct
,
3204 map
<string
,bufferlist
> *km
,
3206 pg_info_t
&info
, pg_info_t
&last_written_info
,
3207 PastIntervals
&past_intervals
,
3208 bool dirty_big_info
,
3211 PerfCounters
*logger
)
3214 ::encode(epoch
, (*km
)[epoch_key
]);
3218 logger
->inc(l_osd_pg_info
);
3220 // try to do info efficiently?
3221 if (!dirty_big_info
&& try_fast_info
&&
3222 info
.last_update
> last_written_info
.last_update
) {
3223 pg_fast_info_t fast
;
3224 fast
.populate_from(info
);
3225 bool did
= fast
.try_apply_to(&last_written_info
);
3226 assert(did
); // we verified last_update increased above
3227 if (info
== last_written_info
) {
3228 ::encode(fast
, (*km
)[fastinfo_key
]);
3230 logger
->inc(l_osd_pg_fastinfo
);
3233 generic_dout(30) << __func__
<< " fastinfo failed, info:\n";
3235 JSONFormatter
jf(true);
3236 jf
.dump_object("info", info
);
3240 *_dout
<< "\nlast_written_info:\n";
3241 JSONFormatter
jf(true);
3242 jf
.dump_object("last_written_info", last_written_info
);
3247 last_written_info
= info
;
3249 // info. store purged_snaps separately.
3250 interval_set
<snapid_t
> purged_snaps
;
3251 purged_snaps
.swap(info
.purged_snaps
);
3252 ::encode(info
, (*km
)[info_key
]);
3253 purged_snaps
.swap(info
.purged_snaps
);
3255 if (dirty_big_info
) {
3256 // potentially big stuff
3257 bufferlist
& bigbl
= (*km
)[biginfo_key
];
3258 ::encode(past_intervals
, bigbl
);
3259 ::encode(info
.purged_snaps
, bigbl
);
3260 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3262 logger
->inc(l_osd_pg_biginfo
);
3268 void PG::_create(ObjectStore::Transaction
& t
, spg_t pgid
, int bits
)
3271 t
.create_collection(coll
, bits
);
3274 void PG::_init(ObjectStore::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
)
3279 // Give a hint to the PG collection
3281 uint32_t pg_num
= pool
->get_pg_num();
3282 uint64_t expected_num_objects_pg
= pool
->expected_num_objects
/ pg_num
;
3283 ::encode(pg_num
, hint
);
3284 ::encode(expected_num_objects_pg
, hint
);
3285 uint32_t hint_type
= ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
;
3286 t
.collection_hint(coll
, hint_type
, hint
);
3289 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3290 t
.touch(coll
, pgmeta_oid
);
3291 map
<string
,bufferlist
> values
;
3292 __u8 struct_v
= cur_struct_v
;
3293 ::encode(struct_v
, values
[infover_key
]);
3294 t
.omap_setkeys(coll
, pgmeta_oid
, values
);
3297 void PG::prepare_write_info(map
<string
,bufferlist
> *km
)
3299 info
.stats
.stats
.add(unstable_stats
);
3300 unstable_stats
.clear();
3302 bool need_update_epoch
= last_epoch
< get_osdmap()->get_epoch();
3303 int ret
= _prepare_write_info(cct
, km
, get_osdmap()->get_epoch(),
3307 dirty_big_info
, need_update_epoch
,
3308 cct
->_conf
->osd_fast_info
,
3311 if (need_update_epoch
)
3312 last_epoch
= get_osdmap()->get_epoch();
3313 last_persisted_osdmap_ref
= osdmap_ref
;
3316 dirty_big_info
= false;
3319 #pragma GCC diagnostic ignored "-Wpragmas"
3320 #pragma GCC diagnostic push
3321 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3323 bool PG::_has_removal_flag(ObjectStore
*store
,
3327 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3329 // first try new way
3331 keys
.insert("_remove");
3332 map
<string
,bufferlist
> values
;
3333 if (store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
) == 0 &&
3340 int PG::peek_map_epoch(ObjectStore
*store
,
3346 ghobject_t
legacy_infos_oid(OSD::make_infos_oid());
3347 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3348 epoch_t cur_epoch
= 0;
3352 // validate collection name
3353 assert(coll
.is_pg());
3358 keys
.insert(infover_key
);
3359 keys
.insert(epoch_key
);
3360 map
<string
,bufferlist
> values
;
3361 int r
= store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
);
3363 assert(values
.size() == 2);
3365 // sanity check version
3366 bufferlist::iterator bp
= values
[infover_key
].begin();
3368 ::decode(struct_v
, bp
);
3369 assert(struct_v
>= 8);
3372 bp
= values
[epoch_key
].begin();
3373 ::decode(cur_epoch
, bp
);
3375 // probably bug 10617; see OSD::load_pgs()
3379 *pepoch
= cur_epoch
;
3383 #pragma GCC diagnostic pop
3384 #pragma GCC diagnostic warning "-Wpragmas"
3386 void PG::write_if_dirty(ObjectStore::Transaction
& t
)
3388 map
<string
,bufferlist
> km
;
3389 if (dirty_big_info
|| dirty_info
)
3390 prepare_write_info(&km
);
3391 pg_log
.write_log_and_missing(t
, &km
, coll
, pgmeta_oid
, pool
.info
.require_rollback());
3393 t
.omap_setkeys(coll
, pgmeta_oid
, km
);
3396 void PG::add_log_entry(const pg_log_entry_t
& e
, bool applied
)
3398 // raise last_complete only if we were previously up to date
3399 if (info
.last_complete
== info
.last_update
)
3400 info
.last_complete
= e
.version
;
3402 // raise last_update.
3403 assert(e
.version
> info
.last_update
);
3404 info
.last_update
= e
.version
;
3406 // raise user_version, if it increased (it may have not get bumped
3407 // by all logged updates)
3408 if (e
.user_version
> info
.last_user_version
)
3409 info
.last_user_version
= e
.user_version
;
3412 pg_log
.add(e
, applied
);
3413 dout(10) << "add_log_entry " << e
<< dendl
;
3417 void PG::append_log(
3418 const vector
<pg_log_entry_t
>& logv
,
3420 eversion_t roll_forward_to
,
3421 ObjectStore::Transaction
&t
,
3422 bool transaction_applied
)
3424 if (transaction_applied
)
3425 update_snap_map(logv
, t
);
3427 /* The primary has sent an info updating the history, but it may not
3428 * have arrived yet. We want to make sure that we cannot remember this
3429 * write without remembering that it happened in an interval which went
3430 * active in epoch history.last_epoch_started.
3432 if (info
.last_epoch_started
!= info
.history
.last_epoch_started
) {
3433 info
.history
.last_epoch_started
= info
.last_epoch_started
;
3435 if (info
.last_interval_started
!= info
.history
.last_interval_started
) {
3436 info
.history
.last_interval_started
= info
.last_interval_started
;
3438 dout(10) << "append_log " << pg_log
.get_log() << " " << logv
<< dendl
;
3440 PGLogEntryHandler handler
{this, &t
};
3441 if (!transaction_applied
) {
3442 /* We must be a backfill peer, so it's ok if we apply
3443 * out-of-turn since we won't be considered when
3444 * determining a min possible last_update.
3446 pg_log
.roll_forward(&handler
);
3449 for (vector
<pg_log_entry_t
>::const_iterator p
= logv
.begin();
3452 add_log_entry(*p
, transaction_applied
);
3454 /* We don't want to leave the rollforward artifacts around
3455 * here past last_backfill. It's ok for the same reason as
3457 if (transaction_applied
&&
3458 p
->soid
> info
.last_backfill
) {
3459 pg_log
.roll_forward(&handler
);
3462 auto last
= logv
.rbegin();
3463 if (is_primary() && last
!= logv
.rend()) {
3464 projected_log
.skip_can_rollback_to_to_head();
3465 projected_log
.trim(cct
, last
->version
, nullptr, nullptr, nullptr);
3468 if (transaction_applied
&& roll_forward_to
> pg_log
.get_can_rollback_to()) {
3469 pg_log
.roll_forward_to(
3472 t
.register_on_applied(
3473 new C_UpdateLastRollbackInfoTrimmedToApplied(
3475 get_osdmap()->get_epoch(),
3479 dout(10) << __func__
<< " approx pg log length = "
3480 << pg_log
.get_log().approx_size() << dendl
;
3481 dout(10) << __func__
<< " transaction_applied = "
3482 << transaction_applied
<< dendl
;
3483 if (!transaction_applied
)
3484 dout(10) << __func__
<< " " << pg_whoami
3485 << " is backfill target" << dendl
;
3486 pg_log
.trim(trim_to
, info
, transaction_applied
);
3488 // update the local pg, pg log
3493 bool PG::check_log_for_corruption(ObjectStore
*store
)
3495 /// TODO: this method needs to work with the omap log
3499 //! Get the name we're going to save our corrupt page log as
3500 std::string
PG::get_corrupt_pg_log_name() const
3502 const int MAX_BUF
= 512;
3505 time_t my_time(time(NULL
));
3506 const struct tm
*t
= localtime_r(&my_time
, &tm_buf
);
3507 int ret
= strftime(buf
, sizeof(buf
), "corrupt_log_%Y-%m-%d_%k:%M_", t
);
3509 dout(0) << "strftime failed" << dendl
;
3510 return "corrupt_log_unknown_time";
3513 out
+= stringify(info
.pgid
);
3518 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
, bufferlist
&bl
,
3519 pg_info_t
&info
, PastIntervals
&past_intervals
,
3522 // try for v8 or later
3524 keys
.insert(infover_key
);
3525 keys
.insert(info_key
);
3526 keys
.insert(biginfo_key
);
3527 keys
.insert(fastinfo_key
);
3528 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3529 map
<string
,bufferlist
> values
;
3530 int r
= store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
);
3532 assert(values
.size() == 3 ||
3533 values
.size() == 4);
3535 bufferlist::iterator p
= values
[infover_key
].begin();
3536 ::decode(struct_v
, p
);
3537 assert(struct_v
>= 8);
3539 p
= values
[info_key
].begin();
3542 p
= values
[biginfo_key
].begin();
3543 if (struct_v
>= 10) {
3544 ::decode(past_intervals
, p
);
3546 past_intervals
.decode_classic(p
);
3548 ::decode(info
.purged_snaps
, p
);
3550 p
= values
[fastinfo_key
].begin();
3552 pg_fast_info_t fast
;
3554 fast
.try_apply_to(&info
);
3560 ghobject_t
infos_oid(OSD::make_infos_oid());
3561 bufferlist::iterator p
= bl
.begin();
3562 ::decode(struct_v
, p
);
3563 assert(struct_v
== 7);
3565 // get info out of leveldb
3566 string k
= get_info_key(info
.pgid
);
3567 string bk
= get_biginfo_key(info
.pgid
);
3572 store
->omap_get_values(coll_t::meta(), ghobject_t(infos_oid
), keys
, &values
);
3573 assert(values
.size() == 2);
3575 p
= values
[k
].begin();
3578 p
= values
[bk
].begin();
3579 ::decode(past_intervals
, p
);
3580 interval_set
<snapid_t
> snap_collections
; // obsolete
3581 ::decode(snap_collections
, p
);
3582 ::decode(info
.purged_snaps
, p
);
3586 void PG::read_state(ObjectStore
*store
, bufferlist
&bl
)
3588 int r
= read_info(store
, pg_id
, coll
, bl
, info
, past_intervals
,
3592 last_written_info
= info
;
3594 // if we are upgrading from jewel, we need to force rebuild of
3595 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3596 // (before kraken). persisted missing set was circa
3597 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3598 // v8 was pre-jewel (per-pg meta object).
3599 bool force_rebuild_missing
= info_struct_v
< 9;
3600 if (force_rebuild_missing
) {
3601 dout(10) << __func__
<< " detected upgrade from jewel, force_rebuild_missing"
3606 pg_log
.read_log_and_missing(
3609 info_struct_v
< 8 ? coll_t::meta() : coll
,
3610 ghobject_t(info_struct_v
< 8 ? OSD::make_pg_log_oid(pg_id
) : pgmeta_oid
),
3612 force_rebuild_missing
,
3614 cct
->_conf
->osd_ignore_stale_divergent_priors
,
3615 cct
->_conf
->osd_debug_verify_missing_on_start
);
3617 osd
->clog
->error() << oss
.str();
3619 if (force_rebuild_missing
) {
3620 dout(10) << __func__
<< " forced rebuild of missing got "
3621 << pg_log
.get_missing()
3625 // log any weirdness
3629 void PG::log_weirdness()
3631 if (pg_log
.get_tail() != info
.log_tail
)
3632 osd
->clog
->error() << info
.pgid
3633 << " info mismatch, log.tail " << pg_log
.get_tail()
3634 << " != info.log_tail " << info
.log_tail
;
3635 if (pg_log
.get_head() != info
.last_update
)
3636 osd
->clog
->error() << info
.pgid
3637 << " info mismatch, log.head " << pg_log
.get_head()
3638 << " != info.last_update " << info
.last_update
;
3640 if (!pg_log
.get_log().empty()) {
3642 if ((pg_log
.get_log().log
.begin()->version
<= pg_log
.get_tail()))
3643 osd
->clog
->error() << info
.pgid
3644 << " log bound mismatch, info (tail,head] ("
3645 << pg_log
.get_tail() << "," << pg_log
.get_head() << "]"
3647 << pg_log
.get_log().log
.begin()->version
<< ","
3648 << pg_log
.get_log().log
.rbegin()->version
<< "]";
3651 if (pg_log
.get_log().caller_ops
.size() > pg_log
.get_log().log
.size()) {
3652 osd
->clog
->error() << info
.pgid
3653 << " caller_ops.size " << pg_log
.get_log().caller_ops
.size()
3654 << " > log size " << pg_log
.get_log().log
.size();
3658 void PG::update_snap_map(
3659 const vector
<pg_log_entry_t
> &log_entries
,
3660 ObjectStore::Transaction
&t
)
3662 for (vector
<pg_log_entry_t
>::const_iterator i
= log_entries
.begin();
3663 i
!= log_entries
.end();
3665 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
3666 if (i
->soid
.snap
< CEPH_MAXSNAP
) {
3667 if (i
->is_delete()) {
3668 int r
= snap_mapper
.remove_oid(
3672 } else if (i
->is_update()) {
3673 assert(i
->snaps
.length() > 0);
3674 vector
<snapid_t
> snaps
;
3675 bufferlist snapbl
= i
->snaps
;
3676 bufferlist::iterator p
= snapbl
.begin();
3680 derr
<< __func__
<< " decode snaps failure on " << *i
<< dendl
;
3683 set
<snapid_t
> _snaps(snaps
.begin(), snaps
.end());
3685 if (i
->is_clone() || i
->is_promote()) {
3686 snap_mapper
.add_oid(
3690 } else if (i
->is_modify()) {
3691 assert(i
->is_modify());
3692 int r
= snap_mapper
.update_snaps(
3699 assert(i
->is_clean());
3707 * filter trimming|trimmed snaps out of snapcontext
3709 void PG::filter_snapc(vector
<snapid_t
> &snaps
)
3711 //nothing needs to trim, we can return immediately
3712 if(snap_trimq
.empty() && info
.purged_snaps
.empty())
3715 bool filtering
= false;
3716 vector
<snapid_t
> newsnaps
;
3717 for (vector
<snapid_t
>::iterator p
= snaps
.begin();
3720 if (snap_trimq
.contains(*p
) || info
.purged_snaps
.contains(*p
)) {
3722 // start building a new vector with what we've seen so far
3723 dout(10) << "filter_snapc filtering " << snaps
<< dendl
;
3724 newsnaps
.insert(newsnaps
.begin(), snaps
.begin(), p
);
3727 dout(20) << "filter_snapc removing trimq|purged snap " << *p
<< dendl
;
3730 newsnaps
.push_back(*p
); // continue building new vector
3734 snaps
.swap(newsnaps
);
3735 dout(10) << "filter_snapc result " << snaps
<< dendl
;
3739 void PG::requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
)
3741 for (map
<hobject_t
, list
<OpRequestRef
>>::iterator it
= m
.begin();
3744 requeue_ops(it
->second
);
3748 void PG::requeue_op(OpRequestRef op
)
3750 auto p
= waiting_for_map
.find(op
->get_source());
3751 if (p
!= waiting_for_map
.end()) {
3752 dout(20) << __func__
<< " " << op
<< " (waiting_for_map " << p
->first
<< ")"
3754 p
->second
.push_front(op
);
3756 dout(20) << __func__
<< " " << op
<< dendl
;
3757 osd
->enqueue_front(info
.pgid
, PGQueueable(op
, get_osdmap()->get_epoch()));
3761 void PG::requeue_ops(list
<OpRequestRef
> &ls
)
3763 for (list
<OpRequestRef
>::reverse_iterator i
= ls
.rbegin();
3766 auto p
= waiting_for_map
.find((*i
)->get_source());
3767 if (p
!= waiting_for_map
.end()) {
3768 dout(20) << __func__
<< " " << *i
<< " (waiting_for_map " << p
->first
3770 p
->second
.push_front(*i
);
3772 dout(20) << __func__
<< " " << *i
<< dendl
;
3773 osd
->enqueue_front(info
.pgid
, PGQueueable(*i
, get_osdmap()->get_epoch()));
3779 void PG::requeue_map_waiters()
3781 epoch_t epoch
= get_osdmap()->get_epoch();
3782 auto p
= waiting_for_map
.begin();
3783 while (p
!= waiting_for_map
.end()) {
3784 if (epoch
< p
->second
.front()->min_epoch
) {
3785 dout(20) << __func__
<< " " << p
->first
<< " front op "
3786 << p
->second
.front() << " must still wait, doing nothing"
3790 dout(20) << __func__
<< " " << p
->first
<< " " << p
->second
<< dendl
;
3791 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
3792 osd
->enqueue_front(info
.pgid
, PGQueueable(*q
, epoch
));
3794 p
= waiting_for_map
.erase(p
);
3800 // ==========================================================================================
3804 * when holding pg and sched_scrub_lock, then the states are:
3806 * scrubber.reserved = true
3807 * scrubber.reserved_peers includes whoami
3808 * osd->scrub_pending++
3809 * scheduling, replica declined:
3810 * scrubber.reserved = true
3811 * scrubber.reserved_peers includes -1
3812 * osd->scrub_pending++
3814 * scrubber.reserved = true
3815 * scrubber.reserved_peers.size() == acting.size();
3817 * osd->scrub_pending++
3819 * scrubber.reserved = false;
3820 * scrubber.reserved_peers empty
3821 * osd->scrubber.active++
3824 // returns true if a scrub has been newly kicked off
3825 bool PG::sched_scrub()
3827 bool nodeep_scrub
= false;
3828 assert(is_locked());
3829 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3833 double deep_scrub_interval
= 0;
3834 pool
.info
.opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
3835 if (deep_scrub_interval
<= 0) {
3836 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
3838 bool time_for_deep
= ceph_clock_now() >=
3839 info
.history
.last_deep_scrub_stamp
+ deep_scrub_interval
;
3841 bool deep_coin_flip
= false;
3842 // Only add random deep scrubs when NOT user initiated scrub
3843 if (!scrubber
.must_scrub
)
3844 deep_coin_flip
= (rand() % 100) < cct
->_conf
->osd_deep_scrub_randomize_ratio
* 100;
3845 dout(20) << __func__
<< ": time_for_deep=" << time_for_deep
<< " deep_coin_flip=" << deep_coin_flip
<< dendl
;
3847 time_for_deep
= (time_for_deep
|| deep_coin_flip
);
3849 //NODEEP_SCRUB so ignore time initiated deep-scrub
3850 if (osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
3851 pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
)) {
3852 time_for_deep
= false;
3853 nodeep_scrub
= true;
3856 if (!scrubber
.must_scrub
) {
3857 assert(!scrubber
.must_deep_scrub
);
3859 //NOSCRUB so skip regular scrubs
3860 if ((osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
3861 pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
)) && !time_for_deep
) {
3862 if (scrubber
.reserved
) {
3863 // cancel scrub if it is still in scheduling,
3864 // so pgs from other pools where scrub are still legal
3865 // have a chance to go ahead with scrubbing.
3866 clear_scrub_reserved();
3867 scrub_unreserve_replicas();
3873 if (cct
->_conf
->osd_scrub_auto_repair
3874 && get_pgbackend()->auto_repair_supported()
3876 // respect the command from user, and not do auto-repair
3877 && !scrubber
.must_repair
3878 && !scrubber
.must_scrub
3879 && !scrubber
.must_deep_scrub
) {
3880 dout(20) << __func__
<< ": auto repair with deep scrubbing" << dendl
;
3881 scrubber
.auto_repair
= true;
3883 // this happens when user issue the scrub/repair command during
3884 // the scheduling of the scrub/repair (e.g. request reservation)
3885 scrubber
.auto_repair
= false;
3889 if (!scrubber
.reserved
) {
3890 assert(scrubber
.reserved_peers
.empty());
3891 if ((cct
->_conf
->osd_scrub_during_recovery
|| !osd
->is_recovery_active()) &&
3892 osd
->inc_scrubs_pending()) {
3893 dout(20) << __func__
<< ": reserved locally, reserving replicas" << dendl
;
3894 scrubber
.reserved
= true;
3895 scrubber
.reserved_peers
.insert(pg_whoami
);
3896 scrub_reserve_replicas();
3898 dout(20) << __func__
<< ": failed to reserve locally" << dendl
;
3902 if (scrubber
.reserved
) {
3903 if (scrubber
.reserve_failed
) {
3904 dout(20) << "sched_scrub: failed, a peer declined" << dendl
;
3905 clear_scrub_reserved();
3906 scrub_unreserve_replicas();
3908 } else if (scrubber
.reserved_peers
.size() == acting
.size()) {
3909 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl
;
3910 if (time_for_deep
) {
3911 dout(10) << "sched_scrub: scrub will be deep" << dendl
;
3912 state_set(PG_STATE_DEEP_SCRUB
);
3913 } else if (!scrubber
.must_deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
) {
3914 if (!nodeep_scrub
) {
3915 osd
->clog
->info() << "osd." << osd
->whoami
3916 << " pg " << info
.pgid
3917 << " Deep scrub errors, upgrading scrub to deep-scrub";
3918 state_set(PG_STATE_DEEP_SCRUB
);
3919 } else if (!scrubber
.must_scrub
) {
3920 osd
->clog
->error() << "osd." << osd
->whoami
3921 << " pg " << info
.pgid
3922 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3923 clear_scrub_reserved();
3924 scrub_unreserve_replicas();
3927 osd
->clog
->error() << "osd." << osd
->whoami
3928 << " pg " << info
.pgid
3929 << " Regular scrub request, deep-scrub details will be lost";
3934 // none declined, since scrubber.reserved is set
3935 dout(20) << "sched_scrub: reserved " << scrubber
.reserved_peers
<< ", waiting for replicas" << dendl
;
3942 void PG::reg_next_scrub()
3949 if (scrubber
.must_scrub
) {
3950 // Set the smallest time that isn't utime_t()
3951 reg_stamp
= utime_t(0,1);
3953 } else if (info
.stats
.stats_invalid
&& cct
->_conf
->osd_scrub_invalid_stats
) {
3954 reg_stamp
= ceph_clock_now();
3957 reg_stamp
= info
.history
.last_scrub_stamp
;
3959 // note down the sched_time, so we can locate this scrub, and remove it
3961 double scrub_min_interval
= 0, scrub_max_interval
= 0;
3962 pool
.info
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &scrub_min_interval
);
3963 pool
.info
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
3964 assert(scrubber
.scrub_reg_stamp
== utime_t());
3965 scrubber
.scrub_reg_stamp
= osd
->reg_pg_scrub(info
.pgid
,
3972 void PG::unreg_next_scrub()
3975 osd
->unreg_pg_scrub(info
.pgid
, scrubber
.scrub_reg_stamp
);
3976 scrubber
.scrub_reg_stamp
= utime_t();
3980 void PG::do_replica_scrub_map(OpRequestRef op
)
3982 const MOSDRepScrubMap
*m
= static_cast<const MOSDRepScrubMap
*>(op
->get_req());
3983 dout(7) << __func__
<< " " << *m
<< dendl
;
3984 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3985 dout(10) << __func__
<< " discarding old from "
3986 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
3990 if (!scrubber
.is_chunky_scrub_active()) {
3991 dout(10) << __func__
<< " scrub isn't active" << dendl
;
3997 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3998 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3999 dout(10) << "map version is "
4000 << scrubber
.received_maps
[m
->from
].valid_through
4003 dout(10) << __func__
<< " waiting_on_whom was " << scrubber
.waiting_on_whom
4005 assert(scrubber
.waiting_on_whom
.count(m
->from
));
4006 scrubber
.waiting_on_whom
.erase(m
->from
);
4008 dout(10) << __func__
<< " replica was preempted, setting flag" << dendl
;
4009 scrub_preempted
= true;
4011 if (scrubber
.waiting_on_whom
.empty()) {
4012 if (ops_blocked_by_scrub()) {
4013 requeue_scrub(true);
4015 requeue_scrub(false);
4020 void PG::sub_op_scrub_map(OpRequestRef op
)
4022 // for legacy jewel compatibility only
4023 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
4024 assert(m
->get_type() == MSG_OSD_SUBOP
);
4025 dout(7) << "sub_op_scrub_map" << dendl
;
4027 if (m
->map_epoch
< info
.history
.same_interval_since
) {
4028 dout(10) << "sub_op_scrub discarding old sub_op from "
4029 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
<< dendl
;
4033 if (!scrubber
.is_chunky_scrub_active()) {
4034 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl
;
4040 dout(10) << " got " << m
->from
<< " scrub map" << dendl
;
4041 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
4043 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
4044 dout(10) << "map version is "
4045 << scrubber
.received_maps
[m
->from
].valid_through
4048 scrubber
.waiting_on_whom
.erase(m
->from
);
4050 if (scrubber
.waiting_on_whom
.empty()) {
4051 if (ops_blocked_by_scrub()) {
4052 requeue_scrub(true);
4054 requeue_scrub(false);
4059 // send scrub v3 messages (chunky scrub)
4060 void PG::_request_scrub_map(
4061 pg_shard_t replica
, eversion_t version
,
4062 hobject_t start
, hobject_t end
,
4064 bool allow_preemption
)
4066 assert(replica
!= pg_whoami
);
4067 dout(10) << "scrub requesting scrubmap from osd." << replica
4068 << " deep " << (int)deep
<< dendl
;
4069 MOSDRepScrub
*repscrubop
= new MOSDRepScrub(
4070 spg_t(info
.pgid
.pgid
, replica
.shard
), version
,
4071 get_osdmap()->get_epoch(),
4072 get_last_peering_reset(),
4076 ops_blocked_by_scrub());
4077 // default priority, we want the rep scrub processed prior to any recovery
4078 // or client io messages (we are holding a lock!)
4079 osd
->send_message_osd_cluster(
4080 replica
.osd
, repscrubop
, get_osdmap()->get_epoch());
4083 void PG::handle_scrub_reserve_request(OpRequestRef op
)
4085 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4087 if (scrubber
.reserved
) {
4088 dout(10) << __func__
<< " ignoring reserve request: Already reserved"
4092 if ((cct
->_conf
->osd_scrub_during_recovery
|| !osd
->is_recovery_active()) &&
4093 osd
->inc_scrubs_pending()) {
4094 scrubber
.reserved
= true;
4096 dout(20) << __func__
<< ": failed to reserve remotely" << dendl
;
4097 scrubber
.reserved
= false;
4099 if (op
->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE
) {
4100 const MOSDScrubReserve
*m
=
4101 static_cast<const MOSDScrubReserve
*>(op
->get_req());
4102 Message
*reply
= new MOSDScrubReserve(
4103 spg_t(info
.pgid
.pgid
, primary
.shard
),
4105 scrubber
.reserved
? MOSDScrubReserve::GRANT
: MOSDScrubReserve::REJECT
,
4107 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
4109 // for jewel compat only
4110 const MOSDSubOp
*req
= static_cast<const MOSDSubOp
*>(op
->get_req());
4111 assert(req
->get_type() == MSG_OSD_SUBOP
);
4112 MOSDSubOpReply
*reply
= new MOSDSubOpReply(
4113 req
, pg_whoami
, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK
);
4114 ::encode(scrubber
.reserved
, reply
->get_data());
4115 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
4119 void PG::handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
)
4121 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4123 if (!scrubber
.reserved
) {
4124 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
4127 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
4128 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
4130 dout(10) << " osd." << from
<< " scrub reserve = success" << dendl
;
4131 scrubber
.reserved_peers
.insert(from
);
4136 void PG::handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
)
4138 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4140 if (!scrubber
.reserved
) {
4141 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
4144 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
4145 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
4147 /* One decline stops this pg from being scheduled for scrubbing. */
4148 dout(10) << " osd." << from
<< " scrub reserve = fail" << dendl
;
4149 scrubber
.reserve_failed
= true;
4154 void PG::handle_scrub_reserve_release(OpRequestRef op
)
4156 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4158 clear_scrub_reserved();
4161 void PG::reject_reservation()
4163 osd
->send_message_osd_cluster(
4165 new MBackfillReserve(
4166 MBackfillReserve::REJECT
,
4167 spg_t(info
.pgid
.pgid
, primary
.shard
),
4168 get_osdmap()->get_epoch()),
4169 get_osdmap()->get_epoch());
4172 void PG::schedule_backfill_retry(float delay
)
4174 Mutex::Locker
lock(osd
->recovery_request_lock
);
4175 osd
->recovery_request_timer
.add_event_after(
4177 new QueuePeeringEvt
<RequestBackfill
>(
4178 this, get_osdmap()->get_epoch(),
4179 RequestBackfill()));
4182 void PG::schedule_recovery_retry(float delay
)
4184 Mutex::Locker
lock(osd
->recovery_request_lock
);
4185 osd
->recovery_request_timer
.add_event_after(
4187 new QueuePeeringEvt
<DoRecovery
>(
4188 this, get_osdmap()->get_epoch(),
4192 void PG::clear_scrub_reserved()
4194 scrubber
.reserved_peers
.clear();
4195 scrubber
.reserve_failed
= false;
4197 if (scrubber
.reserved
) {
4198 scrubber
.reserved
= false;
4199 osd
->dec_scrubs_pending();
4203 void PG::scrub_reserve_replicas()
4205 assert(backfill_targets
.empty());
4206 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4207 i
!= actingbackfill
.end();
4209 if (*i
== pg_whoami
) continue;
4210 dout(10) << "scrub requesting reserve from osd." << *i
<< dendl
;
4211 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
4212 osd
->send_message_osd_cluster(
4214 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
4215 get_osdmap()->get_epoch(),
4216 MOSDScrubReserve::REQUEST
, pg_whoami
),
4217 get_osdmap()->get_epoch());
4219 // for jewel compat only
4220 vector
<OSDOp
> scrub(1);
4221 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_RESERVE
;
4225 MOSDSubOp
*subop
= new MOSDSubOp(
4226 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
4227 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
4229 osd
->send_message_osd_cluster(
4230 i
->osd
, subop
, get_osdmap()->get_epoch());
4235 void PG::scrub_unreserve_replicas()
4237 assert(backfill_targets
.empty());
4238 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4239 i
!= actingbackfill
.end();
4241 if (*i
== pg_whoami
) continue;
4242 dout(10) << "scrub requesting unreserve from osd." << *i
<< dendl
;
4243 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
4244 osd
->send_message_osd_cluster(
4246 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
4247 get_osdmap()->get_epoch(),
4248 MOSDScrubReserve::RELEASE
, pg_whoami
),
4249 get_osdmap()->get_epoch());
4251 // for jewel compat only
4252 vector
<OSDOp
> scrub(1);
4253 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_UNRESERVE
;
4257 MOSDSubOp
*subop
= new MOSDSubOp(
4258 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
4259 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
4261 osd
->send_message_osd_cluster(i
->osd
, subop
, get_osdmap()->get_epoch());
4266 void PG::_scan_rollback_obs(
4267 const vector
<ghobject_t
> &rollback_obs
,
4268 ThreadPool::TPHandle
&handle
)
4270 ObjectStore::Transaction t
;
4271 eversion_t trimmed_to
= last_rollback_info_trimmed_to_applied
;
4272 for (vector
<ghobject_t
>::const_iterator i
= rollback_obs
.begin();
4273 i
!= rollback_obs
.end();
4275 if (i
->generation
< trimmed_to
.version
) {
4276 osd
->clog
->error() << "osd." << osd
->whoami
4277 << " pg " << info
.pgid
4278 << " found obsolete rollback obj "
4279 << *i
<< " generation < trimmed_to "
4286 derr
<< __func__
<< ": queueing trans to clean up obsolete rollback objs"
4288 osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
4292 void PG::_scan_snaps(ScrubMap
&smap
)
4297 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4298 // caller using clean_meta_map(), and it works properly.
4299 dout(20) << __func__
<< " start" << dendl
;
4301 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4302 i
!= smap
.objects
.rend();
4304 const hobject_t
&hoid
= i
->first
;
4305 ScrubMap::object
&o
= i
->second
;
4307 dout(20) << __func__
<< " " << hoid
<< dendl
;
4309 if (hoid
.is_head() || hoid
.is_snapdir()) {
4310 // parse the SnapSet
4312 if (o
.attrs
.find(SS_ATTR
) == o
.attrs
.end()) {
4315 bl
.push_back(o
.attrs
[SS_ATTR
]);
4316 auto p
= bl
.begin();
4318 ::decode(snapset
, p
);
4322 head
= hoid
.get_head();
4323 // Make sure head_exists is correct for is_legacy() check
4325 snapset
.head_exists
= true;
4328 if (hoid
.snap
< CEPH_MAXSNAP
) {
4329 // check and if necessary fix snap_mapper
4330 if (hoid
.get_head() != head
) {
4331 derr
<< __func__
<< " no head for " << hoid
<< " (have " << head
<< ")"
4335 set
<snapid_t
> obj_snaps
;
4336 if (!snapset
.is_legacy()) {
4337 auto p
= snapset
.clone_snaps
.find(hoid
.snap
);
4338 if (p
== snapset
.clone_snaps
.end()) {
4339 derr
<< __func__
<< " no clone_snaps for " << hoid
<< " in " << snapset
4343 obj_snaps
.insert(p
->second
.begin(), p
->second
.end());
4346 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4349 bl
.push_back(o
.attrs
[OI_ATTR
]);
4356 obj_snaps
.insert(oi
.legacy_snaps
.begin(), oi
.legacy_snaps
.end());
4358 set
<snapid_t
> cur_snaps
;
4359 int r
= snap_mapper
.get_snaps(hoid
, &cur_snaps
);
4360 if (r
!= 0 && r
!= -ENOENT
) {
4361 derr
<< __func__
<< ": get_snaps returned " << cpp_strerror(r
) << dendl
;
4364 if (r
== -ENOENT
|| cur_snaps
!= obj_snaps
) {
4365 ObjectStore::Transaction t
;
4366 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4368 r
= snap_mapper
.remove_oid(hoid
, &_t
);
4370 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
)
4374 osd
->clog
->error() << "osd." << osd
->whoami
4375 << " found snap mapper error on pg "
4377 << " oid " << hoid
<< " snaps in mapper: "
4378 << cur_snaps
<< ", oi: "
4382 osd
->clog
->error() << "osd." << osd
->whoami
4383 << " found snap mapper error on pg "
4385 << " oid " << hoid
<< " snaps missing in mapper"
4390 snap_mapper
.add_oid(hoid
, obj_snaps
, &_t
);
4392 // wait for repair to apply to avoid confusing other bits of the system.
4395 Mutex
my_lock("PG::_scan_snaps my_lock");
4398 t
.register_on_applied_sync(
4399 new C_SafeCond(&my_lock
, &my_cond
, &done
, &r
));
4400 r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4402 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4407 my_cond
.Wait(my_lock
);
4416 void PG::_repair_oinfo_oid(ScrubMap
&smap
)
4418 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4419 i
!= smap
.objects
.rend();
4421 const hobject_t
&hoid
= i
->first
;
4422 ScrubMap::object
&o
= i
->second
;
4425 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4428 bl
.push_back(o
.attrs
[OI_ATTR
]);
4435 if (oi
.soid
!= hoid
) {
4436 ObjectStore::Transaction t
;
4437 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4438 osd
->clog
->error() << "osd." << osd
->whoami
4439 << " found object info error on pg "
4441 << " oid " << hoid
<< " oid in object info: "
4447 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4449 bufferptr
bp(bl
.c_str(), bl
.length());
4450 o
.attrs
[OI_ATTR
] = bp
;
4452 t
.setattr(coll
, ghobject_t(hoid
), OI_ATTR
, bl
);
4453 int r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4455 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4461 int PG::build_scrub_map_chunk(
4463 ScrubMapBuilder
&pos
,
4467 ThreadPool::TPHandle
&handle
)
4469 dout(10) << __func__
<< " [" << start
<< "," << end
<< ") "
4474 while (pos
.empty()) {
4476 map
.valid_through
= info
.last_update
;
4480 vector
<ghobject_t
> rollback_obs
;
4481 pos
.ret
= get_pgbackend()->objects_list_range(
4488 dout(5) << "objects_list_range error: " << pos
.ret
<< dendl
;
4491 if (pos
.ls
.empty()) {
4494 _scan_rollback_obs(rollback_obs
, handle
);
4496 return -EINPROGRESS
;
4500 while (!pos
.done()) {
4501 int r
= get_pgbackend()->be_scan_list(map
, pos
);
4502 if (r
== -EINPROGRESS
) {
4508 dout(20) << __func__
<< " finishing" << dendl
;
4510 _repair_oinfo_oid(map
);
4511 if (!is_primary()) {
4512 ScrubMap for_meta_scrub
;
4513 // In case we restarted smaller chunk, clear old data
4514 scrubber
.cleaned_meta_map
.clear_from(scrubber
.start
);
4515 scrubber
.cleaned_meta_map
.insert(map
);
4516 scrubber
.clean_meta_map(for_meta_scrub
);
4517 _scan_snaps(for_meta_scrub
);
4520 dout(20) << __func__
<< " done, got " << map
.objects
.size() << " items"
4525 void PG::Scrubber::cleanup_store(ObjectStore::Transaction
*t
) {
4528 struct OnComplete
: Context
{
4529 std::unique_ptr
<Scrub::Store
> store
;
4531 std::unique_ptr
<Scrub::Store
> &&store
)
4532 : store(std::move(store
)) {}
4533 void finish(int) override
{}
4536 t
->register_on_complete(new OnComplete(std::move(store
)));
4540 void PG::repair_object(
4541 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
4542 pg_shard_t bad_peer
)
4544 list
<pg_shard_t
> op_shards
;
4545 for (auto i
: *ok_peers
) {
4546 op_shards
.push_back(i
.second
);
4548 dout(10) << "repair_object " << soid
<< " bad_peer osd."
4549 << bad_peer
<< " ok_peers osd.{" << op_shards
<< "}" << dendl
;
4550 ScrubMap::object
&po
= ok_peers
->back().first
;
4553 bv
.push_back(po
.attrs
[OI_ATTR
]);
4556 bufferlist::iterator bliter
= bv
.begin();
4557 ::decode(oi
, bliter
);
4559 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
4562 if (bad_peer
!= primary
) {
4563 peer_missing
[bad_peer
].add(soid
, oi
.version
, eversion_t(), false);
4565 // We should only be scrubbing if the PG is clean.
4566 assert(waiting_for_unreadable_object
.empty());
4568 pg_log
.missing_add(soid
, oi
.version
, eversion_t());
4570 pg_log
.set_last_requested(0);
4571 dout(10) << __func__
<< ": primary = " << primary
<< dendl
;
4574 if (is_ec_pg() || bad_peer
== primary
) {
4575 // we'd better collect all shard for EC pg, and prepare good peers as the
4576 // source of pull in the case of replicated pg.
4577 missing_loc
.add_missing(soid
, oi
.version
, eversion_t());
4578 list
<pair
<ScrubMap::object
, pg_shard_t
> >::iterator i
;
4579 for (i
= ok_peers
->begin();
4580 i
!= ok_peers
->end();
4582 missing_loc
.add_location(soid
, i
->second
);
4588 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4589 * for pushes to complete in case of recent recovery. Build a single
4590 * scrubmap of objects that are in the range [msg->start, msg->end).
4592 void PG::replica_scrub(
4594 ThreadPool::TPHandle
&handle
)
4596 const MOSDRepScrub
*msg
= static_cast<const MOSDRepScrub
*>(op
->get_req());
4597 assert(!scrubber
.active_rep_scrub
);
4598 dout(7) << "replica_scrub" << dendl
;
4600 if (msg
->map_epoch
< info
.history
.same_interval_since
) {
4601 dout(10) << "replica_scrub discarding old replica_scrub from "
4602 << msg
->map_epoch
<< " < " << info
.history
.same_interval_since
4607 assert(msg
->chunky
);
4608 if (last_update_applied
< msg
->scrub_to
) {
4609 dout(10) << "waiting for last_update_applied to catch up" << dendl
;
4610 scrubber
.active_rep_scrub
= op
;
4614 if (active_pushes
> 0) {
4615 dout(10) << "waiting for active pushes to finish" << dendl
;
4616 scrubber
.active_rep_scrub
= op
;
4620 scrubber
.state
= Scrubber::BUILD_MAP_REPLICA
;
4621 scrubber
.replica_scrub_start
= msg
->min_epoch
;
4622 scrubber
.start
= msg
->start
;
4623 scrubber
.end
= msg
->end
;
4624 scrubber
.max_end
= msg
->end
;
4625 scrubber
.deep
= msg
->deep
;
4626 scrubber
.epoch_start
= info
.history
.same_interval_since
;
4627 if (msg
->priority
) {
4628 scrubber
.priority
= msg
->priority
;
4630 scrubber
.priority
= get_scrub_priority();
4633 scrub_can_preempt
= msg
->allow_preemption
;
4634 scrub_preempted
= false;
4635 scrubber
.replica_scrubmap_pos
.reset();
4637 requeue_scrub(msg
->high_priority
);
4641 * PG_STATE_SCRUBBING is set when the scrub is queued
4643 * scrub will be chunky if all OSDs in PG support chunky scrub
4644 * scrub will fail if OSDs are too old.
4646 void PG::scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
)
4648 if (cct
->_conf
->osd_scrub_sleep
> 0 &&
4649 (scrubber
.state
== PG::Scrubber::NEW_CHUNK
||
4650 scrubber
.state
== PG::Scrubber::INACTIVE
) &&
4651 scrubber
.needs_sleep
) {
4652 ceph_assert(!scrubber
.sleeping
);
4653 dout(20) << __func__
<< " state is INACTIVE|NEW_CHUNK, sleeping" << dendl
;
4655 // Do an async sleep so we don't block the op queue
4656 OSDService
*osds
= osd
;
4657 spg_t pgid
= get_pgid();
4658 int state
= scrubber
.state
;
4659 auto scrub_requeue_callback
=
4660 new FunctionContext([osds
, pgid
, state
](int r
) {
4661 PG
*pg
= osds
->osd
->lookup_lock_pg(pgid
);
4662 if (pg
== nullptr) {
4663 lgeneric_dout(osds
->osd
->cct
, 20)
4664 << "scrub_requeue_callback: Could not find "
4665 << "PG " << pgid
<< " can't complete scrub requeue after sleep"
4669 pg
->scrubber
.sleeping
= false;
4670 pg
->scrubber
.needs_sleep
= false;
4671 lgeneric_dout(pg
->cct
, 20)
4672 << "scrub_requeue_callback: slept for "
4673 << ceph_clock_now() - pg
->scrubber
.sleep_start
4674 << ", re-queuing scrub with state " << state
<< dendl
;
4675 pg
->scrub_queued
= false;
4676 pg
->requeue_scrub();
4677 pg
->scrubber
.sleep_start
= utime_t();
4680 Mutex::Locker
l(osd
->scrub_sleep_lock
);
4681 osd
->scrub_sleep_timer
.add_event_after(cct
->_conf
->osd_scrub_sleep
,
4682 scrub_requeue_callback
);
4683 scrubber
.sleeping
= true;
4684 scrubber
.sleep_start
= ceph_clock_now();
4687 if (pg_has_reset_since(queued
)) {
4690 assert(scrub_queued
);
4691 scrub_queued
= false;
4692 scrubber
.needs_sleep
= true;
4695 if (!is_primary() &&
4696 scrubber
.state
== PG::Scrubber::BUILD_MAP_REPLICA
) {
4697 chunky_scrub(handle
);
4701 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4702 dout(10) << "scrub -- not primary or active or not clean" << dendl
;
4703 state_clear(PG_STATE_SCRUBBING
);
4704 state_clear(PG_STATE_REPAIR
);
4705 state_clear(PG_STATE_DEEP_SCRUB
);
4706 publish_stats_to_osd();
4710 if (!scrubber
.active
) {
4711 assert(backfill_targets
.empty());
4713 scrubber
.deep
= state_test(PG_STATE_DEEP_SCRUB
);
4715 dout(10) << "starting a new chunky scrub" << dendl
;
4718 chunky_scrub(handle
);
4722 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4725 * The object store is partitioned into chunks which end on hash boundaries. For
4726 * each chunk, the following logic is performed:
4728 * (1) Block writes on the chunk
4729 * (2) Request maps from replicas
4730 * (3) Wait for pushes to be applied (after recovery)
4731 * (4) Wait for writes to flush on the chunk
4732 * (5) Wait for maps from replicas
4733 * (6) Compare / repair all scrub maps
4734 * (7) Wait for digest updates to apply
4736 * This logic is encoded in the mostly linear state machine:
4738 * +------------------+
4739 * _________v__________ |
4742 * |____________________| |
4745 * _________v___v______ | |
4748 * |____________________| | |
4750 * _________v__________ | |
4752 * | WAIT_PUSHES | | |
4753 * |____________________| | |
4755 * _________v__________ | |
4757 * | WAIT_LAST_UPDATE | | |
4758 * |____________________| | |
4760 * _________v__________ | |
4763 * |____________________| | |
4765 * _________v__________ | |
4767 * | WAIT_REPLICAS | | |
4768 * |____________________| | |
4770 * _________v__________ | |
4772 * | COMPARE_MAPS | | |
4773 * |____________________| | |
4776 * _________v__________ | |
4778 * |WAIT_DIGEST_UPDATES | | |
4779 * |____________________| | |
4782 * _________v__________ |
4785 * |____________________| |
4787 * +------------------+
4789 * The primary determines the last update from the subset by walking the log. If
4790 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4791 * to wait until that update is applied before building a scrub map. Both the
4792 * primary and replicas will wait for any active pushes to be applied.
4794 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4796 * scrubber.state encodes the current state of the scrub (refer to state diagram
4799 void PG::chunky_scrub(ThreadPool::TPHandle
&handle
)
4801 // check for map changes
4802 if (scrubber
.is_chunky_scrub_active()) {
4803 if (scrubber
.epoch_start
!= info
.history
.same_interval_since
) {
4804 dout(10) << "scrub pg changed, aborting" << dendl
;
4805 scrub_clear_state();
4806 scrub_unreserve_replicas();
4815 dout(20) << "scrub state " << Scrubber::state_string(scrubber
.state
)
4816 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")"
4817 << " max_end " << scrubber
.max_end
<< dendl
;
4819 switch (scrubber
.state
) {
4820 case PG::Scrubber::INACTIVE
:
4821 dout(10) << "scrub start" << dendl
;
4822 assert(is_primary());
4824 publish_stats_to_osd();
4825 scrubber
.epoch_start
= info
.history
.same_interval_since
;
4826 scrubber
.active
= true;
4828 osd
->inc_scrubs_active(scrubber
.reserved
);
4829 if (scrubber
.reserved
) {
4830 scrubber
.reserved
= false;
4831 scrubber
.reserved_peers
.clear();
4835 ObjectStore::Transaction t
;
4836 scrubber
.cleanup_store(&t
);
4837 scrubber
.store
.reset(Scrub::Store::create(osd
->store
, &t
,
4839 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4842 // Don't include temporary objects when scrubbing
4843 scrubber
.start
= info
.pgid
.pgid
.get_hobj_start();
4844 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4847 bool repair
= state_test(PG_STATE_REPAIR
);
4848 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4849 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4851 oss
<< info
.pgid
.pgid
<< " " << mode
<< " starts" << std::endl
;
4852 osd
->clog
->debug(oss
);
4855 scrubber
.preempt_left
= cct
->_conf
->get_val
<uint64_t>(
4856 "osd_scrub_max_preemptions");
4857 scrubber
.preempt_divisor
= 1;
4860 case PG::Scrubber::NEW_CHUNK
:
4861 scrubber
.primary_scrubmap
= ScrubMap();
4862 scrubber
.received_maps
.clear();
4864 // begin (possible) preemption window
4865 if (scrub_preempted
) {
4866 scrubber
.preempt_left
--;
4867 scrubber
.preempt_divisor
*= 2;
4868 dout(10) << __func__
<< " preempted, " << scrubber
.preempt_left
4869 << " left" << dendl
;
4870 scrub_preempted
= false;
4872 scrub_can_preempt
= scrubber
.preempt_left
> 0;
4875 /* get the start and end of our scrub chunk
4877 * Our scrub chunk has an important restriction we're going to need to
4878 * respect. We can't let head or snapdir be start or end.
4879 * Using a half-open interval means that if end == head|snapdir,
4880 * we'd scrub/lock head and the clone right next to head in different
4881 * chunks which would allow us to miss clones created between
4882 * scrubbing that chunk and scrubbing the chunk including head.
4883 * This isn't true for any of the other clones since clones can
4884 * only be created "just to the left of" head. There is one exception
4885 * to this: promotion of clones which always happens to the left of the
4886 * left-most clone, but promote_object checks the scrubber in that
4887 * case, so it should be ok. Also, it's ok to "miss" clones at the
4888 * left end of the range if we are a tier because they may legitimately
4889 * not exist (see _scrub).
4891 int min
= std::max
<int64_t>(3, cct
->_conf
->osd_scrub_chunk_min
/
4892 scrubber
.preempt_divisor
);
4893 int max
= std::max
<int64_t>(min
, cct
->_conf
->osd_scrub_chunk_max
/
4894 scrubber
.preempt_divisor
);
4895 hobject_t start
= scrubber
.start
;
4896 hobject_t candidate_end
;
4897 vector
<hobject_t
> objects
;
4899 ret
= get_pgbackend()->objects_list_partial(
4907 if (!objects
.empty()) {
4908 hobject_t back
= objects
.back();
4909 while (candidate_end
.has_snapset() &&
4910 candidate_end
.get_head() == back
.get_head()) {
4911 candidate_end
= back
;
4913 if (objects
.empty()) {
4915 "Somehow we got more than 2 objects which"
4916 "have the same head but are not clones");
4918 back
= objects
.back();
4920 if (candidate_end
.has_snapset()) {
4921 assert(candidate_end
.get_head() != back
.get_head());
4922 candidate_end
= candidate_end
.get_object_boundary();
4925 assert(candidate_end
.is_max());
4928 if (!_range_available_for_scrub(scrubber
.start
, candidate_end
)) {
4929 // we'll be requeued by whatever made us unavailable for scrub
4930 dout(10) << __func__
<< ": scrub blocked somewhere in range "
4931 << "[" << scrubber
.start
<< ", " << candidate_end
<< ")"
4936 scrubber
.end
= candidate_end
;
4937 if (scrubber
.end
> scrubber
.max_end
)
4938 scrubber
.max_end
= scrubber
.end
;
4941 // walk the log to find the latest update that affects our chunk
4942 scrubber
.subset_last_update
= eversion_t();
4943 for (auto p
= projected_log
.log
.rbegin();
4944 p
!= projected_log
.log
.rend();
4946 if (p
->soid
>= scrubber
.start
&&
4947 p
->soid
< scrubber
.end
) {
4948 scrubber
.subset_last_update
= p
->version
;
4952 if (scrubber
.subset_last_update
== eversion_t()) {
4953 for (list
<pg_log_entry_t
>::const_reverse_iterator p
=
4954 pg_log
.get_log().log
.rbegin();
4955 p
!= pg_log
.get_log().log
.rend();
4957 if (p
->soid
>= scrubber
.start
&&
4958 p
->soid
< scrubber
.end
) {
4959 scrubber
.subset_last_update
= p
->version
;
4965 // ask replicas to wait until
4966 // last_update_applied >= scrubber.subset_last_update and then scan
4967 scrubber
.waiting_on_whom
.insert(pg_whoami
);
4969 // request maps from replicas
4970 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4971 i
!= actingbackfill
.end();
4973 if (*i
== pg_whoami
) continue;
4974 _request_scrub_map(*i
, scrubber
.subset_last_update
,
4975 scrubber
.start
, scrubber
.end
, scrubber
.deep
,
4976 scrubber
.preempt_left
> 0);
4977 scrubber
.waiting_on_whom
.insert(*i
);
4979 dout(10) << __func__
<< " waiting_on_whom " << scrubber
.waiting_on_whom
4982 scrubber
.state
= PG::Scrubber::WAIT_PUSHES
;
4985 case PG::Scrubber::WAIT_PUSHES
:
4986 if (active_pushes
== 0) {
4987 scrubber
.state
= PG::Scrubber::WAIT_LAST_UPDATE
;
4989 dout(15) << "wait for pushes to apply" << dendl
;
4994 case PG::Scrubber::WAIT_LAST_UPDATE
:
4995 if (last_update_applied
< scrubber
.subset_last_update
) {
4996 // will be requeued by op_applied
4997 dout(15) << "wait for writes to flush" << dendl
;
5002 scrubber
.state
= PG::Scrubber::BUILD_MAP
;
5003 scrubber
.primary_scrubmap_pos
.reset();
5006 case PG::Scrubber::BUILD_MAP
:
5007 assert(last_update_applied
>= scrubber
.subset_last_update
);
5009 // build my own scrub map
5010 if (scrub_preempted
) {
5011 dout(10) << __func__
<< " preempted" << dendl
;
5012 scrubber
.state
= PG::Scrubber::BUILD_MAP_DONE
;
5015 ret
= build_scrub_map_chunk(
5016 scrubber
.primary_scrubmap
,
5017 scrubber
.primary_scrubmap_pos
,
5018 scrubber
.start
, scrubber
.end
,
5021 if (ret
== -EINPROGRESS
) {
5026 scrubber
.state
= PG::Scrubber::BUILD_MAP_DONE
;
5029 case PG::Scrubber::BUILD_MAP_DONE
:
5030 if (scrubber
.primary_scrubmap_pos
.ret
< 0) {
5031 dout(5) << "error: " << scrubber
.primary_scrubmap_pos
.ret
5032 << ", aborting" << dendl
;
5033 scrub_clear_state();
5034 scrub_unreserve_replicas();
5037 dout(10) << __func__
<< " waiting_on_whom was "
5038 << scrubber
.waiting_on_whom
<< dendl
;
5039 assert(scrubber
.waiting_on_whom
.count(pg_whoami
));
5040 scrubber
.waiting_on_whom
.erase(pg_whoami
);
5042 scrubber
.state
= PG::Scrubber::WAIT_REPLICAS
;
5045 case PG::Scrubber::WAIT_REPLICAS
:
5046 if (!scrubber
.waiting_on_whom
.empty()) {
5047 // will be requeued by sub_op_scrub_map
5048 dout(10) << "wait for replicas to build scrub map" << dendl
;
5052 // end (possible) preemption window
5053 scrub_can_preempt
= false;
5054 if (scrub_preempted
) {
5055 dout(10) << __func__
<< " preempted, restarting chunk" << dendl
;
5056 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
5058 scrubber
.state
= PG::Scrubber::COMPARE_MAPS
;
5062 case PG::Scrubber::COMPARE_MAPS
:
5063 assert(last_update_applied
>= scrubber
.subset_last_update
);
5064 assert(scrubber
.waiting_on_whom
.empty());
5066 scrub_compare_maps();
5067 scrubber
.start
= scrubber
.end
;
5068 scrubber
.run_callbacks();
5070 // requeue the writes from the chunk that just finished
5071 requeue_ops(waiting_for_scrub
);
5073 scrubber
.state
= PG::Scrubber::WAIT_DIGEST_UPDATES
;
5077 case PG::Scrubber::WAIT_DIGEST_UPDATES
:
5078 if (scrubber
.num_digest_updates_pending
) {
5079 dout(10) << __func__
<< " waiting on "
5080 << scrubber
.num_digest_updates_pending
5081 << " digest updates" << dendl
;
5086 scrubber
.preempt_left
= cct
->_conf
->get_val
<uint64_t>(
5087 "osd_scrub_max_preemptions");
5088 scrubber
.preempt_divisor
= 1;
5090 if (!(scrubber
.end
.is_max())) {
5091 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
5095 scrubber
.state
= PG::Scrubber::FINISH
;
5100 case PG::Scrubber::FINISH
:
5102 scrubber
.state
= PG::Scrubber::INACTIVE
;
5105 if (!snap_trimq
.empty()) {
5106 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl
;
5107 snap_trimmer_scrub_complete();
5112 case PG::Scrubber::BUILD_MAP_REPLICA
:
5113 // build my own scrub map
5114 if (scrub_preempted
) {
5115 dout(10) << __func__
<< " preempted" << dendl
;
5118 ret
= build_scrub_map_chunk(
5119 scrubber
.replica_scrubmap
,
5120 scrubber
.replica_scrubmap_pos
,
5121 scrubber
.start
, scrubber
.end
,
5125 if (ret
== -EINPROGRESS
) {
5131 if (HAVE_FEATURE(acting_features
, SERVER_LUMINOUS
)) {
5132 MOSDRepScrubMap
*reply
= new MOSDRepScrubMap(
5133 spg_t(info
.pgid
.pgid
, get_primary().shard
),
5134 scrubber
.replica_scrub_start
,
5136 reply
->preempted
= scrub_preempted
;
5137 ::encode(scrubber
.replica_scrubmap
, reply
->get_data());
5138 osd
->send_message_osd_cluster(
5139 get_primary().osd
, reply
,
5140 scrubber
.replica_scrub_start
);
5142 // for jewel compatibility
5143 vector
<OSDOp
> scrub(1);
5144 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_MAP
;
5148 MOSDSubOp
*subop
= new MOSDSubOp(
5151 spg_t(info
.pgid
.pgid
, get_primary().shard
),
5154 scrubber
.replica_scrub_start
,
5157 ::encode(scrubber
.replica_scrubmap
, subop
->get_data());
5159 osd
->send_message_osd_cluster(
5160 get_primary().osd
, subop
,
5161 scrubber
.replica_scrub_start
);
5163 scrub_preempted
= false;
5164 scrub_can_preempt
= false;
5165 scrubber
.state
= PG::Scrubber::INACTIVE
;
5166 scrubber
.replica_scrubmap
= ScrubMap();
5167 scrubber
.replica_scrubmap_pos
= ScrubMapBuilder();
5168 scrubber
.start
= hobject_t();
5169 scrubber
.end
= hobject_t();
5170 scrubber
.max_end
= hobject_t();
5178 dout(20) << "scrub final state " << Scrubber::state_string(scrubber
.state
)
5179 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")"
5180 << " max_end " << scrubber
.max_end
<< dendl
;
5183 bool PG::write_blocked_by_scrub(const hobject_t
& soid
)
5185 if (soid
< scrubber
.start
|| soid
>= scrubber
.end
) {
5188 if (scrub_can_preempt
) {
5189 if (!scrub_preempted
) {
5190 dout(10) << __func__
<< " " << soid
<< " preempted" << dendl
;
5191 scrub_preempted
= true;
5193 dout(10) << __func__
<< " " << soid
<< " already preempted" << dendl
;
5200 bool PG::range_intersects_scrub(const hobject_t
&start
, const hobject_t
& end
)
5202 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5203 return (start
< scrubber
.max_end
&&
5204 end
>= scrubber
.start
);
5207 void PG::scrub_clear_state()
5209 assert(is_locked());
5210 state_clear(PG_STATE_SCRUBBING
);
5211 state_clear(PG_STATE_REPAIR
);
5212 state_clear(PG_STATE_DEEP_SCRUB
);
5213 publish_stats_to_osd();
5215 // active -> nothing.
5216 if (scrubber
.active
)
5217 osd
->dec_scrubs_active();
5219 requeue_ops(waiting_for_scrub
);
5223 // type-specific state clear
5224 _scrub_clear_state();
5227 void PG::scrub_compare_maps()
5229 dout(10) << __func__
<< " has maps, analyzing" << dendl
;
5231 // construct authoritative scrub map for type specific scrubbing
5232 scrubber
.cleaned_meta_map
.insert(scrubber
.primary_scrubmap
);
5234 pair
<boost::optional
<uint32_t>,
5235 boost::optional
<uint32_t>>> missing_digest
;
5237 map
<pg_shard_t
, ScrubMap
*> maps
;
5238 maps
[pg_whoami
] = &scrubber
.primary_scrubmap
;
5240 for (const auto& i
: actingbackfill
) {
5241 if (i
== pg_whoami
) continue;
5242 dout(2) << __func__
<< " replica " << i
<< " has "
5243 << scrubber
.received_maps
[i
].objects
.size()
5244 << " items" << dendl
;
5245 maps
[i
] = &scrubber
.received_maps
[i
];
5248 set
<hobject_t
> master_set
;
5250 // Construct master set
5251 for (const auto map
: maps
) {
5252 for (const auto i
: map
.second
->objects
) {
5253 master_set
.insert(i
.first
);
5258 get_pgbackend()->be_large_omap_check(maps
, master_set
,
5259 scrubber
.large_omap_objects
, ss
);
5260 if (!ss
.str().empty()) {
5261 osd
->clog
->warn(ss
);
5264 if (acting
.size() > 1) {
5265 dout(10) << __func__
<< " comparing replica scrub maps" << dendl
;
5267 // Map from object with errors to good peer
5268 map
<hobject_t
, list
<pg_shard_t
>> authoritative
;
5270 dout(2) << __func__
<< " osd." << acting
[0] << " has "
5271 << scrubber
.primary_scrubmap
.objects
.size() << " items" << dendl
;
5276 get_pgbackend()->be_compare_scrubmaps(
5279 state_test(PG_STATE_REPAIR
),
5281 scrubber
.inconsistent
,
5284 scrubber
.shallow_errors
,
5285 scrubber
.deep_errors
,
5286 scrubber
.store
.get(),
5289 dout(2) << ss
.str() << dendl
;
5291 if (!ss
.str().empty()) {
5292 osd
->clog
->error(ss
);
5295 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
5296 i
!= authoritative
.end();
5298 list
<pair
<ScrubMap::object
, pg_shard_t
> > good_peers
;
5299 for (list
<pg_shard_t
>::const_iterator j
= i
->second
.begin();
5300 j
!= i
->second
.end();
5302 good_peers
.push_back(make_pair(maps
[*j
]->objects
[i
->first
], *j
));
5304 scrubber
.authoritative
.insert(
5310 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
5311 i
!= authoritative
.end();
5313 scrubber
.cleaned_meta_map
.objects
.erase(i
->first
);
5314 scrubber
.cleaned_meta_map
.objects
.insert(
5315 *(maps
[i
->second
.back()]->objects
.find(i
->first
))
5320 ScrubMap for_meta_scrub
;
5321 scrubber
.clean_meta_map(for_meta_scrub
);
5323 // ok, do the pg-type specific scrubbing
5324 scrub_snapshot_metadata(for_meta_scrub
, missing_digest
);
5325 // Called here on the primary can use an authoritative map if it isn't the primary
5326 _scan_snaps(for_meta_scrub
);
5327 if (!scrubber
.store
->empty()) {
5328 if (state_test(PG_STATE_REPAIR
)) {
5329 dout(10) << __func__
<< ": discarding scrub results" << dendl
;
5330 scrubber
.store
->flush(nullptr);
5332 dout(10) << __func__
<< ": updating scrub object" << dendl
;
5333 ObjectStore::Transaction t
;
5334 scrubber
.store
->flush(&t
);
5335 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
5340 bool PG::scrub_process_inconsistent()
5342 dout(10) << __func__
<< ": checking authoritative" << dendl
;
5343 bool repair
= state_test(PG_STATE_REPAIR
);
5344 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5345 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5347 // authoriative only store objects which missing or inconsistent.
5348 if (!scrubber
.authoritative
.empty()) {
5350 ss
<< info
.pgid
<< " " << mode
<< " "
5351 << scrubber
.missing
.size() << " missing, "
5352 << scrubber
.inconsistent
.size() << " inconsistent objects";
5353 dout(2) << ss
.str() << dendl
;
5354 osd
->clog
->error(ss
);
5356 state_clear(PG_STATE_CLEAN
);
5357 for (map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >>::iterator i
=
5358 scrubber
.authoritative
.begin();
5359 i
!= scrubber
.authoritative
.end();
5361 set
<pg_shard_t
>::iterator j
;
5363 auto missing_entry
= scrubber
.missing
.find(i
->first
);
5364 if (missing_entry
!= scrubber
.missing
.end()) {
5365 for (j
= missing_entry
->second
.begin();
5366 j
!= missing_entry
->second
.end();
5375 if (scrubber
.inconsistent
.count(i
->first
)) {
5376 for (j
= scrubber
.inconsistent
[i
->first
].begin();
5377 j
!= scrubber
.inconsistent
[i
->first
].end();
5379 repair_object(i
->first
,
5388 return (!scrubber
.authoritative
.empty() && repair
);
5391 bool PG::ops_blocked_by_scrub() const {
5392 return (waiting_for_scrub
.size() != 0);
5395 // the part that actually finalizes a scrub
5396 void PG::scrub_finish()
5398 bool repair
= state_test(PG_STATE_REPAIR
);
5399 // if the repair request comes from auto-repair and large number of errors,
5400 // we would like to cancel auto-repair
5401 if (repair
&& scrubber
.auto_repair
5402 && scrubber
.authoritative
.size() > cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
5403 state_clear(PG_STATE_REPAIR
);
5406 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5407 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5409 // type-specific finish (can tally more errors)
5412 bool has_error
= scrub_process_inconsistent();
5416 oss
<< info
.pgid
.pgid
<< " " << mode
<< " ";
5417 int total_errors
= scrubber
.shallow_errors
+ scrubber
.deep_errors
;
5419 oss
<< total_errors
<< " errors";
5422 if (!deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
)
5423 oss
<< " ( " << info
.stats
.stats
.sum
.num_deep_scrub_errors
5424 << " remaining deep scrub error details lost)";
5426 oss
<< ", " << scrubber
.fixed
<< " fixed";
5428 osd
->clog
->error(oss
);
5430 osd
->clog
->debug(oss
);
5435 utime_t now
= ceph_clock_now();
5436 info
.history
.last_scrub
= info
.last_update
;
5437 info
.history
.last_scrub_stamp
= now
;
5438 if (scrubber
.deep
) {
5439 info
.history
.last_deep_scrub
= info
.last_update
;
5440 info
.history
.last_deep_scrub_stamp
= now
;
5442 // Since we don't know which errors were fixed, we can only clear them
5443 // when every one has been fixed.
5445 if (scrubber
.fixed
== scrubber
.shallow_errors
+ scrubber
.deep_errors
) {
5447 scrubber
.shallow_errors
= scrubber
.deep_errors
= 0;
5449 // Deep scrub in order to get corrected error counts
5450 scrub_after_recovery
= true;
5454 if ((scrubber
.shallow_errors
== 0) && (scrubber
.deep_errors
== 0))
5455 info
.history
.last_clean_scrub_stamp
= now
;
5456 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
5457 info
.stats
.stats
.sum
.num_deep_scrub_errors
= scrubber
.deep_errors
;
5458 info
.stats
.stats
.sum
.num_large_omap_objects
= scrubber
.large_omap_objects
;
5460 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
5461 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5462 // because of deep-scrub errors
5463 if (scrubber
.shallow_errors
== 0)
5464 info
.history
.last_clean_scrub_stamp
= now
;
5466 info
.stats
.stats
.sum
.num_scrub_errors
=
5467 info
.stats
.stats
.sum
.num_shallow_scrub_errors
+
5468 info
.stats
.stats
.sum
.num_deep_scrub_errors
;
5472 ObjectStore::Transaction t
;
5475 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
5481 queue_peering_event(
5483 std::make_shared
<CephPeeringEvt
>(
5484 get_osdmap()->get_epoch(),
5485 get_osdmap()->get_epoch(),
5489 scrub_clear_state();
5490 scrub_unreserve_replicas();
5492 if (is_active() && is_primary()) {
5497 void PG::share_pg_info()
5499 dout(10) << "share_pg_info" << dendl
;
5501 // share new pg_info_t with replicas
5502 assert(!actingbackfill
.empty());
5503 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
5504 i
!= actingbackfill
.end();
5506 if (*i
== pg_whoami
) continue;
5507 pg_shard_t peer
= *i
;
5508 if (peer_info
.count(peer
)) {
5509 peer_info
[peer
].last_epoch_started
= info
.last_epoch_started
;
5510 peer_info
[peer
].last_interval_started
= info
.last_interval_started
;
5511 peer_info
[peer
].history
.merge(info
.history
);
5513 MOSDPGInfo
*m
= new MOSDPGInfo(get_osdmap()->get_epoch());
5514 m
->pg_list
.push_back(
5517 peer
.shard
, pg_whoami
.shard
,
5518 get_osdmap()->get_epoch(),
5519 get_osdmap()->get_epoch(),
5522 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
5526 bool PG::append_log_entries_update_missing(
5527 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
5528 ObjectStore::Transaction
&t
, boost::optional
<eversion_t
> trim_to
,
5529 boost::optional
<eversion_t
> roll_forward_to
)
5531 assert(!entries
.empty());
5532 assert(entries
.begin()->version
> info
.last_update
);
5534 PGLogEntryHandler rollbacker
{this, &t
};
5535 bool invalidate_stats
=
5536 pg_log
.append_new_log_entries(info
.last_backfill
,
5537 info
.last_backfill_bitwise
,
5541 if (roll_forward_to
&& entries
.rbegin()->soid
> info
.last_backfill
) {
5542 pg_log
.roll_forward(&rollbacker
);
5544 if (roll_forward_to
&& *roll_forward_to
> pg_log
.get_can_rollback_to()) {
5545 pg_log
.roll_forward_to(*roll_forward_to
, &rollbacker
);
5546 last_rollback_info_trimmed_to_applied
= *roll_forward_to
;
5549 info
.last_update
= pg_log
.get_head();
5551 if (pg_log
.get_missing().num_missing() == 0) {
5552 // advance last_complete since nothing else is missing!
5553 info
.last_complete
= info
.last_update
;
5555 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
5557 dout(20) << __func__
<< "trim_to bool = " << bool(trim_to
) << " trim_to = " << (trim_to
? *trim_to
: eversion_t()) << dendl
;
5559 pg_log
.trim(*trim_to
, info
);
5562 return invalidate_stats
;
5566 void PG::merge_new_log_entries(
5567 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
5568 ObjectStore::Transaction
&t
,
5569 boost::optional
<eversion_t
> trim_to
,
5570 boost::optional
<eversion_t
> roll_forward_to
)
5572 dout(10) << __func__
<< " " << entries
<< dendl
;
5573 assert(is_primary());
5575 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
, trim_to
, roll_forward_to
);
5576 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
5577 i
!= actingbackfill
.end();
5579 pg_shard_t
peer(*i
);
5580 if (peer
== pg_whoami
) continue;
5581 assert(peer_missing
.count(peer
));
5582 assert(peer_info
.count(peer
));
5583 pg_missing_t
& pmissing(peer_missing
[peer
]);
5584 dout(20) << __func__
<< " peer_missing for " << peer
<< " = " << pmissing
<< dendl
;
5585 pg_info_t
& pinfo(peer_info
[peer
]);
5586 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
5587 pinfo
.last_backfill
,
5588 info
.last_backfill_bitwise
,
5595 pinfo
.last_update
= info
.last_update
;
5596 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
5597 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
5600 if (!rebuild_missing
) {
5604 for (auto &&i
: entries
) {
5605 missing_loc
.rebuild(
5610 pg_log
.get_missing(),
5616 void PG::update_history(const pg_history_t
& new_history
)
5619 if (info
.history
.merge(new_history
)) {
5620 dout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
5622 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
5623 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
5624 past_intervals
.clear();
5625 dirty_big_info
= true;
5631 void PG::fulfill_info(
5632 pg_shard_t from
, const pg_query_t
&query
,
5633 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
5635 assert(from
== primary
);
5636 assert(query
.type
== pg_query_t::INFO
);
5639 dout(10) << "sending info" << dendl
;
5640 notify_info
= make_pair(from
, info
);
5643 void PG::fulfill_log(
5644 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
5646 dout(10) << "log request from " << from
<< dendl
;
5647 assert(from
== primary
);
5648 assert(query
.type
!= pg_query_t::INFO
);
5649 ConnectionRef con
= osd
->get_con_osd_cluster(
5650 from
.osd
, get_osdmap()->get_epoch());
5653 MOSDPGLog
*mlog
= new MOSDPGLog(
5654 from
.shard
, pg_whoami
.shard
,
5655 get_osdmap()->get_epoch(),
5657 mlog
->missing
= pg_log
.get_missing();
5659 // primary -> other, when building master log
5660 if (query
.type
== pg_query_t::LOG
) {
5661 dout(10) << " sending info+missing+log since " << query
.since
5663 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
5664 osd
->clog
->error() << info
.pgid
<< " got broken pg_query_t::LOG since " << query
.since
5665 << " when my log.tail is " << pg_log
.get_tail()
5666 << ", sending full log instead";
5667 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
5669 mlog
->log
.copy_after(pg_log
.get_log(), query
.since
);
5671 else if (query
.type
== pg_query_t::FULLLOG
) {
5672 dout(10) << " sending info+missing+full log" << dendl
;
5673 mlog
->log
= pg_log
.get_log();
5676 dout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
5678 osd
->share_map_peer(from
.osd
, con
.get(), get_osdmap());
5679 osd
->send_message_osd_cluster(mlog
, con
.get());
5682 void PG::fulfill_query(const MQuery
& query
, RecoveryCtx
*rctx
)
5684 if (query
.query
.type
== pg_query_t::INFO
) {
5685 pair
<pg_shard_t
, pg_info_t
> notify_info
;
5686 update_history(query
.query
.history
);
5687 fulfill_info(query
.from
, query
.query
, notify_info
);
5691 notify_info
.first
.shard
, pg_whoami
.shard
,
5693 get_osdmap()->get_epoch(),
5694 notify_info
.second
),
5697 update_history(query
.query
.history
);
5698 fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
5702 void PG::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
5704 bool changed
= false;
5705 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
) &&
5706 !lastmap
->test_flag(CEPH_OSDMAP_FULL
)) {
5707 dout(10) << " cluster was marked full in " << osdmap
->get_epoch() << dendl
;
5710 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
5712 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5713 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
5714 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5715 dout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
5720 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
5725 bool PG::should_restart_peering(
5727 int newactingprimary
,
5728 const vector
<int>& newup
,
5729 const vector
<int>& newacting
,
5733 if (PastIntervals::is_new_interval(
5745 dout(20) << "new interval newup " << newup
5746 << " newacting " << newacting
<< dendl
;
5749 if (!lastmap
->is_up(osd
->whoami
) && osdmap
->is_up(osd
->whoami
)) {
5750 dout(10) << __func__
<< " osd transitioned from down -> up" << dendl
;
5756 bool PG::old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
)
5758 if (last_peering_reset
> reply_epoch
||
5759 last_peering_reset
> query_epoch
) {
5760 dout(10) << "old_peering_msg reply_epoch " << reply_epoch
<< " query_epoch " << query_epoch
5761 << " last_peering_reset " << last_peering_reset
5768 void PG::set_last_peering_reset()
5770 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl
;
5771 if (last_peering_reset
!= get_osdmap()->get_epoch()) {
5772 last_peering_reset
= get_osdmap()->get_epoch();
5773 reset_interval_flush();
5780 FlushState(PG
*pg
, epoch_t epoch
) : pg(pg
), epoch(epoch
) {}
5783 if (!pg
->pg_has_reset_since(epoch
))
5784 pg
->queue_flushed(epoch
);
5788 typedef ceph::shared_ptr
<FlushState
> FlushStateRef
;
// PG::start_flush
// Begins flushing in-progress ops for this PG: bumps flushes_in_progress and
// attaches one shared FlushState trigger to both the on_applied and on_safe
// completion lists. When both completions run (dropping their refs), the
// FlushState destructor fires and queues a Flushed event for this epoch.
// NOTE(review): the Transaction* parameter `t` is not used in the visible
// fragment — its use, if any, was dropped by the extraction. Kept
// byte-identical.
5790 void PG::start_flush(ObjectStore::Transaction
*t
,
5791 list
<Context
*> *on_applied
,
5792 list
<Context
*> *on_safe
)
5794 // flush in progress ops
// One trigger, shared (via FlushStateRef) by both completion contexts.
5795 FlushStateRef
flush_trigger (std::make_shared
<FlushState
>(
5796 this, get_osdmap()->get_epoch()));
5798 flushes_in_progress
++;
// ContainerContext keeps the shared_ptr alive until the context completes.
5799 on_applied
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5800 on_safe
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
// PG::reset_interval_flush
// Called when an interval ends (see set_last_peering_reset above): clears any
// blocked outgoing recovery messages, then asks the object-store sequencer
// (osr) to call back with an IntervalFlush peering event once all pending
// commits land. If flush_commit() reports outstanding work (returns false),
// outgoing recovery messages are blocked until the event arrives; otherwise
// nothing needs blocking. NOTE(review): extraction-garbled text; the
// else-branch brace lines were dropped. Kept byte-identical.
5803 void PG::reset_interval_flush()
5805 dout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
5806 recovery_state
.clear_blocked_outgoing();
// Completion that re-queues an IntervalFlush event at the current epoch.
5808 Context
*c
= new QueuePeeringEvt
<IntervalFlush
>(
5809 this, get_osdmap()->get_epoch(), IntervalFlush());
5810 if (!osr
->flush_commit(c
)) {
5811 dout(10) << "Beginning to block outgoing recovery messages" << dendl
;
5812 recovery_state
.begin_block_outgoing();
// (else branch — the brace/delete lines around it were dropped)
5814 dout(10) << "Not blocking outgoing recovery messages" << dendl
;
5819 /* Called before initializing peering during advance_map */
5820 void PG::start_peering_interval(
5821 const OSDMapRef lastmap
,
5822 const vector
<int>& newup
, int new_up_primary
,
5823 const vector
<int>& newacting
, int new_acting_primary
,
5824 ObjectStore::Transaction
*t
)
5826 const OSDMapRef osdmap
= get_osdmap();
5828 set_last_peering_reset();
5830 vector
<int> oldacting
, oldup
;
5831 int oldrole
= get_role();
5835 pg_shard_t old_acting_primary
= get_primary();
5836 pg_shard_t old_up_primary
= up_primary
;
5837 bool was_old_primary
= is_primary();
5838 bool was_old_replica
= is_replica();
5840 acting
.swap(oldacting
);
5842 init_primary_up_acting(
5846 new_acting_primary
);
5848 if (info
.stats
.up
!= up
||
5849 info
.stats
.acting
!= acting
||
5850 info
.stats
.up_primary
!= new_up_primary
||
5851 info
.stats
.acting_primary
!= new_acting_primary
) {
5853 info
.stats
.up_primary
= new_up_primary
;
5854 info
.stats
.acting
= acting
;
5855 info
.stats
.acting_primary
= new_acting_primary
;
5856 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
5859 pg_stats_publish_lock
.Lock();
5860 pg_stats_publish_valid
= false;
5861 pg_stats_publish_lock
.Unlock();
5863 // This will now be remapped during a backfill in cases
5864 // that it would not have been before.
5866 state_set(PG_STATE_REMAPPED
);
5868 state_clear(PG_STATE_REMAPPED
);
5870 int role
= osdmap
->calc_pg_role(osd
->whoami
, acting
, acting
.size());
5871 if (pool
.info
.is_replicated() || role
== pg_whoami
.shard
)
5876 // did acting, up, primary|acker change?
5878 dout(10) << " no lastmap" << dendl
;
5880 dirty_big_info
= true;
5881 info
.history
.same_interval_since
= osdmap
->get_epoch();
5883 std::stringstream debug
;
5884 assert(info
.history
.same_interval_since
!= 0);
5885 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
5886 get_is_recoverable_predicate());
5887 bool new_interval
= PastIntervals::check_new_interval(
5888 old_acting_primary
.osd
,
5890 oldacting
, newacting
,
5894 info
.history
.same_interval_since
,
5895 info
.history
.last_epoch_clean
,
5902 dout(10) << __func__
<< ": check_new_interval output: "
5903 << debug
.str() << dendl
;
5905 if (osdmap
->get_epoch() == osd
->get_superblock().oldest_map
&&
5906 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
5907 dout(10) << " map gap, clearing past_intervals and faking" << dendl
;
5908 // our information is incomplete and useless; someone else was clean
5909 // after everything we know if osdmaps were trimmed.
5910 past_intervals
.clear();
5912 dout(10) << " noting past " << past_intervals
<< dendl
;
5915 dirty_big_info
= true;
5916 info
.history
.same_interval_since
= osdmap
->get_epoch();
5917 if (info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5918 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5920 info
.history
.last_epoch_split
= osdmap
->get_epoch();
5925 if (old_up_primary
!= up_primary
||
5927 info
.history
.same_up_since
= osdmap
->get_epoch();
5929 // this comparison includes primary rank via pg_shard_t
5930 if (old_acting_primary
!= get_primary()) {
5931 info
.history
.same_primary_since
= osdmap
->get_epoch();
5936 dout(1) << __func__
<< " up " << oldup
<< " -> " << up
5937 << ", acting " << oldacting
<< " -> " << acting
5938 << ", acting_primary " << old_acting_primary
<< " -> " << new_acting_primary
5939 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
5940 << ", role " << oldrole
<< " -> " << role
5941 << ", features acting " << acting_features
5942 << " upacting " << upacting_features
5946 state_clear(PG_STATE_ACTIVE
);
5947 state_clear(PG_STATE_PEERED
);
5948 state_clear(PG_STATE_DOWN
);
5949 state_clear(PG_STATE_RECOVERY_WAIT
);
5950 state_clear(PG_STATE_RECOVERY_TOOFULL
);
5951 state_clear(PG_STATE_RECOVERING
);
5953 peer_purged
.clear();
5954 actingbackfill
.clear();
5955 scrub_queued
= false;
5957 // reset primary/replica state?
5958 if (was_old_primary
|| is_primary()) {
5959 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
5960 } else if (was_old_replica
|| is_replica()) {
5961 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
5963 clear_primary_state();
5969 projected_last_update
= eversion_t();
5973 // should we tell the primary we are here?
5974 send_notify
= !is_primary();
5976 if (role
!= oldrole
||
5977 was_old_primary
!= is_primary()) {
5978 // did primary change?
5979 if (was_old_primary
!= is_primary()) {
5980 state_clear(PG_STATE_CLEAN
);
5981 clear_publish_stats();
5986 // take active waiters
5987 requeue_ops(waiting_for_peered
);
5991 // did primary change?
5992 if (get_primary() != old_acting_primary
) {
5993 dout(10) << *this << " " << oldacting
<< " -> " << acting
5994 << ", acting primary "
5995 << old_acting_primary
<< " -> " << get_primary()
5998 // primary is the same.
6000 // i am (still) primary. but my replica set changed.
6001 state_clear(PG_STATE_CLEAN
);
6003 dout(10) << oldacting
<< " -> " << acting
6004 << ", replicas changed" << dendl
;
6010 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
6011 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
6012 osd
->queue_want_pg_temp(info
.pgid
.pgid
, acting
);
// PG::on_new_interval
// Recomputes the feature-bit intersections for the new interval:
// acting_features is ANDed over the features of every OSD in `acting`, and
// upacting_features over both `acting` and `up` (skipping CRUSH_ITEM_NONE
// placeholder entries). Features come from the OSDMap's per-OSD xinfo.
// NOTE(review): extraction-garbled; lines 6017/6019-6021 and 6027 (the
// `continue;` implied by the CRUSH_ITEM_NONE checks) and closing braces were
// dropped. Kept byte-identical.
6016 void PG::on_new_interval()
6018 const OSDMapRef osdmap
= get_osdmap();
6022 // initialize features
// Start from the full supported set and intersect downward.
6023 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
6024 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
6025 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
// Skip holes in the acting set (the `continue;` line was dropped).
6026 if (*p
== CRUSH_ITEM_NONE
)
6028 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
6029 acting_features
&= f
;
6030 upacting_features
&= f
;
// Second pass: intersect the `up` set into upacting_features only.
6032 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
6033 if (*p
== CRUSH_ITEM_NONE
)
6035 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
// PG::proc_primary_info
// Replica-side handling of a pg_info_t received from the primary (asserted:
// we are not primary). Merges the primary's history, zeroes local scrub error
// counters when stats are valid (the primary's counts are authoritative), and
// adopts the primary's purged_snaps when it differs, marking big info dirty.
// NOTE(review): extraction-garbled; closing braces and some surrounding lines
// (e.g. 6050-6052, 6055, 6057) were dropped. Kept byte-identical.
6041 void PG::proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
6043 assert(!is_primary());
6045 update_history(oinfo
.history
);
// Clear local scrub-error counters; only meaningful when stats are valid.
6046 if (!info
.stats
.stats_invalid
&& info
.stats
.stats
.sum
.num_scrub_errors
) {
6047 info
.stats
.stats
.sum
.num_scrub_errors
= 0;
6048 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= 0;
6049 info
.stats
.stats
.sum
.num_deep_scrub_errors
= 0;
// Adopt the primary's purged_snaps if ours disagree.
6053 if (!(info
.purged_snaps
== oinfo
.purged_snaps
)) {
6054 dout(10) << __func__
<< " updating purged_snaps to " << oinfo
.purged_snaps
6056 info
.purged_snaps
= oinfo
.purged_snaps
;
// Persist the change with the next big-info write.
6058 dirty_big_info
= true;
6062 ostream
& operator<<(ostream
& out
, const PG
& pg
)
6064 out
<< "pg[" << pg
.info
6066 if (pg
.acting
!= pg
.up
)
6067 out
<< "/" << pg
.acting
;
6069 out
<< "p" << pg
.get_primary();
6070 out
<< " r=" << pg
.get_role();
6071 out
<< " lpr=" << pg
.get_last_peering_reset();
6073 if (!pg
.past_intervals
.empty()) {
6074 out
<< " pi=[" << pg
.past_intervals
.get_bounds()
6075 << ")/" << pg
.past_intervals
.size();
6078 if (pg
.is_peered()) {
6079 if (pg
.last_update_ondisk
!= pg
.info
.last_update
)
6080 out
<< " luod=" << pg
.last_update_ondisk
;
6081 if (pg
.last_update_applied
!= pg
.info
.last_update
)
6082 out
<< " lua=" << pg
.last_update_applied
;
6085 if (pg
.recovery_ops_active
)
6086 out
<< " rops=" << pg
.recovery_ops_active
;
6088 if (pg
.pg_log
.get_tail() != pg
.info
.log_tail
||
6089 pg
.pg_log
.get_head() != pg
.info
.last_update
)
6090 out
<< " (info mismatch, " << pg
.pg_log
.get_log() << ")";
6092 if (!pg
.pg_log
.get_log().empty()) {
6093 if ((pg
.pg_log
.get_log().log
.begin()->version
<= pg
.pg_log
.get_tail())) {
6094 out
<< " (log bound mismatch, actual=["
6095 << pg
.pg_log
.get_log().log
.begin()->version
<< ","
6096 << pg
.pg_log
.get_log().log
.rbegin()->version
<< "]";
6101 if (!pg
.backfill_targets
.empty())
6102 out
<< " bft=" << pg
.backfill_targets
;
6103 out
<< " crt=" << pg
.pg_log
.get_can_rollback_to();
6105 if (pg
.last_complete_ondisk
!= pg
.info
.last_complete
)
6106 out
<< " lcod " << pg
.last_complete_ondisk
;
6108 if (pg
.is_primary()) {
6109 out
<< " mlcod " << pg
.min_last_complete_ondisk
;
6112 out
<< " " << pg_state_string(pg
.get_state());
6113 if (pg
.should_send_notify())
6116 if (pg
.scrubber
.must_repair
)
6117 out
<< " MUST_REPAIR";
6118 if (pg
.scrubber
.auto_repair
)
6119 out
<< " AUTO_REPAIR";
6120 if (pg
.scrubber
.must_deep_scrub
)
6121 out
<< " MUST_DEEP_SCRUB";
6122 if (pg
.scrubber
.must_scrub
)
6123 out
<< " MUST_SCRUB";
6125 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6126 if (pg
.pg_log
.get_missing().num_missing()) {
6127 out
<< " m=" << pg
.pg_log
.get_missing().num_missing();
6128 if (pg
.is_primary()) {
6129 uint64_t unfound
= pg
.get_num_unfound();
6131 out
<< " u=" << unfound
;
6134 if (pg
.snap_trimq
.size())
6135 out
<< " snaptrimq=" << pg
.snap_trimq
;
6136 if (!pg
.is_clean()) {
6137 out
<< " mbc=" << pg
.missing_loc
.get_missing_by_count();
// PG::can_discard_op
// Decides whether an incoming client MOSDOp can be dropped. Visible discard
// conditions: (1) client disconnected and osd_discard_disconnected_ops is set;
// (2) message epoch predates history.same_primary_since (primary changed);
// (3) client supports RESEND_ON_SPLIT and the op predates the pool's
// last_force_op_resend or the PG's last split epoch; (4) pre-luminous client
// (OSD_POOLRESEND feature) and the op predates
// last_force_op_resend_preluminous.
// NOTE(review): extraction-garbled — the `return true;`/`return false;` lines
// and several closing braces were dropped, so the exact fall-through behavior
// must be confirmed against upstream. Kept byte-identical.
6146 bool PG::can_discard_op(OpRequestRef
& op
)
6148 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
6149 if (cct
->_conf
->osd_discard_disconnected_ops
&& OSD::op_is_discardable(m
)) {
6150 dout(20) << " discard " << *m
<< dendl
;
// Op predates the current primary: client will resend to the new primary.
6154 if (m
->get_map_epoch() < info
.history
.same_primary_since
) {
6155 dout(7) << " changed after " << m
->get_map_epoch()
6156 << ", dropping " << *m
<< dendl
;
// Clients that resend on split: honor pool-level forced-resend epochs.
6160 if (m
->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT
)) {
6161 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend()) {
6162 dout(7) << __func__
<< " sent before last_force_op_resend "
6163 << pool
.info
.last_force_op_resend
<< ", dropping" << *m
<< dendl
;
// Also drop ops sent before the PG's last split.
6166 if (m
->get_map_epoch() < info
.history
.last_epoch_split
) {
6167 dout(7) << __func__
<< " pg split in "
6168 << info
.history
.last_epoch_split
<< ", dropping" << dendl
;
// Pre-luminous clients: use the preluminous forced-resend epoch instead.
6171 } else if (m
->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND
)) {
6172 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_preluminous()) {
6173 dout(7) << __func__
<< " sent before last_force_op_resend_preluminous "
6174 << pool
.info
.last_force_op_resend_preluminous
6175 << ", dropping" << *m
<< dendl
;
// PG::can_discard_replica_op<T, MSGTYPE>
// Generic staleness check for replica-to-replica / replica-to-primary
// messages (instantiated for pushes, pulls, EC sub-ops, scrub messages, etc.
// — see can_discard_request). Visible discard conditions: the sender is down
// in the OSD's current map; the sender was marked down at or after the
// message's map_epoch (out-of-order replies after a reconnect); or the PG has
// reset peering since map_epoch (old_peering_msg). The original `return`
// lines were dropped by the extraction. Kept byte-identical.
6183 template<typename T
, int MSGTYPE
>
6184 bool PG::can_discard_replica_op(OpRequestRef
& op
)
6186 const T
*m
= static_cast<const T
*>(op
->get_req());
6187 assert(m
->get_type() == MSGTYPE
);
6189 int from
= m
->get_source().num();
6191 // if a repop is replied after a replica goes down in a new osdmap, and
6192 // before the pg advances to this new osdmap, the repop replies before this
6193 // repop can be discarded by that replica OSD, because the primary resets the
6194 // connection to it when handling the new osdmap marking it down, and also
6195 // resets the messenger sesssion when the replica reconnects. to avoid the
6196 // out-of-order replies, the messages from that replica should be discarded.
6197 if (osd
->get_osdmap()->is_down(from
))
6199 /* Mostly, this overlaps with the old_peering_msg
6200 * condition. An important exception is pushes
6201 * sent by replicas not in the acting set, since
6202 * if such a replica goes down it does not cause
6203 * a new interval. */
6204 if (get_osdmap()->get_down_at(from
) >= m
->map_epoch
)
6208 // if pg changes _at all_, we reset and repeer!
6209 if (old_peering_msg(m
->map_epoch
, m
->map_epoch
)) {
6210 dout(10) << "can_discard_replica_op pg changed " << info
.history
6211 << " after " << m
->map_epoch
6212 << ", dropping" << dendl
;
// PG::can_discard_scan
// Drops a backfill-scan message (MOSDPGScan) whose map/query epochs predate
// the PG's last peering reset (old_peering_msg). The `return` lines and
// closing braces were dropped by the extraction. Kept byte-identical.
6218 bool PG::can_discard_scan(OpRequestRef op
)
6220 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
6221 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
6223 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
6224 dout(10) << " got old scan, ignoring" << dendl
;
// PG::can_discard_backfill
// Mirror of can_discard_scan for MOSDPGBackfill messages: drop when the
// message's map/query epochs predate the last peering reset. The `return`
// lines and closing braces were dropped by the extraction. Kept
// byte-identical.
6230 bool PG::can_discard_backfill(OpRequestRef op
)
6232 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
6233 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
6235 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
6236 dout(10) << " got old backfill, ignoring" << dendl
;
6244 bool PG::can_discard_request(OpRequestRef
& op
)
6246 switch (op
->get_req()->get_type()) {
6247 case CEPH_MSG_OSD_OP
:
6248 return can_discard_op(op
);
6249 case CEPH_MSG_OSD_BACKOFF
:
6250 return false; // never discard
6252 return can_discard_replica_op
<MOSDSubOp
, MSG_OSD_SUBOP
>(op
);
6254 return can_discard_replica_op
<MOSDRepOp
, MSG_OSD_REPOP
>(op
);
6255 case MSG_OSD_PG_PUSH
:
6256 return can_discard_replica_op
<MOSDPGPush
, MSG_OSD_PG_PUSH
>(op
);
6257 case MSG_OSD_PG_PULL
:
6258 return can_discard_replica_op
<MOSDPGPull
, MSG_OSD_PG_PULL
>(op
);
6259 case MSG_OSD_PG_PUSH_REPLY
:
6260 return can_discard_replica_op
<MOSDPGPushReply
, MSG_OSD_PG_PUSH_REPLY
>(op
);
6261 case MSG_OSD_SUBOPREPLY
:
6262 return can_discard_replica_op
<MOSDSubOpReply
, MSG_OSD_SUBOPREPLY
>(op
);
6263 case MSG_OSD_REPOPREPLY
:
6264 return can_discard_replica_op
<MOSDRepOpReply
, MSG_OSD_REPOPREPLY
>(op
);
6265 case MSG_OSD_PG_RECOVERY_DELETE
:
6266 return can_discard_replica_op
<MOSDPGRecoveryDelete
, MSG_OSD_PG_RECOVERY_DELETE
>(op
);
6268 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
6269 return can_discard_replica_op
<MOSDPGRecoveryDeleteReply
, MSG_OSD_PG_RECOVERY_DELETE_REPLY
>(op
);
6271 case MSG_OSD_EC_WRITE
:
6272 return can_discard_replica_op
<MOSDECSubOpWrite
, MSG_OSD_EC_WRITE
>(op
);
6273 case MSG_OSD_EC_WRITE_REPLY
:
6274 return can_discard_replica_op
<MOSDECSubOpWriteReply
, MSG_OSD_EC_WRITE_REPLY
>(op
);
6275 case MSG_OSD_EC_READ
:
6276 return can_discard_replica_op
<MOSDECSubOpRead
, MSG_OSD_EC_READ
>(op
);
6277 case MSG_OSD_EC_READ_REPLY
:
6278 return can_discard_replica_op
<MOSDECSubOpReadReply
, MSG_OSD_EC_READ_REPLY
>(op
);
6279 case MSG_OSD_REP_SCRUB
:
6280 return can_discard_replica_op
<MOSDRepScrub
, MSG_OSD_REP_SCRUB
>(op
);
6281 case MSG_OSD_SCRUB_RESERVE
:
6282 return can_discard_replica_op
<MOSDScrubReserve
, MSG_OSD_SCRUB_RESERVE
>(op
);
6283 case MSG_OSD_REP_SCRUBMAP
:
6284 return can_discard_replica_op
<MOSDRepScrubMap
, MSG_OSD_REP_SCRUBMAP
>(op
);
6285 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
6286 return can_discard_replica_op
<
6287 MOSDPGUpdateLogMissing
, MSG_OSD_PG_UPDATE_LOG_MISSING
>(op
);
6288 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
6289 return can_discard_replica_op
<
6290 MOSDPGUpdateLogMissingReply
, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
>(op
);
6292 case MSG_OSD_PG_SCAN
:
6293 return can_discard_scan(op
);
6294 case MSG_OSD_PG_BACKFILL
:
6295 return can_discard_backfill(op
);
6296 case MSG_OSD_PG_BACKFILL_REMOVE
:
6297 return can_discard_replica_op
<MOSDPGBackfillRemove
,
6298 MSG_OSD_PG_BACKFILL_REMOVE
>(op
);
// PG::take_waiters
// Re-activates deferred work after a map advance: requeues map waiters, then
// schedules one queue_for_peering() call per deferred peering event and
// splices the whole peering_waiters list onto the FRONT of peering_queue so
// deferred events are handled before newly queued ones.
// NOTE(review): extraction-garbled text (split lines, embedded original line
// numbers). Kept byte-identical.
6303 void PG::take_waiters()
6305 dout(10) << "take_waiters" << dendl
;
6306 requeue_map_waiters();
// One peering wakeup per waiting event.
6307 for (list
<CephPeeringEvtRef
>::iterator i
= peering_waiters
.begin();
6308 i
!= peering_waiters
.end();
6309 ++i
) osd
->queue_for_peering(this);
// Move all deferred events to the head of the active queue in one splice.
6310 peering_queue
.splice(peering_queue
.begin(), peering_waiters
,
6311 peering_waiters
.begin(), peering_waiters
.end());
// PG::handle_peering_event
// Dispatches one peering event to the recovery state machine. If this OSD has
// not yet consumed the map epoch the event was sent at, the event is deferred
// onto peering_waiters (drained later by take_waiters). Events older than the
// last peering reset are dropped (old_peering_evt; the early-return lines were
// dropped by the extraction). Kept byte-identical.
6314 void PG::handle_peering_event(CephPeeringEvtRef evt
, RecoveryCtx
*rctx
)
6316 dout(10) << "handle_peering_event: " << evt
->get_desc() << dendl
;
// Defer until our map catches up to the sender's epoch.
6317 if (!have_same_or_newer_map(evt
->get_epoch_sent())) {
6318 dout(10) << "deferring event " << evt
->get_desc() << dendl
;
6319 peering_waiters
.push_back(evt
);
// Stale events (predating the last peering reset) are ignored.
6322 if (old_peering_evt(evt
))
6324 recovery_state
.handle_event(evt
, rctx
);
// PG::queue_peering_event
// Enqueues a peering event (unless it predates the last peering reset) and
// asks the OSD to schedule this PG for peering work. The early-return after
// old_peering_evt was dropped by the extraction. Kept byte-identical.
6327 void PG::queue_peering_event(CephPeeringEvtRef evt
)
6329 if (old_peering_evt(evt
))
6331 peering_queue
.push_back(evt
);
6332 osd
->queue_for_peering(this);
// PG::queue_null
// Queues a no-op peering event at the given epochs — used to kick the state
// machine without delivering real payload. The event-payload argument to the
// CephPeeringEvt constructor (original lines after 6340) was dropped by the
// extraction; presumably a NullEvt — TODO confirm upstream. Kept
// byte-identical.
6335 void PG::queue_null(epoch_t msg_epoch
,
6336 epoch_t query_epoch
)
6338 dout(10) << "null" << dendl
;
6339 queue_peering_event(
6340 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
// PG::queue_flushed
// Queues a peering event at epoch e signalling that the interval flush
// completed (invoked from ~FlushState above). The event-payload argument
// (presumably FlushedEvt) was dropped by the extraction — TODO confirm
// upstream. Kept byte-identical.
6344 void PG::queue_flushed(epoch_t e
)
6346 dout(10) << "flushed" << dendl
;
6347 queue_peering_event(
6348 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(e
, e
,
// PG::queue_query
// Wraps a pg_query_t received from peer `from` into an MQuery peering event
// (tagged with msg_epoch/query_epoch) and enqueues it via
// queue_peering_event. NOTE(review): extraction-garbled text (split lines,
// embedded original line numbers). Kept byte-identical.
6352 void PG::queue_query(epoch_t msg_epoch
,
6353 epoch_t query_epoch
,
6354 pg_shard_t from
, const pg_query_t
& q
)
6356 dout(10) << "handle_query " << q
<< " from replica " << from
<< dendl
;
6357 queue_peering_event(
6358 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
6359 MQuery(from
, q
, query_epoch
))));
// PG::handle_advance_map
// Advances this PG from its current map (asserted equal to osdmap_ref) to the
// next map: updates the cached map ref and pool info, refreshes the
// past-intervals type for EC vs replicated, optionally cross-checks the
// pool's cached removed-snaps against the map (osd_debug_verify_cached_snaps),
// then delivers an AdvMap event to the recovery state machine and refreshes
// store collection options when the pool changed in this epoch.
// NOTE(review): extraction-garbled and truncated — the trailing parameter
// (RecoveryCtx *rctx, implied by its use at original line 6393), the AdvMap
// event construction around line 6390, and several brace lines were dropped.
// Kept byte-identical.
6362 void PG::handle_advance_map(
6363 OSDMapRef osdmap
, OSDMapRef lastmap
,
6364 vector
<int>& newup
, int up_primary
,
6365 vector
<int>& newacting
, int acting_primary
,
// We must be advancing exactly one step from our current cached map.
6368 assert(lastmap
->get_epoch() == osdmap_ref
->get_epoch());
6369 assert(lastmap
== osdmap_ref
);
6370 dout(10) << "handle_advance_map "
6371 << newup
<< "/" << newacting
6372 << " -- " << up_primary
<< "/" << acting_primary
6374 update_osdmap_ref(osdmap
);
6375 pool
.update(osdmap
);
6376 past_intervals
.update_type_from_map(pool
.info
.ec_pool(), *osdmap
);
// Debug-only consistency check of the cached removed-snaps set.
6377 if (cct
->_conf
->osd_debug_verify_cached_snaps
) {
6378 interval_set
<snapid_t
> actual_removed_snaps
;
6379 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
6381 pi
->build_removed_snaps(actual_removed_snaps
);
6382 if (!(actual_removed_snaps
== pool
.cached_removed_snaps
)) {
6383 derr
<< __func__
<< ": mismatch between the actual removed snaps "
6384 << actual_removed_snaps
<< " and pool.cached_removed_snaps "
6385 << " pool.cached_removed_snaps " << pool
.cached_removed_snaps
6388 assert(actual_removed_snaps
== pool
.cached_removed_snaps
);
// (AdvMap event construction — its first lines were dropped)
6391 osdmap
, lastmap
, newup
, up_primary
,
6392 newacting
, acting_primary
);
6393 recovery_state
.handle_event(evt
, rctx
);
// Pool changed in this very epoch: push new options down to the store.
6394 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
6396 update_store_with_options();
// PG::handle_activate_map
// Completes a map advance: delivers an ActMap event to the state machine
// (the event construction line was dropped by the extraction), dirties the PG
// info when the persisted map epoch has fallen more than
// osd_pg_epoch_persisted_max_stale epochs behind (the dirty_info assignment
// itself appears to have been dropped — TODO confirm upstream), and rechecks
// blacklisted watchers when the new map added blacklist entries. Kept
// byte-identical.
6400 void PG::handle_activate_map(RecoveryCtx
*rctx
)
6402 dout(10) << "handle_activate_map " << dendl
;
6404 recovery_state
.handle_event(evt
, rctx
);
// Bound how stale the persisted map epoch may get before forcing a write.
6405 if (osdmap_ref
->get_epoch() - last_persisted_osdmap_ref
->get_epoch() >
6406 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
6407 dout(20) << __func__
<< ": Dirtying info: last_persisted is "
6408 << last_persisted_osdmap_ref
->get_epoch()
6409 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
// (else branch — its brace lines were dropped)
6412 dout(20) << __func__
<< ": Not dirtying info: last_persisted is "
6413 << last_persisted_osdmap_ref
->get_epoch()
6414 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
6416 if (osdmap_ref
->check_new_blacklist_entries()) check_blacklisted_watchers();
// PG::handle_loaded
// Entry point after the PG is loaded from disk: sends a Load event (its
// construction line was dropped by the extraction) into the recovery state
// machine, which transitions out of Initial. Kept byte-identical.
6419 void PG::handle_loaded(RecoveryCtx
*rctx
)
6421 dout(10) << "handle_loaded" << dendl
;
6423 recovery_state
.handle_event(evt
, rctx
);
// PG::handle_create
// Entry point for a newly created PG: registers it in rctx->created_pgs,
// feeds two events (evt and evt2 — their construction lines, presumably
// Initialize and ActMap, were dropped by the extraction; TODO confirm
// upstream) into the state machine, and schedules update_store_with_options()
// to run once the creating transaction is applied. Kept byte-identical.
6426 void PG::handle_create(RecoveryCtx
*rctx
)
6428 dout(10) << "handle_create" << dendl
;
6429 rctx
->created_pgs
.insert(this);
6431 recovery_state
.handle_event(evt
, rctx
);
6433 recovery_state
.handle_event(evt2
, rctx
);
// Defer store-option setup until the create transaction commits.
6435 rctx
->on_applied
->add(make_lambda_context([this]() {
6436 update_store_with_options();
// PG::handle_query_state
// Services an admin "query" request: sends a QueryState event carrying the
// Formatter (the event construction line binding `f` into `q` was dropped by
// the extraction) through the state machine so each active state can dump its
// section; no RecoveryCtx is needed (0 passed). Kept byte-identical.
6440 void PG::handle_query_state(Formatter
*f
)
6442 dout(10) << "handle_query_state" << dendl
;
6444 recovery_state
.handle_event(q
, 0);
// PG::update_store_with_options
// Pushes the pool's per-collection options (pool.info.opts) down to the
// object store for this PG's collection. -EOPNOTSUPP is tolerated (stores
// that do not support collection options); any other error is logged.
// NOTE(review): extraction-garbled text (split lines, embedded original line
// numbers, closing braces dropped). Kept byte-identical.
6447 void PG::update_store_with_options()
6449 auto r
= osd
->store
->set_collection_opts(coll
, pool
.info
.opts
);
// Only real failures are worth logging; EOPNOTSUPP is expected on some stores.
6450 if(r
< 0 && r
!= -EOPNOTSUPP
) {
6451 derr
<< __func__
<< " set_collection_opts returns error:" << r
<< dendl
;
// PG::update_store_on_load
// Filestore-only fixup at load time: legacy filestore collections did not
// record their hash bit width, so if collection_bits() reports it missing
// (the `if (bits < 0)`-style check around original line 6460 was dropped by
// the extraction — TODO confirm), recompute the split bits from the PG id and
// the pool's pg_num and persist them with a synchronous transaction on this
// PG's sequencer. Kept byte-identical.
6455 void PG::update_store_on_load()
6457 if (osd
->store
->get_type() == "filestore") {
6458 // legacy filestore didn't store collection bit width; fix.
6459 int bits
= osd
->store
->collection_bits(coll
);
6461 assert(!coll
.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6462 bits
= info
.pgid
.get_split_bits(pool
.info
.get_pg_num());
6463 lderr(cct
) << __func__
<< " setting bit width to " << bits
<< dendl
;
6464 ObjectStore::Transaction t
;
6465 t
.collection_set_bits(coll
, bits
);
6466 osd
->store
->apply_transaction(osr
.get(), std::move(t
));
6471 /*------------ Recovery State Machine----------------*/
6473 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6474 << "state<" << get_state_name() << ">: ")
6476 /*------Crashed-------*/
6477 PG::RecoveryState::Crashed::Crashed(my_context ctx
)
6479 NamedState(context
< RecoveryMachine
>().pg
, "Crashed")
6481 context
< RecoveryMachine
>().log_enter(state_name
);
6482 assert(0 == "we got a bad state machine event");
6486 /*------Initial-------*/
6487 PG::RecoveryState::Initial::Initial(my_context ctx
)
6489 NamedState(context
< RecoveryMachine
>().pg
, "Initial")
6491 context
< RecoveryMachine
>().log_enter(state_name
);
6494 boost::statechart::result
PG::RecoveryState::Initial::react(const Load
& l
)
6496 PG
*pg
= context
< RecoveryMachine
>().pg
;
6498 // do we tell someone we're here?
6499 pg
->send_notify
= (!pg
->is_primary());
6500 pg
->update_store_with_options();
6502 pg
->update_store_on_load();
6504 return transit
< Reset
>();
6507 boost::statechart::result
PG::RecoveryState::Initial::react(const MNotifyRec
& notify
)
6509 PG
*pg
= context
< RecoveryMachine
>().pg
;
6510 pg
->proc_replica_info(
6511 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
6512 pg
->set_last_peering_reset();
6513 return transit
< Primary
>();
6516 boost::statechart::result
PG::RecoveryState::Initial::react(const MInfoRec
& i
)
6518 PG
*pg
= context
< RecoveryMachine
>().pg
;
6519 assert(!pg
->is_primary());
6521 return transit
< Stray
>();
6524 boost::statechart::result
PG::RecoveryState::Initial::react(const MLogRec
& i
)
6526 PG
*pg
= context
< RecoveryMachine
>().pg
;
6527 assert(!pg
->is_primary());
6529 return transit
< Stray
>();
6532 void PG::RecoveryState::Initial::exit()
6534 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6535 PG
*pg
= context
< RecoveryMachine
>().pg
;
6536 utime_t dur
= ceph_clock_now() - enter_time
;
6537 pg
->osd
->recoverystate_perf
->tinc(rs_initial_latency
, dur
);
6540 /*------Started-------*/
6541 PG::RecoveryState::Started::Started(my_context ctx
)
6543 NamedState(context
< RecoveryMachine
>().pg
, "Started")
6545 context
< RecoveryMachine
>().log_enter(state_name
);
6548 boost::statechart::result
6549 PG::RecoveryState::Started::react(const IntervalFlush
&)
6551 PG
*pg
= context
< RecoveryMachine
>().pg
;
6552 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
6553 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
6554 return discard_event();
6558 boost::statechart::result
6559 PG::RecoveryState::Started::react(const FlushedEvt
&)
6561 PG
*pg
= context
< RecoveryMachine
>().pg
;
6563 return discard_event();
6567 boost::statechart::result
PG::RecoveryState::Started::react(const AdvMap
& advmap
)
6569 PG
*pg
= context
< RecoveryMachine
>().pg
;
6570 ldout(pg
->cct
, 10) << "Started advmap" << dendl
;
6571 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
6572 if (pg
->should_restart_peering(
6574 advmap
.acting_primary
,
6579 ldout(pg
->cct
, 10) << "should_restart_peering, transitioning to Reset"
6582 return transit
< Reset
>();
6584 pg
->remove_down_peer_info(advmap
.osdmap
);
6585 return discard_event();
6588 boost::statechart::result
PG::RecoveryState::Started::react(const QueryState
& q
)
6590 q
.f
->open_object_section("state");
6591 q
.f
->dump_string("name", state_name
);
6592 q
.f
->dump_stream("enter_time") << enter_time
;
6593 q
.f
->close_section();
6594 return discard_event();
6597 void PG::RecoveryState::Started::exit()
6599 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6600 PG
*pg
= context
< RecoveryMachine
>().pg
;
6601 utime_t dur
= ceph_clock_now() - enter_time
;
6602 pg
->osd
->recoverystate_perf
->tinc(rs_started_latency
, dur
);
6605 /*--------Reset---------*/
6606 PG::RecoveryState::Reset::Reset(my_context ctx
)
6608 NamedState(context
< RecoveryMachine
>().pg
, "Reset")
6610 context
< RecoveryMachine
>().log_enter(state_name
);
6611 PG
*pg
= context
< RecoveryMachine
>().pg
;
6613 pg
->flushes_in_progress
= 0;
6614 pg
->set_last_peering_reset();
6617 boost::statechart::result
6618 PG::RecoveryState::Reset::react(const FlushedEvt
&)
6620 PG
*pg
= context
< RecoveryMachine
>().pg
;
6622 return discard_event();
6625 boost::statechart::result
6626 PG::RecoveryState::Reset::react(const IntervalFlush
&)
6628 PG
*pg
= context
< RecoveryMachine
>().pg
;
6629 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
6630 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
6631 return discard_event();
6634 boost::statechart::result
PG::RecoveryState::Reset::react(const AdvMap
& advmap
)
6636 PG
*pg
= context
< RecoveryMachine
>().pg
;
6637 ldout(pg
->cct
, 10) << "Reset advmap" << dendl
;
6639 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
6641 if (pg
->should_restart_peering(
6643 advmap
.acting_primary
,
6648 ldout(pg
->cct
, 10) << "should restart peering, calling start_peering_interval again"
6650 pg
->start_peering_interval(
6652 advmap
.newup
, advmap
.up_primary
,
6653 advmap
.newacting
, advmap
.acting_primary
,
6654 context
< RecoveryMachine
>().get_cur_transaction());
6656 pg
->remove_down_peer_info(advmap
.osdmap
);
6657 pg
->check_past_interval_bounds();
6658 return discard_event();
6661 boost::statechart::result
PG::RecoveryState::Reset::react(const ActMap
&)
6663 PG
*pg
= context
< RecoveryMachine
>().pg
;
6664 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
6665 context
< RecoveryMachine
>().send_notify(
6668 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
6669 pg
->get_osdmap()->get_epoch(),
6670 pg
->get_osdmap()->get_epoch(),
6672 pg
->past_intervals
);
6675 pg
->update_heartbeat_peers();
6678 return transit
< Started
>();
6681 boost::statechart::result
PG::RecoveryState::Reset::react(const QueryState
& q
)
6683 q
.f
->open_object_section("state");
6684 q
.f
->dump_string("name", state_name
);
6685 q
.f
->dump_stream("enter_time") << enter_time
;
6686 q
.f
->close_section();
6687 return discard_event();
6690 void PG::RecoveryState::Reset::exit()
6692 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6693 PG
*pg
= context
< RecoveryMachine
>().pg
;
6694 utime_t dur
= ceph_clock_now() - enter_time
;
6695 pg
->osd
->recoverystate_perf
->tinc(rs_reset_latency
, dur
);
6698 /*-------Start---------*/
6699 PG::RecoveryState::Start::Start(my_context ctx
)
6701 NamedState(context
< RecoveryMachine
>().pg
, "Start")
6703 context
< RecoveryMachine
>().log_enter(state_name
);
6705 PG
*pg
= context
< RecoveryMachine
>().pg
;
6706 if (pg
->is_primary()) {
6707 ldout(pg
->cct
, 1) << "transitioning to Primary" << dendl
;
6708 post_event(MakePrimary());
6710 ldout(pg
->cct
, 1) << "transitioning to Stray" << dendl
;
6711 post_event(MakeStray());
6715 void PG::RecoveryState::Start::exit()
6717 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6718 PG
*pg
= context
< RecoveryMachine
>().pg
;
6719 utime_t dur
= ceph_clock_now() - enter_time
;
6720 pg
->osd
->recoverystate_perf
->tinc(rs_start_latency
, dur
);
6723 /*---------Primary--------*/
6724 PG::RecoveryState::Primary::Primary(my_context ctx
)
6726 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary")
6728 context
< RecoveryMachine
>().log_enter(state_name
);
6729 PG
*pg
= context
< RecoveryMachine
>().pg
;
6730 assert(pg
->want_acting
.empty());
6732 // set CREATING bit until we have peered for the first time.
6733 if (pg
->info
.history
.last_epoch_started
== 0) {
6734 pg
->state_set(PG_STATE_CREATING
);
6735 // use the history timestamp, which ultimately comes from the
6736 // monitor in the create case.
6737 utime_t t
= pg
->info
.history
.last_scrub_stamp
;
6738 pg
->info
.stats
.last_fresh
= t
;
6739 pg
->info
.stats
.last_active
= t
;
6740 pg
->info
.stats
.last_change
= t
;
6741 pg
->info
.stats
.last_peered
= t
;
6742 pg
->info
.stats
.last_clean
= t
;
6743 pg
->info
.stats
.last_unstale
= t
;
6744 pg
->info
.stats
.last_undegraded
= t
;
6745 pg
->info
.stats
.last_fullsized
= t
;
6746 pg
->info
.stats
.last_scrub_stamp
= t
;
6747 pg
->info
.stats
.last_deep_scrub_stamp
= t
;
6748 pg
->info
.stats
.last_clean_scrub_stamp
= t
;
6752 boost::statechart::result
PG::RecoveryState::Primary::react(const MNotifyRec
& notevt
)
6754 PG
*pg
= context
< RecoveryMachine
>().pg
;
6755 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
6756 pg
->proc_replica_info(
6757 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
6758 return discard_event();
6761 boost::statechart::result
PG::RecoveryState::Primary::react(const ActMap
&)
6763 PG
*pg
= context
< RecoveryMachine
>().pg
;
6764 ldout(pg
->cct
, 7) << "handle ActMap primary" << dendl
;
6765 pg
->publish_stats_to_osd();
6767 return discard_event();
6770 void PG::RecoveryState::Primary::exit()
6772 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6773 PG
*pg
= context
< RecoveryMachine
>().pg
;
6774 pg
->want_acting
.clear();
6775 utime_t dur
= ceph_clock_now() - enter_time
;
6776 pg
->osd
->recoverystate_perf
->tinc(rs_primary_latency
, dur
);
6777 pg
->clear_primary_state();
6778 pg
->state_clear(PG_STATE_CREATING
);
6781 /*---------Peering--------*/
6782 PG::RecoveryState::Peering::Peering(my_context ctx
)
6784 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering"),
6785 history_les_bound(false)
6787 context
< RecoveryMachine
>().log_enter(state_name
);
6789 PG
*pg
= context
< RecoveryMachine
>().pg
;
6790 assert(!pg
->is_peered());
6791 assert(!pg
->is_peering());
6792 assert(pg
->is_primary());
6793 pg
->state_set(PG_STATE_PEERING
);
6796 boost::statechart::result
PG::RecoveryState::Peering::react(const AdvMap
& advmap
)
6798 PG
*pg
= context
< RecoveryMachine
>().pg
;
6799 ldout(pg
->cct
, 10) << "Peering advmap" << dendl
;
6800 if (prior_set
.affected_by_map(*(advmap
.osdmap
), pg
)) {
6801 ldout(pg
->cct
, 1) << "Peering, affected_by_map, going to Reset" << dendl
;
6803 return transit
< Reset
>();
6806 pg
->adjust_need_up_thru(advmap
.osdmap
);
6808 return forward_event();
6811 boost::statechart::result
PG::RecoveryState::Peering::react(const QueryState
& q
)
6813 PG
*pg
= context
< RecoveryMachine
>().pg
;
6815 q
.f
->open_object_section("state");
6816 q
.f
->dump_string("name", state_name
);
6817 q
.f
->dump_stream("enter_time") << enter_time
;
6819 q
.f
->open_array_section("past_intervals");
6820 pg
->past_intervals
.dump(q
.f
);
6821 q
.f
->close_section();
6823 q
.f
->open_array_section("probing_osds");
6824 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
6825 p
!= prior_set
.probe
.end();
6827 q
.f
->dump_stream("osd") << *p
;
6828 q
.f
->close_section();
6830 if (prior_set
.pg_down
)
6831 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
6833 q
.f
->open_array_section("down_osds_we_would_probe");
6834 for (set
<int>::iterator p
= prior_set
.down
.begin();
6835 p
!= prior_set
.down
.end();
6837 q
.f
->dump_int("osd", *p
);
6838 q
.f
->close_section();
6840 q
.f
->open_array_section("peering_blocked_by");
6841 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
6842 p
!= prior_set
.blocked_by
.end();
6844 q
.f
->open_object_section("osd");
6845 q
.f
->dump_int("osd", p
->first
);
6846 q
.f
->dump_int("current_lost_at", p
->second
);
6847 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
6848 q
.f
->close_section();
6850 q
.f
->close_section();
6852 if (history_les_bound
) {
6853 q
.f
->open_array_section("peering_blocked_by_detail");
6854 q
.f
->open_object_section("item");
6855 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
6856 q
.f
->close_section();
6857 q
.f
->close_section();
6860 q
.f
->close_section();
6861 return forward_event();
6864 void PG::RecoveryState::Peering::exit()
6866 PG
*pg
= context
< RecoveryMachine
>().pg
;
6867 ldout(pg
->cct
, 10) << "Leaving Peering" << dendl
;
6868 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6869 pg
->state_clear(PG_STATE_PEERING
);
6870 pg
->clear_probe_targets();
6872 utime_t dur
= ceph_clock_now() - enter_time
;
6873 pg
->osd
->recoverystate_perf
->tinc(rs_peering_latency
, dur
);
6877 /*------Backfilling-------*/
6878 PG::RecoveryState::Backfilling::Backfilling(my_context ctx
)
6880 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Backfilling")
6882 context
< RecoveryMachine
>().log_enter(state_name
);
6883 PG
*pg
= context
< RecoveryMachine
>().pg
;
6884 pg
->backfill_reserved
= true;
6885 pg
->queue_recovery();
6886 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
6887 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6888 pg
->state_set(PG_STATE_BACKFILLING
);
6889 pg
->publish_stats_to_osd();
6892 boost::statechart::result
6893 PG::RecoveryState::Backfilling::react(const DeferBackfill
&c
)
6895 PG
*pg
= context
< RecoveryMachine
>().pg
;
6896 ldout(pg
->cct
, 10) << "defer backfill, retry delay " << c
.delay
<< dendl
;
6897 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6899 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6900 pg
->state_clear(PG_STATE_BACKFILLING
);
6902 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6903 it
!= pg
->backfill_targets
.end();
6905 assert(*it
!= pg
->pg_whoami
);
6906 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6907 it
->osd
, pg
->get_osdmap()->get_epoch());
6909 pg
->osd
->send_message_osd_cluster(
6910 new MBackfillReserve(
6911 MBackfillReserve::REJECT
,
6912 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6913 pg
->get_osdmap()->get_epoch()),
6919 if (!pg
->waiting_on_backfill
.empty()) {
6920 pg
->waiting_on_backfill
.clear();
6921 pg
->finish_recovery_op(hobject_t::get_max());
6924 pg
->schedule_backfill_retry(c
.delay
);
6925 return transit
<NotBackfilling
>();
6928 boost::statechart::result
6929 PG::RecoveryState::Backfilling::react(const UnfoundBackfill
&c
)
6931 PG
*pg
= context
< RecoveryMachine
>().pg
;
6932 ldout(pg
->cct
, 10) << "backfill has unfound, can't continue" << dendl
;
6933 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6935 pg
->state_set(PG_STATE_BACKFILL_UNFOUND
);
6936 pg
->state_clear(PG_STATE_BACKFILLING
);
6938 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6939 it
!= pg
->backfill_targets
.end();
6941 assert(*it
!= pg
->pg_whoami
);
6942 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6943 it
->osd
, pg
->get_osdmap()->get_epoch());
6945 pg
->osd
->send_message_osd_cluster(
6946 new MBackfillReserve(
6947 MBackfillReserve::REJECT
,
6948 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6949 pg
->get_osdmap()->get_epoch()),
6954 pg
->waiting_on_backfill
.clear();
6956 return transit
<NotBackfilling
>();
6959 boost::statechart::result
6960 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected
&)
6962 PG
*pg
= context
< RecoveryMachine
>().pg
;
6963 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6964 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6966 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6967 it
!= pg
->backfill_targets
.end();
6969 assert(*it
!= pg
->pg_whoami
);
6970 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6971 it
->osd
, pg
->get_osdmap()->get_epoch());
6973 pg
->osd
->send_message_osd_cluster(
6974 new MBackfillReserve(
6975 MBackfillReserve::REJECT
,
6976 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6977 pg
->get_osdmap()->get_epoch()),
6982 if (!pg
->waiting_on_backfill
.empty()) {
6983 pg
->waiting_on_backfill
.clear();
6984 pg
->finish_recovery_op(hobject_t::get_max());
6987 pg
->schedule_backfill_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
6988 return transit
<NotBackfilling
>();
6991 void PG::RecoveryState::Backfilling::exit()
6993 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6994 PG
*pg
= context
< RecoveryMachine
>().pg
;
6995 pg
->backfill_reserved
= false;
6996 pg
->backfill_reserving
= false;
6997 pg
->state_clear(PG_STATE_BACKFILLING
);
6998 pg
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
6999 utime_t dur
= ceph_clock_now() - enter_time
;
7000 pg
->osd
->recoverystate_perf
->tinc(rs_backfilling_latency
, dur
);
7003 /*--WaitRemoteBackfillReserved--*/
7005 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
7007 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7008 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
7010 context
< RecoveryMachine
>().log_enter(state_name
);
7011 PG
*pg
= context
< RecoveryMachine
>().pg
;
7012 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
7013 pg
->publish_stats_to_osd();
7014 post_event(RemoteBackfillReserved());
7017 boost::statechart::result
7018 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
7020 PG
*pg
= context
< RecoveryMachine
>().pg
;
7022 if (backfill_osd_it
!= context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
7023 //The primary never backfills itself
7024 assert(*backfill_osd_it
!= pg
->pg_whoami
);
7025 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7026 backfill_osd_it
->osd
, pg
->get_osdmap()->get_epoch());
7028 pg
->osd
->send_message_osd_cluster(
7029 new MBackfillReserve(
7030 MBackfillReserve::REQUEST
,
7031 spg_t(pg
->info
.pgid
.pgid
, backfill_osd_it
->shard
),
7032 pg
->get_osdmap()->get_epoch(),
7033 pg
->get_backfill_priority()),
7038 post_event(AllBackfillsReserved());
7040 return discard_event();
7043 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7045 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7046 PG
*pg
= context
< RecoveryMachine
>().pg
;
7047 utime_t dur
= ceph_clock_now() - enter_time
;
7048 pg
->osd
->recoverystate_perf
->tinc(rs_waitremotebackfillreserved_latency
, dur
);
7051 boost::statechart::result
7052 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected
&evt
)
7054 PG
*pg
= context
< RecoveryMachine
>().pg
;
7055 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7057 // Send REJECT to all previously acquired reservations
7058 set
<pg_shard_t
>::const_iterator it
, begin
, end
, next
;
7059 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
7060 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
7061 assert(begin
!= end
);
7062 for (next
= it
= begin
, ++next
; next
!= backfill_osd_it
; ++it
, ++next
) {
7063 //The primary never backfills itself
7064 assert(*it
!= pg
->pg_whoami
);
7065 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7066 it
->osd
, pg
->get_osdmap()->get_epoch());
7068 pg
->osd
->send_message_osd_cluster(
7069 new MBackfillReserve(
7070 MBackfillReserve::REJECT
,
7071 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
7072 pg
->get_osdmap()->get_epoch()),
7077 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
7078 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
7079 pg
->publish_stats_to_osd();
7081 pg
->schedule_backfill_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
7083 return transit
<NotBackfilling
>();
7086 /*--WaitLocalBackfillReserved--*/
7087 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
7089 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalBackfillReserved")
7091 context
< RecoveryMachine
>().log_enter(state_name
);
7092 PG
*pg
= context
< RecoveryMachine
>().pg
;
7093 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
7094 pg
->osd
->local_reserver
.request_reservation(
7096 new QueuePeeringEvt
<LocalBackfillReserved
>(
7097 pg
, pg
->get_osdmap()->get_epoch(),
7098 LocalBackfillReserved()),
7099 pg
->get_backfill_priority(),
7100 new QueuePeeringEvt
<DeferBackfill
>(
7101 pg
, pg
->get_osdmap()->get_epoch(),
7102 DeferBackfill(0.0)));
7103 pg
->publish_stats_to_osd();
7106 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7108 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7109 PG
*pg
= context
< RecoveryMachine
>().pg
;
7110 utime_t dur
= ceph_clock_now() - enter_time
;
7111 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalbackfillreserved_latency
, dur
);
7114 /*----NotBackfilling------*/
7115 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx
)
7117 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotBackfilling")
7119 context
< RecoveryMachine
>().log_enter(state_name
);
7120 PG
*pg
= context
< RecoveryMachine
>().pg
;
7121 pg
->publish_stats_to_osd();
7124 boost::statechart::result
7125 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
7127 return discard_event();
7130 boost::statechart::result
7131 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected
&evt
)
7133 return discard_event();
7136 void PG::RecoveryState::NotBackfilling::exit()
7138 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7139 PG
*pg
= context
< RecoveryMachine
>().pg
;
7140 pg
->state_clear(PG_STATE_BACKFILL_UNFOUND
);
7141 utime_t dur
= ceph_clock_now() - enter_time
;
7142 pg
->osd
->recoverystate_perf
->tinc(rs_notbackfilling_latency
, dur
);
7145 /*----NotRecovering------*/
7146 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx
)
7148 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotRecovering")
7150 context
< RecoveryMachine
>().log_enter(state_name
);
7151 PG
*pg
= context
< RecoveryMachine
>().pg
;
7152 pg
->publish_stats_to_osd();
7155 void PG::RecoveryState::NotRecovering::exit()
7157 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7158 PG
*pg
= context
< RecoveryMachine
>().pg
;
7159 pg
->state_clear(PG_STATE_RECOVERY_UNFOUND
);
7160 utime_t dur
= ceph_clock_now() - enter_time
;
7161 pg
->osd
->recoverystate_perf
->tinc(rs_notrecovering_latency
, dur
);
7164 /*---RepNotRecovering----*/
7165 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx
)
7167 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepNotRecovering")
7169 context
< RecoveryMachine
>().log_enter(state_name
);
7172 boost::statechart::result
7173 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation
&evt
)
7175 PG
*pg
= context
< RecoveryMachine
>().pg
;
7176 pg
->reject_reservation();
7177 post_event(RemoteReservationRejected());
7178 return discard_event();
7181 void PG::RecoveryState::RepNotRecovering::exit()
7183 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7184 PG
*pg
= context
< RecoveryMachine
>().pg
;
7185 utime_t dur
= ceph_clock_now() - enter_time
;
7186 pg
->osd
->recoverystate_perf
->tinc(rs_repnotrecovering_latency
, dur
);
7189 /*---RepWaitRecoveryReserved--*/
7190 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
7192 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitRecoveryReserved")
7194 context
< RecoveryMachine
>().log_enter(state_name
);
7195 PG
*pg
= context
< RecoveryMachine
>().pg
;
7197 pg
->osd
->remote_reserver
.request_reservation(
7199 new QueuePeeringEvt
<RemoteRecoveryReserved
>(
7200 pg
, pg
->get_osdmap()->get_epoch(),
7201 RemoteRecoveryReserved()),
7202 pg
->get_recovery_priority());
7205 boost::statechart::result
7206 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
7208 PG
*pg
= context
< RecoveryMachine
>().pg
;
7209 pg
->osd
->send_message_osd_cluster(
7211 new MRecoveryReserve(
7212 MRecoveryReserve::GRANT
,
7213 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
7214 pg
->get_osdmap()->get_epoch()),
7215 pg
->get_osdmap()->get_epoch());
7216 return transit
<RepRecovering
>();
7219 boost::statechart::result
7220 PG::RecoveryState::RepWaitRecoveryReserved::react(
7221 const RemoteReservationCanceled
&evt
)
7223 PG
*pg
= context
< RecoveryMachine
>().pg
;
7224 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7225 return transit
<RepNotRecovering
>();
7228 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7230 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7231 PG
*pg
= context
< RecoveryMachine
>().pg
;
7232 utime_t dur
= ceph_clock_now() - enter_time
;
7233 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitrecoveryreserved_latency
, dur
);
7236 /*-RepWaitBackfillReserved*/
7237 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
7239 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitBackfillReserved")
7241 context
< RecoveryMachine
>().log_enter(state_name
);
7244 boost::statechart::result
7245 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
7247 PG
*pg
= context
< RecoveryMachine
>().pg
;
7250 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
7251 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
7252 ldout(pg
->cct
, 10) << "backfill reservation rejected: failure injection"
7254 post_event(RejectRemoteReservation());
7255 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
7256 pg
->osd
->check_backfill_full(ss
)) {
7257 ldout(pg
->cct
, 10) << "backfill reservation rejected: "
7258 << ss
.str() << dendl
;
7259 post_event(RejectRemoteReservation());
7261 pg
->osd
->remote_reserver
.request_reservation(
7263 new QueuePeeringEvt
<RemoteBackfillReserved
>(
7264 pg
, pg
->get_osdmap()->get_epoch(),
7265 RemoteBackfillReserved()), evt
.priority
);
7267 return transit
<RepWaitBackfillReserved
>();
7270 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7272 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7273 PG
*pg
= context
< RecoveryMachine
>().pg
;
7274 utime_t dur
= ceph_clock_now() - enter_time
;
7275 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitbackfillreserved_latency
, dur
);
7278 boost::statechart::result
7279 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
7281 PG
*pg
= context
< RecoveryMachine
>().pg
;
7284 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
7285 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
7286 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
7287 << "failure injection" << dendl
;
7288 post_event(RejectRemoteReservation());
7289 return discard_event();
7290 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
7291 pg
->osd
->check_backfill_full(ss
)) {
7292 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
7293 << ss
.str() << dendl
;
7294 post_event(RejectRemoteReservation());
7295 return discard_event();
7297 pg
->osd
->send_message_osd_cluster(
7299 new MBackfillReserve(
7300 MBackfillReserve::GRANT
,
7301 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
7302 pg
->get_osdmap()->get_epoch()),
7303 pg
->get_osdmap()->get_epoch());
7304 return transit
<RepRecovering
>();
7308 boost::statechart::result
7309 PG::RecoveryState::RepWaitBackfillReserved::react(
7310 const RejectRemoteReservation
&evt
)
7312 PG
*pg
= context
< RecoveryMachine
>().pg
;
7313 pg
->reject_reservation();
7314 post_event(RemoteReservationRejected());
7315 return discard_event();
7318 boost::statechart::result
7319 PG::RecoveryState::RepWaitBackfillReserved::react(
7320 const RemoteReservationRejected
&evt
)
7322 PG
*pg
= context
< RecoveryMachine
>().pg
;
7323 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7324 return transit
<RepNotRecovering
>();
7327 boost::statechart::result
7328 PG::RecoveryState::RepWaitBackfillReserved::react(
7329 const RemoteReservationCanceled
&evt
)
7331 PG
*pg
= context
< RecoveryMachine
>().pg
;
7332 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7333 return transit
<RepNotRecovering
>();
7336 /*---RepRecovering-------*/
7337 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx
)
7339 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepRecovering")
7341 context
< RecoveryMachine
>().log_enter(state_name
);
7344 boost::statechart::result
7345 PG::RecoveryState::RepRecovering::react(const BackfillTooFull
&)
7347 PG
*pg
= context
< RecoveryMachine
>().pg
;
7348 pg
->reject_reservation();
7349 return discard_event();
7352 void PG::RecoveryState::RepRecovering::exit()
7354 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7355 PG
*pg
= context
< RecoveryMachine
>().pg
;
7356 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7357 utime_t dur
= ceph_clock_now() - enter_time
;
7358 pg
->osd
->recoverystate_perf
->tinc(rs_reprecovering_latency
, dur
);
7361 /*------Activating--------*/
7362 PG::RecoveryState::Activating::Activating(my_context ctx
)
7364 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Activating")
7366 context
< RecoveryMachine
>().log_enter(state_name
);
7369 void PG::RecoveryState::Activating::exit()
7371 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7372 PG
*pg
= context
< RecoveryMachine
>().pg
;
7373 utime_t dur
= ceph_clock_now() - enter_time
;
7374 pg
->osd
->recoverystate_perf
->tinc(rs_activating_latency
, dur
);
7377 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
7379 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalRecoveryReserved")
7381 context
< RecoveryMachine
>().log_enter(state_name
);
7382 PG
*pg
= context
< RecoveryMachine
>().pg
;
7384 // Make sure all nodes that part of the recovery aren't full
7385 if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
7386 pg
->osd
->check_osdmap_full(pg
->actingbackfill
)) {
7387 post_event(RecoveryTooFull());
7391 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7392 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
7393 pg
->osd
->local_reserver
.request_reservation(
7395 new QueuePeeringEvt
<LocalRecoveryReserved
>(
7396 pg
, pg
->get_osdmap()->get_epoch(),
7397 LocalRecoveryReserved()),
7398 pg
->get_recovery_priority(),
7399 new QueuePeeringEvt
<DeferRecovery
>(
7400 pg
, pg
->get_osdmap()->get_epoch(),
7401 DeferRecovery(0.0)));
7402 pg
->publish_stats_to_osd();
7405 boost::statechart::result
7406 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
7408 PG
*pg
= context
< RecoveryMachine
>().pg
;
7409 pg
->state_set(PG_STATE_RECOVERY_TOOFULL
);
7410 pg
->schedule_recovery_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
7411 return transit
<NotRecovering
>();
7414 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7416 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7417 PG
*pg
= context
< RecoveryMachine
>().pg
;
7418 utime_t dur
= ceph_clock_now() - enter_time
;
7419 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
7422 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
7424 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7425 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
7427 context
< RecoveryMachine
>().log_enter(state_name
);
7428 post_event(RemoteRecoveryReserved());
7431 boost::statechart::result
7432 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
7433 PG
*pg
= context
< RecoveryMachine
>().pg
;
7435 if (remote_recovery_reservation_it
!= context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
7436 assert(*remote_recovery_reservation_it
!= pg
->pg_whoami
);
7437 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7438 remote_recovery_reservation_it
->osd
, pg
->get_osdmap()->get_epoch());
7440 pg
->osd
->send_message_osd_cluster(
7441 new MRecoveryReserve(
7442 MRecoveryReserve::REQUEST
,
7443 spg_t(pg
->info
.pgid
.pgid
, remote_recovery_reservation_it
->shard
),
7444 pg
->get_osdmap()->get_epoch()),
7447 ++remote_recovery_reservation_it
;
7449 post_event(AllRemotesReserved());
7451 return discard_event();
7454 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7456 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7457 PG
*pg
= context
< RecoveryMachine
>().pg
;
7458 utime_t dur
= ceph_clock_now() - enter_time
;
7459 pg
->osd
->recoverystate_perf
->tinc(rs_waitremoterecoveryreserved_latency
, dur
);
7462 PG::RecoveryState::Recovering::Recovering(my_context ctx
)
7464 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovering")
7466 context
< RecoveryMachine
>().log_enter(state_name
);
7468 PG
*pg
= context
< RecoveryMachine
>().pg
;
7469 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
7470 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7471 pg
->state_set(PG_STATE_RECOVERING
);
7472 assert(!pg
->state_test(PG_STATE_ACTIVATING
));
7473 pg
->publish_stats_to_osd();
7474 pg
->queue_recovery();
7477 void PG::RecoveryState::Recovering::release_reservations(bool cancel
)
7479 PG
*pg
= context
< RecoveryMachine
>().pg
;
7480 assert(cancel
|| !pg
->pg_log
.get_missing().have_missing());
7482 // release remote reservations
7483 for (set
<pg_shard_t
>::const_iterator i
=
7484 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
7485 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
7487 if (*i
== pg
->pg_whoami
) // skip myself
7489 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7490 i
->osd
, pg
->get_osdmap()->get_epoch());
7492 pg
->osd
->send_message_osd_cluster(
7493 new MRecoveryReserve(
7494 MRecoveryReserve::RELEASE
,
7495 spg_t(pg
->info
.pgid
.pgid
, i
->shard
),
7496 pg
->get_osdmap()->get_epoch()),
7502 boost::statechart::result
7503 PG::RecoveryState::Recovering::react(const AllReplicasRecovered
&evt
)
7505 PG
*pg
= context
< RecoveryMachine
>().pg
;
7506 pg
->state_clear(PG_STATE_RECOVERING
);
7507 pg
->state_clear(PG_STATE_FORCED_RECOVERY
);
7508 release_reservations();
7509 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7510 return transit
<Recovered
>();
7513 boost::statechart::result
7514 PG::RecoveryState::Recovering::react(const RequestBackfill
&evt
)
7516 PG
*pg
= context
< RecoveryMachine
>().pg
;
7517 pg
->state_clear(PG_STATE_RECOVERING
);
7518 pg
->state_clear(PG_STATE_FORCED_RECOVERY
);
7519 release_reservations();
7520 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7521 // XXX: Is this needed?
7522 pg
->publish_stats_to_osd();
7523 return transit
<WaitLocalBackfillReserved
>();
7526 boost::statechart::result
7527 PG::RecoveryState::Recovering::react(const DeferRecovery
&evt
)
7529 PG
*pg
= context
< RecoveryMachine
>().pg
;
7530 if (!pg
->state_test(PG_STATE_RECOVERING
)) {
7531 // we may have finished recovery and have an AllReplicasRecovered
7532 // event queued to move us to the next state.
7533 ldout(pg
->cct
, 10) << "got defer recovery but not recovering" << dendl
;
7534 return discard_event();
7536 ldout(pg
->cct
, 10) << "defer recovery, retry delay " << evt
.delay
<< dendl
;
7537 pg
->state_clear(PG_STATE_RECOVERING
);
7538 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
7539 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7540 release_reservations(true);
7541 pg
->schedule_recovery_retry(evt
.delay
);
7542 return transit
<NotRecovering
>();
7545 boost::statechart::result
7546 PG::RecoveryState::Recovering::react(const UnfoundRecovery
&evt
)
7548 PG
*pg
= context
< RecoveryMachine
>().pg
;
7549 ldout(pg
->cct
, 10) << "recovery has unfound, can't continue" << dendl
;
7550 pg
->state_set(PG_STATE_RECOVERY_UNFOUND
);
7551 pg
->state_clear(PG_STATE_RECOVERING
);
7552 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7553 release_reservations(true);
7554 return transit
<NotRecovering
>();
7557 void PG::RecoveryState::Recovering::exit()
7559 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7560 PG
*pg
= context
< RecoveryMachine
>().pg
;
7561 utime_t dur
= ceph_clock_now() - enter_time
;
7562 pg
->osd
->recoverystate_perf
->tinc(rs_recovering_latency
, dur
);
7565 PG::RecoveryState::Recovered::Recovered(my_context ctx
)
7567 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovered")
7569 pg_shard_t auth_log_shard
;
7571 context
< RecoveryMachine
>().log_enter(state_name
);
7573 PG
*pg
= context
< RecoveryMachine
>().pg
;
7575 assert(!pg
->needs_recovery());
7577 // if we finished backfill, all acting are active; recheck if
7578 // DEGRADED | UNDERSIZED is appropriate.
7579 assert(!pg
->actingbackfill
.empty());
7580 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <=
7581 pg
->actingbackfill
.size()) {
7582 pg
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
7583 pg
->publish_stats_to_osd();
7586 // adjust acting set? (e.g. because backfill completed...)
7587 bool history_les_bound
= false;
7588 if (pg
->acting
!= pg
->up
&& !pg
->choose_acting(auth_log_shard
,
7589 true, &history_les_bound
))
7590 assert(pg
->want_acting
.size());
7592 if (context
< Active
>().all_replicas_activated
)
7593 post_event(GoClean());
7596 void PG::RecoveryState::Recovered::exit()
7598 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7599 PG
*pg
= context
< RecoveryMachine
>().pg
;
7600 utime_t dur
= ceph_clock_now() - enter_time
;
7601 pg
->osd
->recoverystate_perf
->tinc(rs_recovered_latency
, dur
);
7604 PG::RecoveryState::Clean::Clean(my_context ctx
)
7606 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Clean")
7608 context
< RecoveryMachine
>().log_enter(state_name
);
7610 PG
*pg
= context
< RecoveryMachine
>().pg
;
7612 if (pg
->info
.last_complete
!= pg
->info
.last_update
) {
7615 pg
->finish_recovery(*context
< RecoveryMachine
>().get_on_safe_context_list());
7617 if (pg
->is_active()) {
7621 pg
->share_pg_info();
7622 pg
->publish_stats_to_osd();
7623 pg
->requeue_ops(pg
->waiting_for_clean_to_primary_repair
);
7626 void PG::RecoveryState::Clean::exit()
7628 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7629 PG
*pg
= context
< RecoveryMachine
>().pg
;
7630 pg
->state_clear(PG_STATE_CLEAN
);
7631 utime_t dur
= ceph_clock_now() - enter_time
;
7632 pg
->osd
->recoverystate_perf
->tinc(rs_clean_latency
, dur
);
7635 template <typename T
>
7636 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
7638 set
<int> osds_found
;
7639 set
<pg_shard_t
> out
;
7640 for (typename
T::const_iterator i
= in
.begin();
7643 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
7644 osds_found
.insert(i
->osd
);
7651 /*---------Active---------*/
7652 PG::RecoveryState::Active::Active(my_context ctx
)
7654 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active"),
7655 remote_shards_to_reserve_recovery(
7656 unique_osd_shard_set(
7657 context
< RecoveryMachine
>().pg
->pg_whoami
,
7658 context
< RecoveryMachine
>().pg
->actingbackfill
)),
7659 remote_shards_to_reserve_backfill(
7660 unique_osd_shard_set(
7661 context
< RecoveryMachine
>().pg
->pg_whoami
,
7662 context
< RecoveryMachine
>().pg
->backfill_targets
)),
7663 all_replicas_activated(false)
7665 context
< RecoveryMachine
>().log_enter(state_name
);
7667 PG
*pg
= context
< RecoveryMachine
>().pg
;
7669 assert(!pg
->backfill_reserving
);
7670 assert(!pg
->backfill_reserved
);
7671 assert(pg
->is_primary());
7672 ldout(pg
->cct
, 10) << "In Active, about to call activate" << dendl
;
7674 context
< RecoveryMachine
>().get_cur_transaction(),
7675 context
< RecoveryMachine
>().get_on_applied_context_list(),
7676 context
< RecoveryMachine
>().get_on_safe_context_list());
7677 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
7678 pg
->get_osdmap()->get_epoch(),
7679 *context
< RecoveryMachine
>().get_on_safe_context_list(),
7680 *context
< RecoveryMachine
>().get_query_map(),
7681 context
< RecoveryMachine
>().get_info_map(),
7682 context
< RecoveryMachine
>().get_recovery_ctx());
7684 // everyone has to commit/ack before we are truly active
7685 pg
->blocked_by
.clear();
7686 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
7687 p
!= pg
->actingbackfill
.end();
7689 if (p
->shard
!= pg
->pg_whoami
.shard
) {
7690 pg
->blocked_by
.insert(p
->shard
);
7693 pg
->publish_stats_to_osd();
7694 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7697 boost::statechart::result
PG::RecoveryState::Active::react(const AdvMap
& advmap
)
7699 PG
*pg
= context
< RecoveryMachine
>().pg
;
7700 ldout(pg
->cct
, 10) << "Active advmap" << dendl
;
7701 if (!pg
->pool
.newly_removed_snaps
.empty()) {
7702 pg
->snap_trimq
.union_of(pg
->pool
.newly_removed_snaps
);
7703 ldout(pg
->cct
, 10) << *pg
<< " snap_trimq now " << pg
->snap_trimq
<< dendl
;
7704 pg
->dirty_info
= true;
7705 pg
->dirty_big_info
= true;
7708 for (size_t i
= 0; i
< pg
->want_acting
.size(); i
++) {
7709 int osd
= pg
->want_acting
[i
];
7710 if (!advmap
.osdmap
->is_up(osd
)) {
7711 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
7712 assert(pg
->is_acting(osd_with_shard
) || pg
->is_up(osd_with_shard
));
7716 bool need_publish
= false;
7717 /* Check for changes in pool size (if the acting set changed as a result,
7718 * this does not matter) */
7719 if (advmap
.lastmap
->get_pg_size(pg
->info
.pgid
.pgid
) !=
7720 pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
)) {
7721 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <= pg
->actingset
.size()) {
7722 pg
->state_clear(PG_STATE_UNDERSIZED
);
7724 pg
->state_set(PG_STATE_UNDERSIZED
);
7726 // degraded changes will be detected by call from publish_stats_to_osd()
7727 need_publish
= true;
7730 // if we haven't reported our PG stats in a long time, do so now.
7731 if (pg
->info
.stats
.reported_epoch
+ pg
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
7732 ldout(pg
->cct
, 20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - pg
->info
.stats
.reported_epoch
)
7733 << " epochs" << dendl
;
7734 need_publish
= true;
7738 pg
->publish_stats_to_osd();
7740 return forward_event();
7743 boost::statechart::result
PG::RecoveryState::Active::react(const ActMap
&)
7745 PG
*pg
= context
< RecoveryMachine
>().pg
;
7746 ldout(pg
->cct
, 10) << "Active: handling ActMap" << dendl
;
7747 assert(pg
->is_primary());
7749 if (pg
->have_unfound()) {
7750 // object may have become unfound
7751 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7754 if (pg
->cct
->_conf
->osd_check_for_log_corruption
)
7755 pg
->check_log_for_corruption(pg
->osd
->store
);
7757 uint64_t unfound
= pg
->missing_loc
.num_unfound();
7759 pg
->all_unfound_are_queried_or_lost(pg
->get_osdmap())) {
7760 if (pg
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
7761 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
7762 << " objects unfound and apparently lost, would automatically "
7763 << "mark these objects lost but this feature is not yet implemented "
7764 << "(osd_auto_mark_unfound_lost)";
7766 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has "
7767 << unfound
<< " objects unfound and apparently lost";
7770 if (pg
->is_active()) {
7771 ldout(pg
->cct
, 10) << "Active: kicking snap trim" << dendl
;
7772 pg
->kick_snap_trim();
7775 if (pg
->is_peered() &&
7777 !pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
) &&
7778 (!pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) || pg
->is_degraded())) {
7779 pg
->queue_recovery();
7781 return forward_event();
7784 boost::statechart::result
PG::RecoveryState::Active::react(const MNotifyRec
& notevt
)
7786 PG
*pg
= context
< RecoveryMachine
>().pg
;
7787 assert(pg
->is_primary());
7788 if (pg
->peer_info
.count(notevt
.from
)) {
7789 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7790 << ", already have info from that osd, ignoring"
7792 } else if (pg
->peer_purged
.count(notevt
.from
)) {
7793 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7794 << ", already purged that peer, ignoring"
7797 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7798 << ", calling proc_replica_info and discover_all_missing"
7800 pg
->proc_replica_info(
7801 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
7802 if (pg
->have_unfound()) {
7803 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7806 return discard_event();
7809 boost::statechart::result
PG::RecoveryState::Active::react(const MInfoRec
& infoevt
)
7811 PG
*pg
= context
< RecoveryMachine
>().pg
;
7812 assert(pg
->is_primary());
7814 assert(!pg
->actingbackfill
.empty());
7815 // don't update history (yet) if we are active and primary; the replica
7816 // may be telling us they have activated (and committed) but we can't
7817 // share that until _everyone_ does the same.
7818 if (pg
->is_actingbackfill(infoevt
.from
)) {
7819 ldout(pg
->cct
, 10) << " peer osd." << infoevt
.from
7820 << " activated and committed" << dendl
;
7821 pg
->peer_activated
.insert(infoevt
.from
);
7822 pg
->blocked_by
.erase(infoevt
.from
.shard
);
7823 pg
->publish_stats_to_osd();
7824 if (pg
->peer_activated
.size() == pg
->actingbackfill
.size()) {
7825 pg
->all_activated_and_committed();
7828 return discard_event();
7831 boost::statechart::result
PG::RecoveryState::Active::react(const MLogRec
& logevt
)
7833 PG
*pg
= context
< RecoveryMachine
>().pg
;
7834 ldout(pg
->cct
, 10) << "searching osd." << logevt
.from
7835 << " log for unfound items" << dendl
;
7836 pg
->proc_replica_log(
7837 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
7838 bool got_missing
= pg
->search_for_missing(
7839 pg
->peer_info
[logevt
.from
],
7840 pg
->peer_missing
[logevt
.from
],
7842 context
< RecoveryMachine
>().get_recovery_ctx());
7843 // If there are missing AND we are "fully" active then start recovery now
7844 if (got_missing
&& pg
->state_test(PG_STATE_ACTIVE
)) {
7845 post_event(DoRecovery());
7847 return discard_event();
7850 boost::statechart::result
PG::RecoveryState::Active::react(const QueryState
& q
)
7852 PG
*pg
= context
< RecoveryMachine
>().pg
;
7854 q
.f
->open_object_section("state");
7855 q
.f
->dump_string("name", state_name
);
7856 q
.f
->dump_stream("enter_time") << enter_time
;
7859 q
.f
->open_array_section("might_have_unfound");
7860 for (set
<pg_shard_t
>::iterator p
= pg
->might_have_unfound
.begin();
7861 p
!= pg
->might_have_unfound
.end();
7863 q
.f
->open_object_section("osd");
7864 q
.f
->dump_stream("osd") << *p
;
7865 if (pg
->peer_missing
.count(*p
)) {
7866 q
.f
->dump_string("status", "already probed");
7867 } else if (pg
->peer_missing_requested
.count(*p
)) {
7868 q
.f
->dump_string("status", "querying");
7869 } else if (!pg
->get_osdmap()->is_up(p
->osd
)) {
7870 q
.f
->dump_string("status", "osd is down");
7872 q
.f
->dump_string("status", "not queried");
7874 q
.f
->close_section();
7876 q
.f
->close_section();
7879 q
.f
->open_object_section("recovery_progress");
7880 pg
->dump_recovery_info(q
.f
);
7881 q
.f
->close_section();
7885 q
.f
->open_object_section("scrub");
7886 q
.f
->dump_stream("scrubber.epoch_start") << pg
->scrubber
.epoch_start
;
7887 q
.f
->dump_bool("scrubber.active", pg
->scrubber
.active
);
7888 q
.f
->dump_string("scrubber.state", Scrubber::state_string(pg
->scrubber
.state
));
7889 q
.f
->dump_stream("scrubber.start") << pg
->scrubber
.start
;
7890 q
.f
->dump_stream("scrubber.end") << pg
->scrubber
.end
;
7891 q
.f
->dump_stream("scrubber.max_end") << pg
->scrubber
.max_end
;
7892 q
.f
->dump_stream("scrubber.subset_last_update") << pg
->scrubber
.subset_last_update
;
7893 q
.f
->dump_bool("scrubber.deep", pg
->scrubber
.deep
);
7895 q
.f
->open_array_section("scrubber.waiting_on_whom");
7896 for (set
<pg_shard_t
>::iterator p
= pg
->scrubber
.waiting_on_whom
.begin();
7897 p
!= pg
->scrubber
.waiting_on_whom
.end();
7899 q
.f
->dump_stream("shard") << *p
;
7901 q
.f
->close_section();
7903 q
.f
->close_section();
7906 q
.f
->close_section();
7907 return forward_event();
7910 boost::statechart::result
PG::RecoveryState::Active::react(const AllReplicasActivated
&evt
)
7912 PG
*pg
= context
< RecoveryMachine
>().pg
;
7913 all_replicas_activated
= true;
7915 pg
->state_clear(PG_STATE_ACTIVATING
);
7916 pg
->state_clear(PG_STATE_CREATING
);
7917 if (pg
->acting
.size() >= pg
->pool
.info
.min_size
) {
7918 pg
->state_set(PG_STATE_ACTIVE
);
7920 pg
->state_set(PG_STATE_PEERED
);
7923 // info.last_epoch_started is set during activate()
7924 pg
->info
.history
.last_epoch_started
= pg
->info
.last_epoch_started
;
7925 pg
->info
.history
.last_interval_started
= pg
->info
.last_interval_started
;
7926 pg
->dirty_info
= true;
7928 pg
->share_pg_info();
7929 pg
->publish_stats_to_osd();
7934 if (pg
->flushes_in_progress
== 0) {
7935 pg
->requeue_ops(pg
->waiting_for_peered
);
7936 } else if (!pg
->waiting_for_peered
.empty()) {
7937 ldout(pg
->cct
, 10) << __func__
<< " flushes in progress, moving "
7938 << pg
->waiting_for_peered
.size()
7939 << " items to waiting_for_flush"
7941 assert(pg
->waiting_for_flush
.empty());
7942 pg
->waiting_for_flush
.swap(pg
->waiting_for_peered
);
7947 return discard_event();
7950 void PG::RecoveryState::Active::exit()
7952 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7953 PG
*pg
= context
< RecoveryMachine
>().pg
;
7954 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7956 pg
->blocked_by
.clear();
7957 pg
->backfill_reserved
= false;
7958 pg
->backfill_reserving
= false;
7959 pg
->state_clear(PG_STATE_ACTIVATING
);
7960 pg
->state_clear(PG_STATE_DEGRADED
);
7961 pg
->state_clear(PG_STATE_UNDERSIZED
);
7962 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
7963 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
7964 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
7965 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7966 utime_t dur
= ceph_clock_now() - enter_time
;
7967 pg
->osd
->recoverystate_perf
->tinc(rs_active_latency
, dur
);
7971 /*------ReplicaActive-----*/
7972 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx
)
7974 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive")
7976 context
< RecoveryMachine
>().log_enter(state_name
);
7978 PG
*pg
= context
< RecoveryMachine
>().pg
;
7980 context
< RecoveryMachine
>().get_cur_transaction(),
7981 context
< RecoveryMachine
>().get_on_applied_context_list(),
7982 context
< RecoveryMachine
>().get_on_safe_context_list());
7986 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(
7987 const Activate
& actevt
) {
7988 PG
*pg
= context
< RecoveryMachine
>().pg
;
7989 ldout(pg
->cct
, 10) << "In ReplicaActive, about to call activate" << dendl
;
7990 map
<int, map
<spg_t
, pg_query_t
> > query_map
;
7991 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
7992 actevt
.activation_epoch
,
7993 *context
< RecoveryMachine
>().get_on_safe_context_list(),
7994 query_map
, NULL
, NULL
);
7995 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7996 return discard_event();
7999 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MInfoRec
& infoevt
)
8001 PG
*pg
= context
< RecoveryMachine
>().pg
;
8002 pg
->proc_primary_info(*context
<RecoveryMachine
>().get_cur_transaction(),
8004 return discard_event();
8007 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MLogRec
& logevt
)
8009 PG
*pg
= context
< RecoveryMachine
>().pg
;
8010 ldout(pg
->cct
, 10) << "received log from " << logevt
.from
<< dendl
;
8011 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
8012 pg
->merge_log(*t
, logevt
.msg
->info
, logevt
.msg
->log
, logevt
.from
);
8013 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
8015 return discard_event();
8018 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const ActMap
&)
8020 PG
*pg
= context
< RecoveryMachine
>().pg
;
8021 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
8022 context
< RecoveryMachine
>().send_notify(
8025 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
8026 pg
->get_osdmap()->get_epoch(),
8027 pg
->get_osdmap()->get_epoch(),
8029 pg
->past_intervals
);
8032 return discard_event();
8035 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(
8036 const MQuery
& query
)
8038 PG
*pg
= context
< RecoveryMachine
>().pg
;
8039 pg
->fulfill_query(query
, context
<RecoveryMachine
>().get_recovery_ctx());
8040 return discard_event();
8043 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const QueryState
& q
)
8045 q
.f
->open_object_section("state");
8046 q
.f
->dump_string("name", state_name
);
8047 q
.f
->dump_stream("enter_time") << enter_time
;
8048 q
.f
->close_section();
8049 return forward_event();
8052 void PG::RecoveryState::ReplicaActive::exit()
8054 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8055 PG
*pg
= context
< RecoveryMachine
>().pg
;
8056 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
8057 utime_t dur
= ceph_clock_now() - enter_time
;
8058 pg
->osd
->recoverystate_perf
->tinc(rs_replicaactive_latency
, dur
);
8062 PG::RecoveryState::Stray::Stray(my_context ctx
)
8064 NamedState(context
< RecoveryMachine
>().pg
, "Started/Stray")
8066 context
< RecoveryMachine
>().log_enter(state_name
);
8068 PG
*pg
= context
< RecoveryMachine
>().pg
;
8069 assert(!pg
->is_peered());
8070 assert(!pg
->is_peering());
8071 assert(!pg
->is_primary());
8073 context
< RecoveryMachine
>().get_cur_transaction(),
8074 context
< RecoveryMachine
>().get_on_applied_context_list(),
8075 context
< RecoveryMachine
>().get_on_safe_context_list());
8078 boost::statechart::result
PG::RecoveryState::Stray::react(const MLogRec
& logevt
)
8080 PG
*pg
= context
< RecoveryMachine
>().pg
;
8081 MOSDPGLog
*msg
= logevt
.msg
.get();
8082 ldout(pg
->cct
, 10) << "got info+log from osd." << logevt
.from
<< " " << msg
->info
<< " " << msg
->log
<< dendl
;
8084 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
8085 if (msg
->info
.last_backfill
== hobject_t()) {
8087 pg
->unreg_next_scrub();
8088 pg
->info
= msg
->info
;
8089 pg
->reg_next_scrub();
8090 pg
->dirty_info
= true;
8091 pg
->dirty_big_info
= true; // maybe.
8093 PGLogEntryHandler rollbacker
{pg
, t
};
8094 pg
->pg_log
.reset_backfill_claim_log(msg
->log
, &rollbacker
);
8096 pg
->pg_log
.reset_backfill();
8098 pg
->merge_log(*t
, msg
->info
, msg
->log
, logevt
.from
);
8101 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
8103 post_event(Activate(logevt
.msg
->info
.last_epoch_started
));
8104 return transit
<ReplicaActive
>();
8107 boost::statechart::result
PG::RecoveryState::Stray::react(const MInfoRec
& infoevt
)
8109 PG
*pg
= context
< RecoveryMachine
>().pg
;
8110 ldout(pg
->cct
, 10) << "got info from osd." << infoevt
.from
<< " " << infoevt
.info
<< dendl
;
8112 if (pg
->info
.last_update
> infoevt
.info
.last_update
) {
8113 // rewind divergent log entries
8114 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
8115 pg
->rewind_divergent_log(*t
, infoevt
.info
.last_update
);
8116 pg
->info
.stats
= infoevt
.info
.stats
;
8117 pg
->info
.hit_set
= infoevt
.info
.hit_set
;
8120 assert(infoevt
.info
.last_update
== pg
->info
.last_update
);
8121 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
8123 post_event(Activate(infoevt
.info
.last_epoch_started
));
8124 return transit
<ReplicaActive
>();
8127 boost::statechart::result
PG::RecoveryState::Stray::react(const MQuery
& query
)
8129 PG
*pg
= context
< RecoveryMachine
>().pg
;
8130 pg
->fulfill_query(query
, context
<RecoveryMachine
>().get_recovery_ctx());
8131 return discard_event();
8134 boost::statechart::result
PG::RecoveryState::Stray::react(const ActMap
&)
8136 PG
*pg
= context
< RecoveryMachine
>().pg
;
8137 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
8138 context
< RecoveryMachine
>().send_notify(
8141 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
8142 pg
->get_osdmap()->get_epoch(),
8143 pg
->get_osdmap()->get_epoch(),
8145 pg
->past_intervals
);
8148 return discard_event();
8151 void PG::RecoveryState::Stray::exit()
8153 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8154 PG
*pg
= context
< RecoveryMachine
>().pg
;
8155 utime_t dur
= ceph_clock_now() - enter_time
;
8156 pg
->osd
->recoverystate_perf
->tinc(rs_stray_latency
, dur
);
8159 /*--------GetInfo---------*/
8160 PG::RecoveryState::GetInfo::GetInfo(my_context ctx
)
8162 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetInfo")
8164 context
< RecoveryMachine
>().log_enter(state_name
);
8166 PG
*pg
= context
< RecoveryMachine
>().pg
;
8167 pg
->check_past_interval_bounds();
8168 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
8170 assert(pg
->blocked_by
.empty());
8172 prior_set
= pg
->build_prior();
8174 pg
->reset_min_peer_features();
8176 if (prior_set
.pg_down
) {
8177 post_event(IsDown());
8178 } else if (peer_info_requested
.empty()) {
8179 post_event(GotInfo());
8183 void PG::RecoveryState::GetInfo::get_infos()
8185 PG
*pg
= context
< RecoveryMachine
>().pg
;
8186 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
8188 pg
->blocked_by
.clear();
8189 for (set
<pg_shard_t
>::const_iterator it
= prior_set
.probe
.begin();
8190 it
!= prior_set
.probe
.end();
8192 pg_shard_t peer
= *it
;
8193 if (peer
== pg
->pg_whoami
) {
8196 if (pg
->peer_info
.count(peer
)) {
8197 ldout(pg
->cct
, 10) << " have osd." << peer
<< " info " << pg
->peer_info
[peer
] << dendl
;
8200 if (peer_info_requested
.count(peer
)) {
8201 ldout(pg
->cct
, 10) << " already requested info from osd." << peer
<< dendl
;
8202 pg
->blocked_by
.insert(peer
.osd
);
8203 } else if (!pg
->get_osdmap()->is_up(peer
.osd
)) {
8204 ldout(pg
->cct
, 10) << " not querying info from down osd." << peer
<< dendl
;
8206 ldout(pg
->cct
, 10) << " querying info from osd." << peer
<< dendl
;
8207 context
< RecoveryMachine
>().send_query(
8208 peer
, pg_query_t(pg_query_t::INFO
,
8209 it
->shard
, pg
->pg_whoami
.shard
,
8211 pg
->get_osdmap()->get_epoch()));
8212 peer_info_requested
.insert(peer
);
8213 pg
->blocked_by
.insert(peer
.osd
);
8217 pg
->publish_stats_to_osd();
8220 boost::statechart::result
PG::RecoveryState::GetInfo::react(const MNotifyRec
& infoevt
)
8222 PG
*pg
= context
< RecoveryMachine
>().pg
;
8224 set
<pg_shard_t
>::iterator p
= peer_info_requested
.find(infoevt
.from
);
8225 if (p
!= peer_info_requested
.end()) {
8226 peer_info_requested
.erase(p
);
8227 pg
->blocked_by
.erase(infoevt
.from
.osd
);
8230 epoch_t old_start
= pg
->info
.history
.last_epoch_started
;
8231 if (pg
->proc_replica_info(
8232 infoevt
.from
, infoevt
.notify
.info
, infoevt
.notify
.epoch_sent
)) {
8233 // we got something new ...
8234 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
8235 if (old_start
< pg
->info
.history
.last_epoch_started
) {
8236 ldout(pg
->cct
, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl
;
8237 prior_set
= pg
->build_prior();
8239 // filter out any osds that got dropped from the probe set from
8240 // peer_info_requested. this is less expensive than restarting
8241 // peering (which would re-probe everyone).
8242 set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
8243 while (p
!= peer_info_requested
.end()) {
8244 if (prior_set
.probe
.count(*p
) == 0) {
8245 ldout(pg
->cct
, 20) << " dropping osd." << *p
<< " from info_requested, no longer in probe set" << dendl
;
8246 peer_info_requested
.erase(p
++);
8253 ldout(pg
->cct
, 20) << "Adding osd: " << infoevt
.from
.osd
<< " peer features: "
8254 << hex
<< infoevt
.features
<< dec
<< dendl
;
8255 pg
->apply_peer_features(infoevt
.features
);
8257 // are we done getting everything?
8258 if (peer_info_requested
.empty() && !prior_set
.pg_down
) {
8259 ldout(pg
->cct
, 20) << "Common peer features: " << hex
<< pg
->get_min_peer_features() << dec
<< dendl
;
8260 ldout(pg
->cct
, 20) << "Common acting features: " << hex
<< pg
->get_min_acting_features() << dec
<< dendl
;
8261 ldout(pg
->cct
, 20) << "Common upacting features: " << hex
<< pg
->get_min_upacting_features() << dec
<< dendl
;
8262 post_event(GotInfo());
8265 return discard_event();
8268 boost::statechart::result
PG::RecoveryState::GetInfo::react(const QueryState
& q
)
8270 PG
*pg
= context
< RecoveryMachine
>().pg
;
8271 q
.f
->open_object_section("state");
8272 q
.f
->dump_string("name", state_name
);
8273 q
.f
->dump_stream("enter_time") << enter_time
;
8275 q
.f
->open_array_section("requested_info_from");
8276 for (set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
8277 p
!= peer_info_requested
.end();
8279 q
.f
->open_object_section("osd");
8280 q
.f
->dump_stream("osd") << *p
;
8281 if (pg
->peer_info
.count(*p
)) {
8282 q
.f
->open_object_section("got_info");
8283 pg
->peer_info
[*p
].dump(q
.f
);
8284 q
.f
->close_section();
8286 q
.f
->close_section();
8288 q
.f
->close_section();
8290 q
.f
->close_section();
8291 return forward_event();
8294 void PG::RecoveryState::GetInfo::exit()
8296 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8297 PG
*pg
= context
< RecoveryMachine
>().pg
;
8298 utime_t dur
= ceph_clock_now() - enter_time
;
8299 pg
->osd
->recoverystate_perf
->tinc(rs_getinfo_latency
, dur
);
8300 pg
->blocked_by
.clear();
8301 pg
->publish_stats_to_osd();
8304 /*------GetLog------------*/
8305 PG::RecoveryState::GetLog::GetLog(my_context ctx
)
8308 context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetLog"),
8311 context
< RecoveryMachine
>().log_enter(state_name
);
8313 PG
*pg
= context
< RecoveryMachine
>().pg
;
8316 if (!pg
->choose_acting(auth_log_shard
, false,
8317 &context
< Peering
>().history_les_bound
)) {
8318 if (!pg
->want_acting
.empty()) {
8319 post_event(NeedActingChange());
8321 post_event(IsIncomplete());
8327 if (auth_log_shard
== pg
->pg_whoami
) {
8328 post_event(GotLog());
8332 const pg_info_t
& best
= pg
->peer_info
[auth_log_shard
];
8335 if (pg
->info
.last_update
< best
.log_tail
) {
8336 ldout(pg
->cct
, 10) << " not contiguous with osd." << auth_log_shard
<< ", down" << dendl
;
8337 post_event(IsIncomplete());
8341 // how much log to request?
8342 eversion_t request_log_from
= pg
->info
.last_update
;
8343 assert(!pg
->actingbackfill
.empty());
8344 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
8345 p
!= pg
->actingbackfill
.end();
8347 if (*p
== pg
->pg_whoami
) continue;
8348 pg_info_t
& ri
= pg
->peer_info
[*p
];
8349 if (ri
.last_update
< pg
->info
.log_tail
&& ri
.last_update
>= best
.log_tail
&&
8350 ri
.last_update
< request_log_from
)
8351 request_log_from
= ri
.last_update
;
8355 ldout(pg
->cct
, 10) << " requesting log from osd." << auth_log_shard
<< dendl
;
8356 context
<RecoveryMachine
>().send_query(
8360 auth_log_shard
.shard
, pg
->pg_whoami
.shard
,
8361 request_log_from
, pg
->info
.history
,
8362 pg
->get_osdmap()->get_epoch()));
8364 assert(pg
->blocked_by
.empty());
8365 pg
->blocked_by
.insert(auth_log_shard
.osd
);
8366 pg
->publish_stats_to_osd();
8369 boost::statechart::result
PG::RecoveryState::GetLog::react(const AdvMap
& advmap
)
8371 PG
*pg
= context
< RecoveryMachine
>().pg
;
8372 // make sure our log source didn't go down. we need to check
8373 // explicitly because it may not be part of the prior set, which
8374 // means the Peering state check won't catch it going down.
8375 if (!advmap
.osdmap
->is_up(auth_log_shard
.osd
)) {
8376 ldout(pg
->cct
, 10) << "GetLog: auth_log_shard osd."
8377 << auth_log_shard
.osd
<< " went down" << dendl
;
8379 return transit
< Reset
>();
8382 // let the Peering state do its checks.
8383 return forward_event();
8386 boost::statechart::result
PG::RecoveryState::GetLog::react(const MLogRec
& logevt
)
8388 PG
*pg
= context
< RecoveryMachine
>().pg
;
8390 if (logevt
.from
!= auth_log_shard
) {
8391 ldout(pg
->cct
, 10) << "GetLog: discarding log from "
8392 << "non-auth_log_shard osd." << logevt
.from
<< dendl
;
8393 return discard_event();
8395 ldout(pg
->cct
, 10) << "GetLog: received master log from osd"
8396 << logevt
.from
<< dendl
;
8398 post_event(GotLog());
8399 return discard_event();
8402 boost::statechart::result
PG::RecoveryState::GetLog::react(const GotLog
&)
8404 PG
*pg
= context
< RecoveryMachine
>().pg
;
8405 ldout(pg
->cct
, 10) << "leaving GetLog" << dendl
;
8407 ldout(pg
->cct
, 10) << "processing master log" << dendl
;
8408 pg
->proc_master_log(*context
<RecoveryMachine
>().get_cur_transaction(),
8409 msg
->info
, msg
->log
, msg
->missing
,
8413 context
< RecoveryMachine
>().get_cur_transaction(),
8414 context
< RecoveryMachine
>().get_on_applied_context_list(),
8415 context
< RecoveryMachine
>().get_on_safe_context_list());
8416 return transit
< GetMissing
>();
8419 boost::statechart::result
PG::RecoveryState::GetLog::react(const QueryState
& q
)
8421 q
.f
->open_object_section("state");
8422 q
.f
->dump_string("name", state_name
);
8423 q
.f
->dump_stream("enter_time") << enter_time
;
8424 q
.f
->dump_stream("auth_log_shard") << auth_log_shard
;
8425 q
.f
->close_section();
8426 return forward_event();
8429 void PG::RecoveryState::GetLog::exit()
8431 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8432 PG
*pg
= context
< RecoveryMachine
>().pg
;
8433 utime_t dur
= ceph_clock_now() - enter_time
;
8434 pg
->osd
->recoverystate_perf
->tinc(rs_getlog_latency
, dur
);
8435 pg
->blocked_by
.clear();
8436 pg
->publish_stats_to_osd();
8439 /*------WaitActingChange--------*/
8440 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx
)
8442 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitActingChange")
8444 context
< RecoveryMachine
>().log_enter(state_name
);
8447 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const AdvMap
& advmap
)
8449 PG
*pg
= context
< RecoveryMachine
>().pg
;
8450 OSDMapRef osdmap
= advmap
.osdmap
;
8452 ldout(pg
->cct
, 10) << "verifying no want_acting " << pg
->want_acting
<< " targets didn't go down" << dendl
;
8453 for (vector
<int>::iterator p
= pg
->want_acting
.begin(); p
!= pg
->want_acting
.end(); ++p
) {
8454 if (!osdmap
->is_up(*p
)) {
8455 ldout(pg
->cct
, 10) << " want_acting target osd." << *p
<< " went down, resetting" << dendl
;
8457 return transit
< Reset
>();
8460 return forward_event();
8463 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MLogRec
& logevt
)
8465 PG
*pg
= context
< RecoveryMachine
>().pg
;
8466 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MLocRec" << dendl
;
8467 return discard_event();
8470 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MInfoRec
& evt
)
8472 PG
*pg
= context
< RecoveryMachine
>().pg
;
8473 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl
;
8474 return discard_event();
8477 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MNotifyRec
& evt
)
8479 PG
*pg
= context
< RecoveryMachine
>().pg
;
8480 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl
;
8481 return discard_event();
8484 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const QueryState
& q
)
8486 q
.f
->open_object_section("state");
8487 q
.f
->dump_string("name", state_name
);
8488 q
.f
->dump_stream("enter_time") << enter_time
;
8489 q
.f
->dump_string("comment", "waiting for pg acting set to change");
8490 q
.f
->close_section();
8491 return forward_event();
8494 void PG::RecoveryState::WaitActingChange::exit()
8496 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8497 PG
*pg
= context
< RecoveryMachine
>().pg
;
8498 utime_t dur
= ceph_clock_now() - enter_time
;
8499 pg
->osd
->recoverystate_perf
->tinc(rs_waitactingchange_latency
, dur
);
8502 /*------Down--------*/
8503 PG::RecoveryState::Down::Down(my_context ctx
)
8505 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Down")
8507 context
< RecoveryMachine
>().log_enter(state_name
);
8508 PG
*pg
= context
< RecoveryMachine
>().pg
;
8510 pg
->state_clear(PG_STATE_PEERING
);
8511 pg
->state_set(PG_STATE_DOWN
);
8513 auto &prior_set
= context
< Peering
>().prior_set
;
8514 assert(pg
->blocked_by
.empty());
8515 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
8516 pg
->publish_stats_to_osd();
8519 void PG::RecoveryState::Down::exit()
8521 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8522 PG
*pg
= context
< RecoveryMachine
>().pg
;
8524 pg
->state_clear(PG_STATE_DOWN
);
8525 utime_t dur
= ceph_clock_now() - enter_time
;
8526 pg
->osd
->recoverystate_perf
->tinc(rs_down_latency
, dur
);
8528 pg
->blocked_by
.clear();
8529 pg
->publish_stats_to_osd();
8532 boost::statechart::result
PG::RecoveryState::Down::react(const QueryState
& q
)
8534 q
.f
->open_object_section("state");
8535 q
.f
->dump_string("name", state_name
);
8536 q
.f
->dump_stream("enter_time") << enter_time
;
8537 q
.f
->dump_string("comment",
8538 "not enough up instances of this PG to go active");
8539 q
.f
->close_section();
8540 return forward_event();
8543 /*------Incomplete--------*/
8544 PG::RecoveryState::Incomplete::Incomplete(my_context ctx
)
8546 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Incomplete")
8548 context
< RecoveryMachine
>().log_enter(state_name
);
8549 PG
*pg
= context
< RecoveryMachine
>().pg
;
8551 pg
->state_clear(PG_STATE_PEERING
);
8552 pg
->state_set(PG_STATE_INCOMPLETE
);
8554 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
8555 assert(pg
->blocked_by
.empty());
8556 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
8557 pg
->publish_stats_to_osd();
8560 boost::statechart::result
PG::RecoveryState::Incomplete::react(const AdvMap
&advmap
) {
8561 PG
*pg
= context
< RecoveryMachine
>().pg
;
8562 int64_t poolnum
= pg
->info
.pgid
.pool();
8564 // Reset if min_size turn smaller than previous value, pg might now be able to go active
8565 if (!advmap
.osdmap
->have_pg_pool(poolnum
) ||
8566 advmap
.lastmap
->get_pools().find(poolnum
)->second
.min_size
>
8567 advmap
.osdmap
->get_pools().find(poolnum
)->second
.min_size
) {
8569 return transit
< Reset
>();
8572 return forward_event();
8575 boost::statechart::result
PG::RecoveryState::Incomplete::react(const MNotifyRec
& notevt
) {
8576 PG
*pg
= context
< RecoveryMachine
>().pg
;
8577 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
8578 if (pg
->proc_replica_info(
8579 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
)) {
8580 // We got something new, try again!
8581 return transit
< GetLog
>();
8583 return discard_event();
8587 boost::statechart::result
PG::RecoveryState::Incomplete::react(
8588 const QueryState
& q
)
8590 q
.f
->open_object_section("state");
8591 q
.f
->dump_string("name", state_name
);
8592 q
.f
->dump_stream("enter_time") << enter_time
;
8593 q
.f
->dump_string("comment", "not enough complete instances of this PG");
8594 q
.f
->close_section();
8595 return forward_event();
8598 void PG::RecoveryState::Incomplete::exit()
8600 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8601 PG
*pg
= context
< RecoveryMachine
>().pg
;
8603 pg
->state_clear(PG_STATE_INCOMPLETE
);
8604 utime_t dur
= ceph_clock_now() - enter_time
;
8605 pg
->osd
->recoverystate_perf
->tinc(rs_incomplete_latency
, dur
);
8607 pg
->blocked_by
.clear();
8608 pg
->publish_stats_to_osd();
8611 /*------GetMissing--------*/
8612 PG::RecoveryState::GetMissing::GetMissing(my_context ctx
)
8614 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetMissing")
8616 context
< RecoveryMachine
>().log_enter(state_name
);
8618 PG
*pg
= context
< RecoveryMachine
>().pg
;
8619 assert(!pg
->actingbackfill
.empty());
8621 for (set
<pg_shard_t
>::iterator i
= pg
->actingbackfill
.begin();
8622 i
!= pg
->actingbackfill
.end();
8624 if (*i
== pg
->get_primary()) continue;
8625 const pg_info_t
& pi
= pg
->peer_info
[*i
];
8626 // reset this so to make sure the pg_missing_t is initialized and
8627 // has the correct semantics even if we don't need to get a
8628 // missing set from a shard. This way later additions due to
8629 // lost+unfound delete work properly.
8630 pg
->peer_missing
[*i
].may_include_deletes
= !pg
->perform_deletes_during_peering();
8633 continue; // no pg data, nothing divergent
8635 if (pi
.last_update
< pg
->pg_log
.get_tail()) {
8636 ldout(pg
->cct
, 10) << " osd." << *i
<< " is not contiguous, will restart backfill" << dendl
;
8637 pg
->peer_missing
[*i
].clear();
8640 if (pi
.last_backfill
== hobject_t()) {
8641 ldout(pg
->cct
, 10) << " osd." << *i
<< " will fully backfill; can infer empty missing set" << dendl
;
8642 pg
->peer_missing
[*i
].clear();
8646 if (pi
.last_update
== pi
.last_complete
&& // peer has no missing
8647 pi
.last_update
== pg
->info
.last_update
) { // peer is up to date
8648 // replica has no missing and identical log as us. no need to
8650 // FIXME: we can do better here. if last_update==last_complete we
8651 // can infer the rest!
8652 ldout(pg
->cct
, 10) << " osd." << *i
<< " has no missing, identical log" << dendl
;
8653 pg
->peer_missing
[*i
].clear();
8657 // We pull the log from the peer's last_epoch_started to ensure we
8658 // get enough log to detect divergent updates.
8659 since
.epoch
= pi
.last_epoch_started
;
8660 assert(pi
.last_update
>= pg
->info
.log_tail
); // or else choose_acting() did a bad thing
8661 if (pi
.log_tail
<= since
) {
8662 ldout(pg
->cct
, 10) << " requesting log+missing since " << since
<< " from osd." << *i
<< dendl
;
8663 context
< RecoveryMachine
>().send_query(
8667 i
->shard
, pg
->pg_whoami
.shard
,
8668 since
, pg
->info
.history
,
8669 pg
->get_osdmap()->get_epoch()));
8671 ldout(pg
->cct
, 10) << " requesting fulllog+missing from osd." << *i
8672 << " (want since " << since
<< " < log.tail "
8673 << pi
.log_tail
<< ")" << dendl
;
8674 context
< RecoveryMachine
>().send_query(
8676 pg_query_t::FULLLOG
,
8677 i
->shard
, pg
->pg_whoami
.shard
,
8678 pg
->info
.history
, pg
->get_osdmap()->get_epoch()));
8680 peer_missing_requested
.insert(*i
);
8681 pg
->blocked_by
.insert(i
->osd
);
8684 if (peer_missing_requested
.empty()) {
8685 if (pg
->need_up_thru
) {
8686 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
8688 post_event(NeedUpThru());
8693 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8695 pg
->publish_stats_to_osd();
8699 boost::statechart::result
PG::RecoveryState::GetMissing::react(const MLogRec
& logevt
)
8701 PG
*pg
= context
< RecoveryMachine
>().pg
;
8703 peer_missing_requested
.erase(logevt
.from
);
8704 pg
->proc_replica_log(logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
8706 if (peer_missing_requested
.empty()) {
8707 if (pg
->need_up_thru
) {
8708 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
8710 post_event(NeedUpThru());
8712 ldout(pg
->cct
, 10) << "Got last missing, don't need missing "
8713 << "posting Activate" << dendl
;
8714 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8717 return discard_event();
8720 boost::statechart::result
PG::RecoveryState::GetMissing::react(const QueryState
& q
)
8722 PG
*pg
= context
< RecoveryMachine
>().pg
;
8723 q
.f
->open_object_section("state");
8724 q
.f
->dump_string("name", state_name
);
8725 q
.f
->dump_stream("enter_time") << enter_time
;
8727 q
.f
->open_array_section("peer_missing_requested");
8728 for (set
<pg_shard_t
>::iterator p
= peer_missing_requested
.begin();
8729 p
!= peer_missing_requested
.end();
8731 q
.f
->open_object_section("osd");
8732 q
.f
->dump_stream("osd") << *p
;
8733 if (pg
->peer_missing
.count(*p
)) {
8734 q
.f
->open_object_section("got_missing");
8735 pg
->peer_missing
[*p
].dump(q
.f
);
8736 q
.f
->close_section();
8738 q
.f
->close_section();
8740 q
.f
->close_section();
8742 q
.f
->close_section();
8743 return forward_event();
8746 void PG::RecoveryState::GetMissing::exit()
8748 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8749 PG
*pg
= context
< RecoveryMachine
>().pg
;
8750 utime_t dur
= ceph_clock_now() - enter_time
;
8751 pg
->osd
->recoverystate_perf
->tinc(rs_getmissing_latency
, dur
);
8752 pg
->blocked_by
.clear();
8753 pg
->publish_stats_to_osd();
8756 /*------WaitUpThru--------*/
8757 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx
)
8759 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitUpThru")
8761 context
< RecoveryMachine
>().log_enter(state_name
);
8764 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const ActMap
& am
)
8766 PG
*pg
= context
< RecoveryMachine
>().pg
;
8767 if (!pg
->need_up_thru
) {
8768 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8770 return forward_event();
8773 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const MLogRec
& logevt
)
8775 PG
*pg
= context
< RecoveryMachine
>().pg
;
8776 ldout(pg
->cct
, 10) << "Noting missing from osd." << logevt
.from
<< dendl
;
8777 pg
->peer_missing
[logevt
.from
].claim(logevt
.msg
->missing
);
8778 pg
->peer_info
[logevt
.from
] = logevt
.msg
->info
;
8779 return discard_event();
8782 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const QueryState
& q
)
8784 q
.f
->open_object_section("state");
8785 q
.f
->dump_string("name", state_name
);
8786 q
.f
->dump_stream("enter_time") << enter_time
;
8787 q
.f
->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8788 q
.f
->close_section();
8789 return forward_event();
8792 void PG::RecoveryState::WaitUpThru::exit()
8794 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8795 PG
*pg
= context
< RecoveryMachine
>().pg
;
8796 utime_t dur
= ceph_clock_now() - enter_time
;
8797 pg
->osd
->recoverystate_perf
->tinc(rs_waitupthru_latency
, dur
);
8800 /*----RecoveryState::RecoveryMachine Methods-----*/
8802 #define dout_prefix *_dout << pg->gen_prefix()
8804 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name
)
8806 PG
*pg
= context
< RecoveryMachine
>().pg
;
8807 ldout(pg
->cct
, 5) << "enter " << state_name
<< dendl
;
8808 pg
->osd
->pg_recovery_stats
.log_enter(state_name
);
8811 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name
, utime_t enter_time
)
8813 utime_t dur
= ceph_clock_now() - enter_time
;
8814 PG
*pg
= context
< RecoveryMachine
>().pg
;
8815 ldout(pg
->cct
, 5) << "exit " << state_name
<< " " << dur
<< " " << event_count
<< " " << event_time
<< dendl
;
8816 pg
->osd
->pg_recovery_stats
.log_exit(state_name
, ceph_clock_now() - enter_time
,
8817 event_count
, event_time
);
8819 event_time
= utime_t();
8823 /*---------------------------------------------------*/
8825 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8827 void PG::RecoveryState::start_handle(RecoveryCtx
*new_ctx
) {
8832 if (messages_pending_flush
) {
8833 rctx
= RecoveryCtx(*messages_pending_flush
, *new_ctx
);
8837 rctx
->start_time
= ceph_clock_now();
8841 void PG::RecoveryState::begin_block_outgoing() {
8842 assert(!messages_pending_flush
);
8845 messages_pending_flush
= BufferedRecoveryMessages();
8846 rctx
= RecoveryCtx(*messages_pending_flush
, *orig_ctx
);
8849 void PG::RecoveryState::clear_blocked_outgoing() {
8852 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
8855 void PG::RecoveryState::end_block_outgoing() {
8856 assert(messages_pending_flush
);
8860 rctx
= RecoveryCtx(*orig_ctx
);
8861 rctx
->accept_buffered_messages(*messages_pending_flush
);
8862 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
8865 void PG::RecoveryState::end_handle() {
8867 utime_t dur
= ceph_clock_now() - rctx
->start_time
;
8868 machine
.event_time
+= dur
;
8871 machine
.event_count
++;
8872 rctx
= boost::optional
<RecoveryCtx
>();
8876 ostream
& operator<<(ostream
& out
, const PG::BackfillInterval
& bi
)
8878 out
<< "BackfillInfo(" << bi
.begin
<< "-" << bi
.end
8879 << " " << bi
.objects
.size() << " objects";
8880 if (!bi
.objects
.empty())
8881 out
<< " " << bi
.objects
;
8886 void intrusive_ptr_add_ref(PG
*pg
) { pg
->get("intptr"); }
8887 void intrusive_ptr_release(PG
*pg
) { pg
->put("intptr"); }
8889 #ifdef PG_DEBUG_REFS
8890 uint64_t get_with_id(PG
*pg
) { return pg
->get_with_id(); }
8891 void put_with_id(PG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }