1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
21 #include "common/errno.h"
22 #include "common/config.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDRepOp.h"
54 #include "messages/MOSDRepOpReply.h"
55 #include "messages/MOSDRepScrubMap.h"
56 #include "messages/MOSDPGRecoveryDelete.h"
57 #include "messages/MOSDPGRecoveryDeleteReply.h"
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
69 #define tracepoint(...)
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
77 #define dout_prefix _prefix(_dout, this)
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
81 const string
infover_key("_infover");
82 const string
info_key("_info");
83 const string
biginfo_key("_biginfo");
84 const string
epoch_key("_epoch");
85 const string
fastinfo_key("_fastinfo");
88 static ostream
& _prefix(std::ostream
*_dout
, T
*t
)
90 return t
->gen_prefix(*_dout
);
93 void PGStateHistory::enter(PG
* pg
, const utime_t entime
, const char* state
)
95 // Ignore trimming state machine for now
96 if (::strstr(state
, "Trimming") != NULL
) {
98 } else if (pi
!= nullptr) {
99 pi
->enter_state(entime
, state
);
101 // Store current state since we can't reliably take the PG lock here
102 if ( tmppi
== nullptr) {
103 tmppi
= std::unique_ptr
<PGStateInstance
>(new PGStateInstance
);
107 tmppi
->enter_state(entime
, state
);
111 void PGStateHistory::exit(const char* state
) {
112 // Ignore trimming state machine for now
113 // Do nothing if PG is being destroyed!
114 if (::strstr(state
, "Trimming") != NULL
|| pg_in_destructor
) {
117 bool ilocked
= false;
118 if(!thispg
->is_locked()) {
123 buffer
.push_back(std::unique_ptr
<PGStateInstance
>(tmppi
.release()));
124 pi
= buffer
.back().get();
125 pi
->setepoch(thispg
->get_osdmap_epoch());
128 pi
->exit_state(ceph_clock_now());
129 if (::strcmp(state
, "Reset") == 0) {
138 void PGStateHistory::dump(Formatter
* f
) const {
139 f
->open_array_section("history");
140 for (auto pi
= buffer
.begin(); pi
!= buffer
.end(); ++pi
) {
141 f
->open_object_section("states");
142 f
->dump_stream("epoch") << (*pi
)->this_epoch
;
143 for (auto she
: (*pi
)->state_history
) {
144 f
->dump_string("state", std::get
<2>(she
));
145 f
->dump_stream("enter") << std::get
<0>(she
);
146 f
->dump_stream("exit") << std::get
<1>(she
);
153 void PG::get(const char* tag
)
156 lgeneric_subdout(cct
, refs
, 5) << "PG::get " << this << " "
157 << "tag " << (tag
? tag
: "(none") << " "
158 << (after
- 1) << " -> " << after
<< dendl
;
160 std::lock_guard
l(_ref_id_lock
);
165 void PG::put(const char* tag
)
169 std::lock_guard
l(_ref_id_lock
);
170 auto tag_counts_entry
= _tag_counts
.find(tag
);
171 ceph_assert(tag_counts_entry
!= _tag_counts
.end());
172 --tag_counts_entry
->second
;
173 if (tag_counts_entry
->second
== 0) {
174 _tag_counts
.erase(tag_counts_entry
);
178 auto local_cct
= cct
;
180 lgeneric_subdout(local_cct
, refs
, 5) << "PG::put " << this << " "
181 << "tag " << (tag
? tag
: "(none") << " "
182 << (after
+ 1) << " -> " << after
189 uint64_t PG::get_with_id()
192 std::lock_guard
l(_ref_id_lock
);
193 uint64_t id
= ++_ref_id
;
197 lgeneric_subdout(cct
, refs
, 5) << "PG::get " << this << " " << info
.pgid
198 << " got id " << id
<< " "
199 << (ref
- 1) << " -> " << ref
201 ceph_assert(!_live_ids
.count(id
));
202 _live_ids
.insert(make_pair(id
, ss
.str()));
206 void PG::put_with_id(uint64_t id
)
209 lgeneric_subdout(cct
, refs
, 5) << "PG::put " << this << " " << info
.pgid
210 << " put id " << id
<< " "
211 << (newref
+ 1) << " -> " << newref
214 std::lock_guard
l(_ref_id_lock
);
215 ceph_assert(_live_ids
.count(id
));
222 void PG::dump_live_ids()
224 std::lock_guard
l(_ref_id_lock
);
225 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live ids:" << dendl
;
226 for (map
<uint64_t, string
>::iterator i
= _live_ids
.begin();
227 i
!= _live_ids
.end();
229 dout(0) << "\t\tid: " << *i
<< dendl
;
231 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live tags:" << dendl
;
232 for (map
<string
, uint64_t>::iterator i
= _tag_counts
.begin();
233 i
!= _tag_counts
.end();
235 dout(0) << "\t\tid: " << *i
<< dendl
;
241 void PGPool::update(CephContext
*cct
, OSDMapRef map
)
243 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
245 return; // pool has been deleted
248 name
= map
->get_pool_name(id
);
250 bool updated
= false;
251 if ((map
->get_epoch() != cached_epoch
+ 1) ||
252 (pi
->get_snap_epoch() == map
->get_epoch())) {
256 if (map
->require_osd_release
>= CEPH_RELEASE_MIMIC
) {
257 // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps
258 // in the pg_info_t, with deltas for both in each OSDMap. we don't
259 // need to (and can't) track it here.
260 cached_removed_snaps
.clear();
261 newly_removed_snaps
.clear();
263 // legacy (<= luminous) removed_snaps tracking
265 if (pi
->maybe_updated_removed_snaps(cached_removed_snaps
)) {
266 pi
->build_removed_snaps(newly_removed_snaps
);
267 if (cached_removed_snaps
.subset_of(newly_removed_snaps
)) {
268 interval_set
<snapid_t
> removed_snaps
= newly_removed_snaps
;
269 newly_removed_snaps
.subtract(cached_removed_snaps
);
270 cached_removed_snaps
.swap(removed_snaps
);
272 lgeneric_subdout(cct
, osd
, 0) << __func__
273 << " cached_removed_snaps shrank from " << cached_removed_snaps
274 << " to " << newly_removed_snaps
<< dendl
;
275 cached_removed_snaps
.swap(newly_removed_snaps
);
276 newly_removed_snaps
.clear();
279 newly_removed_snaps
.clear();
282 /* 1) map->get_epoch() == cached_epoch + 1 &&
283 * 2) pi->get_snap_epoch() != map->get_epoch()
285 * From the if branch, 1 && 2 must be true. From 2, we know that
286 * this map didn't change the set of removed snaps. From 1, we
287 * know that our cached_removed_snaps matches the previous map.
288 * Thus, from 1 && 2, cached_removed snaps matches the current
289 * set of removed snaps and all we have to do is clear
290 * newly_removed_snaps.
292 newly_removed_snaps
.clear();
294 lgeneric_subdout(cct
, osd
, 20)
295 << "PGPool::update cached_removed_snaps "
296 << cached_removed_snaps
297 << " newly_removed_snaps "
298 << newly_removed_snaps
299 << " snapc " << snapc
300 << (updated
? " (updated)":" (no change)")
302 if (cct
->_conf
->osd_debug_verify_cached_snaps
) {
303 interval_set
<snapid_t
> actual_removed_snaps
;
304 pi
->build_removed_snaps(actual_removed_snaps
);
305 if (!(actual_removed_snaps
== cached_removed_snaps
)) {
306 lgeneric_derr(cct
) << __func__
307 << ": mismatch between the actual removed snaps "
308 << actual_removed_snaps
309 << " and pool.cached_removed_snaps "
310 << " pool.cached_removed_snaps " << cached_removed_snaps
313 ceph_assert(actual_removed_snaps
== cached_removed_snaps
);
316 if (info
.is_pool_snaps_mode() && updated
) {
317 snapc
= pi
->get_snap_context();
319 cached_epoch
= map
->get_epoch();
322 PG::PG(OSDService
*o
, OSDMapRef curmap
,
323 const PGPool
&_pool
, spg_t p
) :
330 osdriver(osd
->store
, coll_t(), OSD::make_snapmapper_oid()),
335 p
.get_split_bits(_pool
.info
.get_pg_num()),
338 last_persisted_osdmap(curmap
->get_epoch()),
340 trace_endpoint("0.0.0.0", 0, "PG"),
341 dirty_info(false), dirty_big_info(false),
345 pgmeta_oid(p
.make_pgmeta_oid()),
347 stat_queue_item(this),
349 recovery_queued(false),
350 recovery_ops_active(0),
354 pg_whoami(osd
->whoami
, p
.shard
),
356 last_peering_reset(0),
357 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
358 backfill_reserved(false),
359 backfill_reserving(false),
360 flushes_in_progress(0),
361 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
362 pg_stats_publish_valid(false),
363 finish_sync_event(NULL
),
364 backoff_lock("PG::backoff_lock"),
365 scrub_after_recovery(false),
367 recovery_state(this),
368 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
369 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
370 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
372 last_require_osd_release(curmap
->require_osd_release
)
375 osd
->add_pgid(p
, this);
378 std::stringstream ss
;
379 ss
<< "PG " << info
.pgid
;
380 trace_endpoint
.copy_name(ss
.str());
386 pgstate_history
.set_pg_in_destructor();
388 osd
->remove_pgid(info
.pgid
, this);
392 void PG::lock(bool no_lockdep
) const
394 _lock
.Lock(no_lockdep
);
395 // if we have unrecorded dirty state with the lock dropped, there is a bug
396 ceph_assert(!dirty_info
);
397 ceph_assert(!dirty_big_info
);
399 dout(30) << "lock" << dendl
;
402 std::ostream
& PG::gen_prefix(std::ostream
& out
) const
404 OSDMapRef mapref
= osdmap_ref
;
405 if (_lock
.is_locked_by_me()) {
406 out
<< "osd." << osd
->whoami
407 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
408 << " " << *this << " ";
410 out
<< "osd." << osd
->whoami
411 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
412 << " pg[" << info
.pgid
<< "(unlocked)] ";
417 /********* PG **********/
419 void PG::proc_master_log(
420 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
421 pg_log_t
&olog
, pg_missing_t
& omissing
, pg_shard_t from
)
423 dout(10) << "proc_master_log for osd." << from
<< ": "
424 << olog
<< " " << omissing
<< dendl
;
425 ceph_assert(!is_peered() && is_primary());
427 // merge log into our own log to build master log. no need to
428 // make any adjustments to their missing map; we are taking their
429 // log to be authoritative (i.e., their entries are by definitely
431 merge_log(t
, oinfo
, olog
, from
);
432 peer_info
[from
] = oinfo
;
433 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
434 might_have_unfound
.insert(from
);
436 // See doc/dev/osd_internals/last_epoch_started
437 if (oinfo
.last_epoch_started
> info
.last_epoch_started
) {
438 info
.last_epoch_started
= oinfo
.last_epoch_started
;
441 if (oinfo
.last_interval_started
> info
.last_interval_started
) {
442 info
.last_interval_started
= oinfo
.last_interval_started
;
445 update_history(oinfo
.history
);
446 ceph_assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
447 info
.last_epoch_started
>= info
.history
.last_epoch_started
);
449 peer_missing
[from
].claim(omissing
);
452 void PG::proc_replica_log(
454 const pg_log_t
&olog
,
455 pg_missing_t
& omissing
,
458 dout(10) << "proc_replica_log for osd." << from
<< ": "
459 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
461 pg_log
.proc_replica_log(oinfo
, olog
, omissing
, from
);
463 peer_info
[from
] = oinfo
;
464 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
465 might_have_unfound
.insert(from
);
467 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
468 omissing
.get_items().begin();
469 i
!= omissing
.get_items().end();
471 dout(20) << " after missing " << i
->first
<< " need " << i
->second
.need
472 << " have " << i
->second
.have
<< dendl
;
474 peer_missing
[from
].claim(omissing
);
477 bool PG::proc_replica_info(
478 pg_shard_t from
, const pg_info_t
&oinfo
, epoch_t send_epoch
)
480 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.find(from
);
481 if (p
!= peer_info
.end() && p
->second
.last_update
== oinfo
.last_update
) {
482 dout(10) << " got dup osd." << from
<< " info " << oinfo
<< ", identical to ours" << dendl
;
486 if (!get_osdmap()->has_been_up_since(from
.osd
, send_epoch
)) {
487 dout(10) << " got info " << oinfo
<< " from down osd." << from
488 << " discarding" << dendl
;
492 dout(10) << " got osd." << from
<< " " << oinfo
<< dendl
;
493 ceph_assert(is_primary());
494 peer_info
[from
] = oinfo
;
495 might_have_unfound
.insert(from
);
497 update_history(oinfo
.history
);
500 if (!is_up(from
) && !is_acting(from
)) {
501 dout(10) << " osd." << from
<< " has stray content: " << oinfo
<< dendl
;
502 stray_set
.insert(from
);
508 // was this a new info? if so, update peers!
509 if (p
== peer_info
.end())
510 update_heartbeat_peers();
515 void PG::remove_snap_mapped_object(
516 ObjectStore::Transaction
&t
, const hobject_t
&soid
)
520 ghobject_t(soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
));
521 clear_object_snap_mapping(&t
, soid
);
524 void PG::clear_object_snap_mapping(
525 ObjectStore::Transaction
*t
, const hobject_t
&soid
)
527 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
528 if (soid
.snap
< CEPH_MAXSNAP
) {
529 int r
= snap_mapper
.remove_oid(
532 if (!(r
== 0 || r
== -ENOENT
)) {
533 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
539 void PG::update_object_snap_mapping(
540 ObjectStore::Transaction
*t
, const hobject_t
&soid
, const set
<snapid_t
> &snaps
)
542 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
543 ceph_assert(soid
.snap
< CEPH_MAXSNAP
);
544 int r
= snap_mapper
.remove_oid(
547 if (!(r
== 0 || r
== -ENOENT
)) {
548 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
558 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
, pg_shard_t from
)
560 PGLogEntryHandler rollbacker
{this, &t
};
562 oinfo
, olog
, from
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
565 void PG::rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
)
567 PGLogEntryHandler rollbacker
{this, &t
};
568 pg_log
.rewind_divergent_log(
569 newhead
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
573 * Process information from a replica to determine if it could have any
574 * objects that i need.
576 * TODO: if the missing set becomes very large, this could get expensive.
577 * Instead, we probably want to just iterate over our unfound set.
579 bool PG::search_for_missing(
580 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
584 uint64_t num_unfound_before
= missing_loc
.num_unfound();
585 bool found_missing
= missing_loc
.add_source_info(
586 from
, oinfo
, omissing
, ctx
->handle
);
587 if (found_missing
&& num_unfound_before
!= missing_loc
.num_unfound())
588 publish_stats_to_osd();
589 // avoid doing this if the peer is empty. This is abit of paranoia
590 // to avoid doing something rash if add_source_info() above
591 // incorrectly decided we found something new. (if the peer has
592 // last_update=0'0 that's impossible.)
594 oinfo
.last_update
!= eversion_t()) {
595 pg_info_t
tinfo(oinfo
);
596 tinfo
.pgid
.shard
= pg_whoami
.shard
;
597 (*(ctx
->info_map
))[from
.osd
].push_back(
600 from
.shard
, pg_whoami
.shard
,
606 return found_missing
;
612 bool PG::MissingLoc::readable_with_acting(
613 const hobject_t
&hoid
,
614 const set
<pg_shard_t
> &acting
) const {
615 if (!needs_recovery(hoid
))
617 if (is_deleted(hoid
))
619 auto missing_loc_entry
= missing_loc
.find(hoid
);
620 if (missing_loc_entry
== missing_loc
.end())
622 const set
<pg_shard_t
> &locs
= missing_loc_entry
->second
;
623 ldout(pg
->cct
, 10) << __func__
<< ": locs:" << locs
<< dendl
;
624 set
<pg_shard_t
> have_acting
;
625 for (set
<pg_shard_t
>::const_iterator i
= locs
.begin();
628 if (acting
.count(*i
))
629 have_acting
.insert(*i
);
631 return (*is_readable
)(have_acting
);
634 void PG::MissingLoc::add_batch_sources_info(
635 const set
<pg_shard_t
> &sources
, ThreadPool::TPHandle
* handle
)
637 ldout(pg
->cct
, 10) << __func__
<< ": adding sources in batch "
638 << sources
.size() << dendl
;
640 bool sources_updated
= false;
641 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= needs_recovery_map
.begin();
642 i
!= needs_recovery_map
.end();
644 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
645 handle
->reset_tp_timeout();
648 if (i
->second
.is_delete())
651 auto p
= missing_loc
.find(i
->first
);
652 if (p
== missing_loc
.end()) {
653 p
= missing_loc
.emplace(i
->first
, set
<pg_shard_t
>()).first
;
655 _dec_count(p
->second
);
657 missing_loc
[i
->first
].insert(sources
.begin(), sources
.end());
658 _inc_count(p
->second
);
660 if (!sources_updated
) {
661 missing_loc_sources
.insert(sources
.begin(), sources
.end());
662 sources_updated
= true;
667 bool PG::MissingLoc::add_source_info(
669 const pg_info_t
&oinfo
,
670 const pg_missing_t
&omissing
,
671 ThreadPool::TPHandle
* handle
)
673 bool found_missing
= false;
675 bool sources_updated
= false;
677 for (map
<hobject_t
,pg_missing_item
>::const_iterator p
= needs_recovery_map
.begin();
678 p
!= needs_recovery_map
.end();
680 const hobject_t
&soid(p
->first
);
681 eversion_t need
= p
->second
.need
;
682 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
683 handle
->reset_tp_timeout();
686 if (p
->second
.is_delete()) {
687 ldout(pg
->cct
, 10) << __func__
<< " " << soid
688 << " delete, ignoring source" << dendl
;
691 if (oinfo
.last_update
< need
) {
692 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
693 << " also missing on osd." << fromosd
694 << " (last_update " << oinfo
.last_update
695 << " < needed " << need
<< ")" << dendl
;
698 if (!oinfo
.last_backfill
.is_max() &&
699 !oinfo
.last_backfill_bitwise
) {
700 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
701 << " also missing on osd." << fromosd
702 << " (last_backfill " << oinfo
.last_backfill
703 << " but with wrong sort order)"
707 if (p
->first
>= oinfo
.last_backfill
) {
708 // FIXME: this is _probably_ true, although it could conceivably
709 // be in the undefined region! Hmm!
710 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
711 << " also missing on osd." << fromosd
712 << " (past last_backfill " << oinfo
.last_backfill
716 if (omissing
.is_missing(soid
)) {
717 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
718 << " also missing on osd." << fromosd
<< dendl
;
722 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
723 << " is on osd." << fromosd
<< dendl
;
726 auto p
= missing_loc
.find(soid
);
727 if (p
== missing_loc
.end()) {
728 p
= missing_loc
.emplace(soid
, set
<pg_shard_t
>()).first
;
730 _dec_count(p
->second
);
732 p
->second
.insert(fromosd
);
733 _inc_count(p
->second
);
736 if (!sources_updated
) {
737 missing_loc_sources
.insert(fromosd
);
738 sources_updated
= true;
740 found_missing
= true;
743 ldout(pg
->cct
, 20) << "needs_recovery_map missing " << needs_recovery_map
745 return found_missing
;
748 void PG::MissingLoc::check_recovery_sources(const OSDMapRef
& osdmap
)
750 set
<pg_shard_t
> now_down
;
751 for (set
<pg_shard_t
>::iterator p
= missing_loc_sources
.begin();
752 p
!= missing_loc_sources
.end();
754 if (osdmap
->is_up(p
->osd
)) {
758 ldout(pg
->cct
, 10) << __func__
<< " source osd." << *p
<< " now down" << dendl
;
760 missing_loc_sources
.erase(p
++);
763 if (now_down
.empty()) {
764 ldout(pg
->cct
, 10) << __func__
<< " no source osds (" << missing_loc_sources
<< ") went down" << dendl
;
766 ldout(pg
->cct
, 10) << __func__
<< " sources osds " << now_down
<< " now down, remaining sources are "
767 << missing_loc_sources
<< dendl
;
769 // filter missing_loc
770 map
<hobject_t
, set
<pg_shard_t
>>::iterator p
= missing_loc
.begin();
771 while (p
!= missing_loc
.end()) {
772 set
<pg_shard_t
>::iterator q
= p
->second
.begin();
773 bool changed
= false;
774 while (q
!= p
->second
.end()) {
775 if (now_down
.count(*q
)) {
778 _dec_count(p
->second
);
780 p
->second
.erase(q
++);
785 if (p
->second
.empty()) {
786 missing_loc
.erase(p
++);
789 _inc_count(p
->second
);
797 void PG::discover_all_missing(map
<int, map
<spg_t
,pg_query_t
> > &query_map
)
799 auto &missing
= pg_log
.get_missing();
800 uint64_t unfound
= get_num_unfound();
802 dout(10) << __func__
<< " "
803 << missing
.num_missing() << " missing, "
804 << unfound
<< " unfound"
807 std::set
<pg_shard_t
>::const_iterator m
= might_have_unfound
.begin();
808 std::set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
809 for (; m
!= mend
; ++m
) {
812 if (!get_osdmap()->is_up(peer
.osd
)) {
813 dout(20) << __func__
<< " skipping down osd." << peer
<< dendl
;
817 if (peer_purged
.count(peer
)) {
818 dout(20) << __func__
<< " skipping purged osd." << peer
<< dendl
;
822 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(peer
);
823 if (iter
!= peer_info
.end() &&
824 (iter
->second
.is_empty() || iter
->second
.dne())) {
825 // ignore empty peers
829 // If we've requested any of this stuff, the pg_missing_t information
830 // should be on its way.
831 // TODO: coalsce requested_* into a single data structure
832 if (peer_missing
.find(peer
) != peer_missing
.end()) {
833 dout(20) << __func__
<< ": osd." << peer
834 << ": we already have pg_missing_t" << dendl
;
837 if (peer_log_requested
.find(peer
) != peer_log_requested
.end()) {
838 dout(20) << __func__
<< ": osd." << peer
839 << ": in peer_log_requested" << dendl
;
842 if (peer_missing_requested
.find(peer
) != peer_missing_requested
.end()) {
843 dout(20) << __func__
<< ": osd." << peer
844 << ": in peer_missing_requested" << dendl
;
849 dout(10) << __func__
<< ": osd." << peer
<< ": requesting pg_missing_t"
851 peer_missing_requested
.insert(peer
);
852 query_map
[peer
.osd
][spg_t(info
.pgid
.pgid
, peer
.shard
)] =
855 peer
.shard
, pg_whoami
.shard
,
856 info
.history
, get_osdmap_epoch());
860 /******* PG ***********/
861 bool PG::needs_recovery() const
863 ceph_assert(is_primary());
865 auto &missing
= pg_log
.get_missing();
867 if (missing
.num_missing()) {
868 dout(10) << __func__
<< " primary has " << missing
.num_missing()
869 << " missing" << dendl
;
873 ceph_assert(!acting_recovery_backfill
.empty());
874 set
<pg_shard_t
>::const_iterator end
= acting_recovery_backfill
.end();
875 set
<pg_shard_t
>::const_iterator a
= acting_recovery_backfill
.begin();
876 for (; a
!= end
; ++a
) {
877 if (*a
== get_primary()) continue;
878 pg_shard_t peer
= *a
;
879 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
880 if (pm
== peer_missing
.end()) {
881 dout(10) << __func__
<< " osd." << peer
<< " doesn't have missing set"
885 if (pm
->second
.num_missing()) {
886 dout(10) << __func__
<< " osd." << peer
<< " has "
887 << pm
->second
.num_missing() << " missing" << dendl
;
892 dout(10) << __func__
<< " is recovered" << dendl
;
896 bool PG::needs_backfill() const
898 ceph_assert(is_primary());
900 // We can assume that only possible osds that need backfill
901 // are on the backfill_targets vector nodes.
902 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
903 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
904 for (; a
!= end
; ++a
) {
905 pg_shard_t peer
= *a
;
906 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
907 if (!pi
->second
.last_backfill
.is_max()) {
908 dout(10) << __func__
<< " osd." << peer
<< " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
913 dout(10) << __func__
<< " does not need backfill" << dendl
;
918 void PG::check_past_interval_bounds() const
920 auto oldest_epoch
= osd
->get_superblock().oldest_map
;
921 auto rpib
= get_required_past_interval_bounds(
924 if (rpib
.first
>= rpib
.second
) {
925 // do not warn if the start bound is dictated by oldest_map; the
926 // past intervals are presumably appropriate given the pg info.
927 if (!past_intervals
.empty() &&
928 rpib
.first
> oldest_epoch
) {
929 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
930 << " empty [" << rpib
<< ") but past_intervals is not: "
932 derr
<< info
.pgid
<< " required past_interval bounds are"
933 << " empty [" << rpib
<< ") but past_intervals is not: "
934 << past_intervals
<< dendl
;
937 if (past_intervals
.empty()) {
938 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
939 << " not empty [" << rpib
<< ") but past_intervals "
940 << past_intervals
<< " is empty";
941 derr
<< info
.pgid
<< " required past_interval bounds are"
942 << " not empty [" << rpib
<< ") but past_intervals "
943 << past_intervals
<< " is empty" << dendl
;
944 ceph_assert(!past_intervals
.empty());
947 auto apib
= past_intervals
.get_bounds();
948 if (apib
.first
> rpib
.first
) {
949 osd
->clog
->error() << info
.pgid
<< " past_intervals [" << apib
950 << ") start interval does not contain the required"
951 << " bound [" << rpib
<< ") start";
952 derr
<< info
.pgid
<< " past_intervals [" << apib
953 << ") start interval does not contain the required"
954 << " bound [" << rpib
<< ") start" << dendl
;
955 ceph_abort_msg("past_interval start interval mismatch");
957 if (apib
.second
!= rpib
.second
) {
958 osd
->clog
->error() << info
.pgid
<< " past_interal bound [" << apib
959 << ") end does not match required [" << rpib
961 derr
<< info
.pgid
<< " past_interal bound [" << apib
962 << ") end does not match required [" << rpib
964 ceph_abort_msg("past_interval end mismatch");
969 bool PG::adjust_need_up_thru(const OSDMapRef osdmap
)
971 epoch_t up_thru
= osdmap
->get_up_thru(osd
->whoami
);
973 up_thru
>= info
.history
.same_interval_since
) {
974 dout(10) << "adjust_need_up_thru now " << up_thru
<< ", need_up_thru now false" << dendl
;
975 need_up_thru
= false;
981 void PG::remove_down_peer_info(const OSDMapRef osdmap
)
983 // Remove any downed osds from peer_info
984 bool removed
= false;
985 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
986 while (p
!= peer_info
.end()) {
987 if (!osdmap
->is_up(p
->first
.osd
)) {
988 dout(10) << " dropping down osd." << p
->first
<< " info " << p
->second
<< dendl
;
989 peer_missing
.erase(p
->first
);
990 peer_log_requested
.erase(p
->first
);
991 peer_missing_requested
.erase(p
->first
);
992 peer_purged
.erase(p
->first
); // so we can re-purge if necessary
993 peer_info
.erase(p
++);
999 // if we removed anyone, update peers (which include peer_info)
1001 update_heartbeat_peers();
1002 check_recovery_sources(osdmap
);
1006 * Returns true unless there is a non-lost OSD in might_have_unfound.
1008 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const
1010 ceph_assert(is_primary());
1012 set
<pg_shard_t
>::const_iterator peer
= might_have_unfound
.begin();
1013 set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
1014 for (; peer
!= mend
; ++peer
) {
1015 if (peer_missing
.count(*peer
))
1017 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(*peer
);
1018 if (iter
!= peer_info
.end() &&
1019 (iter
->second
.is_empty() || iter
->second
.dne()))
1021 if (!osdmap
->exists(peer
->osd
))
1023 const osd_info_t
&osd_info(osdmap
->get_info(peer
->osd
));
1024 if (osd_info
.lost_at
<= osd_info
.up_from
) {
1025 // If there is even one OSD in might_have_unfound that isn't lost, we
1026 // still might retrieve our unfound.
1030 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
1031 << " have been queried or are marked lost" << dendl
;
1035 PastIntervals::PriorSet
PG::build_prior()
1039 for (map
<pg_shard_t
,pg_info_t
>::iterator it
= peer_info
.begin();
1040 it
!= peer_info
.end();
1042 ceph_assert(info
.history
.last_epoch_started
>= it
->second
.history
.last_epoch_started
);
1046 const OSDMap
&osdmap
= *get_osdmap();
1047 PastIntervals::PriorSet prior
= past_intervals
.get_prior_set(
1048 pool
.info
.is_erasure(),
1049 info
.history
.last_epoch_started
,
1050 get_pgbackend()->get_is_recoverable_predicate(),
1051 [&](epoch_t start
, int osd
, epoch_t
*lost_at
) {
1052 const osd_info_t
*pinfo
= 0;
1053 if (osdmap
.exists(osd
)) {
1054 pinfo
= &osdmap
.get_info(osd
);
1056 *lost_at
= pinfo
->lost_at
;
1059 if (osdmap
.is_up(osd
)) {
1060 return PastIntervals::UP
;
1061 } else if (!pinfo
) {
1062 return PastIntervals::DNE
;
1063 } else if (pinfo
->lost_at
> start
) {
1064 return PastIntervals::LOST
;
1066 return PastIntervals::DOWN
;
1073 if (prior
.pg_down
) {
1074 state_set(PG_STATE_DOWN
);
1077 if (get_osdmap()->get_up_thru(osd
->whoami
) < info
.history
.same_interval_since
) {
1078 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
1079 << " < same_since " << info
.history
.same_interval_since
1080 << ", must notify monitor" << dendl
;
1081 need_up_thru
= true;
1083 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
1084 << " >= same_since " << info
.history
.same_interval_since
1085 << ", all is well" << dendl
;
1086 need_up_thru
= false;
1088 set_probe_targets(prior
.probe
);
1092 void PG::clear_primary_state()
1094 dout(10) << "clear_primary_state" << dendl
;
1096 // clear peering state
1098 peer_log_requested
.clear();
1099 peer_missing_requested
.clear();
1102 peer_missing
.clear();
1103 need_up_thru
= false;
1104 peer_last_complete_ondisk
.clear();
1105 peer_activated
.clear();
1106 min_last_complete_ondisk
= eversion_t();
1107 pg_trim_to
= eversion_t();
1108 might_have_unfound
.clear();
1109 projected_log
= PGLog::IndexedLog();
1111 last_update_ondisk
= eversion_t();
1115 finish_sync_event
= 0; // so that _finish_recovery doesn't go off in another thread
1117 missing_loc
.clear();
1119 release_pg_backoffs();
1121 pg_log
.reset_recovery_pointers();
1123 scrubber
.reserved_peers
.clear();
1124 scrub_after_recovery
= false;
1129 PG::Scrubber::Scrubber()
1130 : local_reserved(false), remote_reserved(false), reserve_failed(false),
1133 shallow_errors(0), deep_errors(0), fixed(0),
1134 must_scrub(false), must_deep_scrub(false), must_repair(false),
1135 need_auto(false), time_for_deep(false),
1137 check_repair(false),
1138 deep_scrub_on_error(false),
1139 num_digest_updates_pending(0),
1144 PG::Scrubber::~Scrubber() {}
// Select the authoritative log shard from a set of peer infos.
// NOTE(review): this excerpt is fragmented (loop headers lack their
// continuation/body lines, several original lines are missing); code is
// left byte-identical, comments only.
1149 * Returns an iterator to the best info in infos sorted by:
1150 * 1) Prefer newer last_update
1151 * 2) Prefer longer tail if it brings another info into contiguity
1152 * 3) Prefer current primary
1154 map
<pg_shard_t
, pg_info_t
>::const_iterator
PG::find_best_info(
1155 const map
<pg_shard_t
, pg_info_t
> &infos
,
1156 bool restrict_to_up_acting
,
1157 bool *history_les_bound
) const
1159 ceph_assert(history_les_bound
);
1160 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1161 * to make changes to this process. Also, make sure to update it
1162 * when you find bugs! */
// First pass: establish the floor for acceptable candidates.
// max_last_epoch_started_found is taken from history (unless the
// osd_find_best_info_ignore_history_les escape hatch is set) and then
// refined from the per-info last_epoch_started of complete shards;
// *history_les_bound records whether the bound came from history only.
1163 eversion_t min_last_update_acceptable
= eversion_t::max();
1164 epoch_t max_last_epoch_started_found
= 0;
1165 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1168 if (!cct
->_conf
->osd_find_best_info_ignore_history_les
&&
1169 max_last_epoch_started_found
< i
->second
.history
.last_epoch_started
) {
1170 *history_les_bound
= true;
1171 max_last_epoch_started_found
= i
->second
.history
.last_epoch_started
;
1173 if (!i
->second
.is_incomplete() &&
1174 max_last_epoch_started_found
< i
->second
.last_epoch_started
) {
1175 *history_les_bound
= false;
1176 max_last_epoch_started_found
= i
->second
.last_epoch_started
;
// Second pass: min_last_update_acceptable is the smallest last_update
// among infos whose last_epoch_started meets the bound found above.
1179 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1182 if (max_last_epoch_started_found
<= i
->second
.last_epoch_started
) {
1183 if (min_last_update_acceptable
> i
->second
.last_update
)
1184 min_last_update_acceptable
= i
->second
.last_update
;
// No qualifying info at all — caller sees infos.end() (failure).
1187 if (min_last_update_acceptable
== eversion_t::max())
1190 map
<pg_shard_t
, pg_info_t
>::const_iterator best
= infos
.end();
1191 // find osd with newest last_update (oldest for ec_pool).
1192 // if there are multiples, prefer
1193 // - a longer tail, if it brings another peer into log contiguity
1194 // - the current primary
1195 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= infos
.begin();
1198 if (restrict_to_up_acting
&& !is_up(p
->first
) &&
1199 !is_acting(p
->first
))
1201 // Only consider peers with last_update >= min_last_update_acceptable
1202 if (p
->second
.last_update
< min_last_update_acceptable
)
1204 // Disqualify anyone with a too old last_epoch_started
1205 if (p
->second
.last_epoch_started
< max_last_epoch_started_found
)
1207 // Disqualify anyone who is incomplete (not fully backfilled)
1208 if (p
->second
.is_incomplete())
1210 if (best
== infos
.end()) {
1214 // Prefer newer last_update
// For rollback-capable (EC) pools the comparison direction differs
// from replicated pools — note the inverted comparisons below.
1215 if (pool
.info
.require_rollback()) {
1216 if (p
->second
.last_update
> best
->second
.last_update
)
1218 if (p
->second
.last_update
< best
->second
.last_update
) {
1223 if (p
->second
.last_update
< best
->second
.last_update
)
1225 if (p
->second
.last_update
> best
->second
.last_update
) {
1231 // Prefer longer tail
1232 if (p
->second
.log_tail
> best
->second
.log_tail
) {
1234 } else if (p
->second
.log_tail
< best
->second
.log_tail
) {
// Tie on last_update and log_tail: prefer a shard with no missing
// objects over one that has missing.
1239 if (!p
->second
.has_missing() && best
->second
.has_missing()) {
1240 dout(10) << __func__
<< " prefer osd." << p
->first
1241 << " because it is complete while best has missing"
1245 } else if (p
->second
.has_missing() && !best
->second
.has_missing()) {
1246 dout(10) << __func__
<< " skipping osd." << p
->first
1247 << " because it has missing while best is complete"
1251 // both are complete or have missing
1255 // prefer current primary (usually the caller), all things being equal
1256 if (p
->first
== pg_whoami
) {
1257 dout(10) << "calc_acting prefer osd." << p
->first
1258 << " because it is current primary" << dendl
;
// Compute the desired acting set for an erasure-coded PG, position by
// position (each index i is a distinct EC shard). For every shard slot,
// prefer in order: the up[] osd (if complete and log-contiguous with the
// authoritative shard), then the acting[] osd, then — unless
// restrict_to_up_acting — any stray peer holding that shard. Up osds that
// cannot serve the slot directly are queued for backfill. Outputs go to
// *backfill and *acting_backfill; progress is narrated into ss for debug.
// NOTE(review): excerpt is fragmented; some original lines (loop
// continuations, braces, want[i] assignments) are missing from this view.
1266 void PG::calc_ec_acting(
1267 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1269 const vector
<int> &acting
,
1270 const vector
<int> &up
,
1271 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1272 bool restrict_to_up_acting
,
1274 set
<pg_shard_t
> *backfill
,
1275 set
<pg_shard_t
> *acting_backfill
,
// want starts fully unassigned (CRUSH_ITEM_NONE per shard position).
1278 vector
<int> want(size
, CRUSH_ITEM_NONE
);
// Index every known peer by its shard id so stray lookup per slot is cheap.
1279 map
<shard_id_t
, set
<pg_shard_t
> > all_info_by_shard
;
1280 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= all_info
.begin();
1281 i
!= all_info
.end();
1283 all_info_by_shard
[i
->first
.shard
].insert(i
->first
);
1285 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1286 ss
<< "For position " << (unsigned)i
<< ": ";
// Case 1: up[i] exists, is complete, and its log overlaps the
// authoritative shard's tail — it can log-recover, select it directly.
1287 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
&&
1288 !all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1289 all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.last_update
>=
1290 auth_log_shard
->second
.log_tail
) {
1291 ss
<< " selecting up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
)) << std::endl
;
// Case 2: up[i] exists but cannot log-recover — it must be backfilled.
1295 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
) {
1296 ss
<< " backfilling up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
))
1298 backfill
->insert(pg_shard_t(up
[i
], shard_id_t(i
)));
// Case 3: fall back to acting[i] if it is complete and log-contiguous.
1301 if (acting
.size() > (unsigned)i
&& acting
[i
] != CRUSH_ITEM_NONE
&&
1302 !all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1303 all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.last_update
>=
1304 auth_log_shard
->second
.log_tail
) {
1305 ss
<< " selecting acting[i]: " << pg_shard_t(acting
[i
], shard_id_t(i
)) << std::endl
;
1306 want
[i
] = acting
[i
];
// Case 4: search stray peers holding this shard (only when not
// restricted to up/acting).
1307 } else if (!restrict_to_up_acting
) {
1308 for (set
<pg_shard_t
>::iterator j
= all_info_by_shard
[shard_id_t(i
)].begin();
1309 j
!= all_info_by_shard
[shard_id_t(i
)].end();
1311 ceph_assert(j
->shard
== i
);
1312 if (!all_info
.find(*j
)->second
.is_incomplete() &&
1313 all_info
.find(*j
)->second
.last_update
>=
1314 auth_log_shard
->second
.log_tail
) {
1315 ss
<< " selecting stray: " << *j
<< std::endl
;
1320 if (want
[i
] == CRUSH_ITEM_NONE
)
1321 ss
<< " failed to fill position " << (int)i
<< std::endl
;
// Publish results: every filled slot plus all backfill targets form
// the acting_backfill set.
1325 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1326 if (want
[i
] != CRUSH_ITEM_NONE
) {
1327 acting_backfill
->insert(pg_shard_t(want
[i
], shard_id_t(i
)));
1330 acting_backfill
->insert(backfill
->begin(), backfill
->end());
// NOTE(review): excerpt is fragmented; several original lines (continue
// statements, braces, the up[] iteration header) are missing from this
// view. Code left byte-identical, comments only.
1335 * calculate the desired acting set.
1337 * Choose an appropriate acting set. Prefer up[0], unless it is
1338 * incomplete, or another osd has a longer tail that allows us to
1339 * bring other up nodes up to date.
1341 void PG::calc_replicated_acting(
1342 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1343 uint64_t force_auth_primary_missing_objects
,
1345 const vector
<int> &acting
,
1346 const vector
<int> &up
,
1347 pg_shard_t up_primary
,
1348 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1349 bool restrict_to_up_acting
,
1351 set
<pg_shard_t
> *backfill
,
1352 set
<pg_shard_t
> *acting_backfill
,
1353 const OSDMapRef osdmap
,
1356 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1358 ss
<< __func__
<< " newest update on osd." << auth_log_shard_id
1359 << " with " << auth_log_shard
->second
1360 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
// Step 1: choose the primary. Prefer up_primary when it is complete and
// log-contiguous with the auth shard; on NAUTILUS+ clusters, if it is
// estimated to be missing more than force_auth_primary_missing_objects
// objects relative to the auth shard, force the auth shard to primary
// instead so recovery converges faster.
1363 auto primary
= all_info
.find(up_primary
);
1365 !primary
->second
.is_incomplete() &&
1366 primary
->second
.last_update
>=
1367 auth_log_shard
->second
.log_tail
) {
1368 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
// Estimate = recorded missing count + log-version distance to the
// auth shard (either direction).
1369 auto approx_missing_objects
=
1370 primary
->second
.stats
.stats
.sum
.num_objects_missing
;
1371 auto auth_version
= auth_log_shard
->second
.last_update
.version
;
1372 auto primary_version
= primary
->second
.last_update
.version
;
1373 if (auth_version
> primary_version
) {
1374 approx_missing_objects
+= auth_version
- primary_version
;
1376 approx_missing_objects
+= primary_version
- auth_version
;
1378 if ((uint64_t)approx_missing_objects
>
1379 force_auth_primary_missing_objects
) {
1380 primary
= auth_log_shard
;
1381 ss
<< "up_primary: " << up_primary
<< ") has approximate "
1382 << approx_missing_objects
1383 << "(>" << force_auth_primary_missing_objects
<<") "
1384 << "missing objects, osd." << auth_log_shard_id
1385 << " selected as primary instead"
1388 ss
<< "up_primary: " << up_primary
<< ") selected as primary"
1392 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
// up_primary is incomplete or non-contiguous: fall back to the auth
// shard (asserted complete by find_best_info's selection rules).
1395 ceph_assert(!auth_log_shard
->second
.is_incomplete());
1396 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1397 << " selected as primary instead" << std::endl
;
1398 primary
= auth_log_shard
;
1401 ss
<< __func__
<< " primary is osd." << primary
->first
1402 << " with " << primary
->second
<< std::endl
;
1403 want
->push_back(primary
->first
.osd
);
1404 acting_backfill
->insert(primary
->first
);
1406 /* We include auth_log_shard->second.log_tail because in GetLog,
1407 * we will request logs back to the min last_update over our
1408 * acting_backfill set, which will result in our log being extended
1409 * as far backwards as necessary to pick up any peers which can
1410 * be log recovered by auth_log_shard's log */
1411 eversion_t oldest_auth_log_entry
=
1412 std::min(primary
->second
.log_tail
, auth_log_shard
->second
.log_tail
);
1414 // select replicas that have log contiguity with primary.
1415 // prefer up, then acting, then any peer_info osds
// Step 2: up[] osds. Contiguous ones join acting; non-contiguous or
// incomplete ones go to backfill (they still join acting_backfill).
1417 pg_shard_t up_cand
= pg_shard_t(i
, shard_id_t::NO_SHARD
);
1418 if (up_cand
== primary
->first
)
1420 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1421 if (cur_info
.is_incomplete() ||
1422 cur_info
.last_update
< oldest_auth_log_entry
) {
1423 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1424 backfill
->insert(up_cand
);
1425 acting_backfill
->insert(up_cand
);
1428 acting_backfill
->insert(up_cand
);
1429 ss
<< " osd." << i
<< " (up) accepted " << cur_info
<< std::endl
;
1433 if (want
->size() >= size
) {
// Step 3: acting[] osds not already covered by up[], ranked by
// last_update (newest first) and taken until the set is full.
1437 std::vector
<std::pair
<eversion_t
, int>> candidate_by_last_update
;
1438 candidate_by_last_update
.reserve(acting
.size());
1439 // This no longer has backfill OSDs, but they are covered above.
1440 for (auto i
: acting
) {
1441 pg_shard_t
acting_cand(i
, shard_id_t::NO_SHARD
);
1442 // skip up osds we already considered above
1443 if (acting_cand
== primary
->first
)
1445 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
);
1446 if (up_it
!= up
.end())
1449 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1450 if (cur_info
.is_incomplete() ||
1451 cur_info
.last_update
< oldest_auth_log_entry
) {
1452 ss
<< " shard " << acting_cand
<< " (acting) REJECTED "
1453 << cur_info
<< std::endl
;
1455 candidate_by_last_update
.push_back(make_pair(cur_info
.last_update
, i
));
1459 auto sort_by_eversion
=[](const std::pair
<eversion_t
, int> &lhs
,
1460 const std::pair
<eversion_t
, int> &rhs
) {
1461 return lhs
.first
> rhs
.first
;
1463 // sort by last_update, in descending order.
1464 std::sort(candidate_by_last_update
.begin(),
1465 candidate_by_last_update
.end(), sort_by_eversion
);
1466 for (auto &p
: candidate_by_last_update
) {
1467 ceph_assert(want
->size() < size
);
1468 want
->push_back(p
.second
);
1469 pg_shard_t s
= pg_shard_t(p
.second
, shard_id_t::NO_SHARD
);
1470 acting_backfill
->insert(s
);
1471 ss
<< " shard " << s
<< " (acting) accepted "
1472 << all_info
.find(s
)->second
<< std::endl
;
1473 if (want
->size() >= size
) {
// Step 4: strays — only when not restricted to up/acting. Same ranking
// as acting candidates; anything already tried via up[] or acting[] is
// skipped.
1478 if (restrict_to_up_acting
) {
1481 candidate_by_last_update
.clear();
1482 candidate_by_last_update
.reserve(all_info
.size()); // overestimate but fine
1483 // continue to search stray to find more suitable peers
1484 for (auto &i
: all_info
) {
1485 // skip up osds we already considered above
1486 if (i
.first
== primary
->first
)
1488 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
.first
.osd
);
1489 if (up_it
!= up
.end())
1491 vector
<int>::const_iterator acting_it
= find(
1492 acting
.begin(), acting
.end(), i
.first
.osd
);
1493 if (acting_it
!= acting
.end())
1496 if (i
.second
.is_incomplete() ||
1497 i
.second
.last_update
< oldest_auth_log_entry
) {
1498 ss
<< " shard " << i
.first
<< " (stray) REJECTED " << i
.second
1501 candidate_by_last_update
.push_back(
1502 make_pair(i
.second
.last_update
, i
.first
.osd
));
1506 if (candidate_by_last_update
.empty()) {
1507 // save us some effort
1511 // sort by last_update, in descending order.
1512 std::sort(candidate_by_last_update
.begin(),
1513 candidate_by_last_update
.end(), sort_by_eversion
);
1515 for (auto &p
: candidate_by_last_update
) {
1516 ceph_assert(want
->size() < size
);
1517 want
->push_back(p
.second
);
1518 pg_shard_t s
= pg_shard_t(p
.second
, shard_id_t::NO_SHARD
);
1519 acting_backfill
->insert(s
);
1520 ss
<< " shard " << s
<< " (stray) accepted "
1521 << all_info
.find(s
)->second
<< std::endl
;
1522 if (want
->size() >= size
) {
// Check whether a proposed want set is viable: it must meet the pool's
// min_size rule (EC pools cannot go below min_size; replicated pools may
// only if osd_allow_recovery_below_min_size is set) and the backend's
// IsPGRecoverablePredicate must accept the set of shards we would have.
// Returns false (with a debug log) when either check fails.
// NOTE(review): excerpt is fragmented; the counting of num_want_acting,
// the have.insert(...) call head, and the return statements are among
// the lines missing from this view.
1528 bool PG::recoverable_and_ge_min_size(const vector
<int> &want
) const
1530 unsigned num_want_acting
= 0;
1531 set
<pg_shard_t
> have
;
1532 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1533 if (want
[i
] != CRUSH_ITEM_NONE
) {
// EC pools track per-position shards; replicated pools use NO_SHARD.
1538 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1541 // We go incomplete if below min_size for ec_pools since backfill
1542 // does not currently maintain rollbackability
1543 // Otherwise, we will go "peered", but not "active"
1544 if (num_want_acting
< pool
.info
.min_size
&&
1545 (pool
.info
.is_erasure() ||
1546 !cct
->_conf
->osd_allow_recovery_below_min_size
)) {
1547 dout(10) << __func__
<< " failed, below min size" << dendl
;
1551 /* Check whether we have enough acting shards to later perform recovery */
1552 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable_predicate(
1553 get_pgbackend()->get_is_recoverable_predicate());
1554 if (!(*recoverable_predicate
)(have
)) {
1555 dout(10) << __func__
<< " failed, not recoverable" << dendl
;
// For an EC pool, pick shards out of *want that are expensive to recover
// and move them to *async_recovery instead, so they are recovered in the
// background rather than blocking activation. A shard is a candidate when
// its estimated recovery cost exceeds osd_async_recovery_min_cost; it is
// actually removed only if the remaining want set still passes
// recoverable_and_ge_min_size(). Candidates are processed most-expensive
// first (reverse iteration over the cost-ordered set).
// NOTE(review): excerpt is fragmented; several continue statements and
// braces are missing from this view.
1562 void PG::choose_async_recovery_ec(const map
<pg_shard_t
, pg_info_t
> &all_info
,
1563 const pg_info_t
&auth_info
,
1565 set
<pg_shard_t
> *async_recovery
,
1566 const OSDMapRef osdmap
) const
1568 set
<pair
<int, pg_shard_t
> > candidates_by_cost
;
1569 for (uint8_t i
= 0; i
< want
->size(); ++i
) {
1570 if ((*want
)[i
] == CRUSH_ITEM_NONE
)
1573 // Considering log entries to recover is accurate enough for
1574 // now. We could use minimum_to_decode_with_cost() later if
1576 pg_shard_t
shard_i((*want
)[i
], shard_id_t(i
));
1577 // do not include strays
1578 if (stray_set
.find(shard_i
) != stray_set
.end())
1580 // Do not include an osd that is not up, since choosing it as
1581 // an async_recovery_target will move it out of the acting set.
1582 // This results in it being identified as a stray during peering,
1583 // because it is no longer in the up or acting set.
1584 if (!is_up(shard_i
))
1586 auto shard_info
= all_info
.find(shard_i
)->second
;
1587 // for ec pools we rollback all entries past the authoritative
1588 // last_update *before* activation. This is relatively inexpensive
1589 // compared to recovery, since it is purely local, so treat shards
1590 // past the authoritative last_update the same as those equal to it.
1591 version_t auth_version
= auth_info
.last_update
.version
;
1592 version_t candidate_version
= shard_info
.last_update
.version
;
// NAUTILUS+ peers: cost = recorded missing count plus how far the
// shard's log lags the authoritative version (only the lagging
// direction counts for EC — see rollback comment above).
1593 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1594 auto approx_missing_objects
=
1595 shard_info
.stats
.stats
.sum
.num_objects_missing
;
1596 if (auth_version
> candidate_version
) {
1597 approx_missing_objects
+= auth_version
- candidate_version
;
1599 if (static_cast<uint64_t>(approx_missing_objects
) >
1600 cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1601 candidates_by_cost
.emplace(approx_missing_objects
, shard_i
);
// Pre-NAUTILUS fallback: cost is the log-version distance only.
1604 if (auth_version
> candidate_version
&&
1605 (auth_version
- candidate_version
) > cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1606 candidates_by_cost
.insert(make_pair(auth_version
- candidate_version
, shard_i
));
1611 dout(20) << __func__
<< " candidates by cost are: " << candidates_by_cost
1614 // take out as many osds as we can for async recovery, in order of cost
1615 for (auto rit
= candidates_by_cost
.rbegin();
1616 rit
!= candidates_by_cost
.rend(); ++rit
) {
1617 pg_shard_t cur_shard
= rit
->second
;
// Trial-remove the shard from a copy of want; commit the swap only if
// the reduced set is still recoverable and >= min_size.
1618 vector
<int> candidate_want(*want
);
1619 candidate_want
[cur_shard
.shard
.id
] = CRUSH_ITEM_NONE
;
1620 if (recoverable_and_ge_min_size(candidate_want
)) {
1621 want
->swap(candidate_want
);
1622 async_recovery
->insert(cur_shard
);
1625 dout(20) << __func__
<< " result want=" << *want
1626 << " async_recovery=" << *async_recovery
<< dendl
;
// Replicated-pool counterpart of choose_async_recovery_ec: move osds whose
// estimated recovery cost exceeds osd_async_recovery_min_cost from *want
// into *async_recovery. Unlike the EC variant, version divergence counts
// in BOTH directions (no local rollback before activation), and want is
// never shrunk at or below pool min_size.
// NOTE(review): excerpt is fragmented; several continue/break statements
// and braces are missing from this view.
1629 void PG::choose_async_recovery_replicated(const map
<pg_shard_t
, pg_info_t
> &all_info
,
1630 const pg_info_t
&auth_info
,
1632 set
<pg_shard_t
> *async_recovery
,
1633 const OSDMapRef osdmap
) const
1635 set
<pair
<int, pg_shard_t
> > candidates_by_cost
;
1636 for (auto osd_num
: *want
) {
1637 pg_shard_t
shard_i(osd_num
, shard_id_t::NO_SHARD
);
1638 // do not include strays
1639 if (stray_set
.find(shard_i
) != stray_set
.end())
1641 // Do not include an osd that is not up, since choosing it as
1642 // an async_recovery_target will move it out of the acting set.
1643 // This results in it being identified as a stray during peering,
1644 // because it is no longer in the up or acting set.
1645 if (!is_up(shard_i
))
1647 auto shard_info
= all_info
.find(shard_i
)->second
;
1648 // use the approximate magnitude of the difference in length of
1649 // logs plus historical missing objects as the cost of recovery
1650 version_t auth_version
= auth_info
.last_update
.version
;
1651 version_t candidate_version
= shard_info
.last_update
.version
;
// NAUTILUS+ peers: cost = recorded missing count + |version delta|.
1652 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1653 auto approx_missing_objects
=
1654 shard_info
.stats
.stats
.sum
.num_objects_missing
;
1655 if (auth_version
> candidate_version
) {
1656 approx_missing_objects
+= auth_version
- candidate_version
;
1658 approx_missing_objects
+= candidate_version
- auth_version
;
1660 if (static_cast<uint64_t>(approx_missing_objects
) >
1661 cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1662 candidates_by_cost
.emplace(approx_missing_objects
, shard_i
);
// Pre-NAUTILUS fallback: cost is the absolute log-version distance.
1665 size_t approx_entries
;
1666 if (auth_version
> candidate_version
) {
1667 approx_entries
= auth_version
- candidate_version
;
1669 approx_entries
= candidate_version
- auth_version
;
1671 if (approx_entries
> cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1672 candidates_by_cost
.insert(make_pair(approx_entries
, shard_i
));
1677 dout(20) << __func__
<< " candidates by cost are: " << candidates_by_cost
1679 // take out as many osds as we can for async recovery, in order of cost
1680 for (auto rit
= candidates_by_cost
.rbegin();
1681 rit
!= candidates_by_cost
.rend(); ++rit
) {
// Never reduce the acting set to (or below) min_size.
1682 if (want
->size() <= pool
.info
.min_size
) {
1685 pg_shard_t cur_shard
= rit
->second
;
// Trial-remove the osd from a copy of want; commit via swap.
1686 vector
<int> candidate_want(*want
);
1687 for (auto it
= candidate_want
.begin(); it
!= candidate_want
.end(); ++it
) {
1688 if (*it
== cur_shard
.osd
) {
1689 candidate_want
.erase(it
);
1690 want
->swap(candidate_want
);
1691 async_recovery
->insert(cur_shard
);
1696 dout(20) << __func__
<< " result want=" << *want
1697 << " async_recovery=" << *async_recovery
<< dendl
;
// NOTE(review): excerpt is fragmented; several original lines (the
// want/ss declarations, some call arguments, return statements, braces)
// are missing from this view. Code left byte-identical, comments only.
1703 * calculate the desired acting, and request a change with the monitor
1704 * if it differs from the current acting.
1706 * if restrict_to_up_acting=true, we filter out anything that's not in
1707 * up/acting. in order to lift this restriction, we need to
1708 * 1) check whether it's worth switching the acting set any time we get
1709 * a new pg info (not just here, when recovery finishes)
1710 * 2) check whether anything in want_acting went down on each new map
1711 * (and, if so, calculate a new want_acting)
1712 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1715 bool PG::choose_acting(pg_shard_t
&auth_log_shard_id
,
1716 bool restrict_to_up_acting
,
1717 bool *history_les_bound
)
// Merge our own info with all peer infos into a single map.
1719 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
1720 all_info
[pg_whoami
] = info
;
1722 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 10>()) {
1723 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
1724 p
!= all_info
.end();
1726 dout(10) << __func__
<< " all_info osd." << p
->first
<< " " << p
->second
<< dendl
;
// Step 1: pick the authoritative log shard.
1730 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
1731 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
// No authoritative shard found: revert pg_temp to empty and bail.
1733 if (auth_log_shard
== all_info
.end()) {
1735 dout(10) << __func__
<< " no suitable info found (incomplete backfills?),"
1736 << " reverting to up" << dendl
;
1739 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1741 dout(10) << __func__
<< " failed" << dendl
;
1742 ceph_assert(want_acting
.empty());
1747 ceph_assert(!auth_log_shard
->second
.is_incomplete());
1748 auth_log_shard_id
= auth_log_shard
->first
;
// Step 2: compute the desired acting set (replicated vs EC path).
1750 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
1753 if (!pool
.info
.is_erasure())
1754 calc_replicated_acting(
1756 cct
->_conf
.get_val
<uint64_t>(
1757 "osd_force_auth_primary_missing_objects"),
1758 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1763 restrict_to_up_acting
,
1766 &want_acting_backfill
,
1772 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1776 restrict_to_up_acting
,
1779 &want_acting_backfill
,
1781 dout(10) << ss
.str() << dendl
;
// Step 3: reject an unviable want set.
1783 if (!recoverable_and_ge_min_size(want
)) {
1784 want_acting
.clear();
// Step 4: carve async-recovery targets out of want (MIMIC+ only).
1788 set
<pg_shard_t
> want_async_recovery
;
1789 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC
)) {
1790 if (pool
.info
.is_erasure()) {
1791 choose_async_recovery_ec(all_info
, auth_log_shard
->second
, &want
, &want_async_recovery
, get_osdmap());
1793 choose_async_recovery_replicated(all_info
, auth_log_shard
->second
, &want
, &want_async_recovery
, get_osdmap());
1796 while (want
.size() > pool
.info
.size
) {
1797 // async recovery should have taken out as many osds as it can.
1798 // if not, then always evict the last peer
1799 // (will get synchronously recovered later)
1800 dout(10) << __func__
<< " evicting osd." << want
.back()
1801 << " from oversized want " << want
<< dendl
;
// Step 5: if want differs from the current acting set, request a
// pg_temp change from the monitor and return without activating.
1804 if (want
!= acting
) {
1805 dout(10) << __func__
<< " want " << want
<< " != acting " << acting
1806 << ", requesting pg_temp change" << dendl
;
1809 if (!cct
->_conf
->osd_debug_no_acting_change
) {
1810 if (want_acting
== up
) {
1811 // There can't be any pending backfill if
1812 // want is the same as crush map up OSDs.
1813 ceph_assert(want_backfill
.empty());
1815 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1817 osd
->queue_want_pg_temp(info
.pgid
.pgid
, want
);
// Step 6: want == acting — commit the computed recovery/backfill and
// async-recovery target sets.
1821 want_acting
.clear();
1822 acting_recovery_backfill
= want_acting_backfill
;
1823 dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill
<< dendl
;
1824 ceph_assert(backfill_targets
.empty() || backfill_targets
== want_backfill
);
1825 if (backfill_targets
.empty()) {
1826 // Caller is GetInfo
1827 backfill_targets
= want_backfill
;
1829 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
1830 ceph_assert(async_recovery_targets
.empty() || async_recovery_targets
== want_async_recovery
|| !needs_recovery());
1831 if (async_recovery_targets
.empty() || !needs_recovery()) {
1832 async_recovery_targets
= want_async_recovery
;
1834 // Will not change if already set because up would have had to change
1835 // Verify that nothing in backfill is in stray_set
1836 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
1837 i
!= want_backfill
.end();
1839 ceph_assert(stray_set
.find(*i
) == stray_set
.end());
1841 dout(10) << "choose_acting want=" << want
<< " backfill_targets="
1842 << want_backfill
<< " async_recovery_targets="
1843 << async_recovery_targets
<< dendl
;
// NOTE(review): excerpt is fragmented; the get_might_have_unfound()
// argument list and loop increments are partially missing from this view.
1847 /* Build the might_have_unfound set.
1849 * This is used by the primary OSD during recovery.
1851 * This set tracks the OSDs which might have unfound objects that the primary
1852 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1853 * will remove the OSD from the set.
1855 void PG::build_might_have_unfound()
// Only the primary builds this set, and only once per peering cycle
// (must currently be empty).
1857 ceph_assert(might_have_unfound
.empty());
1858 ceph_assert(is_primary());
1860 dout(10) << __func__
<< dendl
;
1862 check_past_interval_bounds();
// Seed the set from past intervals' candidate OSDs.
1864 might_have_unfound
= past_intervals
.get_might_have_unfound(
1866 pool
.info
.is_erasure());
1868 // include any (stray) peers
1869 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
1870 p
!= peer_info
.end();
1872 might_have_unfound
.insert(p
->first
);
1874 dout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
1877 void PG::activate(ObjectStore::Transaction
& t
,
1878 epoch_t activation_epoch
,
1879 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1883 PastIntervals
> > > *activator_map
,
1886 ceph_assert(!is_peered());
1887 ceph_assert(scrubber
.callbacks
.empty());
1888 ceph_assert(callbacks_for_degraded_object
.empty());
1891 state_clear(PG_STATE_DOWN
);
1893 send_notify
= false;
1896 // only update primary last_epoch_started if we will go active
1897 if (acting
.size() >= pool
.info
.min_size
) {
1898 ceph_assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
1899 info
.last_epoch_started
<= activation_epoch
);
1900 info
.last_epoch_started
= activation_epoch
;
1901 info
.last_interval_started
= info
.history
.same_interval_since
;
1903 } else if (is_acting(pg_whoami
)) {
1904 /* update last_epoch_started on acting replica to whatever the primary sent
1905 * unless it's smaller (could happen if we are going peered rather than
1906 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1907 if (info
.last_epoch_started
< activation_epoch
) {
1908 info
.last_epoch_started
= activation_epoch
;
1909 info
.last_interval_started
= info
.history
.same_interval_since
;
1913 auto &missing
= pg_log
.get_missing();
1916 last_update_ondisk
= info
.last_update
;
1917 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
1919 last_update_applied
= info
.last_update
;
1920 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
1922 need_up_thru
= false;
1924 // write pg info, log
1926 dirty_big_info
= true; // maybe
1928 // find out when we commit
1929 t
.register_on_complete(
1930 new C_PG_ActivateCommitted(
1936 // initialize snap_trimq
1937 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_MIMIC
) {
1938 dout(20) << "activate - purged_snaps " << info
.purged_snaps
1939 << " cached_removed_snaps " << pool
.cached_removed_snaps
1941 snap_trimq
= pool
.cached_removed_snaps
;
1943 auto& removed_snaps_queue
= get_osdmap()->get_removed_snaps_queue();
1944 auto p
= removed_snaps_queue
.find(info
.pgid
.pgid
.pool());
1946 if (p
!= removed_snaps_queue
.end()) {
1947 dout(20) << "activate - purged_snaps " << info
.purged_snaps
1948 << " removed_snaps " << p
->second
1950 for (auto q
: p
->second
) {
1951 snap_trimq
.insert(q
.first
, q
.second
);
1955 interval_set
<snapid_t
> purged
;
1956 purged
.intersection_of(snap_trimq
, info
.purged_snaps
);
1957 snap_trimq
.subtract(purged
);
1959 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_MIMIC
) {
1960 // adjust purged_snaps: PG may have been inactive while snaps were pruned
1961 // from the removed_snaps_queue in the osdmap. update local purged_snaps
1962 // reflect only those snaps that we thought were pruned and were still in
1964 info
.purged_snaps
.swap(purged
);
1968 // init complete pointer
1969 if (missing
.num_missing() == 0) {
1970 dout(10) << "activate - no missing, moving last_complete " << info
.last_complete
1971 << " -> " << info
.last_update
<< dendl
;
1972 info
.last_complete
= info
.last_update
;
1973 info
.stats
.stats
.sum
.num_objects_missing
= 0;
1974 pg_log
.reset_recovery_pointers();
1976 dout(10) << "activate - not complete, " << missing
<< dendl
;
1977 info
.stats
.stats
.sum
.num_objects_missing
= missing
.num_missing();
1978 pg_log
.activate_not_complete(info
);
1986 // start up replicas
1988 ceph_assert(!acting_recovery_backfill
.empty());
1989 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
1990 i
!= acting_recovery_backfill
.end();
1992 if (*i
== pg_whoami
) continue;
1993 pg_shard_t peer
= *i
;
1994 ceph_assert(peer_info
.count(peer
));
1995 pg_info_t
& pi
= peer_info
[peer
];
1997 dout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
2000 ceph_assert(peer_missing
.count(peer
));
2001 pg_missing_t
& pm
= peer_missing
[peer
];
2003 bool needs_past_intervals
= pi
.dne();
2006 * cover case where peer sort order was different and
2007 * last_backfill cannot be interpreted
2009 bool force_restart_backfill
=
2010 !pi
.last_backfill
.is_max() &&
2011 !pi
.last_backfill_bitwise
;
2013 if (pi
.last_update
== info
.last_update
&& !force_restart_backfill
) {
2015 if (!pi
.last_backfill
.is_max())
2016 osd
->clog
->info() << info
.pgid
<< " continuing backfill to osd."
2018 << " from (" << pi
.log_tail
<< "," << pi
.last_update
2019 << "] " << pi
.last_backfill
2020 << " to " << info
.last_update
;
2021 if (!pi
.is_empty() && activator_map
) {
2022 dout(10) << "activate peer osd." << peer
<< " is up to date, queueing in pending_activators" << dendl
;
2023 (*activator_map
)[peer
.osd
].push_back(
2026 peer
.shard
, pg_whoami
.shard
,
2032 dout(10) << "activate peer osd." << peer
<< " is up to date, but sending pg_log anyway" << dendl
;
2034 i
->shard
, pg_whoami
.shard
,
2035 get_osdmap_epoch(), info
,
2036 last_peering_reset
);
2039 pg_log
.get_tail() > pi
.last_update
||
2040 pi
.last_backfill
== hobject_t() ||
2041 force_restart_backfill
||
2042 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
2043 /* ^ This last case covers a situation where a replica is not contiguous
2044 * with the auth_log, but is contiguous with this replica. Reshuffling
2045 * the active set to handle this would be tricky, so instead we just go
2046 * ahead and backfill it anyway. This is probably preferrable in any
2047 * case since the replica in question would have to be significantly
2051 osd
->clog
->debug() << info
.pgid
<< " starting backfill to osd." << peer
2052 << " from (" << pi
.log_tail
<< "," << pi
.last_update
2053 << "] " << pi
.last_backfill
2054 << " to " << info
.last_update
;
2056 pi
.last_update
= info
.last_update
;
2057 pi
.last_complete
= info
.last_update
;
2058 pi
.set_last_backfill(hobject_t());
2059 pi
.last_epoch_started
= info
.last_epoch_started
;
2060 pi
.last_interval_started
= info
.last_interval_started
;
2061 pi
.history
= info
.history
;
2062 pi
.hit_set
= info
.hit_set
;
2063 // Save num_bytes for reservation request, can't be negative
2064 peer_bytes
[peer
] = std::max
<int64_t>(0, pi
.stats
.stats
.sum
.num_bytes
);
2065 pi
.stats
.stats
.clear();
2067 // initialize peer with our purged_snaps.
2068 pi
.purged_snaps
= info
.purged_snaps
;
2071 i
->shard
, pg_whoami
.shard
,
2072 get_osdmap_epoch(), pi
,
2073 last_peering_reset
/* epoch to create pg at */);
2075 // send some recent log, so that op dup detection works well.
2076 m
->log
.copy_up_to(cct
, pg_log
.get_log(), cct
->_conf
->osd_min_pg_log_entries
);
2077 m
->info
.log_tail
= m
->log
.tail
;
2078 pi
.log_tail
= m
->log
.tail
; // sigh...
2083 ceph_assert(pg_log
.get_tail() <= pi
.last_update
);
2085 i
->shard
, pg_whoami
.shard
,
2086 get_osdmap_epoch(), info
,
2087 last_peering_reset
/* epoch to create pg at */);
2088 // send new stuff to append to replicas log
2089 m
->log
.copy_after(cct
, pg_log
.get_log(), pi
.last_update
);
2092 // share past_intervals if we are creating the pg on the replica
2093 // based on whether our info for that peer was dne() *before*
2094 // updating pi.history in the backfill block above.
2095 if (m
&& needs_past_intervals
)
2096 m
->past_intervals
= past_intervals
;
2098 // update local version of peer's missing list!
2099 if (m
&& pi
.last_backfill
!= hobject_t()) {
2100 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
2101 p
!= m
->log
.log
.end();
2103 if (p
->soid
<= pi
.last_backfill
&&
2105 if (perform_deletes_during_peering() && p
->is_delete()) {
2106 pm
.rm(p
->soid
, p
->version
);
2108 pm
.add_next_event(*p
);
2115 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
<< dendl
;
2116 //m->log.print(cout);
2117 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap_epoch());
2121 pi
.last_update
= info
.last_update
;
2123 // update our missing
2124 if (pm
.num_missing() == 0) {
2125 pi
.last_complete
= pi
.last_update
;
2126 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " uptodate" << dendl
;
2128 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " missing " << pm
<< dendl
;
2132 // Set up missing_loc
2133 set
<pg_shard_t
> complete_shards
;
2134 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2135 i
!= acting_recovery_backfill
.end();
2137 dout(20) << __func__
<< " setting up missing_loc from shard " << *i
<< " " << dendl
;
2138 if (*i
== get_primary()) {
2139 missing_loc
.add_active_missing(missing
);
2140 if (!missing
.have_missing())
2141 complete_shards
.insert(*i
);
2143 auto peer_missing_entry
= peer_missing
.find(*i
);
2144 ceph_assert(peer_missing_entry
!= peer_missing
.end());
2145 missing_loc
.add_active_missing(peer_missing_entry
->second
);
2146 if (!peer_missing_entry
->second
.have_missing() &&
2147 peer_info
[*i
].last_backfill
.is_max())
2148 complete_shards
.insert(*i
);
2152 // If necessary, create might_have_unfound to help us find our unfound objects.
2153 // NOTE: It's important that we build might_have_unfound before trimming the
2155 might_have_unfound
.clear();
2156 if (needs_recovery()) {
2157 // If only one shard has missing, we do a trick to add all others as recovery
2158 // source, this is considered safe since the PGLogs have been merged locally,
2159 // and covers vast majority of the use cases, like one OSD/host is down for
2160 // a while for hardware repairing
2161 if (complete_shards
.size() + 1 == acting_recovery_backfill
.size()) {
2162 missing_loc
.add_batch_sources_info(complete_shards
, ctx
->handle
);
2164 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
2166 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2167 i
!= acting_recovery_backfill
.end();
2169 if (*i
== pg_whoami
) continue;
2170 dout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
2171 ceph_assert(peer_missing
.count(*i
));
2172 ceph_assert(peer_info
.count(*i
));
2173 missing_loc
.add_source_info(
2180 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
2181 i
!= peer_missing
.end();
2183 if (is_acting_recovery_backfill(i
->first
))
2185 ceph_assert(peer_info
.count(i
->first
));
2187 peer_info
[i
->first
],
2193 build_might_have_unfound();
2195 // Always call now so _update_calc_stats() will be accurate
2196 discover_all_missing(query_map
);
2199 // num_objects_degraded if calculated should reflect this too, unless no
2200 // missing and we are about to go clean.
2201 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
2202 state_set(PG_STATE_UNDERSIZED
);
2205 state_set(PG_STATE_ACTIVATING
);
2206 release_pg_backoffs();
2207 projected_last_update
= info
.last_update
;
2209 if (acting
.size() >= pool
.info
.min_size
) {
2210 PGLogEntryHandler handler
{this, &t
};
2211 pg_log
.roll_forward(&handler
);
2215 bool PG::op_has_sufficient_caps(OpRequestRef
& op
)
2217 // only check MOSDOp
2218 if (op
->get_req()->get_type() != CEPH_MSG_OSD_OP
)
2221 const MOSDOp
*req
= static_cast<const MOSDOp
*>(op
->get_req());
2223 auto priv
= req
->get_connection()->get_priv();
2224 auto session
= static_cast<Session
*>(priv
.get());
2226 dout(0) << "op_has_sufficient_caps: no session for op " << *req
<< dendl
;
2229 OSDCap
& caps
= session
->caps
;
2232 const string
&key
= req
->get_hobj().get_key().empty() ?
2233 req
->get_oid().name
:
2234 req
->get_hobj().get_key();
2236 bool cap
= caps
.is_capable(pool
.name
, req
->get_hobj().nspace
,
2237 pool
.info
.application_metadata
,
2239 op
->need_read_cap(),
2240 op
->need_write_cap(),
2242 session
->get_peer_socket_addr());
2244 dout(20) << "op_has_sufficient_caps "
2245 << "session=" << session
2246 << " pool=" << pool
.id
<< " (" << pool
.name
2247 << " " << req
->get_hobj().nspace
2249 << " pool_app_metadata=" << pool
.info
.application_metadata
2250 << " need_read_cap=" << op
->need_read_cap()
2251 << " need_write_cap=" << op
->need_write_cap()
2252 << " classes=" << op
->classes()
2253 << " -> " << (cap
? "yes" : "NO")
2258 void PG::_activate_committed(epoch_t epoch
, epoch_t activation_epoch
)
2261 if (pg_has_reset_since(epoch
)) {
2262 dout(10) << "_activate_committed " << epoch
2263 << ", that was an old interval" << dendl
;
2264 } else if (is_primary()) {
2265 ceph_assert(!peer_activated
.count(pg_whoami
));
2266 peer_activated
.insert(pg_whoami
);
2267 dout(10) << "_activate_committed " << epoch
2268 << " peer_activated now " << peer_activated
2269 << " last_interval_started " << info
.history
.last_interval_started
2270 << " last_epoch_started " << info
.history
.last_epoch_started
2271 << " same_interval_since " << info
.history
.same_interval_since
<< dendl
;
2272 ceph_assert(!acting_recovery_backfill
.empty());
2273 if (peer_activated
.size() == acting_recovery_backfill
.size())
2274 all_activated_and_committed();
2276 dout(10) << "_activate_committed " << epoch
<< " telling primary" << dendl
;
2277 MOSDPGInfo
*m
= new MOSDPGInfo(epoch
);
2278 pg_notify_t i
= pg_notify_t(
2279 get_primary().shard
, pg_whoami
.shard
,
2284 i
.info
.history
.last_epoch_started
= activation_epoch
;
2285 i
.info
.history
.last_interval_started
= i
.info
.history
.same_interval_since
;
2286 if (acting
.size() >= pool
.info
.min_size
) {
2287 state_set(PG_STATE_ACTIVE
);
2289 state_set(PG_STATE_PEERED
);
2292 m
->pg_list
.push_back(make_pair(i
, PastIntervals()));
2293 osd
->send_message_osd_cluster(get_primary().osd
, m
, get_osdmap_epoch());
2296 if (flushes_in_progress
== 0) {
2297 requeue_ops(waiting_for_peered
);
2298 } else if (!waiting_for_peered
.empty()) {
2299 dout(10) << __func__
<< " flushes in progress, moving "
2300 << waiting_for_peered
.size() << " items to waiting_for_flush"
2302 ceph_assert(waiting_for_flush
.empty());
2303 waiting_for_flush
.swap(waiting_for_peered
);
2307 ceph_assert(!dirty_info
);
2313 * update info.history.last_epoch_started ONLY after we and all
2314 * replicas have activated AND committed the activate transaction
2315 * (i.e. the peering results are stable on disk).
2317 void PG::all_activated_and_committed()
2319 dout(10) << "all_activated_and_committed" << dendl
;
2320 ceph_assert(is_primary());
2321 ceph_assert(peer_activated
.size() == acting_recovery_backfill
.size());
2322 ceph_assert(!acting_recovery_backfill
.empty());
2323 ceph_assert(blocked_by
.empty());
2326 _update_calc_stats();
2327 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
2328 state_set(PG_STATE_DEGRADED
);
2330 state_clear(PG_STATE_DEGRADED
);
2333 queue_peering_event(
2335 std::make_shared
<PGPeeringEvent
>(
2338 AllReplicasActivated())));
2341 bool PG::requeue_scrub(bool high_priority
)
2343 ceph_assert(is_locked());
2345 dout(10) << __func__
<< ": already queued" << dendl
;
2348 dout(10) << __func__
<< ": queueing" << dendl
;
2349 scrub_queued
= true;
2350 osd
->queue_for_scrub(this, high_priority
);
2355 void PG::queue_recovery()
2357 if (!is_primary() || !is_peered()) {
2358 dout(10) << "queue_recovery -- not primary or not peered " << dendl
;
2359 ceph_assert(!recovery_queued
);
2360 } else if (recovery_queued
) {
2361 dout(10) << "queue_recovery -- already queued" << dendl
;
2363 dout(10) << "queue_recovery -- queuing" << dendl
;
2364 recovery_queued
= true;
2365 osd
->queue_for_recovery(this);
2369 bool PG::queue_scrub()
2371 ceph_assert(is_locked());
2372 if (is_scrubbing()) {
2375 // An interrupted recovery repair could leave this set.
2376 state_clear(PG_STATE_REPAIR
);
2377 if (scrubber
.need_auto
) {
2378 scrubber
.must_scrub
= true;
2379 scrubber
.must_deep_scrub
= true;
2380 scrubber
.auto_repair
= true;
2381 scrubber
.need_auto
= false;
2383 scrubber
.priority
= scrubber
.must_scrub
?
2384 cct
->_conf
->osd_requested_scrub_priority
: get_scrub_priority();
2385 scrubber
.must_scrub
= false;
2386 state_set(PG_STATE_SCRUBBING
);
2387 if (scrubber
.must_deep_scrub
) {
2388 state_set(PG_STATE_DEEP_SCRUB
);
2389 scrubber
.must_deep_scrub
= false;
2391 if (scrubber
.must_repair
|| scrubber
.auto_repair
) {
2392 state_set(PG_STATE_REPAIR
);
2393 scrubber
.must_repair
= false;
2399 unsigned PG::get_scrub_priority()
2401 // a higher value -> a higher priority
2402 int64_t pool_scrub_priority
= 0;
2403 pool
.info
.opts
.get(pool_opts_t::SCRUB_PRIORITY
, &pool_scrub_priority
);
2404 return pool_scrub_priority
> 0 ? pool_scrub_priority
: cct
->_conf
->osd_scrub_priority
;
2407 void PG::try_mark_clean()
2409 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2410 state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
2411 state_set(PG_STATE_CLEAN
);
2412 info
.history
.last_epoch_clean
= get_osdmap_epoch();
2413 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2414 past_intervals
.clear();
2415 dirty_big_info
= true;
2421 } else if (is_peered()) {
2424 if (pool
.info
.is_pending_merge(info
.pgid
.pgid
, &target
)) {
2426 ldout(cct
, 10) << "ready to merge (target)" << dendl
;
2427 osd
->set_ready_to_merge_target(this,
2429 info
.history
.last_epoch_started
,
2430 info
.history
.last_epoch_clean
);
2432 ldout(cct
, 10) << "ready to merge (source)" << dendl
;
2433 osd
->set_ready_to_merge_source(this, info
.last_update
);
2437 ldout(cct
, 10) << "not clean, not ready to merge" << dendl
;
2438 // we should have notified OSD in Active state entry point
2442 state_clear(PG_STATE_FORCED_RECOVERY
| PG_STATE_FORCED_BACKFILL
);
2445 publish_stats_to_osd();
2446 requeue_ops(waiting_for_clean_to_primary_repair
);
2449 bool PG::set_force_recovery(bool b
)
2453 if (!(state
& PG_STATE_FORCED_RECOVERY
) &&
2454 (state
& (PG_STATE_DEGRADED
|
2455 PG_STATE_RECOVERY_WAIT
|
2456 PG_STATE_RECOVERING
))) {
2457 dout(20) << __func__
<< " set" << dendl
;
2458 state_set(PG_STATE_FORCED_RECOVERY
);
2459 publish_stats_to_osd();
2462 } else if (state
& PG_STATE_FORCED_RECOVERY
) {
2463 dout(20) << __func__
<< " clear" << dendl
;
2464 state_clear(PG_STATE_FORCED_RECOVERY
);
2465 publish_stats_to_osd();
2469 dout(20) << __func__
<< " state " << pgstate_history
.get_current_state() << dendl
;
2470 osd
->local_reserver
.update_priority(info
.pgid
, get_recovery_priority());
2475 bool PG::set_force_backfill(bool b
)
2479 if (!(state
& PG_STATE_FORCED_BACKFILL
) &&
2480 (state
& (PG_STATE_DEGRADED
|
2481 PG_STATE_BACKFILL_WAIT
|
2482 PG_STATE_BACKFILLING
))) {
2483 dout(10) << __func__
<< " set" << dendl
;
2484 state_set(PG_STATE_FORCED_BACKFILL
);
2485 publish_stats_to_osd();
2488 } else if (state
& PG_STATE_FORCED_BACKFILL
) {
2489 dout(10) << __func__
<< " clear" << dendl
;
2490 state_clear(PG_STATE_FORCED_BACKFILL
);
2491 publish_stats_to_osd();
2495 dout(20) << __func__
<< " state " << pgstate_history
.get_current_state() << dendl
;
2496 osd
->local_reserver
.update_priority(info
.pgid
, get_backfill_priority());
2501 int PG::clamp_recovery_priority(int priority
, int pool_recovery_priority
, int max
)
2503 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
2504 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
2506 ceph_assert(max
<= OSD_RECOVERY_PRIORITY_MAX
);
2508 // User can't set this too high anymore, but might be a legacy value
2509 if (pool_recovery_priority
> OSD_POOL_PRIORITY_MAX
)
2510 pool_recovery_priority
= OSD_POOL_PRIORITY_MAX
;
2511 if (pool_recovery_priority
< OSD_POOL_PRIORITY_MIN
)
2512 pool_recovery_priority
= OSD_POOL_PRIORITY_MIN
;
2513 // Shift range from min to max to 0 to max - min
2514 pool_recovery_priority
+= (0 - OSD_POOL_PRIORITY_MIN
);
2515 ceph_assert(pool_recovery_priority
>= 0 && pool_recovery_priority
<= (OSD_POOL_PRIORITY_MAX
- OSD_POOL_PRIORITY_MIN
));
2517 priority
+= pool_recovery_priority
;
2519 // Clamp to valid range
2520 if (priority
> max
) {
2522 } else if (priority
< OSD_RECOVERY_PRIORITY_MIN
) {
2523 return OSD_RECOVERY_PRIORITY_MIN
;
2529 unsigned PG::get_recovery_priority()
2531 // a higher value -> a higher priority
2532 int ret
= OSD_RECOVERY_PRIORITY_BASE
;
2535 if (state
& PG_STATE_FORCED_RECOVERY
) {
2536 ret
= OSD_RECOVERY_PRIORITY_FORCED
;
2538 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
2539 if (is_degraded() && info
.stats
.avail_no_missing
.size() < pool
.info
.min_size
) {
2540 base
= OSD_RECOVERY_INACTIVE_PRIORITY_BASE
;
2541 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2542 ret
= base
+ (pool
.info
.min_size
- info
.stats
.avail_no_missing
.size());
2545 int64_t pool_recovery_priority
= 0;
2546 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2548 ret
= clamp_recovery_priority(ret
, pool_recovery_priority
, max_prio_map
[base
]);
2550 dout(20) << __func__
<< " recovery priority is " << ret
<< dendl
;
2551 return static_cast<unsigned>(ret
);
2554 unsigned PG::get_backfill_priority()
2556 // a higher value -> a higher priority
2557 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
2560 if (state
& PG_STATE_FORCED_BACKFILL
) {
2561 ret
= OSD_BACKFILL_PRIORITY_FORCED
;
2563 if (acting
.size() < pool
.info
.min_size
) {
2564 base
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
;
2565 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2566 ret
= base
+ (pool
.info
.min_size
- acting
.size());
2568 } else if (is_undersized()) {
2569 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2570 ceph_assert(pool
.info
.size
> actingset
.size());
2571 base
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
2572 ret
= base
+ (pool
.info
.size
- actingset
.size());
2574 } else if (is_degraded()) {
2575 // degraded: baseline degraded
2576 base
= ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
2579 // Adjust with pool's recovery priority
2580 int64_t pool_recovery_priority
= 0;
2581 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2583 ret
= clamp_recovery_priority(ret
, pool_recovery_priority
, max_prio_map
[base
]);
2586 dout(20) << __func__
<< " backfill priority is " << ret
<< dendl
;
2587 return static_cast<unsigned>(ret
);
2590 unsigned PG::get_delete_priority()
2592 auto state
= get_osdmap()->get_state(osd
->whoami
);
2593 if (state
& (CEPH_OSD_BACKFILLFULL
|
2595 return OSD_DELETE_PRIORITY_FULL
;
2596 } else if (state
& CEPH_OSD_NEARFULL
) {
2597 return OSD_DELETE_PRIORITY_FULLISH
;
2599 return OSD_DELETE_PRIORITY_NORMAL
;
2603 Context
*PG::finish_recovery()
2605 dout(10) << "finish_recovery" << dendl
;
2606 ceph_assert(info
.last_complete
== info
.last_update
);
2608 clear_recovery_state();
2611 * sync all this before purging strays. but don't block!
2613 finish_sync_event
= new C_PG_FinishRecovery(this);
2614 return finish_sync_event
;
2617 void PG::_finish_recovery(Context
*c
)
2620 // When recovery is initiated by a repair, that flag is left on
2621 state_clear(PG_STATE_REPAIR
);
2626 if (c
== finish_sync_event
) {
2627 dout(10) << "_finish_recovery" << dendl
;
2628 finish_sync_event
= 0;
2631 publish_stats_to_osd();
2633 if (scrub_after_recovery
) {
2634 dout(10) << "_finish_recovery requeueing for scrub" << dendl
;
2635 scrub_after_recovery
= false;
2636 scrubber
.must_deep_scrub
= true;
2637 scrubber
.check_repair
= true;
2641 dout(10) << "_finish_recovery -- stale" << dendl
;
2646 void PG::start_recovery_op(const hobject_t
& soid
)
2648 dout(10) << "start_recovery_op " << soid
2649 #ifdef DEBUG_RECOVERY_OIDS
2650 << " (" << recovering_oids
<< ")"
2653 ceph_assert(recovery_ops_active
>= 0);
2654 recovery_ops_active
++;
2655 #ifdef DEBUG_RECOVERY_OIDS
2656 recovering_oids
.insert(soid
);
2658 osd
->start_recovery_op(this, soid
);
2661 void PG::finish_recovery_op(const hobject_t
& soid
, bool dequeue
)
2663 dout(10) << "finish_recovery_op " << soid
2664 #ifdef DEBUG_RECOVERY_OIDS
2665 << " (" << recovering_oids
<< ")"
2668 ceph_assert(recovery_ops_active
> 0);
2669 recovery_ops_active
--;
2670 #ifdef DEBUG_RECOVERY_OIDS
2671 ceph_assert(recovering_oids
.count(soid
));
2672 recovering_oids
.erase(recovering_oids
.find(soid
));
2674 osd
->finish_recovery_op(this, soid
, dequeue
);
2681 void PG::split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
2683 child
->update_snap_mapper_bits(split_bits
);
2684 child
->update_osdmap_ref(get_osdmap());
2689 pg_log
.split_into(child_pgid
, split_bits
, &(child
->pg_log
));
2690 child
->info
.last_complete
= info
.last_complete
;
2692 info
.last_update
= pg_log
.get_head();
2693 child
->info
.last_update
= child
->pg_log
.get_head();
2695 child
->info
.last_user_version
= info
.last_user_version
;
2697 info
.log_tail
= pg_log
.get_tail();
2698 child
->info
.log_tail
= child
->pg_log
.get_tail();
2700 // reset last_complete, we might have modified pg_log & missing above
2701 pg_log
.reset_complete_to(&info
);
2702 child
->pg_log
.reset_complete_to(&child
->info
);
2705 child
->info
.history
= info
.history
;
2706 child
->info
.history
.epoch_created
= get_osdmap_epoch();
2707 child
->info
.purged_snaps
= info
.purged_snaps
;
2709 if (info
.last_backfill
.is_max()) {
2710 child
->info
.set_last_backfill(hobject_t::get_max());
2712 // restart backfill on parent and child to be safe. we could
2713 // probably do better in the bitwise sort case, but it's more
2714 // fragile (there may be special work to do on backfill completion
2716 info
.set_last_backfill(hobject_t());
2717 child
->info
.set_last_backfill(hobject_t());
2718 // restarting backfill implies that the missing set is empty,
2719 // since it is only used for objects prior to last_backfill
2720 pg_log
.reset_backfill();
2721 child
->pg_log
.reset_backfill();
2724 child
->info
.stats
= info
.stats
;
2725 child
->info
.stats
.parent_split_bits
= split_bits
;
2726 info
.stats
.stats_invalid
= true;
2727 child
->info
.stats
.stats_invalid
= true;
2728 child
->info
.last_epoch_started
= info
.last_epoch_started
;
2729 child
->info
.last_interval_started
= info
.last_interval_started
;
2731 child
->snap_trimq
= snap_trimq
;
2733 // There can't be recovery/backfill going on now
2734 int primary
, up_primary
;
2735 vector
<int> newup
, newacting
;
2736 get_osdmap()->pg_to_up_acting_osds(
2737 child
->info
.pgid
.pgid
, &newup
, &up_primary
, &newacting
, &primary
);
2738 child
->init_primary_up_acting(
2743 child
->role
= OSDMap::calc_pg_role(osd
->whoami
, child
->acting
);
2745 // this comparison includes primary rank via pg_shard_t
2746 if (get_primary() != child
->get_primary())
2747 child
->info
.history
.same_primary_since
= get_osdmap_epoch();
2749 child
->info
.stats
.up
= up
;
2750 child
->info
.stats
.up_primary
= up_primary
;
2751 child
->info
.stats
.acting
= acting
;
2752 child
->info
.stats
.acting_primary
= primary
;
2753 child
->info
.stats
.mapping_epoch
= get_osdmap_epoch();
2756 child
->past_intervals
= past_intervals
;
2758 _split_into(child_pgid
, child
, split_bits
);
2760 // release all backoffs for simplicity
2761 release_backoffs(hobject_t(), hobject_t::get_max());
2763 child
->on_new_interval();
2765 child
->send_notify
= !child
->is_primary();
2767 child
->dirty_info
= true;
2768 child
->dirty_big_info
= true;
2770 dirty_big_info
= true;
2773 void PG::start_split_stats(const set
<spg_t
>& childpgs
, vector
<object_stat_sum_t
> *out
)
2775 out
->resize(childpgs
.size() + 1);
2776 info
.stats
.stats
.sum
.split(*out
);
2779 void PG::finish_split_stats(const object_stat_sum_t
& stats
, ObjectStore::Transaction
*t
)
2781 info
.stats
.stats
.sum
= stats
;
2785 void PG::merge_from(map
<spg_t
,PGRef
>& sources
, RecoveryCtx
*rctx
,
2786 unsigned split_bits
,
2787 const pg_merge_meta_t
& last_pg_merge_meta
)
2789 dout(10) << __func__
<< " from " << sources
<< " split_bits " << split_bits
2791 bool incomplete
= false;
2792 if (info
.last_complete
!= info
.last_update
||
2793 info
.is_incomplete() ||
2795 dout(10) << __func__
<< " target incomplete" << dendl
;
2798 if (last_pg_merge_meta
.source_pgid
!= pg_t()) {
2799 if (info
.pgid
.pgid
!= last_pg_merge_meta
.source_pgid
.get_parent()) {
2800 dout(10) << __func__
<< " target doesn't match expected parent "
2801 << last_pg_merge_meta
.source_pgid
.get_parent()
2802 << " of source_pgid " << last_pg_merge_meta
.source_pgid
2806 if (info
.last_update
!= last_pg_merge_meta
.target_version
) {
2807 dout(10) << __func__
<< " target version doesn't match expected "
2808 << last_pg_merge_meta
.target_version
<< dendl
;
2813 PGLogEntryHandler handler
{this, rctx
->transaction
};
2814 pg_log
.roll_forward(&handler
);
2816 info
.last_complete
= info
.last_update
; // to fake out trim()
2817 pg_log
.reset_recovery_pointers();
2818 pg_log
.trim(info
.last_update
, info
);
2820 vector
<PGLog
*> log_from
;
2821 for (auto& i
: sources
) {
2822 auto& source
= i
.second
;
2824 dout(10) << __func__
<< " source " << i
.first
<< " missing" << dendl
;
2828 if (source
->info
.last_complete
!= source
->info
.last_update
||
2829 source
->info
.is_incomplete() ||
2830 source
->info
.dne()) {
2831 dout(10) << __func__
<< " source " << source
->pg_id
<< " incomplete"
2835 if (last_pg_merge_meta
.source_pgid
!= pg_t()) {
2836 if (source
->info
.pgid
.pgid
!= last_pg_merge_meta
.source_pgid
) {
2837 dout(10) << __func__
<< " source " << source
->info
.pgid
.pgid
2838 << " doesn't match expected source pgid "
2839 << last_pg_merge_meta
.source_pgid
<< dendl
;
2842 if (source
->info
.last_update
!= last_pg_merge_meta
.source_version
) {
2843 dout(10) << __func__
<< " source version doesn't match expected "
2844 << last_pg_merge_meta
.target_version
<< dendl
;
2850 PGLogEntryHandler handler
{source
.get(), rctx
->transaction
};
2851 source
->pg_log
.roll_forward(&handler
);
2852 source
->info
.last_complete
= source
->info
.last_update
; // to fake out trim()
2853 source
->pg_log
.reset_recovery_pointers();
2854 source
->pg_log
.trim(source
->info
.last_update
, source
->info
);
2855 log_from
.push_back(&source
->pg_log
);
2857 // wipe out source's pgmeta
2858 rctx
->transaction
->remove(source
->coll
, source
->pgmeta_oid
);
2860 // merge (and destroy source collection)
2861 rctx
->transaction
->merge_collection(source
->coll
, coll
, split_bits
);
2864 info
.stats
.add(source
->info
.stats
);
2866 // pull up last_update
2867 info
.last_update
= std::max(info
.last_update
, source
->info
.last_update
);
2869 // adopt source's PastIntervals if target has none. we can do this since
2870 // pgp_num has been reduced prior to the merge, so the OSD mappings for
2871 // the PGs are identical.
2872 if (past_intervals
.empty() && !source
->past_intervals
.empty()) {
2873 dout(10) << __func__
<< " taking source's past_intervals" << dendl
;
2874 past_intervals
= source
->past_intervals
;
2878 // merge_collection does this, but maybe all of our sources were missing.
2879 rctx
->transaction
->collection_set_bits(coll
, split_bits
);
2881 info
.last_complete
= info
.last_update
;
2882 info
.log_tail
= info
.last_update
;
2884 info
.last_backfill
= hobject_t();
2887 snap_mapper
.update_bits(split_bits
);
2890 pg_log
.merge_from(log_from
, info
.last_update
);
2892 // make sure we have a meaningful last_epoch_started/clean (if we were a
2894 if (info
.history
.epoch_created
== 0) {
2895 // start with (a) source's history, since these PGs *should* have been
2896 // remapped in concert with each other...
2897 info
.history
= sources
.begin()->second
->info
.history
;
2899 // we use the last_epoch_{started,clean} we got from
2900 // the caller, which are the epochs that were reported by the PGs were
2901 // found to be ready for merge.
2902 info
.history
.last_epoch_clean
= last_pg_merge_meta
.last_epoch_clean
;
2903 info
.history
.last_epoch_started
= last_pg_merge_meta
.last_epoch_started
;
2904 info
.last_epoch_started
= last_pg_merge_meta
.last_epoch_started
;
2905 dout(10) << __func__
2906 << " set les/c to " << last_pg_merge_meta
.last_epoch_started
<< "/"
2907 << last_pg_merge_meta
.last_epoch_clean
2908 << " from pool last_dec_*, source pg history was "
2909 << sources
.begin()->second
->info
.history
2912 // if the past_intervals start is later than last_epoch_clean, it
2913 // implies the source repeered again but the target didn't, or
2914 // that the source became clean in a later epoch than the target.
2915 // avoid the discrepancy but adjusting the interval start
2916 // backwards to match so that check_past_interval_bounds() will
2918 auto pib
= past_intervals
.get_bounds();
2919 if (info
.history
.last_epoch_clean
< pib
.first
) {
2920 dout(10) << __func__
<< " last_epoch_clean "
2921 << info
.history
.last_epoch_clean
<< " < past_interval start "
2922 << pib
.first
<< ", adjusting start backwards" << dendl
;
2923 past_intervals
.adjust_start_backwards(info
.history
.last_epoch_clean
);
2926 // Similarly, if the same_interval_since value is later than
2927 // last_epoch_clean, the next interval change will result in a
2928 // past_interval start that is later than last_epoch_clean. This
2929 // can happen if we use the pg_history values from the merge
2930 // source. Adjust the same_interval_since value backwards if that
2931 // happens. (We trust the les and lec values more because they came from
2932 // the real target, whereas the history value we stole from the source.)
2933 if (info
.history
.last_epoch_started
< info
.history
.same_interval_since
) {
2934 dout(10) << __func__
<< " last_epoch_started "
2935 << info
.history
.last_epoch_started
<< " < same_interval_since "
2936 << info
.history
.same_interval_since
2937 << ", adjusting pg_history backwards" << dendl
;
2938 info
.history
.same_interval_since
= info
.history
.last_epoch_clean
;
2939 // make sure same_{up,primary}_since are <= same_interval_since
2940 info
.history
.same_up_since
= std::min(
2941 info
.history
.same_up_since
, info
.history
.same_interval_since
);
2942 info
.history
.same_primary_since
= std::min(
2943 info
.history
.same_primary_since
, info
.history
.same_interval_since
);
2948 dirty_big_info
= true;
2951 void PG::add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
)
2953 ConnectionRef con
= s
->con
;
2954 if (!con
) // OSD::ms_handle_reset clears s->con without a lock
2956 BackoffRef
b(s
->have_backoff(info
.pgid
, begin
));
2958 derr
<< __func__
<< " already have backoff for " << s
<< " begin " << begin
2959 << " " << *b
<< dendl
;
2962 std::lock_guard
l(backoff_lock
);
2964 b
= new Backoff(info
.pgid
, this, s
, ++s
->backoff_seq
, begin
, end
);
2965 backoffs
[begin
].insert(b
);
2967 dout(10) << __func__
<< " session " << s
<< " added " << *b
<< dendl
;
2973 CEPH_OSD_BACKOFF_OP_BLOCK
,
2979 void PG::release_backoffs(const hobject_t
& begin
, const hobject_t
& end
)
2981 dout(10) << __func__
<< " [" << begin
<< "," << end
<< ")" << dendl
;
2982 vector
<BackoffRef
> bv
;
2984 std::lock_guard
l(backoff_lock
);
2985 auto p
= backoffs
.lower_bound(begin
);
2986 while (p
!= backoffs
.end()) {
2987 int r
= cmp(p
->first
, end
);
2988 dout(20) << __func__
<< " ? " << r
<< " " << p
->first
2989 << " " << p
->second
<< dendl
;
2990 // note: must still examine begin=end=p->first case
2991 if (r
> 0 || (r
== 0 && begin
< end
)) {
2994 dout(20) << __func__
<< " checking " << p
->first
2995 << " " << p
->second
<< dendl
;
2996 auto q
= p
->second
.begin();
2997 while (q
!= p
->second
.end()) {
2998 dout(20) << __func__
<< " checking " << *q
<< dendl
;
2999 int r
= cmp((*q
)->begin
, begin
);
3000 if (r
== 0 || (r
> 0 && (*q
)->end
< end
)) {
3002 q
= p
->second
.erase(q
);
3007 if (p
->second
.empty()) {
3008 p
= backoffs
.erase(p
);
3015 std::lock_guard
l(b
->lock
);
3016 dout(10) << __func__
<< " " << *b
<< dendl
;
3018 ceph_assert(b
->pg
== this);
3019 ConnectionRef con
= b
->session
->con
;
3020 if (con
) { // OSD::ms_handle_reset clears s->con without a lock
3025 CEPH_OSD_BACKOFF_OP_UNBLOCK
,
3031 b
->state
= Backoff::STATE_DELETING
;
3033 b
->session
->rm_backoff(b
);
3041 void PG::clear_backoffs()
3043 dout(10) << __func__
<< " " << dendl
;
3044 map
<hobject_t
,set
<BackoffRef
>> ls
;
3046 std::lock_guard
l(backoff_lock
);
3049 for (auto& p
: ls
) {
3050 for (auto& b
: p
.second
) {
3051 std::lock_guard
l(b
->lock
);
3052 dout(10) << __func__
<< " " << *b
<< dendl
;
3054 ceph_assert(b
->pg
== this);
3056 b
->state
= Backoff::STATE_DELETING
;
3058 b
->session
->rm_backoff(b
);
3067 // called by Session::clear_backoffs()
3068 void PG::rm_backoff(BackoffRef b
)
3070 dout(10) << __func__
<< " " << *b
<< dendl
;
3071 std::lock_guard
l(backoff_lock
);
3072 ceph_assert(b
->lock
.is_locked_by_me());
3073 ceph_assert(b
->pg
== this);
3074 auto p
= backoffs
.find(b
->begin
);
3075 // may race with release_backoffs()
3076 if (p
!= backoffs
.end()) {
3077 auto q
= p
->second
.find(b
);
3078 if (q
!= p
->second
.end()) {
3080 if (p
->second
.empty()) {
3087 void PG::clear_recovery_state()
3089 dout(10) << "clear_recovery_state" << dendl
;
3091 pg_log
.reset_recovery_pointers();
3092 finish_sync_event
= 0;
3095 while (recovery_ops_active
> 0) {
3096 #ifdef DEBUG_RECOVERY_OIDS
3097 soid
= *recovering_oids
.begin();
3099 finish_recovery_op(soid
, true);
3102 async_recovery_targets
.clear();
3103 backfill_targets
.clear();
3104 backfill_info
.clear();
3105 peer_backfill_info
.clear();
3106 waiting_on_backfill
.clear();
3107 _clear_recovery_state(); // pg impl specific hook
3110 void PG::cancel_recovery()
3112 dout(10) << "cancel_recovery" << dendl
;
3113 clear_recovery_state();
3117 void PG::purge_strays()
3119 if (is_premerge()) {
3120 dout(10) << "purge_strays " << stray_set
<< " but premerge, doing nothing"
3124 if (cct
->_conf
.get_val
<bool>("osd_debug_no_purge_strays")) {
3127 dout(10) << "purge_strays " << stray_set
<< dendl
;
3129 bool removed
= false;
3130 for (set
<pg_shard_t
>::iterator p
= stray_set
.begin();
3131 p
!= stray_set
.end();
3133 ceph_assert(!is_acting_recovery_backfill(*p
));
3134 if (get_osdmap()->is_up(p
->osd
)) {
3135 dout(10) << "sending PGRemove to osd." << *p
<< dendl
;
3136 vector
<spg_t
> to_remove
;
3137 to_remove
.push_back(spg_t(info
.pgid
.pgid
, p
->shard
));
3138 MOSDPGRemove
*m
= new MOSDPGRemove(
3141 osd
->send_message_osd_cluster(p
->osd
, m
, get_osdmap_epoch());
3143 dout(10) << "not sending PGRemove to down osd." << *p
<< dendl
;
3145 peer_missing
.erase(*p
);
3146 peer_info
.erase(*p
);
3147 peer_purged
.insert(*p
);
3151 // if we removed anyone, update peers (which include peer_info)
3153 update_heartbeat_peers();
3157 // clear _requested maps; we may have to peer() again if we discover
3158 // (more) stray content
3159 peer_log_requested
.clear();
3160 peer_missing_requested
.clear();
3163 void PG::set_probe_targets(const set
<pg_shard_t
> &probe_set
)
3165 std::lock_guard
l(heartbeat_peer_lock
);
3166 probe_targets
.clear();
3167 for (set
<pg_shard_t
>::iterator i
= probe_set
.begin();
3168 i
!= probe_set
.end();
3170 probe_targets
.insert(i
->osd
);
3174 void PG::clear_probe_targets()
3176 std::lock_guard
l(heartbeat_peer_lock
);
3177 probe_targets
.clear();
3180 void PG::update_heartbeat_peers()
3182 ceph_assert(is_locked());
3188 for (unsigned i
=0; i
<acting
.size(); i
++) {
3189 if (acting
[i
] != CRUSH_ITEM_NONE
)
3190 new_peers
.insert(acting
[i
]);
3192 for (unsigned i
=0; i
<up
.size(); i
++) {
3193 if (up
[i
] != CRUSH_ITEM_NONE
)
3194 new_peers
.insert(up
[i
]);
3196 for (map
<pg_shard_t
,pg_info_t
>::iterator p
= peer_info
.begin();
3197 p
!= peer_info
.end();
3199 new_peers
.insert(p
->first
.osd
);
3201 bool need_update
= false;
3202 heartbeat_peer_lock
.Lock();
3203 if (new_peers
== heartbeat_peers
) {
3204 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " unchanged" << dendl
;
3206 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " -> " << new_peers
<< dendl
;
3207 heartbeat_peers
.swap(new_peers
);
3210 heartbeat_peer_lock
.Unlock();
3213 osd
->need_heartbeat_peer_update();
3217 bool PG::check_in_progress_op(
3218 const osd_reqid_t
&r
,
3219 eversion_t
*version
,
3220 version_t
*user_version
,
3221 int *return_code
) const
3224 projected_log
.get_request(r
, version
, user_version
, return_code
) ||
3225 pg_log
.get_log().get_request(r
, version
, user_version
, return_code
));
3228 static bool find_shard(const set
<pg_shard_t
> & pgs
, shard_id_t shard
)
3231 if (p
.shard
== shard
)
3236 static pg_shard_t
get_another_shard(const set
<pg_shard_t
> & pgs
, pg_shard_t skip
, shard_id_t shard
)
3238 for (auto&p
: pgs
) {
3241 if (p
.shard
== shard
)
3244 return pg_shard_t();
3247 void PG::_update_calc_stats()
3249 info
.stats
.version
= info
.last_update
;
3250 info
.stats
.created
= info
.history
.epoch_created
;
3251 info
.stats
.last_scrub
= info
.history
.last_scrub
;
3252 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
3253 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
3254 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
3255 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
3256 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
3258 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
3259 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
3260 info
.stats
.log_start
= pg_log
.get_tail();
3261 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
3262 info
.stats
.snaptrimq_len
= snap_trimq
.size();
3264 unsigned num_shards
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
3266 // In rare case that upset is too large (usually transient), use as target
3267 // for calculations below.
3268 unsigned target
= std::max(num_shards
, (unsigned)upset
.size());
3269 // For undersized actingset may be larger with OSDs out
3270 unsigned nrep
= std::max(actingset
.size(), upset
.size());
3271 // calc num_object_copies
3272 info
.stats
.stats
.calc_copies(std::max(target
, nrep
));
3273 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
3274 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
3275 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
3276 info
.stats
.avail_no_missing
.clear();
3277 info
.stats
.object_location_counts
.clear();
3279 // We should never hit this condition, but if end up hitting it,
3280 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3281 if (info
.stats
.stats
.sum
.num_objects
< 0) {
3282 dout(0) << __func__
<< " negative num_objects = "
3283 << info
.stats
.stats
.sum
.num_objects
<< " setting it to 0 "
3285 info
.stats
.stats
.sum
.num_objects
= 0;
3286 state_set(PG_STATE_INCONSISTENT
);
3289 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
3290 dout(20) << __func__
<< " actingset " << actingset
<< " upset "
3291 << upset
<< " acting_recovery_backfill " << acting_recovery_backfill
<< dendl
;
3292 dout(20) << __func__
<< " acting " << acting
<< " up " << up
<< dendl
;
3294 ceph_assert(!acting_recovery_backfill
.empty());
3296 bool estimate
= false;
3298 // NOTE: we only generate degraded, misplaced and unfound
3299 // values for the summation, not individual stat categories.
3300 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
3302 // Objects missing from up nodes, sorted by # objects.
3303 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> missing_target_objects
;
3304 // Objects missing from nodes not in up, sort by # objects
3305 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> acting_source_objects
;
3307 // Fill missing_target_objects/acting_source_objects
3313 missing
= pg_log
.get_missing().num_missing();
3314 ceph_assert(acting_recovery_backfill
.count(pg_whoami
));
3315 if (upset
.count(pg_whoami
)) {
3316 missing_target_objects
.insert(make_pair(missing
, pg_whoami
));
3318 acting_source_objects
.insert(make_pair(missing
, pg_whoami
));
3320 info
.stats
.stats
.sum
.num_objects_missing_on_primary
= missing
;
3322 info
.stats
.avail_no_missing
.push_back(pg_whoami
);
3323 dout(20) << __func__
<< " shard " << pg_whoami
3324 << " primary objects " << num_objects
3325 << " missing " << missing
3330 for (auto& peer
: peer_info
) {
3331 // Primary should not be in the peer_info, skip if it is.
3332 if (peer
.first
== pg_whoami
) continue;
3333 int64_t missing
= 0;
3334 int64_t peer_num_objects
= peer
.second
.stats
.stats
.sum
.num_objects
;
3335 // Backfill targets always track num_objects accurately
3336 // all other peers track missing accurately.
3337 if (is_backfill_targets(peer
.first
)) {
3338 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3340 if (peer_missing
.count(peer
.first
)) {
3341 missing
= peer_missing
[peer
.first
].num_missing();
3343 dout(20) << __func__
<< " no peer_missing found for " << peer
.first
<< dendl
;
3344 if (is_recovering()) {
3347 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3350 if (upset
.count(peer
.first
)) {
3351 missing_target_objects
.insert(make_pair(missing
, peer
.first
));
3352 } else if (actingset
.count(peer
.first
)) {
3353 acting_source_objects
.insert(make_pair(missing
, peer
.first
));
3355 peer
.second
.stats
.stats
.sum
.num_objects_missing
= missing
;
3357 info
.stats
.avail_no_missing
.push_back(peer
.first
);
3358 dout(20) << __func__
<< " shard " << peer
.first
3359 << " objects " << peer_num_objects
3360 << " missing " << missing
3364 // Compute object_location_counts
3365 for (auto& ml
: missing_loc
.get_missing_locs()) {
3366 info
.stats
.object_location_counts
[ml
.second
]++;
3367 dout(30) << __func__
<< " " << ml
.first
<< " object_location_counts["
3368 << ml
.second
<< "]=" << info
.stats
.object_location_counts
[ml
.second
]
3371 int64_t not_missing
= num_objects
- missing_loc
.get_missing_locs().size();
3373 // During recovery we know upset == actingset and is being populated
3374 // During backfill we know that all non-missing objects are in the actingset
3375 info
.stats
.object_location_counts
[actingset
] = not_missing
;
3377 dout(30) << __func__
<< " object_location_counts["
3378 << upset
<< "]=" << info
.stats
.object_location_counts
[upset
]
3380 dout(20) << __func__
<< " object_location_counts "
3381 << info
.stats
.object_location_counts
<< dendl
;
3383 // A misplaced object is not stored on the correct OSD
3384 int64_t misplaced
= 0;
3385 // a degraded objects has fewer replicas or EC shards than the pool specifies.
3386 int64_t degraded
= 0;
3388 if (is_recovering()) {
3389 for (auto& sml
: missing_loc
.get_missing_by_count()) {
3390 for (auto& ml
: sml
.second
) {
3392 if (sml
.first
== shard_id_t::NO_SHARD
) {
3393 dout(20) << __func__
<< " ml " << ml
.second
<< " upset size " << upset
.size() << " up " << ml
.first
.up
<< dendl
;
3394 missing_shards
= (int)upset
.size() - ml
.first
.up
;
3396 // Handle shards not even in upset below
3397 if (!find_shard(upset
, sml
.first
))
3399 missing_shards
= std::max(0, 1 - ml
.first
.up
);
3400 dout(20) << __func__
<< " shard " << sml
.first
<< " ml " << ml
.second
<< " missing shards " << missing_shards
<< dendl
;
3402 int odegraded
= ml
.second
* missing_shards
;
3403 // Copies on other osds but limited to the possible degraded
3404 int more_osds
= std::min(missing_shards
, ml
.first
.other
);
3405 int omisplaced
= ml
.second
* more_osds
;
3406 ceph_assert(omisplaced
<= odegraded
);
3407 odegraded
-= omisplaced
;
3409 misplaced
+= omisplaced
;
3410 degraded
+= odegraded
;
3414 dout(20) << __func__
<< " missing based degraded " << degraded
<< dendl
;
3415 dout(20) << __func__
<< " missing based misplaced " << misplaced
<< dendl
;
3417 // Handle undersized case
3418 if (pool
.info
.is_replicated()) {
3419 // Add degraded for missing targets (num_objects missing)
3420 ceph_assert(target
>= upset
.size());
3421 unsigned needed
= target
- upset
.size();
3422 degraded
+= num_objects
* needed
;
3424 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
3425 shard_id_t
shard(i
);
3427 if (!find_shard(upset
, shard
)) {
3428 pg_shard_t pgs
= get_another_shard(actingset
, pg_shard_t(), shard
);
3430 if (pgs
!= pg_shard_t()) {
3433 if (pgs
== pg_whoami
)
3434 missing
= info
.stats
.stats
.sum
.num_objects_missing_on_primary
;
3436 missing
= peer_info
[pgs
].stats
.stats
.sum
.num_objects_missing
;
3438 degraded
+= missing
;
3439 misplaced
+= std::max((int64_t)0, num_objects
- missing
);
3441 // No shard anywhere
3442 degraded
+= num_objects
;
3450 // Handle undersized case
3451 if (pool
.info
.is_replicated()) {
3452 // Add to missing_target_objects
3453 ceph_assert(target
>= missing_target_objects
.size());
3454 unsigned needed
= target
- missing_target_objects
.size();
3456 missing_target_objects
.insert(make_pair(num_objects
* needed
, pg_shard_t(pg_shard_t::NO_OSD
)));
3458 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
3459 shard_id_t
shard(i
);
3461 for (const auto& t
: missing_target_objects
) {
3462 if (std::get
<1>(t
).shard
== shard
) {
3468 missing_target_objects
.insert(make_pair(num_objects
, pg_shard_t(pg_shard_t::NO_OSD
,shard
)));
3472 for (const auto& item
: missing_target_objects
)
3473 dout(20) << __func__
<< " missing shard " << std::get
<1>(item
) << " missing= " << std::get
<0>(item
) << dendl
;
3474 for (const auto& item
: acting_source_objects
)
3475 dout(20) << __func__
<< " acting shard " << std::get
<1>(item
) << " missing= " << std::get
<0>(item
) << dendl
;
3477 // Handle all objects not in missing for remapped
3479 for (auto m
= missing_target_objects
.rbegin();
3480 m
!= missing_target_objects
.rend(); ++m
) {
3482 int64_t extra_missing
= -1;
3484 if (pool
.info
.is_replicated()) {
3485 if (!acting_source_objects
.empty()) {
3486 auto extra_copy
= acting_source_objects
.begin();
3487 extra_missing
= std::get
<0>(*extra_copy
);
3488 acting_source_objects
.erase(extra_copy
);
3490 } else { // Erasure coded
3491 // Use corresponding shard
3492 for (const auto& a
: acting_source_objects
) {
3493 if (std::get
<1>(a
).shard
== std::get
<1>(*m
).shard
) {
3494 extra_missing
= std::get
<0>(a
);
3495 acting_source_objects
.erase(a
);
3501 if (extra_missing
>= 0 && std::get
<0>(*m
) >= extra_missing
) {
3502 // We don't know which of the objects on the target
3503 // are part of extra_missing so assume are all degraded.
3504 misplaced
+= std::get
<0>(*m
) - extra_missing
;
3505 degraded
+= extra_missing
;
3507 // 1. extra_missing == -1, more targets than sources so degraded
3508 // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
3509 // previously degraded are now present on the target.
3510 degraded
+= std::get
<0>(*m
);
3513 // If there are still acting that haven't been accounted for
3514 // then they are misplaced
3515 for (const auto& a
: acting_source_objects
) {
3516 int64_t extra_misplaced
= std::max((int64_t)0, num_objects
- std::get
<0>(a
));
3517 dout(20) << __func__
<< " extra acting misplaced " << extra_misplaced
<< dendl
;
3518 misplaced
+= extra_misplaced
;
3521 // NOTE: Tests use these messages to verify this code
3522 dout(20) << __func__
<< " degraded " << degraded
<< (estimate
? " (est)": "") << dendl
;
3523 dout(20) << __func__
<< " misplaced " << misplaced
<< (estimate
? " (est)": "")<< dendl
;
3525 info
.stats
.stats
.sum
.num_objects_degraded
= degraded
;
3526 info
.stats
.stats
.sum
.num_objects_unfound
= get_num_unfound();
3527 info
.stats
.stats
.sum
.num_objects_misplaced
= misplaced
;
3531 void PG::_update_blocked_by()
3533 // set a max on the number of blocking peers we report. if we go
3534 // over, report a random subset. keep the result sorted.
3535 unsigned keep
= std::min
<unsigned>(blocked_by
.size(), cct
->_conf
->osd_max_pg_blocked_by
);
3536 unsigned skip
= blocked_by
.size() - keep
;
3537 info
.stats
.blocked_by
.clear();
3538 info
.stats
.blocked_by
.resize(keep
);
3540 for (set
<int>::iterator p
= blocked_by
.begin();
3541 p
!= blocked_by
.end() && keep
> 0;
3543 if (skip
> 0 && (rand() % (skip
+ keep
) < skip
)) {
3546 info
.stats
.blocked_by
[pos
++] = *p
;
3552 void PG::publish_stats_to_osd()
3557 pg_stats_publish_lock
.Lock();
3559 if (info
.stats
.stats
.sum
.num_scrub_errors
)
3560 state_set(PG_STATE_INCONSISTENT
);
3562 state_clear(PG_STATE_INCONSISTENT
);
3563 state_clear(PG_STATE_FAILED_REPAIR
);
3566 utime_t now
= ceph_clock_now();
3567 if (info
.stats
.state
!= state
) {
3568 info
.stats
.last_change
= now
;
3569 // Optimistic estimation, if we just find out an inactive PG,
3570 // assumt it is active till now.
3571 if (!(state
& PG_STATE_ACTIVE
) &&
3572 (info
.stats
.state
& PG_STATE_ACTIVE
))
3573 info
.stats
.last_active
= now
;
3575 if ((state
& PG_STATE_ACTIVE
) &&
3576 !(info
.stats
.state
& PG_STATE_ACTIVE
))
3577 info
.stats
.last_became_active
= now
;
3578 if ((state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)) &&
3579 !(info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)))
3580 info
.stats
.last_became_peered
= now
;
3581 info
.stats
.state
= state
;
3584 _update_calc_stats();
3585 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
3586 state_set(PG_STATE_DEGRADED
);
3588 state_clear(PG_STATE_DEGRADED
);
3590 _update_blocked_by();
3592 pg_stat_t pre_publish
= info
.stats
;
3593 pre_publish
.stats
.add(unstable_stats
);
3594 utime_t cutoff
= now
;
3595 cutoff
-= cct
->_conf
->osd_pg_stat_report_interval_max
;
3597 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_MIMIC
) {
3598 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3599 // because we don't want to make the pg_stat_t structures too expensive.
3600 unsigned max
= cct
->_conf
->osd_max_snap_prune_intervals_per_epoch
;
3602 auto i
= info
.purged_snaps
.begin();
3603 while (num
< max
&& i
!= info
.purged_snaps
.end()) {
3604 pre_publish
.purged_snaps
.insert(i
.get_start(), i
.get_len());
3608 dout(20) << __func__
<< " reporting purged_snaps "
3609 << pre_publish
.purged_snaps
<< dendl
;
3612 if (pg_stats_publish_valid
&& pre_publish
== pg_stats_publish
&&
3613 info
.stats
.last_fresh
> cutoff
) {
3614 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3615 << ": no change since " << info
.stats
.last_fresh
<< dendl
;
3617 // update our stat summary and timestamps
3618 info
.stats
.reported_epoch
= get_osdmap_epoch();
3619 ++info
.stats
.reported_seq
;
3621 info
.stats
.last_fresh
= now
;
3623 if (info
.stats
.state
& PG_STATE_CLEAN
)
3624 info
.stats
.last_clean
= now
;
3625 if (info
.stats
.state
& PG_STATE_ACTIVE
)
3626 info
.stats
.last_active
= now
;
3627 if (info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
))
3628 info
.stats
.last_peered
= now
;
3629 info
.stats
.last_unstale
= now
;
3630 if ((info
.stats
.state
& PG_STATE_DEGRADED
) == 0)
3631 info
.stats
.last_undegraded
= now
;
3632 if ((info
.stats
.state
& PG_STATE_UNDERSIZED
) == 0)
3633 info
.stats
.last_fullsized
= now
;
3635 pg_stats_publish_valid
= true;
3636 pg_stats_publish
= pre_publish
;
3638 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3639 << ":" << pg_stats_publish
.reported_seq
<< dendl
;
3641 pg_stats_publish_lock
.Unlock();
3644 void PG::clear_publish_stats()
3646 dout(15) << "clear_stats" << dendl
;
3647 pg_stats_publish_lock
.Lock();
3648 pg_stats_publish_valid
= false;
3649 pg_stats_publish_lock
.Unlock();
3653 * initialize a newly instantiated pg
3655 * Initialize PG state, as when a PG is initially created, or when it
3656 * is first instantiated on the current node.
3658 * @param role our role/rank
3659 * @param newup up set
3660 * @param newacting acting set
3661 * @param history pg history
3662 * @param pi past_intervals
3663 * @param backfill true if info should be marked as backfill
3664 * @param t transaction to write out our new state in
3668 const vector
<int>& newup
, int new_up_primary
,
3669 const vector
<int>& newacting
, int new_acting_primary
,
3670 const pg_history_t
& history
,
3671 const PastIntervals
& pi
,
3673 ObjectStore::Transaction
*t
)
3675 dout(10) << "init role " << role
<< " up " << newup
<< " acting " << newacting
3676 << " history " << history
3677 << " past_intervals " << pi
3681 init_primary_up_acting(
3685 new_acting_primary
);
3687 info
.history
= history
;
3688 past_intervals
= pi
;
3691 info
.stats
.up_primary
= new_up_primary
;
3692 info
.stats
.acting
= acting
;
3693 info
.stats
.acting_primary
= new_acting_primary
;
3694 info
.stats
.mapping_epoch
= info
.history
.same_interval_since
;
3697 dout(10) << __func__
<< ": Setting backfill" << dendl
;
3698 info
.set_last_backfill(hobject_t());
3699 info
.last_complete
= info
.last_update
;
3700 pg_log
.mark_log_for_rewrite();
3706 dirty_big_info
= true;
3718 #pragma GCC diagnostic ignored "-Wpragmas"
3719 #pragma GCC diagnostic push
3720 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3722 void PG::upgrade(ObjectStore
*store
)
3724 dout(0) << __func__
<< " " << info_struct_v
<< " -> " << latest_struct_v
3726 ceph_assert(info_struct_v
<= 10);
3727 ObjectStore::Transaction t
;
3729 // <do upgrade steps here>
3731 // finished upgrade!
3732 ceph_assert(info_struct_v
== 10);
3734 // update infover_key
3735 if (info_struct_v
< latest_struct_v
) {
3736 map
<string
,bufferlist
> v
;
3737 __u8 ver
= latest_struct_v
;
3738 encode(ver
, v
[infover_key
]);
3739 t
.omap_setkeys(coll
, pgmeta_oid
, v
);
3743 dirty_big_info
= true;
3746 ObjectStore::CollectionHandle ch
= store
->open_collection(coll
);
3747 int r
= store
->queue_transaction(ch
, std::move(t
));
3749 derr
<< __func__
<< ": queue_transaction returned "
3750 << cpp_strerror(r
) << dendl
;
3753 ceph_assert(r
== 0);
3756 if (!ch
->flush_commit(&waiter
)) {
3761 #pragma GCC diagnostic pop
3762 #pragma GCC diagnostic warning "-Wpragmas"
3764 int PG::_prepare_write_info(CephContext
* cct
,
3765 map
<string
,bufferlist
> *km
,
3767 pg_info_t
&info
, pg_info_t
&last_written_info
,
3768 PastIntervals
&past_intervals
,
3769 bool dirty_big_info
,
3772 PerfCounters
*logger
)
3775 encode(epoch
, (*km
)[epoch_key
]);
3779 logger
->inc(l_osd_pg_info
);
3781 // try to do info efficiently?
3782 if (!dirty_big_info
&& try_fast_info
&&
3783 info
.last_update
> last_written_info
.last_update
) {
3784 pg_fast_info_t fast
;
3785 fast
.populate_from(info
);
3786 bool did
= fast
.try_apply_to(&last_written_info
);
3787 ceph_assert(did
); // we verified last_update increased above
3788 if (info
== last_written_info
) {
3789 encode(fast
, (*km
)[fastinfo_key
]);
3791 logger
->inc(l_osd_pg_fastinfo
);
3794 generic_dout(30) << __func__
<< " fastinfo failed, info:\n";
3796 JSONFormatter
jf(true);
3797 jf
.dump_object("info", info
);
3801 *_dout
<< "\nlast_written_info:\n";
3802 JSONFormatter
jf(true);
3803 jf
.dump_object("last_written_info", last_written_info
);
3808 last_written_info
= info
;
3810 // info. store purged_snaps separately.
3811 interval_set
<snapid_t
> purged_snaps
;
3812 purged_snaps
.swap(info
.purged_snaps
);
3813 encode(info
, (*km
)[info_key
]);
3814 purged_snaps
.swap(info
.purged_snaps
);
3816 if (dirty_big_info
) {
3817 // potentially big stuff
3818 bufferlist
& bigbl
= (*km
)[biginfo_key
];
3819 encode(past_intervals
, bigbl
);
3820 encode(info
.purged_snaps
, bigbl
);
3821 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3823 logger
->inc(l_osd_pg_biginfo
);
3829 void PG::_create(ObjectStore::Transaction
& t
, spg_t pgid
, int bits
)
3832 t
.create_collection(coll
, bits
);
3835 void PG::_init(ObjectStore::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
)
3840 // Give a hint to the PG collection
3842 uint32_t pg_num
= pool
->get_pg_num();
3843 uint64_t expected_num_objects_pg
= pool
->expected_num_objects
/ pg_num
;
3844 encode(pg_num
, hint
);
3845 encode(expected_num_objects_pg
, hint
);
3846 uint32_t hint_type
= ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
;
3847 t
.collection_hint(coll
, hint_type
, hint
);
3850 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3851 t
.touch(coll
, pgmeta_oid
);
3852 map
<string
,bufferlist
> values
;
3853 __u8 struct_v
= latest_struct_v
;
3854 encode(struct_v
, values
[infover_key
]);
3855 t
.omap_setkeys(coll
, pgmeta_oid
, values
);
3858 void PG::prepare_write_info(map
<string
,bufferlist
> *km
)
3860 info
.stats
.stats
.add(unstable_stats
);
3861 unstable_stats
.clear();
3863 bool need_update_epoch
= last_epoch
< get_osdmap_epoch();
3864 int ret
= _prepare_write_info(cct
, km
, get_osdmap_epoch(),
3868 dirty_big_info
, need_update_epoch
,
3869 cct
->_conf
->osd_fast_info
,
3871 ceph_assert(ret
== 0);
3872 if (need_update_epoch
)
3873 last_epoch
= get_osdmap_epoch();
3874 last_persisted_osdmap
= last_epoch
;
3877 dirty_big_info
= false;
3880 #pragma GCC diagnostic ignored "-Wpragmas"
3881 #pragma GCC diagnostic push
3882 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3884 bool PG::_has_removal_flag(ObjectStore
*store
,
3888 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3890 // first try new way
3892 keys
.insert("_remove");
3893 map
<string
,bufferlist
> values
;
3894 auto ch
= store
->open_collection(coll
);
3896 if (store
->omap_get_values(ch
, pgmeta_oid
, keys
, &values
) == 0 &&
3903 int PG::peek_map_epoch(ObjectStore
*store
,
3908 ghobject_t
legacy_infos_oid(OSD::make_infos_oid());
3909 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3910 epoch_t cur_epoch
= 0;
3912 // validate collection name
3913 ceph_assert(coll
.is_pg());
3917 keys
.insert(infover_key
);
3918 keys
.insert(epoch_key
);
3919 map
<string
,bufferlist
> values
;
3920 auto ch
= store
->open_collection(coll
);
3922 int r
= store
->omap_get_values(ch
, pgmeta_oid
, keys
, &values
);
3924 ceph_assert(values
.size() == 2);
3926 // sanity check version
3927 auto bp
= values
[infover_key
].cbegin();
3929 decode(struct_v
, bp
);
3930 ceph_assert(struct_v
>= 8);
3933 bp
= values
[epoch_key
].begin();
3934 decode(cur_epoch
, bp
);
3936 // probably bug 10617; see OSD::load_pgs()
3940 *pepoch
= cur_epoch
;
3944 #pragma GCC diagnostic pop
3945 #pragma GCC diagnostic warning "-Wpragmas"
3947 void PG::write_if_dirty(ObjectStore::Transaction
& t
)
3949 map
<string
,bufferlist
> km
;
3950 if (dirty_big_info
|| dirty_info
)
3951 prepare_write_info(&km
);
3952 pg_log
.write_log_and_missing(t
, &km
, coll
, pgmeta_oid
, pool
.info
.require_rollback());
3954 t
.omap_setkeys(coll
, pgmeta_oid
, km
);
3957 void PG::add_log_entry(const pg_log_entry_t
& e
, bool applied
)
3959 // raise last_complete only if we were previously up to date
3960 if (info
.last_complete
== info
.last_update
)
3961 info
.last_complete
= e
.version
;
3963 // raise last_update.
3964 ceph_assert(e
.version
> info
.last_update
);
3965 info
.last_update
= e
.version
;
3967 // raise user_version, if it increased (it may have not get bumped
3968 // by all logged updates)
3969 if (e
.user_version
> info
.last_user_version
)
3970 info
.last_user_version
= e
.user_version
;
3973 pg_log
.add(e
, applied
);
3974 dout(10) << "add_log_entry " << e
<< dendl
;
3978 void PG::append_log(
3979 const vector
<pg_log_entry_t
>& logv
,
3981 eversion_t roll_forward_to
,
3982 ObjectStore::Transaction
&t
,
3983 bool transaction_applied
,
3986 if (transaction_applied
)
3987 update_snap_map(logv
, t
);
3989 /* The primary has sent an info updating the history, but it may not
3990 * have arrived yet. We want to make sure that we cannot remember this
3991 * write without remembering that it happened in an interval which went
3992 * active in epoch history.last_epoch_started.
3994 if (info
.last_epoch_started
!= info
.history
.last_epoch_started
) {
3995 info
.history
.last_epoch_started
= info
.last_epoch_started
;
3997 if (info
.last_interval_started
!= info
.history
.last_interval_started
) {
3998 info
.history
.last_interval_started
= info
.last_interval_started
;
4000 dout(10) << "append_log " << pg_log
.get_log() << " " << logv
<< dendl
;
4002 PGLogEntryHandler handler
{this, &t
};
4003 if (!transaction_applied
) {
4004 /* We must be a backfill or async recovery peer, so it's ok if we apply
4005 * out-of-turn since we won't be considered when
4006 * determining a min possible last_update.
4008 * We skip_rollforward() here, which advances the crt, without
4009 * doing an actual rollforward. This avoids cleaning up entries
4010 * from the backend and we do not end up in a situation, where the
4011 * object is deleted before we can _merge_object_divergent_entries().
4013 pg_log
.skip_rollforward();
4016 for (vector
<pg_log_entry_t
>::const_iterator p
= logv
.begin();
4019 add_log_entry(*p
, transaction_applied
);
4021 /* We don't want to leave the rollforward artifacts around
4022 * here past last_backfill. It's ok for the same reason as
4024 if (transaction_applied
&&
4025 p
->soid
> info
.last_backfill
) {
4026 pg_log
.roll_forward(&handler
);
4029 auto last
= logv
.rbegin();
4030 if (is_primary() && last
!= logv
.rend()) {
4031 projected_log
.skip_can_rollback_to_to_head();
4032 projected_log
.trim(cct
, last
->version
, nullptr, nullptr, nullptr);
4035 if (transaction_applied
&& roll_forward_to
> pg_log
.get_can_rollback_to()) {
4036 pg_log
.roll_forward_to(
4039 last_rollback_info_trimmed_to_applied
= roll_forward_to
;
4042 dout(10) << __func__
<< " approx pg log length = "
4043 << pg_log
.get_log().approx_size() << dendl
;
4044 dout(10) << __func__
<< " transaction_applied = "
4045 << transaction_applied
<< dendl
;
4046 if (!transaction_applied
|| async
)
4047 dout(10) << __func__
<< " " << pg_whoami
4048 << " is async_recovery or backfill target" << dendl
;
4049 pg_log
.trim(trim_to
, info
, transaction_applied
, async
);
4051 // update the local pg, pg log
4056 bool PG::check_log_for_corruption(ObjectStore
*store
)
4058 /// TODO: this method needs to work with the omap log
4062 //! Get the name we're going to save our corrupt page log as
4063 std::string
PG::get_corrupt_pg_log_name() const
4065 const int MAX_BUF
= 512;
4068 time_t my_time(time(NULL
));
4069 const struct tm
*t
= localtime_r(&my_time
, &tm_buf
);
4070 int ret
= strftime(buf
, sizeof(buf
), "corrupt_log_%Y-%m-%d_%k:%M_", t
);
4072 dout(0) << "strftime failed" << dendl
;
4073 return "corrupt_log_unknown_time";
4076 out
+= stringify(info
.pgid
);
4081 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
,
4082 pg_info_t
&info
, PastIntervals
&past_intervals
,
4086 keys
.insert(infover_key
);
4087 keys
.insert(info_key
);
4088 keys
.insert(biginfo_key
);
4089 keys
.insert(fastinfo_key
);
4090 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
4091 map
<string
,bufferlist
> values
;
4092 auto ch
= store
->open_collection(coll
);
4094 int r
= store
->omap_get_values(ch
, pgmeta_oid
, keys
, &values
);
4095 ceph_assert(r
== 0);
4096 ceph_assert(values
.size() == 3 ||
4097 values
.size() == 4);
4099 auto p
= values
[infover_key
].cbegin();
4100 decode(struct_v
, p
);
4101 ceph_assert(struct_v
>= 10);
4103 p
= values
[info_key
].begin();
4106 p
= values
[biginfo_key
].begin();
4107 decode(past_intervals
, p
);
4108 decode(info
.purged_snaps
, p
);
4110 p
= values
[fastinfo_key
].begin();
4112 pg_fast_info_t fast
;
4114 fast
.try_apply_to(&info
);
4119 void PG::read_state(ObjectStore
*store
)
4121 int r
= read_info(store
, pg_id
, coll
, info
, past_intervals
,
4123 ceph_assert(r
>= 0);
4125 if (info_struct_v
< compat_struct_v
) {
4126 derr
<< "PG needs upgrade, but on-disk data is too old; upgrade to"
4127 << " an older version first." << dendl
;
4128 ceph_abort_msg("PG too old to upgrade");
4131 last_written_info
= info
;
4134 pg_log
.read_log_and_missing(
4140 cct
->_conf
->osd_ignore_stale_divergent_priors
,
4141 cct
->_conf
->osd_debug_verify_missing_on_start
);
4143 osd
->clog
->error() << oss
.str();
4145 // log any weirdness
4148 if (info_struct_v
< latest_struct_v
) {
4152 // initialize current mapping
4154 int primary
, up_primary
;
4155 vector
<int> acting
, up
;
4156 get_osdmap()->pg_to_up_acting_osds(
4157 pg_id
.pgid
, &up
, &up_primary
, &acting
, &primary
);
4158 init_primary_up_acting(
4163 int rr
= OSDMap::calc_pg_role(osd
->whoami
, acting
);
4164 if (pool
.info
.is_replicated() || rr
== pg_whoami
.shard
)
4170 // init pool options
4171 store
->set_collection_opts(ch
, pool
.info
.opts
);
4173 PG::RecoveryCtx
rctx(0, 0, 0, new ObjectStore::Transaction
);
4174 handle_initialize(&rctx
);
4175 // note: we don't activate here because we know the OSD will advance maps
4177 write_if_dirty(*rctx
.transaction
);
4178 store
->queue_transaction(ch
, std::move(*rctx
.transaction
));
4179 delete rctx
.transaction
;
4182 void PG::log_weirdness()
4184 if (pg_log
.get_tail() != info
.log_tail
)
4185 osd
->clog
->error() << info
.pgid
4186 << " info mismatch, log.tail " << pg_log
.get_tail()
4187 << " != info.log_tail " << info
.log_tail
;
4188 if (pg_log
.get_head() != info
.last_update
)
4189 osd
->clog
->error() << info
.pgid
4190 << " info mismatch, log.head " << pg_log
.get_head()
4191 << " != info.last_update " << info
.last_update
;
4193 if (!pg_log
.get_log().empty()) {
4195 if ((pg_log
.get_log().log
.begin()->version
<= pg_log
.get_tail()))
4196 osd
->clog
->error() << info
.pgid
4197 << " log bound mismatch, info (tail,head] ("
4198 << pg_log
.get_tail() << "," << pg_log
.get_head() << "]"
4200 << pg_log
.get_log().log
.begin()->version
<< ","
4201 << pg_log
.get_log().log
.rbegin()->version
<< "]";
4204 if (pg_log
.get_log().caller_ops
.size() > pg_log
.get_log().log
.size()) {
4205 osd
->clog
->error() << info
.pgid
4206 << " caller_ops.size " << pg_log
.get_log().caller_ops
.size()
4207 << " > log size " << pg_log
.get_log().log
.size();
4211 void PG::update_snap_map(
4212 const vector
<pg_log_entry_t
> &log_entries
,
4213 ObjectStore::Transaction
&t
)
4215 for (vector
<pg_log_entry_t
>::const_iterator i
= log_entries
.begin();
4216 i
!= log_entries
.end();
4218 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4219 if (i
->soid
.snap
< CEPH_MAXSNAP
) {
4220 if (i
->is_delete()) {
4221 int r
= snap_mapper
.remove_oid(
4225 derr
<< __func__
<< " remove_oid " << i
->soid
<< " failed with " << r
<< dendl
;
4226 // On removal tolerate missing key corruption
4227 ceph_assert(r
== 0 || r
== -ENOENT
);
4228 } else if (i
->is_update()) {
4229 ceph_assert(i
->snaps
.length() > 0);
4230 vector
<snapid_t
> snaps
;
4231 bufferlist snapbl
= i
->snaps
;
4232 auto p
= snapbl
.cbegin();
4236 derr
<< __func__
<< " decode snaps failure on " << *i
<< dendl
;
4239 set
<snapid_t
> _snaps(snaps
.begin(), snaps
.end());
4241 if (i
->is_clone() || i
->is_promote()) {
4242 snap_mapper
.add_oid(
4246 } else if (i
->is_modify()) {
4247 int r
= snap_mapper
.update_snaps(
4252 ceph_assert(r
== 0);
4254 ceph_assert(i
->is_clean());
4262 * filter trimming|trimmed snaps out of snapcontext
4264 void PG::filter_snapc(vector
<snapid_t
> &snaps
)
4266 // nothing needs to trim, we can return immediately
4267 if (snap_trimq
.empty() && info
.purged_snaps
.empty())
4270 bool filtering
= false;
4271 vector
<snapid_t
> newsnaps
;
4272 for (vector
<snapid_t
>::iterator p
= snaps
.begin();
4275 if (snap_trimq
.contains(*p
) || info
.purged_snaps
.contains(*p
)) {
4277 // start building a new vector with what we've seen so far
4278 dout(10) << "filter_snapc filtering " << snaps
<< dendl
;
4279 newsnaps
.insert(newsnaps
.begin(), snaps
.begin(), p
);
4282 dout(20) << "filter_snapc removing trimq|purged snap " << *p
<< dendl
;
4285 newsnaps
.push_back(*p
); // continue building new vector
4289 snaps
.swap(newsnaps
);
4290 dout(10) << "filter_snapc result " << snaps
<< dendl
;
4294 void PG::requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
)
4296 for (map
<hobject_t
, list
<OpRequestRef
>>::iterator it
= m
.begin();
4299 requeue_ops(it
->second
);
4303 void PG::requeue_op(OpRequestRef op
)
4305 auto p
= waiting_for_map
.find(op
->get_source());
4306 if (p
!= waiting_for_map
.end()) {
4307 dout(20) << __func__
<< " " << op
<< " (waiting_for_map " << p
->first
<< ")"
4309 p
->second
.push_front(op
);
4311 dout(20) << __func__
<< " " << op
<< dendl
;
4314 unique_ptr
<OpQueueItem::OpQueueable
>(new PGOpItem(info
.pgid
, op
)),
4315 op
->get_req()->get_cost(),
4316 op
->get_req()->get_priority(),
4317 op
->get_req()->get_recv_stamp(),
4318 op
->get_req()->get_source().num(),
4319 get_osdmap_epoch()));
4323 void PG::requeue_ops(list
<OpRequestRef
> &ls
)
4325 for (list
<OpRequestRef
>::reverse_iterator i
= ls
.rbegin();
4333 void PG::requeue_map_waiters()
4335 epoch_t epoch
= get_osdmap_epoch();
4336 auto p
= waiting_for_map
.begin();
4337 while (p
!= waiting_for_map
.end()) {
4338 if (epoch
< p
->second
.front()->min_epoch
) {
4339 dout(20) << __func__
<< " " << p
->first
<< " front op "
4340 << p
->second
.front() << " must still wait, doing nothing"
4344 dout(20) << __func__
<< " " << p
->first
<< " " << p
->second
<< dendl
;
4345 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
4347 osd
->enqueue_front(OpQueueItem(
4348 unique_ptr
<OpQueueItem::OpQueueable
>(new PGOpItem(info
.pgid
, req
)),
4349 req
->get_req()->get_cost(),
4350 req
->get_req()->get_priority(),
4351 req
->get_req()->get_recv_stamp(),
4352 req
->get_req()->get_source().num(),
4355 p
= waiting_for_map
.erase(p
);
// ==========================================================================================
// SCRUB

/*
 * when holding pg and sched_scrub_lock, then the states are:
 *   scheduling:
 *     scrubber.local_reserved = true
 *     scrubber.active = false
 *     scrubber.reserved_peers includes whoami
 *     osd->scrubs_local++
 *   scheduling, replica declined:
 *     scrubber.local_reserved = true
 *     scrubber.reserved_peers includes -1
 *     osd->scrub_local++
 *   pending:
 *     scrubber.local_reserved = true
 *     scrubber.active = false
 *     scrubber.reserved_peers.size() == acting.size();
 *     pg on scrub_wq
 *     osd->scrub_local++
 *   scrubbing:
 *     scrubber.local_reserved = true;
 *     scrubber.active = true
 *     scrubber.reserved_peers empty
 */
4387 // returns true if a scrub has been newly kicked off
4388 bool PG::sched_scrub()
4390 ceph_assert(is_locked());
4391 ceph_assert(!is_scrubbing());
4392 if (!(is_primary() && is_active() && is_clean())) {
4396 // All processing the first time through commits us to whatever
4397 // choices are made.
4398 if (!scrubber
.local_reserved
) {
4399 dout(20) << __func__
<< ": Start processing pg " << info
.pgid
<< dendl
;
4401 bool allow_deep_scrub
= !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
4402 pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
));
4403 bool allow_scrub
= !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
4404 pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
));
4405 bool has_deep_errors
= (info
.stats
.stats
.sum
.num_deep_scrub_errors
> 0);
4406 bool try_to_auto_repair
= (cct
->_conf
->osd_scrub_auto_repair
4407 && get_pgbackend()->auto_repair_supported());
4409 scrubber
.time_for_deep
= false;
4410 // Clear these in case user issues the scrub/repair command during
4411 // the scheduling of the scrub/repair (e.g. request reservation)
4412 scrubber
.deep_scrub_on_error
= false;
4413 scrubber
.auto_repair
= false;
4415 // All periodic scrub handling goes here because must_scrub is
4416 // always set for must_deep_scrub and must_repair.
4417 if (!scrubber
.must_scrub
) {
4418 ceph_assert(!scrubber
.must_deep_scrub
&& !scrubber
.must_repair
);
4419 // Handle deep scrub determination only if allowed
4420 if (allow_deep_scrub
) {
4421 // Initial entry and scheduled scrubs without nodeep_scrub set get here
4422 if (scrubber
.need_auto
) {
4423 dout(20) << __func__
<< ": need repair after scrub errors" << dendl
;
4424 scrubber
.time_for_deep
= true;
4426 double deep_scrub_interval
= 0;
4427 pool
.info
.opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
4428 if (deep_scrub_interval
<= 0) {
4429 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
4431 scrubber
.time_for_deep
= ceph_clock_now() >=
4432 info
.history
.last_deep_scrub_stamp
+ deep_scrub_interval
;
4434 bool deep_coin_flip
= false;
4435 // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees
4436 // we will deep scrub because this function is called often.
4437 if (!scrubber
.time_for_deep
&& allow_scrub
)
4438 deep_coin_flip
= (rand() % 100) < cct
->_conf
->osd_deep_scrub_randomize_ratio
* 100;
4439 dout(20) << __func__
<< ": time_for_deep=" << scrubber
.time_for_deep
<< " deep_coin_flip=" << deep_coin_flip
<< dendl
;
4441 scrubber
.time_for_deep
= (scrubber
.time_for_deep
|| deep_coin_flip
);
4444 if (!scrubber
.time_for_deep
&& has_deep_errors
) {
4445 osd
->clog
->info() << "osd." << osd
->whoami
4446 << " pg " << info
.pgid
4447 << " Deep scrub errors, upgrading scrub to deep-scrub";
4448 scrubber
.time_for_deep
= true;
4451 if (try_to_auto_repair
) {
4452 if (scrubber
.time_for_deep
) {
4453 dout(20) << __func__
<< ": auto repair with deep scrubbing" << dendl
;
4454 scrubber
.auto_repair
= true;
4455 } else if (allow_scrub
) {
4456 dout(20) << __func__
<< ": auto repair with scrubbing, rescrub if errors found" << dendl
;
4457 scrubber
.deep_scrub_on_error
= true;
4460 } else { // !allow_deep_scrub
4461 dout(20) << __func__
<< ": nodeep_scrub set" << dendl
;
4462 if (has_deep_errors
) {
4463 osd
->clog
->error() << "osd." << osd
->whoami
4464 << " pg " << info
.pgid
4465 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
4470 //NOSCRUB so skip regular scrubs
4471 if (!allow_scrub
&& !scrubber
.time_for_deep
) {
4474 // scrubber.must_scrub
4475 } else if (!scrubber
.must_deep_scrub
&& has_deep_errors
) {
4476 osd
->clog
->error() << "osd." << osd
->whoami
4477 << " pg " << info
.pgid
4478 << " Regular scrub request, deep-scrub details will be lost";
4480 // Unless precluded this was handle above
4481 scrubber
.need_auto
= false;
4483 ceph_assert(scrubber
.reserved_peers
.empty());
4484 bool allow_scrubing
= cct
->_conf
->osd_scrub_during_recovery
||
4485 (cct
->_conf
->osd_repair_during_recovery
&& scrubber
.must_repair
) ||
4486 !osd
->is_recovery_active();
4487 if (allow_scrubing
&&
4488 osd
->inc_scrubs_local()) {
4489 dout(20) << __func__
<< ": reserved locally, reserving replicas" << dendl
;
4490 scrubber
.local_reserved
= true;
4491 scrubber
.reserved_peers
.insert(pg_whoami
);
4492 scrub_reserve_replicas();
4494 dout(20) << __func__
<< ": failed to reserve locally" << dendl
;
4499 if (scrubber
.local_reserved
) {
4500 if (scrubber
.reserve_failed
) {
4501 dout(20) << __func__
<< ": failed, a peer declined" << dendl
;
4502 clear_scrub_reserved();
4503 scrub_unreserve_replicas();
4505 } else if (scrubber
.reserved_peers
.size() == actingset
.size()) {
4506 dout(20) << __func__
<< ": success, reserved self and replicas" << dendl
;
4507 if (scrubber
.time_for_deep
) {
4508 dout(10) << __func__
<< ": scrub will be deep" << dendl
;
4509 state_set(PG_STATE_DEEP_SCRUB
);
4510 scrubber
.time_for_deep
= false;
4514 // none declined, since scrubber.reserved is set
4515 dout(20) << __func__
<< ": reserved " << scrubber
.reserved_peers
4516 << ", waiting for replicas" << dendl
;
4522 bool PG::is_scrub_registered()
4524 return !scrubber
.scrub_reg_stamp
.is_zero();
4527 void PG::reg_next_scrub()
4534 if (scrubber
.must_scrub
|| scrubber
.need_auto
) {
4535 // Set the smallest time that isn't utime_t()
4536 reg_stamp
= Scrubber::scrub_must_stamp();
4538 } else if (info
.stats
.stats_invalid
&& cct
->_conf
->osd_scrub_invalid_stats
) {
4539 reg_stamp
= ceph_clock_now();
4542 reg_stamp
= info
.history
.last_scrub_stamp
;
4544 // note down the sched_time, so we can locate this scrub, and remove it
4546 double scrub_min_interval
= 0, scrub_max_interval
= 0;
4547 pool
.info
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &scrub_min_interval
);
4548 pool
.info
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
4549 ceph_assert(!is_scrub_registered());
4550 scrubber
.scrub_reg_stamp
= osd
->reg_pg_scrub(info
.pgid
,
4555 dout(10) << __func__
<< " pg " << pg_id
<< " register next scrub, scrub time "
4556 << scrubber
.scrub_reg_stamp
<< ", must = " << (int)must
<< dendl
;
4559 void PG::unreg_next_scrub()
4561 if (is_scrub_registered()) {
4562 osd
->unreg_pg_scrub(info
.pgid
, scrubber
.scrub_reg_stamp
);
4563 scrubber
.scrub_reg_stamp
= utime_t();
4567 void PG::on_info_history_change()
4573 void PG::scrub_requested(bool deep
, bool repair
, bool need_auto
)
4577 scrubber
.need_auto
= true;
4579 scrubber
.must_scrub
= true;
4580 scrubber
.must_deep_scrub
= deep
|| repair
;
4581 scrubber
.must_repair
= repair
;
4582 // User might intervene, so clear this
4583 scrubber
.need_auto
= false;
4588 void PG::do_replica_scrub_map(OpRequestRef op
)
4590 const MOSDRepScrubMap
*m
= static_cast<const MOSDRepScrubMap
*>(op
->get_req());
4591 dout(7) << __func__
<< " " << *m
<< dendl
;
4592 if (m
->map_epoch
< info
.history
.same_interval_since
) {
4593 dout(10) << __func__
<< " discarding old from "
4594 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
4598 if (!scrubber
.is_chunky_scrub_active()) {
4599 dout(10) << __func__
<< " scrub isn't active" << dendl
;
4605 auto p
= const_cast<bufferlist
&>(m
->get_data()).cbegin();
4606 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
4607 dout(10) << "map version is "
4608 << scrubber
.received_maps
[m
->from
].valid_through
4611 dout(10) << __func__
<< " waiting_on_whom was " << scrubber
.waiting_on_whom
4613 ceph_assert(scrubber
.waiting_on_whom
.count(m
->from
));
4614 scrubber
.waiting_on_whom
.erase(m
->from
);
4616 dout(10) << __func__
<< " replica was preempted, setting flag" << dendl
;
4617 scrub_preempted
= true;
4619 if (scrubber
.waiting_on_whom
.empty()) {
4620 requeue_scrub(ops_blocked_by_scrub());
4624 // send scrub v3 messages (chunky scrub)
4625 void PG::_request_scrub_map(
4626 pg_shard_t replica
, eversion_t version
,
4627 hobject_t start
, hobject_t end
,
4629 bool allow_preemption
)
4631 ceph_assert(replica
!= pg_whoami
);
4632 dout(10) << "scrub requesting scrubmap from osd." << replica
4633 << " deep " << (int)deep
<< dendl
;
4634 MOSDRepScrub
*repscrubop
= new MOSDRepScrub(
4635 spg_t(info
.pgid
.pgid
, replica
.shard
), version
,
4637 get_last_peering_reset(),
4641 ops_blocked_by_scrub());
4642 // default priority, we want the rep scrub processed prior to any recovery
4643 // or client io messages (we are holding a lock!)
4644 osd
->send_message_osd_cluster(
4645 replica
.osd
, repscrubop
, get_osdmap_epoch());
4648 void PG::handle_scrub_reserve_request(OpRequestRef op
)
4650 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4652 if (scrubber
.local_reserved
) {
4653 dout(10) << __func__
<< " ignoring reserve request: Already reserved"
4657 if ((cct
->_conf
->osd_scrub_during_recovery
|| !osd
->is_recovery_active()) &&
4658 osd
->inc_scrubs_remote()) {
4659 scrubber
.remote_reserved
= true;
4661 dout(20) << __func__
<< ": failed to reserve remotely" << dendl
;
4662 scrubber
.remote_reserved
= false;
4664 const MOSDScrubReserve
*m
=
4665 static_cast<const MOSDScrubReserve
*>(op
->get_req());
4666 Message
*reply
= new MOSDScrubReserve(
4667 spg_t(info
.pgid
.pgid
, primary
.shard
),
4669 scrubber
.remote_reserved
? MOSDScrubReserve::GRANT
: MOSDScrubReserve::REJECT
,
4671 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
4674 void PG::handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
)
4676 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4678 if (!scrubber
.local_reserved
) {
4679 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
4682 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
4683 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
4685 dout(10) << " osd." << from
<< " scrub reserve = success" << dendl
;
4686 scrubber
.reserved_peers
.insert(from
);
4691 void PG::handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
)
4693 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4695 if (!scrubber
.local_reserved
) {
4696 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
4699 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
4700 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
4702 /* One decline stops this pg from being scheduled for scrubbing. */
4703 dout(10) << " osd." << from
<< " scrub reserve = fail" << dendl
;
4704 scrubber
.reserve_failed
= true;
4709 void PG::handle_scrub_reserve_release(OpRequestRef op
)
4711 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
4713 clear_scrub_reserved();
4716 // We can zero the value of primary num_bytes as just an atomic.
4717 // However, setting above zero reserves space for backfill and requires
4718 // the OSDService::stat_lock which protects all OSD usage
4719 void PG::set_reserved_num_bytes(int64_t primary
, int64_t local
) {
4720 ceph_assert(osd
->stat_lock
.is_locked_by_me());
4721 primary_num_bytes
.store(primary
);
4722 local_num_bytes
.store(local
);
4726 void PG::clear_reserved_num_bytes() {
4727 primary_num_bytes
.store(0);
4728 local_num_bytes
.store(0);
4732 void PG::reject_reservation()
4734 clear_reserved_num_bytes();
4735 osd
->send_message_osd_cluster(
4737 new MBackfillReserve(
4738 MBackfillReserve::REJECT_TOOFULL
,
4739 spg_t(info
.pgid
.pgid
, primary
.shard
),
4740 get_osdmap_epoch()),
4741 get_osdmap_epoch());
4744 void PG::schedule_backfill_retry(float delay
)
4746 std::lock_guard
lock(osd
->recovery_request_lock
);
4747 osd
->recovery_request_timer
.add_event_after(
4749 new QueuePeeringEvt
<RequestBackfill
>(
4750 this, get_osdmap_epoch(),
4751 RequestBackfill()));
4754 void PG::schedule_recovery_retry(float delay
)
4756 std::lock_guard
lock(osd
->recovery_request_lock
);
4757 osd
->recovery_request_timer
.add_event_after(
4759 new QueuePeeringEvt
<DoRecovery
>(
4760 this, get_osdmap_epoch(),
4764 void PG::clear_scrub_reserved()
4766 scrubber
.reserved_peers
.clear();
4767 scrubber
.reserve_failed
= false;
4769 if (scrubber
.local_reserved
) {
4770 scrubber
.local_reserved
= false;
4771 osd
->dec_scrubs_local();
4773 if (scrubber
.remote_reserved
) {
4774 scrubber
.remote_reserved
= false;
4775 osd
->dec_scrubs_remote();
4779 void PG::scrub_reserve_replicas()
4781 ceph_assert(backfill_targets
.empty());
4782 for (set
<pg_shard_t
>::iterator i
= actingset
.begin();
4783 i
!= actingset
.end();
4785 if (*i
== pg_whoami
) continue;
4786 dout(10) << "scrub requesting reserve from osd." << *i
<< dendl
;
4787 osd
->send_message_osd_cluster(
4789 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
4791 MOSDScrubReserve::REQUEST
, pg_whoami
),
4792 get_osdmap_epoch());
4796 void PG::scrub_unreserve_replicas()
4798 ceph_assert(backfill_targets
.empty());
4799 for (set
<pg_shard_t
>::iterator i
= actingset
.begin();
4800 i
!= actingset
.end();
4802 if (*i
== pg_whoami
) continue;
4803 dout(10) << "scrub requesting unreserve from osd." << *i
<< dendl
;
4804 osd
->send_message_osd_cluster(
4806 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
4808 MOSDScrubReserve::RELEASE
, pg_whoami
),
4809 get_osdmap_epoch());
4813 void PG::_scan_rollback_obs(const vector
<ghobject_t
> &rollback_obs
)
4815 ObjectStore::Transaction t
;
4816 eversion_t trimmed_to
= last_rollback_info_trimmed_to_applied
;
4817 for (vector
<ghobject_t
>::const_iterator i
= rollback_obs
.begin();
4818 i
!= rollback_obs
.end();
4820 if (i
->generation
< trimmed_to
.version
) {
4821 dout(10) << __func__
<< "osd." << osd
->whoami
4822 << " pg " << info
.pgid
4823 << " found obsolete rollback obj "
4824 << *i
<< " generation < trimmed_to "
4826 << "...repaired" << dendl
;
4831 derr
<< __func__
<< ": queueing trans to clean up obsolete rollback objs"
4833 osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4837 void PG::_scan_snaps(ScrubMap
&smap
)
4842 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4843 // caller using clean_meta_map(), and it works properly.
4844 dout(20) << __func__
<< " start" << dendl
;
4846 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4847 i
!= smap
.objects
.rend();
4849 const hobject_t
&hoid
= i
->first
;
4850 ScrubMap::object
&o
= i
->second
;
4852 dout(20) << __func__
<< " " << hoid
<< dendl
;
4854 ceph_assert(!hoid
.is_snapdir());
4855 if (hoid
.is_head()) {
4856 // parse the SnapSet
4858 if (o
.attrs
.find(SS_ATTR
) == o
.attrs
.end()) {
4861 bl
.push_back(o
.attrs
[SS_ATTR
]);
4862 auto p
= bl
.cbegin();
4868 head
= hoid
.get_head();
4871 if (hoid
.snap
< CEPH_MAXSNAP
) {
4872 // check and if necessary fix snap_mapper
4873 if (hoid
.get_head() != head
) {
4874 derr
<< __func__
<< " no head for " << hoid
<< " (have " << head
<< ")"
4878 set
<snapid_t
> obj_snaps
;
4879 auto p
= snapset
.clone_snaps
.find(hoid
.snap
);
4880 if (p
== snapset
.clone_snaps
.end()) {
4881 derr
<< __func__
<< " no clone_snaps for " << hoid
<< " in " << snapset
4885 obj_snaps
.insert(p
->second
.begin(), p
->second
.end());
4886 set
<snapid_t
> cur_snaps
;
4887 int r
= snap_mapper
.get_snaps(hoid
, &cur_snaps
);
4888 if (r
!= 0 && r
!= -ENOENT
) {
4889 derr
<< __func__
<< ": get_snaps returned " << cpp_strerror(r
) << dendl
;
4892 if (r
== -ENOENT
|| cur_snaps
!= obj_snaps
) {
4893 ObjectStore::Transaction t
;
4894 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4896 r
= snap_mapper
.remove_oid(hoid
, &_t
);
4898 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
)
4902 osd
->clog
->error() << "osd." << osd
->whoami
4903 << " found snap mapper error on pg "
4905 << " oid " << hoid
<< " snaps in mapper: "
4906 << cur_snaps
<< ", oi: "
4910 osd
->clog
->error() << "osd." << osd
->whoami
4911 << " found snap mapper error on pg "
4913 << " oid " << hoid
<< " snaps missing in mapper"
4916 << " was " << cur_snaps
<< " r " << r
4919 snap_mapper
.add_oid(hoid
, obj_snaps
, &_t
);
4921 // wait for repair to apply to avoid confusing other bits of the system.
4924 Mutex
my_lock("PG::_scan_snaps my_lock");
4927 t
.register_on_applied_sync(
4928 new C_SafeCond(&my_lock
, &my_cond
, &done
, &r
));
4929 r
= osd
->store
->queue_transaction(ch
, std::move(t
));
4931 derr
<< __func__
<< ": queue_transaction got " << cpp_strerror(r
)
4936 my_cond
.Wait(my_lock
);
4945 void PG::_repair_oinfo_oid(ScrubMap
&smap
)
4947 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4948 i
!= smap
.objects
.rend();
4950 const hobject_t
&hoid
= i
->first
;
4951 ScrubMap::object
&o
= i
->second
;
4954 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4957 bl
.push_back(o
.attrs
[OI_ATTR
]);
4964 if (oi
.soid
!= hoid
) {
4965 ObjectStore::Transaction t
;
4966 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4967 osd
->clog
->error() << "osd." << osd
->whoami
4968 << " found object info error on pg "
4970 << " oid " << hoid
<< " oid in object info: "
4976 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4978 bufferptr
bp(bl
.c_str(), bl
.length());
4979 o
.attrs
[OI_ATTR
] = bp
;
4981 t
.setattr(coll
, ghobject_t(hoid
), OI_ATTR
, bl
);
4982 int r
= osd
->store
->queue_transaction(ch
, std::move(t
));
4984 derr
<< __func__
<< ": queue_transaction got " << cpp_strerror(r
)
4990 int PG::build_scrub_map_chunk(
4992 ScrubMapBuilder
&pos
,
4996 ThreadPool::TPHandle
&handle
)
4998 dout(10) << __func__
<< " [" << start
<< "," << end
<< ") "
5003 while (pos
.empty()) {
5005 map
.valid_through
= info
.last_update
;
5008 vector
<ghobject_t
> rollback_obs
;
5009 pos
.ret
= get_pgbackend()->objects_list_range(
5015 dout(5) << "objects_list_range error: " << pos
.ret
<< dendl
;
5018 if (pos
.ls
.empty()) {
5021 _scan_rollback_obs(rollback_obs
);
5023 return -EINPROGRESS
;
5027 while (!pos
.done()) {
5028 int r
= get_pgbackend()->be_scan_list(map
, pos
);
5029 if (r
== -EINPROGRESS
) {
5035 dout(20) << __func__
<< " finishing" << dendl
;
5036 ceph_assert(pos
.done());
5037 _repair_oinfo_oid(map
);
5038 if (!is_primary()) {
5039 ScrubMap for_meta_scrub
;
5040 // In case we restarted smaller chunk, clear old data
5041 scrubber
.cleaned_meta_map
.clear_from(scrubber
.start
);
5042 scrubber
.cleaned_meta_map
.insert(map
);
5043 scrubber
.clean_meta_map(for_meta_scrub
);
5044 _scan_snaps(for_meta_scrub
);
5047 dout(20) << __func__
<< " done, got " << map
.objects
.size() << " items"
5052 void PG::Scrubber::cleanup_store(ObjectStore::Transaction
*t
) {
5055 struct OnComplete
: Context
{
5056 std::unique_ptr
<Scrub::Store
> store
;
5057 explicit OnComplete(
5058 std::unique_ptr
<Scrub::Store
> &&store
)
5059 : store(std::move(store
)) {}
5060 void finish(int) override
{}
5063 t
->register_on_complete(new OnComplete(std::move(store
)));
5064 ceph_assert(!store
);
5067 void PG::repair_object(
5068 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
5069 pg_shard_t bad_peer
)
5071 list
<pg_shard_t
> op_shards
;
5072 for (auto i
: *ok_peers
) {
5073 op_shards
.push_back(i
.second
);
5075 dout(10) << "repair_object " << soid
<< " bad_peer osd."
5076 << bad_peer
<< " ok_peers osd.{" << op_shards
<< "}" << dendl
;
5077 ScrubMap::object
&po
= ok_peers
->back().first
;
5080 bv
.push_back(po
.attrs
[OI_ATTR
]);
5083 auto bliter
= bv
.cbegin();
5086 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
5089 if (bad_peer
!= primary
) {
5090 peer_missing
[bad_peer
].add(soid
, oi
.version
, eversion_t(), false);
5092 // We should only be scrubbing if the PG is clean.
5093 ceph_assert(waiting_for_unreadable_object
.empty());
5095 pg_log
.missing_add(soid
, oi
.version
, eversion_t());
5097 pg_log
.set_last_requested(0);
5098 dout(10) << __func__
<< ": primary = " << primary
<< dendl
;
5101 if (is_ec_pg() || bad_peer
== primary
) {
5102 // we'd better collect all shard for EC pg, and prepare good peers as the
5103 // source of pull in the case of replicated pg.
5104 missing_loc
.add_missing(soid
, oi
.version
, eversion_t());
5105 list
<pair
<ScrubMap::object
, pg_shard_t
> >::iterator i
;
5106 for (i
= ok_peers
->begin();
5107 i
!= ok_peers
->end();
5109 missing_loc
.add_location(soid
, i
->second
);
5115 * Wait for last_update_applied to match msg->scrub_to as above. Wait
5116 * for pushes to complete in case of recent recovery. Build a single
5117 * scrubmap of objects that are in the range [msg->start, msg->end).
5119 void PG::replica_scrub(
5121 ThreadPool::TPHandle
&handle
)
5123 const MOSDRepScrub
*msg
= static_cast<const MOSDRepScrub
*>(op
->get_req());
5124 ceph_assert(!scrubber
.active_rep_scrub
);
5125 dout(7) << "replica_scrub" << dendl
;
5127 if (msg
->map_epoch
< info
.history
.same_interval_since
) {
5128 dout(10) << "replica_scrub discarding old replica_scrub from "
5129 << msg
->map_epoch
<< " < " << info
.history
.same_interval_since
5134 ceph_assert(msg
->chunky
);
5135 if (active_pushes
> 0) {
5136 dout(10) << "waiting for active pushes to finish" << dendl
;
5137 scrubber
.active_rep_scrub
= op
;
5141 scrubber
.state
= Scrubber::BUILD_MAP_REPLICA
;
5142 scrubber
.replica_scrub_start
= msg
->min_epoch
;
5143 scrubber
.start
= msg
->start
;
5144 scrubber
.end
= msg
->end
;
5145 scrubber
.max_end
= msg
->end
;
5146 scrubber
.deep
= msg
->deep
;
5147 scrubber
.epoch_start
= info
.history
.same_interval_since
;
5148 if (msg
->priority
) {
5149 scrubber
.priority
= msg
->priority
;
5151 scrubber
.priority
= get_scrub_priority();
5154 scrub_can_preempt
= msg
->allow_preemption
;
5155 scrub_preempted
= false;
5156 scrubber
.replica_scrubmap_pos
.reset();
5158 requeue_scrub(msg
->high_priority
);
5162 * PG_STATE_SCRUBBING is set when the scrub is queued
5164 * scrub will be chunky if all OSDs in PG support chunky scrub
5165 * scrub will fail if OSDs are too old.
5167 void PG::scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
)
5169 if (cct
->_conf
->osd_scrub_sleep
> 0 &&
5170 (scrubber
.state
== PG::Scrubber::NEW_CHUNK
||
5171 scrubber
.state
== PG::Scrubber::INACTIVE
) &&
5172 scrubber
.needs_sleep
) {
5173 ceph_assert(!scrubber
.sleeping
);
5174 dout(20) << __func__
<< " state is INACTIVE|NEW_CHUNK, sleeping" << dendl
;
5176 // Do an async sleep so we don't block the op queue
5177 OSDService
*osds
= osd
;
5178 spg_t pgid
= get_pgid();
5179 int state
= scrubber
.state
;
5180 auto scrub_requeue_callback
=
5181 new FunctionContext([osds
, pgid
, state
](int r
) {
5182 PGRef pg
= osds
->osd
->lookup_lock_pg(pgid
);
5183 if (pg
== nullptr) {
5184 lgeneric_dout(osds
->osd
->cct
, 20)
5185 << "scrub_requeue_callback: Could not find "
5186 << "PG " << pgid
<< " can't complete scrub requeue after sleep"
5190 pg
->scrubber
.sleeping
= false;
5191 pg
->scrubber
.needs_sleep
= false;
5192 lgeneric_dout(pg
->cct
, 20)
5193 << "scrub_requeue_callback: slept for "
5194 << ceph_clock_now() - pg
->scrubber
.sleep_start
5195 << ", re-queuing scrub with state " << state
<< dendl
;
5196 pg
->scrub_queued
= false;
5197 pg
->requeue_scrub();
5198 pg
->scrubber
.sleep_start
= utime_t();
5201 std::lock_guard
l(osd
->sleep_lock
);
5202 osd
->sleep_timer
.add_event_after(cct
->_conf
->osd_scrub_sleep
,
5203 scrub_requeue_callback
);
5204 scrubber
.sleeping
= true;
5205 scrubber
.sleep_start
= ceph_clock_now();
5208 if (pg_has_reset_since(queued
)) {
5211 ceph_assert(scrub_queued
);
5212 scrub_queued
= false;
5213 scrubber
.needs_sleep
= true;
5216 if (!is_primary() &&
5217 scrubber
.state
== PG::Scrubber::BUILD_MAP_REPLICA
) {
5218 chunky_scrub(handle
);
5222 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
5223 dout(10) << "scrub -- not primary or active or not clean" << dendl
;
5224 state_clear(PG_STATE_SCRUBBING
);
5225 state_clear(PG_STATE_REPAIR
);
5226 state_clear(PG_STATE_DEEP_SCRUB
);
5227 publish_stats_to_osd();
5231 if (!scrubber
.active
) {
5232 ceph_assert(backfill_targets
.empty());
5234 scrubber
.deep
= state_test(PG_STATE_DEEP_SCRUB
);
5236 dout(10) << "starting a new chunky scrub" << dendl
;
5239 chunky_scrub(handle
);
5243 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
5246 * The object store is partitioned into chunks which end on hash boundaries. For
5247 * each chunk, the following logic is performed:
5249 * (1) Block writes on the chunk
5250 * (2) Request maps from replicas
5251 * (3) Wait for pushes to be applied (after recovery)
5252 * (4) Wait for writes to flush on the chunk
5253 * (5) Wait for maps from replicas
5254 * (6) Compare / repair all scrub maps
5255 * (7) Wait for digest updates to apply
5257 * This logic is encoded in the mostly linear state machine:
5259 * +------------------+
5260 * _________v__________ |
5263 * |____________________| |
5266 * _________v___v______ | |
5269 * |____________________| | |
5271 * _________v__________ | |
5273 * | WAIT_PUSHES | | |
5274 * |____________________| | |
5276 * _________v__________ | |
5278 * | WAIT_LAST_UPDATE | | |
5279 * |____________________| | |
5281 * _________v__________ | |
5284 * |____________________| | |
5286 * _________v__________ | |
5288 * | WAIT_REPLICAS | | |
5289 * |____________________| | |
5291 * _________v__________ | |
5293 * | COMPARE_MAPS | | |
5294 * |____________________| | |
5297 * _________v__________ | |
5299 * |WAIT_DIGEST_UPDATES | | |
5300 * |____________________| | |
5303 * _________v__________ |
5306 * |____________________| |
5308 * +------------------+
5310 * The primary determines the last update from the subset by walking the log. If
5311 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
5312 * to wait until that update is applied before building a scrub map. Both the
5313 * primary and replicas will wait for any active pushes to be applied.
5315 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
5317 * scrubber.state encodes the current state of the scrub (refer to state diagram
5320 void PG::chunky_scrub(ThreadPool::TPHandle
&handle
)
5322 // check for map changes
5323 if (scrubber
.is_chunky_scrub_active()) {
5324 if (scrubber
.epoch_start
!= info
.history
.same_interval_since
) {
5325 dout(10) << "scrub pg changed, aborting" << dendl
;
5326 scrub_clear_state();
5327 scrub_unreserve_replicas();
5336 dout(20) << "scrub state " << Scrubber::state_string(scrubber
.state
)
5337 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")"
5338 << " max_end " << scrubber
.max_end
<< dendl
;
5340 switch (scrubber
.state
) {
5341 case PG::Scrubber::INACTIVE
:
5342 dout(10) << "scrub start" << dendl
;
5343 ceph_assert(is_primary());
5345 publish_stats_to_osd();
5346 scrubber
.epoch_start
= info
.history
.same_interval_since
;
5347 scrubber
.active
= true;
5350 ObjectStore::Transaction t
;
5351 scrubber
.cleanup_store(&t
);
5352 scrubber
.store
.reset(Scrub::Store::create(osd
->store
, &t
,
5354 osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
5357 // Don't include temporary objects when scrubbing
5358 scrubber
.start
= info
.pgid
.pgid
.get_hobj_start();
5359 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
5362 bool repair
= state_test(PG_STATE_REPAIR
);
5363 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5364 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5366 oss
<< info
.pgid
.pgid
<< " " << mode
<< " starts" << std::endl
;
5367 osd
->clog
->debug(oss
);
5370 scrubber
.preempt_left
= cct
->_conf
.get_val
<uint64_t>(
5371 "osd_scrub_max_preemptions");
5372 scrubber
.preempt_divisor
= 1;
5375 case PG::Scrubber::NEW_CHUNK
:
5376 scrubber
.primary_scrubmap
= ScrubMap();
5377 scrubber
.received_maps
.clear();
5379 // begin (possible) preemption window
5380 if (scrub_preempted
) {
5381 scrubber
.preempt_left
--;
5382 scrubber
.preempt_divisor
*= 2;
5383 dout(10) << __func__
<< " preempted, " << scrubber
.preempt_left
5384 << " left" << dendl
;
5385 scrub_preempted
= false;
5387 scrub_can_preempt
= scrubber
.preempt_left
> 0;
5390 /* get the start and end of our scrub chunk
5392 * Our scrub chunk has an important restriction we're going to need to
5393 * respect. We can't let head be start or end.
5394 * Using a half-open interval means that if end == head,
5395 * we'd scrub/lock head and the clone right next to head in different
5396 * chunks which would allow us to miss clones created between
5397 * scrubbing that chunk and scrubbing the chunk including head.
5398 * This isn't true for any of the other clones since clones can
5399 * only be created "just to the left of" head. There is one exception
5400 * to this: promotion of clones which always happens to the left of the
5401 * left-most clone, but promote_object checks the scrubber in that
5402 * case, so it should be ok. Also, it's ok to "miss" clones at the
5403 * left end of the range if we are a tier because they may legitimately
5404 * not exist (see _scrub).
5406 int min
= std::max
<int64_t>(3, cct
->_conf
->osd_scrub_chunk_min
/
5407 scrubber
.preempt_divisor
);
5408 int max
= std::max
<int64_t>(min
, cct
->_conf
->osd_scrub_chunk_max
/
5409 scrubber
.preempt_divisor
);
5410 hobject_t start
= scrubber
.start
;
5411 hobject_t candidate_end
;
5412 vector
<hobject_t
> objects
;
5413 ret
= get_pgbackend()->objects_list_partial(
5419 ceph_assert(ret
>= 0);
5421 if (!objects
.empty()) {
5422 hobject_t back
= objects
.back();
5423 while (candidate_end
.is_head() &&
5424 candidate_end
== back
.get_head()) {
5425 candidate_end
= back
;
5427 if (objects
.empty()) {
5429 "Somehow we got more than 2 objects which"
5430 "have the same head but are not clones");
5432 back
= objects
.back();
5434 if (candidate_end
.is_head()) {
5435 ceph_assert(candidate_end
!= back
.get_head());
5436 candidate_end
= candidate_end
.get_object_boundary();
5439 ceph_assert(candidate_end
.is_max());
5442 if (!_range_available_for_scrub(scrubber
.start
, candidate_end
)) {
5443 // we'll be requeued by whatever made us unavailable for scrub
5444 dout(10) << __func__
<< ": scrub blocked somewhere in range "
5445 << "[" << scrubber
.start
<< ", " << candidate_end
<< ")"
5450 scrubber
.end
= candidate_end
;
5451 if (scrubber
.end
> scrubber
.max_end
)
5452 scrubber
.max_end
= scrubber
.end
;
5455 // walk the log to find the latest update that affects our chunk
5456 scrubber
.subset_last_update
= eversion_t();
5457 for (auto p
= projected_log
.log
.rbegin();
5458 p
!= projected_log
.log
.rend();
5460 if (p
->soid
>= scrubber
.start
&&
5461 p
->soid
< scrubber
.end
) {
5462 scrubber
.subset_last_update
= p
->version
;
5466 if (scrubber
.subset_last_update
== eversion_t()) {
5467 for (list
<pg_log_entry_t
>::const_reverse_iterator p
=
5468 pg_log
.get_log().log
.rbegin();
5469 p
!= pg_log
.get_log().log
.rend();
5471 if (p
->soid
>= scrubber
.start
&&
5472 p
->soid
< scrubber
.end
) {
5473 scrubber
.subset_last_update
= p
->version
;
5479 scrubber
.state
= PG::Scrubber::WAIT_PUSHES
;
5482 case PG::Scrubber::WAIT_PUSHES
:
5483 if (active_pushes
== 0) {
5484 scrubber
.state
= PG::Scrubber::WAIT_LAST_UPDATE
;
5486 dout(15) << "wait for pushes to apply" << dendl
;
5491 case PG::Scrubber::WAIT_LAST_UPDATE
:
5492 if (last_update_applied
< scrubber
.subset_last_update
) {
5493 // will be requeued by op_applied
5494 dout(15) << "wait for EC read/modify/writes to queue" << dendl
;
5499 // ask replicas to scan
5500 scrubber
.waiting_on_whom
.insert(pg_whoami
);
5502 // request maps from replicas
5503 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
5504 i
!= acting_recovery_backfill
.end();
5506 if (*i
== pg_whoami
) continue;
5507 _request_scrub_map(*i
, scrubber
.subset_last_update
,
5508 scrubber
.start
, scrubber
.end
, scrubber
.deep
,
5509 scrubber
.preempt_left
> 0);
5510 scrubber
.waiting_on_whom
.insert(*i
);
5512 dout(10) << __func__
<< " waiting_on_whom " << scrubber
.waiting_on_whom
5515 scrubber
.state
= PG::Scrubber::BUILD_MAP
;
5516 scrubber
.primary_scrubmap_pos
.reset();
5519 case PG::Scrubber::BUILD_MAP
:
5520 ceph_assert(last_update_applied
>= scrubber
.subset_last_update
);
5522 // build my own scrub map
5523 if (scrub_preempted
) {
5524 dout(10) << __func__
<< " preempted" << dendl
;
5525 scrubber
.state
= PG::Scrubber::BUILD_MAP_DONE
;
5528 ret
= build_scrub_map_chunk(
5529 scrubber
.primary_scrubmap
,
5530 scrubber
.primary_scrubmap_pos
,
5531 scrubber
.start
, scrubber
.end
,
5534 if (ret
== -EINPROGRESS
) {
5539 scrubber
.state
= PG::Scrubber::BUILD_MAP_DONE
;
5542 case PG::Scrubber::BUILD_MAP_DONE
:
5543 if (scrubber
.primary_scrubmap_pos
.ret
< 0) {
5544 dout(5) << "error: " << scrubber
.primary_scrubmap_pos
.ret
5545 << ", aborting" << dendl
;
5546 scrub_clear_state();
5547 scrub_unreserve_replicas();
5550 dout(10) << __func__
<< " waiting_on_whom was "
5551 << scrubber
.waiting_on_whom
<< dendl
;
5552 ceph_assert(scrubber
.waiting_on_whom
.count(pg_whoami
));
5553 scrubber
.waiting_on_whom
.erase(pg_whoami
);
5555 scrubber
.state
= PG::Scrubber::WAIT_REPLICAS
;
5558 case PG::Scrubber::WAIT_REPLICAS
:
5559 if (!scrubber
.waiting_on_whom
.empty()) {
5560 // will be requeued by sub_op_scrub_map
5561 dout(10) << "wait for replicas to build scrub map" << dendl
;
5565 // end (possible) preemption window
5566 scrub_can_preempt
= false;
5567 if (scrub_preempted
) {
5568 dout(10) << __func__
<< " preempted, restarting chunk" << dendl
;
5569 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
5571 scrubber
.state
= PG::Scrubber::COMPARE_MAPS
;
5575 case PG::Scrubber::COMPARE_MAPS
:
5576 ceph_assert(last_update_applied
>= scrubber
.subset_last_update
);
5577 ceph_assert(scrubber
.waiting_on_whom
.empty());
5579 scrub_compare_maps();
5580 scrubber
.start
= scrubber
.end
;
5581 scrubber
.run_callbacks();
5583 // requeue the writes from the chunk that just finished
5584 requeue_ops(waiting_for_scrub
);
5586 scrubber
.state
= PG::Scrubber::WAIT_DIGEST_UPDATES
;
5590 case PG::Scrubber::WAIT_DIGEST_UPDATES
:
5591 if (scrubber
.num_digest_updates_pending
) {
5592 dout(10) << __func__
<< " waiting on "
5593 << scrubber
.num_digest_updates_pending
5594 << " digest updates" << dendl
;
5599 scrubber
.preempt_left
= cct
->_conf
.get_val
<uint64_t>(
5600 "osd_scrub_max_preemptions");
5601 scrubber
.preempt_divisor
= 1;
5603 if (!(scrubber
.end
.is_max())) {
5604 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
5608 scrubber
.state
= PG::Scrubber::FINISH
;
5613 case PG::Scrubber::FINISH
:
5615 scrubber
.state
= PG::Scrubber::INACTIVE
;
5618 if (!snap_trimq
.empty()) {
5619 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl
;
5620 snap_trimmer_scrub_complete();
5625 case PG::Scrubber::BUILD_MAP_REPLICA
:
5626 // build my own scrub map
5627 if (scrub_preempted
) {
5628 dout(10) << __func__
<< " preempted" << dendl
;
5631 ret
= build_scrub_map_chunk(
5632 scrubber
.replica_scrubmap
,
5633 scrubber
.replica_scrubmap_pos
,
5634 scrubber
.start
, scrubber
.end
,
5638 if (ret
== -EINPROGRESS
) {
5645 MOSDRepScrubMap
*reply
= new MOSDRepScrubMap(
5646 spg_t(info
.pgid
.pgid
, get_primary().shard
),
5647 scrubber
.replica_scrub_start
,
5649 reply
->preempted
= scrub_preempted
;
5650 ::encode(scrubber
.replica_scrubmap
, reply
->get_data());
5651 osd
->send_message_osd_cluster(
5652 get_primary().osd
, reply
,
5653 scrubber
.replica_scrub_start
);
5655 scrub_preempted
= false;
5656 scrub_can_preempt
= false;
5657 scrubber
.state
= PG::Scrubber::INACTIVE
;
5658 scrubber
.replica_scrubmap
= ScrubMap();
5659 scrubber
.replica_scrubmap_pos
= ScrubMapBuilder();
5660 scrubber
.start
= hobject_t();
5661 scrubber
.end
= hobject_t();
5662 scrubber
.max_end
= hobject_t();
5670 dout(20) << "scrub final state " << Scrubber::state_string(scrubber
.state
)
5671 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")"
5672 << " max_end " << scrubber
.max_end
<< dendl
;
5675 bool PG::write_blocked_by_scrub(const hobject_t
& soid
)
5677 if (soid
< scrubber
.start
|| soid
>= scrubber
.end
) {
5680 if (scrub_can_preempt
) {
5681 if (!scrub_preempted
) {
5682 dout(10) << __func__
<< " " << soid
<< " preempted" << dendl
;
5683 scrub_preempted
= true;
5685 dout(10) << __func__
<< " " << soid
<< " already preempted" << dendl
;
5692 bool PG::range_intersects_scrub(const hobject_t
&start
, const hobject_t
& end
)
5694 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5695 return (start
< scrubber
.max_end
&&
5696 end
>= scrubber
.start
);
5699 void PG::scrub_clear_state(bool has_error
)
5701 ceph_assert(is_locked());
5702 state_clear(PG_STATE_SCRUBBING
);
5704 state_clear(PG_STATE_REPAIR
);
5705 state_clear(PG_STATE_DEEP_SCRUB
);
5706 publish_stats_to_osd();
5708 // local -> nothing.
5709 if (scrubber
.local_reserved
) {
5710 osd
->dec_scrubs_local();
5711 scrubber
.local_reserved
= false;
5712 scrubber
.reserved_peers
.clear();
5715 requeue_ops(waiting_for_scrub
);
5719 // type-specific state clear
5720 _scrub_clear_state();
5723 void PG::scrub_compare_maps()
5725 dout(10) << __func__
<< " has maps, analyzing" << dendl
;
5727 // construct authoritative scrub map for type specific scrubbing
5728 scrubber
.cleaned_meta_map
.insert(scrubber
.primary_scrubmap
);
5730 pair
<boost::optional
<uint32_t>,
5731 boost::optional
<uint32_t>>> missing_digest
;
5733 map
<pg_shard_t
, ScrubMap
*> maps
;
5734 maps
[pg_whoami
] = &scrubber
.primary_scrubmap
;
5736 for (const auto& i
: acting_recovery_backfill
) {
5737 if (i
== pg_whoami
) continue;
5738 dout(2) << __func__
<< " replica " << i
<< " has "
5739 << scrubber
.received_maps
[i
].objects
.size()
5740 << " items" << dendl
;
5741 maps
[i
] = &scrubber
.received_maps
[i
];
5744 set
<hobject_t
> master_set
;
5746 // Construct master set
5747 for (const auto map
: maps
) {
5748 for (const auto i
: map
.second
->objects
) {
5749 master_set
.insert(i
.first
);
5754 get_pgbackend()->be_omap_checks(maps
, master_set
,
5755 scrubber
.omap_stats
, ss
);
5757 if (!ss
.str().empty()) {
5758 osd
->clog
->warn(ss
);
5761 if (acting
.size() > 1) {
5762 dout(10) << __func__
<< " comparing replica scrub maps" << dendl
;
5764 // Map from object with errors to good peer
5765 map
<hobject_t
, list
<pg_shard_t
>> authoritative
;
5767 dout(2) << __func__
<< " osd." << acting
[0] << " has "
5768 << scrubber
.primary_scrubmap
.objects
.size() << " items" << dendl
;
5773 get_pgbackend()->be_compare_scrubmaps(
5776 state_test(PG_STATE_REPAIR
),
5778 scrubber
.inconsistent
,
5781 scrubber
.shallow_errors
,
5782 scrubber
.deep_errors
,
5783 scrubber
.store
.get(),
5786 dout(2) << ss
.str() << dendl
;
5788 if (!ss
.str().empty()) {
5789 osd
->clog
->error(ss
);
5792 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
5793 i
!= authoritative
.end();
5795 list
<pair
<ScrubMap::object
, pg_shard_t
> > good_peers
;
5796 for (list
<pg_shard_t
>::const_iterator j
= i
->second
.begin();
5797 j
!= i
->second
.end();
5799 good_peers
.push_back(make_pair(maps
[*j
]->objects
[i
->first
], *j
));
5801 scrubber
.authoritative
.insert(
5807 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
5808 i
!= authoritative
.end();
5810 scrubber
.cleaned_meta_map
.objects
.erase(i
->first
);
5811 scrubber
.cleaned_meta_map
.objects
.insert(
5812 *(maps
[i
->second
.back()]->objects
.find(i
->first
))
5817 ScrubMap for_meta_scrub
;
5818 scrubber
.clean_meta_map(for_meta_scrub
);
5820 // ok, do the pg-type specific scrubbing
5821 scrub_snapshot_metadata(for_meta_scrub
, missing_digest
);
5822 // Called here on the primary can use an authoritative map if it isn't the primary
5823 _scan_snaps(for_meta_scrub
);
5824 if (!scrubber
.store
->empty()) {
5825 if (state_test(PG_STATE_REPAIR
)) {
5826 dout(10) << __func__
<< ": discarding scrub results" << dendl
;
5827 scrubber
.store
->flush(nullptr);
5829 dout(10) << __func__
<< ": updating scrub object" << dendl
;
5830 ObjectStore::Transaction t
;
5831 scrubber
.store
->flush(&t
);
5832 osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
5837 bool PG::scrub_process_inconsistent()
5839 dout(10) << __func__
<< ": checking authoritative" << dendl
;
5840 bool repair
= state_test(PG_STATE_REPAIR
);
5841 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5842 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5844 // authoriative only store objects which missing or inconsistent.
5845 if (!scrubber
.authoritative
.empty()) {
5847 ss
<< info
.pgid
<< " " << mode
<< " "
5848 << scrubber
.missing
.size() << " missing, "
5849 << scrubber
.inconsistent
.size() << " inconsistent objects";
5850 dout(2) << ss
.str() << dendl
;
5851 osd
->clog
->error(ss
);
5853 state_clear(PG_STATE_CLEAN
);
5854 for (map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >>::iterator i
=
5855 scrubber
.authoritative
.begin();
5856 i
!= scrubber
.authoritative
.end();
5858 set
<pg_shard_t
>::iterator j
;
5860 auto missing_entry
= scrubber
.missing
.find(i
->first
);
5861 if (missing_entry
!= scrubber
.missing
.end()) {
5862 for (j
= missing_entry
->second
.begin();
5863 j
!= missing_entry
->second
.end();
5872 if (scrubber
.inconsistent
.count(i
->first
)) {
5873 for (j
= scrubber
.inconsistent
[i
->first
].begin();
5874 j
!= scrubber
.inconsistent
[i
->first
].end();
5876 repair_object(i
->first
,
5885 return (!scrubber
.authoritative
.empty() && repair
);
5888 bool PG::ops_blocked_by_scrub() const {
5889 return (waiting_for_scrub
.size() != 0);
5892 // the part that actually finalizes a scrub
5893 void PG::scrub_finish()
5895 dout(20) << __func__
<< dendl
;
5896 bool repair
= state_test(PG_STATE_REPAIR
);
5897 bool do_auto_scrub
= false;
5898 // if the repair request comes from auto-repair and large number of errors,
5899 // we would like to cancel auto-repair
5900 if (repair
&& scrubber
.auto_repair
5901 && scrubber
.authoritative
.size() > cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
5902 state_clear(PG_STATE_REPAIR
);
5905 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5906 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5908 // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
5909 if (scrubber
.deep_scrub_on_error
5910 && scrubber
.authoritative
.size()
5911 && scrubber
.authoritative
.size() <= cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
5912 ceph_assert(!deep_scrub
);
5913 do_auto_scrub
= true;
5914 dout(20) << __func__
<< " Try to auto repair after scrub errors" << dendl
;
5916 scrubber
.deep_scrub_on_error
= false;
5918 // type-specific finish (can tally more errors)
5921 bool has_error
= scrub_process_inconsistent();
5925 oss
<< info
.pgid
.pgid
<< " " << mode
<< " ";
5926 int total_errors
= scrubber
.shallow_errors
+ scrubber
.deep_errors
;
5928 oss
<< total_errors
<< " errors";
5931 if (!deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
)
5932 oss
<< " ( " << info
.stats
.stats
.sum
.num_deep_scrub_errors
5933 << " remaining deep scrub error details lost)";
5935 oss
<< ", " << scrubber
.fixed
<< " fixed";
5937 osd
->clog
->error(oss
);
5939 osd
->clog
->debug(oss
);
5944 utime_t now
= ceph_clock_now();
5945 info
.history
.last_scrub
= info
.last_update
;
5946 info
.history
.last_scrub_stamp
= now
;
5947 if (scrubber
.deep
) {
5948 info
.history
.last_deep_scrub
= info
.last_update
;
5949 info
.history
.last_deep_scrub_stamp
= now
;
5951 // Since we don't know which errors were fixed, we can only clear them
5952 // when every one has been fixed.
5954 if (scrubber
.fixed
== scrubber
.shallow_errors
+ scrubber
.deep_errors
) {
5955 ceph_assert(deep_scrub
);
5956 scrubber
.shallow_errors
= scrubber
.deep_errors
= 0;
5957 dout(20) << __func__
<< " All may be fixed" << dendl
;
5958 } else if (has_error
) {
5959 // Deep scrub in order to get corrected error counts
5960 scrub_after_recovery
= true;
5961 dout(20) << __func__
<< " Set scrub_after_recovery" << dendl
;
5962 } else if (scrubber
.shallow_errors
|| scrubber
.deep_errors
) {
5963 // We have errors but nothing can be fixed, so there is no repair
5965 state_set(PG_STATE_FAILED_REPAIR
);
5966 dout(10) << __func__
<< " " << (scrubber
.shallow_errors
+ scrubber
.deep_errors
)
5967 << " error(s) present with no repair possible" << dendl
;
5971 if ((scrubber
.shallow_errors
== 0) && (scrubber
.deep_errors
== 0))
5972 info
.history
.last_clean_scrub_stamp
= now
;
5973 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
5974 info
.stats
.stats
.sum
.num_deep_scrub_errors
= scrubber
.deep_errors
;
5975 info
.stats
.stats
.sum
.num_large_omap_objects
= scrubber
.omap_stats
.large_omap_objects
;
5976 info
.stats
.stats
.sum
.num_omap_bytes
= scrubber
.omap_stats
.omap_bytes
;
5977 info
.stats
.stats
.sum
.num_omap_keys
= scrubber
.omap_stats
.omap_keys
;
5978 dout(25) << __func__
<< " shard " << pg_whoami
<< " num_omap_bytes = "
5979 << info
.stats
.stats
.sum
.num_omap_bytes
<< " num_omap_keys = "
5980 << info
.stats
.stats
.sum
.num_omap_keys
<< dendl
;
5982 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
5983 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5984 // because of deep-scrub errors
5985 if (scrubber
.shallow_errors
== 0)
5986 info
.history
.last_clean_scrub_stamp
= now
;
5988 info
.stats
.stats
.sum
.num_scrub_errors
=
5989 info
.stats
.stats
.sum
.num_shallow_scrub_errors
+
5990 info
.stats
.stats
.sum
.num_deep_scrub_errors
;
5991 if (scrubber
.check_repair
) {
5992 scrubber
.check_repair
= false;
5993 if (info
.stats
.stats
.sum
.num_scrub_errors
) {
5994 state_set(PG_STATE_FAILED_REPAIR
);
5995 dout(10) << __func__
<< " " << info
.stats
.stats
.sum
.num_scrub_errors
5996 << " error(s) still present after re-scrub" << dendl
;
5999 publish_stats_to_osd();
6002 ObjectStore::Transaction t
;
6005 int tr
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
6006 ceph_assert(tr
== 0);
6011 queue_peering_event(
6013 std::make_shared
<PGPeeringEvent
>(
6019 scrub_clear_state(has_error
);
6020 scrub_unreserve_replicas();
6022 if (do_auto_scrub
) {
6023 scrub_requested(false, false, true);
6028 if (is_active() && is_primary()) {
6033 void PG::share_pg_info()
6035 dout(10) << "share_pg_info" << dendl
;
6037 // share new pg_info_t with replicas
6038 ceph_assert(!acting_recovery_backfill
.empty());
6039 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
6040 i
!= acting_recovery_backfill
.end();
6042 if (*i
== pg_whoami
) continue;
6044 auto peer
= peer_info
.find(pg_shard
);
6045 if (peer
!= peer_info
.end()) {
6046 peer
->second
.last_epoch_started
= info
.last_epoch_started
;
6047 peer
->second
.last_interval_started
= info
.last_interval_started
;
6048 peer
->second
.history
.merge(info
.history
);
6050 MOSDPGInfo
*m
= new MOSDPGInfo(get_osdmap_epoch());
6051 m
->pg_list
.push_back(
6054 pg_shard
.shard
, pg_whoami
.shard
,
6059 osd
->send_message_osd_cluster(pg_shard
.osd
, m
, get_osdmap_epoch());
6063 bool PG::append_log_entries_update_missing(
6064 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
6065 ObjectStore::Transaction
&t
, boost::optional
<eversion_t
> trim_to
,
6066 boost::optional
<eversion_t
> roll_forward_to
)
6068 ceph_assert(!entries
.empty());
6069 ceph_assert(entries
.begin()->version
> info
.last_update
);
6071 PGLogEntryHandler rollbacker
{this, &t
};
6072 bool invalidate_stats
=
6073 pg_log
.append_new_log_entries(info
.last_backfill
,
6074 info
.last_backfill_bitwise
,
6078 if (roll_forward_to
&& entries
.rbegin()->soid
> info
.last_backfill
) {
6079 pg_log
.roll_forward(&rollbacker
);
6081 if (roll_forward_to
&& *roll_forward_to
> pg_log
.get_can_rollback_to()) {
6082 pg_log
.roll_forward_to(*roll_forward_to
, &rollbacker
);
6083 last_rollback_info_trimmed_to_applied
= *roll_forward_to
;
6086 info
.last_update
= pg_log
.get_head();
6088 if (pg_log
.get_missing().num_missing() == 0) {
6089 // advance last_complete since nothing else is missing!
6090 info
.last_complete
= info
.last_update
;
6092 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
6094 dout(20) << __func__
<< " trim_to bool = " << bool(trim_to
) << " trim_to = " << (trim_to
? *trim_to
: eversion_t()) << dendl
;
6096 pg_log
.trim(*trim_to
, info
);
6099 return invalidate_stats
;
6103 void PG::merge_new_log_entries(
6104 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
6105 ObjectStore::Transaction
&t
,
6106 boost::optional
<eversion_t
> trim_to
,
6107 boost::optional
<eversion_t
> roll_forward_to
)
6109 dout(10) << __func__
<< " " << entries
<< dendl
;
6110 ceph_assert(is_primary());
6112 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
, trim_to
, roll_forward_to
);
6113 for (set
<pg_shard_t
>::const_iterator i
= acting_recovery_backfill
.begin();
6114 i
!= acting_recovery_backfill
.end();
6116 pg_shard_t
peer(*i
);
6117 if (peer
== pg_whoami
) continue;
6118 ceph_assert(peer_missing
.count(peer
));
6119 ceph_assert(peer_info
.count(peer
));
6120 pg_missing_t
& pmissing(peer_missing
[peer
]);
6121 dout(20) << __func__
<< " peer_missing for " << peer
<< " = " << pmissing
<< dendl
;
6122 pg_info_t
& pinfo(peer_info
[peer
]);
6123 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
6124 pinfo
.last_backfill
,
6125 info
.last_backfill_bitwise
,
6132 pinfo
.last_update
= info
.last_update
;
6133 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
6134 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
6137 if (!rebuild_missing
) {
6141 for (auto &&i
: entries
) {
6142 missing_loc
.rebuild(
6145 acting_recovery_backfill
,
6147 pg_log
.get_missing(),
6153 void PG::update_history(const pg_history_t
& new_history
)
6155 if (info
.history
.merge(new_history
)) {
6156 dout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
6158 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
6159 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
6160 past_intervals
.clear();
6161 dirty_big_info
= true;
6164 on_info_history_change();
6167 void PG::fulfill_info(
6168 pg_shard_t from
, const pg_query_t
&query
,
6169 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
6171 ceph_assert(from
== primary
);
6172 ceph_assert(query
.type
== pg_query_t::INFO
);
6175 dout(10) << "sending info" << dendl
;
6176 notify_info
= make_pair(from
, info
);
6179 void PG::fulfill_log(
6180 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
6182 dout(10) << "log request from " << from
<< dendl
;
6183 ceph_assert(from
== primary
);
6184 ceph_assert(query
.type
!= pg_query_t::INFO
);
6185 ConnectionRef con
= osd
->get_con_osd_cluster(
6186 from
.osd
, get_osdmap_epoch());
6189 MOSDPGLog
*mlog
= new MOSDPGLog(
6190 from
.shard
, pg_whoami
.shard
,
6193 mlog
->missing
= pg_log
.get_missing();
6195 // primary -> other, when building master log
6196 if (query
.type
== pg_query_t::LOG
) {
6197 dout(10) << " sending info+missing+log since " << query
.since
6199 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
6200 osd
->clog
->error() << info
.pgid
<< " got broken pg_query_t::LOG since " << query
.since
6201 << " when my log.tail is " << pg_log
.get_tail()
6202 << ", sending full log instead";
6203 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
6205 mlog
->log
.copy_after(cct
, pg_log
.get_log(), query
.since
);
6207 else if (query
.type
== pg_query_t::FULLLOG
) {
6208 dout(10) << " sending info+missing+full log" << dendl
;
6209 mlog
->log
= pg_log
.get_log();
6212 dout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
6214 osd
->share_map_peer(from
.osd
, con
.get(), get_osdmap());
6215 osd
->send_message_osd_cluster(mlog
, con
.get());
6218 void PG::fulfill_query(const MQuery
& query
, RecoveryCtx
*rctx
)
6220 if (query
.query
.type
== pg_query_t::INFO
) {
6221 pair
<pg_shard_t
, pg_info_t
> notify_info
;
6222 update_history(query
.query
.history
);
6223 fulfill_info(query
.from
, query
.query
, notify_info
);
6227 notify_info
.first
.shard
, pg_whoami
.shard
,
6230 notify_info
.second
),
6233 update_history(query
.query
.history
);
6234 fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
6238 void PG::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
6240 bool changed
= false;
6241 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
) &&
6242 !lastmap
->test_flag(CEPH_OSDMAP_FULL
)) {
6243 dout(10) << " cluster was marked full in " << osdmap
->get_epoch() << dendl
;
6246 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
6248 return; // pool deleted
6250 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
6251 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
6252 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
6253 dout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
6258 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
6263 bool PG::should_restart_peering(
6265 int newactingprimary
,
6266 const vector
<int>& newup
,
6267 const vector
<int>& newacting
,
6271 if (PastIntervals::is_new_interval(
6283 dout(20) << "new interval newup " << newup
6284 << " newacting " << newacting
<< dendl
;
6287 if (!lastmap
->is_up(osd
->whoami
) && osdmap
->is_up(osd
->whoami
)) {
6288 dout(10) << __func__
<< " osd transitioned from down -> up" << dendl
;
6294 bool PG::old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
)
6296 if (last_peering_reset
> reply_epoch
||
6297 last_peering_reset
> query_epoch
) {
6298 dout(10) << "old_peering_msg reply_epoch " << reply_epoch
<< " query_epoch " << query_epoch
6299 << " last_peering_reset " << last_peering_reset
6306 void PG::set_last_peering_reset()
6308 dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl
;
6309 if (last_peering_reset
!= get_osdmap_epoch()) {
6310 last_peering_reset
= get_osdmap_epoch();
6311 reset_interval_flush();
6318 FlushState(PG
*pg
, epoch_t epoch
) : pg(pg
), epoch(epoch
) {}
6321 if (!pg
->pg_has_reset_since(epoch
))
6326 typedef std::shared_ptr
<FlushState
> FlushStateRef
;
6328 void PG::start_flush(ObjectStore::Transaction
*t
)
6330 // flush in progress ops
6331 FlushStateRef
flush_trigger (std::make_shared
<FlushState
>(
6332 this, get_osdmap_epoch()));
6333 flushes_in_progress
++;
6334 t
->register_on_applied(new ContainerContext
<FlushStateRef
>(flush_trigger
));
6335 t
->register_on_commit(new ContainerContext
<FlushStateRef
>(flush_trigger
));
6338 void PG::reset_interval_flush()
6340 dout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
6341 recovery_state
.clear_blocked_outgoing();
6343 Context
*c
= new QueuePeeringEvt
<IntervalFlush
>(
6344 this, get_osdmap_epoch(), IntervalFlush());
6345 if (!ch
->flush_commit(c
)) {
6346 dout(10) << "Beginning to block outgoing recovery messages" << dendl
;
6347 recovery_state
.begin_block_outgoing();
6349 dout(10) << "Not blocking outgoing recovery messages" << dendl
;
6354 /* Called before initializing peering during advance_map */
6355 void PG::start_peering_interval(
6356 const OSDMapRef lastmap
,
6357 const vector
<int>& newup
, int new_up_primary
,
6358 const vector
<int>& newacting
, int new_acting_primary
,
6359 ObjectStore::Transaction
*t
)
6361 const OSDMapRef osdmap
= get_osdmap();
6363 set_last_peering_reset();
6365 vector
<int> oldacting
, oldup
;
6366 int oldrole
= get_role();
6369 osd
->clear_ready_to_merge(this);
6372 pg_shard_t old_acting_primary
= get_primary();
6373 pg_shard_t old_up_primary
= up_primary
;
6374 bool was_old_primary
= is_primary();
6375 bool was_old_replica
= is_replica();
6377 acting
.swap(oldacting
);
6379 init_primary_up_acting(
6383 new_acting_primary
);
6385 if (info
.stats
.up
!= up
||
6386 info
.stats
.acting
!= acting
||
6387 info
.stats
.up_primary
!= new_up_primary
||
6388 info
.stats
.acting_primary
!= new_acting_primary
) {
6390 info
.stats
.up_primary
= new_up_primary
;
6391 info
.stats
.acting
= acting
;
6392 info
.stats
.acting_primary
= new_acting_primary
;
6393 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
6396 pg_stats_publish_lock
.Lock();
6397 pg_stats_publish_valid
= false;
6398 pg_stats_publish_lock
.Unlock();
6400 // This will now be remapped during a backfill in cases
6401 // that it would not have been before.
6403 state_set(PG_STATE_REMAPPED
);
6405 state_clear(PG_STATE_REMAPPED
);
6407 int role
= osdmap
->calc_pg_role(osd
->whoami
, acting
, acting
.size());
6408 if (pool
.info
.is_replicated() || role
== pg_whoami
.shard
)
6413 // did acting, up, primary|acker change?
6415 dout(10) << " no lastmap" << dendl
;
6417 dirty_big_info
= true;
6418 info
.history
.same_interval_since
= osdmap
->get_epoch();
6420 std::stringstream debug
;
6421 ceph_assert(info
.history
.same_interval_since
!= 0);
6422 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
6423 get_is_recoverable_predicate());
6424 bool new_interval
= PastIntervals::check_new_interval(
6425 old_acting_primary
.osd
,
6427 oldacting
, newacting
,
6431 info
.history
.same_interval_since
,
6432 info
.history
.last_epoch_clean
,
6439 dout(10) << __func__
<< ": check_new_interval output: "
6440 << debug
.str() << dendl
;
6442 if (osdmap
->get_epoch() == osd
->get_superblock().oldest_map
&&
6443 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
6444 dout(10) << " map gap, clearing past_intervals and faking" << dendl
;
6445 // our information is incomplete and useless; someone else was clean
6446 // after everything we know if osdmaps were trimmed.
6447 past_intervals
.clear();
6449 dout(10) << " noting past " << past_intervals
<< dendl
;
6452 dirty_big_info
= true;
6453 info
.history
.same_interval_since
= osdmap
->get_epoch();
6454 if (osdmap
->have_pg_pool(info
.pgid
.pgid
.pool()) &&
6455 info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
6456 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
6458 info
.history
.last_epoch_split
= osdmap
->get_epoch();
6463 if (old_up_primary
!= up_primary
||
6465 info
.history
.same_up_since
= osdmap
->get_epoch();
6467 // this comparison includes primary rank via pg_shard_t
6468 if (old_acting_primary
!= get_primary()) {
6469 info
.history
.same_primary_since
= osdmap
->get_epoch();
6474 dout(1) << __func__
<< " up " << oldup
<< " -> " << up
6475 << ", acting " << oldacting
<< " -> " << acting
6476 << ", acting_primary " << old_acting_primary
<< " -> " << new_acting_primary
6477 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
6478 << ", role " << oldrole
<< " -> " << role
6479 << ", features acting " << acting_features
6480 << " upacting " << upacting_features
6484 state_clear(PG_STATE_ACTIVE
);
6485 state_clear(PG_STATE_PEERED
);
6486 state_clear(PG_STATE_PREMERGE
);
6487 state_clear(PG_STATE_DOWN
);
6488 state_clear(PG_STATE_RECOVERY_WAIT
);
6489 state_clear(PG_STATE_RECOVERY_TOOFULL
);
6490 state_clear(PG_STATE_RECOVERING
);
6492 peer_purged
.clear();
6493 acting_recovery_backfill
.clear();
6494 scrub_queued
= false;
6496 // reset primary/replica state?
6497 if (was_old_primary
|| is_primary()) {
6498 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
6499 } else if (was_old_replica
|| is_replica()) {
6500 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
6502 clear_primary_state();
6508 projected_last_update
= eversion_t();
6510 ceph_assert(!deleting
);
6512 // should we tell the primary we are here?
6513 send_notify
= !is_primary();
6515 if (role
!= oldrole
||
6516 was_old_primary
!= is_primary()) {
6517 // did primary change?
6518 if (was_old_primary
!= is_primary()) {
6519 state_clear(PG_STATE_CLEAN
);
6520 clear_publish_stats();
6525 // take active waiters
6526 requeue_ops(waiting_for_peered
);
6530 // did primary change?
6531 if (get_primary() != old_acting_primary
) {
6532 dout(10) << *this << " " << oldacting
<< " -> " << acting
6533 << ", acting primary "
6534 << old_acting_primary
<< " -> " << get_primary()
6537 // primary is the same.
6539 // i am (still) primary. but my replica set changed.
6540 state_clear(PG_STATE_CLEAN
);
6542 dout(10) << oldacting
<< " -> " << acting
6543 << ", replicas changed" << dendl
;
6549 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
6550 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
6551 osd
->queue_want_pg_temp(info
.pgid
.pgid
, acting
);
6555 void PG::on_new_interval()
6557 const OSDMapRef osdmap
= get_osdmap();
6559 on_info_history_change();
6561 // initialize features
6562 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
6563 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
6564 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
6565 if (*p
== CRUSH_ITEM_NONE
)
6567 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
6568 acting_features
&= f
;
6569 upacting_features
&= f
;
6571 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
6572 if (*p
== CRUSH_ITEM_NONE
)
6574 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
6580 void PG::proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
6582 ceph_assert(!is_primary());
6584 update_history(oinfo
.history
);
6585 if (!info
.stats
.stats_invalid
&& info
.stats
.stats
.sum
.num_scrub_errors
) {
6586 info
.stats
.stats
.sum
.num_scrub_errors
= 0;
6587 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= 0;
6588 info
.stats
.stats
.sum
.num_deep_scrub_errors
= 0;
6592 if (!(info
.purged_snaps
== oinfo
.purged_snaps
)) {
6593 dout(10) << __func__
<< " updating purged_snaps to " << oinfo
.purged_snaps
6595 info
.purged_snaps
= oinfo
.purged_snaps
;
6597 dirty_big_info
= true;
6601 ostream
& operator<<(ostream
& out
, const PG
& pg
)
6603 out
<< "pg[" << pg
.info
6605 if (pg
.acting
!= pg
.up
)
6606 out
<< "/" << pg
.acting
;
6608 out
<< "p" << pg
.get_primary();
6609 if (!pg
.async_recovery_targets
.empty())
6610 out
<< " async=[" << pg
.async_recovery_targets
<< "]";
6611 if (!pg
.backfill_targets
.empty())
6612 out
<< " backfill=[" << pg
.backfill_targets
<< "]";
6613 out
<< " r=" << pg
.get_role();
6614 out
<< " lpr=" << pg
.get_last_peering_reset();
6619 if (!pg
.past_intervals
.empty()) {
6620 out
<< " pi=[" << pg
.past_intervals
.get_bounds()
6621 << ")/" << pg
.past_intervals
.size();
6624 if (pg
.is_peered()) {
6625 if (pg
.last_update_ondisk
!= pg
.info
.last_update
)
6626 out
<< " luod=" << pg
.last_update_ondisk
;
6627 if (pg
.last_update_applied
!= pg
.info
.last_update
)
6628 out
<< " lua=" << pg
.last_update_applied
;
6631 if (pg
.recovery_ops_active
)
6632 out
<< " rops=" << pg
.recovery_ops_active
;
6634 if (pg
.pg_log
.get_tail() != pg
.info
.log_tail
||
6635 pg
.pg_log
.get_head() != pg
.info
.last_update
)
6636 out
<< " (info mismatch, " << pg
.pg_log
.get_log() << ")";
6638 if (!pg
.pg_log
.get_log().empty()) {
6639 if ((pg
.pg_log
.get_log().log
.begin()->version
<= pg
.pg_log
.get_tail())) {
6640 out
<< " (log bound mismatch, actual=["
6641 << pg
.pg_log
.get_log().log
.begin()->version
<< ","
6642 << pg
.pg_log
.get_log().log
.rbegin()->version
<< "]";
6647 out
<< " crt=" << pg
.pg_log
.get_can_rollback_to();
6649 if (pg
.last_complete_ondisk
!= pg
.info
.last_complete
)
6650 out
<< " lcod " << pg
.last_complete_ondisk
;
6652 if (pg
.is_primary()) {
6653 out
<< " mlcod " << pg
.min_last_complete_ondisk
;
6656 out
<< " " << pg_state_string(pg
.get_state());
6657 if (pg
.should_send_notify())
6660 if (pg
.scrubber
.must_repair
)
6661 out
<< " MUST_REPAIR";
6662 if (pg
.scrubber
.auto_repair
)
6663 out
<< " AUTO_REPAIR";
6664 if (pg
.scrubber
.check_repair
)
6665 out
<< " CHECK_REPAIR";
6666 if (pg
.scrubber
.deep_scrub_on_error
)
6667 out
<< " DEEP_SCRUB_ON_ERROR";
6668 if (pg
.scrubber
.must_deep_scrub
)
6669 out
<< " MUST_DEEP_SCRUB";
6670 if (pg
.scrubber
.must_scrub
)
6671 out
<< " MUST_SCRUB";
6672 if (pg
.scrubber
.time_for_deep
)
6673 out
<< " TIME_FOR_DEEP";
6674 if (pg
.scrubber
.need_auto
)
6675 out
<< " NEED_AUTO";
6677 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6678 if (pg
.pg_log
.get_missing().num_missing()) {
6679 out
<< " m=" << pg
.pg_log
.get_missing().num_missing();
6680 if (pg
.is_primary()) {
6681 uint64_t unfound
= pg
.get_num_unfound();
6683 out
<< " u=" << unfound
;
6686 if (!pg
.is_clean()) {
6687 out
<< " mbc=" << pg
.missing_loc
.get_missing_by_count();
6689 if (!pg
.snap_trimq
.empty()) {
6691 // only show a count if the set is large
6692 if (pg
.snap_trimq
.num_intervals() > 16) {
6693 out
<< pg
.snap_trimq
.size();
6695 out
<< pg
.snap_trimq
;
6698 if (!pg
.info
.purged_snaps
.empty()) {
6699 out
<< " ps="; // snap trim queue / purged snaps
6700 if (pg
.info
.purged_snaps
.num_intervals() > 16) {
6701 out
<< pg
.info
.purged_snaps
.size();
6703 out
<< pg
.info
.purged_snaps
;
6713 bool PG::can_discard_op(OpRequestRef
& op
)
6715 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
6716 if (cct
->_conf
->osd_discard_disconnected_ops
&& OSD::op_is_discardable(m
)) {
6717 dout(20) << " discard " << *m
<< dendl
;
6721 if (m
->get_map_epoch() < info
.history
.same_primary_since
) {
6722 dout(7) << " changed after " << m
->get_map_epoch()
6723 << ", dropping " << *m
<< dendl
;
6727 if (m
->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT
)) {
6728 // >= luminous client
6729 if (m
->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS
)) {
6730 // >= nautilus client
6731 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend()) {
6732 dout(7) << __func__
<< " sent before last_force_op_resend "
6733 << pool
.info
.last_force_op_resend
6734 << ", dropping" << *m
<< dendl
;
6738 // == < nautilus client (luminous or mimic)
6739 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_prenautilus()) {
6740 dout(7) << __func__
<< " sent before last_force_op_resend_prenautilus "
6741 << pool
.info
.last_force_op_resend_prenautilus
6742 << ", dropping" << *m
<< dendl
;
6746 if (m
->get_map_epoch() < info
.history
.last_epoch_split
) {
6747 dout(7) << __func__
<< " pg split in "
6748 << info
.history
.last_epoch_split
<< ", dropping" << dendl
;
6751 } else if (m
->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND
)) {
6752 // < luminous client
6753 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_preluminous()) {
6754 dout(7) << __func__
<< " sent before last_force_op_resend_preluminous "
6755 << pool
.info
.last_force_op_resend_preluminous
6756 << ", dropping" << *m
<< dendl
;
6764 template<typename T
, int MSGTYPE
>
6765 bool PG::can_discard_replica_op(OpRequestRef
& op
)
6767 const T
*m
= static_cast<const T
*>(op
->get_req());
6768 ceph_assert(m
->get_type() == MSGTYPE
);
6770 int from
= m
->get_source().num();
6772 // if a repop is replied after a replica goes down in a new osdmap, and
6773 // before the pg advances to this new osdmap, the repop replies before this
6774 // repop can be discarded by that replica OSD, because the primary resets the
6775 // connection to it when handling the new osdmap marking it down, and also
6776 // resets the messenger sesssion when the replica reconnects. to avoid the
6777 // out-of-order replies, the messages from that replica should be discarded.
6778 OSDMapRef next_map
= osd
->get_next_osdmap();
6779 if (next_map
->is_down(from
))
6781 /* Mostly, this overlaps with the old_peering_msg
6782 * condition. An important exception is pushes
6783 * sent by replicas not in the acting set, since
6784 * if such a replica goes down it does not cause
6785 * a new interval. */
6786 if (next_map
->get_down_at(from
) >= m
->map_epoch
)
6790 // if pg changes _at all_, we reset and repeer!
6791 if (old_peering_msg(m
->map_epoch
, m
->map_epoch
)) {
6792 dout(10) << "can_discard_replica_op pg changed " << info
.history
6793 << " after " << m
->map_epoch
6794 << ", dropping" << dendl
;
6800 bool PG::can_discard_scan(OpRequestRef op
)
6802 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
6803 ceph_assert(m
->get_type() == MSG_OSD_PG_SCAN
);
6805 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
6806 dout(10) << " got old scan, ignoring" << dendl
;
6812 bool PG::can_discard_backfill(OpRequestRef op
)
6814 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
6815 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
6817 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
6818 dout(10) << " got old backfill, ignoring" << dendl
;
6826 bool PG::can_discard_request(OpRequestRef
& op
)
6828 switch (op
->get_req()->get_type()) {
6829 case CEPH_MSG_OSD_OP
:
6830 return can_discard_op(op
);
6831 case CEPH_MSG_OSD_BACKOFF
:
6832 return false; // never discard
6834 return can_discard_replica_op
<MOSDRepOp
, MSG_OSD_REPOP
>(op
);
6835 case MSG_OSD_PG_PUSH
:
6836 return can_discard_replica_op
<MOSDPGPush
, MSG_OSD_PG_PUSH
>(op
);
6837 case MSG_OSD_PG_PULL
:
6838 return can_discard_replica_op
<MOSDPGPull
, MSG_OSD_PG_PULL
>(op
);
6839 case MSG_OSD_PG_PUSH_REPLY
:
6840 return can_discard_replica_op
<MOSDPGPushReply
, MSG_OSD_PG_PUSH_REPLY
>(op
);
6841 case MSG_OSD_REPOPREPLY
:
6842 return can_discard_replica_op
<MOSDRepOpReply
, MSG_OSD_REPOPREPLY
>(op
);
6843 case MSG_OSD_PG_RECOVERY_DELETE
:
6844 return can_discard_replica_op
<MOSDPGRecoveryDelete
, MSG_OSD_PG_RECOVERY_DELETE
>(op
);
6846 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
6847 return can_discard_replica_op
<MOSDPGRecoveryDeleteReply
, MSG_OSD_PG_RECOVERY_DELETE_REPLY
>(op
);
6849 case MSG_OSD_EC_WRITE
:
6850 return can_discard_replica_op
<MOSDECSubOpWrite
, MSG_OSD_EC_WRITE
>(op
);
6851 case MSG_OSD_EC_WRITE_REPLY
:
6852 return can_discard_replica_op
<MOSDECSubOpWriteReply
, MSG_OSD_EC_WRITE_REPLY
>(op
);
6853 case MSG_OSD_EC_READ
:
6854 return can_discard_replica_op
<MOSDECSubOpRead
, MSG_OSD_EC_READ
>(op
);
6855 case MSG_OSD_EC_READ_REPLY
:
6856 return can_discard_replica_op
<MOSDECSubOpReadReply
, MSG_OSD_EC_READ_REPLY
>(op
);
6857 case MSG_OSD_REP_SCRUB
:
6858 return can_discard_replica_op
<MOSDRepScrub
, MSG_OSD_REP_SCRUB
>(op
);
6859 case MSG_OSD_SCRUB_RESERVE
:
6860 return can_discard_replica_op
<MOSDScrubReserve
, MSG_OSD_SCRUB_RESERVE
>(op
);
6861 case MSG_OSD_REP_SCRUBMAP
:
6862 return can_discard_replica_op
<MOSDRepScrubMap
, MSG_OSD_REP_SCRUBMAP
>(op
);
6863 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
6864 return can_discard_replica_op
<
6865 MOSDPGUpdateLogMissing
, MSG_OSD_PG_UPDATE_LOG_MISSING
>(op
);
6866 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
6867 return can_discard_replica_op
<
6868 MOSDPGUpdateLogMissingReply
, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
>(op
);
6870 case MSG_OSD_PG_SCAN
:
6871 return can_discard_scan(op
);
6872 case MSG_OSD_PG_BACKFILL
:
6873 return can_discard_backfill(op
);
6874 case MSG_OSD_PG_BACKFILL_REMOVE
:
6875 return can_discard_replica_op
<MOSDPGBackfillRemove
,
6876 MSG_OSD_PG_BACKFILL_REMOVE
>(op
);
6881 void PG::take_waiters()
6883 dout(10) << "take_waiters" << dendl
;
6884 requeue_map_waiters();
6887 void PG::do_peering_event(PGPeeringEventRef evt
, RecoveryCtx
*rctx
)
6889 dout(10) << __func__
<< ": " << evt
->get_desc() << dendl
;
6890 ceph_assert(have_same_or_newer_map(evt
->get_epoch_sent()));
6891 if (old_peering_evt(evt
)) {
6892 dout(10) << "discard old " << evt
->get_desc() << dendl
;
6894 recovery_state
.handle_event(evt
, rctx
);
6896 // write_if_dirty regardless of path above to ensure we capture any work
6897 // done by OSD::advance_pg().
6898 write_if_dirty(*rctx
->transaction
);
6901 void PG::queue_peering_event(PGPeeringEventRef evt
)
6903 if (old_peering_evt(evt
))
6905 osd
->osd
->enqueue_peering_evt(info
.pgid
, evt
);
6908 void PG::queue_null(epoch_t msg_epoch
,
6909 epoch_t query_epoch
)
6911 dout(10) << "null" << dendl
;
6912 queue_peering_event(
6913 PGPeeringEventRef(std::make_shared
<PGPeeringEvent
>(msg_epoch
, query_epoch
,
// When recovery stalls on unfound objects, query peers for more missing
// object locations; if no peer can be queried, tell the state machine to
// park backfill/recovery for this PG, otherwise requeue recovery.
// NOTE(review): reconstructed from garbled text; confirm against upstream.
void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx)
{
  /*
    * if we couldn't start any recovery ops and things are still
    * unfound, see if we can discover more missing object locations.
    * It may be that our initial locations were bad and we errored
    * out while trying to pull.
    */
  discover_all_missing(*rctx->query_map);
  if (rctx->query_map->empty()) {
    string action;
    if (state_test(PG_STATE_BACKFILLING)) {
      auto evt = PGPeeringEventRef(
	new PGPeeringEvent(
	  queued,
	  queued,
	  PG::UnfoundBackfill()));
      queue_peering_event(evt);
      action = "in backfill";
    } else if (state_test(PG_STATE_RECOVERING)) {
      auto evt = PGPeeringEventRef(
	new PGPeeringEvent(
	  queued,
	  queued,
	  PG::UnfoundRecovery()));
      queue_peering_event(evt);
      action = "in recovery";
    } else {
      action = "already out of recovery/backfill";
    }
    dout(10) << __func__ << ": no luck, giving up on this pg for now ("
	     << action << ")" << dendl;
  } else {
    dout(10) << __func__
	     << ": no luck, giving up on this pg for now (queue_recovery)"
	     << dendl;
    queue_recovery();
  }
}
6954 void PG::handle_advance_map(
6955 OSDMapRef osdmap
, OSDMapRef lastmap
,
6956 vector
<int>& newup
, int up_primary
,
6957 vector
<int>& newacting
, int acting_primary
,
6960 ceph_assert(lastmap
->get_epoch() == osdmap_ref
->get_epoch());
6961 ceph_assert(lastmap
== osdmap_ref
);
6962 dout(10) << "handle_advance_map "
6963 << newup
<< "/" << newacting
6964 << " -- " << up_primary
<< "/" << acting_primary
6966 update_osdmap_ref(osdmap
);
6967 osd_shard
->update_pg_epoch(pg_slot
, osdmap
->get_epoch());
6969 pool
.update(cct
, osdmap
);
6972 osdmap
, lastmap
, newup
, up_primary
,
6973 newacting
, acting_primary
);
6974 recovery_state
.handle_event(evt
, rctx
);
6975 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
6977 update_store_with_options();
6979 last_require_osd_release
= osdmap
->require_osd_release
;
6982 void PG::handle_activate_map(RecoveryCtx
*rctx
)
6984 dout(10) << "handle_activate_map " << dendl
;
6986 recovery_state
.handle_event(evt
, rctx
);
6987 if (osdmap_ref
->get_epoch() - last_persisted_osdmap
>
6988 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
6989 dout(20) << __func__
<< ": Dirtying info: last_persisted is "
6990 << last_persisted_osdmap
6991 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
6994 dout(20) << __func__
<< ": Not dirtying info: last_persisted is "
6995 << last_persisted_osdmap
6996 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
6998 if (osdmap_ref
->check_new_blacklist_entries()) {
6999 check_blacklisted_watchers();
7001 write_if_dirty(*rctx
->transaction
);
7004 void PG::handle_initialize(RecoveryCtx
*rctx
)
7006 dout(10) << __func__
<< dendl
;
7008 recovery_state
.handle_event(evt
, rctx
);
7011 void PG::handle_query_state(Formatter
*f
)
7013 dout(10) << "handle_query_state" << dendl
;
7015 recovery_state
.handle_event(q
, 0);
7018 void PG::init_collection_pool_opts()
7020 auto r
= osd
->store
->set_collection_opts(ch
, pool
.info
.opts
);
7021 if (r
< 0 && r
!= -EOPNOTSUPP
) {
7022 derr
<< __func__
<< " set_collection_opts returns error:" << r
<< dendl
;
7026 void PG::update_store_with_options()
7028 init_collection_pool_opts();
7031 struct C_DeleteMore
: public Context
{
7034 C_DeleteMore(PG
*p
, epoch_t e
) : pg(p
), epoch(e
) {}
7035 void finish(int r
) override
{
7038 void complete(int r
) override
{
7039 ceph_assert(r
== 0);
7041 if (!pg
->pg_has_reset_since(epoch
)) {
7042 pg
->osd
->queue_for_pg_delete(pg
->get_pgid(), epoch
);
7049 void PG::_delete_some(ObjectStore::Transaction
*t
)
7051 dout(10) << __func__
<< dendl
;
7054 float osd_delete_sleep
= osd
->osd
->get_osd_delete_sleep();
7055 if (osd_delete_sleep
> 0 && delete_needs_sleep
) {
7056 epoch_t e
= get_osdmap()->get_epoch();
7058 auto delete_requeue_callback
= new FunctionContext([this, pgref
, e
](int r
) {
7059 dout(20) << __func__
<< " wake up at "
7061 << ", re-queuing delete" << dendl
;
7063 delete_needs_sleep
= false;
7064 if (!pg_has_reset_since(e
)) {
7065 osd
->queue_for_pg_delete(get_pgid(), e
);
7070 utime_t delete_schedule_time
= ceph_clock_now();
7071 delete_schedule_time
+= osd_delete_sleep
;
7072 Mutex::Locker
l(osd
->sleep_lock
);
7073 osd
->sleep_timer
.add_event_at(delete_schedule_time
,
7074 delete_requeue_callback
);
7075 dout(20) << __func__
<< " Delete scheduled at " << delete_schedule_time
<< dendl
;
7080 delete_needs_sleep
= true;
7082 vector
<ghobject_t
> olist
;
7083 int max
= std::min(osd
->store
->get_ideal_list_max(),
7084 (int)cct
->_conf
->osd_target_transaction_size
);
7086 osd
->store
->collection_list(
7089 ghobject_t::get_max(),
7093 dout(20) << __func__
<< " " << olist
<< dendl
;
7095 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
7097 for (auto& oid
: olist
) {
7098 if (oid
== pgmeta_oid
) {
7101 if (oid
.is_pgmeta()) {
7102 osd
->clog
->warn() << info
.pgid
<< " found stray pgmeta-like " << oid
7103 << " during PG removal";
7105 int r
= snap_mapper
.remove_oid(oid
.hobj
, &_t
);
7106 if (r
!= 0 && r
!= -ENOENT
) {
7109 t
->remove(coll
, oid
);
7113 dout(20) << __func__
<< " deleting " << num
<< " objects" << dendl
;
7114 Context
*fin
= new C_DeleteMore(this, get_osdmap_epoch());
7115 t
->register_on_commit(fin
);
7117 dout(20) << __func__
<< " finished" << dendl
;
7118 if (cct
->_conf
->osd_inject_failure_on_pg_removal
) {
7122 // final flush here to ensure completions drop refs. Of particular concern
7123 // are the SnapMapper ContainerContexts.
7126 PGLog::clear_info_log(info
.pgid
, t
);
7127 t
->remove_collection(coll
);
7128 t
->register_on_commit(new ContainerContext
<PGRef
>(pgref
));
7129 t
->register_on_applied(new ContainerContext
<PGRef
>(pgref
));
7130 osd
->store
->queue_transaction(ch
, std::move(*t
));
7134 if (!osd
->try_finish_pg_delete(this, pool
.info
.get_pg_num())) {
7135 dout(1) << __func__
<< " raced with merge, reinstantiating" << dendl
;
7136 ch
= osd
->store
->create_new_collection(coll
);
7139 info
.pgid
.get_split_bits(pool
.info
.get_pg_num()));
7140 _init(*t
, info
.pgid
, &pool
.info
);
7141 last_epoch
= 0; // to ensure pg epoch is also written
7143 dirty_big_info
= true;
7147 // cancel reserver here, since the PG is about to get deleted and the
7148 // exit() methods don't run when that happens.
7149 osd
->local_reserver
.cancel_reservation(info
.pgid
);
7151 osd
->logger
->dec(l_osd_pg_removing
);
7156 // Compute pending backfill data
7157 static int64_t pending_backfill(CephContext
*cct
, int64_t bf_bytes
, int64_t local_bytes
)
7159 lgeneric_dout(cct
, 20) << __func__
<< " Adjust local usage " << (local_bytes
>> 10) << "KiB"
7160 << " primary usage " << (bf_bytes
>> 10) << "KiB" << dendl
;
7161 return std::max((int64_t)0, bf_bytes
- local_bytes
);
7164 int PG::pg_stat_adjust(osd_stat_t
*ns
)
7166 osd_stat_t
&new_stat
= *ns
;
7170 // Adjust the kb_used by adding pending backfill data
7171 uint64_t reserved_num_bytes
= get_reserved_num_bytes();
7173 // For now we don't consider projected space gains here
7174 // I suggest we have an optional 2 pass backfill that frees up
7175 // space in a first pass. This could be triggered when at nearfull
7176 // or near to backfillfull.
7177 if (reserved_num_bytes
> 0) {
7178 // TODO: Handle compression by adjusting by the PGs average
7179 // compression precentage.
7180 dout(20) << __func__
<< " reserved_num_bytes " << (reserved_num_bytes
>> 10) << "KiB"
7181 << " Before kb_used " << new_stat
.statfs
.kb_used() << "KiB" << dendl
;
7182 if (new_stat
.statfs
.available
> reserved_num_bytes
)
7183 new_stat
.statfs
.available
-= reserved_num_bytes
;
7185 new_stat
.statfs
.available
= 0;
7186 dout(20) << __func__
<< " After kb_used " << new_stat
.statfs
.kb_used() << "KiB" << dendl
;
7193 /*------------ Recovery State Machine----------------*/
7195 #define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \
7196 << "state<" << get_state_name() << ">: ")
7198 /*------Crashed-------*/
7199 PG::RecoveryState::Crashed::Crashed(my_context ctx
)
7201 NamedState(context
< RecoveryMachine
>().pg
, "Crashed")
7203 context
< RecoveryMachine
>().log_enter(state_name
);
7204 ceph_abort_msg("we got a bad state machine event");
7208 /*------Initial-------*/
7209 PG::RecoveryState::Initial::Initial(my_context ctx
)
7211 NamedState(context
< RecoveryMachine
>().pg
, "Initial")
7213 context
< RecoveryMachine
>().log_enter(state_name
);
7216 boost::statechart::result
PG::RecoveryState::Initial::react(const MNotifyRec
& notify
)
7218 PG
*pg
= context
< RecoveryMachine
>().pg
;
7219 pg
->proc_replica_info(
7220 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
7221 pg
->set_last_peering_reset();
7222 return transit
< Primary
>();
7225 boost::statechart::result
PG::RecoveryState::Initial::react(const MInfoRec
& i
)
7227 PG
*pg
= context
< RecoveryMachine
>().pg
;
7228 ceph_assert(!pg
->is_primary());
7230 return transit
< Stray
>();
7233 boost::statechart::result
PG::RecoveryState::Initial::react(const MLogRec
& i
)
7235 PG
*pg
= context
< RecoveryMachine
>().pg
;
7236 ceph_assert(!pg
->is_primary());
7238 return transit
< Stray
>();
7241 void PG::RecoveryState::Initial::exit()
7243 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7244 PG
*pg
= context
< RecoveryMachine
>().pg
;
7245 utime_t dur
= ceph_clock_now() - enter_time
;
7246 pg
->osd
->recoverystate_perf
->tinc(rs_initial_latency
, dur
);
7249 /*------Started-------*/
7250 PG::RecoveryState::Started::Started(my_context ctx
)
7252 NamedState(context
< RecoveryMachine
>().pg
, "Started")
7254 context
< RecoveryMachine
>().log_enter(state_name
);
7257 boost::statechart::result
7258 PG::RecoveryState::Started::react(const IntervalFlush
&)
7260 PG
*pg
= context
< RecoveryMachine
>().pg
;
7261 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
7262 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
7263 return discard_event();
7266 boost::statechart::result
PG::RecoveryState::Started::react(const AdvMap
& advmap
)
7268 PG
*pg
= context
< RecoveryMachine
>().pg
;
7269 ldout(pg
->cct
, 10) << "Started advmap" << dendl
;
7270 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
7271 if (pg
->should_restart_peering(
7273 advmap
.acting_primary
,
7278 ldout(pg
->cct
, 10) << "should_restart_peering, transitioning to Reset"
7281 return transit
< Reset
>();
7283 pg
->remove_down_peer_info(advmap
.osdmap
);
7284 return discard_event();
7287 boost::statechart::result
PG::RecoveryState::Started::react(const QueryState
& q
)
7289 q
.f
->open_object_section("state");
7290 q
.f
->dump_string("name", state_name
);
7291 q
.f
->dump_stream("enter_time") << enter_time
;
7292 q
.f
->close_section();
7293 return discard_event();
7296 void PG::RecoveryState::Started::exit()
7298 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7299 PG
*pg
= context
< RecoveryMachine
>().pg
;
7300 utime_t dur
= ceph_clock_now() - enter_time
;
7301 pg
->osd
->recoverystate_perf
->tinc(rs_started_latency
, dur
);
7304 /*--------Reset---------*/
7305 PG::RecoveryState::Reset::Reset(my_context ctx
)
7307 NamedState(context
< RecoveryMachine
>().pg
, "Reset")
7309 context
< RecoveryMachine
>().log_enter(state_name
);
7310 PG
*pg
= context
< RecoveryMachine
>().pg
;
7312 pg
->flushes_in_progress
= 0;
7313 pg
->set_last_peering_reset();
7316 boost::statechart::result
7317 PG::RecoveryState::Reset::react(const IntervalFlush
&)
7319 PG
*pg
= context
< RecoveryMachine
>().pg
;
7320 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
7321 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
7322 return discard_event();
7325 boost::statechart::result
PG::RecoveryState::Reset::react(const AdvMap
& advmap
)
7327 PG
*pg
= context
< RecoveryMachine
>().pg
;
7328 ldout(pg
->cct
, 10) << "Reset advmap" << dendl
;
7330 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
7332 if (pg
->should_restart_peering(
7334 advmap
.acting_primary
,
7339 ldout(pg
->cct
, 10) << "should restart peering, calling start_peering_interval again"
7341 pg
->start_peering_interval(
7343 advmap
.newup
, advmap
.up_primary
,
7344 advmap
.newacting
, advmap
.acting_primary
,
7345 context
< RecoveryMachine
>().get_cur_transaction());
7347 pg
->remove_down_peer_info(advmap
.osdmap
);
7348 pg
->check_past_interval_bounds();
7349 return discard_event();
7352 boost::statechart::result
PG::RecoveryState::Reset::react(const ActMap
&)
7354 PG
*pg
= context
< RecoveryMachine
>().pg
;
7355 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
7356 context
< RecoveryMachine
>().send_notify(
7359 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
7360 pg
->get_osdmap_epoch(),
7361 pg
->get_osdmap_epoch(),
7363 pg
->past_intervals
);
7366 pg
->update_heartbeat_peers();
7369 return transit
< Started
>();
7372 boost::statechart::result
PG::RecoveryState::Reset::react(const QueryState
& q
)
7374 q
.f
->open_object_section("state");
7375 q
.f
->dump_string("name", state_name
);
7376 q
.f
->dump_stream("enter_time") << enter_time
;
7377 q
.f
->close_section();
7378 return discard_event();
7381 void PG::RecoveryState::Reset::exit()
7383 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7384 PG
*pg
= context
< RecoveryMachine
>().pg
;
7385 utime_t dur
= ceph_clock_now() - enter_time
;
7386 pg
->osd
->recoverystate_perf
->tinc(rs_reset_latency
, dur
);
7389 /*-------Start---------*/
7390 PG::RecoveryState::Start::Start(my_context ctx
)
7392 NamedState(context
< RecoveryMachine
>().pg
, "Start")
7394 context
< RecoveryMachine
>().log_enter(state_name
);
7396 PG
*pg
= context
< RecoveryMachine
>().pg
;
7397 if (pg
->is_primary()) {
7398 ldout(pg
->cct
, 1) << "transitioning to Primary" << dendl
;
7399 post_event(MakePrimary());
7401 ldout(pg
->cct
, 1) << "transitioning to Stray" << dendl
;
7402 post_event(MakeStray());
7406 void PG::RecoveryState::Start::exit()
7408 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7409 PG
*pg
= context
< RecoveryMachine
>().pg
;
7410 utime_t dur
= ceph_clock_now() - enter_time
;
7411 pg
->osd
->recoverystate_perf
->tinc(rs_start_latency
, dur
);
7414 /*---------Primary--------*/
7415 PG::RecoveryState::Primary::Primary(my_context ctx
)
7417 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary")
7419 context
< RecoveryMachine
>().log_enter(state_name
);
7420 PG
*pg
= context
< RecoveryMachine
>().pg
;
7421 ceph_assert(pg
->want_acting
.empty());
7423 // set CREATING bit until we have peered for the first time.
7424 if (pg
->info
.history
.last_epoch_started
== 0) {
7425 pg
->state_set(PG_STATE_CREATING
);
7426 // use the history timestamp, which ultimately comes from the
7427 // monitor in the create case.
7428 utime_t t
= pg
->info
.history
.last_scrub_stamp
;
7429 pg
->info
.stats
.last_fresh
= t
;
7430 pg
->info
.stats
.last_active
= t
;
7431 pg
->info
.stats
.last_change
= t
;
7432 pg
->info
.stats
.last_peered
= t
;
7433 pg
->info
.stats
.last_clean
= t
;
7434 pg
->info
.stats
.last_unstale
= t
;
7435 pg
->info
.stats
.last_undegraded
= t
;
7436 pg
->info
.stats
.last_fullsized
= t
;
7437 pg
->info
.stats
.last_scrub_stamp
= t
;
7438 pg
->info
.stats
.last_deep_scrub_stamp
= t
;
7439 pg
->info
.stats
.last_clean_scrub_stamp
= t
;
7443 boost::statechart::result
PG::RecoveryState::Primary::react(const MNotifyRec
& notevt
)
7445 PG
*pg
= context
< RecoveryMachine
>().pg
;
7446 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
7447 pg
->proc_replica_info(
7448 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
7449 return discard_event();
7452 boost::statechart::result
PG::RecoveryState::Primary::react(const ActMap
&)
7454 PG
*pg
= context
< RecoveryMachine
>().pg
;
7455 ldout(pg
->cct
, 7) << "handle ActMap primary" << dendl
;
7456 pg
->publish_stats_to_osd();
7458 return discard_event();
7461 boost::statechart::result
PG::RecoveryState::Primary::react(
7462 const SetForceRecovery
&)
7464 PG
*pg
= context
< RecoveryMachine
>().pg
;
7465 pg
->set_force_recovery(true);
7466 return discard_event();
7469 boost::statechart::result
PG::RecoveryState::Primary::react(
7470 const UnsetForceRecovery
&)
7472 PG
*pg
= context
< RecoveryMachine
>().pg
;
7473 pg
->set_force_recovery(false);
7474 return discard_event();
7477 boost::statechart::result
PG::RecoveryState::Primary::react(
7478 const RequestScrub
& evt
)
7480 PG
*pg
= context
< RecoveryMachine
>().pg
;
7481 if (pg
->is_primary()) {
7482 pg
->scrub_requested(evt
.deep
, evt
.repair
);
7483 ldout(pg
->cct
,10) << "marking for scrub" << dendl
;
7485 return discard_event();
7488 boost::statechart::result
PG::RecoveryState::Primary::react(
7489 const SetForceBackfill
&)
7491 PG
*pg
= context
< RecoveryMachine
>().pg
;
7492 pg
->set_force_backfill(true);
7493 return discard_event();
7496 boost::statechart::result
PG::RecoveryState::Primary::react(
7497 const UnsetForceBackfill
&)
7499 PG
*pg
= context
< RecoveryMachine
>().pg
;
7500 pg
->set_force_backfill(false);
7501 return discard_event();
7504 void PG::RecoveryState::Primary::exit()
7506 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7507 PG
*pg
= context
< RecoveryMachine
>().pg
;
7508 pg
->want_acting
.clear();
7509 utime_t dur
= ceph_clock_now() - enter_time
;
7510 pg
->osd
->recoverystate_perf
->tinc(rs_primary_latency
, dur
);
7511 pg
->clear_primary_state();
7512 pg
->state_clear(PG_STATE_CREATING
);
7515 /*---------Peering--------*/
7516 PG::RecoveryState::Peering::Peering(my_context ctx
)
7518 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering"),
7519 history_les_bound(false)
7521 context
< RecoveryMachine
>().log_enter(state_name
);
7523 PG
*pg
= context
< RecoveryMachine
>().pg
;
7524 ceph_assert(!pg
->is_peered());
7525 ceph_assert(!pg
->is_peering());
7526 ceph_assert(pg
->is_primary());
7527 pg
->state_set(PG_STATE_PEERING
);
7530 boost::statechart::result
PG::RecoveryState::Peering::react(const AdvMap
& advmap
)
7532 PG
*pg
= context
< RecoveryMachine
>().pg
;
7533 ldout(pg
->cct
, 10) << "Peering advmap" << dendl
;
7534 if (prior_set
.affected_by_map(*(advmap
.osdmap
), pg
)) {
7535 ldout(pg
->cct
, 1) << "Peering, affected_by_map, going to Reset" << dendl
;
7537 return transit
< Reset
>();
7540 pg
->adjust_need_up_thru(advmap
.osdmap
);
7542 return forward_event();
7545 boost::statechart::result
PG::RecoveryState::Peering::react(const QueryState
& q
)
7547 PG
*pg
= context
< RecoveryMachine
>().pg
;
7549 q
.f
->open_object_section("state");
7550 q
.f
->dump_string("name", state_name
);
7551 q
.f
->dump_stream("enter_time") << enter_time
;
7553 q
.f
->open_array_section("past_intervals");
7554 pg
->past_intervals
.dump(q
.f
);
7555 q
.f
->close_section();
7557 q
.f
->open_array_section("probing_osds");
7558 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
7559 p
!= prior_set
.probe
.end();
7561 q
.f
->dump_stream("osd") << *p
;
7562 q
.f
->close_section();
7564 if (prior_set
.pg_down
)
7565 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
7567 q
.f
->open_array_section("down_osds_we_would_probe");
7568 for (set
<int>::iterator p
= prior_set
.down
.begin();
7569 p
!= prior_set
.down
.end();
7571 q
.f
->dump_int("osd", *p
);
7572 q
.f
->close_section();
7574 q
.f
->open_array_section("peering_blocked_by");
7575 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
7576 p
!= prior_set
.blocked_by
.end();
7578 q
.f
->open_object_section("osd");
7579 q
.f
->dump_int("osd", p
->first
);
7580 q
.f
->dump_int("current_lost_at", p
->second
);
7581 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
7582 q
.f
->close_section();
7584 q
.f
->close_section();
7586 if (history_les_bound
) {
7587 q
.f
->open_array_section("peering_blocked_by_detail");
7588 q
.f
->open_object_section("item");
7589 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
7590 q
.f
->close_section();
7591 q
.f
->close_section();
7594 q
.f
->close_section();
7595 return forward_event();
7598 void PG::RecoveryState::Peering::exit()
7600 PG
*pg
= context
< RecoveryMachine
>().pg
;
7601 ldout(pg
->cct
, 10) << "Leaving Peering" << dendl
;
7602 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7603 pg
->state_clear(PG_STATE_PEERING
);
7604 pg
->clear_probe_targets();
7606 utime_t dur
= ceph_clock_now() - enter_time
;
7607 pg
->osd
->recoverystate_perf
->tinc(rs_peering_latency
, dur
);
7611 /*------Backfilling-------*/
7612 PG::RecoveryState::Backfilling::Backfilling(my_context ctx
)
7614 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Backfilling")
7616 context
< RecoveryMachine
>().log_enter(state_name
);
7617 PG
*pg
= context
< RecoveryMachine
>().pg
;
7618 pg
->backfill_reserved
= true;
7619 pg
->queue_recovery();
7620 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
7621 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
7622 pg
->state_set(PG_STATE_BACKFILLING
);
7623 pg
->publish_stats_to_osd();
7626 void PG::RecoveryState::Backfilling::backfill_release_reservations()
7628 PG
*pg
= context
< RecoveryMachine
>().pg
;
7629 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7630 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
7631 it
!= pg
->backfill_targets
.end();
7633 ceph_assert(*it
!= pg
->pg_whoami
);
7634 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7635 it
->osd
, pg
->get_osdmap_epoch());
7637 pg
->osd
->send_message_osd_cluster(
7638 new MBackfillReserve(
7639 MBackfillReserve::RELEASE
,
7640 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
7641 pg
->get_osdmap_epoch()),
7647 void PG::RecoveryState::Backfilling::cancel_backfill()
7649 PG
*pg
= context
< RecoveryMachine
>().pg
;
7650 backfill_release_reservations();
7651 if (!pg
->waiting_on_backfill
.empty()) {
7652 pg
->waiting_on_backfill
.clear();
7653 pg
->finish_recovery_op(hobject_t::get_max());
7657 boost::statechart::result
7658 PG::RecoveryState::Backfilling::react(const Backfilled
&c
)
7660 backfill_release_reservations();
7661 return transit
<Recovered
>();
7664 boost::statechart::result
7665 PG::RecoveryState::Backfilling::react(const DeferBackfill
&c
)
7667 PG
*pg
= context
< RecoveryMachine
>().pg
;
7668 ldout(pg
->cct
, 10) << "defer backfill, retry delay " << c
.delay
<< dendl
;
7669 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
7670 pg
->state_clear(PG_STATE_BACKFILLING
);
7672 pg
->schedule_backfill_retry(c
.delay
);
7673 return transit
<NotBackfilling
>();
7676 boost::statechart::result
7677 PG::RecoveryState::Backfilling::react(const UnfoundBackfill
&c
)
7679 PG
*pg
= context
< RecoveryMachine
>().pg
;
7680 ldout(pg
->cct
, 10) << "backfill has unfound, can't continue" << dendl
;
7681 pg
->state_set(PG_STATE_BACKFILL_UNFOUND
);
7682 pg
->state_clear(PG_STATE_BACKFILLING
);
7684 return transit
<NotBackfilling
>();
7687 boost::statechart::result
7688 PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull
&)
7690 PG
*pg
= context
< RecoveryMachine
>().pg
;
7691 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
7692 pg
->state_clear(PG_STATE_BACKFILLING
);
7694 pg
->schedule_backfill_retry(pg
->cct
->_conf
->osd_backfill_retry_interval
);
7695 return transit
<NotBackfilling
>();
7698 boost::statechart::result
7699 PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked
&)
7701 PG
*pg
= context
< RecoveryMachine
>().pg
;
7702 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
7704 if (pg
->needs_backfill()) {
7705 return transit
<WaitLocalBackfillReserved
>();
7707 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
7708 return discard_event();
7712 void PG::RecoveryState::Backfilling::exit()
7714 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7715 PG
*pg
= context
< RecoveryMachine
>().pg
;
7716 pg
->backfill_reserved
= false;
7717 pg
->backfill_reserving
= false;
7718 pg
->state_clear(PG_STATE_BACKFILLING
);
7719 pg
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
7720 utime_t dur
= ceph_clock_now() - enter_time
;
7721 pg
->osd
->recoverystate_perf
->tinc(rs_backfilling_latency
, dur
);
7724 /*--WaitRemoteBackfillReserved--*/
7726 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
7728 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7729 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
7731 context
< RecoveryMachine
>().log_enter(state_name
);
7732 PG
*pg
= context
< RecoveryMachine
>().pg
;
7733 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
7734 pg
->publish_stats_to_osd();
7735 post_event(RemoteBackfillReserved());
7738 boost::statechart::result
7739 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
7741 PG
*pg
= context
< RecoveryMachine
>().pg
;
7743 int64_t num_bytes
= pg
->info
.stats
.stats
.sum
.num_bytes
;
7744 ldout(pg
->cct
, 10) << __func__
<< " num_bytes " << num_bytes
<< dendl
;
7745 if (backfill_osd_it
!= context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
7746 //The primary never backfills itself
7747 ceph_assert(*backfill_osd_it
!= pg
->pg_whoami
);
7748 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7749 backfill_osd_it
->osd
, pg
->get_osdmap_epoch());
7751 pg
->osd
->send_message_osd_cluster(
7752 new MBackfillReserve(
7753 MBackfillReserve::REQUEST
,
7754 spg_t(pg
->info
.pgid
.pgid
, backfill_osd_it
->shard
),
7755 pg
->get_osdmap_epoch(),
7756 pg
->get_backfill_priority(),
7758 pg
->peer_bytes
[*backfill_osd_it
]),
7763 pg
->peer_bytes
.clear();
7764 post_event(AllBackfillsReserved());
7766 return discard_event();
7769 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7771 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7772 PG
*pg
= context
< RecoveryMachine
>().pg
;
7773 utime_t dur
= ceph_clock_now() - enter_time
;
7774 pg
->osd
->recoverystate_perf
->tinc(rs_waitremotebackfillreserved_latency
, dur
);
7777 void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
7779 PG
*pg
= context
< RecoveryMachine
>().pg
;
7780 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7782 // Send CANCEL to all previously acquired reservations
7783 set
<pg_shard_t
>::const_iterator it
, begin
, end
;
7784 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
7785 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
7786 ceph_assert(begin
!= end
);
7787 for (it
= begin
; it
!= backfill_osd_it
; ++it
) {
7788 //The primary never backfills itself
7789 ceph_assert(*it
!= pg
->pg_whoami
);
7790 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7791 it
->osd
, pg
->get_osdmap_epoch());
7793 pg
->osd
->send_message_osd_cluster(
7794 new MBackfillReserve(
7795 MBackfillReserve::RELEASE
,
7796 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
7797 pg
->get_osdmap_epoch()),
7802 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
7803 pg
->publish_stats_to_osd();
7805 pg
->schedule_backfill_retry(pg
->cct
->_conf
->osd_backfill_retry_interval
);
7808 boost::statechart::result
7809 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull
&evt
)
7811 PG
*pg
= context
< RecoveryMachine
>().pg
;
7812 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
7814 return transit
<NotBackfilling
>();
7817 boost::statechart::result
7818 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked
&evt
)
7821 return transit
<NotBackfilling
>();
7824 /*--WaitLocalBackfillReserved--*/
7825 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
7827 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalBackfillReserved")
7829 context
< RecoveryMachine
>().log_enter(state_name
);
7830 PG
*pg
= context
< RecoveryMachine
>().pg
;
7831 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
7832 pg
->osd
->local_reserver
.request_reservation(
7834 new QueuePeeringEvt
<LocalBackfillReserved
>(
7835 pg
, pg
->get_osdmap_epoch(),
7836 LocalBackfillReserved()),
7837 pg
->get_backfill_priority(),
7838 new QueuePeeringEvt
<DeferBackfill
>(
7839 pg
, pg
->get_osdmap_epoch(),
7840 DeferBackfill(0.0)));
7841 pg
->publish_stats_to_osd();
7844 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7846 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7847 PG
*pg
= context
< RecoveryMachine
>().pg
;
7848 utime_t dur
= ceph_clock_now() - enter_time
;
7849 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalbackfillreserved_latency
, dur
);
7852 /*----NotBackfilling------*/
7853 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx
)
7855 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotBackfilling")
7857 context
< RecoveryMachine
>().log_enter(state_name
);
7858 PG
*pg
= context
< RecoveryMachine
>().pg
;
7859 pg
->state_clear(PG_STATE_REPAIR
);
7860 pg
->publish_stats_to_osd();
7863 boost::statechart::result
7864 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
7866 return discard_event();
7869 boost::statechart::result
7870 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejectedTooFull
&evt
)
7872 return discard_event();
7875 void PG::RecoveryState::NotBackfilling::exit()
7877 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7878 PG
*pg
= context
< RecoveryMachine
>().pg
;
7879 pg
->state_clear(PG_STATE_BACKFILL_UNFOUND
);
7880 utime_t dur
= ceph_clock_now() - enter_time
;
7881 pg
->osd
->recoverystate_perf
->tinc(rs_notbackfilling_latency
, dur
);
7884 /*----NotRecovering------*/
7885 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx
)
7887 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotRecovering")
7889 context
< RecoveryMachine
>().log_enter(state_name
);
7890 PG
*pg
= context
< RecoveryMachine
>().pg
;
7891 pg
->publish_stats_to_osd();
7894 void PG::RecoveryState::NotRecovering::exit()
7896 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7897 PG
*pg
= context
< RecoveryMachine
>().pg
;
7898 pg
->state_clear(PG_STATE_RECOVERY_UNFOUND
);
7899 utime_t dur
= ceph_clock_now() - enter_time
;
7900 pg
->osd
->recoverystate_perf
->tinc(rs_notrecovering_latency
, dur
);
7903 /*---RepNotRecovering----*/
7904 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx
)
7906 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepNotRecovering")
7908 context
< RecoveryMachine
>().log_enter(state_name
);
7911 boost::statechart::result
7912 PG::RecoveryState::RepNotRecovering::react(const RejectTooFullRemoteReservation
&evt
)
7914 PG
*pg
= context
< RecoveryMachine
>().pg
;
7915 pg
->reject_reservation();
7916 post_event(RemoteReservationRejectedTooFull());
7917 return discard_event();
7920 void PG::RecoveryState::RepNotRecovering::exit()
7922 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7923 PG
*pg
= context
< RecoveryMachine
>().pg
;
7924 utime_t dur
= ceph_clock_now() - enter_time
;
7925 pg
->osd
->recoverystate_perf
->tinc(rs_repnotrecovering_latency
, dur
);
7928 /*---RepWaitRecoveryReserved--*/
7929 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
7931 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitRecoveryReserved")
7933 context
< RecoveryMachine
>().log_enter(state_name
);
7936 boost::statechart::result
7937 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
7939 PG
*pg
= context
< RecoveryMachine
>().pg
;
7940 pg
->osd
->send_message_osd_cluster(
7942 new MRecoveryReserve(
7943 MRecoveryReserve::GRANT
,
7944 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
7945 pg
->get_osdmap_epoch()),
7946 pg
->get_osdmap_epoch());
7947 return transit
<RepRecovering
>();
7950 boost::statechart::result
7951 PG::RecoveryState::RepWaitRecoveryReserved::react(
7952 const RemoteReservationCanceled
&evt
)
7954 PG
*pg
= context
< RecoveryMachine
>().pg
;
7955 pg
->clear_reserved_num_bytes();
7956 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7957 return transit
<RepNotRecovering
>();
7960 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7962 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7963 PG
*pg
= context
< RecoveryMachine
>().pg
;
7964 utime_t dur
= ceph_clock_now() - enter_time
;
7965 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitrecoveryreserved_latency
, dur
);
7968 /*-RepWaitBackfillReserved*/
7969 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
7971 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitBackfillReserved")
7973 context
< RecoveryMachine
>().log_enter(state_name
);
7976 boost::statechart::result
7977 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
7979 PG
*pg
= context
< RecoveryMachine
>().pg
;
7980 // Use tentative_bacfill_full() to make sure enough
7981 // space is available to handle target bytes from primary.
7983 // TODO: If we passed num_objects from primary we could account for
7984 // an estimate of the metadata overhead.
7986 // TODO: If we had compressed_allocated and compressed_original from primary
7987 // we could compute compression ratio and adjust accordingly.
7989 // XXX: There is no way to get omap overhead and this would only apply
7990 // to whatever possibly different partition that is storing the database.
7992 // update_osd_stat() from heartbeat will do this on a new
7993 // statfs using pg->primary_num_bytes.
7994 uint64_t pending_adjustment
= 0;
7995 int64_t primary_num_bytes
= evt
.primary_num_bytes
;
7996 int64_t local_num_bytes
= evt
.local_num_bytes
;
7997 if (primary_num_bytes
) {
7998 // For erasure coded pool overestimate by a full stripe per object
7999 // because we don't know how each objected rounded to the nearest stripe
8000 if (pg
->pool
.info
.is_erasure()) {
8001 primary_num_bytes
/= (int)pg
->get_pgbackend()->get_ec_data_chunk_count();
8002 primary_num_bytes
+= pg
->get_pgbackend()->get_ec_stripe_chunk_size() * pg
->info
.stats
.stats
.sum
.num_objects
;
8003 local_num_bytes
/= (int)pg
->get_pgbackend()->get_ec_data_chunk_count();
8004 local_num_bytes
+= pg
->get_pgbackend()->get_ec_stripe_chunk_size() * pg
->info
.stats
.stats
.sum
.num_objects
;
8006 pending_adjustment
= pending_backfill(pg
->cct
, primary_num_bytes
, local_num_bytes
);
8007 ldout(pg
->cct
, 10) << __func__
<< " primary_num_bytes " << (primary_num_bytes
>> 10) << "KiB"
8008 << " local " << (local_num_bytes
>> 10) << "KiB"
8009 << " pending_adjustments " << (pending_adjustment
>> 10) << "KiB"
8012 // This lock protects not only the stats OSDService but also setting the pg primary_num_bytes
8013 // That's why we don't immediately unlock
8014 Mutex::Locker
l(pg
->osd
->stat_lock
);
8015 osd_stat_t cur_stat
= pg
->osd
->osd_stat
;
8016 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
8017 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
8018 ldout(pg
->cct
, 10) << "backfill reservation rejected: failure injection"
8020 post_event(RejectTooFullRemoteReservation());
8021 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
8022 pg
->osd
->tentative_backfill_full(pg
, pending_adjustment
, cur_stat
)) {
8023 ldout(pg
->cct
, 10) << "backfill reservation rejected: backfill full"
8025 post_event(RejectTooFullRemoteReservation());
8027 Context
*preempt
= nullptr;
8028 // Don't reserve space if skipped reservation check, this is used
8029 // to test the other backfill full check AND in case a corruption
8030 // of num_bytes requires ignoring that value and trying the
8032 if (primary_num_bytes
&& !pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
)
8033 pg
->set_reserved_num_bytes(primary_num_bytes
, local_num_bytes
);
8035 pg
->clear_reserved_num_bytes();
8036 // Use un-ec-adjusted bytes for stats.
8037 pg
->info
.stats
.stats
.sum
.num_bytes
= evt
.local_num_bytes
;
8038 if (HAVE_FEATURE(pg
->upacting_features
, RECOVERY_RESERVATION_2
)) {
8039 // older peers will interpret preemption as TOOFULL
8040 preempt
= new QueuePeeringEvt
<RemoteBackfillPreempted
>(
8041 pg
, pg
->get_osdmap_epoch(),
8042 RemoteBackfillPreempted());
8044 pg
->osd
->remote_reserver
.request_reservation(
8046 new QueuePeeringEvt
<RemoteBackfillReserved
>(
8047 pg
, pg
->get_osdmap_epoch(),
8048 RemoteBackfillReserved()),
8052 return transit
<RepWaitBackfillReserved
>();
8055 boost::statechart::result
8056 PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio
&evt
)
8058 PG
*pg
= context
< RecoveryMachine
>().pg
;
8060 // fall back to a local reckoning of priority of primary doesn't pass one
8061 // (pre-mimic compat)
8062 int prio
= evt
.priority
? evt
.priority
: pg
->get_recovery_priority();
8064 Context
*preempt
= nullptr;
8065 if (HAVE_FEATURE(pg
->upacting_features
, RECOVERY_RESERVATION_2
)) {
8066 // older peers can't handle this
8067 preempt
= new QueuePeeringEvt
<RemoteRecoveryPreempted
>(
8068 pg
, pg
->get_osdmap_epoch(),
8069 RemoteRecoveryPreempted());
8072 pg
->osd
->remote_reserver
.request_reservation(
8074 new QueuePeeringEvt
<RemoteRecoveryReserved
>(
8075 pg
, pg
->get_osdmap_epoch(),
8076 RemoteRecoveryReserved()),
8079 return transit
<RepWaitRecoveryReserved
>();
8082 void PG::RecoveryState::RepWaitBackfillReserved::exit()
8084 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8085 PG
*pg
= context
< RecoveryMachine
>().pg
;
8086 utime_t dur
= ceph_clock_now() - enter_time
;
8087 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitbackfillreserved_latency
, dur
);
8090 boost::statechart::result
8091 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
8093 PG
*pg
= context
< RecoveryMachine
>().pg
;
8095 pg
->osd
->send_message_osd_cluster(
8097 new MBackfillReserve(
8098 MBackfillReserve::GRANT
,
8099 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
8100 pg
->get_osdmap_epoch()),
8101 pg
->get_osdmap_epoch());
8102 return transit
<RepRecovering
>();
8105 boost::statechart::result
8106 PG::RecoveryState::RepWaitBackfillReserved::react(
8107 const RejectTooFullRemoteReservation
&evt
)
8109 PG
*pg
= context
< RecoveryMachine
>().pg
;
8110 pg
->reject_reservation();
8111 post_event(RemoteReservationRejectedTooFull());
8112 return discard_event();
8115 boost::statechart::result
8116 PG::RecoveryState::RepWaitBackfillReserved::react(
8117 const RemoteReservationRejectedTooFull
&evt
)
8119 PG
*pg
= context
< RecoveryMachine
>().pg
;
8120 pg
->clear_reserved_num_bytes();
8121 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
8122 return transit
<RepNotRecovering
>();
8125 boost::statechart::result
8126 PG::RecoveryState::RepWaitBackfillReserved::react(
8127 const RemoteReservationCanceled
&evt
)
8129 PG
*pg
= context
< RecoveryMachine
>().pg
;
8130 pg
->clear_reserved_num_bytes();
8131 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
8132 return transit
<RepNotRecovering
>();
8135 /*---RepRecovering-------*/
8136 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx
)
8138 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepRecovering")
8140 context
< RecoveryMachine
>().log_enter(state_name
);
8143 boost::statechart::result
8144 PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted
&)
8146 PG
*pg
= context
< RecoveryMachine
>().pg
;
8147 pg
->clear_reserved_num_bytes();
8148 pg
->osd
->send_message_osd_cluster(
8150 new MRecoveryReserve(
8151 MRecoveryReserve::REVOKE
,
8152 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
8153 pg
->get_osdmap_epoch()),
8154 pg
->get_osdmap_epoch());
8155 return discard_event();
8158 boost::statechart::result
8159 PG::RecoveryState::RepRecovering::react(const BackfillTooFull
&)
8161 PG
*pg
= context
< RecoveryMachine
>().pg
;
8162 pg
->clear_reserved_num_bytes();
8163 pg
->osd
->send_message_osd_cluster(
8165 new MBackfillReserve(
8166 MBackfillReserve::REVOKE_TOOFULL
,
8167 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
8168 pg
->get_osdmap_epoch()),
8169 pg
->get_osdmap_epoch());
8170 return discard_event();
8173 boost::statechart::result
8174 PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted
&)
8176 PG
*pg
= context
< RecoveryMachine
>().pg
;
8177 pg
->clear_reserved_num_bytes();
8178 pg
->osd
->send_message_osd_cluster(
8180 new MBackfillReserve(
8181 MBackfillReserve::REVOKE
,
8182 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
8183 pg
->get_osdmap_epoch()),
8184 pg
->get_osdmap_epoch());
8185 return discard_event();
8188 void PG::RecoveryState::RepRecovering::exit()
8190 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8191 PG
*pg
= context
< RecoveryMachine
>().pg
;
8192 pg
->clear_reserved_num_bytes();
8193 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
8194 utime_t dur
= ceph_clock_now() - enter_time
;
8195 pg
->osd
->recoverystate_perf
->tinc(rs_reprecovering_latency
, dur
);
8198 /*------Activating--------*/
8199 PG::RecoveryState::Activating::Activating(my_context ctx
)
8201 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Activating")
8203 context
< RecoveryMachine
>().log_enter(state_name
);
8206 void PG::RecoveryState::Activating::exit()
8208 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8209 PG
*pg
= context
< RecoveryMachine
>().pg
;
8210 utime_t dur
= ceph_clock_now() - enter_time
;
8211 pg
->osd
->recoverystate_perf
->tinc(rs_activating_latency
, dur
);
8214 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
8216 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalRecoveryReserved")
8218 context
< RecoveryMachine
>().log_enter(state_name
);
8219 PG
*pg
= context
< RecoveryMachine
>().pg
;
8221 // Make sure all nodes that part of the recovery aren't full
8222 if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
8223 pg
->osd
->check_osdmap_full(pg
->acting_recovery_backfill
)) {
8224 post_event(RecoveryTooFull());
8228 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
8229 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
8230 pg
->osd
->local_reserver
.request_reservation(
8232 new QueuePeeringEvt
<LocalRecoveryReserved
>(
8233 pg
, pg
->get_osdmap_epoch(),
8234 LocalRecoveryReserved()),
8235 pg
->get_recovery_priority(),
8236 new QueuePeeringEvt
<DeferRecovery
>(
8237 pg
, pg
->get_osdmap_epoch(),
8238 DeferRecovery(0.0)));
8239 pg
->publish_stats_to_osd();
8242 boost::statechart::result
8243 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
8245 PG
*pg
= context
< RecoveryMachine
>().pg
;
8246 pg
->state_set(PG_STATE_RECOVERY_TOOFULL
);
8247 pg
->schedule_recovery_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
8248 return transit
<NotRecovering
>();
8251 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
8253 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8254 PG
*pg
= context
< RecoveryMachine
>().pg
;
8255 utime_t dur
= ceph_clock_now() - enter_time
;
8256 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
8259 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
8261 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
8262 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
8264 context
< RecoveryMachine
>().log_enter(state_name
);
8265 post_event(RemoteRecoveryReserved());
8268 boost::statechart::result
8269 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
8270 PG
*pg
= context
< RecoveryMachine
>().pg
;
8272 if (remote_recovery_reservation_it
!= context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
8273 ceph_assert(*remote_recovery_reservation_it
!= pg
->pg_whoami
);
8274 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
8275 remote_recovery_reservation_it
->osd
, pg
->get_osdmap_epoch());
8277 pg
->osd
->send_message_osd_cluster(
8278 new MRecoveryReserve(
8279 MRecoveryReserve::REQUEST
,
8280 spg_t(pg
->info
.pgid
.pgid
, remote_recovery_reservation_it
->shard
),
8281 pg
->get_osdmap_epoch(),
8282 pg
->get_recovery_priority()),
8285 ++remote_recovery_reservation_it
;
8287 post_event(AllRemotesReserved());
8289 return discard_event();
8292 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
8294 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8295 PG
*pg
= context
< RecoveryMachine
>().pg
;
8296 utime_t dur
= ceph_clock_now() - enter_time
;
8297 pg
->osd
->recoverystate_perf
->tinc(rs_waitremoterecoveryreserved_latency
, dur
);
8300 PG::RecoveryState::Recovering::Recovering(my_context ctx
)
8302 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovering")
8304 context
< RecoveryMachine
>().log_enter(state_name
);
8306 PG
*pg
= context
< RecoveryMachine
>().pg
;
8307 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
8308 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
8309 pg
->state_set(PG_STATE_RECOVERING
);
8310 ceph_assert(!pg
->state_test(PG_STATE_ACTIVATING
));
8311 pg
->publish_stats_to_osd();
8312 pg
->queue_recovery();
8315 void PG::RecoveryState::Recovering::release_reservations(bool cancel
)
8317 PG
*pg
= context
< RecoveryMachine
>().pg
;
8318 ceph_assert(cancel
|| !pg
->pg_log
.get_missing().have_missing());
8320 // release remote reservations
8321 for (set
<pg_shard_t
>::const_iterator i
=
8322 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
8323 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
8325 if (*i
== pg
->pg_whoami
) // skip myself
8327 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
8328 i
->osd
, pg
->get_osdmap_epoch());
8330 pg
->osd
->send_message_osd_cluster(
8331 new MRecoveryReserve(
8332 MRecoveryReserve::RELEASE
,
8333 spg_t(pg
->info
.pgid
.pgid
, i
->shard
),
8334 pg
->get_osdmap_epoch()),
8340 boost::statechart::result
8341 PG::RecoveryState::Recovering::react(const AllReplicasRecovered
&evt
)
8343 PG
*pg
= context
< RecoveryMachine
>().pg
;
8344 pg
->state_clear(PG_STATE_FORCED_RECOVERY
);
8345 release_reservations();
8346 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
8347 return transit
<Recovered
>();
8350 boost::statechart::result
8351 PG::RecoveryState::Recovering::react(const RequestBackfill
&evt
)
8353 PG
*pg
= context
< RecoveryMachine
>().pg
;
8354 pg
->state_clear(PG_STATE_FORCED_RECOVERY
);
8355 release_reservations();
8356 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
8357 // XXX: Is this needed?
8358 pg
->publish_stats_to_osd();
8359 return transit
<WaitLocalBackfillReserved
>();
8362 boost::statechart::result
8363 PG::RecoveryState::Recovering::react(const DeferRecovery
&evt
)
8365 PG
*pg
= context
< RecoveryMachine
>().pg
;
8366 if (!pg
->state_test(PG_STATE_RECOVERING
)) {
8367 // we may have finished recovery and have an AllReplicasRecovered
8368 // event queued to move us to the next state.
8369 ldout(pg
->cct
, 10) << "got defer recovery but not recovering" << dendl
;
8370 return discard_event();
8372 ldout(pg
->cct
, 10) << "defer recovery, retry delay " << evt
.delay
<< dendl
;
8373 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
8374 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
8375 release_reservations(true);
8376 pg
->schedule_recovery_retry(evt
.delay
);
8377 return transit
<NotRecovering
>();
8380 boost::statechart::result
8381 PG::RecoveryState::Recovering::react(const UnfoundRecovery
&evt
)
8383 PG
*pg
= context
< RecoveryMachine
>().pg
;
8384 ldout(pg
->cct
, 10) << "recovery has unfound, can't continue" << dendl
;
8385 pg
->state_set(PG_STATE_RECOVERY_UNFOUND
);
8386 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
8387 release_reservations(true);
8388 return transit
<NotRecovering
>();
8391 void PG::RecoveryState::Recovering::exit()
8393 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8394 PG
*pg
= context
< RecoveryMachine
>().pg
;
8395 utime_t dur
= ceph_clock_now() - enter_time
;
8396 pg
->state_clear(PG_STATE_RECOVERING
);
8397 pg
->osd
->recoverystate_perf
->tinc(rs_recovering_latency
, dur
);
8400 PG::RecoveryState::Recovered::Recovered(my_context ctx
)
8402 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovered")
8404 pg_shard_t auth_log_shard
;
8406 context
< RecoveryMachine
>().log_enter(state_name
);
8408 PG
*pg
= context
< RecoveryMachine
>().pg
;
8410 ceph_assert(!pg
->needs_recovery());
8412 // if we finished backfill, all acting are active; recheck if
8413 // DEGRADED | UNDERSIZED is appropriate.
8414 ceph_assert(!pg
->acting_recovery_backfill
.empty());
8415 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <=
8416 pg
->acting_recovery_backfill
.size()) {
8417 pg
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
8418 pg
->publish_stats_to_osd();
8421 // adjust acting set? (e.g. because backfill completed...)
8422 bool history_les_bound
= false;
8423 if (pg
->acting
!= pg
->up
&& !pg
->choose_acting(auth_log_shard
,
8424 true, &history_les_bound
)) {
8425 ceph_assert(pg
->want_acting
.size());
8426 } else if (!pg
->async_recovery_targets
.empty()) {
8427 pg
->choose_acting(auth_log_shard
, true, &history_les_bound
);
8430 if (context
< Active
>().all_replicas_activated
&&
8431 pg
->async_recovery_targets
.empty())
8432 post_event(GoClean());
8435 void PG::RecoveryState::Recovered::exit()
8437 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8438 PG
*pg
= context
< RecoveryMachine
>().pg
;
8439 utime_t dur
= ceph_clock_now() - enter_time
;
8440 pg
->osd
->recoverystate_perf
->tinc(rs_recovered_latency
, dur
);
8443 PG::RecoveryState::Clean::Clean(my_context ctx
)
8445 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Clean")
8447 context
< RecoveryMachine
>().log_enter(state_name
);
8449 PG
*pg
= context
< RecoveryMachine
>().pg
;
8451 if (pg
->info
.last_complete
!= pg
->info
.last_update
) {
8454 Context
*c
= pg
->finish_recovery();
8455 context
< RecoveryMachine
>().get_cur_transaction()->register_on_commit(c
);
8457 pg
->try_mark_clean();
8460 void PG::RecoveryState::Clean::exit()
8462 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8463 PG
*pg
= context
< RecoveryMachine
>().pg
;
8464 pg
->state_clear(PG_STATE_CLEAN
);
8465 utime_t dur
= ceph_clock_now() - enter_time
;
8466 pg
->osd
->recoverystate_perf
->tinc(rs_clean_latency
, dur
);
8469 template <typename T
>
8470 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
8472 set
<int> osds_found
;
8473 set
<pg_shard_t
> out
;
8474 for (typename
T::const_iterator i
= in
.begin();
8477 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
8478 osds_found
.insert(i
->osd
);
8485 /*---------Active---------*/
8486 PG::RecoveryState::Active::Active(my_context ctx
)
8488 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active"),
8489 remote_shards_to_reserve_recovery(
8490 unique_osd_shard_set(
8491 context
< RecoveryMachine
>().pg
->pg_whoami
,
8492 context
< RecoveryMachine
>().pg
->acting_recovery_backfill
)),
8493 remote_shards_to_reserve_backfill(
8494 unique_osd_shard_set(
8495 context
< RecoveryMachine
>().pg
->pg_whoami
,
8496 context
< RecoveryMachine
>().pg
->backfill_targets
)),
8497 all_replicas_activated(false)
8499 context
< RecoveryMachine
>().log_enter(state_name
);
8501 PG
*pg
= context
< RecoveryMachine
>().pg
;
8503 ceph_assert(!pg
->backfill_reserving
);
8504 ceph_assert(!pg
->backfill_reserved
);
8505 ceph_assert(pg
->is_primary());
8506 ldout(pg
->cct
, 10) << "In Active, about to call activate" << dendl
;
8507 pg
->start_flush(context
< RecoveryMachine
>().get_cur_transaction());
8508 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
8509 pg
->get_osdmap_epoch(),
8510 *context
< RecoveryMachine
>().get_query_map(),
8511 context
< RecoveryMachine
>().get_info_map(),
8512 context
< RecoveryMachine
>().get_recovery_ctx());
8514 // everyone has to commit/ack before we are truly active
8515 pg
->blocked_by
.clear();
8516 for (set
<pg_shard_t
>::iterator p
= pg
->acting_recovery_backfill
.begin();
8517 p
!= pg
->acting_recovery_backfill
.end();
8519 if (p
->shard
!= pg
->pg_whoami
.shard
) {
8520 pg
->blocked_by
.insert(p
->shard
);
8523 pg
->publish_stats_to_osd();
8524 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
8527 boost::statechart::result
PG::RecoveryState::Active::react(const AdvMap
& advmap
)
8529 PG
*pg
= context
< RecoveryMachine
>().pg
;
8530 if (pg
->should_restart_peering(
8532 advmap
.acting_primary
,
8537 ldout(pg
->cct
, 10) << "Active advmap interval change, fast return" << dendl
;
8538 return forward_event();
8540 ldout(pg
->cct
, 10) << "Active advmap" << dendl
;
8541 bool need_publish
= false;
8543 if (advmap
.osdmap
->require_osd_release
>= CEPH_RELEASE_MIMIC
) {
8544 const auto& new_removed_snaps
= advmap
.osdmap
->get_new_removed_snaps();
8545 auto i
= new_removed_snaps
.find(pg
->info
.pgid
.pool());
8546 if (i
!= new_removed_snaps
.end()) {
8548 for (auto j
: i
->second
) {
8549 if (pg
->snap_trimq
.intersects(j
.first
, j
.second
)) {
8550 decltype(pg
->snap_trimq
) added
, overlap
;
8551 added
.insert(j
.first
, j
.second
);
8552 overlap
.intersection_of(pg
->snap_trimq
, added
);
8553 if (pg
->last_require_osd_release
< CEPH_RELEASE_MIMIC
) {
8554 lderr(pg
->cct
) << __func__
<< " removed_snaps already contains "
8555 << overlap
<< ", but this is the first mimic+ osdmap,"
8556 << " so it's expected" << dendl
;
8558 lderr(pg
->cct
) << __func__
<< " removed_snaps already contains "
8559 << overlap
<< dendl
;
8562 pg
->snap_trimq
.union_of(added
);
8564 pg
->snap_trimq
.insert(j
.first
, j
.second
);
8567 if (pg
->last_require_osd_release
< CEPH_RELEASE_MIMIC
) {
8568 // at upgrade, we report *all* previously removed snaps as removed in
8569 // the first mimic epoch. remove the ones we previously divined were
8570 // removed (and subsequently purged) from the trimq.
8571 lderr(pg
->cct
) << __func__
<< " first mimic map, filtering purged_snaps"
8572 << " from new removed_snaps" << dendl
;
8573 pg
->snap_trimq
.subtract(pg
->info
.purged_snaps
);
8575 ldout(pg
->cct
,10) << __func__
<< " new removed_snaps " << i
->second
8576 << ", snap_trimq now " << pg
->snap_trimq
<< dendl
;
8577 ceph_assert(!bad
|| !pg
->cct
->_conf
->osd_debug_verify_cached_snaps
);
8578 pg
->dirty_info
= true;
8579 pg
->dirty_big_info
= true;
8582 const auto& new_purged_snaps
= advmap
.osdmap
->get_new_purged_snaps();
8583 auto j
= new_purged_snaps
.find(pg
->info
.pgid
.pool());
8584 if (j
!= new_purged_snaps
.end()) {
8586 for (auto k
: j
->second
) {
8587 if (!pg
->info
.purged_snaps
.contains(k
.first
, k
.second
)) {
8588 decltype(pg
->info
.purged_snaps
) rm
, overlap
;
8589 rm
.insert(k
.first
, k
.second
);
8590 overlap
.intersection_of(pg
->info
.purged_snaps
, rm
);
8591 lderr(pg
->cct
) << __func__
<< " purged_snaps does not contain "
8592 << rm
<< ", only " << overlap
<< dendl
;
8593 pg
->info
.purged_snaps
.subtract(overlap
);
8594 // This can currently happen in the normal (if unlikely) course of
8595 // events. Because adding snaps to purged_snaps does not increase
8596 // the pg version or add a pg log entry, we don't reliably propagate
8597 // purged_snaps additions to other OSDs.
8600 // - primary and replicas update purged_snaps
8601 // - no object updates
8602 // - pg mapping changes, new primary on different node
8603 // - new primary pg version == eversion_t(), so info is not
8607 pg
->info
.purged_snaps
.erase(k
.first
, k
.second
);
8610 ldout(pg
->cct
,10) << __func__
<< " new purged_snaps " << j
->second
8611 << ", now " << pg
->info
.purged_snaps
<< dendl
;
8612 ceph_assert(!bad
|| !pg
->cct
->_conf
->osd_debug_verify_cached_snaps
);
8613 pg
->dirty_info
= true;
8614 pg
->dirty_big_info
= true;
8616 if (pg
->dirty_big_info
) {
8617 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
8618 // purged snaps and (b) perhaps share more snaps that we have purged
8619 // but didn't fit in pg_stat_t.
8620 need_publish
= true;
8621 pg
->share_pg_info();
8623 } else if (!pg
->pool
.newly_removed_snaps
.empty()) {
8624 pg
->snap_trimq
.union_of(pg
->pool
.newly_removed_snaps
);
8625 ldout(pg
->cct
, 10) << *pg
<< " snap_trimq now " << pg
->snap_trimq
<< dendl
;
8626 pg
->dirty_info
= true;
8627 pg
->dirty_big_info
= true;
8630 for (size_t i
= 0; i
< pg
->want_acting
.size(); i
++) {
8631 int osd
= pg
->want_acting
[i
];
8632 if (!advmap
.osdmap
->is_up(osd
)) {
8633 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
8634 ceph_assert(pg
->is_acting(osd_with_shard
) || pg
->is_up(osd_with_shard
));
8638 /* Check for changes in pool size (if the acting set changed as a result,
8639 * this does not matter) */
8640 if (advmap
.lastmap
->get_pg_size(pg
->info
.pgid
.pgid
) !=
8641 pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
)) {
8642 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <= pg
->actingset
.size()) {
8643 pg
->state_clear(PG_STATE_UNDERSIZED
);
8645 pg
->state_set(PG_STATE_UNDERSIZED
);
8647 // degraded changes will be detected by call from publish_stats_to_osd()
8648 need_publish
= true;
8651 // if we haven't reported our PG stats in a long time, do so now.
8652 if (pg
->info
.stats
.reported_epoch
+ pg
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
8653 ldout(pg
->cct
, 20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - pg
->info
.stats
.reported_epoch
)
8654 << " epochs" << dendl
;
8655 need_publish
= true;
8659 pg
->publish_stats_to_osd();
8661 return forward_event();
8664 boost::statechart::result
PG::RecoveryState::Active::react(const ActMap
&)
8666 PG
*pg
= context
< RecoveryMachine
>().pg
;
8667 ldout(pg
->cct
, 10) << "Active: handling ActMap" << dendl
;
8668 ceph_assert(pg
->is_primary());
8670 if (pg
->have_unfound()) {
8671 // object may have become unfound
8672 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
8675 if (pg
->cct
->_conf
->osd_check_for_log_corruption
)
8676 pg
->check_log_for_corruption(pg
->osd
->store
);
8678 uint64_t unfound
= pg
->missing_loc
.num_unfound();
8680 pg
->all_unfound_are_queried_or_lost(pg
->get_osdmap())) {
8681 if (pg
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
8682 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
8683 << " objects unfound and apparently lost, would automatically "
8684 << "mark these objects lost but this feature is not yet implemented "
8685 << "(osd_auto_mark_unfound_lost)";
8687 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has "
8688 << unfound
<< " objects unfound and apparently lost";
8691 if (pg
->is_active()) {
8692 ldout(pg
->cct
, 10) << "Active: kicking snap trim" << dendl
;
8693 pg
->kick_snap_trim();
8696 if (pg
->is_peered() &&
8698 !pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
) &&
8699 (!pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) || pg
->is_degraded())) {
8700 pg
->queue_recovery();
8702 return forward_event();
8705 boost::statechart::result
PG::RecoveryState::Active::react(const MNotifyRec
& notevt
)
8707 PG
*pg
= context
< RecoveryMachine
>().pg
;
8708 ceph_assert(pg
->is_primary());
8709 if (pg
->peer_info
.count(notevt
.from
)) {
8710 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
8711 << ", already have info from that osd, ignoring"
8713 } else if (pg
->peer_purged
.count(notevt
.from
)) {
8714 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
8715 << ", already purged that peer, ignoring"
8718 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
8719 << ", calling proc_replica_info and discover_all_missing"
8721 pg
->proc_replica_info(
8722 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
8723 if (pg
->have_unfound() || (pg
->is_degraded() && pg
->might_have_unfound
.count(notevt
.from
))) {
8724 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
8727 return discard_event();
// MTrim while Active (primary): a replica reports its
// last_complete_ondisk so we can advance min_last_complete_ondisk and
// eventually trim the shared log.
boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim)
{
  PG *pg = context< RecoveryMachine >().pg;
  ceph_assert(pg->is_primary());

  // peer is informing us of their last_complete_ondisk
  ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
  pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to;

  // trim log when the pg is recovered
  pg->calc_min_last_complete_ondisk();
  return discard_event();
}
// MInfoRec while Active (primary): a replica acknowledges that it has
// activated and committed.  Record the ack; once every member of
// acting_recovery_backfill has acked, declare all_activated_and_committed.
boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ceph_assert(pg->is_primary());

  ceph_assert(!pg->acting_recovery_backfill.empty());
  // don't update history (yet) if we are active and primary; the replica
  // may be telling us they have activated (and committed) but we can't
  // share that until _everyone_ does the same.
  if (pg->is_acting_recovery_backfill(infoevt.from) &&
      pg->peer_activated.count(infoevt.from) == 0) {
    ldout(pg->cct, 10) << " peer osd." << infoevt.from
		       << " activated and committed" << dendl;
    pg->peer_activated.insert(infoevt.from);
    pg->blocked_by.erase(infoevt.from.shard);
    pg->publish_stats_to_osd();
    if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) {
      pg->all_activated_and_committed();
    }
  }
  return discard_event();
}
8767 boost::statechart::result
PG::RecoveryState::Active::react(const MLogRec
& logevt
)
8769 PG
*pg
= context
< RecoveryMachine
>().pg
;
8770 ldout(pg
->cct
, 10) << "searching osd." << logevt
.from
8771 << " log for unfound items" << dendl
;
8772 pg
->proc_replica_log(
8773 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
8774 bool got_missing
= pg
->search_for_missing(
8775 pg
->peer_info
[logevt
.from
],
8776 pg
->peer_missing
[logevt
.from
],
8778 context
< RecoveryMachine
>().get_recovery_ctx());
8779 // If there are missing AND we are "fully" active then start recovery now
8780 if (got_missing
&& pg
->state_test(PG_STATE_ACTIVE
)) {
8781 post_event(DoRecovery());
8783 return discard_event();
8786 boost::statechart::result
PG::RecoveryState::Active::react(const QueryState
& q
)
8788 PG
*pg
= context
< RecoveryMachine
>().pg
;
8790 q
.f
->open_object_section("state");
8791 q
.f
->dump_string("name", state_name
);
8792 q
.f
->dump_stream("enter_time") << enter_time
;
8795 q
.f
->open_array_section("might_have_unfound");
8796 for (set
<pg_shard_t
>::iterator p
= pg
->might_have_unfound
.begin();
8797 p
!= pg
->might_have_unfound
.end();
8799 q
.f
->open_object_section("osd");
8800 q
.f
->dump_stream("osd") << *p
;
8801 if (pg
->peer_missing
.count(*p
)) {
8802 q
.f
->dump_string("status", "already probed");
8803 } else if (pg
->peer_missing_requested
.count(*p
)) {
8804 q
.f
->dump_string("status", "querying");
8805 } else if (!pg
->get_osdmap()->is_up(p
->osd
)) {
8806 q
.f
->dump_string("status", "osd is down");
8808 q
.f
->dump_string("status", "not queried");
8810 q
.f
->close_section();
8812 q
.f
->close_section();
8815 q
.f
->open_object_section("recovery_progress");
8816 pg
->dump_recovery_info(q
.f
);
8817 q
.f
->close_section();
8821 q
.f
->open_object_section("scrub");
8822 q
.f
->dump_stream("scrubber.epoch_start") << pg
->scrubber
.epoch_start
;
8823 q
.f
->dump_bool("scrubber.active", pg
->scrubber
.active
);
8824 q
.f
->dump_string("scrubber.state", Scrubber::state_string(pg
->scrubber
.state
));
8825 q
.f
->dump_stream("scrubber.start") << pg
->scrubber
.start
;
8826 q
.f
->dump_stream("scrubber.end") << pg
->scrubber
.end
;
8827 q
.f
->dump_stream("scrubber.max_end") << pg
->scrubber
.max_end
;
8828 q
.f
->dump_stream("scrubber.subset_last_update") << pg
->scrubber
.subset_last_update
;
8829 q
.f
->dump_bool("scrubber.deep", pg
->scrubber
.deep
);
8831 q
.f
->open_array_section("scrubber.waiting_on_whom");
8832 for (set
<pg_shard_t
>::iterator p
= pg
->scrubber
.waiting_on_whom
.begin();
8833 p
!= pg
->scrubber
.waiting_on_whom
.end();
8835 q
.f
->dump_stream("shard") << *p
;
8837 q
.f
->close_section();
8839 q
.f
->close_section();
8842 q
.f
->close_section();
8843 return forward_event();
8846 boost::statechart::result
PG::RecoveryState::Active::react(const AllReplicasActivated
&evt
)
8848 PG
*pg
= context
< RecoveryMachine
>().pg
;
8849 pg_t pgid
= pg
->info
.pgid
.pgid
;
8851 all_replicas_activated
= true;
8853 pg
->state_clear(PG_STATE_ACTIVATING
);
8854 pg
->state_clear(PG_STATE_CREATING
);
8855 pg
->state_clear(PG_STATE_PREMERGE
);
8858 if (pg
->pool
.info
.is_pending_merge(pgid
, &merge_target
)) {
8859 pg
->state_set(PG_STATE_PEERED
);
8860 pg
->state_set(PG_STATE_PREMERGE
);
8862 if (pg
->actingset
.size() != pg
->get_osdmap()->get_pg_size(pgid
)) {
8865 src
.set_ps(pg
->pool
.info
.get_pg_num_pending());
8866 assert(src
.get_parent() == pgid
);
8867 pg
->osd
->set_not_ready_to_merge_target(pgid
, src
);
8869 pg
->osd
->set_not_ready_to_merge_source(pgid
);
8872 } else if (pg
->acting
.size() < pg
->pool
.info
.min_size
) {
8873 pg
->state_set(PG_STATE_PEERED
);
8875 pg
->state_set(PG_STATE_ACTIVE
);
8878 if (pg
->pool
.info
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8879 pg
->osd
->send_pg_created(pgid
);
8882 pg
->info
.history
.last_epoch_started
= pg
->info
.last_epoch_started
;
8883 pg
->info
.history
.last_interval_started
= pg
->info
.last_interval_started
;
8884 pg
->dirty_info
= true;
8886 pg
->share_pg_info();
8887 pg
->publish_stats_to_osd();
8892 if (pg
->flushes_in_progress
== 0) {
8893 pg
->requeue_ops(pg
->waiting_for_peered
);
8894 } else if (!pg
->waiting_for_peered
.empty()) {
8895 ldout(pg
->cct
, 10) << __func__
<< " flushes in progress, moving "
8896 << pg
->waiting_for_peered
.size()
8897 << " items to waiting_for_flush"
8899 ceph_assert(pg
->waiting_for_flush
.empty());
8900 pg
->waiting_for_flush
.swap(pg
->waiting_for_peered
);
8905 return discard_event();
8908 void PG::RecoveryState::Active::exit()
8910 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8911 PG
*pg
= context
< RecoveryMachine
>().pg
;
8912 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
8914 pg
->blocked_by
.clear();
8915 pg
->backfill_reserved
= false;
8916 pg
->backfill_reserving
= false;
8917 pg
->state_clear(PG_STATE_ACTIVATING
);
8918 pg
->state_clear(PG_STATE_DEGRADED
);
8919 pg
->state_clear(PG_STATE_UNDERSIZED
);
8920 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
8921 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
8922 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
8923 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
8924 utime_t dur
= ceph_clock_now() - enter_time
;
8925 pg
->osd
->recoverystate_perf
->tinc(rs_active_latency
, dur
);
8929 /*------ReplicaActive-----*/
8930 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx
)
8932 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive")
8934 context
< RecoveryMachine
>().log_enter(state_name
);
8936 PG
*pg
= context
< RecoveryMachine
>().pg
;
8937 pg
->start_flush(context
< RecoveryMachine
>().get_cur_transaction());
// Activate while ReplicaActive: the primary told us to activate.  Run
// PG::activate() against the current transaction; the query map is a
// throwaway here (replicas do not send queries).
boost::statechart::result PG::RecoveryState::ReplicaActive::react(
  const Activate& actevt) {
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
  map<int, map<spg_t, pg_query_t> > query_map;  // unused on replicas
  pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
	       actevt.activation_epoch,
	       query_map, NULL, NULL);
  ldout(pg->cct, 10) << "Activate Finished" << dendl;
  return discard_event();
}
8953 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MInfoRec
& infoevt
)
8955 PG
*pg
= context
< RecoveryMachine
>().pg
;
8956 pg
->proc_primary_info(*context
<RecoveryMachine
>().get_cur_transaction(),
8958 return discard_event();
// MLogRec while ReplicaActive: the primary sent log entries; merge them
// into our local log within the current transaction.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
  pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
  // after the merge our log head must match info.last_update
  ceph_assert(pg->pg_log.get_head() == pg->info.last_update);

  return discard_event();
}
// MTrim while ReplicaActive: the primary tells us how far the log can
// be trimmed; trim and mark info dirty so it gets persisted.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim)
{
  PG *pg = context< RecoveryMachine >().pg;
  // primary is instructing us to trim
  pg->pg_log.trim(trim.trim_to, pg->info);
  pg->dirty_info = true;
  return discard_event();
}
8981 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const ActMap
&)
8983 PG
*pg
= context
< RecoveryMachine
>().pg
;
8984 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
8985 context
< RecoveryMachine
>().send_notify(
8988 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
8989 pg
->get_osdmap_epoch(),
8990 pg
->get_osdmap_epoch(),
8992 pg
->past_intervals
);
8995 return discard_event();
8998 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(
8999 const MQuery
& query
)
9001 PG
*pg
= context
< RecoveryMachine
>().pg
;
9002 pg
->fulfill_query(query
, context
<RecoveryMachine
>().get_recovery_ctx());
9003 return discard_event();
// QueryState while ReplicaActive: dump just the state name and entry
// time; replicas keep no extra peering bookkeeping worth reporting.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return forward_event();
}
// Leaving ReplicaActive: release any remote (backfill) reservation and
// reserved byte accounting held on behalf of the primary, and record
// time spent in this state.
void PG::RecoveryState::ReplicaActive::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->clear_reserved_num_bytes();
  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
}
9026 PG::RecoveryState::Stray::Stray(my_context ctx
)
9028 NamedState(context
< RecoveryMachine
>().pg
, "Started/Stray")
9030 context
< RecoveryMachine
>().log_enter(state_name
);
9032 PG
*pg
= context
< RecoveryMachine
>().pg
;
9033 ceph_assert(!pg
->is_peered());
9034 ceph_assert(!pg
->is_peering());
9035 ceph_assert(!pg
->is_primary());
9037 if (!pg
->get_osdmap()->have_pg_pool(pg
->get_pgid().pool())) {
9038 ldout(pg
->cct
,10) << __func__
<< " pool is deleted" << dendl
;
9039 post_event(DeleteStart());
9041 pg
->start_flush(context
< RecoveryMachine
>().get_cur_transaction());
9045 boost::statechart::result
PG::RecoveryState::Stray::react(const MLogRec
& logevt
)
9047 PG
*pg
= context
< RecoveryMachine
>().pg
;
9048 MOSDPGLog
*msg
= logevt
.msg
.get();
9049 ldout(pg
->cct
, 10) << "got info+log from osd." << logevt
.from
<< " " << msg
->info
<< " " << msg
->log
<< dendl
;
9051 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
9052 if (msg
->info
.last_backfill
== hobject_t()) {
9054 pg
->info
= msg
->info
;
9055 pg
->on_info_history_change();
9056 pg
->dirty_info
= true;
9057 pg
->dirty_big_info
= true; // maybe.
9059 PGLogEntryHandler rollbacker
{pg
, t
};
9060 pg
->pg_log
.reset_backfill_claim_log(msg
->log
, &rollbacker
);
9062 pg
->pg_log
.reset_backfill();
9064 pg
->merge_log(*t
, msg
->info
, msg
->log
, logevt
.from
);
9067 ceph_assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
9069 post_event(Activate(logevt
.msg
->info
.last_epoch_started
));
9070 return transit
<ReplicaActive
>();
// MInfoRec while Stray: the primary sent only info (no log).  If our log
// is ahead of the primary's, rewind the divergent tail and adopt the
// primary's stats/hit_set; then activate and become a ReplicaActive.
boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;

  if (pg->info.last_update > infoevt.info.last_update) {
    // rewind divergent log entries
    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
    pg->rewind_divergent_log(*t, infoevt.info.last_update);
    pg->info.stats = infoevt.info.stats;
    pg->info.hit_set = infoevt.info.hit_set;
  }

  // we must now agree exactly with the primary's last_update
  ceph_assert(infoevt.info.last_update == pg->info.last_update);
  ceph_assert(pg->pg_log.get_head() == pg->info.last_update);

  post_event(Activate(infoevt.info.last_epoch_started));
  return transit<ReplicaActive>();
}
9093 boost::statechart::result
PG::RecoveryState::Stray::react(const MQuery
& query
)
9095 PG
*pg
= context
< RecoveryMachine
>().pg
;
9096 pg
->fulfill_query(query
, context
<RecoveryMachine
>().get_recovery_ctx());
9097 return discard_event();
9100 boost::statechart::result
PG::RecoveryState::Stray::react(const ActMap
&)
9102 PG
*pg
= context
< RecoveryMachine
>().pg
;
9103 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
9104 context
< RecoveryMachine
>().send_notify(
9107 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
9108 pg
->get_osdmap_epoch(),
9109 pg
->get_osdmap_epoch(),
9111 pg
->past_intervals
);
9114 return discard_event();
// Leaving Stray: just account the time spent in this state.
void PG::RecoveryState::Stray::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
}
9126 /*--------ToDelete----------*/
9127 PG::RecoveryState::ToDelete::ToDelete(my_context ctx
)
9129 NamedState(context
< RecoveryMachine
>().pg
, "Started/ToDelete")
9131 context
< RecoveryMachine
>().log_enter(state_name
);
9132 PG
*pg
= context
< RecoveryMachine
>().pg
;
9133 pg
->osd
->logger
->inc(l_osd_pg_removing
);
// Leaving ToDelete (without completing the removal): undo the counter
// bump and drop the delete reservation.
void PG::RecoveryState::ToDelete::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  // note: on a successful removal, this path doesn't execute. see
  // _delete_some() -- the PG object is torn down before the state exits.
  pg->osd->logger->dec(l_osd_pg_removing);
  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
}
9146 /*----WaitDeleteReserved----*/
9147 PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx
)
9149 NamedState(context
< RecoveryMachine
>().pg
,
9150 "Started/ToDelete/WaitDeleteReseved")
9152 context
< RecoveryMachine
>().log_enter(state_name
);
9153 PG
*pg
= context
< RecoveryMachine
>().pg
;
9154 context
<ToDelete
>().priority
= pg
->get_delete_priority();
9155 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
9156 pg
->osd
->local_reserver
.request_reservation(
9158 new QueuePeeringEvt
<DeleteReserved
>(
9159 pg
, pg
->get_osdmap_epoch(),
9161 context
<ToDelete
>().priority
,
9162 new QueuePeeringEvt
<DeleteInterrupted
>(
9163 pg
, pg
->get_osdmap_epoch(),
9164 DeleteInterrupted()));
9167 boost::statechart::result
PG::RecoveryState::ToDelete::react(
9170 PG
*pg
= context
< RecoveryMachine
>().pg
;
9171 if (pg
->get_delete_priority() != priority
) {
9172 ldout(pg
->cct
,10) << __func__
<< " delete priority changed, resetting"
9174 return transit
<ToDelete
>();
9176 return discard_event();
// Leaving WaitDeleteReserved: nothing to release here (the reservation
// is cancelled by ToDelete::exit); just log.
void PG::RecoveryState::WaitDeleteReserved::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
}
9184 /*----Deleting-----*/
9185 PG::RecoveryState::Deleting::Deleting(my_context ctx
)
9187 NamedState(context
< RecoveryMachine
>().pg
, "Started/ToDelete/Deleting")
9189 context
< RecoveryMachine
>().log_enter(state_name
);
9190 PG
*pg
= context
< RecoveryMachine
>().pg
;
9191 pg
->deleting
= true;
9192 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
9194 t
->register_on_commit(new C_DeleteMore(pg
, pg
->get_osdmap_epoch()));
// DeleteSome: remove the next batch of objects; _delete_some() schedules
// the following batch itself until the PG is gone.
boost::statechart::result PG::RecoveryState::Deleting::react(
  const DeleteSome& evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  pg->_delete_some(context<RecoveryMachine>().get_cur_transaction());
  return discard_event();
}
// Leaving Deleting (removal interrupted): clear the deleting flag and
// give back the local delete reservation.
void PG::RecoveryState::Deleting::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->deleting = false;
  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
}
9213 /*--------GetInfo---------*/
9214 PG::RecoveryState::GetInfo::GetInfo(my_context ctx
)
9216 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetInfo")
9218 context
< RecoveryMachine
>().log_enter(state_name
);
9220 PG
*pg
= context
< RecoveryMachine
>().pg
;
9221 pg
->check_past_interval_bounds();
9222 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
9224 ceph_assert(pg
->blocked_by
.empty());
9226 prior_set
= pg
->build_prior();
9228 pg
->reset_min_peer_features();
9230 if (prior_set
.pg_down
) {
9231 post_event(IsDown());
9232 } else if (peer_info_requested
.empty()) {
9233 post_event(GotInfo());
9237 void PG::RecoveryState::GetInfo::get_infos()
9239 PG
*pg
= context
< RecoveryMachine
>().pg
;
9240 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
9242 pg
->blocked_by
.clear();
9243 for (set
<pg_shard_t
>::const_iterator it
= prior_set
.probe
.begin();
9244 it
!= prior_set
.probe
.end();
9246 pg_shard_t peer
= *it
;
9247 if (peer
== pg
->pg_whoami
) {
9250 if (pg
->peer_info
.count(peer
)) {
9251 ldout(pg
->cct
, 10) << " have osd." << peer
<< " info " << pg
->peer_info
[peer
] << dendl
;
9254 if (peer_info_requested
.count(peer
)) {
9255 ldout(pg
->cct
, 10) << " already requested info from osd." << peer
<< dendl
;
9256 pg
->blocked_by
.insert(peer
.osd
);
9257 } else if (!pg
->get_osdmap()->is_up(peer
.osd
)) {
9258 ldout(pg
->cct
, 10) << " not querying info from down osd." << peer
<< dendl
;
9260 ldout(pg
->cct
, 10) << " querying info from osd." << peer
<< dendl
;
9261 context
< RecoveryMachine
>().send_query(
9262 peer
, pg_query_t(pg_query_t::INFO
,
9263 it
->shard
, pg
->pg_whoami
.shard
,
9265 pg
->get_osdmap_epoch()));
9266 peer_info_requested
.insert(peer
);
9267 pg
->blocked_by
.insert(peer
.osd
);
9271 pg
->publish_stats_to_osd();
9274 boost::statechart::result
PG::RecoveryState::GetInfo::react(const MNotifyRec
& infoevt
)
9276 PG
*pg
= context
< RecoveryMachine
>().pg
;
9278 set
<pg_shard_t
>::iterator p
= peer_info_requested
.find(infoevt
.from
);
9279 if (p
!= peer_info_requested
.end()) {
9280 peer_info_requested
.erase(p
);
9281 pg
->blocked_by
.erase(infoevt
.from
.osd
);
9284 epoch_t old_start
= pg
->info
.history
.last_epoch_started
;
9285 if (pg
->proc_replica_info(
9286 infoevt
.from
, infoevt
.notify
.info
, infoevt
.notify
.epoch_sent
)) {
9287 // we got something new ...
9288 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
9289 if (old_start
< pg
->info
.history
.last_epoch_started
) {
9290 ldout(pg
->cct
, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl
;
9291 prior_set
= pg
->build_prior();
9293 // filter out any osds that got dropped from the probe set from
9294 // peer_info_requested. this is less expensive than restarting
9295 // peering (which would re-probe everyone).
9296 set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
9297 while (p
!= peer_info_requested
.end()) {
9298 if (prior_set
.probe
.count(*p
) == 0) {
9299 ldout(pg
->cct
, 20) << " dropping osd." << *p
<< " from info_requested, no longer in probe set" << dendl
;
9300 peer_info_requested
.erase(p
++);
9307 ldout(pg
->cct
, 20) << "Adding osd: " << infoevt
.from
.osd
<< " peer features: "
9308 << hex
<< infoevt
.features
<< dec
<< dendl
;
9309 pg
->apply_peer_features(infoevt
.features
);
9311 // are we done getting everything?
9312 if (peer_info_requested
.empty() && !prior_set
.pg_down
) {
9313 ldout(pg
->cct
, 20) << "Common peer features: " << hex
<< pg
->get_min_peer_features() << dec
<< dendl
;
9314 ldout(pg
->cct
, 20) << "Common acting features: " << hex
<< pg
->get_min_acting_features() << dec
<< dendl
;
9315 ldout(pg
->cct
, 20) << "Common upacting features: " << hex
<< pg
->get_min_upacting_features() << dec
<< dendl
;
9316 post_event(GotInfo());
9319 return discard_event();
9322 boost::statechart::result
PG::RecoveryState::GetInfo::react(const QueryState
& q
)
9324 PG
*pg
= context
< RecoveryMachine
>().pg
;
9325 q
.f
->open_object_section("state");
9326 q
.f
->dump_string("name", state_name
);
9327 q
.f
->dump_stream("enter_time") << enter_time
;
9329 q
.f
->open_array_section("requested_info_from");
9330 for (set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
9331 p
!= peer_info_requested
.end();
9333 q
.f
->open_object_section("osd");
9334 q
.f
->dump_stream("osd") << *p
;
9335 if (pg
->peer_info
.count(*p
)) {
9336 q
.f
->open_object_section("got_info");
9337 pg
->peer_info
[*p
].dump(q
.f
);
9338 q
.f
->close_section();
9340 q
.f
->close_section();
9342 q
.f
->close_section();
9344 q
.f
->close_section();
9345 return forward_event();
// Leaving GetInfo: account latency and clear blocked_by (we are no
// longer waiting on info replies).
void PG::RecoveryState::GetInfo::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
  pg->blocked_by.clear();
}
9357 /*------GetLog------------*/
9358 PG::RecoveryState::GetLog::GetLog(my_context ctx
)
9361 context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetLog"),
9364 context
< RecoveryMachine
>().log_enter(state_name
);
9366 PG
*pg
= context
< RecoveryMachine
>().pg
;
9369 if (!pg
->choose_acting(auth_log_shard
, false,
9370 &context
< Peering
>().history_les_bound
)) {
9371 if (!pg
->want_acting
.empty()) {
9372 post_event(NeedActingChange());
9374 post_event(IsIncomplete());
9380 if (auth_log_shard
== pg
->pg_whoami
) {
9381 post_event(GotLog());
9385 const pg_info_t
& best
= pg
->peer_info
[auth_log_shard
];
9388 if (pg
->info
.last_update
< best
.log_tail
) {
9389 ldout(pg
->cct
, 10) << " not contiguous with osd." << auth_log_shard
<< ", down" << dendl
;
9390 post_event(IsIncomplete());
9394 // how much log to request?
9395 eversion_t request_log_from
= pg
->info
.last_update
;
9396 ceph_assert(!pg
->acting_recovery_backfill
.empty());
9397 for (set
<pg_shard_t
>::iterator p
= pg
->acting_recovery_backfill
.begin();
9398 p
!= pg
->acting_recovery_backfill
.end();
9400 if (*p
== pg
->pg_whoami
) continue;
9401 pg_info_t
& ri
= pg
->peer_info
[*p
];
9402 if (ri
.last_update
< pg
->info
.log_tail
&& ri
.last_update
>= best
.log_tail
&&
9403 ri
.last_update
< request_log_from
)
9404 request_log_from
= ri
.last_update
;
9408 ldout(pg
->cct
, 10) << " requesting log from osd." << auth_log_shard
<< dendl
;
9409 context
<RecoveryMachine
>().send_query(
9413 auth_log_shard
.shard
, pg
->pg_whoami
.shard
,
9414 request_log_from
, pg
->info
.history
,
9415 pg
->get_osdmap_epoch()));
9417 ceph_assert(pg
->blocked_by
.empty());
9418 pg
->blocked_by
.insert(auth_log_shard
.osd
);
9419 pg
->publish_stats_to_osd();
9422 boost::statechart::result
PG::RecoveryState::GetLog::react(const AdvMap
& advmap
)
9424 PG
*pg
= context
< RecoveryMachine
>().pg
;
9425 // make sure our log source didn't go down. we need to check
9426 // explicitly because it may not be part of the prior set, which
9427 // means the Peering state check won't catch it going down.
9428 if (!advmap
.osdmap
->is_up(auth_log_shard
.osd
)) {
9429 ldout(pg
->cct
, 10) << "GetLog: auth_log_shard osd."
9430 << auth_log_shard
.osd
<< " went down" << dendl
;
9432 return transit
< Reset
>();
9435 // let the Peering state do its checks.
9436 return forward_event();
9439 boost::statechart::result
PG::RecoveryState::GetLog::react(const MLogRec
& logevt
)
9441 PG
*pg
= context
< RecoveryMachine
>().pg
;
9443 if (logevt
.from
!= auth_log_shard
) {
9444 ldout(pg
->cct
, 10) << "GetLog: discarding log from "
9445 << "non-auth_log_shard osd." << logevt
.from
<< dendl
;
9446 return discard_event();
9448 ldout(pg
->cct
, 10) << "GetLog: received master log from osd"
9449 << logevt
.from
<< dendl
;
9451 post_event(GotLog());
9452 return discard_event();
9455 boost::statechart::result
PG::RecoveryState::GetLog::react(const GotLog
&)
9457 PG
*pg
= context
< RecoveryMachine
>().pg
;
9458 ldout(pg
->cct
, 10) << "leaving GetLog" << dendl
;
9460 ldout(pg
->cct
, 10) << "processing master log" << dendl
;
9461 pg
->proc_master_log(*context
<RecoveryMachine
>().get_cur_transaction(),
9462 msg
->info
, msg
->log
, msg
->missing
,
9465 pg
->start_flush(context
< RecoveryMachine
>().get_cur_transaction());
9466 return transit
< GetMissing
>();
// QueryState while GetLog: report which shard we chose as the
// authoritative log source.
boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_stream("auth_log_shard") << auth_log_shard;
  q.f->close_section();
  return forward_event();
}
// Leaving GetLog: account latency and stop reporting the auth shard as
// blocking us.
void PG::RecoveryState::GetLog::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
  pg->blocked_by.clear();
}
9488 /*------WaitActingChange--------*/
9489 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx
)
9491 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/WaitActingChange")
9493 context
< RecoveryMachine
>().log_enter(state_name
);
9496 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const AdvMap
& advmap
)
9498 PG
*pg
= context
< RecoveryMachine
>().pg
;
9499 OSDMapRef osdmap
= advmap
.osdmap
;
9501 ldout(pg
->cct
, 10) << "verifying no want_acting " << pg
->want_acting
<< " targets didn't go down" << dendl
;
9502 for (vector
<int>::iterator p
= pg
->want_acting
.begin(); p
!= pg
->want_acting
.end(); ++p
) {
9503 if (!osdmap
->is_up(*p
)) {
9504 ldout(pg
->cct
, 10) << " want_acting target osd." << *p
<< " went down, resetting" << dendl
;
9506 return transit
< Reset
>();
9509 return forward_event();
9512 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MLogRec
& logevt
)
9514 PG
*pg
= context
< RecoveryMachine
>().pg
;
9515 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MLocRec" << dendl
;
9516 return discard_event();
9519 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MInfoRec
& evt
)
9521 PG
*pg
= context
< RecoveryMachine
>().pg
;
9522 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl
;
9523 return discard_event();
9526 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MNotifyRec
& evt
)
9528 PG
*pg
= context
< RecoveryMachine
>().pg
;
9529 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl
;
9530 return discard_event();
// QueryState while WaitActingChange: explain in the dump why the PG is
// parked here.
boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for pg acting set to change");
  q.f->close_section();
  return forward_event();
}
9543 void PG::RecoveryState::WaitActingChange::exit()
9545 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
9546 PG
*pg
= context
< RecoveryMachine
>().pg
;
9547 utime_t dur
= ceph_clock_now() - enter_time
;
9548 pg
->osd
->recoverystate_perf
->tinc(rs_waitactingchange_latency
, dur
);
9551 /*------Down--------*/
9552 PG::RecoveryState::Down::Down(my_context ctx
)
9554 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Down")
9556 context
< RecoveryMachine
>().log_enter(state_name
);
9557 PG
*pg
= context
< RecoveryMachine
>().pg
;
9559 pg
->state_clear(PG_STATE_PEERING
);
9560 pg
->state_set(PG_STATE_DOWN
);
9562 auto &prior_set
= context
< Peering
>().prior_set
;
9563 ceph_assert(pg
->blocked_by
.empty());
9564 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
9565 pg
->publish_stats_to_osd();
9568 void PG::RecoveryState::Down::exit()
9570 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
9571 PG
*pg
= context
< RecoveryMachine
>().pg
;
9573 pg
->state_clear(PG_STATE_DOWN
);
9574 utime_t dur
= ceph_clock_now() - enter_time
;
9575 pg
->osd
->recoverystate_perf
->tinc(rs_down_latency
, dur
);
9577 pg
->blocked_by
.clear();
9580 boost::statechart::result
PG::RecoveryState::Down::react(const QueryState
& q
)
9582 q
.f
->open_object_section("state");
9583 q
.f
->dump_string("name", state_name
);
9584 q
.f
->dump_stream("enter_time") << enter_time
;
9585 q
.f
->dump_string("comment",
9586 "not enough up instances of this PG to go active");
9587 q
.f
->close_section();
9588 return forward_event();
9591 boost::statechart::result
PG::RecoveryState::Down::react(const MNotifyRec
& infoevt
)
9593 PG
*pg
= context
< RecoveryMachine
>().pg
;
9595 ceph_assert(pg
->is_primary());
9596 epoch_t old_start
= pg
->info
.history
.last_epoch_started
;
9597 if (!pg
->peer_info
.count(infoevt
.from
) &&
9598 pg
->get_osdmap()->has_been_up_since(infoevt
.from
.osd
, infoevt
.notify
.epoch_sent
)) {
9599 pg
->update_history(infoevt
.notify
.info
.history
);
9601 // if we got something new to make pg escape down state
9602 if (pg
->info
.history
.last_epoch_started
> old_start
) {
9603 ldout(pg
->cct
, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl
;
9604 pg
->state_clear(PG_STATE_DOWN
);
9605 pg
->state_set(PG_STATE_PEERING
);
9606 return transit
< GetInfo
>();
9609 return discard_event();
9613 /*------Incomplete--------*/
9614 PG::RecoveryState::Incomplete::Incomplete(my_context ctx
)
9616 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Incomplete")
9618 context
< RecoveryMachine
>().log_enter(state_name
);
9619 PG
*pg
= context
< RecoveryMachine
>().pg
;
9621 pg
->state_clear(PG_STATE_PEERING
);
9622 pg
->state_set(PG_STATE_INCOMPLETE
);
9624 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
9625 ceph_assert(pg
->blocked_by
.empty());
9626 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
9627 pg
->publish_stats_to_osd();
9630 boost::statechart::result
PG::RecoveryState::Incomplete::react(const AdvMap
&advmap
) {
9631 PG
*pg
= context
< RecoveryMachine
>().pg
;
9632 int64_t poolnum
= pg
->info
.pgid
.pool();
9634 // Reset if min_size turn smaller than previous value, pg might now be able to go active
9635 if (!advmap
.osdmap
->have_pg_pool(poolnum
) ||
9636 advmap
.lastmap
->get_pools().find(poolnum
)->second
.min_size
>
9637 advmap
.osdmap
->get_pools().find(poolnum
)->second
.min_size
) {
9639 return transit
< Reset
>();
9642 return forward_event();
9645 boost::statechart::result
PG::RecoveryState::Incomplete::react(const MNotifyRec
& notevt
) {
9646 PG
*pg
= context
< RecoveryMachine
>().pg
;
9647 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
9648 if (pg
->proc_replica_info(
9649 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
)) {
9650 // We got something new, try again!
9651 return transit
< GetLog
>();
9653 return discard_event();
9657 boost::statechart::result
PG::RecoveryState::Incomplete::react(
9658 const QueryState
& q
)
9660 q
.f
->open_object_section("state");
9661 q
.f
->dump_string("name", state_name
);
9662 q
.f
->dump_stream("enter_time") << enter_time
;
9663 q
.f
->dump_string("comment", "not enough complete instances of this PG");
9664 q
.f
->close_section();
9665 return forward_event();
9668 void PG::RecoveryState::Incomplete::exit()
9670 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
9671 PG
*pg
= context
< RecoveryMachine
>().pg
;
9673 pg
->state_clear(PG_STATE_INCOMPLETE
);
9674 utime_t dur
= ceph_clock_now() - enter_time
;
9675 pg
->osd
->recoverystate_perf
->tinc(rs_incomplete_latency
, dur
);
9677 pg
->blocked_by
.clear();
9680 /*------GetMissing--------*/
9681 PG::RecoveryState::GetMissing::GetMissing(my_context ctx
)
9683 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetMissing")
9685 context
< RecoveryMachine
>().log_enter(state_name
);
9687 PG
*pg
= context
< RecoveryMachine
>().pg
;
9688 ceph_assert(!pg
->acting_recovery_backfill
.empty());
9690 for (set
<pg_shard_t
>::iterator i
= pg
->acting_recovery_backfill
.begin();
9691 i
!= pg
->acting_recovery_backfill
.end();
9693 if (*i
== pg
->get_primary()) continue;
9694 const pg_info_t
& pi
= pg
->peer_info
[*i
];
9695 // reset this so to make sure the pg_missing_t is initialized and
9696 // has the correct semantics even if we don't need to get a
9697 // missing set from a shard. This way later additions due to
9698 // lost+unfound delete work properly.
9699 pg
->peer_missing
[*i
].may_include_deletes
= !pg
->perform_deletes_during_peering();
9702 continue; // no pg data, nothing divergent
9704 if (pi
.last_update
< pg
->pg_log
.get_tail()) {
9705 ldout(pg
->cct
, 10) << " osd." << *i
<< " is not contiguous, will restart backfill" << dendl
;
9706 pg
->peer_missing
[*i
].clear();
9709 if (pi
.last_backfill
== hobject_t()) {
9710 ldout(pg
->cct
, 10) << " osd." << *i
<< " will fully backfill; can infer empty missing set" << dendl
;
9711 pg
->peer_missing
[*i
].clear();
9715 if (pi
.last_update
== pi
.last_complete
&& // peer has no missing
9716 pi
.last_update
== pg
->info
.last_update
) { // peer is up to date
9717 // replica has no missing and identical log as us. no need to
9719 // FIXME: we can do better here. if last_update==last_complete we
9720 // can infer the rest!
9721 ldout(pg
->cct
, 10) << " osd." << *i
<< " has no missing, identical log" << dendl
;
9722 pg
->peer_missing
[*i
].clear();
9726 // We pull the log from the peer's last_epoch_started to ensure we
9727 // get enough log to detect divergent updates.
9728 since
.epoch
= pi
.last_epoch_started
;
9729 ceph_assert(pi
.last_update
>= pg
->info
.log_tail
); // or else choose_acting() did a bad thing
9730 if (pi
.log_tail
<= since
) {
9731 ldout(pg
->cct
, 10) << " requesting log+missing since " << since
<< " from osd." << *i
<< dendl
;
9732 context
< RecoveryMachine
>().send_query(
9736 i
->shard
, pg
->pg_whoami
.shard
,
9737 since
, pg
->info
.history
,
9738 pg
->get_osdmap_epoch()));
9740 ldout(pg
->cct
, 10) << " requesting fulllog+missing from osd." << *i
9741 << " (want since " << since
<< " < log.tail "
9742 << pi
.log_tail
<< ")" << dendl
;
9743 context
< RecoveryMachine
>().send_query(
9745 pg_query_t::FULLLOG
,
9746 i
->shard
, pg
->pg_whoami
.shard
,
9747 pg
->info
.history
, pg
->get_osdmap_epoch()));
9749 peer_missing_requested
.insert(*i
);
9750 pg
->blocked_by
.insert(i
->osd
);
9753 if (peer_missing_requested
.empty()) {
9754 if (pg
->need_up_thru
) {
9755 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
9757 post_event(NeedUpThru());
9762 post_event(Activate(pg
->get_osdmap_epoch()));
9764 pg
->publish_stats_to_osd();
9768 boost::statechart::result
PG::RecoveryState::GetMissing::react(const MLogRec
& logevt
)
9770 PG
*pg
= context
< RecoveryMachine
>().pg
;
9772 peer_missing_requested
.erase(logevt
.from
);
9773 pg
->proc_replica_log(logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
9775 if (peer_missing_requested
.empty()) {
9776 if (pg
->need_up_thru
) {
9777 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
9779 post_event(NeedUpThru());
9781 ldout(pg
->cct
, 10) << "Got last missing, don't need missing "
9782 << "posting Activate" << dendl
;
9783 post_event(Activate(pg
->get_osdmap_epoch()));
9786 return discard_event();
9789 boost::statechart::result
PG::RecoveryState::GetMissing::react(const QueryState
& q
)
9791 PG
*pg
= context
< RecoveryMachine
>().pg
;
9792 q
.f
->open_object_section("state");
9793 q
.f
->dump_string("name", state_name
);
9794 q
.f
->dump_stream("enter_time") << enter_time
;
9796 q
.f
->open_array_section("peer_missing_requested");
9797 for (set
<pg_shard_t
>::iterator p
= peer_missing_requested
.begin();
9798 p
!= peer_missing_requested
.end();
9800 q
.f
->open_object_section("osd");
9801 q
.f
->dump_stream("osd") << *p
;
9802 if (pg
->peer_missing
.count(*p
)) {
9803 q
.f
->open_object_section("got_missing");
9804 pg
->peer_missing
[*p
].dump(q
.f
);
9805 q
.f
->close_section();
9807 q
.f
->close_section();
9809 q
.f
->close_section();
9811 q
.f
->close_section();
9812 return forward_event();
9815 void PG::RecoveryState::GetMissing::exit()
9817 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
9818 PG
*pg
= context
< RecoveryMachine
>().pg
;
9819 utime_t dur
= ceph_clock_now() - enter_time
;
9820 pg
->osd
->recoverystate_perf
->tinc(rs_getmissing_latency
, dur
);
9821 pg
->blocked_by
.clear();
9824 /*------WaitUpThru--------*/
9825 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx
)
9827 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitUpThru")
9829 context
< RecoveryMachine
>().log_enter(state_name
);
9832 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const ActMap
& am
)
9834 PG
*pg
= context
< RecoveryMachine
>().pg
;
9835 if (!pg
->need_up_thru
) {
9836 post_event(Activate(pg
->get_osdmap_epoch()));
9838 return forward_event();
9841 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const MLogRec
& logevt
)
9843 PG
*pg
= context
< RecoveryMachine
>().pg
;
9844 ldout(pg
->cct
, 10) << "Noting missing from osd." << logevt
.from
<< dendl
;
9845 pg
->peer_missing
[logevt
.from
].claim(logevt
.msg
->missing
);
9846 pg
->peer_info
[logevt
.from
] = logevt
.msg
->info
;
9847 return discard_event();
9850 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const QueryState
& q
)
9852 q
.f
->open_object_section("state");
9853 q
.f
->dump_string("name", state_name
);
9854 q
.f
->dump_stream("enter_time") << enter_time
;
9855 q
.f
->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
9856 q
.f
->close_section();
9857 return forward_event();
9860 void PG::RecoveryState::WaitUpThru::exit()
9862 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
9863 PG
*pg
= context
< RecoveryMachine
>().pg
;
9864 utime_t dur
= ceph_clock_now() - enter_time
;
9865 pg
->osd
->recoverystate_perf
->tinc(rs_waitupthru_latency
, dur
);
9868 /*----RecoveryState::RecoveryMachine Methods-----*/
9870 #define dout_prefix pg->gen_prefix(*_dout)
9872 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name
)
9874 PG
*pg
= context
< RecoveryMachine
>().pg
;
9875 ldout(pg
->cct
, 5) << "enter " << state_name
<< dendl
;
9876 pg
->osd
->pg_recovery_stats
.log_enter(state_name
);
9879 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name
, utime_t enter_time
)
9881 utime_t dur
= ceph_clock_now() - enter_time
;
9882 PG
*pg
= context
< RecoveryMachine
>().pg
;
9883 ldout(pg
->cct
, 5) << "exit " << state_name
<< " " << dur
<< " " << event_count
<< " " << event_time
<< dendl
;
9884 pg
->osd
->pg_recovery_stats
.log_exit(state_name
, ceph_clock_now() - enter_time
,
9885 event_count
, event_time
);
9887 event_time
= utime_t();
9891 /*---------------------------------------------------*/
9893 #define dout_prefix ((debug_pg ? debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
9895 void PG::RecoveryState::start_handle(RecoveryCtx
*new_ctx
) {
9897 ceph_assert(!orig_ctx
);
9900 if (messages_pending_flush
) {
9901 rctx
= RecoveryCtx(*messages_pending_flush
, *new_ctx
);
9905 rctx
->start_time
= ceph_clock_now();
9909 void PG::RecoveryState::begin_block_outgoing() {
9910 ceph_assert(!messages_pending_flush
);
9911 ceph_assert(orig_ctx
);
9913 messages_pending_flush
= BufferedRecoveryMessages();
9914 rctx
= RecoveryCtx(*messages_pending_flush
, *orig_ctx
);
9917 void PG::RecoveryState::clear_blocked_outgoing() {
9918 ceph_assert(orig_ctx
);
9920 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
9923 void PG::RecoveryState::end_block_outgoing() {
9924 ceph_assert(messages_pending_flush
);
9925 ceph_assert(orig_ctx
);
9928 rctx
= RecoveryCtx(*orig_ctx
);
9929 rctx
->accept_buffered_messages(*messages_pending_flush
);
9930 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
9933 void PG::RecoveryState::end_handle() {
9935 utime_t dur
= ceph_clock_now() - rctx
->start_time
;
9936 machine
.event_time
+= dur
;
9939 machine
.event_count
++;
9940 rctx
= boost::optional
<RecoveryCtx
>();
9944 ostream
& operator<<(ostream
& out
, const PG::BackfillInterval
& bi
)
9946 out
<< "BackfillInfo(" << bi
.begin
<< "-" << bi
.end
9947 << " " << bi
.objects
.size() << " objects";
9948 if (!bi
.objects
.empty())
9949 out
<< " " << bi
.objects
;
9954 void PG::dump_pgstate_history(Formatter
*f
)
9957 pgstate_history
.dump(f
);
9961 void PG::dump_missing(Formatter
*f
)
9963 for (auto& i
: pg_log
.get_missing().get_items()) {
9964 f
->open_object_section("object");
9965 f
->dump_object("oid", i
.first
);
9966 f
->dump_object("missing_info", i
.second
);
9967 if (missing_loc
.needs_recovery(i
.first
)) {
9968 f
->dump_bool("unfound", missing_loc
.is_unfound(i
.first
));
9969 f
->open_array_section("locations");
9970 for (auto l
: missing_loc
.get_locations(i
.first
)) {
9971 f
->dump_object("shard", l
);
9979 void PG::get_pg_stats(std::function
<void(const pg_stat_t
&, epoch_t lec
)> f
)
9981 pg_stats_publish_lock
.Lock();
9982 if (pg_stats_publish_valid
) {
9983 f(pg_stats_publish
, pg_stats_publish
.get_effective_last_epoch_clean());
9985 pg_stats_publish_lock
.Unlock();
9988 void PG::with_heartbeat_peers(std::function
<void(int)> f
)
9990 heartbeat_peer_lock
.Lock();
9991 for (auto p
: heartbeat_peers
) {
9994 for (auto p
: probe_targets
) {
9997 heartbeat_peer_lock
.Unlock();